From 7485d5a62685cb745ab50e970adb722cb71557ac Mon Sep 17 00:00:00 2001 From: Vidhya Sudhan Loganathan Date: Wed, 4 Jul 2018 09:34:00 +0100 Subject: COMPMID-970 : Remove QS8 / QS16 support Removed fixed point related code. Change-Id: I487acf138dace3b0450e0d72ca7071eaec254566 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/137678 Tested-by: Jenkins Reviewed-by: Anthony Barbier --- src/core/NEON/kernels/NEActivationLayerKernel.cpp | 235 +------- .../NEON/kernels/NEArithmeticAdditionKernel.cpp | 54 +- .../NEON/kernels/NEArithmeticSubtractionKernel.cpp | 54 +- .../kernels/NEBatchNormalizationLayerKernel.cpp | 118 +--- src/core/NEON/kernels/NECol2ImKernel.cpp | 7 +- .../NEConvertFullyConnectedWeightsKernel.cpp | 2 +- .../NEON/kernels/NEDepthConcatenateLayerKernel.cpp | 17 +- .../NEON/kernels/NEDepthConvertLayerKernel.cpp | 187 +----- .../NEDepthwiseConvolutionLayer3x3Kernel.cpp | 2 +- src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp | 1 - .../kernels/NEDepthwiseVectorToTensorKernel.cpp | 1 - .../kernels/NEDepthwiseWeightsReshapeKernel.cpp | 2 - .../NEON/kernels/NEDequantizationLayerKernel.cpp | 2 +- .../kernels/NEDirectConvolutionLayerKernel.cpp | 310 +++------- .../NEDirectConvolutionLayerOutputStageKernel.cpp | 116 +--- src/core/NEON/kernels/NEFillBorderKernel.cpp | 8 +- src/core/NEON/kernels/NEFloorKernel.cpp | 4 +- .../NEON/kernels/NEGEMMInterleave4x4Kernel.cpp | 6 +- .../kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp | 30 +- .../NEON/kernels/NEGEMMMatrixAdditionKernel.cpp | 61 +- .../NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp | 654 +-------------------- .../kernels/NEGEMMMatrixVectorMultiplyKernel.cpp | 1 - src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp | 10 +- src/core/NEON/kernels/NEIm2ColKernel.cpp | 44 +- src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp | 2 +- src/core/NEON/kernels/NEMinMaxLayerKernel.cpp | 2 +- .../NEON/kernels/NENormalizationLayerKernel.cpp | 150 +---- src/core/NEON/kernels/NEPermuteKernel.cpp | 5 +- .../kernels/NEPixelWiseMultiplicationKernel.cpp | 154 +---- src/core/NEON/kernels/NEPoolingLayerKernel.cpp | 563 +----------------- .../NEON/kernels/NEQuantizationLayerKernel.cpp | 2 +- src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp | 4 +- .../NEON/kernels/NEReductionOperationKernel.cpp | 2 +- src/core/NEON/kernels/NEReshapeLayerKernel.cpp | 5 +- src/core/NEON/kernels/NESoftmaxLayerKernel.cpp | 166 +----- src/core/NEON/kernels/NETransposeKernel.cpp | 3 +- src/core/NEON/kernels/NEWeightsReshapeKernel.cpp | 4 +- 37 files changed, 142 insertions(+), 2846 deletions(-) (limited to 'src/core/NEON') diff --git a/src/core/NEON/kernels/NEActivationLayerKernel.cpp b/src/core/NEON/kernels/NEActivationLayerKernel.cpp index ec125154a4..bdc93ed1b8 100644 --- a/src/core/NEON/kernels/NEActivationLayerKernel.cpp +++ b/src/core/NEON/kernels/NEActivationLayerKernel.cpp @@ -23,7 +23,6 @@ */ #include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h" -#include "arm_compute/core/FixedPoint.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/NEON/NEAsymm.h" @@ -46,14 +45,13 @@ namespace Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, DataType::F16, DataType::F32); // Checks performed when output is configured if((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); } return Status{}; @@ -146,36 +144,6 @@ void NEActivationLayerKernel::configure(ITensor *input, ITensor *output, Activat }; #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC*/ - // Activation functions : QS8 - static std::map act_map_qs8 = - { - { ActivationFunction::ABS, &NEActivationLayerKernel::activation }, - { ActivationFunction::LINEAR, &NEActivationLayerKernel::activation }, - { ActivationFunction::LOGISTIC, &NEActivationLayerKernel::activation }, - { ActivationFunction::RELU, &NEActivationLayerKernel::activation }, - { ActivationFunction::BOUNDED_RELU, &NEActivationLayerKernel::activation }, - { ActivationFunction::LU_BOUNDED_RELU, &NEActivationLayerKernel::activation }, - { ActivationFunction::LEAKY_RELU, &NEActivationLayerKernel::activation }, - { ActivationFunction::SOFT_RELU, &NEActivationLayerKernel::activation }, - { ActivationFunction::SQRT, &NEActivationLayerKernel::activation }, - { ActivationFunction::SQUARE, &NEActivationLayerKernel::activation }, - { ActivationFunction::TANH, &NEActivationLayerKernel::activation }, - }; - // Activation functions : QS16 - static std::map act_map_qs16 = - { - { ActivationFunction::ABS, &NEActivationLayerKernel::activation }, - { ActivationFunction::LINEAR, &NEActivationLayerKernel::activation }, - { ActivationFunction::LOGISTIC, &NEActivationLayerKernel::activation }, - { ActivationFunction::RELU, &NEActivationLayerKernel::activation }, - { ActivationFunction::BOUNDED_RELU, &NEActivationLayerKernel::activation }, - { ActivationFunction::LU_BOUNDED_RELU, &NEActivationLayerKernel::activation }, - { ActivationFunction::LEAKY_RELU, &NEActivationLayerKernel::activation }, - { ActivationFunction::SOFT_RELU, &NEActivationLayerKernel::activation }, - { ActivationFunction::SQRT, &NEActivationLayerKernel::activation }, - { ActivationFunction::SQUARE, &NEActivationLayerKernel::activation }, - { ActivationFunction::TANH, &NEActivationLayerKernel::activation }, - }; // Activation functions : QASYMM8 static std::map act_map_qasymm8 = { @@ -188,12 +156,6 @@ void NEActivationLayerKernel::configure(ITensor *input, ITensor *output, Activat case DataType::QASYMM8: _func = act_map_qasymm8[activation_info.activation()]; break; - case DataType::QS8: - _func = act_map_qs8[activation_info.activation()]; - break; - case DataType::QS16: - _func = act_map_qs16[activation_info.activation()]; - break; case DataType::F32: _func = act_map_f32[activation_info.activation()]; break; @@ -507,70 +469,6 @@ typename std::enable_if::value, void>::type NEActivationL input, output); } -template -typename std::enable_if::value, void>::type NEActivationLayerKernel::activation(const Window &window) -{ - Iterator input(_input, window); - Iterator output(_output, window); - const int fixed_point_position = _input->info()->fixed_point_position(); - - static const qint8x16_t CONST_0 = vdupq_n_qs8(0); - const qint8x16_t CONST_1 = vdupq_n_qs8(sqcvt_qs8_f32(1.f, fixed_point_position)); - const qint8x16_t a = vdupq_n_qs8(sqcvt_qs8_f32(_act_info.a(), fixed_point_position)); - const qint8x16_t b = vdupq_n_qs8(sqcvt_qs8_f32(_act_info.b(), fixed_point_position)); - - execute_window_loop(window, [&](const Coordinates & id) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - const qint8x16_t in = vld1q_qs8(input_ptr); - qint8x16_t tmp = {}; - - switch(F) - { - case ActivationFunction::ABS: - tmp = vqabsq_qs8(in); - break; - case ActivationFunction::LINEAR: - tmp = vqmlaq_qs8(b, a, in, fixed_point_position); - break; - case ActivationFunction::LOGISTIC: - tmp = vqrecipq_qs8(vqaddq_qs8(CONST_1, vqexpq_qs8(vnegq_s8(in), fixed_point_position)), fixed_point_position); - break; - case ActivationFunction::RELU: - tmp = vmaxq_qs8(CONST_0, in); - break; - case ActivationFunction::BOUNDED_RELU: - tmp = vminq_qs8(a, vmaxq_qs8(CONST_0, in)); - break; - case ActivationFunction::LU_BOUNDED_RELU: - tmp = vminq_qs8(a, vmaxq_qs8(b, in)); - break; - case ActivationFunction::LEAKY_RELU: - tmp = vbslq_s8(vcgtq_s8(in, CONST_0), in, vmulq_qs8(a, in, fixed_point_position)); - break; - case ActivationFunction::SOFT_RELU: - tmp = vlogq_qs8(vqaddq_qs8(CONST_1, vqexpq_qs8(in, fixed_point_position)), fixed_point_position); - break; - case ActivationFunction::SQRT: - tmp = vqrecipq_qs8(vqinvsqrtq_qs8(in, fixed_point_position), fixed_point_position); - break; - case ActivationFunction::SQUARE: - tmp = vqmulq_qs8(in, in, fixed_point_position); - break; - case ActivationFunction::TANH: - tmp = vqmulq_qs8(a, vqtanhq_qs8(vqmulq_qs8(b, in, fixed_point_position), fixed_point_position), fixed_point_position); - break; - default: - break; - } - - vst1q_qs8(output_ptr, tmp); - }, - input, output); -} - template typename std::enable_if::value, void>::type NEActivationLayerKernel::activation(const Window &window) { @@ -620,137 +518,6 @@ typename std::enable_if::value, void>::type NEActivat input, output); } -template -typename std::enable_if::value, void>::type NEActivationLayerKernel::activation(const Window &window) -{ - Iterator input(_input, window); - Iterator output(_output, window); - const int fixed_point_position = _input->info()->fixed_point_position(); - - static const qint16x8_t CONST_0 = vdupq_n_qs16(0); - const qint16x8_t CONST_1 = vdupq_n_qs16(sqcvt_qs16_f32(1.f, fixed_point_position)); - const qint16x8_t a = vdupq_n_qs16(sqcvt_qs16_f32(_act_info.a(), fixed_point_position)); - const qint16x8_t b = vdupq_n_qs16(sqcvt_qs16_f32(_act_info.b(), fixed_point_position)); - - execute_window_loop(window, [&](const Coordinates & id) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - const qint16x8x2_t in = vld2q_s16(input_ptr); - qint16x8x2_t tmp = { {} }; - - switch(F) - { - case ActivationFunction::ABS: - tmp = - { - { - vqabsq_qs16(in.val[0]), - vqabsq_qs16(in.val[1]), - } - }; - break; - case ActivationFunction::LINEAR: - tmp = - { - { - vqmlaq_qs16(b, a, in.val[0], fixed_point_position), - vqmlaq_qs16(b, a, in.val[1], fixed_point_position), - } - }; - break; - case ActivationFunction::LOGISTIC: - tmp = - { - { - vqrecipq_qs16(vqaddq_qs16(CONST_1, vqexpq_qs16(vnegq_s16(in.val[0]), fixed_point_position)), fixed_point_position), - vqrecipq_qs16(vqaddq_qs16(CONST_1, vqexpq_qs16(vnegq_s16(in.val[1]), fixed_point_position)), fixed_point_position), - } - }; - break; - case ActivationFunction::RELU: - tmp = - { - { - vmaxq_qs16(CONST_0, in.val[0]), - vmaxq_qs16(CONST_0, in.val[1]), - } - }; - break; - case ActivationFunction::BOUNDED_RELU: - tmp = - { - { - vminq_qs16(a, vmaxq_qs16(CONST_0, in.val[0])), - vminq_qs16(a, vmaxq_qs16(CONST_0, in.val[1])), - } - }; - break; - case ActivationFunction::LU_BOUNDED_RELU: - tmp = - { - { - vminq_qs16(a, vmaxq_qs16(b, in.val[0])), - vminq_qs16(a, vmaxq_qs16(b, in.val[1])), - } - }; - break; - case ActivationFunction::LEAKY_RELU: - tmp = - { - { - vbslq_s16(vcgtq_s16(in.val[0], CONST_0), in.val[0], vmulq_qs16(a, in.val[0], fixed_point_position)), - vbslq_s16(vcgtq_s16(in.val[1], CONST_0), in.val[1], vmulq_qs16(a, in.val[1], fixed_point_position)), - } - }; - break; - case ActivationFunction::SOFT_RELU: - tmp = - { - { - vlogq_qs16(vqaddq_qs16(CONST_1, vqexpq_qs16(in.val[0], fixed_point_position)), fixed_point_position), - vlogq_qs16(vqaddq_qs16(CONST_1, vqexpq_qs16(in.val[1], fixed_point_position)), fixed_point_position), - } - }; - break; - case ActivationFunction::SQRT: - tmp = - { - { - vqrecipq_qs16(vqinvsqrtq_qs16(in.val[0], fixed_point_position), fixed_point_position), - vqrecipq_qs16(vqinvsqrtq_qs16(in.val[1], fixed_point_position), fixed_point_position), - } - }; - break; - case ActivationFunction::SQUARE: - tmp = - { - { - vqmulq_qs16(in.val[0], in.val[0], fixed_point_position), - vqmulq_qs16(in.val[1], in.val[1], fixed_point_position), - } - }; - break; - case ActivationFunction::TANH: - tmp = - { - { - vqmulq_qs16(a, vqtanhq_qs16(vqmulq_qs16(b, in.val[0], fixed_point_position), fixed_point_position), fixed_point_position), - vqmulq_qs16(a, vqtanhq_qs16(vqmulq_qs16(b, in.val[1], fixed_point_position), fixed_point_position), fixed_point_position), - } - }; - break; - default: - ARM_COMPUTE_ERROR("Function not implemented"); - break; - } - - vst2q_qs16(output_ptr, tmp); - }, - input, output); -} - Status NEActivationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(act_info); diff --git a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp index a487090a98..f8e2b6d73e 100644 --- a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp +++ b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp @@ -48,38 +48,6 @@ namespace { constexpr unsigned int num_elems_processed_per_iteration = 16; -void add_wrap_QS8_QS8_QS8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape())); - Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape())); - Iterator output(out, window); - - execute_window_loop(window, [&](const Coordinates & id) - { - const qint8x16_t a = vld1q_qs8(reinterpret_cast(input1.ptr())); - const qint8x16_t b = vld1q_qs8(reinterpret_cast(input2.ptr())); - - vst1q_qs8(reinterpret_cast(output.ptr()), vaddq_qs8(a, b)); - }, - input1, input2, output); -} - -void add_saturate_QS8_QS8_QS8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape())); - Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape())); - Iterator output(out, window); - - execute_window_loop(window, [&](const Coordinates & id) - { - const qint8x16_t a = vld1q_qs8(reinterpret_cast(input1.ptr())); - const qint8x16_t b = vld1q_qs8(reinterpret_cast(input2.ptr())); - - vst1q_qs8(reinterpret_cast(output.ptr()), vqaddq_qs8(a, b)); - }, - input1, input2, output); -} - void add_wrap_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape())); @@ -362,28 +330,21 @@ Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, { ARM_COMPUTE_UNUSED(policy); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32); const TensorShape out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape()); ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); - if(is_data_type_fixed_point(input1.data_type()) || is_data_type_fixed_point(input2.data_type())) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(&input1, &input2); - } - // Validate in case of configured output if(output.total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MSG( - !(input1.data_type() == DataType::QS8 && input2.data_type() == DataType::QS8 && output.data_type() == DataType::QS8) - && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8 && output.data_type() == DataType::U8) + !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8 && output.data_type() == DataType::U8) && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8 && output.data_type() == DataType::S16) && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::S16 && output.data_type() == DataType::S16) && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::U8 && output.data_type() == DataType::S16) - && !(input1.data_type() == DataType::QS16 && input2.data_type() == DataType::QS16 && output.data_type() == DataType::QS16) && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::S16 && output.data_type() == DataType::S16) && !(input1.data_type() == DataType::F32 && input2.data_type() == DataType::F32 && output.data_type() == DataType::F32) && !(input1.data_type() == DataType::F16 && input2.data_type() == DataType::F16 && output.data_type() == DataType::F16), @@ -391,11 +352,6 @@ Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output.tensor_shape(), 0), "Wrong shape for output"); - - if(is_data_type_fixed_point(input1.data_type()) || is_data_type_fixed_point(output.data_type())) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(&input1, &output); - } } return Status{}; @@ -460,8 +416,6 @@ void NEArithmeticAdditionKernel::configure(const ITensor *input1, const ITensor static std::map map_function = { - { "add_wrap_QS8_QS8_QS8", &add_wrap_QS8_QS8_QS8 }, - { "add_saturate_QS8_QS8_QS8", &add_saturate_QS8_QS8_QS8 }, { "add_wrap_U8_U8_U8", &add_wrap_U8_U8_U8 }, { "add_saturate_U8_U8_U8", &add_saturate_U8_U8_U8 }, { "add_wrap_S16_U8_S16", &add_wrap_S16_U8_S16 }, @@ -470,8 +424,6 @@ void NEArithmeticAdditionKernel::configure(const ITensor *input1, const ITensor { "add_saturate_U8_S16_S16", &add_saturate_U8_S16_S16 }, { "add_wrap_U8_U8_S16", &add_wrap_U8_U8_S16 }, { "add_saturate_U8_U8_S16", &add_saturate_U8_U8_S16 }, - { "add_wrap_QS16_QS16_QS16", &add_wrap_S16_S16_S16 }, - { "add_saturate_QS16_QS16_QS16", &add_saturate_S16_S16_S16 }, { "add_wrap_S16_S16_S16", &add_wrap_S16_S16_S16 }, { "add_saturate_S16_S16_S16", &add_saturate_S16_S16_S16 }, { "add_wrap_F32_F32_F32", &add_F32_F32_F32 }, diff --git a/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp index 3db80285c0..5a162e3b2c 100644 --- a/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp +++ b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, 2017 ARM Limited. + * Copyright (c) 2016-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -45,38 +45,6 @@ class Coordinates; namespace { -void sub_wrap_QS8_QS8_QS8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - Iterator input1(in1, window); - Iterator input2(in2, window); - Iterator output(out, window); - - execute_window_loop(window, [&](const Coordinates & id) - { - const qint8x16_t a = vld1q_qs8(reinterpret_cast(input1.ptr())); - const qint8x16_t b = vld1q_qs8(reinterpret_cast(input2.ptr())); - - vst1q_qs8(reinterpret_cast(output.ptr()), vsubq_qs8(a, b)); - }, - input1, input2, output); -} - -void sub_saturate_QS8_QS8_QS8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - Iterator input1(in1, window); - Iterator input2(in2, window); - Iterator output(out, window); - - execute_window_loop(window, [&](const Coordinates & id) - { - const qint8x16_t a = vld1q_qs8(reinterpret_cast(input1.ptr())); - const qint8x16_t b = vld1q_qs8(reinterpret_cast(input2.ptr())); - - vst1q_qs8(reinterpret_cast(output.ptr()), vqsubq_qs8(a, b)); - }, - input1, input2, output); -} - void sub_wrap_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { Iterator input1(in1, window); @@ -353,23 +321,15 @@ inline Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *i { ARM_COMPUTE_UNUSED(policy); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QS8, DataType::U8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::QS8, DataType::U8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::U8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32); - - if(is_data_type_fixed_point(input1->data_type()) || is_data_type_fixed_point(input2->data_type()) || is_data_type_fixed_point(output->data_type())) - { - // Check that all data types are the same and all fixed-point positions are the same - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input1, input2, output); - } + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MSG( - !(input1->data_type() == DataType::QS8 && input2->data_type() == DataType::QS8 && output->data_type() == DataType::QS8) - && !(input1->data_type() == DataType::U8 && input2->data_type() == DataType::U8 && output->data_type() == DataType::U8) + !(input1->data_type() == DataType::U8 && input2->data_type() == DataType::U8 && output->data_type() == DataType::U8) && !(input1->data_type() == DataType::U8 && input2->data_type() == DataType::U8 && output->data_type() == DataType::S16) && !(input1->data_type() == DataType::U8 && input2->data_type() == DataType::S16 && output->data_type() == DataType::S16) && !(input1->data_type() == DataType::S16 && input2->data_type() == DataType::U8 && output->data_type() == DataType::S16) - && !(input1->data_type() == DataType::QS16 && input2->data_type() == DataType::QS16 && output->data_type() == DataType::QS16) && !(input1->data_type() == DataType::S16 && input2->data_type() == DataType::S16 && output->data_type() == DataType::S16) && !(input1->data_type() == DataType::F32 && input2->data_type() == DataType::F32 && output->data_type() == DataType::F32) && !(input1->data_type() == DataType::F16 && input2->data_type() == DataType::F16 && output->data_type() == DataType::F16), @@ -432,8 +392,6 @@ void NEArithmeticSubtractionKernel::configure(const ITensor *input1, const ITens static std::map map_function = { - { "sub_wrap_QS8_QS8_QS8", &sub_wrap_QS8_QS8_QS8 }, - { "sub_saturate_QS8_QS8_QS8", &sub_saturate_QS8_QS8_QS8 }, { "sub_wrap_U8_U8_U8", &sub_wrap_U8_U8_U8 }, { "sub_wrap_U8_U8_S16", &sub_wrap_U8_U8_S16 }, { "sub_saturate_U8_U8_U8", &sub_saturate_U8_U8_U8 }, @@ -442,8 +400,6 @@ void NEArithmeticSubtractionKernel::configure(const ITensor *input1, const ITens { "sub_wrap_S16_U8_S16", &sub_wrap_S16_U8_S16 }, { "sub_saturate_U8_S16_S16", &sub_saturate_U8_S16_S16 }, { "sub_saturate_S16_U8_S16", &sub_saturate_S16_U8_S16 }, - { "sub_wrap_QS16_QS16_QS16", &sub_wrap_S16_S16_S16 }, - { "sub_saturate_QS16_QS16_QS16", &sub_saturate_S16_S16_S16 }, { "sub_wrap_S16_S16_S16", &sub_wrap_S16_S16_S16 }, { "sub_saturate_S16_S16_S16", &sub_saturate_S16_S16_S16 }, { "sub_wrap_F32_F32_F32", &sub_F32_F32_F32 }, diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp index 6be50fdb0d..6aed41f3aa 100644 --- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp +++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp @@ -43,7 +43,7 @@ validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const IT const ITensorInfo *beta, const ITensorInfo *gamma, float epsilon, ActivationLayerInfo act_info) { ARM_COMPUTE_UNUSED(epsilon); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); if(act_info.enabled()) @@ -60,22 +60,18 @@ validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const IT ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); } ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, var); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, mean, var); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, var); if(beta != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, beta); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, beta); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, beta); } if(gamma != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, gamma); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, gamma); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, gamma); } ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)) != mean->dimension(0)); @@ -103,112 +99,6 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen } } //namespace -template -void NEBatchNormalizationLayerKernel::batch_normalization_qs8(const Window &window) -{ - static_assert(!fused_activation, "Activation is not supported for QS8"); - - Iterator input(_input, window); - Iterator output(_output, window); - - // Hold information about the current feature map we are iterating. - // Only compute denominator and NEON vectors once per feature map. - int slice = -1; - - const int fixed_point_position = _input->info()->fixed_point_position(); - const auto input_mean = reinterpret_cast(_mean->ptr_to_element(Coordinates(0, 0))); - const auto input_var = reinterpret_cast(_var->ptr_to_element(Coordinates(0, 0))); - const auto input_gamma = (_gamma != nullptr) ? reinterpret_cast(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr; - const auto input_beta = (_beta != nullptr) ? reinterpret_cast(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr; - - qint8x16_t mean_vec = vdupq_n_qs8(0); - qint8x16_t var_vec = vdupq_n_qs8(0); - qint8x16_t gamma_vec = vdupq_n_qs8(sqcvt_qs8_f32(1, fixed_point_position)); - qint8x16_t beta_vec = vdupq_n_qs8(sqcvt_qs8_f32(0, fixed_point_position)); - qint8x16_t denominator = vdupq_n_qs8(0); - const qint8x16_t epsilon_vec = vdupq_n_qs8(sqcvt_qs8_f32(_epsilon, fixed_point_position)); - execute_window_loop(window, [&](const Coordinates & id) - { - if(slice != id.z()) - { - // Conctruct vectors - mean_vec = vdupq_n_qs8(*(input_mean + id.z())); - var_vec = vdupq_n_qs8(*(input_var + id.z())); - if(input_gamma != nullptr) - { - gamma_vec = vdupq_n_qs8(*(input_gamma + id.z())); - } - if(input_beta != nullptr) - { - beta_vec = vdupq_n_qs8(*(input_beta + id.z())); - } - - // Calculate denominator - denominator = vqinvsqrtq_qs8(vqaddq_qs8(var_vec, epsilon_vec), fixed_point_position); - slice = id.z(); - } - - // Calculate x bar and store results - const qint8x16_t numerator = vqsubq_qs8(vld1q_qs8(reinterpret_cast(input.ptr())), mean_vec); - const qint8x16_t x_bar = vqmulq_qs8(numerator, denominator, fixed_point_position); - vst1q_qs8(reinterpret_cast(output.ptr()), vqmlaq_qs8(beta_vec, x_bar, gamma_vec, fixed_point_position)); - }, - input, output); -} - -template -void NEBatchNormalizationLayerKernel::batch_normalization_qs16(const Window &window) -{ - static_assert(!fused_activation, "Activation is not supported for QS16"); - - Iterator input(_input, window); - Iterator output(_output, window); - - // Hold information about the current feature map we are iterating. - // Only compute denominator and NEON vectors once per feature map. - int slice = -1; - - const int fixed_point_position = _input->info()->fixed_point_position(); - const auto input_mean = reinterpret_cast(_mean->ptr_to_element(Coordinates(0, 0))); - const auto input_var = reinterpret_cast(_var->ptr_to_element(Coordinates(0, 0))); - const auto input_gamma = (_gamma != nullptr) ? reinterpret_cast(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr; - const auto input_beta = (_beta != nullptr) ? reinterpret_cast(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr; - - qint16x8_t mean_vec = vdupq_n_qs16(0); - qint16x8_t var_vec = vdupq_n_qs16(0); - qint16x8_t gamma_vec = vdupq_n_qs16(sqcvt_qs16_f32(1, fixed_point_position)); - qint16x8_t beta_vec = vdupq_n_qs16(sqcvt_qs16_f32(0, fixed_point_position)); - qint16x8_t denominator = vdupq_n_qs16(0); - const qint16x8_t epsilon_vec = vdupq_n_qs16(sqcvt_qs16_f32(_epsilon, fixed_point_position)); - execute_window_loop(window, [&](const Coordinates & id) - { - if(slice != id.z()) - { - // Conctruct vectors - mean_vec = vdupq_n_qs16(*(input_mean + id.z())); - var_vec = vdupq_n_qs16(*(input_var + id.z())); - if(input_gamma != nullptr) - { - gamma_vec = vdupq_n_qs16(*(input_gamma + id.z())); - } - if(input_beta != nullptr) - { - beta_vec = vdupq_n_qs16(*(input_beta + id.z())); - } - - // Calculate denominator - denominator = vqinvsqrtq_qs16(vqaddq_qs16(var_vec, epsilon_vec), fixed_point_position); - slice = id.z(); - } - - // Calculate x bar and store results - const qint16x8_t numerator = vqsubq_qs16(vld1q_qs16(reinterpret_cast(input.ptr())), mean_vec); - const qint16x8_t x_bar = vqmulq_qs16(numerator, denominator, fixed_point_position); - vst1q_qs16(reinterpret_cast(output.ptr()), vqmlaq_qs16(beta_vec, x_bar, gamma_vec, fixed_point_position)); - }, - input, output); -} - template void NEBatchNormalizationLayerKernel::batch_normalization_fp16_nchw(const Window &window) { @@ -406,12 +296,6 @@ void NEBatchNormalizationLayerKernel::configure_non_fused() const bool is_nhwc = _input->info()->data_layout() == DataLayout::NHWC; switch(_input->info()->data_type()) { - case DataType::QS8: - _func = &NEBatchNormalizationLayerKernel::batch_normalization_qs8; - break; - case DataType::QS16: - _func = &NEBatchNormalizationLayerKernel::batch_normalization_qs16; - break; case DataType::F16: _func = (is_nhwc) ? &NEBatchNormalizationLayerKernel::batch_normalization_fp16_nhwc : &NEBatchNormalizationLayerKernel::batch_normalization_fp16_nchw; break; diff --git a/src/core/NEON/kernels/NECol2ImKernel.cpp b/src/core/NEON/kernels/NECol2ImKernel.cpp index 9fda65feb4..d09d174e4f 100644 --- a/src/core/NEON/kernels/NECol2ImKernel.cpp +++ b/src/core/NEON/kernels/NECol2ImKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -50,8 +50,8 @@ TensorShape get_output_shape(const ITensorInfo *input, const Size2D &convolved_d Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &convolved_dims) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8, - DataType::U16, DataType::S16, DataType::QS16, + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, + DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32); @@ -60,7 +60,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), get_output_shape(input, convolved_dims)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); } return Status{}; diff --git a/src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.cpp b/src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.cpp index b3746bddf2..e581f221a3 100644 --- a/src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.cpp +++ b/src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.cpp @@ -65,7 +65,7 @@ void NEConvertFullyConnectedWeightsKernel::configure(const ITensor *input, ITens Status NEConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape, DataLayout data_layout) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8, DataType::U16, DataType::S16, DataType::QS16, DataType::U32, DataType::S32, + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::QS32, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); diff --git a/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp b/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp index 891a03c5cc..38443ca4a8 100644 --- a/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp +++ b/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, 2018 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -41,10 +41,6 @@ using namespace arm_compute; namespace { // Overloads of 128-bit vector loads -uint8x16_t loadq(const uint8_t *ptr) -{ - return vld1q_u8(ptr); -} uint16x8_t loadq(const uint16_t *ptr) { return vld1q_u16(ptr); @@ -54,10 +50,6 @@ uint32x4_t loadq(const uint32_t *ptr) return vld1q_u32(ptr); } // Overloads of 128-bit vector stores -void storeq(uint8_t *ptr, uint8x16_t val) -{ - return vst1q_u8(ptr, val); -} void storeq(uint16_t *ptr, uint16x8_t val) { return vst1q_u16(ptr, val); @@ -107,9 +99,8 @@ BorderSize NEDepthConcatenateLayerKernel::border_size() const void NEDepthConcatenateLayerKernel::configure(const ITensor *input, unsigned int depth_offset, ITensor *output) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output); ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) + depth_offset > output->info()->dimension(2)); ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) > output->info()->dimension(0)); ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) > output->info()->dimension(1)); @@ -129,10 +120,6 @@ void NEDepthConcatenateLayerKernel::configure(const ITensor *input, unsigned int switch(input->info()->data_type()) { - case DataType::QS8: - _func = &depth_concat; - break; - case DataType::QS16: case DataType::F16: _func = &depth_concat; break; diff --git a/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp b/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp index c29cb57513..8280b52fcb 100644 --- a/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp +++ b/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, 2017 ARM Limited. + * Copyright (c) 2016-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -40,13 +40,13 @@ class Coordinates; } // namespace arm_compute NEDepthConvertLayerKernel::NEDepthConvertLayerKernel() - : _input(nullptr), _output(nullptr), _policy(), _shift(0), _fixed_point_position_input(0), _fixed_point_position_output(0) + : _input(nullptr), _output(nullptr), _policy(), _shift(0) { } void NEDepthConvertLayerKernel::configure(ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::U16, DataType::QS16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::U16); _input = input; _output = input; @@ -58,48 +58,26 @@ void NEDepthConvertLayerKernel::configure(ITensor *input, ITensor *output, Conve // Auto initialize output shape if not initialized (We can only auto-configure the shape, datatype must be given) set_shape_if_empty(*output->info(), input->info()->tensor_shape()); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::U16, DataType::QS16, DataType::U32, DataType::S32, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::U16, DataType::U32, DataType::S32, DataType::F32); ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output); // Set output _output = output; } - // Set initial fixed point position of input and output - _fixed_point_position_input = input->info()->fixed_point_position(); - _fixed_point_position_output = _output->info()->fixed_point_position(); - - // Set the fixed point position to the output tensor if needed - if(is_data_type_fixed_point(input->info()->data_type()) && is_data_type_fixed_point(_output->info()->data_type())) - { - // If in-place set the fixed point position of the output tensor to be equal to shift - _fixed_point_position_output = (_input == _output) ? static_cast(_shift) : _fixed_point_position_output; - // Set fixed point position to output tensor - _output->info()->set_fixed_point_position(_fixed_point_position_output); - } - - ARM_COMPUTE_ERROR_ON(shift >= 8 && (!is_data_type_fixed_point(input->info()->data_type()) && !is_data_type_fixed_point(output->info()->data_type()))); + ARM_COMPUTE_ERROR_ON(shift >= 8); ARM_COMPUTE_ERROR_ON(input == output && (data_size_from_type(input->info()->data_type()) != data_size_from_type(output->info()->data_type()))); ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U8 && (output->info()->data_type() != DataType::S16 && output->info()->data_type() != DataType::U16 && output->info()->data_type() != DataType::S32), "Only data_types supported [in] U8 -> [out] U16, S16, S32"); - ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::QS8 && (output->info()->data_type() != DataType::QS8 && output->info()->data_type() != DataType::F32), - "Only data_types supported [in] QS8 -> [out] QS8, F32"); - ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U32), "Only data_types supported [in] U16 -> [out] U8, U32"); ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::S16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::S32), "Only data_types supported [in] S16 -> [out] U8, S32"); - ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::QS16 && (output->info()->data_type() != DataType::QS16 && output->info()->data_type() != DataType::F32), - "Only data_types supported [in] QS16 -> [out] QS16, F32"); - - ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::F32 && (output->info()->data_type() != DataType::QS8 && output->info()->data_type() != DataType::QS16), - "Only data_types supported [in] F32 -> [out] QS8, QS16"); - constexpr unsigned int num_elems_processed_per_iteration = 16; // Configure kernel window @@ -132,8 +110,6 @@ void NEDepthConvertLayerKernel::run(const Window &window, const ThreadInfo &info Iterator input(_input, window); Iterator output(_output, window); - bool in_place = (_input == _output); - switch(_input->info()->data_type()) { case DataType::U8: @@ -212,49 +188,6 @@ void NEDepthConvertLayerKernel::run(const Window &window, const ThreadInfo &info } break; } - case DataType::QS8: - { - switch(_output->info()->data_type()) - { - case DataType::QS8: - { - const int relative_shift = _fixed_point_position_output - _fixed_point_position_input; - /* Fixed point position conversion QS8 -> QS8 */ - if(relative_shift != 0 || !in_place) - { - const auto relative_shift_vec = vdupq_n_qs8(relative_shift); - execute_window_loop(window, [&](const Coordinates & id) - { - const qint8x16_t texels_qs8 = vld1q_qs8(reinterpret_cast(input.ptr())); - vst1q_qs8(reinterpret_cast(output.ptr()), vqrshlq_s8(texels_qs8, relative_shift_vec)); - }, - input, output); - } - break; - } - case DataType::F32: - { - /* Up-conversion QS8 -> F32 */ - execute_window_loop(window, [&](const Coordinates & id) - { - const qint8x16_t texels_qs8 = vld1q_qs8(reinterpret_cast(input.ptr())); - - float32x4x2_t texels_low = vcvt_f32_qs8(vget_low_s8(texels_qs8), _fixed_point_position_input); - float32x4x2_t texels_high = vcvt_f32_qs8(vget_high_s8(texels_qs8), _fixed_point_position_input); - - vst1q_f32(reinterpret_cast(output.ptr()), texels_low.val[0]); - vst1q_f32(reinterpret_cast(output.ptr()) + 4, texels_low.val[1]); - vst1q_f32(reinterpret_cast(output.ptr()) + 8, texels_high.val[0]); - vst1q_f32(reinterpret_cast(output.ptr()) + 12, texels_high.val[1]); - }, - input, output); - break; - } - default: - ARM_COMPUTE_ERROR("Output data type not supported"); - } - break; - } case DataType::S16: { switch(_output->info()->data_type()) @@ -408,116 +341,6 @@ void NEDepthConvertLayerKernel::run(const Window &window, const ThreadInfo &info } break; } - case DataType::QS16: - { - switch(_output->info()->data_type()) - { - case DataType::QS16: - { - const int relative_shift = _fixed_point_position_output - _fixed_point_position_input; - /* Fixed point position conversion QS16 -> QS16 */ - if(relative_shift != 0 || !in_place) - { - const auto relative_shift_vec = vdupq_n_qs16(relative_shift); - execute_window_loop(window, [&](const Coordinates & id) - { - const qint16x8x2_t texels_qs16 = - { - { - vld1q_qs16(reinterpret_cast(input.ptr())), - vld1q_qs16(reinterpret_cast(input.ptr()) + 8) - } - }; - vst1q_qs16(reinterpret_cast(output.ptr()), vqrshlq_s16(texels_qs16.val[0], relative_shift_vec)); - vst1q_qs16(reinterpret_cast(output.ptr()) + 8, vqrshlq_s16(texels_qs16.val[1], relative_shift_vec)); - }, - input, output); - } - break; - } - case DataType::F32: - { - /* Up-conversion QS16 -> F32 */ - execute_window_loop(window, [&](const Coordinates & id) - { - const int16x8x2_t texels_qs16 = - { - { - vld1q_s16(reinterpret_cast(input.ptr())), - vld1q_s16(reinterpret_cast(input.ptr()) + 8) - } - }; - - vst1q_f32(reinterpret_cast(output.ptr()), vcvt_f32_qs16(vget_low_s16(texels_qs16.val[0]), _fixed_point_position_input)); - vst1q_f32(reinterpret_cast(output.ptr()) + 4, vcvt_f32_qs16(vget_high_s16(texels_qs16.val[0]), _fixed_point_position_input)); - vst1q_f32(reinterpret_cast(output.ptr()) + 8, vcvt_f32_qs16(vget_low_s16(texels_qs16.val[1]), _fixed_point_position_input)); - vst1q_f32(reinterpret_cast(output.ptr()) + 12, vcvt_f32_qs16(vget_high_s16(texels_qs16.val[1]), _fixed_point_position_input)); - }, - input, output); - break; - } - default: - ARM_COMPUTE_ERROR("Output data type not supported"); - } - break; - } - case DataType::F32: - { - switch(_output->info()->data_type()) - { - case DataType::QS8: - { - /* Down-conversion F32 -> QS8 */ - execute_window_loop(window, [&](const Coordinates & id) - { - const float32x4x4_t texels_f32 = - { - { - vld1q_f32(reinterpret_cast(input.ptr())), - vld1q_f32(reinterpret_cast(input.ptr()) + 4), - vld1q_f32(reinterpret_cast(input.ptr()) + 8), - vld1q_f32(reinterpret_cast(input.ptr()) + 12) - } - }; - - const qint8x16_t texels_s8 = vqcvtq_qs8_f32(texels_f32, _fixed_point_position_output); - - vst1q_s8(reinterpret_cast(output.ptr()), texels_s8); - }, - input, output); - break; - } - case DataType::QS16: - { - /* Down-conversion F32 -> QS16 */ - execute_window_loop(window, [&](const Coordinates & id) - { - const float32x4x2_t texels_f32_1 = - { - { - vld1q_f32(reinterpret_cast(input.ptr())), - vld1q_f32(reinterpret_cast(input.ptr()) + 4), - } - }; - const float32x4x2_t texels_f32_2 = - { - { - vld1q_f32(reinterpret_cast(input.ptr()) + 8), - vld1q_f32(reinterpret_cast(input.ptr()) + 12) - } - }; - - vst1q_s16(reinterpret_cast(output.ptr()), vqcvtq_qs16_f32(texels_f32_1, _fixed_point_position_output)); - vst1q_s16(reinterpret_cast(output.ptr()) + 8, vqcvtq_qs16_f32(texels_f32_2, _fixed_point_position_output)); - }, - input, output); - break; - } - default: - ARM_COMPUTE_ERROR("Output data type not supported"); - } - break; - } default: ARM_COMPUTE_ERROR("Not supported"); } diff --git a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp index 8cdf175d8a..09728e2a8d 100644 --- a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp +++ b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp @@ -115,7 +115,7 @@ public: in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration) { - auto vres = convolve_3x3(in_top, in_mid, in_low, vw_r0, vw_r1, vw_r2, 0, input_offset); + auto vres = convolve_3x3(in_top, in_mid, in_low, vw_r0, vw_r1, vw_r2, input_offset); store_results(p_out, vres); } } diff --git a/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp b/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp index cfd8eacfdd..5b43e2b14f 100644 --- a/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp +++ b/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp @@ -122,7 +122,6 @@ void NEDepthwiseIm2ColKernel::configure(const ITensor *input, ITensor *output, c { ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); ARM_COMPUTE_ERROR_ON(is_data_type_quantized_asymmetric(input->info()->data_type()) && has_bias); ARM_COMPUTE_ERROR_ON((input->info()->dimension(2) * depth_multiplier) != output->info()->dimension(2)); ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != (kernel_dims.width * kernel_dims.height + ((has_bias) ? 1 : 0))); diff --git a/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp b/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp index 8960d8a8af..86a6d1c1a8 100644 --- a/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp +++ b/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp @@ -89,7 +89,6 @@ void NEDepthwiseVectorToTensorKernel::configure(const ITensor *input, ITensor *o ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape); ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); _input = input; _output = output; diff --git a/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp index 36b17bfc4c..47fcf12874 100644 --- a/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp +++ b/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp @@ -88,7 +88,6 @@ void NEDepthwiseWeightsReshapeKernel::configure(const ITensor *input, ITensor *o { ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); ARM_COMPUTE_ERROR_ON(is_data_type_quantized_asymmetric(input->info()->data_type()) && (biases != nullptr)); ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != output->info()->dimension(1)); ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != (input->info()->dimension(0) * input->info()->dimension(1) + ((biases != nullptr) ? 1 : 0))); @@ -96,7 +95,6 @@ void NEDepthwiseWeightsReshapeKernel::configure(const ITensor *input, ITensor *o if(biases != nullptr) { ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); - ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, biases); ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != input->info()->dimension(2)); ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1); } diff --git a/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp b/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp index 4120e5f87a..47c895c594 100644 --- a/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp +++ b/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp @@ -54,7 +54,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c std::tuple validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *min_max) { // Output tensor auto initialization if not yet initialized - auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::F32, 0); + auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::F32); constexpr unsigned int num_elems_processed_per_iteration = 8; diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp index 5eafdf0363..54a046846a 100644 --- a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp +++ b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp @@ -43,34 +43,6 @@ using namespace arm_compute::detail; namespace { -template -qint16x8_t internal_vld1q(const qint16_t *in); - -template <> -qint16x8_t internal_vld1q<1>(const qint16_t *in) -{ - return vld1q_qs16(in); -} - -template <> -qint16x8_t internal_vld1q<2>(const qint16_t *in) -{ - const int16x8x2_t tmp = vld2q_s16(in); - return tmp.val[0]; -} - -template <> -qint16x8_t internal_vld1q<3>(const qint16_t *in) -{ - const int16x8x3_t tmp = vld3q_s16(in); - return tmp.val[0]; -} - -inline qint16x8_t internal_vdupq_n(qint16_t v) -{ - return vdupq_n_qs16(v); -} - #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC template float16x8_t internal_vld1q(const float16_t *in); @@ -105,15 +77,13 @@ inline void internal_vst1q(float16_t *p, const float16x8_t &v) vst1q_f16(p, v); } -float16x8_t internal_vmull(const float16x8_t &x, const float16x8_t &y, int fixed_point_position) +float16x8_t internal_vmull(const float16x8_t &x, const float16x8_t &y) { - ARM_COMPUTE_UNUSED(fixed_point_position); return vmulq_f16(x, y); } -inline float16x8_t internal_vmlal(const float16x8_t &x, const float16x8_t &y, const float16x8_t &z, int fixed_point_position) +inline float16x8_t internal_vmlal(const float16x8_t &x, const float16x8_t &y, const float16x8_t &z) { - ARM_COMPUTE_UNUSED(fixed_point_position); return vaddq_f16(x, vmulq_f16(y, z)); } #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ @@ -151,107 +121,16 @@ inline void internal_vst1q(float *p, const float32x4_t &v) vst1q_f32(p, v); } -float32x4_t internal_vmull(const float32x4_t &x, const float32x4_t &y, int fixed_point_position) +float32x4_t internal_vmull(const float32x4_t &x, const float32x4_t &y) { - ARM_COMPUTE_UNUSED(fixed_point_position); return vmulq_f32(x, y); } -inline float32x4_t internal_vmlal(const float32x4_t &x, const float32x4_t &y, const float32x4_t &z, int fixed_point_position) +inline float32x4_t internal_vmlal(const float32x4_t &x, const float32x4_t &y, const float32x4_t &z) { - ARM_COMPUTE_UNUSED(fixed_point_position); return vmlaq_f32(x, y, z); } -template -qint8x8_t internal_vld1q(const qint8_t *in); - -template <> -qint8x8_t internal_vld1q<1>(const qint8_t *in) -{ - return vld1_qs8(in); -} - -template <> -qint8x8_t internal_vld1q<2>(const qint8_t *in) -{ - const qint8x8x2_t tmp = vld2_s8(in); - return tmp.val[0]; -} - -template <> -qint8x8_t internal_vld1q<3>(const qint8_t *in) -{ - const qint8x8x3_t tmp = vld3_s8(in); - return tmp.val[0]; -} - -inline qint8x8_t internal_vdupq_n(qint8_t v) -{ - return vdup_n_qs8(v); -} - -inline qint16x8_t internal_vmull(const qint8x8_t &x, const qint8x8_t &y, int fixed_point_position) -{ - return vmull_qs8(x, y, fixed_point_position); -} - -inline qint16x8_t internal_vmlal(const qint16x8_t &x, const qint8x8_t &y, const qint8x8_t &z, int fixed_point_position) -{ - return vqmlal_qs8(x, y, z, fixed_point_position); -} - -inline void internal_vst1q(qint16_t *p, const qint16x8_t &v) -{ - vst1q_qs16(p, v); -} - -inline void internal_vst1q(int32_t *p, const qint32x4x2_t &v) -{ - vst1q_s32(p, v.val[0]); - vst1q_s32(p + 4, v.val[1]); -} - -template -qint32x4x2_t internal_vld1q(const qint32_t *in); - -template <> -qint32x4x2_t internal_vld1q<1>(const qint32_t *in) -{ - const qint32x4x2_t r = - { - { - vld1q_s32(in), - vld1q_s32(in + 4) - } - }; - return r; -} - -inline qint32x4x2_t internal_vmull(const qint16x8_t &x, const qint16x8_t &y, int fixed_point_position) -{ - const qint32x4x2_t r = - { - { - vmull_qs16(vget_low_s16(x), vget_low_s16(y), fixed_point_position), - vmull_qs16(vget_high_s16(x), vget_high_s16(y), fixed_point_position), - } - }; - return r; -} - -inline qint32x4x2_t internal_vmlal(const qint32x4x2_t &x, const qint16x8_t &y, const qint16x8_t &z, int fixed_point_position) -{ - const qint32x4x2_t r = - { - { - vqmlal_qs16(x.val[0], vget_low_s16(y), vget_low_s16(z), fixed_point_position), - vqmlal_qs16(x.val[1], vget_high_s16(y), vget_high_s16(z), fixed_point_position) - } - }; - return r; -} - constexpr int small_tensor_size_optim = 8; inline bool run_optim_small_tensor_info(const ITensorInfo *t) { @@ -355,21 +234,20 @@ public: static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration, const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info) { - const int input_stride_x = input->info()->strides_in_bytes().x(); - const int input_stride_y = input->info()->strides_in_bytes().y(); - const int input_stride_z = input->info()->strides_in_bytes().z(); - const int output_stride_y = output->info()->strides_in_bytes().y(); - const int output_stride_z = output->info()->strides_in_bytes().z(); - const int kernel_stride_z = weights->info()->strides_in_bytes().z(); - const int kernel_stride_w = weights->info()->strides_in_bytes()[3]; - const int output_w = output->info()->dimension(0); - const int output_h = output->info()->dimension(1); - const int range_z = window.z().end() - window.z().start(); - const int kernel_depth = weights->info()->dimension(Window::DimZ); - const unsigned int conv_stride_y = std::get<1>(conv_info.stride()); - const unsigned int conv_pad_left = conv_info.pad_left(); - const unsigned int conv_pad_top = conv_info.pad_top(); - const int fixed_point_position = input->info()->fixed_point_position(); + const int input_stride_x = input->info()->strides_in_bytes().x(); + const int input_stride_y = input->info()->strides_in_bytes().y(); + const int input_stride_z = input->info()->strides_in_bytes().z(); + const int output_stride_y = output->info()->strides_in_bytes().y(); + const int output_stride_z = output->info()->strides_in_bytes().z(); + const int kernel_stride_z = weights->info()->strides_in_bytes().z(); + const int kernel_stride_w = weights->info()->strides_in_bytes()[3]; + const int output_w = output->info()->dimension(0); + const int output_h = output->info()->dimension(1); + const int range_z = window.z().end() - window.z().start(); + const int kernel_depth = weights->info()->dimension(Window::DimZ); + const unsigned int conv_stride_y = std::get<1>(conv_info.stride()); + const unsigned int conv_pad_left = conv_info.pad_left(); + const unsigned int conv_pad_top = conv_info.pad_top(); // setup output window for the iterator Window window_out = window; @@ -414,7 +292,7 @@ public: auto p_out = reinterpret_cast(p_out_base + oh * output_stride_y); for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_val += num_elems_read_per_iteration, p_out += num_elems_written_per_iteration) { - internal_vst1q(p_out, internal_vmull(vk, internal_vld1q(in_val), fixed_point_position)); + internal_vst1q(p_out, internal_vmull(vk, internal_vld1q(in_val))); } } } @@ -431,7 +309,7 @@ public: auto p_out = reinterpret_cast(p_out_base + oh * output_stride_y); for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_val += num_elems_read_per_iteration, p_out += num_elems_written_per_iteration) { - internal_vst1q(p_out, internal_vmlal(internal_vld1q<1>(p_out), vk, internal_vld1q(in_val), fixed_point_position)); + internal_vst1q(p_out, internal_vmlal(internal_vld1q<1>(p_out), vk, internal_vld1q(in_val))); } } } @@ -469,7 +347,7 @@ void accumulate_results<3>(float16_t *buffer, const float16x8x2_t &values) template float32x4x2_t convolve_5x5(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4, - const float *m0, const float *m1, const float *m2, const float *m3, const float *m4, int fixed_point_position); + const float *m0, const float *m1, const float *m2, const float *m3, const float *m4); inline float32x4x3_t load_matrix_hi(const float *const m0, const float *const m1, const float *const m2) { @@ -511,9 +389,8 @@ inline float32x4x3_t load_input(const float *const in) template <> inline float32x4x2_t convolve_5x5<1>(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4, - const float *m0, const float *m1, const float *m2, const float *m3, const float *m4, int fixed_point_position) + const float *m0, const float *m1, const float *m2, const float *m3, const float *m4) { - ARM_COMPUTE_UNUSED(fixed_point_position); const float32x4x3_t vin0 = load_input(in_0); const float32x4x3_t vin1 = load_input(in_1); const float32x4x3_t vin2 = load_input(in_2); @@ -601,10 +478,9 @@ inline float32x4x2_t convolve_5x5<1>(const float *in_0, const float *in_1, const template <> inline float32x4x2_t convolve_5x5<2>(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4, - const float *m0, const float *m1, const float *m2, const float *m3, const float *m4, int fixed_point_position) + const float *m0, const float *m1, const float *m2, const float *m3, const float *m4) { - ARM_COMPUTE_UNUSED(fixed_point_position); - float32x4x2_t out = convolve_5x5<1>(in_0, in_1, in_2, in_3, in_4, m0, m1, m2, m3, m4, fixed_point_position); + float32x4x2_t out = convolve_5x5<1>(in_0, in_1, in_2, in_3, in_4, m0, m1, m2, m3, m4); out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1); out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2); out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3); @@ -613,9 +489,9 @@ inline float32x4x2_t convolve_5x5<2>(const float *in_0, const float *in_1, const template <> inline float32x4x2_t convolve_5x5<3>(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4, - const float *m0, const float *m1, const float *m2, const float *m3, const float *m4, int fixed_point_position) + const float *m0, const float *m1, const float *m2, const float *m3, const float *m4) { - float32x4x2_t out = convolve_5x5<1>(in_0, in_1, in_2, in_3, in_4, m0, m1, m2, m3, m4, fixed_point_position); + float32x4x2_t out = convolve_5x5<1>(in_0, in_1, in_2, in_3, in_4, m0, m1, m2, m3, m4); out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1); return out; } @@ -642,28 +518,6 @@ void accumulate_results<3>(float *buffer, const float32x4x2_t &values) vst1_f32(buffer, vadd_f32(vld1_f32(buffer), vget_low_f32(values.val[0]))); } -template -void accumulate_results(qint16_t *buffer, const qint16x8x2_t &values); - -template <> -void accumulate_results<1>(qint16_t *buffer, const qint16x8x2_t &values) -{ - vst1q_qs16(buffer, vqaddq_qs16(vld1q_qs16(buffer), values.val[0])); - vst1q_qs16(buffer + 8, vqaddq_qs16(vld1q_qs16(buffer + 8), values.val[1])); -} - -template <> -void accumulate_results<2>(qint16_t *buffer, const qint16x8x2_t &values) -{ - vst1q_qs16(buffer, vqaddq_qs16(vld1q_qs16(buffer), values.val[0])); -} - -template <> -void accumulate_results<3>(qint16_t *buffer, const qint16x8x2_t &values) -{ - vst1_qs16(buffer, vqadd_qs16(vld1_qs16(buffer), vget_low_s16(values.val[0]))); -} - template class convolver_nhwc { @@ -745,7 +599,7 @@ public: const auto we_addr = reinterpret_cast(we_addr_base1 + x * kernel_stride_x); const auto we_values = internal_vld1q<1>(we_addr); - out_values = internal_vmlal(out_values, in_values, we_values, 0); + out_values = internal_vmlal(out_values, in_values, we_values); } out_val += out_values[0]; @@ -784,24 +638,23 @@ public: const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info) { ARM_COMPUTE_UNUSED(num_elems_read_per_iteration); - const int input_stride_x = input->info()->strides_in_bytes().x(); - const int input_stride_y = input->info()->strides_in_bytes().y(); - const int input_stride_z = input->info()->strides_in_bytes().z(); - const int output_stride_y = output->info()->strides_in_bytes().y(); - const int output_stride_z = output->info()->strides_in_bytes().z(); - const int kernel_stride_x = weights->info()->strides_in_bytes().x(); - const int kernel_stride_y = weights->info()->strides_in_bytes().y(); - const int kernel_stride_z = weights->info()->strides_in_bytes().z(); - const int kernel_stride_w = weights->info()->strides_in_bytes()[3]; - const int output_w = output->info()->dimension(0); - const int output_h = output->info()->dimension(1); - const int num_planes_z = window.z().end() - window.z().start(); - const int delta_input = get_input_num_elems_processed(num_elems_written_per_iteration); - const int kernel_depth = weights->info()->dimension(Window::DimZ); - const unsigned int conv_stride_y = std::get<1>(conv_info.stride()); - const unsigned int conv_pad_left = conv_info.pad_left(); - const unsigned int conv_pad_top = conv_info.pad_top(); - const int fixed_point_position = input->info()->fixed_point_position(); + const int input_stride_x = input->info()->strides_in_bytes().x(); + const int input_stride_y = input->info()->strides_in_bytes().y(); + const int input_stride_z = input->info()->strides_in_bytes().z(); + const int output_stride_y = output->info()->strides_in_bytes().y(); + const int output_stride_z = output->info()->strides_in_bytes().z(); + const int kernel_stride_x = weights->info()->strides_in_bytes().x(); + const int kernel_stride_y = weights->info()->strides_in_bytes().y(); + const int kernel_stride_z = weights->info()->strides_in_bytes().z(); + const int kernel_stride_w = weights->info()->strides_in_bytes()[3]; + const int output_w = output->info()->dimension(0); + const int output_h = output->info()->dimension(1); + const int num_planes_z = window.z().end() - window.z().start(); + const int delta_input = get_input_num_elems_processed(num_elems_written_per_iteration); + const int kernel_depth = weights->info()->dimension(Window::DimZ); + const unsigned int conv_stride_y = std::get<1>(conv_info.stride()); + const unsigned int conv_pad_left = conv_info.pad_left(); + const unsigned int conv_pad_top = conv_info.pad_top(); // setup output window for the iterator Window window_out = window; @@ -864,7 +717,7 @@ public: for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration) { - auto vres = convolve_3x3(in_top, in_mid, in_low, vk_r0, vk_r1, vk_r2, fixed_point_position); + auto vres = convolve_3x3(in_top, in_mid, in_low, vk_r0, vk_r1, vk_r2); store_results(p_out, vres); } } @@ -889,7 +742,7 @@ public: for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration) { - auto vres = convolve_3x3(in_top, in_mid, in_low, vk_r0, vk_r1, vk_r2, fixed_point_position); + auto vres = convolve_3x3(in_top, in_mid, in_low, vk_r0, vk_r1, vk_r2); accumulate_results(p_out, vres); } } @@ -908,24 +761,23 @@ public: const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info) { ARM_COMPUTE_UNUSED(num_elems_read_per_iteration); - const int input_stride_x = input->info()->strides_in_bytes().x(); - const int input_stride_y = input->info()->strides_in_bytes().y(); - const int input_stride_z = input->info()->strides_in_bytes().z(); - const int output_stride_y = output->info()->strides_in_bytes().y(); - const int output_stride_z = output->info()->strides_in_bytes().z(); - const int kernel_stride_x = weights->info()->strides_in_bytes().x(); - const int kernel_stride_y = weights->info()->strides_in_bytes().y(); - const int kernel_stride_z = weights->info()->strides_in_bytes().z(); - const int kernel_stride_w = weights->info()->strides_in_bytes()[3]; - const int output_w = output->info()->dimension(0); - const int output_h = output->info()->dimension(1); - const int num_planes_z = window.z().end() - window.z().start(); - const int delta_input = get_input_num_elems_processed(num_elems_written_per_iteration); - const int kernel_depth = weights->info()->dimension(Window::DimZ); - const unsigned int conv_stride_y = std::get<1>(conv_info.stride()); - const unsigned int conv_pad_left = conv_info.pad_left(); - const unsigned int conv_pad_top = conv_info.pad_top(); - const int fixed_point_position = input->info()->fixed_point_position(); + const int input_stride_x = input->info()->strides_in_bytes().x(); + const int input_stride_y = input->info()->strides_in_bytes().y(); + const int input_stride_z = input->info()->strides_in_bytes().z(); + const int output_stride_y = output->info()->strides_in_bytes().y(); + const int output_stride_z = output->info()->strides_in_bytes().z(); + const int kernel_stride_x = weights->info()->strides_in_bytes().x(); + const int kernel_stride_y = weights->info()->strides_in_bytes().y(); + const int kernel_stride_z = weights->info()->strides_in_bytes().z(); + const int kernel_stride_w = weights->info()->strides_in_bytes()[3]; + const int output_w = output->info()->dimension(0); + const int output_h = output->info()->dimension(1); + const int num_planes_z = window.z().end() - window.z().start(); + const int delta_input = get_input_num_elems_processed(num_elems_written_per_iteration); + const int kernel_depth = weights->info()->dimension(Window::DimZ); + const unsigned int conv_stride_y = std::get<1>(conv_info.stride()); + const unsigned int conv_pad_left = conv_info.pad_left(); + const unsigned int conv_pad_top = conv_info.pad_top(); // setup output window for the iterator Window window_out = window; @@ -976,7 +828,7 @@ public: for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_0 += delta_input, in_1 += delta_input, in_2 += delta_input, in_3 += delta_input, in_4 += delta_input, p_out += num_elems_written_per_iteration) { - auto vres = convolve_5x5(in_0, in_1, in_2, in_3, in_4, ptr_k_r0, ptr_k_r1, ptr_k_r2, ptr_k_r3, ptr_k_r4, fixed_point_position); + auto vres = convolve_5x5(in_0, in_1, in_2, in_3, in_4, ptr_k_r0, ptr_k_r1, ptr_k_r2, ptr_k_r3, ptr_k_r4); store_results(p_out, vres); } } @@ -1001,7 +853,7 @@ public: for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_0 += delta_input, in_1 += delta_input, in_2 += delta_input, in_3 += delta_input, in_4 += delta_input, p_out += num_elems_written_per_iteration) { - auto vres = convolve_5x5(in_0, in_1, in_2, in_3, in_4, ptr_k_r0, ptr_k_r1, ptr_k_r2, ptr_k_r3, ptr_k_r4, fixed_point_position); + auto vres = convolve_5x5(in_0, in_1, in_2, in_3, in_4, ptr_k_r0, ptr_k_r1, ptr_k_r2, ptr_k_r3, ptr_k_r4); accumulate_results(p_out, vres); } } @@ -1120,7 +972,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); const DataLayout data_layout = input->data_layout(); @@ -1140,11 +992,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*input, *weights, conv_info); DataType data_type = input->data_type(); - if(is_data_type_fixed_point(data_type)) - { - // Promote data type in case of fixed point - data_type = ((data_type == DataType::QS8) ? DataType::QS16 : DataType::QS32); - } ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); ARM_COMPUTE_RETURN_ERROR_ON(output->data_type() != data_type); @@ -1180,11 +1027,9 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen { #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - case DataType::QS8: - case DataType::QS16: num_elems_written_per_iteration = 8; break; +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ case DataType::F32: if(run_optim_small_tensor_info(input)) { @@ -1215,13 +1060,11 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen break; #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - case DataType::QS8: - case DataType::QS16: num_weight_elems_read_per_row = 8 + kernel_size - 1; num_elems_read_per_iteration = 24; num_elems_written_per_iteration = 32 >> conv_stride_x; break; +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ default: ARM_COMPUTE_ERROR("Data type not supported."); break; @@ -1315,14 +1158,8 @@ void NEDirectConvolutionLayerKernel::configure(const ITensor *input, const ITens DataType data_type = input->info()->data_type(); - if(is_data_type_fixed_point(data_type)) - { - // Promote data type in case of fixed point - data_type = ((data_type == DataType::QS8) ? DataType::QS16 : DataType::QS32); - } - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output->info(), output_shape, 1, data_type, input->info()->fixed_point_position()); + auto_init_if_empty(*output->info(), output_shape, 1, data_type); // Perform validation step ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), output->info(), conv_info)); @@ -1371,12 +1208,6 @@ void NEDirectConvolutionLayerKernel::run(const Window &window, const ThreadInfo { switch(_input->info()->data_type()) { - case DataType::QS8: - convolve_1x1(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info); - break; - case DataType::QS16: - convolve_1x1(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info); - break; case DataType::F32: convolve_1x1(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info); break; @@ -1395,9 +1226,6 @@ void NEDirectConvolutionLayerKernel::run(const Window &window, const ThreadInfo { switch(_input->info()->data_type()) { - case DataType::QS8: - convolve_3x3(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info); - break; case DataType::F32: convolve_3x3(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info); break; diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp index 37a3804289..e4cd4d0465 100644 --- a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp +++ b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp @@ -45,22 +45,15 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, - DataType::QS16, DataType::F16, + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, + DataType::F16, DataType::QS32, DataType::S32, DataType::F32); if(bias != nullptr) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::QS32, DataType::S32, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::F16, DataType::QS32, DataType::S32, DataType::F32); - if(is_data_type_fixed_point(input->data_type())) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QS8 && bias->data_type() != DataType::QS8, "Wrong data type for bias"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QS16 && bias->data_type() != DataType::QS8, "Wrong data type for bias"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QS32 && bias->data_type() != DataType::QS16, "Wrong data type for bias"); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, bias); - } - else if(is_data_type_quantized_asymmetric(input->data_type())) + if(is_data_type_quantized_asymmetric(input->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); } @@ -80,17 +73,10 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con // Checks performed when output is configured if((output != nullptr) && (output->total_size() != 0)) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); - if(is_data_type_fixed_point(input->data_type())) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QS8 && output->data_type() != DataType::QS8, "Wrong data type for output"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QS16 && output->data_type() != DataType::QS8, "Wrong data type for output"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QS32 && output->data_type() != DataType::QS16, "Wrong data type for output"); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output); - } - else if(is_data_type_quantized_asymmetric(output->data_type())) + if(is_data_type_quantized_asymmetric(output->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::S32 && output->data_type() != DataType::QASYMM8, "Wrong data type for bias"); } @@ -168,81 +154,24 @@ inline float32x4_t internal_vld1q(const float *in) { return vld1q_f32(in); } -inline qint8x16_t internal_vld1q(const qint8_t *in) -{ - return vld1q_qs8(in); -} -inline qint16x8_t internal_vld1q(const qint16_t *in) -{ - return vld1q_qs16(in); -} -inline qint32x4_t internal_vld1q(const qint32_t *in) -{ - return vld1q_s32(in); -} // Internal store inline void internal_vst1q(float *p, const float32x4_t &v) { vst1q_f32(p, v); } -inline void internal_vst1q(qint8_t *p, const qint8x16_t &v) -{ - vst1q_qs8(p, v); -} -inline void internal_vst1q(qint8_t *p, const qint16x8_t &v) -{ - vst1_qs8(p, vqmovn_s16(v)); -} -inline void internal_vst1q(qint16_t *p, const qint16x8_t &v) -{ - vst1q_qs16(p, v); -} -inline void internal_vst1q(qint32_t *p, const qint32x4_t &v) -{ - vst1q_s32(p, v); -} - -inline void internal_vst1q(qint16_t *p, const qint32x4_t &v) -{ - vst1_qs16(p, vqmovn_qs32(v)); -} // Internal vdup inline float32x4_t internal_vdupq_n(float v) { return vdupq_n_f32(v); } -inline qint8x16_t internal_vdupq_n(qint8_t v) -{ - return vdupq_n_qs8(v); -} -inline qint16x8_t internal_vdupq_n(qint16_t v) -{ - return vdupq_n_qs16(v); -} -inline qint32x4_t internal_vdupq_n(qint32_t v) -{ - return vdupq_n_qs32(v); -} // Internal vadd inline float32x4_t internal_vqaddq(const float32x4_t &x, const float32x4_t &y) { return vaddq_f32(x, y); } -inline qint8x16_t internal_vqaddq(const qint8x16_t &x, const qint8x16_t &y) -{ - return vqaddq_qs8(x, y); -} -inline qint16x8_t internal_vqaddq(const qint16x8_t &x, const qint16x8_t &y) -{ - return vqaddq_qs16(x, y); -} -inline qint32x4_t internal_vqaddq(const qint32x4_t &x, const qint32x4_t &y) -{ - return vqaddq_qs32(x, y); -} #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC inline float16x8_t internal_vld1q(const float16_t *in) @@ -494,39 +423,6 @@ void NEDirectConvolutionLayerOutputStageKernel::configure(ITensor *input, const { switch(input->info()->data_type()) { - case DataType::QS8: - { - if(bias == nullptr) - { - _func = (output == nullptr) ? &output_stage : &output_stage; - } - else - { - _func = (output == nullptr) ? &output_stage : &output_stage; - } - break; - } - case DataType::QS16: - { - if(bias != nullptr && bias->info()->data_type() == DataType::QS8) - { - _func = (output == nullptr) ? &output_stage : &output_stage; - } - else if(bias == nullptr) - { - _func = (output == nullptr) ? &output_stage : &output_stage; - } - else - { - ARM_COMPUTE_ERROR("Not implemented"); - } - break; - } - case DataType::QS32: - { - _func = (output == nullptr) ? &output_stage : &output_stage; - break; - } case DataType::S32: { _func = (bias == nullptr) ? &output_stage : &output_stage; diff --git a/src/core/NEON/kernels/NEFillBorderKernel.cpp b/src/core/NEON/kernels/NEFillBorderKernel.cpp index 747b8b1bfe..3d08cafa93 100644 --- a/src/core/NEON/kernels/NEFillBorderKernel.cpp +++ b/src/core/NEON/kernels/NEFillBorderKernel.cpp @@ -105,8 +105,8 @@ NEFillBorderKernel::NEFillBorderKernel() void NEFillBorderKernel::configure(ITensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(tensor, 1, DataType::U8, DataType::QS8, DataType::QASYMM8, - DataType::QS16, DataType::U16, DataType::S16, + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(tensor, 1, DataType::U8, DataType::QASYMM8, + DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32); @@ -147,7 +147,6 @@ void NEFillBorderKernel::run(const Window &window, const ThreadInfo &info) case DataType::U8: fill_constant_value_single_channel(window); break; - case DataType::QS8: case DataType::S8: fill_constant_value_single_channel(window); break; @@ -155,7 +154,6 @@ void NEFillBorderKernel::run(const Window &window, const ThreadInfo &info) fill_constant_value_single_channel(window); break; case DataType::S16: - case DataType::QS16: fill_constant_value_single_channel(window); break; case DataType::U32: @@ -192,7 +190,6 @@ void NEFillBorderKernel::run(const Window &window, const ThreadInfo &info) case DataType::U8: fill_replicate_single_channel(window); break; - case DataType::QS8: case DataType::S8: fill_replicate_single_channel(window); break; @@ -200,7 +197,6 @@ void NEFillBorderKernel::run(const Window &window, const ThreadInfo &info) fill_replicate_single_channel(window); break; case DataType::S16: - case DataType::QS16: fill_replicate_single_channel(window); break; case DataType::U32: diff --git a/src/core/NEON/kernels/NEFloorKernel.cpp b/src/core/NEON/kernels/NEFloorKernel.cpp index 72b652d5dc..872ac2661e 100644 --- a/src/core/NEON/kernels/NEFloorKernel.cpp +++ b/src/core/NEON/kernels/NEFloorKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -40,7 +40,7 @@ void NEFloorKernel::configure(const ITensor *input, ITensor *output) ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); // Auto initialize output - auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position()); + auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type()); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output); diff --git a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp index 12755a45f8..6519a39b9c 100644 --- a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp +++ b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp @@ -44,11 +44,10 @@ namespace { Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::U8, DataType::S8, - DataType::QS16, DataType::U16, DataType::S16, DataType::U32, DataType::S32, + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::U8, DataType::S8, + DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); if(output->total_size() != 0) { @@ -57,7 +56,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) output_shape.set(1, std::ceil(input->dimension(1) / 4.0f)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); } return Status{}; diff --git a/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp index cab3c7a58f..421a6f0ef9 100644 --- a/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp @@ -43,9 +43,8 @@ namespace { inline Status validate_arguments(const ITensorInfo *accum, const ITensorInfo *biases) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(biases, accum); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(biases, accum); ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != accum->dimension(0)); @@ -161,33 +160,6 @@ void NEGEMMMatrixAccumulateBiasesKernel::run(const Window &window, const ThreadI break; } #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - case DataType::QS8: - { - execute_window_loop(window, [&](const Coordinates & id) - { - const qint8x16_t accum = vld1q_qs8(reinterpret_cast(in0_out.ptr())); - const qint8x16_t biases = vld1q_qs8(reinterpret_cast(in1.ptr())); - - vst1q_qs8(reinterpret_cast(in0_out.ptr()), vqaddq_qs8(accum, biases)); - }, - in0_out, in1); - break; - } - case DataType::QS16: - { - execute_window_loop(window, [&](const Coordinates & id) - { - qint16x8x2_t accum = vld2q_s16(reinterpret_cast(in0_out.ptr())); - const qint16x8x2_t biases = vld2q_s16(reinterpret_cast(in1.ptr())); - - accum.val[0] = vqaddq_qs16(accum.val[0], biases.val[0]); - accum.val[1] = vqaddq_qs16(accum.val[1], biases.val[1]); - - vst2q_s16(reinterpret_cast(in0_out.ptr()), accum); - }, - in0_out, in1); - break; - } default: ARM_COMPUTE_ERROR("Data type not supported"); break; diff --git a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp index dfba74355b..d02504329a 100644 --- a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, 2017 ARM Limited. + * Copyright (c) 2016-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -91,54 +91,6 @@ void matrix_addition_f16(const ITensor *input, ITensor *output, const Window &wi } #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ -void matrix_addition_qs8(const ITensor *input, ITensor *output, const Window &window, float beta) -{ - const int fixed_point_position = input->info()->fixed_point_position(); - const qint8x16_t beta_qs8 = vdupq_n_qs8(sqcvt_qs8_f32(beta, fixed_point_position)); - - Iterator in(input, window); - Iterator out(output, window); - - execute_window_loop(window, [&](const Coordinates & id) - { - const auto in_ptr = reinterpret_cast(in.ptr()); - const auto out_ptr = reinterpret_cast(out.ptr()); - - qint8x16_t alpha_ab = vld1q_qs8(out_ptr); - const qint8x16_t c = vld1q_qs8(in_ptr); - - // Multiply matrix C by its weight and accumulate - alpha_ab = vqmlaq_qs8(alpha_ab, c, beta_qs8, fixed_point_position); - - vst1q_qs8(out_ptr, alpha_ab); - }, - in, out); -} - -void matrix_addition_qs16(const ITensor *input, ITensor *output, const Window &window, float beta) -{ - const int fixed_point_position = input->info()->fixed_point_position(); - const qint16x8_t beta_qs16 = vdupq_n_qs16(sqcvt_qs16_f32(beta, fixed_point_position)); - - Iterator in(input, window); - Iterator out(output, window); - - execute_window_loop(window, [&](const Coordinates & id) - { - const auto in_ptr = reinterpret_cast(in.ptr()); - const auto out_ptr = reinterpret_cast(out.ptr()); - - qint16x8x2_t alpha_ab = vld2q_s16(out_ptr); - const qint16x8x2_t c = vld2q_s16(in_ptr); - - // Multiply matrix C by its weight and accumulate - alpha_ab.val[0] = vqmlaq_qs16(alpha_ab.val[0], c.val[0], beta_qs16, fixed_point_position); - alpha_ab.val[1] = vqmlaq_qs16(alpha_ab.val[1], c.val[1], beta_qs16, fixed_point_position); - - vst2q_s16(out_ptr, alpha_ab); - }, - in, out); -} } // namespace NEGEMMMatrixAdditionKernel::NEGEMMMatrixAdditionKernel() @@ -148,10 +100,9 @@ NEGEMMMatrixAdditionKernel::NEGEMMMatrixAdditionKernel() void NEGEMMMatrixAdditionKernel::configure(const ITensor *input, ITensor *output, float beta) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32); ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != output->info()->dimension(0)); ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != output->info()->dimension(1)); @@ -160,12 +111,6 @@ void NEGEMMMatrixAdditionKernel::configure(const ITensor *input, ITensor *output case DataType::F32: _func = &matrix_addition_f32; break; - case DataType::QS8: - _func = &matrix_addition_qs8; - break; - case DataType::QS16: - _func = &matrix_addition_qs16; - break; case DataType::F16: #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC _func = &matrix_addition_f16; diff --git a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp index 69b052a9bd..196398a2de 100644 --- a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp @@ -355,263 +355,6 @@ void vector_matrix_multiply_f32(const ITensor *input0, const ITensor *input1, IT ina, inb, out); } -template -void vector_matrix_multiply_qs8(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, const ThreadInfo &info, float alpha) -{ - const auto width_matrix_b = static_cast(output->info()->dimension(0)); - const auto in_b_stride = static_cast(input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type())); - const auto num_elems_vec_a = static_cast(input0->info()->dimension(0)); - const int fixed_point_position = input0->info()->fixed_point_position(); - - // The implementation computes 32 elements per iteration - const int window_start_x = 32 * info.thread_id; - const int window_step_x = 32 * info.num_threads; - // Make sure (window_end_x - window_start_x) is a multiple of window_step_x - const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x; - - Window win_out(window); - win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x)); - win_out.set(Window::DimY, Window::Dimension(0, 1, 1)); - - Window win_a(window); - win_a.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_a.set(Window::DimY, Window::Dimension(0, 0, 0)); - - Window win_b; - // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 - // This scenario can happen when the the matrix multiplication is used to perform a convolution operation - if(input1->info()->num_dimensions() >= 3) - { - win_b = window; - } - win_b.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x)); - win_b.set(Window::DimY, Window::Dimension(0, 1, 1)); - - Iterator ina(input0, win_a); - Iterator inb(input1, win_b); - Iterator out(output, win_out); - - execute_window_loop(win_out, [&](const Coordinates & id) - { - if(id.x() > width_matrix_b) - { - return; - } - - // Reset accumulators - qint16x8_t acc00_qs16 = vdupq_n_qs16(0); - qint16x8_t acc01_qs16 = vdupq_n_qs16(0); - qint16x8_t acc02_qs16 = vdupq_n_qs16(0); - qint16x8_t acc03_qs16 = vdupq_n_qs16(0); - - auto vec_a = reinterpret_cast(ina.ptr()); - auto matrix_b = reinterpret_cast(inb.ptr()); - - auto vec_a_end_addr = vec_a + num_elems_vec_a; - for(; vec_a <= (vec_a_end_addr - 2);) - { - const qint8x8_t a0 = vld1_dup_qs8(vec_a + 0); - const qint8x8_t a1 = vld1_dup_qs8(vec_a + 1); - - const qint8x8_t b00 = vld1_qs8(matrix_b + 0 + 0 * in_b_stride); - const qint8x8_t b01 = vld1_qs8(matrix_b + 8 + 0 * in_b_stride); - const qint8x8_t b02 = vld1_qs8(matrix_b + 16 + 0 * in_b_stride); - const qint8x8_t b03 = vld1_qs8(matrix_b + 24 + 0 * in_b_stride); - const qint8x8_t b10 = vld1_qs8(matrix_b + 0 + 1 * in_b_stride); - const qint8x8_t b11 = vld1_qs8(matrix_b + 8 + 1 * in_b_stride); - const qint8x8_t b12 = vld1_qs8(matrix_b + 16 + 1 * in_b_stride); - const qint8x8_t b13 = vld1_qs8(matrix_b + 24 + 1 * in_b_stride); - - // First accumulation - acc00_qs16 = vqmlal_qs8(acc00_qs16, b00, a0, fixed_point_position); - acc01_qs16 = vqmlal_qs8(acc01_qs16, b01, a0, fixed_point_position); - acc02_qs16 = vqmlal_qs8(acc02_qs16, b02, a0, fixed_point_position); - acc03_qs16 = vqmlal_qs8(acc03_qs16, b03, a0, fixed_point_position); - - // Second accumulation - acc00_qs16 = vqmlal_qs8(acc00_qs16, b10, a1, fixed_point_position); - acc01_qs16 = vqmlal_qs8(acc01_qs16, b11, a1, fixed_point_position); - acc02_qs16 = vqmlal_qs8(acc02_qs16, b12, a1, fixed_point_position); - acc03_qs16 = vqmlal_qs8(acc03_qs16, b13, a1, fixed_point_position); - - vec_a += 2; - matrix_b += 2 * in_b_stride; - } - - for(; vec_a < vec_a_end_addr;) - { - const qint8x8_t a0 = vld1_dup_qs8(vec_a); - - const qint8x8_t b00 = vld1_qs8(matrix_b + 0); - const qint8x8_t b01 = vld1_qs8(matrix_b + 8); - const qint8x8_t b02 = vld1_qs8(matrix_b + 16); - const qint8x8_t b03 = vld1_qs8(matrix_b + 24); - - acc00_qs16 = vqmlal_qs8(acc00_qs16, b00, a0, fixed_point_position); - acc01_qs16 = vqmlal_qs8(acc01_qs16, b01, a0, fixed_point_position); - acc02_qs16 = vqmlal_qs8(acc02_qs16, b02, a0, fixed_point_position); - acc03_qs16 = vqmlal_qs8(acc03_qs16, b03, a0, fixed_point_position); - - vec_a += 1; - matrix_b += in_b_stride; - } - - // Convert back to qint8x8_t and saturate - qint8x8_t acc00_qs8 = vqmovn_qs16(acc00_qs16); - qint8x8_t acc01_qs8 = vqmovn_qs16(acc01_qs16); - qint8x8_t acc02_qs8 = vqmovn_qs16(acc02_qs16); - qint8x8_t acc03_qs8 = vqmovn_qs16(acc03_qs16); - - // Multiply by the weight of the matrix product (alpha) - if(multiply_alpha) - { - const qint8x8_t alpha_qs8 = vdup_n_qs8(sqcvt_qs8_f32(alpha, fixed_point_position)); - acc00_qs8 = vqmul_qs8(acc00_qs8, alpha_qs8, fixed_point_position); - acc01_qs8 = vqmul_qs8(acc01_qs8, alpha_qs8, fixed_point_position); - acc02_qs8 = vqmul_qs8(acc02_qs8, alpha_qs8, fixed_point_position); - acc03_qs8 = vqmul_qs8(acc03_qs8, alpha_qs8, fixed_point_position); - } - - const auto mtx_out0 = reinterpret_cast(out.ptr()); - - // Store 8x4 output elements - vst1_qs8(mtx_out0 + 0, acc00_qs8); - vst1_qs8(mtx_out0 + 8, acc01_qs8); - vst1_qs8(mtx_out0 + 16, acc02_qs8); - vst1_qs8(mtx_out0 + 24, acc03_qs8); - }, - ina, inb, out); -} - -template -void vector_matrix_multiply_qs16(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, const ThreadInfo &info, float alpha) -{ - const auto width_matrix_b = static_cast(output->info()->dimension(0)); - const auto in_b_stride = static_cast(input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type())); - const auto num_elems_vec_a = static_cast(input0->info()->dimension(0)); - const int fixed_point_position = input0->info()->fixed_point_position(); - - // The implementation computes 16 elements per iteration - const int window_start_x = 16 * info.thread_id; - const int window_step_x = 16 * info.num_threads; - // Make sure (window_end_x - window_start_x) is a multiple of window_step_x - const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x; - ARM_COMPUTE_ERROR_ON_MSG((window_end_x - window_start_x) % window_step_x, " (window_end_x - window_start_x) must be multiple of window_step_x"); - - Window win_out(window); - win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x)); - win_out.set(Window::DimY, Window::Dimension(0, 1, 1)); - - Window win_a(window); - win_a.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_a.set(Window::DimY, Window::Dimension(0, 0, 0)); - - Window win_b; - // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 - // This scenario can happen when the the matrix multiplication is used to perform a convolution operation - if(input1->info()->num_dimensions() >= 3) - { - win_b = window; - } - win_b.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x)); - win_b.set(Window::DimY, Window::Dimension(0, 1, 1)); - - Iterator ina(input0, win_a); - Iterator inb(input1, win_b); - Iterator out(output, win_out); - - execute_window_loop(win_out, [&](const Coordinates & id) - { - if(id.x() > width_matrix_b) - { - return; - } - - // Reset accumulators - qint32x4_t acc00_qs32 = vdupq_n_qs32(0); - qint32x4_t acc01_qs32 = vdupq_n_qs32(0); - qint32x4_t acc02_qs32 = vdupq_n_qs32(0); - qint32x4_t acc03_qs32 = vdupq_n_qs32(0); - - auto vec_a = reinterpret_cast(ina.ptr()); - auto matrix_b = reinterpret_cast(inb.ptr()); - - auto vec_a_end_addr = vec_a + num_elems_vec_a; - for(; vec_a <= (vec_a_end_addr - 2);) - { - const qint16x4_t a0 = vld1_dup_qs16(vec_a + 0); - const qint16x4_t a1 = vld1_dup_qs16(vec_a + 1); - - const qint16x4_t b00 = vld1_qs16(matrix_b + 0 + 0 * in_b_stride); - const qint16x4_t b01 = vld1_qs16(matrix_b + 4 + 0 * in_b_stride); - const qint16x4_t b02 = vld1_qs16(matrix_b + 8 + 0 * in_b_stride); - const qint16x4_t b03 = vld1_qs16(matrix_b + 12 + 0 * in_b_stride); - const qint16x4_t b10 = vld1_qs16(matrix_b + 0 + 1 * in_b_stride); - const qint16x4_t b11 = vld1_qs16(matrix_b + 4 + 1 * in_b_stride); - const qint16x4_t b12 = vld1_qs16(matrix_b + 8 + 1 * in_b_stride); - const qint16x4_t b13 = vld1_qs16(matrix_b + 12 + 1 * in_b_stride); - - // First accumulation - acc00_qs32 = vqmlal_qs16(acc00_qs32, b00, a0, fixed_point_position); - acc01_qs32 = vqmlal_qs16(acc01_qs32, b01, a0, fixed_point_position); - acc02_qs32 = vqmlal_qs16(acc02_qs32, b02, a0, fixed_point_position); - acc03_qs32 = vqmlal_qs16(acc03_qs32, b03, a0, fixed_point_position); - - // Second accumulation - acc00_qs32 = vqmlal_qs16(acc00_qs32, b10, a1, fixed_point_position); - acc01_qs32 = vqmlal_qs16(acc01_qs32, b11, a1, fixed_point_position); - acc02_qs32 = vqmlal_qs16(acc02_qs32, b12, a1, fixed_point_position); - acc03_qs32 = vqmlal_qs16(acc03_qs32, b13, a1, fixed_point_position); - - vec_a += 2; - matrix_b += 2 * in_b_stride; - } - - for(; vec_a < vec_a_end_addr;) - { - const qint16x4_t a0 = vld1_dup_qs16(vec_a); - - const qint16x4_t b00 = vld1_qs16(matrix_b + 0); - const qint16x4_t b01 = vld1_qs16(matrix_b + 4); - const qint16x4_t b02 = vld1_qs16(matrix_b + 8); - const qint16x4_t b03 = vld1_qs16(matrix_b + 12); - - acc00_qs32 = vqmlal_qs16(acc00_qs32, b00, a0, fixed_point_position); - acc01_qs32 = vqmlal_qs16(acc01_qs32, b01, a0, fixed_point_position); - acc02_qs32 = vqmlal_qs16(acc02_qs32, b02, a0, fixed_point_position); - acc03_qs32 = vqmlal_qs16(acc03_qs32, b03, a0, fixed_point_position); - - vec_a += 1; - matrix_b += in_b_stride; - } - - // Convert back to qint16x4_t and saturate - qint16x4_t acc00_qs16 = vqmovn_qs32(acc00_qs32); - qint16x4_t acc01_qs16 = vqmovn_qs32(acc01_qs32); - qint16x4_t acc02_qs16 = vqmovn_qs32(acc02_qs32); - qint16x4_t acc03_qs16 = vqmovn_qs32(acc03_qs32); - - // Multiply by the weight of the matrix product (alpha) - if(multiply_alpha) - { - const qint16x4_t alpha_qs16 = vdup_n_qs16(sqcvt_qs16_f32(alpha, fixed_point_position)); - acc00_qs16 = vqmul_qs16(acc00_qs16, alpha_qs16, fixed_point_position); - acc01_qs16 = vqmul_qs16(acc01_qs16, alpha_qs16, fixed_point_position); - acc02_qs16 = vqmul_qs16(acc02_qs16, alpha_qs16, fixed_point_position); - acc03_qs16 = vqmul_qs16(acc03_qs16, alpha_qs16, fixed_point_position); - } - - const auto mtx_out0 = reinterpret_cast(out.ptr()); - - // Store 16x4 output elements - vst1_qs16(mtx_out0 + 0, acc00_qs16); - vst1_qs16(mtx_out0 + 4, acc01_qs16); - vst1_qs16(mtx_out0 + 8, acc02_qs16); - vst1_qs16(mtx_out0 + 12, acc03_qs16); - }, - ina, inb, out); -} - template void matrix_matrix_multiply_f32(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, float alpha) { @@ -1063,361 +806,12 @@ void matrix_matrix_multiply_f16(const ITensor *input0, const ITensor *input1, IT #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ } -template -void matrix_matrix_multiply_qs8(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, float alpha) -{ - const size_t in_b_stride = input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type()); - const size_t out_stride1 = output->info()->strides_in_bytes()[1] / data_size_from_type(output->info()->data_type()); - const size_t out_stride2 = out_stride1 * 2; - const size_t out_stride3 = out_stride1 * 3; - const int num_elems_matrix_b_x = input1->info()->dimension(0); - const int fixed_point_position = input0->info()->fixed_point_position(); - const qint8x8_t alpha_qs8 = vdup_n_qs8(sqcvt_qs8_f32(alpha, fixed_point_position)); - ARM_COMPUTE_UNUSED(alpha_qs8); - - // Set step_x and step_y for matrix A. Scale by a factor of 4 the Y range as the input interleaved matrix A has 4 times less the rows of the output matrix - Window win_a(window); - win_a.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, std::max(window.y().end() / 4, 1), 1)); - - Window win_b; - // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 - // This scenario can happen when the the matrix multiplication is used to perform a convolution operation - if(input1->info()->num_dimensions() >= 3) - { - win_b = window; - } - // Set step_x and step_y for matrix B. Scale by a factor of 16 the X range as the input transposed matrix A has 16 times less the cols of the output matrix - // The step along the x direction is 2 times the in_b_stride because for each iteration we compute 2 blocks of size 16x4 - win_b.set(Window::DimX, Window::Dimension(window.x().start() / 16, window.x().end() / 16, 2 * in_b_stride)); - win_b.set(Window::DimY, Window::Dimension(0, 0, 0)); - - Iterator ina(input0, win_a); - Iterator inb(input1, win_b); - Iterator out(output, window); - - // The implementation assumes that the matrix A and Matrix B have been reshaped respectively with NEGEMMInterleave4x4 and NEGEMMTranspose1xW - // The reshaping of the matrices helps to have a cache friendly implementation and helps to avoid the data re-arrangements needed for computing 16x4 elements per iteration - // All the values needed for computing a single 32x4 block will be read from consecutive memory positions - execute_window_loop(window, [&](const Coordinates & id) - { - auto mtx_a0 = reinterpret_cast(ina.ptr()); - auto mtx_b0 = reinterpret_cast(inb.ptr()); - auto mtx_b1 = mtx_b0 + in_b_stride; - - qint16x8_t acc00_qs16 = vdupq_n_qs16(0); - qint16x8_t acc10_qs16 = vdupq_n_qs16(0); - qint16x8_t acc20_qs16 = vdupq_n_qs16(0); - qint16x8_t acc30_qs16 = vdupq_n_qs16(0); - - qint16x8_t acc01_qs16 = vdupq_n_qs16(0); - qint16x8_t acc11_qs16 = vdupq_n_qs16(0); - qint16x8_t acc21_qs16 = vdupq_n_qs16(0); - qint16x8_t acc31_qs16 = vdupq_n_qs16(0); - - qint16x8_t acc02_qs16 = vdupq_n_qs16(0); - qint16x8_t acc12_qs16 = vdupq_n_qs16(0); - qint16x8_t acc22_qs16 = vdupq_n_qs16(0); - qint16x8_t acc32_qs16 = vdupq_n_qs16(0); - - qint16x8_t acc03_qs16 = vdupq_n_qs16(0); - qint16x8_t acc13_qs16 = vdupq_n_qs16(0); - qint16x8_t acc23_qs16 = vdupq_n_qs16(0); - qint16x8_t acc33_qs16 = vdupq_n_qs16(0); - - int k = 0; - // This for loop performs 2 accumulations - for(; k <= (num_elems_matrix_b_x - 32); k += 32) - { - const qint8x8_t a0 = vld1_dup_qs8(mtx_a0 + 0); - const qint8x8_t a1 = vld1_dup_qs8(mtx_a0 + 1); - const qint8x8_t a2 = vld1_dup_qs8(mtx_a0 + 2); - const qint8x8_t a3 = vld1_dup_qs8(mtx_a0 + 3); - const qint8x8_t a4 = vld1_dup_qs8(mtx_a0 + 4); - const qint8x8_t a5 = vld1_dup_qs8(mtx_a0 + 5); - const qint8x8_t a6 = vld1_dup_qs8(mtx_a0 + 6); - const qint8x8_t a7 = vld1_dup_qs8(mtx_a0 + 7); - - const qint8x8_t b00 = vld1_qs8(mtx_b0 + 0); - const qint8x8_t b01 = vld1_qs8(mtx_b0 + 8); - const qint8x8_t b10 = vld1_qs8(mtx_b1 + 0); - const qint8x8_t b11 = vld1_qs8(mtx_b1 + 8); - - // First accumulation - acc00_qs16 = vqmlal_qs8(acc00_qs16, b00, a0, fixed_point_position); - acc10_qs16 = vqmlal_qs8(acc10_qs16, b00, a1, fixed_point_position); - acc20_qs16 = vqmlal_qs8(acc20_qs16, b00, a2, fixed_point_position); - acc30_qs16 = vqmlal_qs8(acc30_qs16, b00, a3, fixed_point_position); - acc02_qs16 = vqmlal_qs8(acc02_qs16, b10, a0, fixed_point_position); - acc12_qs16 = vqmlal_qs8(acc12_qs16, b10, a1, fixed_point_position); - acc22_qs16 = vqmlal_qs8(acc22_qs16, b10, a2, fixed_point_position); - acc32_qs16 = vqmlal_qs8(acc32_qs16, b10, a3, fixed_point_position); - - const qint8x8_t b02 = vld1_qs8(mtx_b0 + 16); - const qint8x8_t b03 = vld1_qs8(mtx_b0 + 24); - const qint8x8_t b12 = vld1_qs8(mtx_b1 + 16); - const qint8x8_t b13 = vld1_qs8(mtx_b1 + 24); - - acc01_qs16 = vqmlal_qs8(acc01_qs16, b01, a0, fixed_point_position); - acc11_qs16 = vqmlal_qs8(acc11_qs16, b01, a1, fixed_point_position); - acc21_qs16 = vqmlal_qs8(acc21_qs16, b01, a2, fixed_point_position); - acc31_qs16 = vqmlal_qs8(acc31_qs16, b01, a3, fixed_point_position); - acc03_qs16 = vqmlal_qs8(acc03_qs16, b11, a0, fixed_point_position); - acc13_qs16 = vqmlal_qs8(acc13_qs16, b11, a1, fixed_point_position); - acc23_qs16 = vqmlal_qs8(acc23_qs16, b11, a2, fixed_point_position); - acc33_qs16 = vqmlal_qs8(acc33_qs16, b11, a3, fixed_point_position); - -#if __arm__ - asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast(mtx_a0))); - asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast(mtx_b0))); - asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast(mtx_b1))); -#endif /* __arm__ */ - - // Second accumulation - acc00_qs16 = vqmlal_qs8(acc00_qs16, b02, a4, fixed_point_position); - acc10_qs16 = vqmlal_qs8(acc10_qs16, b02, a5, fixed_point_position); - acc20_qs16 = vqmlal_qs8(acc20_qs16, b02, a6, fixed_point_position); - acc30_qs16 = vqmlal_qs8(acc30_qs16, b02, a7, fixed_point_position); - acc01_qs16 = vqmlal_qs8(acc01_qs16, b03, a4, fixed_point_position); - acc11_qs16 = vqmlal_qs8(acc11_qs16, b03, a5, fixed_point_position); - acc21_qs16 = vqmlal_qs8(acc21_qs16, b03, a6, fixed_point_position); - acc31_qs16 = vqmlal_qs8(acc31_qs16, b03, a7, fixed_point_position); - acc02_qs16 = vqmlal_qs8(acc02_qs16, b12, a4, fixed_point_position); - acc12_qs16 = vqmlal_qs8(acc12_qs16, b12, a5, fixed_point_position); - acc22_qs16 = vqmlal_qs8(acc22_qs16, b12, a6, fixed_point_position); - acc32_qs16 = vqmlal_qs8(acc32_qs16, b12, a7, fixed_point_position); - acc03_qs16 = vqmlal_qs8(acc03_qs16, b13, a4, fixed_point_position); - acc13_qs16 = vqmlal_qs8(acc13_qs16, b13, a5, fixed_point_position); - acc23_qs16 = vqmlal_qs8(acc23_qs16, b13, a6, fixed_point_position); - acc33_qs16 = vqmlal_qs8(acc33_qs16, b13, a7, fixed_point_position); - - mtx_a0 += 8; - mtx_b0 += 32; - mtx_b1 += 32; - } - - // This for loop performs the left over accumulations - for(; k < num_elems_matrix_b_x; k += 16) - { - const qint8x8_t a0 = vld1_dup_qs8(mtx_a0 + 0); - const qint8x8_t a1 = vld1_dup_qs8(mtx_a0 + 1); - const qint8x8_t a2 = vld1_dup_qs8(mtx_a0 + 2); - const qint8x8_t a3 = vld1_dup_qs8(mtx_a0 + 3); - - const qint8x8_t b00 = vld1_qs8(mtx_b0 + 0); - const qint8x8_t b01 = vld1_qs8(mtx_b0 + 8); - const qint8x8_t b10 = vld1_qs8(mtx_b1 + 0); - const qint8x8_t b11 = vld1_qs8(mtx_b1 + 8); - - acc00_qs16 = vqmlal_qs8(acc00_qs16, b00, a0, fixed_point_position); - acc10_qs16 = vqmlal_qs8(acc10_qs16, b00, a1, fixed_point_position); - acc20_qs16 = vqmlal_qs8(acc20_qs16, b00, a2, fixed_point_position); - acc30_qs16 = vqmlal_qs8(acc30_qs16, b00, a3, fixed_point_position); - acc01_qs16 = vqmlal_qs8(acc01_qs16, b01, a0, fixed_point_position); - acc11_qs16 = vqmlal_qs8(acc11_qs16, b01, a1, fixed_point_position); - acc21_qs16 = vqmlal_qs8(acc21_qs16, b01, a2, fixed_point_position); - acc31_qs16 = vqmlal_qs8(acc31_qs16, b01, a3, fixed_point_position); - acc02_qs16 = vqmlal_qs8(acc02_qs16, b10, a0, fixed_point_position); - acc12_qs16 = vqmlal_qs8(acc12_qs16, b10, a1, fixed_point_position); - acc22_qs16 = vqmlal_qs8(acc22_qs16, b10, a2, fixed_point_position); - acc32_qs16 = vqmlal_qs8(acc32_qs16, b10, a3, fixed_point_position); - acc03_qs16 = vqmlal_qs8(acc03_qs16, b11, a0, fixed_point_position); - acc13_qs16 = vqmlal_qs8(acc13_qs16, b11, a1, fixed_point_position); - acc23_qs16 = vqmlal_qs8(acc23_qs16, b11, a2, fixed_point_position); - acc33_qs16 = vqmlal_qs8(acc33_qs16, b11, a3, fixed_point_position); - - mtx_a0 += 4; - mtx_b0 += 16; - mtx_b1 += 16; - } - - // Convert back to qint8x8_t and saturate - qint8x8_t acc00_qs8 = vqmovn_qs16(acc00_qs16); - qint8x8_t acc10_qs8 = vqmovn_qs16(acc10_qs16); - qint8x8_t acc20_qs8 = vqmovn_qs16(acc20_qs16); - qint8x8_t acc30_qs8 = vqmovn_qs16(acc30_qs16); - - qint8x8_t acc01_qs8 = vqmovn_qs16(acc01_qs16); - qint8x8_t acc11_qs8 = vqmovn_qs16(acc11_qs16); - qint8x8_t acc21_qs8 = vqmovn_qs16(acc21_qs16); - qint8x8_t acc31_qs8 = vqmovn_qs16(acc31_qs16); - - qint8x8_t acc02_qs8 = vqmovn_qs16(acc02_qs16); - qint8x8_t acc12_qs8 = vqmovn_qs16(acc12_qs16); - qint8x8_t acc22_qs8 = vqmovn_qs16(acc22_qs16); - qint8x8_t acc32_qs8 = vqmovn_qs16(acc32_qs16); - - qint8x8_t acc03_qs8 = vqmovn_qs16(acc03_qs16); - qint8x8_t acc13_qs8 = vqmovn_qs16(acc13_qs16); - qint8x8_t acc23_qs8 = vqmovn_qs16(acc23_qs16); - qint8x8_t acc33_qs8 = vqmovn_qs16(acc33_qs16); - - // Multiply by the weight of the matrix product (alpha) - if(multiply_alpha) - { - acc00_qs8 = vqmul_qs8(acc00_qs8, alpha_qs8, fixed_point_position); - acc10_qs8 = vqmul_qs8(acc10_qs8, alpha_qs8, fixed_point_position); - acc20_qs8 = vqmul_qs8(acc20_qs8, alpha_qs8, fixed_point_position); - acc30_qs8 = vqmul_qs8(acc30_qs8, alpha_qs8, fixed_point_position); - acc01_qs8 = vqmul_qs8(acc01_qs8, alpha_qs8, fixed_point_position); - acc11_qs8 = vqmul_qs8(acc11_qs8, alpha_qs8, fixed_point_position); - acc21_qs8 = vqmul_qs8(acc21_qs8, alpha_qs8, fixed_point_position); - acc31_qs8 = vqmul_qs8(acc31_qs8, alpha_qs8, fixed_point_position); - acc02_qs8 = vqmul_qs8(acc02_qs8, alpha_qs8, fixed_point_position); - acc12_qs8 = vqmul_qs8(acc12_qs8, alpha_qs8, fixed_point_position); - acc22_qs8 = vqmul_qs8(acc22_qs8, alpha_qs8, fixed_point_position); - acc32_qs8 = vqmul_qs8(acc32_qs8, alpha_qs8, fixed_point_position); - acc03_qs8 = vqmul_qs8(acc03_qs8, alpha_qs8, fixed_point_position); - acc13_qs8 = vqmul_qs8(acc13_qs8, alpha_qs8, fixed_point_position); - acc23_qs8 = vqmul_qs8(acc23_qs8, alpha_qs8, fixed_point_position); - acc33_qs8 = vqmul_qs8(acc33_qs8, alpha_qs8, fixed_point_position); - } - - const auto mtx_out0 = reinterpret_cast(out.ptr()); - - // Store 32x4 output elements - vst1_qs8(mtx_out0 + 0, acc00_qs8); - vst1_qs8(mtx_out0 + 8, acc01_qs8); - vst1_qs8(mtx_out0 + 16, acc02_qs8); - vst1_qs8(mtx_out0 + 24, acc03_qs8); - vst1_qs8(mtx_out0 + out_stride1 + 0, acc10_qs8); - vst1_qs8(mtx_out0 + out_stride1 + 8, acc11_qs8); - vst1_qs8(mtx_out0 + out_stride1 + 16, acc12_qs8); - vst1_qs8(mtx_out0 + out_stride1 + 24, acc13_qs8); - vst1_qs8(mtx_out0 + out_stride2 + 0, acc20_qs8); - vst1_qs8(mtx_out0 + out_stride2 + 8, acc21_qs8); - vst1_qs8(mtx_out0 + out_stride2 + 16, acc22_qs8); - vst1_qs8(mtx_out0 + out_stride2 + 24, acc23_qs8); - vst1_qs8(mtx_out0 + out_stride3 + 0, acc30_qs8); - vst1_qs8(mtx_out0 + out_stride3 + 8, acc31_qs8); - vst1_qs8(mtx_out0 + out_stride3 + 16, acc32_qs8); - vst1_qs8(mtx_out0 + out_stride3 + 24, acc33_qs8); - }, - ina, inb, out); -} - -template -void matrix_matrix_multiply_qs16(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, float alpha) -{ - const size_t in_b_stride = input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type()); - const size_t out_stride1 = output->info()->strides_in_bytes()[1] / data_size_from_type(output->info()->data_type()); - const size_t out_stride2 = out_stride1 * 2; - const size_t out_stride3 = out_stride1 * 3; - const int num_elems_matrix_b_x = input1->info()->dimension(0); - const int fixed_point_position = input0->info()->fixed_point_position(); - const qint16x4_t alpha_qs16 = vdup_n_qs16(sqcvt_qs16_f32(alpha, fixed_point_position)); - ARM_COMPUTE_UNUSED(alpha_qs16); - - // Set step_x and step_y for matrix A. Scale by a factor of 4 the Y range as the input interleaved matrix A has 4 times less the rows of the output matrix - Window win_a(window); - win_a.set(Window::DimX, Window::Dimension(0, 0, 0)); - win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, std::max(window.y().end() / 4, 1), 1)); - - Window win_b; - // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 - // This scenario can happen when the the matrix multiplication is used to perform a convolution operation - if(input1->info()->num_dimensions() >= 3) - { - win_b = window; - } - // Set step_x and step_y for matrix B. Scale by a factor of 16 the X range as the input transposed matrix A has 16 times less the cols of the output matrix - win_b.set(Window::DimX, Window::Dimension(window.x().start() / 8, window.x().end() / 8, in_b_stride)); - win_b.set(Window::DimY, Window::Dimension(0, 0, 0)); - - Iterator ina(input0, win_a); - Iterator inb(input1, win_b); - Iterator out(output, window); - - // The implementation assumes that the matrix A and Matrix B have been reshaped respectively with NEGEMMInterleave4x4 and NEGEMMTranspose1xW - // The reshaping of the matrices helps to have a cache friendly implementation and helps to avoid the data re-arrangements needed for computing 8x4 elements per iteration - // All the values needed for computing a single 8x4 block will be read from consecutive memory positions - execute_window_loop(window, [&](const Coordinates & id) - { - auto mtx_a0 = reinterpret_cast(ina.ptr()); - auto mtx_b0 = reinterpret_cast(inb.ptr()); - auto mtx_b1 = mtx_b0 + in_b_stride; - - qint32x4_t acc00_qs32 = vdupq_n_qs32(0); - qint32x4_t acc10_qs32 = vdupq_n_qs32(0); - qint32x4_t acc20_qs32 = vdupq_n_qs32(0); - qint32x4_t acc30_qs32 = vdupq_n_qs32(0); - - qint32x4_t acc01_qs32 = vdupq_n_qs32(0); - qint32x4_t acc11_qs32 = vdupq_n_qs32(0); - qint32x4_t acc21_qs32 = vdupq_n_qs32(0); - qint32x4_t acc31_qs32 = vdupq_n_qs32(0); - - // This for loop performs 1 accumulation - for(int k = 0; k <= (num_elems_matrix_b_x - 8); k += 8) - { - const qint16x4_t a0 = vld1_dup_qs16(mtx_a0 + 0); - const qint16x4_t a1 = vld1_dup_qs16(mtx_a0 + 1); - const qint16x4_t a2 = vld1_dup_qs16(mtx_a0 + 2); - const qint16x4_t a3 = vld1_dup_qs16(mtx_a0 + 3); - - const qint16x4_t b00 = vld1_qs16(mtx_b0 + 0); - const qint16x4_t b01 = vld1_qs16(mtx_b0 + 4); - - acc00_qs32 = vqmlal_qs16(acc00_qs32, b00, a0, fixed_point_position); - acc10_qs32 = vqmlal_qs16(acc10_qs32, b00, a1, fixed_point_position); - acc20_qs32 = vqmlal_qs16(acc20_qs32, b00, a2, fixed_point_position); - acc30_qs32 = vqmlal_qs16(acc30_qs32, b00, a3, fixed_point_position); - acc01_qs32 = vqmlal_qs16(acc01_qs32, b01, a0, fixed_point_position); - acc11_qs32 = vqmlal_qs16(acc11_qs32, b01, a1, fixed_point_position); - acc21_qs32 = vqmlal_qs16(acc21_qs32, b01, a2, fixed_point_position); - acc31_qs32 = vqmlal_qs16(acc31_qs32, b01, a3, fixed_point_position); - - mtx_a0 += 4; - mtx_b0 += 8; - mtx_b1 += 8; - } - - // Convert back to qint16x4_t and saturate - qint16x4_t acc00_qs16 = vqmovn_qs32(acc00_qs32); - qint16x4_t acc10_qs16 = vqmovn_qs32(acc10_qs32); - qint16x4_t acc20_qs16 = vqmovn_qs32(acc20_qs32); - qint16x4_t acc30_qs16 = vqmovn_qs32(acc30_qs32); - - qint16x4_t acc01_qs16 = vqmovn_qs32(acc01_qs32); - qint16x4_t acc11_qs16 = vqmovn_qs32(acc11_qs32); - qint16x4_t acc21_qs16 = vqmovn_qs32(acc21_qs32); - qint16x4_t acc31_qs16 = vqmovn_qs32(acc31_qs32); - - // Multiply by the weight of the matrix product (alpha) - if(multiply_alpha) - { - acc00_qs16 = vqmul_qs16(acc00_qs16, alpha_qs16, fixed_point_position); - acc10_qs16 = vqmul_qs16(acc10_qs16, alpha_qs16, fixed_point_position); - acc20_qs16 = vqmul_qs16(acc20_qs16, alpha_qs16, fixed_point_position); - acc30_qs16 = vqmul_qs16(acc30_qs16, alpha_qs16, fixed_point_position); - acc01_qs16 = vqmul_qs16(acc01_qs16, alpha_qs16, fixed_point_position); - acc11_qs16 = vqmul_qs16(acc11_qs16, alpha_qs16, fixed_point_position); - acc21_qs16 = vqmul_qs16(acc21_qs16, alpha_qs16, fixed_point_position); - acc31_qs16 = vqmul_qs16(acc31_qs16, alpha_qs16, fixed_point_position); - } - - const auto mtx_out0 = reinterpret_cast(out.ptr()); - - // Store 8x4 output elements - vst1_qs16(mtx_out0 + 0, acc00_qs16); - vst1_qs16(mtx_out0 + 4, acc01_qs16); - vst1_qs16(mtx_out0 + out_stride1 + 0, acc10_qs16); - vst1_qs16(mtx_out0 + out_stride1 + 4, acc11_qs16); - vst1_qs16(mtx_out0 + out_stride2 + 0, acc20_qs16); - vst1_qs16(mtx_out0 + out_stride2 + 4, acc21_qs16); - vst1_qs16(mtx_out0 + out_stride3 + 0, acc30_qs16); - vst1_qs16(mtx_out0 + out_stride3 + 4, acc31_qs16); - }, - ina, inb, out); -} - inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info) { ARM_COMPUTE_UNUSED(alpha); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32, DataType::QS8, DataType::QS16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output); if(!is_interleaved) { @@ -1428,7 +822,6 @@ inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *i ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) != output->dimension(0)); ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != output->dimension(1)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, output); } } else @@ -1467,7 +860,6 @@ inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *i } ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != static_cast(m)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, output); } } @@ -1492,16 +884,6 @@ inline std::pair validate_and_configure_window(ITensorInfo *inpu num_elems_processed_per_iteration_x = 16; break; } - case DataType::QS8: - { - num_elems_processed_per_iteration_x = 32; - break; - } - case DataType::QS16: - { - num_elems_processed_per_iteration_x = 16; - break; - } #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: { @@ -1539,16 +921,6 @@ inline std::pair validate_and_configure_window(ITensorInfo *inpu num_elems_processed_per_iteration_x = 8; break; } - case DataType::QS8: - { - num_elems_processed_per_iteration_x = 32; - break; - } - case DataType::QS16: - { - num_elems_processed_per_iteration_x = 8; - break; - } #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: { @@ -1638,18 +1010,6 @@ void NEGEMMMatrixMultiplyKernel::run(const Window &window, const ThreadInfo &inf vector_matrix_multiply_f32(_input0, _input1, _output, window, info, _alpha); break; } - case DataType::QS8: - { - multiply_alpha ? vector_matrix_multiply_qs8(_input0, _input1, _output, window, info, _alpha) : - vector_matrix_multiply_qs8(_input0, _input1, _output, window, info, _alpha); - break; - } - case DataType::QS16: - { - multiply_alpha ? vector_matrix_multiply_qs16(_input0, _input1, _output, window, info, _alpha) : - vector_matrix_multiply_qs16(_input0, _input1, _output, window, info, _alpha); - break; - } #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: { @@ -1675,18 +1035,6 @@ void NEGEMMMatrixMultiplyKernel::run(const Window &window, const ThreadInfo &inf matrix_matrix_multiply_f32(_input0, _input1, _output, window, _alpha); break; } - case DataType::QS8: - { - multiply_alpha ? matrix_matrix_multiply_qs8(_input0, _input1, _output, window, _alpha) : - matrix_matrix_multiply_qs8(_input0, _input1, _output, window, _alpha); - break; - } - case DataType::QS16: - { - multiply_alpha ? matrix_matrix_multiply_qs16(_input0, _input1, _output, window, _alpha) : - matrix_matrix_multiply_qs16(_input0, _input1, _output, window, _alpha); - break; - } #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: { diff --git a/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp index c1e975e77e..8588f43edf 100644 --- a/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp @@ -177,7 +177,6 @@ void NEGEMMMatrixVectorMultiplyKernel::configure(const ITensor *input0, const IT { ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::F32); ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1); - ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output); ARM_COMPUTE_ERROR_ON(is_data_type_quantized_asymmetric(input0->info()->data_type()) && (output->info()->data_type() != DataType::S32)); ARM_COMPUTE_ERROR_ON(input0->info()->dimension(2) != input1->info()->dimension(1)); diff --git a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp index 5d6163d583..4517f46139 100644 --- a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, 2018 ARM Limited. + * Copyright (c) 2016-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -54,17 +54,15 @@ TensorShape get_output_shape(const ITensorInfo *input) Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::U8, DataType::S8, - DataType::QS16, DataType::U16, DataType::S16, DataType::U32, DataType::S32, + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::U8, DataType::S8, + DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); if(output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), get_output_shape(input)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); } return Status{}; @@ -102,7 +100,7 @@ void NEGEMMTranspose1xWKernel::configure(const ITensor *input, ITensor *output) ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); // Output tensor auto inizialitation if not yet initialized - auto_init_if_empty(*output->info(), get_output_shape(input->info()), 1, input->info()->data_type(), input->info()->fixed_point_position()); + auto_init_if_empty(*output->info(), get_output_shape(input->info()), 1, input->info()->data_type()); // Perform validate step ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info())); diff --git a/src/core/NEON/kernels/NEIm2ColKernel.cpp b/src/core/NEON/kernels/NEIm2ColKernel.cpp index 86e3fd7a84..f03bc49ed3 100644 --- a/src/core/NEON/kernels/NEIm2ColKernel.cpp +++ b/src/core/NEON/kernels/NEIm2ColKernel.cpp @@ -24,7 +24,6 @@ #include "arm_compute/core/NEON/kernels/NEIm2ColKernel.h" #include "arm_compute/core/Error.h" -#include "arm_compute/core/FixedPoint.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Size2D.h" @@ -47,9 +46,8 @@ namespace Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool is_fully_connected, bool is_flatten, const Size2D &dilation) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::QASYMM8 && has_bias); ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1)); @@ -90,7 +88,6 @@ inline void linearize_volume(const uint8_t *const in_ptr, int input_stride_x, int input_stride_y, int input_stride_z, - int fixed_point_position, int pad_value, int dilation_x, int dilation_y) @@ -171,18 +168,7 @@ inline void linearize_volume(const uint8_t *const in_ptr, // Append 1 if the convolution layer has biases if(has_bias) { - if(std::is_same::value) - { - *out_ptr = sqcvt_qs8_f32(1.0f, fixed_point_position); - } - else if(std::is_same::value) - { - *out_ptr = sqcvt_qs16_f32(1.0f, fixed_point_position); - } - else - { - *out_ptr = static_cast(1); - } + *out_ptr = static_cast(1); } } } // namespace @@ -251,7 +237,6 @@ void NEIm2ColKernel::run_generic(const Window &window) input_stride_x, input_stride_y, input_stride_z, - _input->info()->fixed_point_position(), offset, _dilation.x(), _dilation.y()); @@ -294,18 +279,7 @@ void NEIm2ColKernel::run_reduced(const Window &window) // Add bias if(_has_bias) { - if(std::is_same::value) - { - *(reinterpret_cast(out_ptr) + out_width - 1) = sqcvt_qs8_f32(1.0f, _input->info()->fixed_point_position()); - } - else if(std::is_same::value) - { - *(reinterpret_cast(out_ptr) + out_width - 1) = sqcvt_qs16_f32(1.0f, _input->info()->fixed_point_position()); - } - else - { - *(reinterpret_cast(out_ptr) + out_width - 1) = static_cast(1); - } + *(reinterpret_cast(out_ptr) + out_width - 1) = static_cast(1); } } while(in_window.slide_window_slice_3D(in_slice) && out_window.slide_window_slice_1D(out_slice)); @@ -366,12 +340,6 @@ void NEIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size _func = &NEIm2ColKernel::run_reduced; break; #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - case DataType::QS8: - _func = &NEIm2ColKernel::run_reduced; - break; - case DataType::QS16: - _func = &NEIm2ColKernel::run_reduced; - break; case DataType::QASYMM8: _func = &NEIm2ColKernel::run_reduced; break; @@ -392,12 +360,6 @@ void NEIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_generic : &NEIm2ColKernel::run_generic; break; #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - case DataType::QS8: - _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_generic : &NEIm2ColKernel::run_generic; - break; - case DataType::QS16: - _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_generic : &NEIm2ColKernel::run_generic; - break; case DataType::QASYMM8: _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_generic : &NEIm2ColKernel::run_generic; break; diff --git a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp index 91776d8100..ed037832af 100644 --- a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp +++ b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp @@ -103,7 +103,7 @@ std::tuple validate_and_configure_window(ITensorInfo *input, ITe Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); // Output auto initialization if not yet initialized - auto_init_if_empty(*output, input->tensor_shape(), 1, input->data_type(), input->fixed_point_position()); + auto_init_if_empty(*output, input->tensor_shape(), 1, input->data_type()); AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); AccessWindowHorizontal sum_access(sum, 0, num_elems_processed_per_iteration_sum); diff --git a/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp b/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp index 434f4eb3e9..d93dc09ff9 100644 --- a/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp +++ b/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp @@ -68,7 +68,7 @@ std::tuple validate_and_configure_window(ITensorInfo *input, ITe TensorShape output_shape = compute_min_max_shape(input); // Output auto initialization if not yet initialized - auto_init_if_empty(*output, output_shape, 1, input->data_type(), input->fixed_point_position()); + auto_init_if_empty(*output, output_shape, 1, input->data_type()); constexpr unsigned int num_elems_processed_per_iteration = 1; diff --git a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp index 776cb27d7a..253a93f196 100644 --- a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp +++ b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -39,26 +39,17 @@ namespace Status validate_arguments(const ITensorInfo *input, const ITensorInfo *input_squared, const ITensorInfo *output, const NormalizationLayerInfo &norm_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_squared, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, input_squared); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, input_squared); ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd"); - if(is_data_type_fixed_point(input->data_type())) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, input_squared); - ARM_COMPUTE_RETURN_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.beta(), input); - ARM_COMPUTE_RETURN_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.kappa(), input); - ARM_COMPUTE_RETURN_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.scale_coeff(), input); - } - // Checks performed when output is configured if(output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); } return Status{}; @@ -162,44 +153,6 @@ void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor * } break; } - case DataType::QS8: - { - switch(norm_info.type()) - { - case NormType::IN_MAP_1D: - _func = &NENormalizationLayerKernel::normalize_fixed_point; - break; - case NormType::IN_MAP_2D: - // Normalize over X and Y - _func = &NENormalizationLayerKernel::normalize_fixed_point; - break; - case NormType::CROSS_MAP: - _func = &NENormalizationLayerKernel::normalize_fixed_point; - break; - default: - break; - } - break; - } - case DataType::QS16: - { - switch(norm_info.type()) - { - case NormType::IN_MAP_1D: - _func = &NENormalizationLayerKernel::normalize_fixed_point; - break; - case NormType::IN_MAP_2D: - // Normalize over X and Y - _func = &NENormalizationLayerKernel::normalize_fixed_point; - break; - case NormType::CROSS_MAP: - _func = &NENormalizationLayerKernel::normalize_fixed_point; - break; - default: - break; - } - break; - } default: ARM_COMPUTE_ERROR("NOT SUPPORTED!"); } @@ -306,105 +259,6 @@ void NENormalizationLayerKernel::normalize_float(const Window &window) } } -template -void NENormalizationLayerKernel::normalize_fixed_point(const Window &window) -{ - Iterator input(_input, window); - Iterator input_squared(_input_squared, window); - Iterator output(_output, window); - - const int dim_y = 1; - const int radius = _norm_info.norm_size() / 2; - const int total_size = _input->info()->dimension(dim) - 1; - const int input_squared_stride = _input_squared->info()->strides_in_bytes()[dim]; - // We account padding across X only and we iterate over rows - const int min_left = (dim == 2) ? 0 : -static_cast(border_size().left); - const int max_right = (dim == 2) ? total_size : total_size + border_size().left; - const int min_top = 0; - const int max_bottom = _input->info()->dimension(dim_y) - 1; - - const int fixed_point_position = _input->info()->fixed_point_position(); - - if(dt == DataType::QS8) - { - const qint8x16_t coeff_vec = vdupq_n_qs8_f32(_norm_info.scale_coeff(), fixed_point_position); - const qint8x16_t beta_vec = vdupq_n_qs8_f32(_norm_info.beta(), fixed_point_position); - const qint8x16_t kappa_vec = vdupq_n_qs8_f32(_norm_info.kappa(), fixed_point_position); - - execute_window_loop(window, [&](const Coordinates & id) - { - // Get range to normalize - const int current_row = do_2D_norm ? id[dim_y] : 0; - const int current_slice = id[dim]; - const int first_row = do_2D_norm ? std::max(current_row - radius, min_top) : 0; - const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0; - const int first_slice = std::max(current_slice - radius, min_left); - const int last_slice = std::min(current_slice + radius, max_right); - - // Accumulate 2D In-Map values - qint8x16_t accu = vdupq_n_qs8(0); - for(int j = first_row; j <= last_row; ++j) - { - // Compute row displacement - const int row = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y]; - const uint8_t *const input_squared_ptr = input_squared.ptr() + row - (current_slice * input_squared_stride); - for(int i = first_slice; i <= last_slice; ++i) - { - accu = vqaddq_qs8(accu, vld1q_qs8(reinterpret_cast(input_squared_ptr + i * input_squared_stride))); - } - } - - // Normalize - const qint8x16_t accu_scale = vqmlaq_qs8(kappa_vec, coeff_vec, accu, fixed_point_position); - const qint8x16_t normalized = vqpowq_qs8(accu_scale, beta_vec, fixed_point_position); - const qint8x16_t normalized_pixel = vdivq_qs8(vld1q_qs8(reinterpret_cast(input.ptr())), normalized, fixed_point_position); - vst1q_qs8(reinterpret_cast(output.ptr()), normalized_pixel); - }, - input, input_squared, output); - } - else if(dt == DataType::QS16) - { - const qint16x8_t coeff_vec = vdupq_n_qs16_f32(_norm_info.scale_coeff(), fixed_point_position); - const qint16x8_t beta_vec = vdupq_n_qs16_f32(_norm_info.beta(), fixed_point_position); - const qint16x8_t kappa_vec = vdupq_n_qs16_f32(_norm_info.kappa(), fixed_point_position); - - execute_window_loop(window, [&](const Coordinates & id) - { - // Get range to normalize - const int current_row = do_2D_norm ? id[dim_y] : 0; - const int current_slice = id[dim]; - const int first_row = do_2D_norm ? std::max(current_row - radius, min_top) : 0; - const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0; - const int first_slice = std::max(current_slice - radius, min_left); - const int last_slice = std::min(current_slice + radius, max_right); - - // Accumulate 2D In-Map values - qint16x8_t accu = vdupq_n_qs16(0); - for(int j = first_row; j <= last_row; ++j) - { - // Compute row displacement - const int row = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y]; - const uint8_t *const input_squared_ptr = input_squared.ptr() + row - (current_slice * input_squared_stride); - for(int i = first_slice; i <= last_slice; ++i) - { - accu = vqaddq_qs16(accu, vld1q_qs16(reinterpret_cast(input_squared_ptr + i * input_squared_stride))); - } - } - - // Normalize - const qint16x8_t accu_scale = vqmlaq_qs16(kappa_vec, coeff_vec, accu, fixed_point_position); - const qint16x8_t normalized = vqpowq_qs16(accu_scale, beta_vec, fixed_point_position); - const qint16x8_t normalized_pixel = vdivq_qs16(vld1q_qs16(reinterpret_cast(input.ptr())), normalized, fixed_point_position); - vst1q_qs16(reinterpret_cast(output.ptr()), normalized_pixel); - }, - input, input_squared, output); - } - else - { - ARM_COMPUTE_ERROR("Not supported"); - } -} - Status NENormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *input_squared, const ITensorInfo *output, const NormalizationLayerInfo norm_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, input_squared, output, norm_info)); diff --git a/src/core/NEON/kernels/NEPermuteKernel.cpp b/src/core/NEON/kernels/NEPermuteKernel.cpp index ae1d48cc69..e9bc8effc6 100644 --- a/src/core/NEON/kernels/NEPermuteKernel.cpp +++ b/src/core/NEON/kernels/NEPermuteKernel.cpp @@ -45,8 +45,8 @@ namespace { Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PermutationVector &perm) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8, - DataType::U16, DataType::S16, DataType::QS16, + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, + DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MSG((perm.num_dimensions() == 3 && !(perm[0] == 2 && perm[1] == 0 && perm[2] == 1) && !(perm[0] == 1 && perm[1] == 2 && perm[2] == 0)), @@ -59,7 +59,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); } return Status{}; diff --git a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp index 193ca3799c..0ec7e823a1 100644 --- a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp +++ b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp @@ -61,9 +61,9 @@ inline Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *i ARM_COMPUTE_UNUSED(overflow_policy); ARM_COMPUTE_UNUSED(rounding_policy); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::U8 && (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8), "Output can only be U8 if both inputs are U8"); @@ -71,14 +71,6 @@ inline Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *i ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), "Wrong shape for output"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); - if(is_data_type_fixed_point(input1->data_type()) || is_data_type_fixed_point(input2->data_type()) || is_data_type_fixed_point(output->data_type())) - { - // Check that all data types are the same and all fixed-point positions are the same - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input1, input2, output); - // Check if scale is representable in fixed-point with the provided settings - ARM_COMPUTE_RETURN_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(scale, input1); - } - if(std::abs(scale - scale255_constant) < 0.00001f) { ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_NEAREST_UP && rounding_policy != RoundingPolicy::TO_NEAREST_EVEN); @@ -120,11 +112,6 @@ inline std::pair validate_and_configure_window(ITensorInfo *inpu { set_format_if_unknown(*output, Format::F16); } - else if(input1->data_type() == DataType::QS8 && input2->data_type() == DataType::QS8) - { - set_data_type_if_unknown(*output, DataType::QS8); - set_fixed_point_position_if_zero(*output, input1->fixed_point_position()); - } } // Configure kernel window @@ -219,105 +206,6 @@ void mul_U8_U8_U8_n(const void *__restrict input1_ptr, const void *__restrict in } } -template -void mul_QS8_QS8_QS8_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int n, int fixed_point_position) -{ - const auto output = static_cast(output_ptr); - - const qint8x16_t ta1 = vld1q_qs8(static_cast(input1_ptr)); - const qint8x16_t ta2 = vld1q_qs8(static_cast(input2_ptr)); - - if(is_scale255) - { - qint16x8_t tmp1_high = vmovl_s8(vget_high_s8(ta1)); - qint16x8_t tmp1_low = vmovl_s8(vget_low_s8(ta1)); - const qint16x8_t tmp2_high = vmovl_s8(vget_high_s8(ta2)); - const qint16x8_t tmp2_low = vmovl_s8(vget_low_s8(ta2)); - - const float32x4x2_t scale255_f32 = - { - { - scale255_constant_f32q, - scale255_constant_f32q - } - }; - const qint16x8_t scale255 = vqcvtq_qs16_f32(scale255_f32, fixed_point_position); - - tmp1_high = vmulq_qs16(tmp1_high, tmp2_high, fixed_point_position); - tmp1_low = vmulq_qs16(tmp1_low, tmp2_low, fixed_point_position); - tmp1_high = vmulq_qs16(tmp1_high, scale255, fixed_point_position); - tmp1_low = vmulq_qs16(tmp1_low, scale255, fixed_point_position); - - if(is_sat) - { - vst1q_qs8(output, vcombine_s8(vqmovn_s16(tmp1_low), vqmovn_s16(tmp1_high))); - } - else - { - vst1q_qs8(output, vcombine_s8(vmovn_s16(tmp1_low), vmovn_s16(tmp1_high))); - } - } - else - { - const qint8x16_t vn = vdupq_n_s8(-n); - qint8x16_t res = ta2; - - if(is_sat) - { - res = vqshlq_s8(vqmulq_qs8(ta1, res, fixed_point_position), vn); - } - else - { - res = vshlq_s8(vmulq_qs8(ta1, res, fixed_point_position), vn); - } - vst1q_qs8(output, res); - } -} - -template -void mul_QS16_QS16_QS16_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int n, int fixed_point_position) -{ - const qint16x8x2_t ta1 = vld2q_qs16(static_cast(input1_ptr)); - qint16x8x2_t res = vld2q_qs16(static_cast(input2_ptr)); - - if(is_scale255) - { - const float32x4x2_t scale255_f32 = - { - { - scale255_constant_f32q, - scale255_constant_f32q - } - }; - const qint16x8_t scale255 = vqcvtq_qs16_f32(scale255_f32, fixed_point_position); - if(is_sat) - { - res.val[0] = vqmulq_qs16(vqmulq_qs16(ta1.val[0], res.val[0], fixed_point_position), scale255, fixed_point_position); - res.val[1] = vqmulq_qs16(vqmulq_qs16(ta1.val[1], res.val[1], fixed_point_position), scale255, fixed_point_position); - } - else - { - res.val[0] = vmulq_qs16(vmulq_qs16(ta1.val[0], res.val[0], fixed_point_position), scale255, fixed_point_position); - res.val[1] = vmulq_qs16(vmulq_qs16(ta1.val[1], res.val[1], fixed_point_position), scale255, fixed_point_position); - } - } - else - { - const qint16x8_t vn = vdupq_n_s16(-n); - if(is_sat) - { - res.val[0] = vqshlq_s16(vqmulq_qs16(ta1.val[0], res.val[0], fixed_point_position), vn); - res.val[1] = vqshlq_s16(vqmulq_qs16(ta1.val[1], res.val[1], fixed_point_position), vn); - } - else - { - res.val[0] = vshlq_s16(vmulq_qs16(ta1.val[0], res.val[0], fixed_point_position), vn); - res.val[1] = vshlq_s16(vmulq_qs16(ta1.val[1], res.val[1], fixed_point_position), vn); - } - } - vst2q_s16(static_cast(output_ptr), res); -} - template inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &input1, const int16x8_t &input2, int n) { @@ -529,7 +417,7 @@ void mul_U8_S16_S16_n(const void *__restrict input1_ptr, const void *__restrict } // namespace NEPixelWiseMultiplicationKernel::NEPixelWiseMultiplicationKernel() - : _func_float(nullptr), _func_int(nullptr), _func_q_int(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _scale{ 0 }, _scale_exponent{ 0 } + : _func_float(nullptr), _func_int(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _scale{ 0 }, _scale_exponent{ 0 } { } @@ -550,7 +438,6 @@ void NEPixelWiseMultiplicationKernel::configure(const ITensor *input1, const ITe _scale = scale; _scale_exponent = 0; _func_int = nullptr; - _func_q_int = nullptr; _func_float = nullptr; bool is_scale_255 = false; @@ -630,28 +517,6 @@ void NEPixelWiseMultiplicationKernel::configure(const ITensor *input1, const ITe _func_int = is_sat ? &mul_U8_U8_S16_n : &mul_U8_U8_S16_n; } } - else if(DataType::QS8 == dt_input1 && DataType::QS8 == dt_input2 && DataType::QS8 == dt_output) - { - if(is_scale_255) - { - _func_q_int = is_sat ? &mul_QS8_QS8_QS8_n : &mul_QS8_QS8_QS8_n; - } - else - { - _func_q_int = is_sat ? &mul_QS8_QS8_QS8_n : &mul_QS8_QS8_QS8_n; - } - } - else if(DataType::QS16 == dt_input1 && DataType::QS16 == dt_input2 && DataType::QS16 == dt_output) - { - if(is_scale_255) - { - _func_q_int = is_sat ? &mul_QS16_QS16_QS16_n : &mul_QS16_QS16_QS16_n; - } - else - { - _func_q_int = is_sat ? &mul_QS16_QS16_QS16_n : &mul_QS16_QS16_QS16_n; - } - } else if(DataType::F16 == dt_input1 && DataType::F16 == dt_input2 && DataType::F16 == dt_output) { _func_float = &mul_F16_F16_F16_n; @@ -724,17 +589,6 @@ void NEPixelWiseMultiplicationKernel::run(const Window &window, const ThreadInfo }, input1, input2, output); } - else if(_func_q_int != nullptr) - { - int fixed_point_position = _input1->info()->fixed_point_position(); - execute_window_loop(collapsed, [&](const Coordinates & id) - { - (*_func_q_int)(input1.ptr(), input2.ptr(), output.ptr(), _scale_exponent, fixed_point_position); - collapsed.slide_window_slice_3D(slice_input1); - collapsed.slide_window_slice_3D(slice_input2); - }, - input1, input2, output); - } else { ARM_COMPUTE_ERROR_ON(_func_float == nullptr); diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp index 7877cf5cc0..e586b72d30 100644 --- a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp +++ b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp @@ -25,7 +25,6 @@ #include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/Error.h" -#include "arm_compute/core/FixedPoint.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/NEON/NEAsymm.h" @@ -79,32 +78,6 @@ inline float calculate_avg_scale(const Coordinates &id, const int pool_size_x, c return 1.f / ((end_y - start_y) * (end_x - start_x)); } -inline qint8_t calculate_avg_scale_q8(const Coordinates &id, int pool_size, int upper_bound_w, int upper_bound_h, - int pad_x, int pad_y, int stride_x, int stride_y, int fixed_point_position) -{ - static const std::array scale_values_q8 = - { { 0x0, 0x0, 0x40, 0x2A, 0x20, 0x19, 0x15, 0x12, 0x10, 0xE } }; - const int start_x = id.x() * stride_x - pad_x; - const int start_y = id.y() * stride_y - pad_y; - const int end_x = std::min(start_x + pool_size, upper_bound_w); - const int end_y = std::min(start_y + pool_size, upper_bound_h); - const int val = ((end_y - start_y) * (end_x - start_x)); - return sshr_qs8(scale_values_q8[val], (7 - fixed_point_position)); -} - -inline qint16_t calculate_avg_scale_q16(const Coordinates &id, int pool_size, int upper_bound_w, int upper_bound_h, - int pad_x, int pad_y, int stride_x, int stride_y, int fixed_point_position) -{ - static std::array scale_values_q16 = - { { 0x0, 0x0, 0x4000, 0x2AAB, 0x2000, 0x199A, 0x1555, 0x1249, 0x1000, 0xE38 } }; - const int start_x = id.x() * stride_x - pad_x; - const int start_y = id.y() * stride_y - pad_y; - const int end_x = std::min(start_x + pool_size, upper_bound_w); - const int end_y = std::min(start_y + pool_size, upper_bound_h); - const int val = ((end_y - start_y) * (end_x - start_x)); - return sshr_qs16(scale_values_q16[val], (15 - fixed_point_position)); -} - template inline void scale_vector_s16x8(uint16x8_t &v, const Coordinates &id, int id_offset, int step, const int pool_size, const int upper_bound_w, const int upper_bound_h, @@ -163,22 +136,18 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c int pool_stride_y = 0; PoolingType pool_type = pool_info.pool_type(); const PadStrideInfo pad_stride_info = pool_info.pad_stride_info(); - const bool exclude_padding = pool_info.exclude_padding(); std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); static const std::set supported_pool_sizes = { 2, 3 }; - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_quantized(input->data_type())); ARM_COMPUTE_RETURN_ERROR_ON((supported_pool_sizes.find(pool_size_x) == supported_pool_sizes.end()) && ((input->data_type() != DataType::F32) && (input->data_type() != DataType::QASYMM8)) && (pool_type != PoolingType::MAX)); - ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_fixed_point(input->data_type()) && pool_stride_x > 2); - ARM_COMPUTE_RETURN_ERROR_ON(exclude_padding && is_data_type_fixed_point(input->data_type())); if(output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH)) != pooled_w) || (output->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT)) != pooled_h)); @@ -236,22 +205,6 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen { switch(input->data_type()) { - case DataType::QS8: - num_elems_read_per_iteration = 16; - switch(pool_size_x) - { - case 2: - num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16; - num_elems_processed_per_iteration = (pool_stride_x == 2) ? 8 : 15; - break; - case 3: - num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16; - num_elems_processed_per_iteration = (pool_stride_x == 2) ? 7 : 14; - break; - default: - break; - } - break; case DataType::QASYMM8: if(is_nhwc) { @@ -274,22 +227,6 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen break; } break; - case DataType::QS16: - num_elems_read_per_iteration = 8; - switch(pool_size_x) - { - case 2: - num_elems_horizontal_window = (pool_stride_x == 2) ? 4 : 8; - num_elems_processed_per_iteration = (pool_stride_x == 2) ? 4 : 7; - break; - case 3: - num_elems_horizontal_window = (pool_stride_x == 2) ? 4 : 8; - num_elems_processed_per_iteration = (pool_stride_x == 2) ? 3 : 6; - break; - default: - break; - } - break; #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: if(is_nhwc) @@ -462,64 +399,7 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons const DataType data_type = input->info()->data_type(); const bool is_nchw = data_layout == DataLayout::NCHW; - // Select appropriate function - if(data_type == DataType::QS8) - { - if(_is_square) - { - switch(pool_size_x) - { - case 2: - switch(pool_type) - { - case PoolingType::AVG: - _func = &NEPoolingLayerKernel::pooling2_q8_nchw; - break; - case PoolingType::MAX: - _func = &NEPoolingLayerKernel::pooling2_q8_nchw; - break; - default: - ARM_COMPUTE_ERROR("Unsupported pooling type!"); - } - break; - case 3: - switch(pool_type) - { - case PoolingType::AVG: - _func = &NEPoolingLayerKernel::pooling3_q8_nchw; - break; - case PoolingType::MAX: - _func = &NEPoolingLayerKernel::pooling3_q8_nchw; - break; - default: - ARM_COMPUTE_ERROR("Unsupported pooling type!"); - } - break; - default: - switch(pool_type) - { - case PoolingType::MAX: - _func = &NEPoolingLayerKernel::poolingMxN_q8_nchw; - break; - default: - ARM_COMPUTE_ERROR("Unsupported pooling type!"); - } - break; - } - } - else - { - switch(pool_type) - { - case PoolingType::MAX: - _func = &NEPoolingLayerKernel::poolingMxN_q8_nchw; - break; - default: - ARM_COMPUTE_ERROR("Unsupported pooling type!"); - } - } - } - else if(data_type == DataType::QASYMM8) + if(data_type == DataType::QASYMM8) { if(pool_size_x == 2 && pool_stride_x < 3 && _is_square) { @@ -606,62 +486,6 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons } } } - else if(data_type == DataType::QS16) - { - if(_is_square) - { - switch(pool_size_x) - { - case 2: - switch(pool_type) - { - case PoolingType::AVG: - _func = &NEPoolingLayerKernel::pooling2_q16_nchw; - break; - case PoolingType::MAX: - _func = &NEPoolingLayerKernel::pooling2_q16_nchw; - break; - default: - ARM_COMPUTE_ERROR("Unsupported pooling type!"); - } - break; - case 3: - switch(pool_type) - { - case PoolingType::AVG: - _func = &NEPoolingLayerKernel::pooling3_q16_nchw; - break; - case PoolingType::MAX: - _func = &NEPoolingLayerKernel::pooling3_q16_nchw; - break; - default: - ARM_COMPUTE_ERROR("Unsupported pooling type!"); - } - break; - default: - switch(pool_type) - { - case PoolingType::MAX: - _func = &NEPoolingLayerKernel::poolingMxN_q16_nchw; - break; - default: - ARM_COMPUTE_ERROR("Unsupported pooling type!"); - } - break; - } - } - else - { - switch(pool_type) - { - case PoolingType::MAX: - _func = &NEPoolingLayerKernel::poolingMxN_q16_nchw; - break; - default: - ARM_COMPUTE_ERROR("Unsupported pooling type!"); - } - } - } else if(data_type == DataType::F16) { if(_is_square) @@ -1022,71 +846,6 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons INEKernel::configure(win_config.second); } -template -void NEPoolingLayerKernel::pooling2_q8_nchw(const Window &window_input, const Window &window) -{ - Iterator input(_input, window_input); - Iterator output(_output, window); - - const int fixed_point_position = _input->info()->fixed_point_position(); - constexpr int pool_size = 2; - int pool_stride_x = 0; - int pool_stride_y = 0; - const int pool_pad_right = _pool_info.pad_stride_info().pad_right(); - const int pool_pad_top = _pool_info.pad_stride_info().pad_top(); - const int pool_pad_left = _pool_info.pad_stride_info().pad_left(); - const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom(); - std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride(); - const int upper_bound_w = _input->info()->dimension(0) + pool_pad_right; - const int upper_bound_h = _input->info()->dimension(1) + pool_pad_bottom; - - const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top))); - const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 1)); - - execute_window_loop(window, [&](const Coordinates & id) - { - const auto top_data = vld1q_qs8(reinterpret_cast(input_top_ptr + input.offset())); - const auto bottom_data = vld1q_qs8(reinterpret_cast(input_bottom_ptr + input.offset())); - qint8x8_t lower_res = {}; - qint8x8_t upper_res = {}; - if(pooling_type == PoolingType::AVG) - { - // Calculate scale - const qint8_t scale = calculate_avg_scale_q8(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y, fixed_point_position); - const qint8x8_t scale_vec = vdup_n_qs8(scale); - - // Perform pooling - const qint8x16_t sum_data = vqaddq_qs8(top_data, bottom_data); - lower_res = vqmul_qs8(vpadd_s8(vget_low_s8(sum_data), vget_high_s8(sum_data)), scale_vec, fixed_point_position); - if(pool_stride_x == 1) - { - const qint8x16_t sum_data_shifted = vextq_s8(sum_data, sum_data, 1); - upper_res = vqmul_qs8(vpadd_s8(vget_low_s8(sum_data_shifted), vget_high_s8(sum_data_shifted)), scale_vec, fixed_point_position); - } - } - else - { - const qint8x16_t max_data = vmaxq_s8(top_data, bottom_data); - lower_res = vpmax_s8(vget_low_s8(max_data), vget_high_s8(max_data)); - if(pool_stride_x == 1) - { - const qint8x16_t max_data_shifted = vextq_s8(max_data, max_data, 1); - upper_res = vpmax_s8(vget_low_s8(max_data_shifted), vget_high_s8(max_data_shifted)); - } - } - if(pool_stride_x == 1) - { - const qint8x8x2_t res = { { lower_res, upper_res } }; - vst2_s8(reinterpret_cast(output.ptr()), res); - } - else - { - vst1_qs8(reinterpret_cast(output.ptr()), lower_res); - } - }, - input, output); -} - template void NEPoolingLayerKernel::pooling2_qasymm8_nchw(const Window &window_input, const Window &window) { @@ -1201,71 +960,6 @@ void NEPoolingLayerKernel::pooling2_qasymm8_nchw(const Window &window_input, con input, output); } -template -void NEPoolingLayerKernel::pooling2_q16_nchw(const Window &window_input, const Window &window) -{ - Iterator input(_input, window_input); - Iterator output(_output, window); - - const int fixed_point_position = _input->info()->fixed_point_position(); - constexpr int pool_size = 2; - const int pool_pad_right = _pool_info.pad_stride_info().pad_right(); - const int pool_pad_top = _pool_info.pad_stride_info().pad_top(); - const int pool_pad_left = _pool_info.pad_stride_info().pad_left(); - const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; - std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride(); - const int upper_bound_w = _input->info()->dimension(0) + pool_pad_right; - const int upper_bound_h = _input->info()->dimension(1) + pool_pad_bottom; - - const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top))); - const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 1)); - - execute_window_loop(window, [&](const Coordinates & id) - { - const auto top_data = vld1q_qs16(reinterpret_cast(input_top_ptr + input.offset())); - const auto bottom_data = vld1q_qs16(reinterpret_cast(input_bottom_ptr + input.offset())); - qint16x4_t lower_res = {}; - qint16x4_t upper_res = {}; - if(pooling_type == PoolingType::AVG) - { - // Calculate scale - const qint16_t scale = calculate_avg_scale_q16(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y, fixed_point_position); - const qint16x4_t scale_vec = vdup_n_qs16(scale); - - // Perform pooling - const qint16x8_t sum_data = vqaddq_qs16(top_data, bottom_data); - lower_res = vqmul_qs16(vpadd_s16(vget_low_s16(sum_data), vget_high_s16(sum_data)), scale_vec, fixed_point_position); - if(pool_stride_x == 1) - { - const qint16x8_t sum_data_shifted = vextq_s16(sum_data, sum_data, 1); - upper_res = vqmul_qs16(vpadd_s16(vget_low_s16(sum_data_shifted), vget_high_s16(sum_data_shifted)), scale_vec, fixed_point_position); - } - } - else - { - const qint16x8_t max_data = vmaxq_s16(top_data, bottom_data); - lower_res = vpmax_s16(vget_low_s16(max_data), vget_high_s16(max_data)); - if(pool_stride_x == 1) - { - const qint16x8_t max_data_shifted = vextq_s16(max_data, max_data, 1); - upper_res = vpmax_s16(vget_low_s16(max_data_shifted), vget_high_s16(max_data_shifted)); - } - } - if(pool_stride_x == 1) - { - const qint16x4x2_t res = { { lower_res, upper_res } }; - vst2_s16(reinterpret_cast(output.ptr()), res); - } - else - { - vst1_qs16(reinterpret_cast(output.ptr()), lower_res); - } - }, - input, output); -} - template void NEPoolingLayerKernel::pooling3_f16_nchw(const Window &window_input, const Window &window) { @@ -1461,82 +1155,6 @@ void NEPoolingLayerKernel::pooling2_f32_nchw(const Window &window_input, const W input, output); } -template -void NEPoolingLayerKernel::pooling3_q8_nchw(const Window &window_input, const Window &window) -{ - Iterator input(_input, window_input); - Iterator output(_output, window); - - const int fixed_point_position = _input->info()->fixed_point_position(); - constexpr int pool_size = 3; - const int pool_pad_right = _pool_info.pad_stride_info().pad_right(); - const int pool_pad_top = _pool_info.pad_stride_info().pad_top(); - const int pool_pad_left = _pool_info.pad_stride_info().pad_left(); - const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; - std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride(); - const int upper_bound_w = _input->info()->dimension(0) + pool_pad_right; - const int upper_bound_h = _input->info()->dimension(1) + pool_pad_bottom; - - const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top))); - const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 1)); - const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 2)); - - execute_window_loop(window, [&](const Coordinates & id) - { - const auto top_data = vld1q_qs8(reinterpret_cast(input_top_ptr + input.offset())); - const auto middle_data = vld1q_qs8(reinterpret_cast(input_middle_ptr + input.offset())); - const auto bottom_data = vld1q_qs8(reinterpret_cast(input_bottom_ptr + input.offset())); - qint8x8_t res = {}; - if(pooling_type == PoolingType::AVG) - { - // Calculate scale - const qint8_t scale = calculate_avg_scale_q8(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y, fixed_point_position); - - // Perform pooling for stride 2 - const qint8x16_t sum_data = vqaddq_qs8(vqaddq_qs8(top_data, bottom_data), middle_data); - const qint8x16_t sum_data2 = vextq_s8(sum_data, sum_data, 1); - const qint8x16_t sum_data3 = vextq_s8(sum_data, sum_data, 2); - const qint8x16_t final_sum = vqaddq_qs8(vqaddq_qs8(sum_data, sum_data2), sum_data3); - if(pool_stride_x == 2) - { - const qint8x8x2_t table = { { vget_low_s8(final_sum), vget_high_s8(final_sum) } }; - static const qint8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 }; - const qint8x8_t scale_vec = vdup_n_qs8(scale); - res = vtbl2_s8(table, lookup_val); - res = vqmul_qs8(res, scale_vec, fixed_point_position); - vst1_qs8(reinterpret_cast(output.ptr()), res); - } - else - { - const qint8x16_t scale_vec = vdupq_n_qs8(scale); - vst1q_qs8(reinterpret_cast(output.ptr()), vqmulq_qs8(final_sum, scale_vec, fixed_point_position)); - } - } - else - { - const qint8x16_t max_data = vmaxq_s8(vmaxq_s8(top_data, bottom_data), middle_data); - const qint8x16_t max_data2 = vextq_s8(max_data, max_data, 1); - const qint8x16_t max_data3 = vextq_s8(max_data, max_data, 2); - const qint8x16_t final_max = vmaxq_s8(vmaxq_s8(max_data, max_data2), max_data3); - - if(pool_stride_x == 2) - { - const qint8x8x2_t table = { { vget_low_s8(final_max), vget_high_s8(final_max) } }; - static const qint8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 }; - res = vtbl2_s8(table, lookup_val); - vst1_qs8(reinterpret_cast(output.ptr()), res); - } - else - { - vst1q_qs8(reinterpret_cast(output.ptr()), final_max); - } - } - }, - input, output); -} - template void NEPoolingLayerKernel::pooling3_qasymm8_nchw(const Window &window_input, const Window &window) { @@ -1657,77 +1275,6 @@ void NEPoolingLayerKernel::pooling3_qasymm8_nchw(const Window &window_input, con input, output); } -template -void NEPoolingLayerKernel::pooling3_q16_nchw(const Window &window_input, const Window &window) -{ - Iterator input(_input, window_input); - Iterator output(_output, window); - - const int fixed_point_position = _input->info()->fixed_point_position(); - constexpr int pool_size = 3; - const int pool_pad_right = _pool_info.pad_stride_info().pad_right(); - const int pool_pad_top = _pool_info.pad_stride_info().pad_top(); - const int pool_pad_left = _pool_info.pad_stride_info().pad_left(); - const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; - std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride(); - const int upper_bound_w = _input->info()->dimension(0) + pool_pad_right; - const int upper_bound_h = _input->info()->dimension(1) + pool_pad_bottom; - - const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top))); - const unsigned char *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 1)); - const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast(pool_pad_left), -static_cast(pool_pad_top) + 2)); - - execute_window_loop(window, [&](const Coordinates & id) - { - const auto top_data = vld1q_qs16(reinterpret_cast(input_top_ptr + input.offset())); - const auto middle_data = vld1q_qs16(reinterpret_cast(input_middle_ptr + input.offset())); - const auto bottom_data = vld1q_qs16(reinterpret_cast(input_bottom_ptr + input.offset())); - - if(pooling_type == PoolingType::AVG) - { - // Calculate scale - const qint16_t scale = calculate_avg_scale_q16(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y, fixed_point_position); - - // Perform pooling for stride 2 - const qint16x8_t sum_data = vqaddq_qs16(vqaddq_qs16(top_data, bottom_data), middle_data); - const qint16x8_t sum_data2 = vextq_s16(sum_data, sum_data, 1); - const qint16x8_t sum_data3 = vextq_s16(sum_data, sum_data, 2); - const qint16x8_t final_sum = vqaddq_qs16(vqaddq_qs16(sum_data, sum_data2), sum_data3); - if(pool_stride_x == 2) - { - const qint16x4_t tmp = { vgetq_lane_s16(final_sum, 0), vgetq_lane_s16(final_sum, 2), vgetq_lane_s16(final_sum, 4), vgetq_lane_s16(final_sum, 6) }; - const qint16x4_t scale_vec = vdup_n_qs16(scale); - vst1_qs16(reinterpret_cast(output.ptr()), vqmul_qs16(tmp, scale_vec, fixed_point_position)); - } - else - { - const qint16x8_t scale_vec = vdupq_n_qs16(scale); - vst1q_qs16(reinterpret_cast(output.ptr()), vqmulq_qs16(final_sum, scale_vec, fixed_point_position)); - } - } - else - { - const qint16x8_t max_data = vmaxq_s16(vmaxq_s16(top_data, bottom_data), middle_data); - const qint16x8_t max_data2 = vextq_s16(max_data, max_data, 1); - const qint16x8_t max_data3 = vextq_s16(max_data, max_data, 2); - const qint16x8_t final_max = vmaxq_s16(vmaxq_s16(max_data, max_data2), max_data3); - - if(pool_stride_x == 2) - { - const qint16x4_t tmp = { vgetq_lane_s16(final_max, 0), vgetq_lane_s16(final_max, 2), vgetq_lane_s16(final_max, 4), vgetq_lane_s16(final_max, 6) }; - vst1_qs16(reinterpret_cast(output.ptr()), tmp); - } - else - { - vst1q_qs16(reinterpret_cast(output.ptr()), final_max); - } - } - }, - input, output); -} - template void NEPoolingLayerKernel::pooling3_f32_nchw(const Window &window_input, const Window &window) { @@ -1879,110 +1426,6 @@ void NEPoolingLayerKernel::pooling7_f32_nchw(const Window &window_input, const W input, output); } -template -void NEPoolingLayerKernel::poolingMxN_q8_nchw(const Window &window_input, const Window &window) -{ - Iterator input(_input, window_input); - Iterator output(_output, window); - - const int pool_size_x = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().x() : _pool_info.pool_size().width; - const int pool_size_y = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().y() : _pool_info.pool_size().height; - const int pool_pad_top = _pool_info.pad_stride_info().pad_top(); - const int pool_pad_left = _pool_info.pad_stride_info().pad_left(); - int pool_stride_x = 0; - int pool_stride_y = 0; - std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride(); - - execute_window_loop(window, [&](const Coordinates & id) - { - qint8x16_t vres = {}; - qint8_t res = {}; - - //PoolingType::MAX - for(int y = 0; y < pool_size_y; ++y) - { - int x = 0; - for(; x <= (pool_size_x - 16); x += 16) - { - const qint8x16_t data = vld1q_qs8(reinterpret_cast(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() + - (y - pool_pad_top) * _input->info()->strides_in_bytes().y())); - vres = vmaxq_s8(vres, data); - } - - // Leftover for loop - for(; x < pool_size_x; ++x) - { - qint8_t data = *(reinterpret_cast(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() + (y - pool_pad_top) * _input->info()->strides_in_bytes().y())); - res = std::max(res, data); - } - } - //Reduce - const qint8x8_t half_vres = vpmax_s8(vget_low_s8(vres), vget_high_s8(vres)); - res = std::max(res, vget_lane_s8(half_vres, 0)); - res = std::max(res, vget_lane_s8(half_vres, 1)); - res = std::max(res, vget_lane_s8(half_vres, 2)); - res = std::max(res, vget_lane_s8(half_vres, 3)); - res = std::max(res, vget_lane_s8(half_vres, 4)); - res = std::max(res, vget_lane_s8(half_vres, 5)); - res = std::max(res, vget_lane_s8(half_vres, 6)); - res = std::max(res, vget_lane_s8(half_vres, 7)); - - // Store result - *(reinterpret_cast(output.ptr())) = res; - }, - input, output); -} - -template -void NEPoolingLayerKernel::poolingMxN_q16_nchw(const Window &window_input, const Window &window) -{ - Iterator input(_input, window_input); - Iterator output(_output, window); - - const int pool_size_x = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().x() : _pool_info.pool_size().width; - const int pool_size_y = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().y() : _pool_info.pool_size().height; - const int pool_pad_top = _pool_info.pad_stride_info().pad_top(); - const int pool_pad_left = _pool_info.pad_stride_info().pad_left(); - int pool_stride_x = 0; - int pool_stride_y = 0; - std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride(); - - execute_window_loop(window, [&](const Coordinates & id) - { - qint16x8_t vres = {}; - qint16_t res = {}; - - //PoolingType::MAX - for(int y = 0; y < pool_size_y; ++y) - { - int x = 0; - for(; x <= (pool_size_x - 8); x += 8) - { - const qint16x8_t data = vld1q_qs16(reinterpret_cast(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() + - (y - pool_pad_top) * _input->info()->strides_in_bytes().y())); - vres = vmaxq_s16(vres, data); - } - - // Leftover for loop - for(; x < pool_size_x; ++x) - { - qint16_t data = *(reinterpret_cast(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() + (y - pool_pad_top) * _input->info()->strides_in_bytes().y())); - res = std::max(res, data); - } - } - //Reduce - const qint16x4_t half_vres = vpmax_s16(vget_low_s16(vres), vget_high_s16(vres)); - res = std::max(res, vget_lane_s16(half_vres, 0)); - res = std::max(res, vget_lane_s16(half_vres, 1)); - res = std::max(res, vget_lane_s16(half_vres, 2)); - res = std::max(res, vget_lane_s16(half_vres, 3)); - - // Store result - *(reinterpret_cast(output.ptr())) = res; - }, - input, output); -} - template void NEPoolingLayerKernel::poolingMxN_f16_nchw(const Window &window_input, const Window &window) { @@ -2688,8 +2131,6 @@ void NEPoolingLayerKernel::run(const Window &window, const ThreadInfo &info) unsigned int window_x_inc = 0; switch(_input->info()->data_type()) { - case DataType::QS8: - case DataType::QS16: case DataType::F16: { window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration; diff --git a/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp b/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp index ee23e76c5c..b49400ab7d 100644 --- a/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp +++ b/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp @@ -54,7 +54,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c std::tuple validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *min_max) { // Output tensor auto initialization if not yet initialized - auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::U8, 0); + auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::U8); constexpr unsigned int num_elems_processed_per_iteration = 8; diff --git a/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp index a209a523d3..4d908db77b 100644 --- a/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp +++ b/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -51,7 +51,7 @@ void NEROIPoolingLayerKernel::configure(const ITensor *input, const IROIArray *r // Output auto inizialitation if not yet initialized TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->info()->dimension(2), rois->num_values()); - auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position()); + auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type()); ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pool_info.pooled_width()) || (output->info()->dimension(1) != pool_info.pooled_height())); diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.cpp b/src/core/NEON/kernels/NEReductionOperationKernel.cpp index 30d42fa25f..30f21bbf33 100644 --- a/src/core/NEON/kernels/NEReductionOperationKernel.cpp +++ b/src/core/NEON/kernels/NEReductionOperationKernel.cpp @@ -134,7 +134,7 @@ std::tuple validate_and_configure_window(ITensorInfo *input, ITe const TensorShape output_shape = calculate_output_shape(input->tensor_shape(), axis); // Output auto initialization if not yet initialized - auto_init_if_empty(*output, output_shape, 1, input->data_type(), input->fixed_point_position()); + auto_init_if_empty(*output, output_shape, 1, input->data_type()); unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->data_type()); diff --git a/src/core/NEON/kernels/NEReshapeLayerKernel.cpp b/src/core/NEON/kernels/NEReshapeLayerKernel.cpp index 45ba68d9fa..d6f470445f 100644 --- a/src/core/NEON/kernels/NEReshapeLayerKernel.cpp +++ b/src/core/NEON/kernels/NEReshapeLayerKernel.cpp @@ -59,11 +59,10 @@ inline void reshape_tensor(const Window &window, const ITensor *input, ITensor * void NEReshapeLayerKernel::configure(const ITensor *input, ITensor *output) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::QS8, DataType::U16, DataType::S16, DataType::QS16, + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32); ARM_COMPUTE_ERROR_ON_NULLPTR(output); ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); ARM_COMPUTE_ERROR_ON(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()); _input = input; @@ -94,12 +93,10 @@ void NEReshapeLayerKernel::run(const Window &window, const ThreadInfo &info) case DataType::U8: case DataType::S8: case DataType::QASYMM8: - case DataType::QS8: reshape_tensor(window, _input, _output); break; case DataType::U16: case DataType::S16: - case DataType::QS16: case DataType::F16: reshape_tensor(window, _input, _output); break; diff --git a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp index d91efd267f..9946f002de 100644 --- a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp +++ b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp @@ -194,56 +194,7 @@ T sqadd(T a, T b); template T sqsub(T a, T b); template -T sqmul(T a, T b, int fixed_point_position); - -#define DECLARE_NEON_FUNCTIONS_FOR_FIXED_POINT(TYPET, TYPEU, TAGT, TAGU) \ - inline vec_8_byte_t vqsub(vec_8_byte_t a, vec_8_byte_t b) \ - { \ - return vqsub_##TAGT(a, b); \ - } \ - inline vec_8_byte_t vqadd(vec_8_byte_t a, vec_8_byte_t b) \ - { \ - return vqadd_##TAGU(a, b); \ - } \ - inline vec_16_byte_t vqadd(vec_16_byte_t a, vec_16_byte_t b) \ - { \ - return vqaddq_##TAGU(a, b); \ - } \ - inline vec_8_byte_t vqexp(vec_8_byte_t vec, int fixed_point_position) \ - { \ - return vqexp_q##TAGT(vec, fixed_point_position); \ - } \ - inline auto vmovl(vec_8_byte_t vec)->decltype(vmovl_##TAGT(vec)) \ - { \ - return vmovl_##TAGT(vec); \ - } \ - inline vec_16_byte_t vqrecip(vec_16_byte_t vec, int fixed_point_position) \ - { \ - return vqrecipq_q##TAGT(vec, fixed_point_position); \ - } \ - inline vec_16_byte_t vqmul(vec_16_byte_t a, vec_16_byte_t b, int fixed_point_position) \ - { \ - return vqmulq_q##TAGT(a, b, fixed_point_position); \ - } \ - template <> \ - inline TYPEU sqadd(TYPEU a, TYPEU b) \ - { \ - return sqadd_q##TAGU(a, b); \ - } \ - inline TYPET sqexp(TYPET val, int fixed_point_position) \ - { \ - return sqexp_q##TAGT(val, fixed_point_position); \ - } \ - template <> \ - inline TYPET sqsub(TYPET a, TYPET b) \ - { \ - return sqsub_q##TAGT(a, b); \ - } \ - template <> \ - inline TYPET sqmul(TYPET a, TYPET b, int fixed_point_position) \ - { \ - return sqmul_q##TAGT(a, b, fixed_point_position); \ - } +T sqmul(T a, T b); #define DECLARE_NEON_FUNCTIONS_FOR_FLOAT(TYPE, TAG) \ inline vec_8_byte_t vadd(vec_8_byte_t a, vec_8_byte_t b) \ @@ -278,9 +229,6 @@ DECLARE_NEON_FUNCTIONS_FOR_TYPE(float16_t, f16) #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ DECLARE_NEON_FUNCTIONS_FOR_TYPE(float, f32) -DECLARE_NEON_FUNCTIONS_FOR_FIXED_POINT(int8_t, int16_t, s8, s16) -DECLARE_NEON_FUNCTIONS_FOR_FIXED_POINT(int16_t, int32_t, s16, s32) - #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC DECLARE_NEON_FUNCTIONS_FOR_FLOAT(float16_t, f16) #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ @@ -373,16 +321,15 @@ namespace Status validate_arguments_logits_1d_max(const ITensorInfo &input, const ITensorInfo &output) { #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); #else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::F32); #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ // Validate in case of configured output if(output.total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(&input, &output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input, &output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output.tensor_shape(), TensorShape(input.tensor_shape()).set(0, 1)); } @@ -395,7 +342,7 @@ std::pair validate_and_configure_window_logits_1d_max(ITensorInf // Softmax across the x dimension const TensorShape output_shape = TensorShape(input.tensor_shape()).set(0, 1); // Output auto initialization if not yet initialized - auto_init_if_empty(output, output_shape, 1, input.data_type(), input.fixed_point_position(), input.quantization_info()); + auto_init_if_empty(output, output_shape, 1, input.data_type(), input.quantization_info()); // Configure kernel window const int input_width = input.valid_region().shape.x(); @@ -488,12 +435,6 @@ void NELogits1DMaxKernel::configure(const ITensor *input, ITensor *output) case DataType::QASYMM8: _func = &logits_1d_max; break; - case DataType::QS8: - _func = &logits_1d_max; - break; - case DataType::QS16: - _func = &logits_1d_max; - break; #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: _func = &logits_1d_max; @@ -543,11 +484,12 @@ namespace Status validate_arguments_logits_softmax(const ITensorInfo &input, const ITensorInfo &max, const ITensorInfo &output, const float beta, const ITensorInfo &tmp) { + ARM_COMPUTE_UNUSED(beta); // Check input #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); #else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::F32); #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(input.data_type()); @@ -555,7 +497,6 @@ Status validate_arguments_logits_softmax(const ITensorInfo &input, const ITensor // Check max ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &max); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(TensorShape(input.tensor_shape()).set(0, 1), max.tensor_shape()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(&input, &max); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input, &max); // Check output if configured @@ -564,19 +505,14 @@ Status validate_arguments_logits_softmax(const ITensorInfo &input, const ITensor const QuantizationInfo output_quantization = is_quantized_asymmetric ? QuantizationInfo(1.f / 256.f, 0) : output.quantization_info(); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input, &output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(&input, &output); ARM_COMPUTE_RETURN_ERROR_ON(output.quantization_info() != output_quantization); } - // Check beta - ARM_COMPUTE_RETURN_ERROR_ON((beta != 1.0f) && is_data_type_fixed_point(input.data_type())); - // Check tmp if configured if(tmp.total_size() != 0) { const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : input.data_type(); ARM_COMPUTE_RETURN_ERROR_ON(tmp.data_type() != tmp_data_type); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(&input, &tmp); // We could potentially reduce tmp memory if we could predict or make an assumption // on the maximum number of threads that will run in parallel. ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input, &tmp); @@ -727,88 +663,6 @@ void logits_1d_softmax_qasymm8(const ITensor &in, const ITensor &max, void *cons in_it, max_it, out_it); } -template -void logits_1d_softmax_fixed_point(const ITensor &in, const ITensor &max, void *const tmp, - ITensor &out, const float /*beta*/, const Window &window) -{ - const int start_x = in.info()->valid_region().anchor.x(); - const int input_width = in.info()->valid_region().shape.x(); - - const int fixed_point_position = in.info()->fixed_point_position(); - - Iterator in_it(&in, window); - Iterator max_it(&max, window); - Iterator out_it(&out, window); - - execute_window_loop(window, [&](const Coordinates &) - { - /* Get pointers */ - const auto in_ptr = reinterpret_cast(in_it.ptr()) + start_x; - const auto out_ptr = reinterpret_cast(out_it.ptr()) + start_x; - const auto tmp_ptr = reinterpret_cast(tmp); - - vec_16_byte_t vec_sum_inversed; - - /* Compute exponentials and sum */ - { - /* Get max value */ - const auto max_val = *reinterpret_cast(max_it.ptr()); - const auto vec_max = vdup_n>(max_val); - - /* Init sum to zero */ - auto vec_sum = vdup_n>(0); - - /* Loop over row and compute exponentials and sum */ - int i = 0; - constexpr int vec_size = vec_size_of(vec_sum); - for(; i <= (input_width - vec_size); i += vec_size) - { - auto vec_elements = vld>(in_ptr + i); - vec_elements = vqsub(vec_elements, vec_max); - vec_elements = vqexp(vec_elements, fixed_point_position); - vec_sum = vqadd(vec_sum, vmovl(vec_elements)); - vst(tmp_ptr + i, vec_elements); - } - /* Reduce sum */ - const vec_8_byte_t sum_8_byte = vqadd(vget_high(vec_sum), vget_low(vec_sum)); - U sum = reduce_add(sqadd, sum_8_byte); - - /* Run remaining elements */ - for(; i < input_width; ++i) - { - T element = sqexp(sqsub(in_ptr[i], max_val), fixed_point_position); - sum = sqadd(sum, element); - tmp_ptr[i] = element; - } - - const auto qsum = utility::saturate_cast(sum); - vec_sum_inversed = vqrecip(vdup_n>(qsum), fixed_point_position); - } - - /* Normalize exponentials */ - { - /* Loop over row and compute softmax */ - int i = 0; - constexpr int vec_size = vec_size_of(vec_sum_inversed); - for(; i <= (input_width - vec_size); i += vec_size) - { - const auto vec_in = vld>(tmp_ptr + i); - const vec_16_byte_t normalized_value = vqmul(vec_in, vec_sum_inversed, fixed_point_position); - vst(out_ptr + i, normalized_value); - } - - const T sum_inversed = vget_lane<0>(vec_sum_inversed); - - /* Run remaining elements */ - for(; i < input_width; ++i) - { - out_ptr[i] = sqmul(tmp_ptr[i], sum_inversed, fixed_point_position); - } - } - }, - in_it, max_it, out_it); -} - template void logits_1d_softmax_float(const ITensor &in, const ITensor &max, void *const tmp, ITensor &out, const float beta, const Window &window) @@ -908,12 +762,6 @@ void NELogits1DSoftmaxKernel::configure(const ITensor *input, const ITensor *max case DataType::QASYMM8: _func = &logits_1d_softmax_qasymm8; break; - case DataType::QS8: - _func = &logits_1d_softmax_fixed_point; - break; - case DataType::QS16: - _func = &logits_1d_softmax_fixed_point; - break; #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: _func = &logits_1d_softmax_float; diff --git a/src/core/NEON/kernels/NETransposeKernel.cpp b/src/core/NEON/kernels/NETransposeKernel.cpp index 92271378ff..2630159561 100644 --- a/src/core/NEON/kernels/NETransposeKernel.cpp +++ b/src/core/NEON/kernels/NETransposeKernel.cpp @@ -74,7 +74,7 @@ unsigned int num_elems_processed(size_t element_size) Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8, DataType::U16, DataType::S16, DataType::QS16, DataType::U32, DataType::S32, + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32); @@ -84,7 +84,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); } return Status{}; diff --git a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp index 3031a87637..f398409b26 100644 --- a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp +++ b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp @@ -105,14 +105,13 @@ TensorShape get_output_shape(const ITensorInfo *input, bool has_bias) Status validate_arguments(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); if(biases != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(input->data_type())); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, biases); ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 4) && (biases->num_dimensions() != 1)); ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 5) && (biases->num_dimensions() != 2)); ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 4) && (biases->dimension(0) != input->tensor_shape()[3])); @@ -124,7 +123,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *biases, c { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), get_output_shape(input, biases != nullptr)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); } return Status{}; -- cgit v1.2.1