aboutsummaryrefslogtreecommitdiff
path: root/src/core/NEON/kernels
diff options
context:
space:
mode:
authorVidhya Sudhan Loganathan <vidhyasudhan.loganathan@arm.com>2018-07-04 09:34:00 +0100
committerAnthony Barbier <anthony.barbier@arm.com>2018-11-02 16:54:10 +0000
commit7485d5a62685cb745ab50e970adb722cb71557ac (patch)
treeba01b99ca466c93edc9a3f8c1e34394ff84be060 /src/core/NEON/kernels
parent014333d73883c3872e458cedda5ccef586a7ccd4 (diff)
downloadComputeLibrary-7485d5a62685cb745ab50e970adb722cb71557ac.tar.gz
COMPMID-970 : Remove QS8 / QS16 support
Removed fixed point related code. Change-Id: I487acf138dace3b0450e0d72ca7071eaec254566 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/137678 Tested-by: Jenkins <bsgcomp@arm.com> Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Diffstat (limited to 'src/core/NEON/kernels')
-rw-r--r--src/core/NEON/kernels/NEActivationLayerKernel.cpp235
-rw-r--r--src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp54
-rw-r--r--src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp54
-rw-r--r--src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp118
-rw-r--r--src/core/NEON/kernels/NECol2ImKernel.cpp7
-rw-r--r--src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.cpp2
-rw-r--r--src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp17
-rw-r--r--src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp187
-rw-r--r--src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp2
-rw-r--r--src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp1
-rw-r--r--src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp1
-rw-r--r--src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp2
-rw-r--r--src/core/NEON/kernels/NEDequantizationLayerKernel.cpp2
-rw-r--r--src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp310
-rw-r--r--src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp116
-rw-r--r--src/core/NEON/kernels/NEFillBorderKernel.cpp8
-rw-r--r--src/core/NEON/kernels/NEFloorKernel.cpp4
-rw-r--r--src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp6
-rw-r--r--src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp30
-rw-r--r--src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp61
-rw-r--r--src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp654
-rw-r--r--src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp1
-rw-r--r--src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp10
-rw-r--r--src/core/NEON/kernels/NEIm2ColKernel.cpp44
-rw-r--r--src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp2
-rw-r--r--src/core/NEON/kernels/NEMinMaxLayerKernel.cpp2
-rw-r--r--src/core/NEON/kernels/NENormalizationLayerKernel.cpp150
-rw-r--r--src/core/NEON/kernels/NEPermuteKernel.cpp5
-rw-r--r--src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp154
-rw-r--r--src/core/NEON/kernels/NEPoolingLayerKernel.cpp563
-rw-r--r--src/core/NEON/kernels/NEQuantizationLayerKernel.cpp2
-rw-r--r--src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp4
-rw-r--r--src/core/NEON/kernels/NEReductionOperationKernel.cpp2
-rw-r--r--src/core/NEON/kernels/NEReshapeLayerKernel.cpp5
-rw-r--r--src/core/NEON/kernels/NESoftmaxLayerKernel.cpp166
-rw-r--r--src/core/NEON/kernels/NETransposeKernel.cpp3
-rw-r--r--src/core/NEON/kernels/NEWeightsReshapeKernel.cpp4
37 files changed, 142 insertions, 2846 deletions
diff --git a/src/core/NEON/kernels/NEActivationLayerKernel.cpp b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
index ec125154a4..bdc93ed1b8 100644
--- a/src/core/NEON/kernels/NEActivationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
@@ -23,7 +23,6 @@
*/
#include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h"
-#include "arm_compute/core/FixedPoint.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/NEON/NEAsymm.h"
@@ -46,14 +45,13 @@ namespace
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, DataType::F16, DataType::F32);
// Checks performed when output is configured
if((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
}
return Status{};
@@ -146,36 +144,6 @@ void NEActivationLayerKernel::configure(ITensor *input, ITensor *output, Activat
};
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC*/
- // Activation functions : QS8
- static std::map<ActivationFunction, ActivationFunctionExecutorPtr> act_map_qs8 =
- {
- { ActivationFunction::ABS, &NEActivationLayerKernel::activation<ActivationFunction::ABS, qint8_t> },
- { ActivationFunction::LINEAR, &NEActivationLayerKernel::activation<ActivationFunction::LINEAR, qint8_t> },
- { ActivationFunction::LOGISTIC, &NEActivationLayerKernel::activation<ActivationFunction::LOGISTIC, qint8_t> },
- { ActivationFunction::RELU, &NEActivationLayerKernel::activation<ActivationFunction::RELU, qint8_t> },
- { ActivationFunction::BOUNDED_RELU, &NEActivationLayerKernel::activation<ActivationFunction::BOUNDED_RELU, qint8_t> },
- { ActivationFunction::LU_BOUNDED_RELU, &NEActivationLayerKernel::activation<ActivationFunction::LU_BOUNDED_RELU, qint8_t> },
- { ActivationFunction::LEAKY_RELU, &NEActivationLayerKernel::activation<ActivationFunction::LEAKY_RELU, qint8_t> },
- { ActivationFunction::SOFT_RELU, &NEActivationLayerKernel::activation<ActivationFunction::SOFT_RELU, qint8_t> },
- { ActivationFunction::SQRT, &NEActivationLayerKernel::activation<ActivationFunction::SQRT, qint8_t> },
- { ActivationFunction::SQUARE, &NEActivationLayerKernel::activation<ActivationFunction::SQUARE, qint8_t> },
- { ActivationFunction::TANH, &NEActivationLayerKernel::activation<ActivationFunction::TANH, qint8_t> },
- };
- // Activation functions : QS16
- static std::map<ActivationFunction, ActivationFunctionExecutorPtr> act_map_qs16 =
- {
- { ActivationFunction::ABS, &NEActivationLayerKernel::activation<ActivationFunction::ABS, qint16_t> },
- { ActivationFunction::LINEAR, &NEActivationLayerKernel::activation<ActivationFunction::LINEAR, qint16_t> },
- { ActivationFunction::LOGISTIC, &NEActivationLayerKernel::activation<ActivationFunction::LOGISTIC, qint16_t> },
- { ActivationFunction::RELU, &NEActivationLayerKernel::activation<ActivationFunction::RELU, qint16_t> },
- { ActivationFunction::BOUNDED_RELU, &NEActivationLayerKernel::activation<ActivationFunction::BOUNDED_RELU, qint16_t> },
- { ActivationFunction::LU_BOUNDED_RELU, &NEActivationLayerKernel::activation<ActivationFunction::LU_BOUNDED_RELU, qint16_t> },
- { ActivationFunction::LEAKY_RELU, &NEActivationLayerKernel::activation<ActivationFunction::LEAKY_RELU, qint16_t> },
- { ActivationFunction::SOFT_RELU, &NEActivationLayerKernel::activation<ActivationFunction::SOFT_RELU, qint16_t> },
- { ActivationFunction::SQRT, &NEActivationLayerKernel::activation<ActivationFunction::SQRT, qint16_t> },
- { ActivationFunction::SQUARE, &NEActivationLayerKernel::activation<ActivationFunction::SQUARE, qint16_t> },
- { ActivationFunction::TANH, &NEActivationLayerKernel::activation<ActivationFunction::TANH, qint16_t> },
- };
// Activation functions : QASYMM8
static std::map<ActivationFunction, ActivationFunctionExecutorPtr> act_map_qasymm8 =
{
@@ -188,12 +156,6 @@ void NEActivationLayerKernel::configure(ITensor *input, ITensor *output, Activat
case DataType::QASYMM8:
_func = act_map_qasymm8[activation_info.activation()];
break;
- case DataType::QS8:
- _func = act_map_qs8[activation_info.activation()];
- break;
- case DataType::QS16:
- _func = act_map_qs16[activation_info.activation()];
- break;
case DataType::F32:
_func = act_map_f32[activation_info.activation()];
break;
@@ -508,70 +470,6 @@ typename std::enable_if<std::is_same<T, float>::value, void>::type NEActivationL
}
template <ActivationLayerInfo::ActivationFunction F, typename T>
-typename std::enable_if<std::is_same<T, int8_t>::value, void>::type NEActivationLayerKernel::activation(const Window &window)
-{
- Iterator input(_input, window);
- Iterator output(_output, window);
- const int fixed_point_position = _input->info()->fixed_point_position();
-
- static const qint8x16_t CONST_0 = vdupq_n_qs8(0);
- const qint8x16_t CONST_1 = vdupq_n_qs8(sqcvt_qs8_f32(1.f, fixed_point_position));
- const qint8x16_t a = vdupq_n_qs8(sqcvt_qs8_f32(_act_info.a(), fixed_point_position));
- const qint8x16_t b = vdupq_n_qs8(sqcvt_qs8_f32(_act_info.b(), fixed_point_position));
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto input_ptr = reinterpret_cast<const int8_t *>(input.ptr());
- const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
-
- const qint8x16_t in = vld1q_qs8(input_ptr);
- qint8x16_t tmp = {};
-
- switch(F)
- {
- case ActivationFunction::ABS:
- tmp = vqabsq_qs8(in);
- break;
- case ActivationFunction::LINEAR:
- tmp = vqmlaq_qs8(b, a, in, fixed_point_position);
- break;
- case ActivationFunction::LOGISTIC:
- tmp = vqrecipq_qs8(vqaddq_qs8(CONST_1, vqexpq_qs8(vnegq_s8(in), fixed_point_position)), fixed_point_position);
- break;
- case ActivationFunction::RELU:
- tmp = vmaxq_qs8(CONST_0, in);
- break;
- case ActivationFunction::BOUNDED_RELU:
- tmp = vminq_qs8(a, vmaxq_qs8(CONST_0, in));
- break;
- case ActivationFunction::LU_BOUNDED_RELU:
- tmp = vminq_qs8(a, vmaxq_qs8(b, in));
- break;
- case ActivationFunction::LEAKY_RELU:
- tmp = vbslq_s8(vcgtq_s8(in, CONST_0), in, vmulq_qs8(a, in, fixed_point_position));
- break;
- case ActivationFunction::SOFT_RELU:
- tmp = vlogq_qs8(vqaddq_qs8(CONST_1, vqexpq_qs8(in, fixed_point_position)), fixed_point_position);
- break;
- case ActivationFunction::SQRT:
- tmp = vqrecipq_qs8(vqinvsqrtq_qs8(in, fixed_point_position), fixed_point_position);
- break;
- case ActivationFunction::SQUARE:
- tmp = vqmulq_qs8(in, in, fixed_point_position);
- break;
- case ActivationFunction::TANH:
- tmp = vqmulq_qs8(a, vqtanhq_qs8(vqmulq_qs8(b, in, fixed_point_position), fixed_point_position), fixed_point_position);
- break;
- default:
- break;
- }
-
- vst1q_qs8(output_ptr, tmp);
- },
- input, output);
-}
-
-template <ActivationLayerInfo::ActivationFunction F, typename T>
typename std::enable_if<std::is_same<T, qasymm8_t>::value, void>::type NEActivationLayerKernel::activation(const Window &window)
{
Iterator input(_input, window);
@@ -620,137 +518,6 @@ typename std::enable_if<std::is_same<T, qasymm8_t>::value, void>::type NEActivat
input, output);
}
-template <ActivationLayerInfo::ActivationFunction F, typename T>
-typename std::enable_if<std::is_same<T, qint16_t>::value, void>::type NEActivationLayerKernel::activation(const Window &window)
-{
- Iterator input(_input, window);
- Iterator output(_output, window);
- const int fixed_point_position = _input->info()->fixed_point_position();
-
- static const qint16x8_t CONST_0 = vdupq_n_qs16(0);
- const qint16x8_t CONST_1 = vdupq_n_qs16(sqcvt_qs16_f32(1.f, fixed_point_position));
- const qint16x8_t a = vdupq_n_qs16(sqcvt_qs16_f32(_act_info.a(), fixed_point_position));
- const qint16x8_t b = vdupq_n_qs16(sqcvt_qs16_f32(_act_info.b(), fixed_point_position));
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto input_ptr = reinterpret_cast<const int16_t *>(input.ptr());
- const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
-
- const qint16x8x2_t in = vld2q_s16(input_ptr);
- qint16x8x2_t tmp = { {} };
-
- switch(F)
- {
- case ActivationFunction::ABS:
- tmp =
- {
- {
- vqabsq_qs16(in.val[0]),
- vqabsq_qs16(in.val[1]),
- }
- };
- break;
- case ActivationFunction::LINEAR:
- tmp =
- {
- {
- vqmlaq_qs16(b, a, in.val[0], fixed_point_position),
- vqmlaq_qs16(b, a, in.val[1], fixed_point_position),
- }
- };
- break;
- case ActivationFunction::LOGISTIC:
- tmp =
- {
- {
- vqrecipq_qs16(vqaddq_qs16(CONST_1, vqexpq_qs16(vnegq_s16(in.val[0]), fixed_point_position)), fixed_point_position),
- vqrecipq_qs16(vqaddq_qs16(CONST_1, vqexpq_qs16(vnegq_s16(in.val[1]), fixed_point_position)), fixed_point_position),
- }
- };
- break;
- case ActivationFunction::RELU:
- tmp =
- {
- {
- vmaxq_qs16(CONST_0, in.val[0]),
- vmaxq_qs16(CONST_0, in.val[1]),
- }
- };
- break;
- case ActivationFunction::BOUNDED_RELU:
- tmp =
- {
- {
- vminq_qs16(a, vmaxq_qs16(CONST_0, in.val[0])),
- vminq_qs16(a, vmaxq_qs16(CONST_0, in.val[1])),
- }
- };
- break;
- case ActivationFunction::LU_BOUNDED_RELU:
- tmp =
- {
- {
- vminq_qs16(a, vmaxq_qs16(b, in.val[0])),
- vminq_qs16(a, vmaxq_qs16(b, in.val[1])),
- }
- };
- break;
- case ActivationFunction::LEAKY_RELU:
- tmp =
- {
- {
- vbslq_s16(vcgtq_s16(in.val[0], CONST_0), in.val[0], vmulq_qs16(a, in.val[0], fixed_point_position)),
- vbslq_s16(vcgtq_s16(in.val[1], CONST_0), in.val[1], vmulq_qs16(a, in.val[1], fixed_point_position)),
- }
- };
- break;
- case ActivationFunction::SOFT_RELU:
- tmp =
- {
- {
- vlogq_qs16(vqaddq_qs16(CONST_1, vqexpq_qs16(in.val[0], fixed_point_position)), fixed_point_position),
- vlogq_qs16(vqaddq_qs16(CONST_1, vqexpq_qs16(in.val[1], fixed_point_position)), fixed_point_position),
- }
- };
- break;
- case ActivationFunction::SQRT:
- tmp =
- {
- {
- vqrecipq_qs16(vqinvsqrtq_qs16(in.val[0], fixed_point_position), fixed_point_position),
- vqrecipq_qs16(vqinvsqrtq_qs16(in.val[1], fixed_point_position), fixed_point_position),
- }
- };
- break;
- case ActivationFunction::SQUARE:
- tmp =
- {
- {
- vqmulq_qs16(in.val[0], in.val[0], fixed_point_position),
- vqmulq_qs16(in.val[1], in.val[1], fixed_point_position),
- }
- };
- break;
- case ActivationFunction::TANH:
- tmp =
- {
- {
- vqmulq_qs16(a, vqtanhq_qs16(vqmulq_qs16(b, in.val[0], fixed_point_position), fixed_point_position), fixed_point_position),
- vqmulq_qs16(a, vqtanhq_qs16(vqmulq_qs16(b, in.val[1], fixed_point_position), fixed_point_position), fixed_point_position),
- }
- };
- break;
- default:
- ARM_COMPUTE_ERROR("Function not implemented");
- break;
- }
-
- vst2q_qs16(output_ptr, tmp);
- },
- input, output);
-}
-
Status NEActivationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_UNUSED(act_info);
diff --git a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
index a487090a98..f8e2b6d73e 100644
--- a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
+++ b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
@@ -48,38 +48,6 @@ namespace
{
constexpr unsigned int num_elems_processed_per_iteration = 16;
-void add_wrap_QS8_QS8_QS8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
- Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
- Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
- Iterator output(out, window);
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const qint8x16_t a = vld1q_qs8(reinterpret_cast<const qint8_t *>(input1.ptr()));
- const qint8x16_t b = vld1q_qs8(reinterpret_cast<const qint8_t *>(input2.ptr()));
-
- vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), vaddq_qs8(a, b));
- },
- input1, input2, output);
-}
-
-void add_saturate_QS8_QS8_QS8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
- Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
- Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
- Iterator output(out, window);
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const qint8x16_t a = vld1q_qs8(reinterpret_cast<const qint8_t *>(input1.ptr()));
- const qint8x16_t b = vld1q_qs8(reinterpret_cast<const qint8_t *>(input2.ptr()));
-
- vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), vqaddq_qs8(a, b));
- },
- input1, input2, output);
-}
-
void add_wrap_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
@@ -362,28 +330,21 @@ Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2,
{
ARM_COMPUTE_UNUSED(policy);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
const TensorShape out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());
ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
- if(is_data_type_fixed_point(input1.data_type()) || is_data_type_fixed_point(input2.data_type()))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(&input1, &input2);
- }
-
// Validate in case of configured output
if(output.total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG(
- !(input1.data_type() == DataType::QS8 && input2.data_type() == DataType::QS8 && output.data_type() == DataType::QS8)
- && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8 && output.data_type() == DataType::U8)
+ !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8 && output.data_type() == DataType::U8)
&& !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8 && output.data_type() == DataType::S16)
&& !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::S16 && output.data_type() == DataType::S16)
&& !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::U8 && output.data_type() == DataType::S16)
- && !(input1.data_type() == DataType::QS16 && input2.data_type() == DataType::QS16 && output.data_type() == DataType::QS16)
&& !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::S16 && output.data_type() == DataType::S16)
&& !(input1.data_type() == DataType::F32 && input2.data_type() == DataType::F32 && output.data_type() == DataType::F32)
&& !(input1.data_type() == DataType::F16 && input2.data_type() == DataType::F16 && output.data_type() == DataType::F16),
@@ -391,11 +352,6 @@ Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2,
ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
"Wrong shape for output");
-
- if(is_data_type_fixed_point(input1.data_type()) || is_data_type_fixed_point(output.data_type()))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(&input1, &output);
- }
}
return Status{};
@@ -460,8 +416,6 @@ void NEArithmeticAdditionKernel::configure(const ITensor *input1, const ITensor
static std::map<std::string, AddFunction *> map_function =
{
- { "add_wrap_QS8_QS8_QS8", &add_wrap_QS8_QS8_QS8 },
- { "add_saturate_QS8_QS8_QS8", &add_saturate_QS8_QS8_QS8 },
{ "add_wrap_U8_U8_U8", &add_wrap_U8_U8_U8 },
{ "add_saturate_U8_U8_U8", &add_saturate_U8_U8_U8 },
{ "add_wrap_S16_U8_S16", &add_wrap_S16_U8_S16 },
@@ -470,8 +424,6 @@ void NEArithmeticAdditionKernel::configure(const ITensor *input1, const ITensor
{ "add_saturate_U8_S16_S16", &add_saturate_U8_S16_S16 },
{ "add_wrap_U8_U8_S16", &add_wrap_U8_U8_S16 },
{ "add_saturate_U8_U8_S16", &add_saturate_U8_U8_S16 },
- { "add_wrap_QS16_QS16_QS16", &add_wrap_S16_S16_S16 },
- { "add_saturate_QS16_QS16_QS16", &add_saturate_S16_S16_S16 },
{ "add_wrap_S16_S16_S16", &add_wrap_S16_S16_S16 },
{ "add_saturate_S16_S16_S16", &add_saturate_S16_S16_S16 },
{ "add_wrap_F32_F32_F32", &add_F32_F32_F32 },
diff --git a/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp
index 3db80285c0..5a162e3b2c 100644
--- a/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp
+++ b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -45,38 +45,6 @@ class Coordinates;
namespace
{
-void sub_wrap_QS8_QS8_QS8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
- Iterator input1(in1, window);
- Iterator input2(in2, window);
- Iterator output(out, window);
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const qint8x16_t a = vld1q_qs8(reinterpret_cast<const qint8_t *>(input1.ptr()));
- const qint8x16_t b = vld1q_qs8(reinterpret_cast<const qint8_t *>(input2.ptr()));
-
- vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), vsubq_qs8(a, b));
- },
- input1, input2, output);
-}
-
-void sub_saturate_QS8_QS8_QS8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
- Iterator input1(in1, window);
- Iterator input2(in2, window);
- Iterator output(out, window);
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const qint8x16_t a = vld1q_qs8(reinterpret_cast<const qint8_t *>(input1.ptr()));
- const qint8x16_t b = vld1q_qs8(reinterpret_cast<const qint8_t *>(input2.ptr()));
-
- vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), vqsubq_qs8(a, b));
- },
- input1, input2, output);
-}
-
void sub_wrap_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
Iterator input1(in1, window);
@@ -353,23 +321,15 @@ inline Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *i
{
ARM_COMPUTE_UNUSED(policy);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QS8, DataType::U8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::QS8, DataType::U8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::U8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
-
- if(is_data_type_fixed_point(input1->data_type()) || is_data_type_fixed_point(input2->data_type()) || is_data_type_fixed_point(output->data_type()))
- {
- // Check that all data types are the same and all fixed-point positions are the same
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input1, input2, output);
- }
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(
- !(input1->data_type() == DataType::QS8 && input2->data_type() == DataType::QS8 && output->data_type() == DataType::QS8)
- && !(input1->data_type() == DataType::U8 && input2->data_type() == DataType::U8 && output->data_type() == DataType::U8)
+ !(input1->data_type() == DataType::U8 && input2->data_type() == DataType::U8 && output->data_type() == DataType::U8)
&& !(input1->data_type() == DataType::U8 && input2->data_type() == DataType::U8 && output->data_type() == DataType::S16)
&& !(input1->data_type() == DataType::U8 && input2->data_type() == DataType::S16 && output->data_type() == DataType::S16)
&& !(input1->data_type() == DataType::S16 && input2->data_type() == DataType::U8 && output->data_type() == DataType::S16)
- && !(input1->data_type() == DataType::QS16 && input2->data_type() == DataType::QS16 && output->data_type() == DataType::QS16)
&& !(input1->data_type() == DataType::S16 && input2->data_type() == DataType::S16 && output->data_type() == DataType::S16)
&& !(input1->data_type() == DataType::F32 && input2->data_type() == DataType::F32 && output->data_type() == DataType::F32)
&& !(input1->data_type() == DataType::F16 && input2->data_type() == DataType::F16 && output->data_type() == DataType::F16),
@@ -432,8 +392,6 @@ void NEArithmeticSubtractionKernel::configure(const ITensor *input1, const ITens
static std::map<std::string, NEArithmeticSubtractionKernel::SubFunction *> map_function =
{
- { "sub_wrap_QS8_QS8_QS8", &sub_wrap_QS8_QS8_QS8 },
- { "sub_saturate_QS8_QS8_QS8", &sub_saturate_QS8_QS8_QS8 },
{ "sub_wrap_U8_U8_U8", &sub_wrap_U8_U8_U8 },
{ "sub_wrap_U8_U8_S16", &sub_wrap_U8_U8_S16 },
{ "sub_saturate_U8_U8_U8", &sub_saturate_U8_U8_U8 },
@@ -442,8 +400,6 @@ void NEArithmeticSubtractionKernel::configure(const ITensor *input1, const ITens
{ "sub_wrap_S16_U8_S16", &sub_wrap_S16_U8_S16 },
{ "sub_saturate_U8_S16_S16", &sub_saturate_U8_S16_S16 },
{ "sub_saturate_S16_U8_S16", &sub_saturate_S16_U8_S16 },
- { "sub_wrap_QS16_QS16_QS16", &sub_wrap_S16_S16_S16 },
- { "sub_saturate_QS16_QS16_QS16", &sub_saturate_S16_S16_S16 },
{ "sub_wrap_S16_S16_S16", &sub_wrap_S16_S16_S16 },
{ "sub_saturate_S16_S16_S16", &sub_saturate_S16_S16_S16 },
{ "sub_wrap_F32_F32_F32", &sub_F32_F32_F32 },
diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
index 6be50fdb0d..6aed41f3aa 100644
--- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
@@ -43,7 +43,7 @@ validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const IT
const ITensorInfo *beta, const ITensorInfo *gamma, float epsilon, ActivationLayerInfo act_info)
{
ARM_COMPUTE_UNUSED(epsilon);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16,
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16,
DataType::F32);
if(act_info.enabled())
@@ -60,22 +60,18 @@ validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const IT
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
}
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, var);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, mean, var);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, var);
if(beta != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, beta);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, beta);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, beta);
}
if(gamma != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, gamma);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, gamma);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, gamma);
}
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)) != mean->dimension(0));
@@ -104,112 +100,6 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
} //namespace
template <bool fused_activation>
-void NEBatchNormalizationLayerKernel::batch_normalization_qs8(const Window &window)
-{
- static_assert(!fused_activation, "Activation is not supported for QS8");
-
- Iterator input(_input, window);
- Iterator output(_output, window);
-
- // Hold information about the current feature map we are iterating.
- // Only compute denominator and NEON vectors once per feature map.
- int slice = -1;
-
- const int fixed_point_position = _input->info()->fixed_point_position();
- const auto input_mean = reinterpret_cast<const qint8_t *>(_mean->ptr_to_element(Coordinates(0, 0)));
- const auto input_var = reinterpret_cast<const qint8_t *>(_var->ptr_to_element(Coordinates(0, 0)));
- const auto input_gamma = (_gamma != nullptr) ? reinterpret_cast<const qint8_t *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
- const auto input_beta = (_beta != nullptr) ? reinterpret_cast<const qint8_t *>(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
-
- qint8x16_t mean_vec = vdupq_n_qs8(0);
- qint8x16_t var_vec = vdupq_n_qs8(0);
- qint8x16_t gamma_vec = vdupq_n_qs8(sqcvt_qs8_f32(1, fixed_point_position));
- qint8x16_t beta_vec = vdupq_n_qs8(sqcvt_qs8_f32(0, fixed_point_position));
- qint8x16_t denominator = vdupq_n_qs8(0);
- const qint8x16_t epsilon_vec = vdupq_n_qs8(sqcvt_qs8_f32(_epsilon, fixed_point_position));
- execute_window_loop(window, [&](const Coordinates & id)
- {
- if(slice != id.z())
- {
- // Conctruct vectors
- mean_vec = vdupq_n_qs8(*(input_mean + id.z()));
- var_vec = vdupq_n_qs8(*(input_var + id.z()));
- if(input_gamma != nullptr)
- {
- gamma_vec = vdupq_n_qs8(*(input_gamma + id.z()));
- }
- if(input_beta != nullptr)
- {
- beta_vec = vdupq_n_qs8(*(input_beta + id.z()));
- }
-
- // Calculate denominator
- denominator = vqinvsqrtq_qs8(vqaddq_qs8(var_vec, epsilon_vec), fixed_point_position);
- slice = id.z();
- }
-
- // Calculate x bar and store results
- const qint8x16_t numerator = vqsubq_qs8(vld1q_qs8(reinterpret_cast<const qint8_t *>(input.ptr())), mean_vec);
- const qint8x16_t x_bar = vqmulq_qs8(numerator, denominator, fixed_point_position);
- vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), vqmlaq_qs8(beta_vec, x_bar, gamma_vec, fixed_point_position));
- },
- input, output);
-}
-
-template <bool fused_activation>
-void NEBatchNormalizationLayerKernel::batch_normalization_qs16(const Window &window)
-{
- static_assert(!fused_activation, "Activation is not supported for QS16");
-
- Iterator input(_input, window);
- Iterator output(_output, window);
-
- // Hold information about the current feature map we are iterating.
- // Only compute denominator and NEON vectors once per feature map.
- int slice = -1;
-
- const int fixed_point_position = _input->info()->fixed_point_position();
- const auto input_mean = reinterpret_cast<const qint16_t *>(_mean->ptr_to_element(Coordinates(0, 0)));
- const auto input_var = reinterpret_cast<const qint16_t *>(_var->ptr_to_element(Coordinates(0, 0)));
- const auto input_gamma = (_gamma != nullptr) ? reinterpret_cast<const qint16_t *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
- const auto input_beta = (_beta != nullptr) ? reinterpret_cast<const qint16_t *>(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
-
- qint16x8_t mean_vec = vdupq_n_qs16(0);
- qint16x8_t var_vec = vdupq_n_qs16(0);
- qint16x8_t gamma_vec = vdupq_n_qs16(sqcvt_qs16_f32(1, fixed_point_position));
- qint16x8_t beta_vec = vdupq_n_qs16(sqcvt_qs16_f32(0, fixed_point_position));
- qint16x8_t denominator = vdupq_n_qs16(0);
- const qint16x8_t epsilon_vec = vdupq_n_qs16(sqcvt_qs16_f32(_epsilon, fixed_point_position));
- execute_window_loop(window, [&](const Coordinates & id)
- {
- if(slice != id.z())
- {
- // Conctruct vectors
- mean_vec = vdupq_n_qs16(*(input_mean + id.z()));
- var_vec = vdupq_n_qs16(*(input_var + id.z()));
- if(input_gamma != nullptr)
- {
- gamma_vec = vdupq_n_qs16(*(input_gamma + id.z()));
- }
- if(input_beta != nullptr)
- {
- beta_vec = vdupq_n_qs16(*(input_beta + id.z()));
- }
-
- // Calculate denominator
- denominator = vqinvsqrtq_qs16(vqaddq_qs16(var_vec, epsilon_vec), fixed_point_position);
- slice = id.z();
- }
-
- // Calculate x bar and store results
- const qint16x8_t numerator = vqsubq_qs16(vld1q_qs16(reinterpret_cast<const qint16_t *>(input.ptr())), mean_vec);
- const qint16x8_t x_bar = vqmulq_qs16(numerator, denominator, fixed_point_position);
- vst1q_qs16(reinterpret_cast<qint16_t *>(output.ptr()), vqmlaq_qs16(beta_vec, x_bar, gamma_vec, fixed_point_position));
- },
- input, output);
-}
-
-template <bool fused_activation>
void NEBatchNormalizationLayerKernel::batch_normalization_fp16_nchw(const Window &window)
{
static_assert(!fused_activation, "Activation is not supported for FP16");
@@ -406,12 +296,6 @@ void NEBatchNormalizationLayerKernel::configure_non_fused()
const bool is_nhwc = _input->info()->data_layout() == DataLayout::NHWC;
switch(_input->info()->data_type())
{
- case DataType::QS8:
- _func = &NEBatchNormalizationLayerKernel::batch_normalization_qs8<false>;
- break;
- case DataType::QS16:
- _func = &NEBatchNormalizationLayerKernel::batch_normalization_qs16<false>;
- break;
case DataType::F16:
_func = (is_nhwc) ? &NEBatchNormalizationLayerKernel::batch_normalization_fp16_nhwc<false> : &NEBatchNormalizationLayerKernel::batch_normalization_fp16_nchw<false>;
break;
diff --git a/src/core/NEON/kernels/NECol2ImKernel.cpp b/src/core/NEON/kernels/NECol2ImKernel.cpp
index 9fda65feb4..d09d174e4f 100644
--- a/src/core/NEON/kernels/NECol2ImKernel.cpp
+++ b/src/core/NEON/kernels/NECol2ImKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -50,8 +50,8 @@ TensorShape get_output_shape(const ITensorInfo *input, const Size2D &convolved_d
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &convolved_dims)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8,
- DataType::U16, DataType::S16, DataType::QS16,
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8,
+ DataType::U16, DataType::S16,
DataType::U32, DataType::S32,
DataType::F16, DataType::F32);
@@ -60,7 +60,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), get_output_shape(input, convolved_dims));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
}
return Status{};
diff --git a/src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.cpp b/src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.cpp
index b3746bddf2..e581f221a3 100644
--- a/src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.cpp
+++ b/src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.cpp
@@ -65,7 +65,7 @@ void NEConvertFullyConnectedWeightsKernel::configure(const ITensor *input, ITens
Status NEConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape,
DataLayout data_layout)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8, DataType::U16, DataType::S16, DataType::QS16, DataType::U32, DataType::S32,
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, DataType::U32, DataType::S32,
DataType::QS32, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
diff --git a/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp b/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp
index 891a03c5cc..38443ca4a8 100644
--- a/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017, 2018 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -41,10 +41,6 @@ using namespace arm_compute;
namespace
{
// Overloads of 128-bit vector loads
-uint8x16_t loadq(const uint8_t *ptr)
-{
- return vld1q_u8(ptr);
-}
uint16x8_t loadq(const uint16_t *ptr)
{
return vld1q_u16(ptr);
@@ -54,10 +50,6 @@ uint32x4_t loadq(const uint32_t *ptr)
return vld1q_u32(ptr);
}
// Overloads of 128-bit vector stores
-void storeq(uint8_t *ptr, uint8x16_t val)
-{
- return vst1q_u8(ptr, val);
-}
void storeq(uint16_t *ptr, uint16x8_t val)
{
return vst1q_u16(ptr, val);
@@ -107,9 +99,8 @@ BorderSize NEDepthConcatenateLayerKernel::border_size() const
void NEDepthConcatenateLayerKernel::configure(const ITensor *input, unsigned int depth_offset, ITensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) + depth_offset > output->info()->dimension(2));
ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) > output->info()->dimension(0));
ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) > output->info()->dimension(1));
@@ -129,10 +120,6 @@ void NEDepthConcatenateLayerKernel::configure(const ITensor *input, unsigned int
switch(input->info()->data_type())
{
- case DataType::QS8:
- _func = &depth_concat<uint8_t>;
- break;
- case DataType::QS16:
case DataType::F16:
_func = &depth_concat<uint16_t>;
break;
diff --git a/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp b/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp
index c29cb57513..8280b52fcb 100644
--- a/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -40,13 +40,13 @@ class Coordinates;
} // namespace arm_compute
NEDepthConvertLayerKernel::NEDepthConvertLayerKernel()
- : _input(nullptr), _output(nullptr), _policy(), _shift(0), _fixed_point_position_input(0), _fixed_point_position_output(0)
+ : _input(nullptr), _output(nullptr), _policy(), _shift(0)
{
}
void NEDepthConvertLayerKernel::configure(ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::U16, DataType::QS16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::U16);
_input = input;
_output = input;
@@ -58,48 +58,26 @@ void NEDepthConvertLayerKernel::configure(ITensor *input, ITensor *output, Conve
// Auto initialize output shape if not initialized (We can only auto-configure the shape, datatype must be given)
set_shape_if_empty(*output->info(), input->info()->tensor_shape());
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::U16, DataType::QS16, DataType::U32, DataType::S32, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::U16, DataType::U32, DataType::S32, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
// Set output
_output = output;
}
- // Set initial fixed point position of input and output
- _fixed_point_position_input = input->info()->fixed_point_position();
- _fixed_point_position_output = _output->info()->fixed_point_position();
-
- // Set the fixed point position to the output tensor if needed
- if(is_data_type_fixed_point(input->info()->data_type()) && is_data_type_fixed_point(_output->info()->data_type()))
- {
- // If in-place set the fixed point position of the output tensor to be equal to shift
- _fixed_point_position_output = (_input == _output) ? static_cast<int>(_shift) : _fixed_point_position_output;
- // Set fixed point position to output tensor
- _output->info()->set_fixed_point_position(_fixed_point_position_output);
- }
-
- ARM_COMPUTE_ERROR_ON(shift >= 8 && (!is_data_type_fixed_point(input->info()->data_type()) && !is_data_type_fixed_point(output->info()->data_type())));
+ ARM_COMPUTE_ERROR_ON(shift >= 8);
ARM_COMPUTE_ERROR_ON(input == output && (data_size_from_type(input->info()->data_type()) != data_size_from_type(output->info()->data_type())));
ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U8 && (output->info()->data_type() != DataType::S16 && output->info()->data_type() != DataType::U16
&& output->info()->data_type() != DataType::S32),
"Only data_types supported [in] U8 -> [out] U16, S16, S32");
- ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::QS8 && (output->info()->data_type() != DataType::QS8 && output->info()->data_type() != DataType::F32),
- "Only data_types supported [in] QS8 -> [out] QS8, F32");
-
ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U32),
"Only data_types supported [in] U16 -> [out] U8, U32");
ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::S16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::S32),
"Only data_types supported [in] S16 -> [out] U8, S32");
- ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::QS16 && (output->info()->data_type() != DataType::QS16 && output->info()->data_type() != DataType::F32),
- "Only data_types supported [in] QS16 -> [out] QS16, F32");
-
- ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::F32 && (output->info()->data_type() != DataType::QS8 && output->info()->data_type() != DataType::QS16),
- "Only data_types supported [in] F32 -> [out] QS8, QS16");
-
constexpr unsigned int num_elems_processed_per_iteration = 16;
// Configure kernel window
@@ -132,8 +110,6 @@ void NEDepthConvertLayerKernel::run(const Window &window, const ThreadInfo &info
Iterator input(_input, window);
Iterator output(_output, window);
- bool in_place = (_input == _output);
-
switch(_input->info()->data_type())
{
case DataType::U8:
@@ -212,49 +188,6 @@ void NEDepthConvertLayerKernel::run(const Window &window, const ThreadInfo &info
}
break;
}
- case DataType::QS8:
- {
- switch(_output->info()->data_type())
- {
- case DataType::QS8:
- {
- const int relative_shift = _fixed_point_position_output - _fixed_point_position_input;
- /* Fixed point position conversion QS8 -> QS8 */
- if(relative_shift != 0 || !in_place)
- {
- const auto relative_shift_vec = vdupq_n_qs8(relative_shift);
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const qint8x16_t texels_qs8 = vld1q_qs8(reinterpret_cast<const qint8_t *>(input.ptr()));
- vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), vqrshlq_s8(texels_qs8, relative_shift_vec));
- },
- input, output);
- }
- break;
- }
- case DataType::F32:
- {
- /* Up-conversion QS8 -> F32 */
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const qint8x16_t texels_qs8 = vld1q_qs8(reinterpret_cast<const qint8_t *>(input.ptr()));
-
- float32x4x2_t texels_low = vcvt_f32_qs8(vget_low_s8(texels_qs8), _fixed_point_position_input);
- float32x4x2_t texels_high = vcvt_f32_qs8(vget_high_s8(texels_qs8), _fixed_point_position_input);
-
- vst1q_f32(reinterpret_cast<float *>(output.ptr()), texels_low.val[0]);
- vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 4, texels_low.val[1]);
- vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 8, texels_high.val[0]);
- vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 12, texels_high.val[1]);
- },
- input, output);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Output data type not supported");
- }
- break;
- }
case DataType::S16:
{
switch(_output->info()->data_type())
@@ -408,116 +341,6 @@ void NEDepthConvertLayerKernel::run(const Window &window, const ThreadInfo &info
}
break;
}
- case DataType::QS16:
- {
- switch(_output->info()->data_type())
- {
- case DataType::QS16:
- {
- const int relative_shift = _fixed_point_position_output - _fixed_point_position_input;
- /* Fixed point position conversion QS16 -> QS16 */
- if(relative_shift != 0 || !in_place)
- {
- const auto relative_shift_vec = vdupq_n_qs16(relative_shift);
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const qint16x8x2_t texels_qs16 =
- {
- {
- vld1q_qs16(reinterpret_cast<qint16_t *>(input.ptr())),
- vld1q_qs16(reinterpret_cast<qint16_t *>(input.ptr()) + 8)
- }
- };
- vst1q_qs16(reinterpret_cast<qint16_t *>(output.ptr()), vqrshlq_s16(texels_qs16.val[0], relative_shift_vec));
- vst1q_qs16(reinterpret_cast<qint16_t *>(output.ptr()) + 8, vqrshlq_s16(texels_qs16.val[1], relative_shift_vec));
- },
- input, output);
- }
- break;
- }
- case DataType::F32:
- {
- /* Up-conversion QS16 -> F32 */
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int16x8x2_t texels_qs16 =
- {
- {
- vld1q_s16(reinterpret_cast<qint16_t *>(input.ptr())),
- vld1q_s16(reinterpret_cast<qint16_t *>(input.ptr()) + 8)
- }
- };
-
- vst1q_f32(reinterpret_cast<float *>(output.ptr()), vcvt_f32_qs16(vget_low_s16(texels_qs16.val[0]), _fixed_point_position_input));
- vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 4, vcvt_f32_qs16(vget_high_s16(texels_qs16.val[0]), _fixed_point_position_input));
- vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 8, vcvt_f32_qs16(vget_low_s16(texels_qs16.val[1]), _fixed_point_position_input));
- vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 12, vcvt_f32_qs16(vget_high_s16(texels_qs16.val[1]), _fixed_point_position_input));
- },
- input, output);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Output data type not supported");
- }
- break;
- }
- case DataType::F32:
- {
- switch(_output->info()->data_type())
- {
- case DataType::QS8:
- {
- /* Down-conversion F32 -> QS8 */
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const float32x4x4_t texels_f32 =
- {
- {
- vld1q_f32(reinterpret_cast<const float *>(input.ptr())),
- vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 4),
- vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 8),
- vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 12)
- }
- };
-
- const qint8x16_t texels_s8 = vqcvtq_qs8_f32(texels_f32, _fixed_point_position_output);
-
- vst1q_s8(reinterpret_cast<int8_t *>(output.ptr()), texels_s8);
- },
- input, output);
- break;
- }
- case DataType::QS16:
- {
- /* Down-conversion F32 -> QS16 */
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const float32x4x2_t texels_f32_1 =
- {
- {
- vld1q_f32(reinterpret_cast<const float *>(input.ptr())),
- vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 4),
- }
- };
- const float32x4x2_t texels_f32_2 =
- {
- {
- vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 8),
- vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 12)
- }
- };
-
- vst1q_s16(reinterpret_cast<qint16_t *>(output.ptr()), vqcvtq_qs16_f32(texels_f32_1, _fixed_point_position_output));
- vst1q_s16(reinterpret_cast<qint16_t *>(output.ptr()) + 8, vqcvtq_qs16_f32(texels_f32_2, _fixed_point_position_output));
- },
- input, output);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Output data type not supported");
- }
- break;
- }
default:
ARM_COMPUTE_ERROR("Not supported");
}
diff --git a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
index 8cdf175d8a..09728e2a8d 100644
--- a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
@@ -115,7 +115,7 @@ public:
in_top += delta_input, in_mid += delta_input, in_low += delta_input,
p_out += num_elems_written_per_iteration)
{
- auto vres = convolve_3x3<stridex>(in_top, in_mid, in_low, vw_r0, vw_r1, vw_r2, 0, input_offset);
+ auto vres = convolve_3x3<stridex>(in_top, in_mid, in_low, vw_r0, vw_r1, vw_r2, input_offset);
store_results<stridex>(p_out, vres);
}
}
diff --git a/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp b/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp
index cfd8eacfdd..5b43e2b14f 100644
--- a/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp
@@ -122,7 +122,6 @@ void NEDepthwiseIm2ColKernel::configure(const ITensor *input, ITensor *output, c
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
ARM_COMPUTE_ERROR_ON(is_data_type_quantized_asymmetric(input->info()->data_type()) && has_bias);
ARM_COMPUTE_ERROR_ON((input->info()->dimension(2) * depth_multiplier) != output->info()->dimension(2));
ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != (kernel_dims.width * kernel_dims.height + ((has_bias) ? 1 : 0)));
diff --git a/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp b/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp
index 8960d8a8af..86a6d1c1a8 100644
--- a/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp
@@ -89,7 +89,6 @@ void NEDepthwiseVectorToTensorKernel::configure(const ITensor *input, ITensor *o
ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
_input = input;
_output = output;
diff --git a/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp
index 36b17bfc4c..47fcf12874 100644
--- a/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp
@@ -88,7 +88,6 @@ void NEDepthwiseWeightsReshapeKernel::configure(const ITensor *input, ITensor *o
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
ARM_COMPUTE_ERROR_ON(is_data_type_quantized_asymmetric(input->info()->data_type()) && (biases != nullptr));
ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != output->info()->dimension(1));
ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != (input->info()->dimension(0) * input->info()->dimension(1) + ((biases != nullptr) ? 1 : 0)));
@@ -96,7 +95,6 @@ void NEDepthwiseWeightsReshapeKernel::configure(const ITensor *input, ITensor *o
if(biases != nullptr)
{
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, biases);
ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != input->info()->dimension(2));
ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
}
diff --git a/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp b/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp
index 4120e5f87a..47c895c594 100644
--- a/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp
@@ -54,7 +54,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *min_max)
{
// Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::F32, 0);
+ auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::F32);
constexpr unsigned int num_elems_processed_per_iteration = 8;
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
index 5eafdf0363..54a046846a 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
@@ -43,34 +43,6 @@ using namespace arm_compute::detail;
namespace
{
-template <unsigned int stridex>
-qint16x8_t internal_vld1q(const qint16_t *in);
-
-template <>
-qint16x8_t internal_vld1q<1>(const qint16_t *in)
-{
- return vld1q_qs16(in);
-}
-
-template <>
-qint16x8_t internal_vld1q<2>(const qint16_t *in)
-{
- const int16x8x2_t tmp = vld2q_s16(in);
- return tmp.val[0];
-}
-
-template <>
-qint16x8_t internal_vld1q<3>(const qint16_t *in)
-{
- const int16x8x3_t tmp = vld3q_s16(in);
- return tmp.val[0];
-}
-
-inline qint16x8_t internal_vdupq_n(qint16_t v)
-{
- return vdupq_n_qs16(v);
-}
-
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
template <unsigned int stridex>
float16x8_t internal_vld1q(const float16_t *in);
@@ -105,15 +77,13 @@ inline void internal_vst1q(float16_t *p, const float16x8_t &v)
vst1q_f16(p, v);
}
-float16x8_t internal_vmull(const float16x8_t &x, const float16x8_t &y, int fixed_point_position)
+float16x8_t internal_vmull(const float16x8_t &x, const float16x8_t &y)
{
- ARM_COMPUTE_UNUSED(fixed_point_position);
return vmulq_f16(x, y);
}
-inline float16x8_t internal_vmlal(const float16x8_t &x, const float16x8_t &y, const float16x8_t &z, int fixed_point_position)
+inline float16x8_t internal_vmlal(const float16x8_t &x, const float16x8_t &y, const float16x8_t &z)
{
- ARM_COMPUTE_UNUSED(fixed_point_position);
return vaddq_f16(x, vmulq_f16(y, z));
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
@@ -151,107 +121,16 @@ inline void internal_vst1q(float *p, const float32x4_t &v)
vst1q_f32(p, v);
}
-float32x4_t internal_vmull(const float32x4_t &x, const float32x4_t &y, int fixed_point_position)
+float32x4_t internal_vmull(const float32x4_t &x, const float32x4_t &y)
{
- ARM_COMPUTE_UNUSED(fixed_point_position);
return vmulq_f32(x, y);
}
-inline float32x4_t internal_vmlal(const float32x4_t &x, const float32x4_t &y, const float32x4_t &z, int fixed_point_position)
+inline float32x4_t internal_vmlal(const float32x4_t &x, const float32x4_t &y, const float32x4_t &z)
{
- ARM_COMPUTE_UNUSED(fixed_point_position);
return vmlaq_f32(x, y, z);
}
-template <unsigned int stridex>
-qint8x8_t internal_vld1q(const qint8_t *in);
-
-template <>
-qint8x8_t internal_vld1q<1>(const qint8_t *in)
-{
- return vld1_qs8(in);
-}
-
-template <>
-qint8x8_t internal_vld1q<2>(const qint8_t *in)
-{
- const qint8x8x2_t tmp = vld2_s8(in);
- return tmp.val[0];
-}
-
-template <>
-qint8x8_t internal_vld1q<3>(const qint8_t *in)
-{
- const qint8x8x3_t tmp = vld3_s8(in);
- return tmp.val[0];
-}
-
-inline qint8x8_t internal_vdupq_n(qint8_t v)
-{
- return vdup_n_qs8(v);
-}
-
-inline qint16x8_t internal_vmull(const qint8x8_t &x, const qint8x8_t &y, int fixed_point_position)
-{
- return vmull_qs8(x, y, fixed_point_position);
-}
-
-inline qint16x8_t internal_vmlal(const qint16x8_t &x, const qint8x8_t &y, const qint8x8_t &z, int fixed_point_position)
-{
- return vqmlal_qs8(x, y, z, fixed_point_position);
-}
-
-inline void internal_vst1q(qint16_t *p, const qint16x8_t &v)
-{
- vst1q_qs16(p, v);
-}
-
-inline void internal_vst1q(int32_t *p, const qint32x4x2_t &v)
-{
- vst1q_s32(p, v.val[0]);
- vst1q_s32(p + 4, v.val[1]);
-}
-
-template <unsigned int stridex>
-qint32x4x2_t internal_vld1q(const qint32_t *in);
-
-template <>
-qint32x4x2_t internal_vld1q<1>(const qint32_t *in)
-{
- const qint32x4x2_t r =
- {
- {
- vld1q_s32(in),
- vld1q_s32(in + 4)
- }
- };
- return r;
-}
-
-inline qint32x4x2_t internal_vmull(const qint16x8_t &x, const qint16x8_t &y, int fixed_point_position)
-{
- const qint32x4x2_t r =
- {
- {
- vmull_qs16(vget_low_s16(x), vget_low_s16(y), fixed_point_position),
- vmull_qs16(vget_high_s16(x), vget_high_s16(y), fixed_point_position),
- }
- };
- return r;
-}
-
-inline qint32x4x2_t internal_vmlal(const qint32x4x2_t &x, const qint16x8_t &y, const qint16x8_t &z, int fixed_point_position)
-{
- const qint32x4x2_t r =
- {
- {
- vqmlal_qs16(x.val[0], vget_low_s16(y), vget_low_s16(z), fixed_point_position),
- vqmlal_qs16(x.val[1], vget_high_s16(y), vget_high_s16(z), fixed_point_position)
- }
- };
- return r;
-}
-
constexpr int small_tensor_size_optim = 8;
inline bool run_optim_small_tensor_info(const ITensorInfo *t)
{
@@ -355,21 +234,20 @@ public:
static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
{
- const int input_stride_x = input->info()->strides_in_bytes().x();
- const int input_stride_y = input->info()->strides_in_bytes().y();
- const int input_stride_z = input->info()->strides_in_bytes().z();
- const int output_stride_y = output->info()->strides_in_bytes().y();
- const int output_stride_z = output->info()->strides_in_bytes().z();
- const int kernel_stride_z = weights->info()->strides_in_bytes().z();
- const int kernel_stride_w = weights->info()->strides_in_bytes()[3];
- const int output_w = output->info()->dimension(0);
- const int output_h = output->info()->dimension(1);
- const int range_z = window.z().end() - window.z().start();
- const int kernel_depth = weights->info()->dimension(Window::DimZ);
- const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
- const unsigned int conv_pad_left = conv_info.pad_left();
- const unsigned int conv_pad_top = conv_info.pad_top();
- const int fixed_point_position = input->info()->fixed_point_position();
+ const int input_stride_x = input->info()->strides_in_bytes().x();
+ const int input_stride_y = input->info()->strides_in_bytes().y();
+ const int input_stride_z = input->info()->strides_in_bytes().z();
+ const int output_stride_y = output->info()->strides_in_bytes().y();
+ const int output_stride_z = output->info()->strides_in_bytes().z();
+ const int kernel_stride_z = weights->info()->strides_in_bytes().z();
+ const int kernel_stride_w = weights->info()->strides_in_bytes()[3];
+ const int output_w = output->info()->dimension(0);
+ const int output_h = output->info()->dimension(1);
+ const int range_z = window.z().end() - window.z().start();
+ const int kernel_depth = weights->info()->dimension(Window::DimZ);
+ const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
+ const unsigned int conv_pad_left = conv_info.pad_left();
+ const unsigned int conv_pad_top = conv_info.pad_top();
// setup output window for the iterator
Window window_out = window;
@@ -414,7 +292,7 @@ public:
auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_val += num_elems_read_per_iteration, p_out += num_elems_written_per_iteration)
{
- internal_vst1q(p_out, internal_vmull(vk, internal_vld1q<stridex>(in_val), fixed_point_position));
+ internal_vst1q(p_out, internal_vmull(vk, internal_vld1q<stridex>(in_val)));
}
}
}
@@ -431,7 +309,7 @@ public:
auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_val += num_elems_read_per_iteration, p_out += num_elems_written_per_iteration)
{
- internal_vst1q(p_out, internal_vmlal(internal_vld1q<1>(p_out), vk, internal_vld1q<stridex>(in_val), fixed_point_position));
+ internal_vst1q(p_out, internal_vmlal(internal_vld1q<1>(p_out), vk, internal_vld1q<stridex>(in_val)));
}
}
}
@@ -469,7 +347,7 @@ void accumulate_results<3>(float16_t *buffer, const float16x8x2_t &values)
template <unsigned int stridex>
float32x4x2_t convolve_5x5(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4,
- const float *m0, const float *m1, const float *m2, const float *m3, const float *m4, int fixed_point_position);
+ const float *m0, const float *m1, const float *m2, const float *m3, const float *m4);
inline float32x4x3_t load_matrix_hi(const float *const m0, const float *const m1, const float *const m2)
{
@@ -511,9 +389,8 @@ inline float32x4x3_t load_input(const float *const in)
template <>
inline float32x4x2_t convolve_5x5<1>(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4,
- const float *m0, const float *m1, const float *m2, const float *m3, const float *m4, int fixed_point_position)
+ const float *m0, const float *m1, const float *m2, const float *m3, const float *m4)
{
- ARM_COMPUTE_UNUSED(fixed_point_position);
const float32x4x3_t vin0 = load_input(in_0);
const float32x4x3_t vin1 = load_input(in_1);
const float32x4x3_t vin2 = load_input(in_2);
@@ -601,10 +478,9 @@ inline float32x4x2_t convolve_5x5<1>(const float *in_0, const float *in_1, const
template <>
inline float32x4x2_t convolve_5x5<2>(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4,
- const float *m0, const float *m1, const float *m2, const float *m3, const float *m4, int fixed_point_position)
+ const float *m0, const float *m1, const float *m2, const float *m3, const float *m4)
{
- ARM_COMPUTE_UNUSED(fixed_point_position);
- float32x4x2_t out = convolve_5x5<1>(in_0, in_1, in_2, in_3, in_4, m0, m1, m2, m3, m4, fixed_point_position);
+ float32x4x2_t out = convolve_5x5<1>(in_0, in_1, in_2, in_3, in_4, m0, m1, m2, m3, m4);
out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1);
out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2);
out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3);
@@ -613,9 +489,9 @@ inline float32x4x2_t convolve_5x5<2>(const float *in_0, const float *in_1, const
template <>
inline float32x4x2_t convolve_5x5<3>(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4,
- const float *m0, const float *m1, const float *m2, const float *m3, const float *m4, int fixed_point_position)
+ const float *m0, const float *m1, const float *m2, const float *m3, const float *m4)
{
- float32x4x2_t out = convolve_5x5<1>(in_0, in_1, in_2, in_3, in_4, m0, m1, m2, m3, m4, fixed_point_position);
+ float32x4x2_t out = convolve_5x5<1>(in_0, in_1, in_2, in_3, in_4, m0, m1, m2, m3, m4);
out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
return out;
}
@@ -642,28 +518,6 @@ void accumulate_results<3>(float *buffer, const float32x4x2_t &values)
vst1_f32(buffer, vadd_f32(vld1_f32(buffer), vget_low_f32(values.val[0])));
}
-template <unsigned int stridex>
-void accumulate_results(qint16_t *buffer, const qint16x8x2_t &values);
-
-template <>
-void accumulate_results<1>(qint16_t *buffer, const qint16x8x2_t &values)
-{
- vst1q_qs16(buffer, vqaddq_qs16(vld1q_qs16(buffer), values.val[0]));
- vst1q_qs16(buffer + 8, vqaddq_qs16(vld1q_qs16(buffer + 8), values.val[1]));
-}
-
-template <>
-void accumulate_results<2>(qint16_t *buffer, const qint16x8x2_t &values)
-{
- vst1q_qs16(buffer, vqaddq_qs16(vld1q_qs16(buffer), values.val[0]));
-}
-
-template <>
-void accumulate_results<3>(qint16_t *buffer, const qint16x8x2_t &values)
-{
- vst1_qs16(buffer, vqadd_qs16(vld1_qs16(buffer), vget_low_s16(values.val[0])));
-}
-
template <typename T1>
class convolver_nhwc
{
@@ -745,7 +599,7 @@ public:
const auto we_addr = reinterpret_cast<const T1 *>(we_addr_base1 + x * kernel_stride_x);
const auto we_values = internal_vld1q<1>(we_addr);
- out_values = internal_vmlal(out_values, in_values, we_values, 0);
+ out_values = internal_vmlal(out_values, in_values, we_values);
}
out_val += out_values[0];
@@ -784,24 +638,23 @@ public:
const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
{
ARM_COMPUTE_UNUSED(num_elems_read_per_iteration);
- const int input_stride_x = input->info()->strides_in_bytes().x();
- const int input_stride_y = input->info()->strides_in_bytes().y();
- const int input_stride_z = input->info()->strides_in_bytes().z();
- const int output_stride_y = output->info()->strides_in_bytes().y();
- const int output_stride_z = output->info()->strides_in_bytes().z();
- const int kernel_stride_x = weights->info()->strides_in_bytes().x();
- const int kernel_stride_y = weights->info()->strides_in_bytes().y();
- const int kernel_stride_z = weights->info()->strides_in_bytes().z();
- const int kernel_stride_w = weights->info()->strides_in_bytes()[3];
- const int output_w = output->info()->dimension(0);
- const int output_h = output->info()->dimension(1);
- const int num_planes_z = window.z().end() - window.z().start();
- const int delta_input = get_input_num_elems_processed<stridex>(num_elems_written_per_iteration);
- const int kernel_depth = weights->info()->dimension(Window::DimZ);
- const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
- const unsigned int conv_pad_left = conv_info.pad_left();
- const unsigned int conv_pad_top = conv_info.pad_top();
- const int fixed_point_position = input->info()->fixed_point_position();
+ const int input_stride_x = input->info()->strides_in_bytes().x();
+ const int input_stride_y = input->info()->strides_in_bytes().y();
+ const int input_stride_z = input->info()->strides_in_bytes().z();
+ const int output_stride_y = output->info()->strides_in_bytes().y();
+ const int output_stride_z = output->info()->strides_in_bytes().z();
+ const int kernel_stride_x = weights->info()->strides_in_bytes().x();
+ const int kernel_stride_y = weights->info()->strides_in_bytes().y();
+ const int kernel_stride_z = weights->info()->strides_in_bytes().z();
+ const int kernel_stride_w = weights->info()->strides_in_bytes()[3];
+ const int output_w = output->info()->dimension(0);
+ const int output_h = output->info()->dimension(1);
+ const int num_planes_z = window.z().end() - window.z().start();
+ const int delta_input = get_input_num_elems_processed<stridex>(num_elems_written_per_iteration);
+ const int kernel_depth = weights->info()->dimension(Window::DimZ);
+ const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
+ const unsigned int conv_pad_left = conv_info.pad_left();
+ const unsigned int conv_pad_top = conv_info.pad_top();
// setup output window for the iterator
Window window_out = window;
@@ -864,7 +717,7 @@ public:
for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration)
{
- auto vres = convolve_3x3<stridex>(in_top, in_mid, in_low, vk_r0, vk_r1, vk_r2, fixed_point_position);
+ auto vres = convolve_3x3<stridex>(in_top, in_mid, in_low, vk_r0, vk_r1, vk_r2);
store_results<stridex>(p_out, vres);
}
}
@@ -889,7 +742,7 @@ public:
for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration)
{
- auto vres = convolve_3x3<stridex>(in_top, in_mid, in_low, vk_r0, vk_r1, vk_r2, fixed_point_position);
+ auto vres = convolve_3x3<stridex>(in_top, in_mid, in_low, vk_r0, vk_r1, vk_r2);
accumulate_results<stridex>(p_out, vres);
}
}
@@ -908,24 +761,23 @@ public:
const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
{
ARM_COMPUTE_UNUSED(num_elems_read_per_iteration);
- const int input_stride_x = input->info()->strides_in_bytes().x();
- const int input_stride_y = input->info()->strides_in_bytes().y();
- const int input_stride_z = input->info()->strides_in_bytes().z();
- const int output_stride_y = output->info()->strides_in_bytes().y();
- const int output_stride_z = output->info()->strides_in_bytes().z();
- const int kernel_stride_x = weights->info()->strides_in_bytes().x();
- const int kernel_stride_y = weights->info()->strides_in_bytes().y();
- const int kernel_stride_z = weights->info()->strides_in_bytes().z();
- const int kernel_stride_w = weights->info()->strides_in_bytes()[3];
- const int output_w = output->info()->dimension(0);
- const int output_h = output->info()->dimension(1);
- const int num_planes_z = window.z().end() - window.z().start();
- const int delta_input = get_input_num_elems_processed<stridex>(num_elems_written_per_iteration);
- const int kernel_depth = weights->info()->dimension(Window::DimZ);
- const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
- const unsigned int conv_pad_left = conv_info.pad_left();
- const unsigned int conv_pad_top = conv_info.pad_top();
- const int fixed_point_position = input->info()->fixed_point_position();
+ const int input_stride_x = input->info()->strides_in_bytes().x();
+ const int input_stride_y = input->info()->strides_in_bytes().y();
+ const int input_stride_z = input->info()->strides_in_bytes().z();
+ const int output_stride_y = output->info()->strides_in_bytes().y();
+ const int output_stride_z = output->info()->strides_in_bytes().z();
+ const int kernel_stride_x = weights->info()->strides_in_bytes().x();
+ const int kernel_stride_y = weights->info()->strides_in_bytes().y();
+ const int kernel_stride_z = weights->info()->strides_in_bytes().z();
+ const int kernel_stride_w = weights->info()->strides_in_bytes()[3];
+ const int output_w = output->info()->dimension(0);
+ const int output_h = output->info()->dimension(1);
+ const int num_planes_z = window.z().end() - window.z().start();
+ const int delta_input = get_input_num_elems_processed<stridex>(num_elems_written_per_iteration);
+ const int kernel_depth = weights->info()->dimension(Window::DimZ);
+ const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
+ const unsigned int conv_pad_left = conv_info.pad_left();
+ const unsigned int conv_pad_top = conv_info.pad_top();
// setup output window for the iterator
Window window_out = window;
@@ -976,7 +828,7 @@ public:
for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
in_0 += delta_input, in_1 += delta_input, in_2 += delta_input, in_3 += delta_input, in_4 += delta_input, p_out += num_elems_written_per_iteration)
{
- auto vres = convolve_5x5<stridex>(in_0, in_1, in_2, in_3, in_4, ptr_k_r0, ptr_k_r1, ptr_k_r2, ptr_k_r3, ptr_k_r4, fixed_point_position);
+ auto vres = convolve_5x5<stridex>(in_0, in_1, in_2, in_3, in_4, ptr_k_r0, ptr_k_r1, ptr_k_r2, ptr_k_r3, ptr_k_r4);
store_results<stridex>(p_out, vres);
}
}
@@ -1001,7 +853,7 @@ public:
for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
in_0 += delta_input, in_1 += delta_input, in_2 += delta_input, in_3 += delta_input, in_4 += delta_input, p_out += num_elems_written_per_iteration)
{
- auto vres = convolve_5x5<stridex>(in_0, in_1, in_2, in_3, in_4, ptr_k_r0, ptr_k_r1, ptr_k_r2, ptr_k_r3, ptr_k_r4, fixed_point_position);
+ auto vres = convolve_5x5<stridex>(in_0, in_1, in_2, in_3, in_4, ptr_k_r0, ptr_k_r1, ptr_k_r2, ptr_k_r3, ptr_k_r4);
accumulate_results<stridex>(p_out, vres);
}
}
@@ -1120,7 +972,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
const DataLayout data_layout = input->data_layout();
@@ -1140,11 +992,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*input, *weights, conv_info);
DataType data_type = input->data_type();
- if(is_data_type_fixed_point(data_type))
- {
- // Promote data type in case of fixed point
- data_type = ((data_type == DataType::QS8) ? DataType::QS16 : DataType::QS32);
- }
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
ARM_COMPUTE_RETURN_ERROR_ON(output->data_type() != data_type);
@@ -1180,11 +1027,9 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
{
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- case DataType::QS8:
- case DataType::QS16:
num_elems_written_per_iteration = 8;
break;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
case DataType::F32:
if(run_optim_small_tensor_info(input))
{
@@ -1215,13 +1060,11 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
break;
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- case DataType::QS8:
- case DataType::QS16:
num_weight_elems_read_per_row = 8 + kernel_size - 1;
num_elems_read_per_iteration = 24;
num_elems_written_per_iteration = 32 >> conv_stride_x;
break;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
default:
ARM_COMPUTE_ERROR("Data type not supported.");
break;
@@ -1315,14 +1158,8 @@ void NEDirectConvolutionLayerKernel::configure(const ITensor *input, const ITens
DataType data_type = input->info()->data_type();
- if(is_data_type_fixed_point(data_type))
- {
- // Promote data type in case of fixed point
- data_type = ((data_type == DataType::QS8) ? DataType::QS16 : DataType::QS32);
- }
-
// Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output->info(), output_shape, 1, data_type, input->info()->fixed_point_position());
+ auto_init_if_empty(*output->info(), output_shape, 1, data_type);
// Perform validation step
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), output->info(), conv_info));
@@ -1371,12 +1208,6 @@ void NEDirectConvolutionLayerKernel::run(const Window &window, const ThreadInfo
{
switch(_input->info()->data_type())
{
- case DataType::QS8:
- convolve_1x1<qint8_t, qint16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
- break;
- case DataType::QS16:
- convolve_1x1<qint16_t, qint32_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
- break;
case DataType::F32:
convolve_1x1<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
break;
@@ -1395,9 +1226,6 @@ void NEDirectConvolutionLayerKernel::run(const Window &window, const ThreadInfo
{
switch(_input->info()->data_type())
{
- case DataType::QS8:
- convolve_3x3<qint8_t, qint16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
- break;
case DataType::F32:
convolve_3x3<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
break;
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
index 37a3804289..e4cd4d0465 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
@@ -45,22 +45,15 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8,
- DataType::QS16, DataType::F16,
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8,
+ DataType::F16,
DataType::QS32, DataType::S32, DataType::F32);
if(bias != nullptr)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::QS32, DataType::S32, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::F16, DataType::QS32, DataType::S32, DataType::F32);
- if(is_data_type_fixed_point(input->data_type()))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QS8 && bias->data_type() != DataType::QS8, "Wrong data type for bias");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QS16 && bias->data_type() != DataType::QS8, "Wrong data type for bias");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QS32 && bias->data_type() != DataType::QS16, "Wrong data type for bias");
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, bias);
- }
- else if(is_data_type_quantized_asymmetric(input->data_type()))
+ if(is_data_type_quantized_asymmetric(input->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
}
@@ -80,17 +73,10 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con
// Checks performed when output is configured
if((output != nullptr) && (output->total_size() != 0))
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
- if(is_data_type_fixed_point(input->data_type()))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QS8 && output->data_type() != DataType::QS8, "Wrong data type for output");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QS16 && output->data_type() != DataType::QS8, "Wrong data type for output");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QS32 && output->data_type() != DataType::QS16, "Wrong data type for output");
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
- }
- else if(is_data_type_quantized_asymmetric(output->data_type()))
+ if(is_data_type_quantized_asymmetric(output->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::S32 && output->data_type() != DataType::QASYMM8, "Wrong data type for bias");
}
@@ -168,81 +154,24 @@ inline float32x4_t internal_vld1q(const float *in)
{
return vld1q_f32(in);
}
-inline qint8x16_t internal_vld1q(const qint8_t *in)
-{
- return vld1q_qs8(in);
-}
-inline qint16x8_t internal_vld1q(const qint16_t *in)
-{
- return vld1q_qs16(in);
-}
-inline qint32x4_t internal_vld1q(const qint32_t *in)
-{
- return vld1q_s32(in);
-}
// Internal store
inline void internal_vst1q(float *p, const float32x4_t &v)
{
vst1q_f32(p, v);
}
-inline void internal_vst1q(qint8_t *p, const qint8x16_t &v)
-{
- vst1q_qs8(p, v);
-}
-inline void internal_vst1q(qint8_t *p, const qint16x8_t &v)
-{
- vst1_qs8(p, vqmovn_s16(v));
-}
-inline void internal_vst1q(qint16_t *p, const qint16x8_t &v)
-{
- vst1q_qs16(p, v);
-}
-inline void internal_vst1q(qint32_t *p, const qint32x4_t &v)
-{
- vst1q_s32(p, v);
-}
-
-inline void internal_vst1q(qint16_t *p, const qint32x4_t &v)
-{
- vst1_qs16(p, vqmovn_qs32(v));
-}
// Internal vdup
inline float32x4_t internal_vdupq_n(float v)
{
return vdupq_n_f32(v);
}
-inline qint8x16_t internal_vdupq_n(qint8_t v)
-{
- return vdupq_n_qs8(v);
-}
-inline qint16x8_t internal_vdupq_n(qint16_t v)
-{
- return vdupq_n_qs16(v);
-}
-inline qint32x4_t internal_vdupq_n(qint32_t v)
-{
- return vdupq_n_qs32(v);
-}
// Internal vadd
inline float32x4_t internal_vqaddq(const float32x4_t &x, const float32x4_t &y)
{
return vaddq_f32(x, y);
}
-inline qint8x16_t internal_vqaddq(const qint8x16_t &x, const qint8x16_t &y)
-{
- return vqaddq_qs8(x, y);
-}
-inline qint16x8_t internal_vqaddq(const qint16x8_t &x, const qint16x8_t &y)
-{
- return vqaddq_qs16(x, y);
-}
-inline qint32x4_t internal_vqaddq(const qint32x4_t &x, const qint32x4_t &y)
-{
- return vqaddq_qs32(x, y);
-}
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
inline float16x8_t internal_vld1q(const float16_t *in)
@@ -494,39 +423,6 @@ void NEDirectConvolutionLayerOutputStageKernel::configure(ITensor *input, const
{
switch(input->info()->data_type())
{
- case DataType::QS8:
- {
- if(bias == nullptr)
- {
- _func = (output == nullptr) ? &output_stage<qint8_t, qint8_t, true, false> : &output_stage<qint8_t, qint8_t, false, false>;
- }
- else
- {
- _func = (output == nullptr) ? &output_stage<qint8_t, qint8_t, true, true> : &output_stage<qint8_t, qint8_t, false, true>;
- }
- break;
- }
- case DataType::QS16:
- {
- if(bias != nullptr && bias->info()->data_type() == DataType::QS8)
- {
- _func = (output == nullptr) ? &output_stage<qint16_t, qint8_t, true, true> : &output_stage<qint16_t, qint8_t, false, true>;
- }
- else if(bias == nullptr)
- {
- _func = (output == nullptr) ? &output_stage<qint16_t, qint8_t, true, false> : &output_stage<qint16_t, qint8_t, false, false>;
- }
- else
- {
- ARM_COMPUTE_ERROR("Not implemented");
- }
- break;
- }
- case DataType::QS32:
- {
- _func = (output == nullptr) ? &output_stage<qint32_t, qint16_t, true, true> : &output_stage<qint32_t, qint16_t, false, true>;
- break;
- }
case DataType::S32:
{
_func = (bias == nullptr) ? &output_stage<int32_t, uint8_t, false, false> : &output_stage<int32_t, uint8_t, false, true>;
diff --git a/src/core/NEON/kernels/NEFillBorderKernel.cpp b/src/core/NEON/kernels/NEFillBorderKernel.cpp
index 747b8b1bfe..3d08cafa93 100644
--- a/src/core/NEON/kernels/NEFillBorderKernel.cpp
+++ b/src/core/NEON/kernels/NEFillBorderKernel.cpp
@@ -105,8 +105,8 @@ NEFillBorderKernel::NEFillBorderKernel()
void NEFillBorderKernel::configure(ITensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(tensor, 1, DataType::U8, DataType::QS8, DataType::QASYMM8,
- DataType::QS16, DataType::U16, DataType::S16,
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(tensor, 1, DataType::U8, DataType::QASYMM8,
+ DataType::U16, DataType::S16,
DataType::U32, DataType::S32,
DataType::F16, DataType::F32);
@@ -147,7 +147,6 @@ void NEFillBorderKernel::run(const Window &window, const ThreadInfo &info)
case DataType::U8:
fill_constant_value_single_channel<uint8_t>(window);
break;
- case DataType::QS8:
case DataType::S8:
fill_constant_value_single_channel<int8_t>(window);
break;
@@ -155,7 +154,6 @@ void NEFillBorderKernel::run(const Window &window, const ThreadInfo &info)
fill_constant_value_single_channel<uint16_t>(window);
break;
case DataType::S16:
- case DataType::QS16:
fill_constant_value_single_channel<int16_t>(window);
break;
case DataType::U32:
@@ -192,7 +190,6 @@ void NEFillBorderKernel::run(const Window &window, const ThreadInfo &info)
case DataType::U8:
fill_replicate_single_channel<uint8_t>(window);
break;
- case DataType::QS8:
case DataType::S8:
fill_replicate_single_channel<int8_t>(window);
break;
@@ -200,7 +197,6 @@ void NEFillBorderKernel::run(const Window &window, const ThreadInfo &info)
fill_replicate_single_channel<uint16_t>(window);
break;
case DataType::S16:
- case DataType::QS16:
fill_replicate_single_channel<int16_t>(window);
break;
case DataType::U32:
diff --git a/src/core/NEON/kernels/NEFloorKernel.cpp b/src/core/NEON/kernels/NEFloorKernel.cpp
index 72b652d5dc..872ac2661e 100644
--- a/src/core/NEON/kernels/NEFloorKernel.cpp
+++ b/src/core/NEON/kernels/NEFloorKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -40,7 +40,7 @@ void NEFloorKernel::configure(const ITensor *input, ITensor *output)
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
// Auto initialize output
- auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
+ auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type());
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
diff --git a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
index 12755a45f8..6519a39b9c 100644
--- a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
@@ -44,11 +44,10 @@ namespace
{
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::U8, DataType::S8,
- DataType::QS16, DataType::U16, DataType::S16, DataType::U32, DataType::S32,
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::U8, DataType::S8,
+ DataType::U16, DataType::S16, DataType::U32, DataType::S32,
DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
if(output->total_size() != 0)
{
@@ -57,7 +56,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
output_shape.set(1, std::ceil(input->dimension(1) / 4.0f));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
}
return Status{};
diff --git a/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp
index cab3c7a58f..421a6f0ef9 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp
@@ -43,9 +43,8 @@ namespace
{
inline Status validate_arguments(const ITensorInfo *accum, const ITensorInfo *biases)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(biases, accum);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(biases, accum);
ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != accum->dimension(0));
@@ -161,33 +160,6 @@ void NEGEMMMatrixAccumulateBiasesKernel::run(const Window &window, const ThreadI
break;
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- case DataType::QS8:
- {
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const qint8x16_t accum = vld1q_qs8(reinterpret_cast<const qint8_t *>(in0_out.ptr()));
- const qint8x16_t biases = vld1q_qs8(reinterpret_cast<const qint8_t *>(in1.ptr()));
-
- vst1q_qs8(reinterpret_cast<qint8_t *>(in0_out.ptr()), vqaddq_qs8(accum, biases));
- },
- in0_out, in1);
- break;
- }
- case DataType::QS16:
- {
- execute_window_loop(window, [&](const Coordinates & id)
- {
- qint16x8x2_t accum = vld2q_s16(reinterpret_cast<const qint16_t *>(in0_out.ptr()));
- const qint16x8x2_t biases = vld2q_s16(reinterpret_cast<const qint16_t *>(in1.ptr()));
-
- accum.val[0] = vqaddq_qs16(accum.val[0], biases.val[0]);
- accum.val[1] = vqaddq_qs16(accum.val[1], biases.val[1]);
-
- vst2q_s16(reinterpret_cast<qint16_t *>(in0_out.ptr()), accum);
- },
- in0_out, in1);
- break;
- }
default:
ARM_COMPUTE_ERROR("Data type not supported");
break;
diff --git a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
index dfba74355b..d02504329a 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -91,54 +91,6 @@ void matrix_addition_f16(const ITensor *input, ITensor *output, const Window &wi
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-void matrix_addition_qs8(const ITensor *input, ITensor *output, const Window &window, float beta)
-{
- const int fixed_point_position = input->info()->fixed_point_position();
- const qint8x16_t beta_qs8 = vdupq_n_qs8(sqcvt_qs8_f32(beta, fixed_point_position));
-
- Iterator in(input, window);
- Iterator out(output, window);
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto in_ptr = reinterpret_cast<const qint8_t *>(in.ptr());
- const auto out_ptr = reinterpret_cast<qint8_t *>(out.ptr());
-
- qint8x16_t alpha_ab = vld1q_qs8(out_ptr);
- const qint8x16_t c = vld1q_qs8(in_ptr);
-
- // Multiply matrix C by its weight and accumulate
- alpha_ab = vqmlaq_qs8(alpha_ab, c, beta_qs8, fixed_point_position);
-
- vst1q_qs8(out_ptr, alpha_ab);
- },
- in, out);
-}
-
-void matrix_addition_qs16(const ITensor *input, ITensor *output, const Window &window, float beta)
-{
- const int fixed_point_position = input->info()->fixed_point_position();
- const qint16x8_t beta_qs16 = vdupq_n_qs16(sqcvt_qs16_f32(beta, fixed_point_position));
-
- Iterator in(input, window);
- Iterator out(output, window);
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto in_ptr = reinterpret_cast<const qint16_t *>(in.ptr());
- const auto out_ptr = reinterpret_cast<qint16_t *>(out.ptr());
-
- qint16x8x2_t alpha_ab = vld2q_s16(out_ptr);
- const qint16x8x2_t c = vld2q_s16(in_ptr);
-
- // Multiply matrix C by its weight and accumulate
- alpha_ab.val[0] = vqmlaq_qs16(alpha_ab.val[0], c.val[0], beta_qs16, fixed_point_position);
- alpha_ab.val[1] = vqmlaq_qs16(alpha_ab.val[1], c.val[1], beta_qs16, fixed_point_position);
-
- vst2q_s16(out_ptr, alpha_ab);
- },
- in, out);
-}
} // namespace
NEGEMMMatrixAdditionKernel::NEGEMMMatrixAdditionKernel()
@@ -148,10 +100,9 @@ NEGEMMMatrixAdditionKernel::NEGEMMMatrixAdditionKernel()
void NEGEMMMatrixAdditionKernel::configure(const ITensor *input, ITensor *output, float beta)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != output->info()->dimension(0));
ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != output->info()->dimension(1));
@@ -160,12 +111,6 @@ void NEGEMMMatrixAdditionKernel::configure(const ITensor *input, ITensor *output
case DataType::F32:
_func = &matrix_addition_f32;
break;
- case DataType::QS8:
- _func = &matrix_addition_qs8;
- break;
- case DataType::QS16:
- _func = &matrix_addition_qs16;
- break;
case DataType::F16:
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
_func = &matrix_addition_f16;
diff --git a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
index 69b052a9bd..196398a2de 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
@@ -356,263 +356,6 @@ void vector_matrix_multiply_f32(const ITensor *input0, const ITensor *input1, IT
}
template <bool multiply_alpha>
-void vector_matrix_multiply_qs8(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, const ThreadInfo &info, float alpha)
-{
- const auto width_matrix_b = static_cast<int>(output->info()->dimension(0));
- const auto in_b_stride = static_cast<int>(input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type()));
- const auto num_elems_vec_a = static_cast<int>(input0->info()->dimension(0));
- const int fixed_point_position = input0->info()->fixed_point_position();
-
- // The implementation computes 32 elements per iteration
- const int window_start_x = 32 * info.thread_id;
- const int window_step_x = 32 * info.num_threads;
- // Make sure (window_end_x - window_start_x) is a multiple of window_step_x
- const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
-
- Window win_out(window);
- win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
- win_out.set(Window::DimY, Window::Dimension(0, 1, 1));
-
- Window win_a(window);
- win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
- win_a.set(Window::DimY, Window::Dimension(0, 0, 0));
-
- Window win_b;
- // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
- // This scenario can happen when the the matrix multiplication is used to perform a convolution operation
- if(input1->info()->num_dimensions() >= 3)
- {
- win_b = window;
- }
- win_b.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
- win_b.set(Window::DimY, Window::Dimension(0, 1, 1));
-
- Iterator ina(input0, win_a);
- Iterator inb(input1, win_b);
- Iterator out(output, win_out);
-
- execute_window_loop(win_out, [&](const Coordinates & id)
- {
- if(id.x() > width_matrix_b)
- {
- return;
- }
-
- // Reset accumulators
- qint16x8_t acc00_qs16 = vdupq_n_qs16(0);
- qint16x8_t acc01_qs16 = vdupq_n_qs16(0);
- qint16x8_t acc02_qs16 = vdupq_n_qs16(0);
- qint16x8_t acc03_qs16 = vdupq_n_qs16(0);
-
- auto vec_a = reinterpret_cast<const qint8_t *>(ina.ptr());
- auto matrix_b = reinterpret_cast<const qint8_t *>(inb.ptr());
-
- auto vec_a_end_addr = vec_a + num_elems_vec_a;
- for(; vec_a <= (vec_a_end_addr - 2);)
- {
- const qint8x8_t a0 = vld1_dup_qs8(vec_a + 0);
- const qint8x8_t a1 = vld1_dup_qs8(vec_a + 1);
-
- const qint8x8_t b00 = vld1_qs8(matrix_b + 0 + 0 * in_b_stride);
- const qint8x8_t b01 = vld1_qs8(matrix_b + 8 + 0 * in_b_stride);
- const qint8x8_t b02 = vld1_qs8(matrix_b + 16 + 0 * in_b_stride);
- const qint8x8_t b03 = vld1_qs8(matrix_b + 24 + 0 * in_b_stride);
- const qint8x8_t b10 = vld1_qs8(matrix_b + 0 + 1 * in_b_stride);
- const qint8x8_t b11 = vld1_qs8(matrix_b + 8 + 1 * in_b_stride);
- const qint8x8_t b12 = vld1_qs8(matrix_b + 16 + 1 * in_b_stride);
- const qint8x8_t b13 = vld1_qs8(matrix_b + 24 + 1 * in_b_stride);
-
- // First accumulation
- acc00_qs16 = vqmlal_qs8(acc00_qs16, b00, a0, fixed_point_position);
- acc01_qs16 = vqmlal_qs8(acc01_qs16, b01, a0, fixed_point_position);
- acc02_qs16 = vqmlal_qs8(acc02_qs16, b02, a0, fixed_point_position);
- acc03_qs16 = vqmlal_qs8(acc03_qs16, b03, a0, fixed_point_position);
-
- // Second accumulation
- acc00_qs16 = vqmlal_qs8(acc00_qs16, b10, a1, fixed_point_position);
- acc01_qs16 = vqmlal_qs8(acc01_qs16, b11, a1, fixed_point_position);
- acc02_qs16 = vqmlal_qs8(acc02_qs16, b12, a1, fixed_point_position);
- acc03_qs16 = vqmlal_qs8(acc03_qs16, b13, a1, fixed_point_position);
-
- vec_a += 2;
- matrix_b += 2 * in_b_stride;
- }
-
- for(; vec_a < vec_a_end_addr;)
- {
- const qint8x8_t a0 = vld1_dup_qs8(vec_a);
-
- const qint8x8_t b00 = vld1_qs8(matrix_b + 0);
- const qint8x8_t b01 = vld1_qs8(matrix_b + 8);
- const qint8x8_t b02 = vld1_qs8(matrix_b + 16);
- const qint8x8_t b03 = vld1_qs8(matrix_b + 24);
-
- acc00_qs16 = vqmlal_qs8(acc00_qs16, b00, a0, fixed_point_position);
- acc01_qs16 = vqmlal_qs8(acc01_qs16, b01, a0, fixed_point_position);
- acc02_qs16 = vqmlal_qs8(acc02_qs16, b02, a0, fixed_point_position);
- acc03_qs16 = vqmlal_qs8(acc03_qs16, b03, a0, fixed_point_position);
-
- vec_a += 1;
- matrix_b += in_b_stride;
- }
-
- // Convert back to qint8x8_t and saturate
- qint8x8_t acc00_qs8 = vqmovn_qs16(acc00_qs16);
- qint8x8_t acc01_qs8 = vqmovn_qs16(acc01_qs16);
- qint8x8_t acc02_qs8 = vqmovn_qs16(acc02_qs16);
- qint8x8_t acc03_qs8 = vqmovn_qs16(acc03_qs16);
-
- // Multiply by the weight of the matrix product (alpha)
- if(multiply_alpha)
- {
- const qint8x8_t alpha_qs8 = vdup_n_qs8(sqcvt_qs8_f32(alpha, fixed_point_position));
- acc00_qs8 = vqmul_qs8(acc00_qs8, alpha_qs8, fixed_point_position);
- acc01_qs8 = vqmul_qs8(acc01_qs8, alpha_qs8, fixed_point_position);
- acc02_qs8 = vqmul_qs8(acc02_qs8, alpha_qs8, fixed_point_position);
- acc03_qs8 = vqmul_qs8(acc03_qs8, alpha_qs8, fixed_point_position);
- }
-
- const auto mtx_out0 = reinterpret_cast<qint8_t *>(out.ptr());
-
- // Store 8x4 output elements
- vst1_qs8(mtx_out0 + 0, acc00_qs8);
- vst1_qs8(mtx_out0 + 8, acc01_qs8);
- vst1_qs8(mtx_out0 + 16, acc02_qs8);
- vst1_qs8(mtx_out0 + 24, acc03_qs8);
- },
- ina, inb, out);
-}
-
-template <bool multiply_alpha>
-void vector_matrix_multiply_qs16(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, const ThreadInfo &info, float alpha)
-{
- const auto width_matrix_b = static_cast<int>(output->info()->dimension(0));
- const auto in_b_stride = static_cast<int>(input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type()));
- const auto num_elems_vec_a = static_cast<int>(input0->info()->dimension(0));
- const int fixed_point_position = input0->info()->fixed_point_position();
-
- // The implementation computes 16 elements per iteration
- const int window_start_x = 16 * info.thread_id;
- const int window_step_x = 16 * info.num_threads;
- // Make sure (window_end_x - window_start_x) is a multiple of window_step_x
- const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
- ARM_COMPUTE_ERROR_ON_MSG((window_end_x - window_start_x) % window_step_x, " (window_end_x - window_start_x) must be multiple of window_step_x");
-
- Window win_out(window);
- win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
- win_out.set(Window::DimY, Window::Dimension(0, 1, 1));
-
- Window win_a(window);
- win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
- win_a.set(Window::DimY, Window::Dimension(0, 0, 0));
-
- Window win_b;
- // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
- // This scenario can happen when the the matrix multiplication is used to perform a convolution operation
- if(input1->info()->num_dimensions() >= 3)
- {
- win_b = window;
- }
- win_b.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));
- win_b.set(Window::DimY, Window::Dimension(0, 1, 1));
-
- Iterator ina(input0, win_a);
- Iterator inb(input1, win_b);
- Iterator out(output, win_out);
-
- execute_window_loop(win_out, [&](const Coordinates & id)
- {
- if(id.x() > width_matrix_b)
- {
- return;
- }
-
- // Reset accumulators
- qint32x4_t acc00_qs32 = vdupq_n_qs32(0);
- qint32x4_t acc01_qs32 = vdupq_n_qs32(0);
- qint32x4_t acc02_qs32 = vdupq_n_qs32(0);
- qint32x4_t acc03_qs32 = vdupq_n_qs32(0);
-
- auto vec_a = reinterpret_cast<const qint16_t *>(ina.ptr());
- auto matrix_b = reinterpret_cast<const qint16_t *>(inb.ptr());
-
- auto vec_a_end_addr = vec_a + num_elems_vec_a;
- for(; vec_a <= (vec_a_end_addr - 2);)
- {
- const qint16x4_t a0 = vld1_dup_qs16(vec_a + 0);
- const qint16x4_t a1 = vld1_dup_qs16(vec_a + 1);
-
- const qint16x4_t b00 = vld1_qs16(matrix_b + 0 + 0 * in_b_stride);
- const qint16x4_t b01 = vld1_qs16(matrix_b + 4 + 0 * in_b_stride);
- const qint16x4_t b02 = vld1_qs16(matrix_b + 8 + 0 * in_b_stride);
- const qint16x4_t b03 = vld1_qs16(matrix_b + 12 + 0 * in_b_stride);
- const qint16x4_t b10 = vld1_qs16(matrix_b + 0 + 1 * in_b_stride);
- const qint16x4_t b11 = vld1_qs16(matrix_b + 4 + 1 * in_b_stride);
- const qint16x4_t b12 = vld1_qs16(matrix_b + 8 + 1 * in_b_stride);
- const qint16x4_t b13 = vld1_qs16(matrix_b + 12 + 1 * in_b_stride);
-
- // First accumulation
- acc00_qs32 = vqmlal_qs16(acc00_qs32, b00, a0, fixed_point_position);
- acc01_qs32 = vqmlal_qs16(acc01_qs32, b01, a0, fixed_point_position);
- acc02_qs32 = vqmlal_qs16(acc02_qs32, b02, a0, fixed_point_position);
- acc03_qs32 = vqmlal_qs16(acc03_qs32, b03, a0, fixed_point_position);
-
- // Second accumulation
- acc00_qs32 = vqmlal_qs16(acc00_qs32, b10, a1, fixed_point_position);
- acc01_qs32 = vqmlal_qs16(acc01_qs32, b11, a1, fixed_point_position);
- acc02_qs32 = vqmlal_qs16(acc02_qs32, b12, a1, fixed_point_position);
- acc03_qs32 = vqmlal_qs16(acc03_qs32, b13, a1, fixed_point_position);
-
- vec_a += 2;
- matrix_b += 2 * in_b_stride;
- }
-
- for(; vec_a < vec_a_end_addr;)
- {
- const qint16x4_t a0 = vld1_dup_qs16(vec_a);
-
- const qint16x4_t b00 = vld1_qs16(matrix_b + 0);
- const qint16x4_t b01 = vld1_qs16(matrix_b + 4);
- const qint16x4_t b02 = vld1_qs16(matrix_b + 8);
- const qint16x4_t b03 = vld1_qs16(matrix_b + 12);
-
- acc00_qs32 = vqmlal_qs16(acc00_qs32, b00, a0, fixed_point_position);
- acc01_qs32 = vqmlal_qs16(acc01_qs32, b01, a0, fixed_point_position);
- acc02_qs32 = vqmlal_qs16(acc02_qs32, b02, a0, fixed_point_position);
- acc03_qs32 = vqmlal_qs16(acc03_qs32, b03, a0, fixed_point_position);
-
- vec_a += 1;
- matrix_b += in_b_stride;
- }
-
- // Convert back to qint16x4_t and saturate
- qint16x4_t acc00_qs16 = vqmovn_qs32(acc00_qs32);
- qint16x4_t acc01_qs16 = vqmovn_qs32(acc01_qs32);
- qint16x4_t acc02_qs16 = vqmovn_qs32(acc02_qs32);
- qint16x4_t acc03_qs16 = vqmovn_qs32(acc03_qs32);
-
- // Multiply by the weight of the matrix product (alpha)
- if(multiply_alpha)
- {
- const qint16x4_t alpha_qs16 = vdup_n_qs16(sqcvt_qs16_f32(alpha, fixed_point_position));
- acc00_qs16 = vqmul_qs16(acc00_qs16, alpha_qs16, fixed_point_position);
- acc01_qs16 = vqmul_qs16(acc01_qs16, alpha_qs16, fixed_point_position);
- acc02_qs16 = vqmul_qs16(acc02_qs16, alpha_qs16, fixed_point_position);
- acc03_qs16 = vqmul_qs16(acc03_qs16, alpha_qs16, fixed_point_position);
- }
-
- const auto mtx_out0 = reinterpret_cast<qint16_t *>(out.ptr());
-
- // Store 16x4 output elements
- vst1_qs16(mtx_out0 + 0, acc00_qs16);
- vst1_qs16(mtx_out0 + 4, acc01_qs16);
- vst1_qs16(mtx_out0 + 8, acc02_qs16);
- vst1_qs16(mtx_out0 + 12, acc03_qs16);
- },
- ina, inb, out);
-}
-
-template <bool multiply_alpha>
void matrix_matrix_multiply_f32(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, float alpha)
{
const size_t in_b_stride = input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type());
@@ -1063,361 +806,12 @@ void matrix_matrix_multiply_f16(const ITensor *input0, const ITensor *input1, IT
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
}
-template <bool multiply_alpha>
-void matrix_matrix_multiply_qs8(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, float alpha)
-{
- const size_t in_b_stride = input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type());
- const size_t out_stride1 = output->info()->strides_in_bytes()[1] / data_size_from_type(output->info()->data_type());
- const size_t out_stride2 = out_stride1 * 2;
- const size_t out_stride3 = out_stride1 * 3;
- const int num_elems_matrix_b_x = input1->info()->dimension(0);
- const int fixed_point_position = input0->info()->fixed_point_position();
- const qint8x8_t alpha_qs8 = vdup_n_qs8(sqcvt_qs8_f32(alpha, fixed_point_position));
- ARM_COMPUTE_UNUSED(alpha_qs8);
-
- // Set step_x and step_y for matrix A. Scale by a factor of 4 the Y range as the input interleaved matrix A has 4 times less the rows of the output matrix
- Window win_a(window);
- win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
- win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, std::max(window.y().end() / 4, 1), 1));
-
- Window win_b;
- // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
- // This scenario can happen when the the matrix multiplication is used to perform a convolution operation
- if(input1->info()->num_dimensions() >= 3)
- {
- win_b = window;
- }
- // Set step_x and step_y for matrix B. Scale by a factor of 16 the X range as the input transposed matrix A has 16 times less the cols of the output matrix
- // The step along the x direction is 2 times the in_b_stride because for each iteration we compute 2 blocks of size 16x4
- win_b.set(Window::DimX, Window::Dimension(window.x().start() / 16, window.x().end() / 16, 2 * in_b_stride));
- win_b.set(Window::DimY, Window::Dimension(0, 0, 0));
-
- Iterator ina(input0, win_a);
- Iterator inb(input1, win_b);
- Iterator out(output, window);
-
- // The implementation assumes that the matrix A and Matrix B have been reshaped respectively with NEGEMMInterleave4x4 and NEGEMMTranspose1xW
- // The reshaping of the matrices helps to have a cache friendly implementation and helps to avoid the data re-arrangements needed for computing 16x4 elements per iteration
- // All the values needed for computing a single 32x4 block will be read from consecutive memory positions
- execute_window_loop(window, [&](const Coordinates & id)
- {
- auto mtx_a0 = reinterpret_cast<const qint8_t *>(ina.ptr());
- auto mtx_b0 = reinterpret_cast<const qint8_t *>(inb.ptr());
- auto mtx_b1 = mtx_b0 + in_b_stride;
-
- qint16x8_t acc00_qs16 = vdupq_n_qs16(0);
- qint16x8_t acc10_qs16 = vdupq_n_qs16(0);
- qint16x8_t acc20_qs16 = vdupq_n_qs16(0);
- qint16x8_t acc30_qs16 = vdupq_n_qs16(0);
-
- qint16x8_t acc01_qs16 = vdupq_n_qs16(0);
- qint16x8_t acc11_qs16 = vdupq_n_qs16(0);
- qint16x8_t acc21_qs16 = vdupq_n_qs16(0);
- qint16x8_t acc31_qs16 = vdupq_n_qs16(0);
-
- qint16x8_t acc02_qs16 = vdupq_n_qs16(0);
- qint16x8_t acc12_qs16 = vdupq_n_qs16(0);
- qint16x8_t acc22_qs16 = vdupq_n_qs16(0);
- qint16x8_t acc32_qs16 = vdupq_n_qs16(0);
-
- qint16x8_t acc03_qs16 = vdupq_n_qs16(0);
- qint16x8_t acc13_qs16 = vdupq_n_qs16(0);
- qint16x8_t acc23_qs16 = vdupq_n_qs16(0);
- qint16x8_t acc33_qs16 = vdupq_n_qs16(0);
-
- int k = 0;
- // This for loop performs 2 accumulations
- for(; k <= (num_elems_matrix_b_x - 32); k += 32)
- {
- const qint8x8_t a0 = vld1_dup_qs8(mtx_a0 + 0);
- const qint8x8_t a1 = vld1_dup_qs8(mtx_a0 + 1);
- const qint8x8_t a2 = vld1_dup_qs8(mtx_a0 + 2);
- const qint8x8_t a3 = vld1_dup_qs8(mtx_a0 + 3);
- const qint8x8_t a4 = vld1_dup_qs8(mtx_a0 + 4);
- const qint8x8_t a5 = vld1_dup_qs8(mtx_a0 + 5);
- const qint8x8_t a6 = vld1_dup_qs8(mtx_a0 + 6);
- const qint8x8_t a7 = vld1_dup_qs8(mtx_a0 + 7);
-
- const qint8x8_t b00 = vld1_qs8(mtx_b0 + 0);
- const qint8x8_t b01 = vld1_qs8(mtx_b0 + 8);
- const qint8x8_t b10 = vld1_qs8(mtx_b1 + 0);
- const qint8x8_t b11 = vld1_qs8(mtx_b1 + 8);
-
- // First accumulation
- acc00_qs16 = vqmlal_qs8(acc00_qs16, b00, a0, fixed_point_position);
- acc10_qs16 = vqmlal_qs8(acc10_qs16, b00, a1, fixed_point_position);
- acc20_qs16 = vqmlal_qs8(acc20_qs16, b00, a2, fixed_point_position);
- acc30_qs16 = vqmlal_qs8(acc30_qs16, b00, a3, fixed_point_position);
- acc02_qs16 = vqmlal_qs8(acc02_qs16, b10, a0, fixed_point_position);
- acc12_qs16 = vqmlal_qs8(acc12_qs16, b10, a1, fixed_point_position);
- acc22_qs16 = vqmlal_qs8(acc22_qs16, b10, a2, fixed_point_position);
- acc32_qs16 = vqmlal_qs8(acc32_qs16, b10, a3, fixed_point_position);
-
- const qint8x8_t b02 = vld1_qs8(mtx_b0 + 16);
- const qint8x8_t b03 = vld1_qs8(mtx_b0 + 24);
- const qint8x8_t b12 = vld1_qs8(mtx_b1 + 16);
- const qint8x8_t b13 = vld1_qs8(mtx_b1 + 24);
-
- acc01_qs16 = vqmlal_qs8(acc01_qs16, b01, a0, fixed_point_position);
- acc11_qs16 = vqmlal_qs8(acc11_qs16, b01, a1, fixed_point_position);
- acc21_qs16 = vqmlal_qs8(acc21_qs16, b01, a2, fixed_point_position);
- acc31_qs16 = vqmlal_qs8(acc31_qs16, b01, a3, fixed_point_position);
- acc03_qs16 = vqmlal_qs8(acc03_qs16, b11, a0, fixed_point_position);
- acc13_qs16 = vqmlal_qs8(acc13_qs16, b11, a1, fixed_point_position);
- acc23_qs16 = vqmlal_qs8(acc23_qs16, b11, a2, fixed_point_position);
- acc33_qs16 = vqmlal_qs8(acc33_qs16, b11, a3, fixed_point_position);
-
-#if __arm__
- asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
- asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
- asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
-#endif /* __arm__ */
-
- // Second accumulation
- acc00_qs16 = vqmlal_qs8(acc00_qs16, b02, a4, fixed_point_position);
- acc10_qs16 = vqmlal_qs8(acc10_qs16, b02, a5, fixed_point_position);
- acc20_qs16 = vqmlal_qs8(acc20_qs16, b02, a6, fixed_point_position);
- acc30_qs16 = vqmlal_qs8(acc30_qs16, b02, a7, fixed_point_position);
- acc01_qs16 = vqmlal_qs8(acc01_qs16, b03, a4, fixed_point_position);
- acc11_qs16 = vqmlal_qs8(acc11_qs16, b03, a5, fixed_point_position);
- acc21_qs16 = vqmlal_qs8(acc21_qs16, b03, a6, fixed_point_position);
- acc31_qs16 = vqmlal_qs8(acc31_qs16, b03, a7, fixed_point_position);
- acc02_qs16 = vqmlal_qs8(acc02_qs16, b12, a4, fixed_point_position);
- acc12_qs16 = vqmlal_qs8(acc12_qs16, b12, a5, fixed_point_position);
- acc22_qs16 = vqmlal_qs8(acc22_qs16, b12, a6, fixed_point_position);
- acc32_qs16 = vqmlal_qs8(acc32_qs16, b12, a7, fixed_point_position);
- acc03_qs16 = vqmlal_qs8(acc03_qs16, b13, a4, fixed_point_position);
- acc13_qs16 = vqmlal_qs8(acc13_qs16, b13, a5, fixed_point_position);
- acc23_qs16 = vqmlal_qs8(acc23_qs16, b13, a6, fixed_point_position);
- acc33_qs16 = vqmlal_qs8(acc33_qs16, b13, a7, fixed_point_position);
-
- mtx_a0 += 8;
- mtx_b0 += 32;
- mtx_b1 += 32;
- }
-
- // This for loop performs the left over accumulations
- for(; k < num_elems_matrix_b_x; k += 16)
- {
- const qint8x8_t a0 = vld1_dup_qs8(mtx_a0 + 0);
- const qint8x8_t a1 = vld1_dup_qs8(mtx_a0 + 1);
- const qint8x8_t a2 = vld1_dup_qs8(mtx_a0 + 2);
- const qint8x8_t a3 = vld1_dup_qs8(mtx_a0 + 3);
-
- const qint8x8_t b00 = vld1_qs8(mtx_b0 + 0);
- const qint8x8_t b01 = vld1_qs8(mtx_b0 + 8);
- const qint8x8_t b10 = vld1_qs8(mtx_b1 + 0);
- const qint8x8_t b11 = vld1_qs8(mtx_b1 + 8);
-
- acc00_qs16 = vqmlal_qs8(acc00_qs16, b00, a0, fixed_point_position);
- acc10_qs16 = vqmlal_qs8(acc10_qs16, b00, a1, fixed_point_position);
- acc20_qs16 = vqmlal_qs8(acc20_qs16, b00, a2, fixed_point_position);
- acc30_qs16 = vqmlal_qs8(acc30_qs16, b00, a3, fixed_point_position);
- acc01_qs16 = vqmlal_qs8(acc01_qs16, b01, a0, fixed_point_position);
- acc11_qs16 = vqmlal_qs8(acc11_qs16, b01, a1, fixed_point_position);
- acc21_qs16 = vqmlal_qs8(acc21_qs16, b01, a2, fixed_point_position);
- acc31_qs16 = vqmlal_qs8(acc31_qs16, b01, a3, fixed_point_position);
- acc02_qs16 = vqmlal_qs8(acc02_qs16, b10, a0, fixed_point_position);
- acc12_qs16 = vqmlal_qs8(acc12_qs16, b10, a1, fixed_point_position);
- acc22_qs16 = vqmlal_qs8(acc22_qs16, b10, a2, fixed_point_position);
- acc32_qs16 = vqmlal_qs8(acc32_qs16, b10, a3, fixed_point_position);
- acc03_qs16 = vqmlal_qs8(acc03_qs16, b11, a0, fixed_point_position);
- acc13_qs16 = vqmlal_qs8(acc13_qs16, b11, a1, fixed_point_position);
- acc23_qs16 = vqmlal_qs8(acc23_qs16, b11, a2, fixed_point_position);
- acc33_qs16 = vqmlal_qs8(acc33_qs16, b11, a3, fixed_point_position);
-
- mtx_a0 += 4;
- mtx_b0 += 16;
- mtx_b1 += 16;
- }
-
- // Convert back to qint8x8_t and saturate
- qint8x8_t acc00_qs8 = vqmovn_qs16(acc00_qs16);
- qint8x8_t acc10_qs8 = vqmovn_qs16(acc10_qs16);
- qint8x8_t acc20_qs8 = vqmovn_qs16(acc20_qs16);
- qint8x8_t acc30_qs8 = vqmovn_qs16(acc30_qs16);
-
- qint8x8_t acc01_qs8 = vqmovn_qs16(acc01_qs16);
- qint8x8_t acc11_qs8 = vqmovn_qs16(acc11_qs16);
- qint8x8_t acc21_qs8 = vqmovn_qs16(acc21_qs16);
- qint8x8_t acc31_qs8 = vqmovn_qs16(acc31_qs16);
-
- qint8x8_t acc02_qs8 = vqmovn_qs16(acc02_qs16);
- qint8x8_t acc12_qs8 = vqmovn_qs16(acc12_qs16);
- qint8x8_t acc22_qs8 = vqmovn_qs16(acc22_qs16);
- qint8x8_t acc32_qs8 = vqmovn_qs16(acc32_qs16);
-
- qint8x8_t acc03_qs8 = vqmovn_qs16(acc03_qs16);
- qint8x8_t acc13_qs8 = vqmovn_qs16(acc13_qs16);
- qint8x8_t acc23_qs8 = vqmovn_qs16(acc23_qs16);
- qint8x8_t acc33_qs8 = vqmovn_qs16(acc33_qs16);
-
- // Multiply by the weight of the matrix product (alpha)
- if(multiply_alpha)
- {
- acc00_qs8 = vqmul_qs8(acc00_qs8, alpha_qs8, fixed_point_position);
- acc10_qs8 = vqmul_qs8(acc10_qs8, alpha_qs8, fixed_point_position);
- acc20_qs8 = vqmul_qs8(acc20_qs8, alpha_qs8, fixed_point_position);
- acc30_qs8 = vqmul_qs8(acc30_qs8, alpha_qs8, fixed_point_position);
- acc01_qs8 = vqmul_qs8(acc01_qs8, alpha_qs8, fixed_point_position);
- acc11_qs8 = vqmul_qs8(acc11_qs8, alpha_qs8, fixed_point_position);
- acc21_qs8 = vqmul_qs8(acc21_qs8, alpha_qs8, fixed_point_position);
- acc31_qs8 = vqmul_qs8(acc31_qs8, alpha_qs8, fixed_point_position);
- acc02_qs8 = vqmul_qs8(acc02_qs8, alpha_qs8, fixed_point_position);
- acc12_qs8 = vqmul_qs8(acc12_qs8, alpha_qs8, fixed_point_position);
- acc22_qs8 = vqmul_qs8(acc22_qs8, alpha_qs8, fixed_point_position);
- acc32_qs8 = vqmul_qs8(acc32_qs8, alpha_qs8, fixed_point_position);
- acc03_qs8 = vqmul_qs8(acc03_qs8, alpha_qs8, fixed_point_position);
- acc13_qs8 = vqmul_qs8(acc13_qs8, alpha_qs8, fixed_point_position);
- acc23_qs8 = vqmul_qs8(acc23_qs8, alpha_qs8, fixed_point_position);
- acc33_qs8 = vqmul_qs8(acc33_qs8, alpha_qs8, fixed_point_position);
- }
-
- const auto mtx_out0 = reinterpret_cast<qint8_t *>(out.ptr());
-
- // Store 32x4 output elements
- vst1_qs8(mtx_out0 + 0, acc00_qs8);
- vst1_qs8(mtx_out0 + 8, acc01_qs8);
- vst1_qs8(mtx_out0 + 16, acc02_qs8);
- vst1_qs8(mtx_out0 + 24, acc03_qs8);
- vst1_qs8(mtx_out0 + out_stride1 + 0, acc10_qs8);
- vst1_qs8(mtx_out0 + out_stride1 + 8, acc11_qs8);
- vst1_qs8(mtx_out0 + out_stride1 + 16, acc12_qs8);
- vst1_qs8(mtx_out0 + out_stride1 + 24, acc13_qs8);
- vst1_qs8(mtx_out0 + out_stride2 + 0, acc20_qs8);
- vst1_qs8(mtx_out0 + out_stride2 + 8, acc21_qs8);
- vst1_qs8(mtx_out0 + out_stride2 + 16, acc22_qs8);
- vst1_qs8(mtx_out0 + out_stride2 + 24, acc23_qs8);
- vst1_qs8(mtx_out0 + out_stride3 + 0, acc30_qs8);
- vst1_qs8(mtx_out0 + out_stride3 + 8, acc31_qs8);
- vst1_qs8(mtx_out0 + out_stride3 + 16, acc32_qs8);
- vst1_qs8(mtx_out0 + out_stride3 + 24, acc33_qs8);
- },
- ina, inb, out);
-}
-
-template <bool multiply_alpha>
-void matrix_matrix_multiply_qs16(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, float alpha)
-{
- const size_t in_b_stride = input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type());
- const size_t out_stride1 = output->info()->strides_in_bytes()[1] / data_size_from_type(output->info()->data_type());
- const size_t out_stride2 = out_stride1 * 2;
- const size_t out_stride3 = out_stride1 * 3;
- const int num_elems_matrix_b_x = input1->info()->dimension(0);
- const int fixed_point_position = input0->info()->fixed_point_position();
- const qint16x4_t alpha_qs16 = vdup_n_qs16(sqcvt_qs16_f32(alpha, fixed_point_position));
- ARM_COMPUTE_UNUSED(alpha_qs16);
-
- // Set step_x and step_y for matrix A. Scale by a factor of 4 the Y range as the input interleaved matrix A has 4 times less the rows of the output matrix
- Window win_a(window);
- win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
- win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, std::max(window.y().end() / 4, 1), 1));
-
- Window win_b;
- // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
- // This scenario can happen when the the matrix multiplication is used to perform a convolution operation
- if(input1->info()->num_dimensions() >= 3)
- {
- win_b = window;
- }
- // Set step_x and step_y for matrix B. Scale by a factor of 16 the X range as the input transposed matrix A has 16 times less the cols of the output matrix
- win_b.set(Window::DimX, Window::Dimension(window.x().start() / 8, window.x().end() / 8, in_b_stride));
- win_b.set(Window::DimY, Window::Dimension(0, 0, 0));
-
- Iterator ina(input0, win_a);
- Iterator inb(input1, win_b);
- Iterator out(output, window);
-
- // The implementation assumes that the matrix A and Matrix B have been reshaped respectively with NEGEMMInterleave4x4 and NEGEMMTranspose1xW
- // The reshaping of the matrices helps to have a cache friendly implementation and helps to avoid the data re-arrangements needed for computing 8x4 elements per iteration
- // All the values needed for computing a single 8x4 block will be read from consecutive memory positions
- execute_window_loop(window, [&](const Coordinates & id)
- {
- auto mtx_a0 = reinterpret_cast<const qint16_t *>(ina.ptr());
- auto mtx_b0 = reinterpret_cast<const qint16_t *>(inb.ptr());
- auto mtx_b1 = mtx_b0 + in_b_stride;
-
- qint32x4_t acc00_qs32 = vdupq_n_qs32(0);
- qint32x4_t acc10_qs32 = vdupq_n_qs32(0);
- qint32x4_t acc20_qs32 = vdupq_n_qs32(0);
- qint32x4_t acc30_qs32 = vdupq_n_qs32(0);
-
- qint32x4_t acc01_qs32 = vdupq_n_qs32(0);
- qint32x4_t acc11_qs32 = vdupq_n_qs32(0);
- qint32x4_t acc21_qs32 = vdupq_n_qs32(0);
- qint32x4_t acc31_qs32 = vdupq_n_qs32(0);
-
- // This for loop performs 1 accumulation
- for(int k = 0; k <= (num_elems_matrix_b_x - 8); k += 8)
- {
- const qint16x4_t a0 = vld1_dup_qs16(mtx_a0 + 0);
- const qint16x4_t a1 = vld1_dup_qs16(mtx_a0 + 1);
- const qint16x4_t a2 = vld1_dup_qs16(mtx_a0 + 2);
- const qint16x4_t a3 = vld1_dup_qs16(mtx_a0 + 3);
-
- const qint16x4_t b00 = vld1_qs16(mtx_b0 + 0);
- const qint16x4_t b01 = vld1_qs16(mtx_b0 + 4);
-
- acc00_qs32 = vqmlal_qs16(acc00_qs32, b00, a0, fixed_point_position);
- acc10_qs32 = vqmlal_qs16(acc10_qs32, b00, a1, fixed_point_position);
- acc20_qs32 = vqmlal_qs16(acc20_qs32, b00, a2, fixed_point_position);
- acc30_qs32 = vqmlal_qs16(acc30_qs32, b00, a3, fixed_point_position);
- acc01_qs32 = vqmlal_qs16(acc01_qs32, b01, a0, fixed_point_position);
- acc11_qs32 = vqmlal_qs16(acc11_qs32, b01, a1, fixed_point_position);
- acc21_qs32 = vqmlal_qs16(acc21_qs32, b01, a2, fixed_point_position);
- acc31_qs32 = vqmlal_qs16(acc31_qs32, b01, a3, fixed_point_position);
-
- mtx_a0 += 4;
- mtx_b0 += 8;
- mtx_b1 += 8;
- }
-
- // Convert back to qint16x4_t and saturate
- qint16x4_t acc00_qs16 = vqmovn_qs32(acc00_qs32);
- qint16x4_t acc10_qs16 = vqmovn_qs32(acc10_qs32);
- qint16x4_t acc20_qs16 = vqmovn_qs32(acc20_qs32);
- qint16x4_t acc30_qs16 = vqmovn_qs32(acc30_qs32);
-
- qint16x4_t acc01_qs16 = vqmovn_qs32(acc01_qs32);
- qint16x4_t acc11_qs16 = vqmovn_qs32(acc11_qs32);
- qint16x4_t acc21_qs16 = vqmovn_qs32(acc21_qs32);
- qint16x4_t acc31_qs16 = vqmovn_qs32(acc31_qs32);
-
- // Multiply by the weight of the matrix product (alpha)
- if(multiply_alpha)
- {
- acc00_qs16 = vqmul_qs16(acc00_qs16, alpha_qs16, fixed_point_position);
- acc10_qs16 = vqmul_qs16(acc10_qs16, alpha_qs16, fixed_point_position);
- acc20_qs16 = vqmul_qs16(acc20_qs16, alpha_qs16, fixed_point_position);
- acc30_qs16 = vqmul_qs16(acc30_qs16, alpha_qs16, fixed_point_position);
- acc01_qs16 = vqmul_qs16(acc01_qs16, alpha_qs16, fixed_point_position);
- acc11_qs16 = vqmul_qs16(acc11_qs16, alpha_qs16, fixed_point_position);
- acc21_qs16 = vqmul_qs16(acc21_qs16, alpha_qs16, fixed_point_position);
- acc31_qs16 = vqmul_qs16(acc31_qs16, alpha_qs16, fixed_point_position);
- }
-
- const auto mtx_out0 = reinterpret_cast<qint16_t *>(out.ptr());
-
- // Store 8x4 output elements
- vst1_qs16(mtx_out0 + 0, acc00_qs16);
- vst1_qs16(mtx_out0 + 4, acc01_qs16);
- vst1_qs16(mtx_out0 + out_stride1 + 0, acc10_qs16);
- vst1_qs16(mtx_out0 + out_stride1 + 4, acc11_qs16);
- vst1_qs16(mtx_out0 + out_stride2 + 0, acc20_qs16);
- vst1_qs16(mtx_out0 + out_stride2 + 4, acc21_qs16);
- vst1_qs16(mtx_out0 + out_stride3 + 0, acc30_qs16);
- vst1_qs16(mtx_out0 + out_stride3 + 4, acc31_qs16);
- },
- ina, inb, out);
-}
-
inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info)
{
ARM_COMPUTE_UNUSED(alpha);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32, DataType::QS8, DataType::QS16);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
if(!is_interleaved)
{
@@ -1428,7 +822,6 @@ inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *i
ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) != output->dimension(0));
ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != output->dimension(1));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, output);
}
}
else
@@ -1467,7 +860,6 @@ inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *i
}
ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != static_cast<size_t>(m));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, output);
}
}
@@ -1492,16 +884,6 @@ inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *inpu
num_elems_processed_per_iteration_x = 16;
break;
}
- case DataType::QS8:
- {
- num_elems_processed_per_iteration_x = 32;
- break;
- }
- case DataType::QS16:
- {
- num_elems_processed_per_iteration_x = 16;
- break;
- }
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
{
@@ -1539,16 +921,6 @@ inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *inpu
num_elems_processed_per_iteration_x = 8;
break;
}
- case DataType::QS8:
- {
- num_elems_processed_per_iteration_x = 32;
- break;
- }
- case DataType::QS16:
- {
- num_elems_processed_per_iteration_x = 8;
- break;
- }
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
{
@@ -1638,18 +1010,6 @@ void NEGEMMMatrixMultiplyKernel::run(const Window &window, const ThreadInfo &inf
vector_matrix_multiply_f32<false>(_input0, _input1, _output, window, info, _alpha);
break;
}
- case DataType::QS8:
- {
- multiply_alpha ? vector_matrix_multiply_qs8<true>(_input0, _input1, _output, window, info, _alpha) :
- vector_matrix_multiply_qs8<false>(_input0, _input1, _output, window, info, _alpha);
- break;
- }
- case DataType::QS16:
- {
- multiply_alpha ? vector_matrix_multiply_qs16<true>(_input0, _input1, _output, window, info, _alpha) :
- vector_matrix_multiply_qs16<false>(_input0, _input1, _output, window, info, _alpha);
- break;
- }
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
{
@@ -1675,18 +1035,6 @@ void NEGEMMMatrixMultiplyKernel::run(const Window &window, const ThreadInfo &inf
matrix_matrix_multiply_f32<false>(_input0, _input1, _output, window, _alpha);
break;
}
- case DataType::QS8:
- {
- multiply_alpha ? matrix_matrix_multiply_qs8<true>(_input0, _input1, _output, window, _alpha) :
- matrix_matrix_multiply_qs8<false>(_input0, _input1, _output, window, _alpha);
- break;
- }
- case DataType::QS16:
- {
- multiply_alpha ? matrix_matrix_multiply_qs16<true>(_input0, _input1, _output, window, _alpha) :
- matrix_matrix_multiply_qs16<false>(_input0, _input1, _output, window, _alpha);
- break;
- }
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
{
diff --git a/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp
index c1e975e77e..8588f43edf 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp
@@ -177,7 +177,6 @@ void NEGEMMMatrixVectorMultiplyKernel::configure(const ITensor *input0, const IT
{
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
ARM_COMPUTE_ERROR_ON(is_data_type_quantized_asymmetric(input0->info()->data_type()) && (output->info()->data_type() != DataType::S32));
ARM_COMPUTE_ERROR_ON(input0->info()->dimension(2) != input1->info()->dimension(1));
diff --git a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
index 5d6163d583..4517f46139 100644
--- a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2018 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -54,17 +54,15 @@ TensorShape get_output_shape(const ITensorInfo *input)
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::U8, DataType::S8,
- DataType::QS16, DataType::U16, DataType::S16, DataType::U32, DataType::S32,
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::U8, DataType::S8,
+ DataType::U16, DataType::S16, DataType::U32, DataType::S32,
DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
if(output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), get_output_shape(input));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
}
return Status{};
@@ -102,7 +100,7 @@ void NEGEMMTranspose1xWKernel::configure(const ITensor *input, ITensor *output)
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
// Output tensor auto inizialitation if not yet initialized
- auto_init_if_empty(*output->info(), get_output_shape(input->info()), 1, input->info()->data_type(), input->info()->fixed_point_position());
+ auto_init_if_empty(*output->info(), get_output_shape(input->info()), 1, input->info()->data_type());
// Perform validate step
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
diff --git a/src/core/NEON/kernels/NEIm2ColKernel.cpp b/src/core/NEON/kernels/NEIm2ColKernel.cpp
index 86e3fd7a84..f03bc49ed3 100644
--- a/src/core/NEON/kernels/NEIm2ColKernel.cpp
+++ b/src/core/NEON/kernels/NEIm2ColKernel.cpp
@@ -24,7 +24,6 @@
#include "arm_compute/core/NEON/kernels/NEIm2ColKernel.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/FixedPoint.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Size2D.h"
@@ -47,9 +46,8 @@ namespace
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
bool has_bias, bool is_fully_connected, bool is_flatten, const Size2D &dilation)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::QASYMM8 && has_bias);
ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
@@ -90,7 +88,6 @@ inline void linearize_volume(const uint8_t *const in_ptr,
int input_stride_x,
int input_stride_y,
int input_stride_z,
- int fixed_point_position,
int pad_value,
int dilation_x,
int dilation_y)
@@ -171,18 +168,7 @@ inline void linearize_volume(const uint8_t *const in_ptr,
// Append 1 if the convolution layer has biases
if(has_bias)
{
- if(std::is_same<T, qint8_t>::value)
- {
- *out_ptr = sqcvt_qs8_f32(1.0f, fixed_point_position);
- }
- else if(std::is_same<T, qint16_t>::value)
- {
- *out_ptr = sqcvt_qs16_f32(1.0f, fixed_point_position);
- }
- else
- {
- *out_ptr = static_cast<T>(1);
- }
+ *out_ptr = static_cast<T>(1);
}
}
} // namespace
@@ -251,7 +237,6 @@ void NEIm2ColKernel::run_generic(const Window &window)
input_stride_x,
input_stride_y,
input_stride_z,
- _input->info()->fixed_point_position(),
offset,
_dilation.x(),
_dilation.y());
@@ -294,18 +279,7 @@ void NEIm2ColKernel::run_reduced(const Window &window)
// Add bias
if(_has_bias)
{
- if(std::is_same<T, qint8_t>::value)
- {
- *(reinterpret_cast<T *>(out_ptr) + out_width - 1) = sqcvt_qs8_f32(1.0f, _input->info()->fixed_point_position());
- }
- else if(std::is_same<T, qint16_t>::value)
- {
- *(reinterpret_cast<T *>(out_ptr) + out_width - 1) = sqcvt_qs16_f32(1.0f, _input->info()->fixed_point_position());
- }
- else
- {
- *(reinterpret_cast<T *>(out_ptr) + out_width - 1) = static_cast<T>(1);
- }
+ *(reinterpret_cast<T *>(out_ptr) + out_width - 1) = static_cast<T>(1);
}
}
while(in_window.slide_window_slice_3D(in_slice) && out_window.slide_window_slice_1D(out_slice));
@@ -366,12 +340,6 @@ void NEIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size
_func = &NEIm2ColKernel::run_reduced<float16_t>;
break;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- case DataType::QS8:
- _func = &NEIm2ColKernel::run_reduced<qint8_t>;
- break;
- case DataType::QS16:
- _func = &NEIm2ColKernel::run_reduced<qint16_t>;
- break;
case DataType::QASYMM8:
_func = &NEIm2ColKernel::run_reduced<qasymm8_t>;
break;
@@ -392,12 +360,6 @@ void NEIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size
_func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_generic<float16_t, false> : &NEIm2ColKernel::run_generic<float16_t, true>;
break;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- case DataType::QS8:
- _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_generic<qint8_t, false> : &NEIm2ColKernel::run_generic<qint8_t, true>;
- break;
- case DataType::QS16:
- _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_generic<qint16_t, false> : &NEIm2ColKernel::run_generic<qint16_t, true>;
- break;
case DataType::QASYMM8:
_func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_generic<qasymm8_t, false> : &NEIm2ColKernel::run_generic<qasymm8_t, true>;
break;
diff --git a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
index 91776d8100..ed037832af 100644
--- a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
@@ -103,7 +103,7 @@ std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITe
Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
// Output auto initialization if not yet initialized
- auto_init_if_empty(*output, input->tensor_shape(), 1, input->data_type(), input->fixed_point_position());
+ auto_init_if_empty(*output, input->tensor_shape(), 1, input->data_type());
AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
AccessWindowHorizontal sum_access(sum, 0, num_elems_processed_per_iteration_sum);
diff --git a/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp b/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp
index 434f4eb3e9..d93dc09ff9 100644
--- a/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp
@@ -68,7 +68,7 @@ std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITe
TensorShape output_shape = compute_min_max_shape(input);
// Output auto initialization if not yet initialized
- auto_init_if_empty(*output, output_shape, 1, input->data_type(), input->fixed_point_position());
+ auto_init_if_empty(*output, output_shape, 1, input->data_type());
constexpr unsigned int num_elems_processed_per_iteration = 1;
diff --git a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
index 776cb27d7a..253a93f196 100644
--- a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -39,26 +39,17 @@ namespace
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *input_squared, const ITensorInfo *output, const NormalizationLayerInfo &norm_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_squared, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, input_squared);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, input_squared);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd");
- if(is_data_type_fixed_point(input->data_type()))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, input_squared);
- ARM_COMPUTE_RETURN_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.beta(), input);
- ARM_COMPUTE_RETURN_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.kappa(), input);
- ARM_COMPUTE_RETURN_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.scale_coeff(), input);
- }
-
// Checks performed when output is configured
if(output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
}
return Status{};
@@ -162,44 +153,6 @@ void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor *
}
break;
}
- case DataType::QS8:
- {
- switch(norm_info.type())
- {
- case NormType::IN_MAP_1D:
- _func = &NENormalizationLayerKernel::normalize_fixed_point<DataType::QS8, 0, false>;
- break;
- case NormType::IN_MAP_2D:
- // Normalize over X and Y
- _func = &NENormalizationLayerKernel::normalize_fixed_point<DataType::QS8, 0, true>;
- break;
- case NormType::CROSS_MAP:
- _func = &NENormalizationLayerKernel::normalize_fixed_point<DataType::QS8, 2, false>;
- break;
- default:
- break;
- }
- break;
- }
- case DataType::QS16:
- {
- switch(norm_info.type())
- {
- case NormType::IN_MAP_1D:
- _func = &NENormalizationLayerKernel::normalize_fixed_point<DataType::QS16, 0, false>;
- break;
- case NormType::IN_MAP_2D:
- // Normalize over X and Y
- _func = &NENormalizationLayerKernel::normalize_fixed_point<DataType::QS16, 0, true>;
- break;
- case NormType::CROSS_MAP:
- _func = &NENormalizationLayerKernel::normalize_fixed_point<DataType::QS16, 2, false>;
- break;
- default:
- break;
- }
- break;
- }
default:
ARM_COMPUTE_ERROR("NOT SUPPORTED!");
}
@@ -306,105 +259,6 @@ void NENormalizationLayerKernel::normalize_float(const Window &window)
}
}
-template <DataType dt, unsigned int dim, bool do_2D_norm>
-void NENormalizationLayerKernel::normalize_fixed_point(const Window &window)
-{
- Iterator input(_input, window);
- Iterator input_squared(_input_squared, window);
- Iterator output(_output, window);
-
- const int dim_y = 1;
- const int radius = _norm_info.norm_size() / 2;
- const int total_size = _input->info()->dimension(dim) - 1;
- const int input_squared_stride = _input_squared->info()->strides_in_bytes()[dim];
- // We account padding across X only and we iterate over rows
- const int min_left = (dim == 2) ? 0 : -static_cast<int>(border_size().left);
- const int max_right = (dim == 2) ? total_size : total_size + border_size().left;
- const int min_top = 0;
- const int max_bottom = _input->info()->dimension(dim_y) - 1;
-
- const int fixed_point_position = _input->info()->fixed_point_position();
-
- if(dt == DataType::QS8)
- {
- const qint8x16_t coeff_vec = vdupq_n_qs8_f32(_norm_info.scale_coeff(), fixed_point_position);
- const qint8x16_t beta_vec = vdupq_n_qs8_f32(_norm_info.beta(), fixed_point_position);
- const qint8x16_t kappa_vec = vdupq_n_qs8_f32(_norm_info.kappa(), fixed_point_position);
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- // Get range to normalize
- const int current_row = do_2D_norm ? id[dim_y] : 0;
- const int current_slice = id[dim];
- const int first_row = do_2D_norm ? std::max(current_row - radius, min_top) : 0;
- const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
- const int first_slice = std::max(current_slice - radius, min_left);
- const int last_slice = std::min(current_slice + radius, max_right);
-
- // Accumulate 2D In-Map values
- qint8x16_t accu = vdupq_n_qs8(0);
- for(int j = first_row; j <= last_row; ++j)
- {
- // Compute row displacement
- const int row = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y];
- const uint8_t *const input_squared_ptr = input_squared.ptr() + row - (current_slice * input_squared_stride);
- for(int i = first_slice; i <= last_slice; ++i)
- {
- accu = vqaddq_qs8(accu, vld1q_qs8(reinterpret_cast<const qint8_t *>(input_squared_ptr + i * input_squared_stride)));
- }
- }
-
- // Normalize
- const qint8x16_t accu_scale = vqmlaq_qs8(kappa_vec, coeff_vec, accu, fixed_point_position);
- const qint8x16_t normalized = vqpowq_qs8(accu_scale, beta_vec, fixed_point_position);
- const qint8x16_t normalized_pixel = vdivq_qs8(vld1q_qs8(reinterpret_cast<const qint8_t *>(input.ptr())), normalized, fixed_point_position);
- vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), normalized_pixel);
- },
- input, input_squared, output);
- }
- else if(dt == DataType::QS16)
- {
- const qint16x8_t coeff_vec = vdupq_n_qs16_f32(_norm_info.scale_coeff(), fixed_point_position);
- const qint16x8_t beta_vec = vdupq_n_qs16_f32(_norm_info.beta(), fixed_point_position);
- const qint16x8_t kappa_vec = vdupq_n_qs16_f32(_norm_info.kappa(), fixed_point_position);
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- // Get range to normalize
- const int current_row = do_2D_norm ? id[dim_y] : 0;
- const int current_slice = id[dim];
- const int first_row = do_2D_norm ? std::max(current_row - radius, min_top) : 0;
- const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
- const int first_slice = std::max(current_slice - radius, min_left);
- const int last_slice = std::min(current_slice + radius, max_right);
-
- // Accumulate 2D In-Map values
- qint16x8_t accu = vdupq_n_qs16(0);
- for(int j = first_row; j <= last_row; ++j)
- {
- // Compute row displacement
- const int row = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y];
- const uint8_t *const input_squared_ptr = input_squared.ptr() + row - (current_slice * input_squared_stride);
- for(int i = first_slice; i <= last_slice; ++i)
- {
- accu = vqaddq_qs16(accu, vld1q_qs16(reinterpret_cast<const qint16_t *>(input_squared_ptr + i * input_squared_stride)));
- }
- }
-
- // Normalize
- const qint16x8_t accu_scale = vqmlaq_qs16(kappa_vec, coeff_vec, accu, fixed_point_position);
- const qint16x8_t normalized = vqpowq_qs16(accu_scale, beta_vec, fixed_point_position);
- const qint16x8_t normalized_pixel = vdivq_qs16(vld1q_qs16(reinterpret_cast<const qint16_t *>(input.ptr())), normalized, fixed_point_position);
- vst1q_qs16(reinterpret_cast<qint16_t *>(output.ptr()), normalized_pixel);
- },
- input, input_squared, output);
- }
- else
- {
- ARM_COMPUTE_ERROR("Not supported");
- }
-}
-
Status NENormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *input_squared, const ITensorInfo *output, const NormalizationLayerInfo norm_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, input_squared, output, norm_info));
diff --git a/src/core/NEON/kernels/NEPermuteKernel.cpp b/src/core/NEON/kernels/NEPermuteKernel.cpp
index ae1d48cc69..e9bc8effc6 100644
--- a/src/core/NEON/kernels/NEPermuteKernel.cpp
+++ b/src/core/NEON/kernels/NEPermuteKernel.cpp
@@ -45,8 +45,8 @@ namespace
{
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PermutationVector &perm)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8,
- DataType::U16, DataType::S16, DataType::QS16,
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8,
+ DataType::U16, DataType::S16,
DataType::U32, DataType::S32,
DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MSG((perm.num_dimensions() == 3 && !(perm[0] == 2 && perm[1] == 0 && perm[2] == 1) && !(perm[0] == 1 && perm[1] == 2 && perm[2] == 0)),
@@ -59,7 +59,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
}
return Status{};
diff --git a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
index 193ca3799c..0ec7e823a1 100644
--- a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
+++ b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
@@ -61,9 +61,9 @@ inline Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *i
ARM_COMPUTE_UNUSED(overflow_policy);
ARM_COMPUTE_UNUSED(rounding_policy);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::U8 && (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8),
"Output can only be U8 if both inputs are U8");
@@ -71,14 +71,6 @@ inline Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *i
ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), "Wrong shape for output");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
- if(is_data_type_fixed_point(input1->data_type()) || is_data_type_fixed_point(input2->data_type()) || is_data_type_fixed_point(output->data_type()))
- {
- // Check that all data types are the same and all fixed-point positions are the same
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input1, input2, output);
- // Check if scale is representable in fixed-point with the provided settings
- ARM_COMPUTE_RETURN_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(scale, input1);
- }
-
if(std::abs(scale - scale255_constant) < 0.00001f)
{
ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_NEAREST_UP && rounding_policy != RoundingPolicy::TO_NEAREST_EVEN);
@@ -120,11 +112,6 @@ inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *inpu
{
set_format_if_unknown(*output, Format::F16);
}
- else if(input1->data_type() == DataType::QS8 && input2->data_type() == DataType::QS8)
- {
- set_data_type_if_unknown(*output, DataType::QS8);
- set_fixed_point_position_if_zero(*output, input1->fixed_point_position());
- }
}
// Configure kernel window
@@ -220,105 +207,6 @@ void mul_U8_U8_U8_n(const void *__restrict input1_ptr, const void *__restrict in
}
template <bool is_scale255, bool is_sat>
-void mul_QS8_QS8_QS8_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int n, int fixed_point_position)
-{
- const auto output = static_cast<qint8_t *__restrict>(output_ptr);
-
- const qint8x16_t ta1 = vld1q_qs8(static_cast<const qint8_t *__restrict>(input1_ptr));
- const qint8x16_t ta2 = vld1q_qs8(static_cast<const qint8_t *__restrict>(input2_ptr));
-
- if(is_scale255)
- {
- qint16x8_t tmp1_high = vmovl_s8(vget_high_s8(ta1));
- qint16x8_t tmp1_low = vmovl_s8(vget_low_s8(ta1));
- const qint16x8_t tmp2_high = vmovl_s8(vget_high_s8(ta2));
- const qint16x8_t tmp2_low = vmovl_s8(vget_low_s8(ta2));
-
- const float32x4x2_t scale255_f32 =
- {
- {
- scale255_constant_f32q,
- scale255_constant_f32q
- }
- };
- const qint16x8_t scale255 = vqcvtq_qs16_f32(scale255_f32, fixed_point_position);
-
- tmp1_high = vmulq_qs16(tmp1_high, tmp2_high, fixed_point_position);
- tmp1_low = vmulq_qs16(tmp1_low, tmp2_low, fixed_point_position);
- tmp1_high = vmulq_qs16(tmp1_high, scale255, fixed_point_position);
- tmp1_low = vmulq_qs16(tmp1_low, scale255, fixed_point_position);
-
- if(is_sat)
- {
- vst1q_qs8(output, vcombine_s8(vqmovn_s16(tmp1_low), vqmovn_s16(tmp1_high)));
- }
- else
- {
- vst1q_qs8(output, vcombine_s8(vmovn_s16(tmp1_low), vmovn_s16(tmp1_high)));
- }
- }
- else
- {
- const qint8x16_t vn = vdupq_n_s8(-n);
- qint8x16_t res = ta2;
-
- if(is_sat)
- {
- res = vqshlq_s8(vqmulq_qs8(ta1, res, fixed_point_position), vn);
- }
- else
- {
- res = vshlq_s8(vmulq_qs8(ta1, res, fixed_point_position), vn);
- }
- vst1q_qs8(output, res);
- }
-}
-
-template <bool is_scale255, bool is_sat>
-void mul_QS16_QS16_QS16_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int n, int fixed_point_position)
-{
- const qint16x8x2_t ta1 = vld2q_qs16(static_cast<const qint16_t *__restrict>(input1_ptr));
- qint16x8x2_t res = vld2q_qs16(static_cast<const qint16_t *__restrict>(input2_ptr));
-
- if(is_scale255)
- {
- const float32x4x2_t scale255_f32 =
- {
- {
- scale255_constant_f32q,
- scale255_constant_f32q
- }
- };
- const qint16x8_t scale255 = vqcvtq_qs16_f32(scale255_f32, fixed_point_position);
- if(is_sat)
- {
- res.val[0] = vqmulq_qs16(vqmulq_qs16(ta1.val[0], res.val[0], fixed_point_position), scale255, fixed_point_position);
- res.val[1] = vqmulq_qs16(vqmulq_qs16(ta1.val[1], res.val[1], fixed_point_position), scale255, fixed_point_position);
- }
- else
- {
- res.val[0] = vmulq_qs16(vmulq_qs16(ta1.val[0], res.val[0], fixed_point_position), scale255, fixed_point_position);
- res.val[1] = vmulq_qs16(vmulq_qs16(ta1.val[1], res.val[1], fixed_point_position), scale255, fixed_point_position);
- }
- }
- else
- {
- const qint16x8_t vn = vdupq_n_s16(-n);
- if(is_sat)
- {
- res.val[0] = vqshlq_s16(vqmulq_qs16(ta1.val[0], res.val[0], fixed_point_position), vn);
- res.val[1] = vqshlq_s16(vqmulq_qs16(ta1.val[1], res.val[1], fixed_point_position), vn);
- }
- else
- {
- res.val[0] = vshlq_s16(vmulq_qs16(ta1.val[0], res.val[0], fixed_point_position), vn);
- res.val[1] = vshlq_s16(vmulq_qs16(ta1.val[1], res.val[1], fixed_point_position), vn);
- }
- }
- vst2q_s16(static_cast<qint16_t *__restrict>(output_ptr), res);
-}
-
-template <bool is_scale255, bool is_sat>
inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &input1, const int16x8_t &input2, int n)
{
int32x4_t tmp1_high = vmovl_s16(vget_high_s16(input1));
@@ -529,7 +417,7 @@ void mul_U8_S16_S16_n(const void *__restrict input1_ptr, const void *__restrict
} // namespace
NEPixelWiseMultiplicationKernel::NEPixelWiseMultiplicationKernel()
- : _func_float(nullptr), _func_int(nullptr), _func_q_int(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _scale{ 0 }, _scale_exponent{ 0 }
+ : _func_float(nullptr), _func_int(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _scale{ 0 }, _scale_exponent{ 0 }
{
}
@@ -550,7 +438,6 @@ void NEPixelWiseMultiplicationKernel::configure(const ITensor *input1, const ITe
_scale = scale;
_scale_exponent = 0;
_func_int = nullptr;
- _func_q_int = nullptr;
_func_float = nullptr;
bool is_scale_255 = false;
@@ -630,28 +517,6 @@ void NEPixelWiseMultiplicationKernel::configure(const ITensor *input1, const ITe
_func_int = is_sat ? &mul_U8_U8_S16_n<false, true> : &mul_U8_U8_S16_n<false, false>;
}
}
- else if(DataType::QS8 == dt_input1 && DataType::QS8 == dt_input2 && DataType::QS8 == dt_output)
- {
- if(is_scale_255)
- {
- _func_q_int = is_sat ? &mul_QS8_QS8_QS8_n<true, true> : &mul_QS8_QS8_QS8_n<true, false>;
- }
- else
- {
- _func_q_int = is_sat ? &mul_QS8_QS8_QS8_n<false, true> : &mul_QS8_QS8_QS8_n<false, false>;
- }
- }
- else if(DataType::QS16 == dt_input1 && DataType::QS16 == dt_input2 && DataType::QS16 == dt_output)
- {
- if(is_scale_255)
- {
- _func_q_int = is_sat ? &mul_QS16_QS16_QS16_n<true, true> : &mul_QS16_QS16_QS16_n<true, false>;
- }
- else
- {
- _func_q_int = is_sat ? &mul_QS16_QS16_QS16_n<false, true> : &mul_QS16_QS16_QS16_n<false, false>;
- }
- }
else if(DataType::F16 == dt_input1 && DataType::F16 == dt_input2 && DataType::F16 == dt_output)
{
_func_float = &mul_F16_F16_F16_n<false, false>;
@@ -724,17 +589,6 @@ void NEPixelWiseMultiplicationKernel::run(const Window &window, const ThreadInfo
},
input1, input2, output);
}
- else if(_func_q_int != nullptr)
- {
- int fixed_point_position = _input1->info()->fixed_point_position();
- execute_window_loop(collapsed, [&](const Coordinates & id)
- {
- (*_func_q_int)(input1.ptr(), input2.ptr(), output.ptr(), _scale_exponent, fixed_point_position);
- collapsed.slide_window_slice_3D(slice_input1);
- collapsed.slide_window_slice_3D(slice_input2);
- },
- input1, input2, output);
- }
else
{
ARM_COMPUTE_ERROR_ON(_func_float == nullptr);
diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
index 7877cf5cc0..e586b72d30 100644
--- a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
@@ -25,7 +25,6 @@
#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/FixedPoint.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/NEON/NEAsymm.h"
@@ -79,32 +78,6 @@ inline float calculate_avg_scale(const Coordinates &id, const int pool_size_x, c
return 1.f / ((end_y - start_y) * (end_x - start_x));
}
-inline qint8_t calculate_avg_scale_q8(const Coordinates &id, int pool_size, int upper_bound_w, int upper_bound_h,
- int pad_x, int pad_y, int stride_x, int stride_y, int fixed_point_position)
-{
- static const std::array<qint8_t, 10> scale_values_q8 =
- { { 0x0, 0x0, 0x40, 0x2A, 0x20, 0x19, 0x15, 0x12, 0x10, 0xE } };
- const int start_x = id.x() * stride_x - pad_x;
- const int start_y = id.y() * stride_y - pad_y;
- const int end_x = std::min(start_x + pool_size, upper_bound_w);
- const int end_y = std::min(start_y + pool_size, upper_bound_h);
- const int val = ((end_y - start_y) * (end_x - start_x));
- return sshr_qs8(scale_values_q8[val], (7 - fixed_point_position));
-}
-
-inline qint16_t calculate_avg_scale_q16(const Coordinates &id, int pool_size, int upper_bound_w, int upper_bound_h,
- int pad_x, int pad_y, int stride_x, int stride_y, int fixed_point_position)
-{
- static std::array<qint16_t, 10> scale_values_q16 =
- { { 0x0, 0x0, 0x4000, 0x2AAB, 0x2000, 0x199A, 0x1555, 0x1249, 0x1000, 0xE38 } };
- const int start_x = id.x() * stride_x - pad_x;
- const int start_y = id.y() * stride_y - pad_y;
- const int end_x = std::min(start_x + pool_size, upper_bound_w);
- const int end_y = std::min(start_y + pool_size, upper_bound_h);
- const int val = ((end_y - start_y) * (end_x - start_x));
- return sshr_qs16(scale_values_q16[val], (15 - fixed_point_position));
-}
-
template <bool exclude_padding>
inline void scale_vector_s16x8(uint16x8_t &v, const Coordinates &id, int id_offset, int step,
const int pool_size, const int upper_bound_w, const int upper_bound_h,
@@ -163,22 +136,18 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
int pool_stride_y = 0;
PoolingType pool_type = pool_info.pool_type();
const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
- const bool exclude_padding = pool_info.exclude_padding();
std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
static const std::set<int> supported_pool_sizes = { 2, 3 };
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_quantized(input->data_type()));
ARM_COMPUTE_RETURN_ERROR_ON((supported_pool_sizes.find(pool_size_x) == supported_pool_sizes.end()) && ((input->data_type() != DataType::F32) && (input->data_type() != DataType::QASYMM8))
&& (pool_type != PoolingType::MAX));
- ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_fixed_point(input->data_type()) && pool_stride_x > 2);
- ARM_COMPUTE_RETURN_ERROR_ON(exclude_padding && is_data_type_fixed_point(input->data_type()));
if(output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH)) != pooled_w)
|| (output->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT)) != pooled_h));
@@ -236,22 +205,6 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
{
switch(input->data_type())
{
- case DataType::QS8:
- num_elems_read_per_iteration = 16;
- switch(pool_size_x)
- {
- case 2:
- num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16;
- num_elems_processed_per_iteration = (pool_stride_x == 2) ? 8 : 15;
- break;
- case 3:
- num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16;
- num_elems_processed_per_iteration = (pool_stride_x == 2) ? 7 : 14;
- break;
- default:
- break;
- }
- break;
case DataType::QASYMM8:
if(is_nhwc)
{
@@ -274,22 +227,6 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
break;
}
break;
- case DataType::QS16:
- num_elems_read_per_iteration = 8;
- switch(pool_size_x)
- {
- case 2:
- num_elems_horizontal_window = (pool_stride_x == 2) ? 4 : 8;
- num_elems_processed_per_iteration = (pool_stride_x == 2) ? 4 : 7;
- break;
- case 3:
- num_elems_horizontal_window = (pool_stride_x == 2) ? 4 : 8;
- num_elems_processed_per_iteration = (pool_stride_x == 2) ? 3 : 6;
- break;
- default:
- break;
- }
- break;
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
if(is_nhwc)
@@ -462,64 +399,7 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons
const DataType data_type = input->info()->data_type();
const bool is_nchw = data_layout == DataLayout::NCHW;
- // Select appropriate function
- if(data_type == DataType::QS8)
- {
- if(_is_square)
- {
- switch(pool_size_x)
- {
- case 2:
- switch(pool_type)
- {
- case PoolingType::AVG:
- _func = &NEPoolingLayerKernel::pooling2_q8_nchw<PoolingType::AVG>;
- break;
- case PoolingType::MAX:
- _func = &NEPoolingLayerKernel::pooling2_q8_nchw<PoolingType::MAX>;
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported pooling type!");
- }
- break;
- case 3:
- switch(pool_type)
- {
- case PoolingType::AVG:
- _func = &NEPoolingLayerKernel::pooling3_q8_nchw<PoolingType::AVG>;
- break;
- case PoolingType::MAX:
- _func = &NEPoolingLayerKernel::pooling3_q8_nchw<PoolingType::MAX>;
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported pooling type!");
- }
- break;
- default:
- switch(pool_type)
- {
- case PoolingType::MAX:
- _func = &NEPoolingLayerKernel::poolingMxN_q8_nchw<PoolingType::MAX>;
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported pooling type!");
- }
- break;
- }
- }
- else
- {
- switch(pool_type)
- {
- case PoolingType::MAX:
- _func = &NEPoolingLayerKernel::poolingMxN_q8_nchw<PoolingType::MAX>;
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported pooling type!");
- }
- }
- }
- else if(data_type == DataType::QASYMM8)
+ if(data_type == DataType::QASYMM8)
{
if(pool_size_x == 2 && pool_stride_x < 3 && _is_square)
{
@@ -606,62 +486,6 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons
}
}
}
- else if(data_type == DataType::QS16)
- {
- if(_is_square)
- {
- switch(pool_size_x)
- {
- case 2:
- switch(pool_type)
- {
- case PoolingType::AVG:
- _func = &NEPoolingLayerKernel::pooling2_q16_nchw<PoolingType::AVG>;
- break;
- case PoolingType::MAX:
- _func = &NEPoolingLayerKernel::pooling2_q16_nchw<PoolingType::MAX>;
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported pooling type!");
- }
- break;
- case 3:
- switch(pool_type)
- {
- case PoolingType::AVG:
- _func = &NEPoolingLayerKernel::pooling3_q16_nchw<PoolingType::AVG>;
- break;
- case PoolingType::MAX:
- _func = &NEPoolingLayerKernel::pooling3_q16_nchw<PoolingType::MAX>;
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported pooling type!");
- }
- break;
- default:
- switch(pool_type)
- {
- case PoolingType::MAX:
- _func = &NEPoolingLayerKernel::poolingMxN_q16_nchw<PoolingType::MAX>;
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported pooling type!");
- }
- break;
- }
- }
- else
- {
- switch(pool_type)
- {
- case PoolingType::MAX:
- _func = &NEPoolingLayerKernel::poolingMxN_q16_nchw<PoolingType::MAX>;
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported pooling type!");
- }
- }
- }
else if(data_type == DataType::F16)
{
if(_is_square)
@@ -1022,71 +846,6 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons
INEKernel::configure(win_config.second);
}
-template <PoolingType pooling_type>
-void NEPoolingLayerKernel::pooling2_q8_nchw(const Window &window_input, const Window &window)
-{
- Iterator input(_input, window_input);
- Iterator output(_output, window);
-
- const int fixed_point_position = _input->info()->fixed_point_position();
- constexpr int pool_size = 2;
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- const int pool_pad_right = _pool_info.pad_stride_info().pad_right();
- const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
- const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
- const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
- std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
- const int upper_bound_w = _input->info()->dimension(0) + pool_pad_right;
- const int upper_bound_h = _input->info()->dimension(1) + pool_pad_bottom;
-
- const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
- const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto top_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_top_ptr + input.offset()));
- const auto bottom_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_bottom_ptr + input.offset()));
- qint8x8_t lower_res = {};
- qint8x8_t upper_res = {};
- if(pooling_type == PoolingType::AVG)
- {
- // Calculate scale
- const qint8_t scale = calculate_avg_scale_q8(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y, fixed_point_position);
- const qint8x8_t scale_vec = vdup_n_qs8(scale);
-
- // Perform pooling
- const qint8x16_t sum_data = vqaddq_qs8(top_data, bottom_data);
- lower_res = vqmul_qs8(vpadd_s8(vget_low_s8(sum_data), vget_high_s8(sum_data)), scale_vec, fixed_point_position);
- if(pool_stride_x == 1)
- {
- const qint8x16_t sum_data_shifted = vextq_s8(sum_data, sum_data, 1);
- upper_res = vqmul_qs8(vpadd_s8(vget_low_s8(sum_data_shifted), vget_high_s8(sum_data_shifted)), scale_vec, fixed_point_position);
- }
- }
- else
- {
- const qint8x16_t max_data = vmaxq_s8(top_data, bottom_data);
- lower_res = vpmax_s8(vget_low_s8(max_data), vget_high_s8(max_data));
- if(pool_stride_x == 1)
- {
- const qint8x16_t max_data_shifted = vextq_s8(max_data, max_data, 1);
- upper_res = vpmax_s8(vget_low_s8(max_data_shifted), vget_high_s8(max_data_shifted));
- }
- }
- if(pool_stride_x == 1)
- {
- const qint8x8x2_t res = { { lower_res, upper_res } };
- vst2_s8(reinterpret_cast<qint8_t *>(output.ptr()), res);
- }
- else
- {
- vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), lower_res);
- }
- },
- input, output);
-}
-
template <PoolingType pooling_type, bool exclude_padding>
void NEPoolingLayerKernel::pooling2_qasymm8_nchw(const Window &window_input, const Window &window)
{
@@ -1201,71 +960,6 @@ void NEPoolingLayerKernel::pooling2_qasymm8_nchw(const Window &window_input, con
input, output);
}
-template <PoolingType pooling_type>
-void NEPoolingLayerKernel::pooling2_q16_nchw(const Window &window_input, const Window &window)
-{
- Iterator input(_input, window_input);
- Iterator output(_output, window);
-
- const int fixed_point_position = _input->info()->fixed_point_position();
- constexpr int pool_size = 2;
- const int pool_pad_right = _pool_info.pad_stride_info().pad_right();
- const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
- const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
- const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
- const int upper_bound_w = _input->info()->dimension(0) + pool_pad_right;
- const int upper_bound_h = _input->info()->dimension(1) + pool_pad_bottom;
-
- const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
- const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto top_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_top_ptr + input.offset()));
- const auto bottom_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_bottom_ptr + input.offset()));
- qint16x4_t lower_res = {};
- qint16x4_t upper_res = {};
- if(pooling_type == PoolingType::AVG)
- {
- // Calculate scale
- const qint16_t scale = calculate_avg_scale_q16(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y, fixed_point_position);
- const qint16x4_t scale_vec = vdup_n_qs16(scale);
-
- // Perform pooling
- const qint16x8_t sum_data = vqaddq_qs16(top_data, bottom_data);
- lower_res = vqmul_qs16(vpadd_s16(vget_low_s16(sum_data), vget_high_s16(sum_data)), scale_vec, fixed_point_position);
- if(pool_stride_x == 1)
- {
- const qint16x8_t sum_data_shifted = vextq_s16(sum_data, sum_data, 1);
- upper_res = vqmul_qs16(vpadd_s16(vget_low_s16(sum_data_shifted), vget_high_s16(sum_data_shifted)), scale_vec, fixed_point_position);
- }
- }
- else
- {
- const qint16x8_t max_data = vmaxq_s16(top_data, bottom_data);
- lower_res = vpmax_s16(vget_low_s16(max_data), vget_high_s16(max_data));
- if(pool_stride_x == 1)
- {
- const qint16x8_t max_data_shifted = vextq_s16(max_data, max_data, 1);
- upper_res = vpmax_s16(vget_low_s16(max_data_shifted), vget_high_s16(max_data_shifted));
- }
- }
- if(pool_stride_x == 1)
- {
- const qint16x4x2_t res = { { lower_res, upper_res } };
- vst2_s16(reinterpret_cast<qint16_t *>(output.ptr()), res);
- }
- else
- {
- vst1_qs16(reinterpret_cast<qint16_t *>(output.ptr()), lower_res);
- }
- },
- input, output);
-}
-
template <PoolingType pooling_type, bool exclude_padding>
void NEPoolingLayerKernel::pooling3_f16_nchw(const Window &window_input, const Window &window)
{
@@ -1461,82 +1155,6 @@ void NEPoolingLayerKernel::pooling2_f32_nchw(const Window &window_input, const W
input, output);
}
-template <PoolingType pooling_type>
-void NEPoolingLayerKernel::pooling3_q8_nchw(const Window &window_input, const Window &window)
-{
- Iterator input(_input, window_input);
- Iterator output(_output, window);
-
- const int fixed_point_position = _input->info()->fixed_point_position();
- constexpr int pool_size = 3;
- const int pool_pad_right = _pool_info.pad_stride_info().pad_right();
- const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
- const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
- const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
- const int upper_bound_w = _input->info()->dimension(0) + pool_pad_right;
- const int upper_bound_h = _input->info()->dimension(1) + pool_pad_bottom;
-
- const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
- const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
- const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto top_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_top_ptr + input.offset()));
- const auto middle_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_middle_ptr + input.offset()));
- const auto bottom_data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input_bottom_ptr + input.offset()));
- qint8x8_t res = {};
- if(pooling_type == PoolingType::AVG)
- {
- // Calculate scale
- const qint8_t scale = calculate_avg_scale_q8(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y, fixed_point_position);
-
- // Perform pooling for stride 2
- const qint8x16_t sum_data = vqaddq_qs8(vqaddq_qs8(top_data, bottom_data), middle_data);
- const qint8x16_t sum_data2 = vextq_s8(sum_data, sum_data, 1);
- const qint8x16_t sum_data3 = vextq_s8(sum_data, sum_data, 2);
- const qint8x16_t final_sum = vqaddq_qs8(vqaddq_qs8(sum_data, sum_data2), sum_data3);
- if(pool_stride_x == 2)
- {
- const qint8x8x2_t table = { { vget_low_s8(final_sum), vget_high_s8(final_sum) } };
- static const qint8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
- const qint8x8_t scale_vec = vdup_n_qs8(scale);
- res = vtbl2_s8(table, lookup_val);
- res = vqmul_qs8(res, scale_vec, fixed_point_position);
- vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), res);
- }
- else
- {
- const qint8x16_t scale_vec = vdupq_n_qs8(scale);
- vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), vqmulq_qs8(final_sum, scale_vec, fixed_point_position));
- }
- }
- else
- {
- const qint8x16_t max_data = vmaxq_s8(vmaxq_s8(top_data, bottom_data), middle_data);
- const qint8x16_t max_data2 = vextq_s8(max_data, max_data, 1);
- const qint8x16_t max_data3 = vextq_s8(max_data, max_data, 2);
- const qint8x16_t final_max = vmaxq_s8(vmaxq_s8(max_data, max_data2), max_data3);
-
- if(pool_stride_x == 2)
- {
- const qint8x8x2_t table = { { vget_low_s8(final_max), vget_high_s8(final_max) } };
- static const qint8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
- res = vtbl2_s8(table, lookup_val);
- vst1_qs8(reinterpret_cast<qint8_t *>(output.ptr()), res);
- }
- else
- {
- vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), final_max);
- }
- }
- },
- input, output);
-}
-
template <PoolingType pooling_type, bool exclude_padding>
void NEPoolingLayerKernel::pooling3_qasymm8_nchw(const Window &window_input, const Window &window)
{
@@ -1657,77 +1275,6 @@ void NEPoolingLayerKernel::pooling3_qasymm8_nchw(const Window &window_input, con
input, output);
}
-template <PoolingType pooling_type>
-void NEPoolingLayerKernel::pooling3_q16_nchw(const Window &window_input, const Window &window)
-{
- Iterator input(_input, window_input);
- Iterator output(_output, window);
-
- const int fixed_point_position = _input->info()->fixed_point_position();
- constexpr int pool_size = 3;
- const int pool_pad_right = _pool_info.pad_stride_info().pad_right();
- const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
- const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
- const int pool_pad_bottom = _pool_info.pad_stride_info().pad_bottom();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
- const int upper_bound_w = _input->info()->dimension(0) + pool_pad_right;
- const int upper_bound_h = _input->info()->dimension(1) + pool_pad_bottom;
-
- const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
- const unsigned char *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
- const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto top_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_top_ptr + input.offset()));
- const auto middle_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_middle_ptr + input.offset()));
- const auto bottom_data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input_bottom_ptr + input.offset()));
-
- if(pooling_type == PoolingType::AVG)
- {
- // Calculate scale
- const qint16_t scale = calculate_avg_scale_q16(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y, fixed_point_position);
-
- // Perform pooling for stride 2
- const qint16x8_t sum_data = vqaddq_qs16(vqaddq_qs16(top_data, bottom_data), middle_data);
- const qint16x8_t sum_data2 = vextq_s16(sum_data, sum_data, 1);
- const qint16x8_t sum_data3 = vextq_s16(sum_data, sum_data, 2);
- const qint16x8_t final_sum = vqaddq_qs16(vqaddq_qs16(sum_data, sum_data2), sum_data3);
- if(pool_stride_x == 2)
- {
- const qint16x4_t tmp = { vgetq_lane_s16(final_sum, 0), vgetq_lane_s16(final_sum, 2), vgetq_lane_s16(final_sum, 4), vgetq_lane_s16(final_sum, 6) };
- const qint16x4_t scale_vec = vdup_n_qs16(scale);
- vst1_qs16(reinterpret_cast<qint16_t *>(output.ptr()), vqmul_qs16(tmp, scale_vec, fixed_point_position));
- }
- else
- {
- const qint16x8_t scale_vec = vdupq_n_qs16(scale);
- vst1q_qs16(reinterpret_cast<qint16_t *>(output.ptr()), vqmulq_qs16(final_sum, scale_vec, fixed_point_position));
- }
- }
- else
- {
- const qint16x8_t max_data = vmaxq_s16(vmaxq_s16(top_data, bottom_data), middle_data);
- const qint16x8_t max_data2 = vextq_s16(max_data, max_data, 1);
- const qint16x8_t max_data3 = vextq_s16(max_data, max_data, 2);
- const qint16x8_t final_max = vmaxq_s16(vmaxq_s16(max_data, max_data2), max_data3);
-
- if(pool_stride_x == 2)
- {
- const qint16x4_t tmp = { vgetq_lane_s16(final_max, 0), vgetq_lane_s16(final_max, 2), vgetq_lane_s16(final_max, 4), vgetq_lane_s16(final_max, 6) };
- vst1_qs16(reinterpret_cast<qint16_t *>(output.ptr()), tmp);
- }
- else
- {
- vst1q_qs16(reinterpret_cast<qint16_t *>(output.ptr()), final_max);
- }
- }
- },
- input, output);
-}
-
template <PoolingType pooling_type, bool exclude_padding>
void NEPoolingLayerKernel::pooling3_f32_nchw(const Window &window_input, const Window &window)
{
@@ -1879,110 +1426,6 @@ void NEPoolingLayerKernel::pooling7_f32_nchw(const Window &window_input, const W
input, output);
}
-template <PoolingType pooling_type>
-void NEPoolingLayerKernel::poolingMxN_q8_nchw(const Window &window_input, const Window &window)
-{
- Iterator input(_input, window_input);
- Iterator output(_output, window);
-
- const int pool_size_x = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().x() : _pool_info.pool_size().width;
- const int pool_size_y = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().y() : _pool_info.pool_size().height;
- const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
- const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- qint8x16_t vres = {};
- qint8_t res = {};
-
- //PoolingType::MAX
- for(int y = 0; y < pool_size_y; ++y)
- {
- int x = 0;
- for(; x <= (pool_size_x - 16); x += 16)
- {
- const qint8x16_t data = vld1q_qs8(reinterpret_cast<const qint8_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() +
- (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
- vres = vmaxq_s8(vres, data);
- }
-
- // Leftover for loop
- for(; x < pool_size_x; ++x)
- {
- qint8_t data = *(reinterpret_cast<const qint8_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() + (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
- res = std::max(res, data);
- }
- }
- //Reduce
- const qint8x8_t half_vres = vpmax_s8(vget_low_s8(vres), vget_high_s8(vres));
- res = std::max(res, vget_lane_s8(half_vres, 0));
- res = std::max(res, vget_lane_s8(half_vres, 1));
- res = std::max(res, vget_lane_s8(half_vres, 2));
- res = std::max(res, vget_lane_s8(half_vres, 3));
- res = std::max(res, vget_lane_s8(half_vres, 4));
- res = std::max(res, vget_lane_s8(half_vres, 5));
- res = std::max(res, vget_lane_s8(half_vres, 6));
- res = std::max(res, vget_lane_s8(half_vres, 7));
-
- // Store result
- *(reinterpret_cast<qint8_t *>(output.ptr())) = res;
- },
- input, output);
-}
-
-template <PoolingType pooling_type>
-void NEPoolingLayerKernel::poolingMxN_q16_nchw(const Window &window_input, const Window &window)
-{
- Iterator input(_input, window_input);
- Iterator output(_output, window);
-
- const int pool_size_x = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().x() : _pool_info.pool_size().width;
- const int pool_size_y = _pool_info.is_global_pooling() ? _input->info()->tensor_shape().y() : _pool_info.pool_size().height;
- const int pool_pad_top = _pool_info.pad_stride_info().pad_top();
- const int pool_pad_left = _pool_info.pad_stride_info().pad_left();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- qint16x8_t vres = {};
- qint16_t res = {};
-
- //PoolingType::MAX
- for(int y = 0; y < pool_size_y; ++y)
- {
- int x = 0;
- for(; x <= (pool_size_x - 8); x += 8)
- {
- const qint16x8_t data = vld1q_qs16(reinterpret_cast<const qint16_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() +
- (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
- vres = vmaxq_s16(vres, data);
- }
-
- // Leftover for loop
- for(; x < pool_size_x; ++x)
- {
- qint16_t data = *(reinterpret_cast<const qint16_t *>(input.ptr() + (x - pool_pad_left) * _input->info()->strides_in_bytes().x() + (y - pool_pad_top) * _input->info()->strides_in_bytes().y()));
- res = std::max(res, data);
- }
- }
- //Reduce
- const qint16x4_t half_vres = vpmax_s16(vget_low_s16(vres), vget_high_s16(vres));
- res = std::max(res, vget_lane_s16(half_vres, 0));
- res = std::max(res, vget_lane_s16(half_vres, 1));
- res = std::max(res, vget_lane_s16(half_vres, 2));
- res = std::max(res, vget_lane_s16(half_vres, 3));
-
- // Store result
- *(reinterpret_cast<qint16_t *>(output.ptr())) = res;
- },
- input, output);
-}
-
template <PoolingType pooling_type, bool exclude_padding>
void NEPoolingLayerKernel::poolingMxN_f16_nchw(const Window &window_input, const Window &window)
{
@@ -2688,8 +2131,6 @@ void NEPoolingLayerKernel::run(const Window &window, const ThreadInfo &info)
unsigned int window_x_inc = 0;
switch(_input->info()->data_type())
{
- case DataType::QS8:
- case DataType::QS16:
case DataType::F16:
{
window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration;
diff --git a/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp b/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp
index ee23e76c5c..b49400ab7d 100644
--- a/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp
@@ -54,7 +54,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *min_max)
{
// Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::U8, 0);
+ auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::U8);
constexpr unsigned int num_elems_processed_per_iteration = 8;
diff --git a/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp
index a209a523d3..4d908db77b 100644
--- a/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -51,7 +51,7 @@ void NEROIPoolingLayerKernel::configure(const ITensor *input, const IROIArray *r
// Output auto inizialitation if not yet initialized
TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->info()->dimension(2), rois->num_values());
- auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pool_info.pooled_width()) || (output->info()->dimension(1) != pool_info.pooled_height()));
diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.cpp b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
index 30d42fa25f..30f21bbf33 100644
--- a/src/core/NEON/kernels/NEReductionOperationKernel.cpp
+++ b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
@@ -134,7 +134,7 @@ std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITe
const TensorShape output_shape = calculate_output_shape(input->tensor_shape(), axis);
// Output auto initialization if not yet initialized
- auto_init_if_empty(*output, output_shape, 1, input->data_type(), input->fixed_point_position());
+ auto_init_if_empty(*output, output_shape, 1, input->data_type());
unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->data_type());
diff --git a/src/core/NEON/kernels/NEReshapeLayerKernel.cpp b/src/core/NEON/kernels/NEReshapeLayerKernel.cpp
index 45ba68d9fa..d6f470445f 100644
--- a/src/core/NEON/kernels/NEReshapeLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEReshapeLayerKernel.cpp
@@ -59,11 +59,10 @@ inline void reshape_tensor(const Window &window, const ITensor *input, ITensor *
void NEReshapeLayerKernel::configure(const ITensor *input, ITensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::QS8, DataType::U16, DataType::S16, DataType::QS16,
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
DataType::U32, DataType::S32, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_NULLPTR(output);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
ARM_COMPUTE_ERROR_ON(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size());
_input = input;
@@ -94,12 +93,10 @@ void NEReshapeLayerKernel::run(const Window &window, const ThreadInfo &info)
case DataType::U8:
case DataType::S8:
case DataType::QASYMM8:
- case DataType::QS8:
reshape_tensor<uint8_t>(window, _input, _output);
break;
case DataType::U16:
case DataType::S16:
- case DataType::QS16:
case DataType::F16:
reshape_tensor<uint16_t>(window, _input, _output);
break;
diff --git a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
index d91efd267f..9946f002de 100644
--- a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
+++ b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
@@ -194,56 +194,7 @@ T sqadd(T a, T b);
template <typename T>
T sqsub(T a, T b);
template <typename T>
-T sqmul(T a, T b, int fixed_point_position);
-
-#define DECLARE_NEON_FUNCTIONS_FOR_FIXED_POINT(TYPET, TYPEU, TAGT, TAGU) \
- inline vec_8_byte_t<TYPET> vqsub(vec_8_byte_t<TYPET> a, vec_8_byte_t<TYPET> b) \
- { \
- return vqsub_##TAGT(a, b); \
- } \
- inline vec_8_byte_t<TYPEU> vqadd(vec_8_byte_t<TYPEU> a, vec_8_byte_t<TYPEU> b) \
- { \
- return vqadd_##TAGU(a, b); \
- } \
- inline vec_16_byte_t<TYPEU> vqadd(vec_16_byte_t<TYPEU> a, vec_16_byte_t<TYPEU> b) \
- { \
- return vqaddq_##TAGU(a, b); \
- } \
- inline vec_8_byte_t<TYPET> vqexp(vec_8_byte_t<TYPET> vec, int fixed_point_position) \
- { \
- return vqexp_q##TAGT(vec, fixed_point_position); \
- } \
- inline auto vmovl(vec_8_byte_t<TYPET> vec)->decltype(vmovl_##TAGT(vec)) \
- { \
- return vmovl_##TAGT(vec); \
- } \
- inline vec_16_byte_t<TYPET> vqrecip(vec_16_byte_t<TYPET> vec, int fixed_point_position) \
- { \
- return vqrecipq_q##TAGT(vec, fixed_point_position); \
- } \
- inline vec_16_byte_t<TYPET> vqmul(vec_16_byte_t<TYPET> a, vec_16_byte_t<TYPET> b, int fixed_point_position) \
- { \
- return vqmulq_q##TAGT(a, b, fixed_point_position); \
- } \
- template <> \
- inline TYPEU sqadd<TYPEU>(TYPEU a, TYPEU b) \
- { \
- return sqadd_q##TAGU(a, b); \
- } \
- inline TYPET sqexp(TYPET val, int fixed_point_position) \
- { \
- return sqexp_q##TAGT(val, fixed_point_position); \
- } \
- template <> \
- inline TYPET sqsub<TYPET>(TYPET a, TYPET b) \
- { \
- return sqsub_q##TAGT(a, b); \
- } \
- template <> \
- inline TYPET sqmul<TYPET>(TYPET a, TYPET b, int fixed_point_position) \
- { \
- return sqmul_q##TAGT(a, b, fixed_point_position); \
- }
+T sqmul(T a, T b);
#define DECLARE_NEON_FUNCTIONS_FOR_FLOAT(TYPE, TAG) \
inline vec_8_byte_t<TYPE> vadd(vec_8_byte_t<TYPE> a, vec_8_byte_t<TYPE> b) \
@@ -278,9 +229,6 @@ DECLARE_NEON_FUNCTIONS_FOR_TYPE(float16_t, f16)
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
DECLARE_NEON_FUNCTIONS_FOR_TYPE(float, f32)
-DECLARE_NEON_FUNCTIONS_FOR_FIXED_POINT(int8_t, int16_t, s8, s16)
-DECLARE_NEON_FUNCTIONS_FOR_FIXED_POINT(int16_t, int32_t, s16, s32)
-
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
DECLARE_NEON_FUNCTIONS_FOR_FLOAT(float16_t, f16)
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
@@ -373,16 +321,15 @@ namespace
Status validate_arguments_logits_1d_max(const ITensorInfo &input, const ITensorInfo &output)
{
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::F32);
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
// Validate in case of configured output
if(output.total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(&input, &output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input, &output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output.tensor_shape(), TensorShape(input.tensor_shape()).set(0, 1));
}
@@ -395,7 +342,7 @@ std::pair<Status, Window> validate_and_configure_window_logits_1d_max(ITensorInf
// Softmax across the x dimension
const TensorShape output_shape = TensorShape(input.tensor_shape()).set(0, 1);
// Output auto initialization if not yet initialized
- auto_init_if_empty(output, output_shape, 1, input.data_type(), input.fixed_point_position(), input.quantization_info());
+ auto_init_if_empty(output, output_shape, 1, input.data_type(), input.quantization_info());
// Configure kernel window
const int input_width = input.valid_region().shape.x();
@@ -488,12 +435,6 @@ void NELogits1DMaxKernel::configure(const ITensor *input, ITensor *output)
case DataType::QASYMM8:
_func = &logits_1d_max<qasymm8_t>;
break;
- case DataType::QS8:
- _func = &logits_1d_max<qint8_t>;
- break;
- case DataType::QS16:
- _func = &logits_1d_max<qint16_t>;
- break;
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
_func = &logits_1d_max<float16_t>;
@@ -543,11 +484,12 @@ namespace
Status validate_arguments_logits_softmax(const ITensorInfo &input, const ITensorInfo &max,
const ITensorInfo &output, const float beta, const ITensorInfo &tmp)
{
+ ARM_COMPUTE_UNUSED(beta);
// Check input
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::F32);
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(input.data_type());
@@ -555,7 +497,6 @@ Status validate_arguments_logits_softmax(const ITensorInfo &input, const ITensor
// Check max
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &max);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(TensorShape(input.tensor_shape()).set(0, 1), max.tensor_shape());
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(&input, &max);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input, &max);
// Check output if configured
@@ -564,19 +505,14 @@ Status validate_arguments_logits_softmax(const ITensorInfo &input, const ITensor
const QuantizationInfo output_quantization = is_quantized_asymmetric ? QuantizationInfo(1.f / 256.f, 0) : output.quantization_info();
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input, &output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(&input, &output);
ARM_COMPUTE_RETURN_ERROR_ON(output.quantization_info() != output_quantization);
}
- // Check beta
- ARM_COMPUTE_RETURN_ERROR_ON((beta != 1.0f) && is_data_type_fixed_point(input.data_type()));
-
// Check tmp if configured
if(tmp.total_size() != 0)
{
const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : input.data_type();
ARM_COMPUTE_RETURN_ERROR_ON(tmp.data_type() != tmp_data_type);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(&input, &tmp);
// We could potentially reduce tmp memory if we could predict or make an assumption
// on the maximum number of threads that will run in parallel.
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input, &tmp);
@@ -727,88 +663,6 @@ void logits_1d_softmax_qasymm8(const ITensor &in, const ITensor &max, void *cons
in_it, max_it, out_it);
}
-template <typename T, typename U>
-void logits_1d_softmax_fixed_point(const ITensor &in, const ITensor &max, void *const tmp,
- ITensor &out, const float /*beta*/, const Window &window)
-{
- const int start_x = in.info()->valid_region().anchor.x();
- const int input_width = in.info()->valid_region().shape.x();
-
- const int fixed_point_position = in.info()->fixed_point_position();
-
- Iterator in_it(&in, window);
- Iterator max_it(&max, window);
- Iterator out_it(&out, window);
-
- execute_window_loop(window, [&](const Coordinates &)
- {
- /* Get pointers */
- const auto in_ptr = reinterpret_cast<const T *>(in_it.ptr()) + start_x;
- const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x;
- const auto tmp_ptr = reinterpret_cast<T *>(tmp);
-
- vec_16_byte_t<T> vec_sum_inversed;
-
- /* Compute exponentials and sum */
- {
- /* Get max value */
- const auto max_val = *reinterpret_cast<const T *>(max_it.ptr());
- const auto vec_max = vdup_n<vec_8_byte_t<T>>(max_val);
-
- /* Init sum to zero */
- auto vec_sum = vdup_n<vec_16_byte_t<U>>(0);
-
- /* Loop over row and compute exponentials and sum */
- int i = 0;
- constexpr int vec_size = vec_size_of(vec_sum);
- for(; i <= (input_width - vec_size); i += vec_size)
- {
- auto vec_elements = vld<vec_8_byte_t<T>>(in_ptr + i);
- vec_elements = vqsub(vec_elements, vec_max);
- vec_elements = vqexp(vec_elements, fixed_point_position);
- vec_sum = vqadd(vec_sum, vmovl(vec_elements));
- vst(tmp_ptr + i, vec_elements);
- }
- /* Reduce sum */
- const vec_8_byte_t<U> sum_8_byte = vqadd(vget_high(vec_sum), vget_low(vec_sum));
- U sum = reduce_add(sqadd<U>, sum_8_byte);
-
- /* Run remaining elements */
- for(; i < input_width; ++i)
- {
- T element = sqexp(sqsub(in_ptr[i], max_val), fixed_point_position);
- sum = sqadd<U>(sum, element);
- tmp_ptr[i] = element;
- }
-
- const auto qsum = utility::saturate_cast<T>(sum);
- vec_sum_inversed = vqrecip(vdup_n<vec_16_byte_t<T>>(qsum), fixed_point_position);
- }
-
- /* Normalize exponentials */
- {
- /* Loop over row and compute softmax */
- int i = 0;
- constexpr int vec_size = vec_size_of(vec_sum_inversed);
- for(; i <= (input_width - vec_size); i += vec_size)
- {
- const auto vec_in = vld<vec_16_byte_t<T>>(tmp_ptr + i);
- const vec_16_byte_t<T> normalized_value = vqmul(vec_in, vec_sum_inversed, fixed_point_position);
- vst(out_ptr + i, normalized_value);
- }
-
- const T sum_inversed = vget_lane<0>(vec_sum_inversed);
-
- /* Run remaining elements */
- for(; i < input_width; ++i)
- {
- out_ptr[i] = sqmul(tmp_ptr[i], sum_inversed, fixed_point_position);
- }
- }
- },
- in_it, max_it, out_it);
-}
-
template <typename T>
void logits_1d_softmax_float(const ITensor &in, const ITensor &max, void *const tmp,
ITensor &out, const float beta, const Window &window)
@@ -908,12 +762,6 @@ void NELogits1DSoftmaxKernel::configure(const ITensor *input, const ITensor *max
case DataType::QASYMM8:
_func = &logits_1d_softmax_qasymm8;
break;
- case DataType::QS8:
- _func = &logits_1d_softmax_fixed_point<qint8_t, qint16_t>;
- break;
- case DataType::QS16:
- _func = &logits_1d_softmax_fixed_point<qint16_t, qint32_t>;
- break;
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
_func = &logits_1d_softmax_float<float16_t>;
diff --git a/src/core/NEON/kernels/NETransposeKernel.cpp b/src/core/NEON/kernels/NETransposeKernel.cpp
index 92271378ff..2630159561 100644
--- a/src/core/NEON/kernels/NETransposeKernel.cpp
+++ b/src/core/NEON/kernels/NETransposeKernel.cpp
@@ -74,7 +74,7 @@ unsigned int num_elems_processed(size_t element_size)
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8, DataType::U16, DataType::S16, DataType::QS16, DataType::U32, DataType::S32,
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, DataType::U32, DataType::S32,
DataType::F16,
DataType::F32);
@@ -84,7 +84,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
}
return Status{};
diff --git a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
index 3031a87637..f398409b26 100644
--- a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
+++ b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
@@ -105,14 +105,13 @@ TensorShape get_output_shape(const ITensorInfo *input, bool has_bias)
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
if(biases != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(input->data_type()));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, biases);
ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 4) && (biases->num_dimensions() != 1));
ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 5) && (biases->num_dimensions() != 2));
ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 4) && (biases->dimension(0) != input->tensor_shape()[3]));
@@ -124,7 +123,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *biases, c
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), get_output_shape(input, biases != nullptr));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
}
return Status{};