-rw-r--r-- | arm_compute/core/FixedPoint.h | 4
-rw-r--r-- | arm_compute/core/FixedPoint.inl | 8
-rw-r--r-- | arm_compute/core/NEON/NEFixedPoint.h | 16
-rw-r--r-- | arm_compute/core/NEON/NEFixedPoint.inl | 28
-rw-r--r-- | arm_compute/runtime/NEON/functions/NEDepthConvert.h | 7
-rw-r--r-- | src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp | 4
-rw-r--r-- | src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp | 4
-rw-r--r-- | src/core/NEON/kernels/NEActivationLayerKernel.cpp | 6
-rw-r--r-- | src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp | 2
-rw-r--r-- | src/core/NEON/kernels/NEDepthConvertKernel.cpp | 83
-rw-r--r-- | src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp | 4
-rw-r--r-- | src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp | 8
-rw-r--r-- | src/core/NEON/kernels/NEIm2ColKernel.cpp | 8
-rw-r--r-- | src/runtime/NEON/functions/NEDepthConvert.cpp | 7
-rw-r--r-- | tests/TensorLibrary.h | 1
-rw-r--r-- | tests/validation/FixedPoint.h | 2
-rw-r--r-- | tests/validation/NEON/DepthConvert.cpp | 83
-rw-r--r-- | tests/validation/TensorOperations.h | 90
18 files changed, 237 insertions, 128 deletions
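
The diff below renames the scalar conversion helpers from scvt_* to sqcvt_* and makes them round half away from zero and saturate on overflow, where the old code added +0.5f unconditionally and relied on a plain cast that could wrap. The following is a minimal standalone C++ sketch of that rounding-and-saturating float -> QS8 conversion, not the library code: saturating_float_to_qs8 and its explicit clamp are local stand-ins for the library's saturate_convert.

#include <algorithm>
#include <cstdint>
#include <iostream>

// Convert a float to 8-bit signed fixed point with 'fixed_point_position'
// fractional bits, rounding half away from zero and saturating on overflow.
static int8_t saturating_float_to_qs8(float a, int fixed_point_position)
{
    // Scale onto the fixed-point grid: a * 2^p.
    const float scaled = a * static_cast<float>(1 << fixed_point_position);
    // Round half away from zero; always adding +0.5f (the old behaviour)
    // rounds negative values upward instead.
    const float rounded = scaled + ((a >= 0.0f) ? 0.5f : -0.5f);
    // Clamp to the int8_t range instead of letting the cast wrap.
    const float clamped = std::max(-128.0f, std::min(127.0f, rounded));
    return static_cast<int8_t>(clamped);
}

int main()
{
    // With 5 fractional bits QS8 tops out at 127/32 = 3.96875, so 7.3f saturates.
    std::cout << static_cast<int>(saturating_float_to_qs8(7.3f, 5)) << '\n';   // prints 127
    std::cout << static_cast<int>(saturating_float_to_qs8(-0.75f, 5)) << '\n'; // prints -24
}

The NEON variants (vqcvt*_qs8_f32 / vqcvt*_qs16_f32 in the diff) apply the same idea vector-wide: they select +0.5 or -0.5 per lane with vbslq_f32 on a vcgeq_f32 mask and narrow with the saturating vqmovn instructions.
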
diff --git a/arm_compute/core/FixedPoint.h b/arm_compute/core/FixedPoint.h index da304c6329..5eb4c55c41 100644 --- a/arm_compute/core/FixedPoint.h +++ b/arm_compute/core/FixedPoint.h @@ -296,7 +296,7 @@ float scvt_f32_qs8(qint8_t a, int fixed_point_position); * * @return The result of the conversion float -> 8 bit fixed point */ -qint8_t scvt_qs8_f32(float a, int fixed_point_position); +qint8_t sqcvt_qs8_f32(float a, int fixed_point_position); /** Convert a 16 bit fixed point to float * @@ -314,7 +314,7 @@ float scvt_f32_qs16(qint16_t a, int fixed_point_position); * * @return The result of the conversion float -> 16 bit fixed point */ -qint16_t scvt_qs16_f32(float a, int fixed_point_position); +qint16_t sqcvt_qs16_f32(float a, int fixed_point_position); /** Scalar saturating move and narrow. * diff --git a/arm_compute/core/FixedPoint.inl b/arm_compute/core/FixedPoint.inl index fab91d6699..fdbc3f0c06 100644 --- a/arm_compute/core/FixedPoint.inl +++ b/arm_compute/core/FixedPoint.inl @@ -366,10 +366,10 @@ inline float scvt_f32_qs8(qint8_t a, int fixed_point_position) return static_cast<float>(a) / (1 << fixed_point_position); } -inline qint8_t scvt_qs8_f32(float a, int fixed_point_position) +inline qint8_t sqcvt_qs8_f32(float a, int fixed_point_position) { // round_nearest_integer(a * 2^(fixed_point_position)) - return static_cast<qint8_t>(a * (1 << fixed_point_position) + 0.5f); + return saturate_convert<float, qint8_t>(a * (1 << fixed_point_position) + ((a >= 0) ? 0.5 : -0.5)); } inline float scvt_f32_qs16(qint16_t a, int fixed_point_position) @@ -377,10 +377,10 @@ inline float scvt_f32_qs16(qint16_t a, int fixed_point_position) return static_cast<float>(a) / (1 << fixed_point_position); } -inline qint16_t scvt_qs16_f32(float a, int fixed_point_position) +inline qint16_t sqcvt_qs16_f32(float a, int fixed_point_position) { // round_nearest_integer(a * 2^(fixed_point_position)) - return static_cast<qint16_t>(a * (1 << fixed_point_position) + 0.5f); + return saturate_convert<float, qint16_t>(a * (1 << fixed_point_position) + ((a >= 0) ? 0.5 : -0.5)); } inline qint8_t sqmovn_qs16(qint16_t a) diff --git a/arm_compute/core/NEON/NEFixedPoint.h b/arm_compute/core/NEON/NEFixedPoint.h index 660464eb62..e3eb5d4638 100644 --- a/arm_compute/core/NEON/NEFixedPoint.h +++ b/arm_compute/core/NEON/NEFixedPoint.h @@ -788,36 +788,36 @@ qint32x4_t vqmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point * @param[in] a Float input vector * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number * - * @return The result of the conversion float -> 8 bit fixed point + * @return The result of the conversion float -> 8 bit fixed point. The result is saturated in case of overflow */ -qint8x8_t vcvt_qs8_f32(const float32x4x2_t a, int fixed_point_position); +qint8x8_t vqcvt_qs8_f32(const float32x4x2_t a, int fixed_point_position); /** Convert a float vector with 4 elements to 16 bit fixed point vector with 4 elements * * @param[in] a Float input vector * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number * - * @return The result of the conversion float -> 16 bit fixed point + * @return The result of the conversion float -> 16 bit fixed point. 
The result is saturated in case of overflow */ -qint16x4_t vcvt_qs16_f32(const float32x4_t a, int fixed_point_position); +qint16x4_t vqcvt_qs16_f32(const float32x4_t a, int fixed_point_position); /** Convert a float vector with 4x4 elements to 8 bit fixed point vector with 16 elements * * @param[in] a Float input vector * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number * - * @return The result of the conversion float -> 8 bit fixed point + * @return The result of the conversion float -> 8 bit fixed point. The result is saturated in case of overflow */ -qint8x16_t vcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position); +qint8x16_t vqcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position); /** Convert a float vector with 4x2 elements to 16 bit fixed point vector with 8 elements * * @param[in] a Float input vector * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number * - * @return The result of the conversion float -> 16 bit fixed point + * @return The result of the conversion float -> 16 bit fixed point. The result is saturated in case of overflow */ -qint16x8_t vcvtq_qs16_f32(const float32x4x2_t &a, int fixed_point_position); +qint16x8_t vqcvtq_qs16_f32(const float32x4x2_t &a, int fixed_point_position); /** Convert a 8 bit fixed point vector with 8 elements to a float vector with 4x2 elements * diff --git a/arm_compute/core/NEON/NEFixedPoint.inl b/arm_compute/core/NEON/NEFixedPoint.inl index 4f7f44ab3b..92af82cf71 100644 --- a/arm_compute/core/NEON/NEFixedPoint.inl +++ b/arm_compute/core/NEON/NEFixedPoint.inl @@ -236,7 +236,7 @@ inline qint8x16_t vdupq_n_qs8_f32(float a, int fixed_point_position) vdupq_n_f32(a), } }; - return vcvtq_qs8_f32(res, fixed_point_position); + return vqcvtq_qs8_f32(res, fixed_point_position); } inline qint16x8_t vdupq_n_qs16(qint16_t a) @@ -809,15 +809,15 @@ inline qint32x4_t vqmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixe return vqaddq_s32(a, tmp); } -inline qint8x8_t vcvt_qs8_f32(const float32x4x2_t &a, int fixed_point_position) +inline qint8x8_t vqcvt_qs8_f32(const float32x4x2_t &a, int fixed_point_position) { const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position)); float32x4x2_t res_f32 = { { - vdupq_n_f32(0.5f), - vdupq_n_f32(0.5f) + vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)), + vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)) } }; @@ -837,11 +837,11 @@ inline qint8x8_t vcvt_qs8_f32(const float32x4x2_t &a, int fixed_point_position) return vqmovn_s16(res_s16); } -inline qint16x4_t vcvt_qs16_f32(const float32x4_t a, int fixed_point_position) +inline qint16x4_t vqcvt_qs16_f32(const float32x4_t a, int fixed_point_position) { const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position)); - float32x4_t res_f32 = vdupq_n_f32(0.5f); + float32x4_t res_f32 = vbslq_f32(vcgeq_f32(a, vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)); res_f32 = vmlaq_f32(res_f32, a, pow2); @@ -850,17 +850,17 @@ inline qint16x4_t vcvt_qs16_f32(const float32x4_t a, int fixed_point_position) return vqmovn_s32(res_s32); } -inline qint8x16_t vcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position) +inline qint8x16_t vqcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position) { const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position)); 
float32x4x4_t res_f32 = { { - vdupq_n_f32(0.5f), - vdupq_n_f32(0.5f), - vdupq_n_f32(0.5f), - vdupq_n_f32(0.5f) + vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)), + vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)), + vbslq_f32(vcgeq_f32(a.val[2], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)), + vbslq_f32(vcgeq_f32(a.val[3], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)) } }; @@ -890,15 +890,15 @@ inline qint8x16_t vcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position return vcombine_s8(vqmovn_s16(res_s16.val[0]), vqmovn_s16(res_s16.val[1])); } -inline qint16x8_t vcvtq_qs16_f32(const float32x4x2_t &a, int fixed_point_position) +inline qint16x8_t vqcvtq_qs16_f32(const float32x4x2_t &a, int fixed_point_position) { const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position)); float32x4x2_t res_f32 = { { - vdupq_n_f32(0.5f), - vdupq_n_f32(0.5f) + vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)), + vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)) } }; diff --git a/arm_compute/runtime/NEON/functions/NEDepthConvert.h b/arm_compute/runtime/NEON/functions/NEDepthConvert.h index 7c59ce432d..47b3a7e6f6 100644 --- a/arm_compute/runtime/NEON/functions/NEDepthConvert.h +++ b/arm_compute/runtime/NEON/functions/NEDepthConvert.h @@ -52,11 +52,12 @@ public: * U8 -> U16, S16, S32 * U16 -> U8, U32 * S16 -> U8, S32 - * F32 -> QS8 + * QS16 -> F32 + * F32 -> QS8, QS16 * * - * @param[in] input The input tensor to convert. Data type supported: QS8/U8/U16/S16/F32. - * @param[out] output The output tensor. Data type supported: QS8/U8/U16/S16/U32/S32/F32. + * @param[in] input The input tensor to convert. Data type supported: QS8/U8/U16/S16/QS16/F32. + * @param[out] output The output tensor. Data type supported: QS8/U8/U16/S16/QS16/U32/S32/F32. * @param[in] policy Conversion policy. * @param[in] shift Value for down/up conversions. Must be 0 <= shift < 8. * It is not used on fixed point conversion. diff --git a/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp index d1cdd7dc61..1499df0bec 100644 --- a/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp +++ b/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp @@ -56,8 +56,8 @@ void CLGEMMMatrixAdditionKernel::configure(const ICLTensor *input, ICLTensor *ou if(is_data_type_fixed_point(input->info()->data_type())) { ma_arguments << "-DBETA=" << (input->info()->data_type() == DataType::QS8 ? - scvt_qs8_f32(beta, input->info()->fixed_point_position()) : - scvt_qs16_f32(beta, input->info()->fixed_point_position())) + sqcvt_qs8_f32(beta, input->info()->fixed_point_position()) : + sqcvt_qs16_f32(beta, input->info()->fixed_point_position())) << " "; ma_arguments << "-DFIXED_POINT_POSITION=" << input->info()->fixed_point_position(); } diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp index 2d6b83a97d..c65b9e0e1a 100644 --- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp +++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp @@ -77,8 +77,8 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen if(is_data_type_fixed_point(input0->info()->data_type())) { mm_arguments << "-DALPHA=" << (input0->info()->data_type() == DataType::QS8 ? 
- scvt_qs8_f32(alpha, input0->info()->fixed_point_position()) : - scvt_qs16_f32(alpha, input0->info()->fixed_point_position())) + sqcvt_qs8_f32(alpha, input0->info()->fixed_point_position()) : + sqcvt_qs16_f32(alpha, input0->info()->fixed_point_position())) << " "; mm_arguments << "-DFIXED_POINT_POSITION=" << input0->info()->fixed_point_position() << " "; } diff --git a/src/core/NEON/kernels/NEActivationLayerKernel.cpp b/src/core/NEON/kernels/NEActivationLayerKernel.cpp index 1bd0353b93..492d197925 100644 --- a/src/core/NEON/kernels/NEActivationLayerKernel.cpp +++ b/src/core/NEON/kernels/NEActivationLayerKernel.cpp @@ -267,9 +267,9 @@ typename std::enable_if<std::is_same<T, int8_t>::value, void>::type NEActivation int fixed_point_position = _input->info()->fixed_point_position(); static const qint8x16_t CONST_0 = vdupq_n_qs8(0); - const qint8x16_t CONST_1 = vdupq_n_qs8(scvt_qs8_f32(1.f, fixed_point_position)); - const qint8x16_t a = vdupq_n_qs8(scvt_qs8_f32(_act_info.a(), fixed_point_position)); - const qint8x16_t b = vdupq_n_qs8(scvt_qs8_f32(_act_info.b(), fixed_point_position)); + const qint8x16_t CONST_1 = vdupq_n_qs8(sqcvt_qs8_f32(1.f, fixed_point_position)); + const qint8x16_t a = vdupq_n_qs8(sqcvt_qs8_f32(_act_info.a(), fixed_point_position)); + const qint8x16_t b = vdupq_n_qs8(sqcvt_qs8_f32(_act_info.b(), fixed_point_position)); execute_window_loop(window, [&](const Coordinates & id) { diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp index e6f233cf5d..d0aec6965c 100644 --- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp +++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp @@ -58,7 +58,7 @@ void batch_normalization_q8(const ITensor *in, ITensor *out, const ITensor *mean qint8x16_t gamma_vec = vdupq_n_qs8(0); qint8x16_t beta_vec = vdupq_n_qs8(0); qint8x16_t denominator = vdupq_n_qs8(0); - const qint8x16_t epsilon_vec = vdupq_n_qs8(scvt_qs8_f32(epsilon, fixed_point_position)); + const qint8x16_t epsilon_vec = vdupq_n_qs8(sqcvt_qs8_f32(epsilon, fixed_point_position)); execute_window_loop(window, [&](const Coordinates & id) { if(slice != id.z()) diff --git a/src/core/NEON/kernels/NEDepthConvertKernel.cpp b/src/core/NEON/kernels/NEDepthConvertKernel.cpp index 56612a7703..3c1a94df74 100644 --- a/src/core/NEON/kernels/NEDepthConvertKernel.cpp +++ b/src/core/NEON/kernels/NEDepthConvertKernel.cpp @@ -46,27 +46,35 @@ NEDepthConvertKernel::NEDepthConvertKernel() void NEDepthConvertKernel::configure(const ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::U16, DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::U16, DataType::U32, DataType::S32, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::U16, DataType::QS16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::U16, DataType::QS16, DataType::U32, DataType::S32, DataType::F32); ARM_COMPUTE_ERROR_ON(shift >= 8); ARM_COMPUTE_ERROR_ON(input == output); ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == output->info()->data_type(), "Input and output data_types must be different"); - ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::QS8 && 
(output->info()->data_type() != DataType::F32), - "Only data_types supported [in] QS8 -> [out] F32"); - ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U8 && (output->info()->data_type() != DataType::S16 && output->info()->data_type() != DataType::U16 && output->info()->data_type() != DataType::S32), "Only data_types supported [in] U8 -> [out] U16, S16, S32"); + ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::QS8 && output->info()->data_type() != DataType::F32, + "Only data_types supported [in] QS8 -> [out] F32"); + ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U32), "Only data_types supported [in] U16 -> [out] U8, U32"); ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::S16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::S32), "Only data_types supported [in] S16 -> [out] U8, S32"); - ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::F32 && (output->info()->data_type() != DataType::QS8), - "Only data_types supported [in] F32 -> [out] QS8"); + ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::QS16 && output->info()->data_type() != DataType::F32, + "Only data_types supported [in] QS16 -> [out] F32"); + + ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::F32 && (output->info()->data_type() != DataType::QS8 && output->info()->data_type() != DataType::QS16), + "Only data_types supported [in] F32 -> [out] QS8, QS16"); + + // Auto initialize output shape if not initialized (We can only auto-configure the shape, datatype must be given) + set_shape_if_empty(*output->info(), input->info()->tensor_shape()); + + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output); _policy = policy; _shift = shift; @@ -346,6 +354,38 @@ void NEDepthConvertKernel::run(const Window &window) } break; } + case DataType::QS16: + { + const int fixed_point_position = _input->info()->fixed_point_position(); + + switch(_output->info()->data_type()) + { + case DataType::F32: + { + /* Up-conversion QS16 -> F32 */ + execute_window_loop(window, [&](const Coordinates & id) + { + const int16x8x2_t texels = + { + { + vld1q_s16(reinterpret_cast<qint16_t *>(input.ptr())), + vld1q_s16(reinterpret_cast<qint16_t *>(input.ptr()) + 8) + } + }; + + vst1q_f32(reinterpret_cast<float *>(output.ptr()), vcvt_f32_qs16(vget_low_s16(texels.val[0]), fixed_point_position)); + vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 4, vcvt_f32_qs16(vget_high_s16(texels.val[0]), fixed_point_position)); + vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 8, vcvt_f32_qs16(vget_low_s16(texels.val[1]), fixed_point_position)); + vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 12, vcvt_f32_qs16(vget_high_s16(texels.val[1]), fixed_point_position)); + }, + input, output); + break; + } + default: + ARM_COMPUTE_ERROR("Output data type not supported"); + } + break; + } case DataType::F32: { switch(_output->info()->data_type()) @@ -366,13 +406,40 @@ void NEDepthConvertKernel::run(const Window &window) } }; - const qint8x16_t texels_s8 = vcvtq_qs8_f32(texels_f32, fixed_point_position); + const qint8x16_t texels_s8 = vqcvtq_qs8_f32(texels_f32, fixed_point_position); vst1q_s8(reinterpret_cast<int8_t *>(output.ptr()), texels_s8); }, input, output); break; } + case DataType::QS16: + { + const int fixed_point_position = _output->info()->fixed_point_position(); + /* Down-conversion F32 -> QS16 */ + 
execute_window_loop(window, [&](const Coordinates & id) + { + const float32x4x2_t texels_f32_1 = + { + { + vld1q_f32(reinterpret_cast<const float *>(input.ptr())), + vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 4), + } + }; + const float32x4x2_t texels_f32_2 = + { + { + vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 8), + vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 12) + } + }; + + vst1q_s16(reinterpret_cast<qint16_t *>(output.ptr()), vqcvtq_qs16_f32(texels_f32_1, fixed_point_position)); + vst1q_s16(reinterpret_cast<qint16_t *>(output.ptr()) + 8, vqcvtq_qs16_f32(texels_f32_2, fixed_point_position)); + }, + input, output); + break; + } default: ARM_COMPUTE_ERROR("Output data type not supported"); } diff --git a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp index 91fbe6f962..f2cd18d827 100644 --- a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp @@ -94,7 +94,7 @@ void matrix_addition_f16(const ITensor *input, ITensor *output, const Window &wi void matrix_addition_qs8(const ITensor *input, ITensor *output, const Window &window, float beta) { const int fixed_point_position = input->info()->fixed_point_position(); - const qint8x16_t beta_qs8 = vdupq_n_qs8(scvt_qs8_f32(beta, fixed_point_position)); + const qint8x16_t beta_qs8 = vdupq_n_qs8(sqcvt_qs8_f32(beta, fixed_point_position)); Iterator in(input, window); Iterator out(output, window); @@ -118,7 +118,7 @@ void matrix_addition_qs8(const ITensor *input, ITensor *output, const Window &wi void matrix_addition_qs16(const ITensor *input, ITensor *output, const Window &window, float beta) { const int fixed_point_position = input->info()->fixed_point_position(); - const qint16x8_t beta_qs16 = vdupq_n_qs16(scvt_qs16_f32(beta, fixed_point_position)); + const qint16x8_t beta_qs16 = vdupq_n_qs16(sqcvt_qs16_f32(beta, fixed_point_position)); Iterator in(input, window); Iterator out(output, window); diff --git a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp index b81be6cee9..8381dd8a73 100644 --- a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp @@ -456,7 +456,7 @@ void vector_matrix_multiply_qs8(const ITensor *input0, const ITensor *input1, IT // Multiply by the weight of the matrix product (alpha) if(multiply_alpha) { - const qint8x8_t alpha_qs8 = vdup_n_qs8(scvt_qs8_f32(alpha, fixed_point_position)); + const qint8x8_t alpha_qs8 = vdup_n_qs8(sqcvt_qs8_f32(alpha, fixed_point_position)); acc00_qs8 = vqmul_qs8(acc00_qs8, alpha_qs8, fixed_point_position); acc01_qs8 = vqmul_qs8(acc01_qs8, alpha_qs8, fixed_point_position); acc02_qs8 = vqmul_qs8(acc02_qs8, alpha_qs8, fixed_point_position); @@ -585,7 +585,7 @@ void vector_matrix_multiply_qs16(const ITensor *input0, const ITensor *input1, I // Multiply by the weight of the matrix product (alpha) if(multiply_alpha) { - const qint16x4_t alpha_qs16 = vdup_n_qs16(scvt_qs16_f32(alpha, fixed_point_position)); + const qint16x4_t alpha_qs16 = vdup_n_qs16(sqcvt_qs16_f32(alpha, fixed_point_position)); acc00_qs16 = vqmul_qs16(acc00_qs16, alpha_qs16, fixed_point_position); acc01_qs16 = vqmul_qs16(acc01_qs16, alpha_qs16, fixed_point_position); acc02_qs16 = vqmul_qs16(acc02_qs16, alpha_qs16, fixed_point_position); @@ -1058,7 +1058,7 @@ void matrix_matrix_multiply_qs8(const ITensor *input0, const ITensor *input1, IT const size_t out_stride3 = 
out_stride1 * 3; const int num_elems_matrix_b_x = input1->info()->dimension(0); const int fixed_point_position = input0->info()->fixed_point_position(); - const qint8x8_t alpha_qs8 = vdup_n_qs8(scvt_qs8_f32(alpha, fixed_point_position)); + const qint8x8_t alpha_qs8 = vdup_n_qs8(sqcvt_qs8_f32(alpha, fixed_point_position)); ARM_COMPUTE_UNUSED(alpha_qs8); // Set step_x and step_y for matrix A. Scale by a factor of 4 the Y range as the input interleaved matrix A has 4 times less the rows of the output matrix @@ -1291,7 +1291,7 @@ void matrix_matrix_multiply_qs16(const ITensor *input0, const ITensor *input1, I const size_t out_stride3 = out_stride1 * 3; const int num_elems_matrix_b_x = input1->info()->dimension(0); const int fixed_point_position = input0->info()->fixed_point_position(); - const qint16x4_t alpha_qs16 = vdup_n_qs16(scvt_qs16_f32(alpha, fixed_point_position)); + const qint16x4_t alpha_qs16 = vdup_n_qs16(sqcvt_qs16_f32(alpha, fixed_point_position)); ARM_COMPUTE_UNUSED(alpha_qs16); // Set step_x and step_y for matrix A. Scale by a factor of 4 the Y range as the input interleaved matrix A has 4 times less the rows of the output matrix diff --git a/src/core/NEON/kernels/NEIm2ColKernel.cpp b/src/core/NEON/kernels/NEIm2ColKernel.cpp index 5bb8b1c22a..e4de60df80 100644 --- a/src/core/NEON/kernels/NEIm2ColKernel.cpp +++ b/src/core/NEON/kernels/NEIm2ColKernel.cpp @@ -136,11 +136,11 @@ inline void linearize_volume(const uint8_t *const in_ptr, { if(std::is_same<T, qint8_t>::value) { - *out_ptr = scvt_qs8_f32(1.0f, fixed_point_position); + *out_ptr = sqcvt_qs8_f32(1.0f, fixed_point_position); } else if(std::is_same<T, qint16_t>::value) { - *out_ptr = scvt_qs16_f32(1.0f, fixed_point_position); + *out_ptr = sqcvt_qs16_f32(1.0f, fixed_point_position); } else { @@ -255,11 +255,11 @@ void NEIm2ColKernel::run_reduced(const Window &window) { if(std::is_same<T, qint8_t>::value) { - *(reinterpret_cast<T *>(out_ptr) + out_width - 1) = scvt_qs8_f32(1.0f, _input->info()->fixed_point_position()); + *(reinterpret_cast<T *>(out_ptr) + out_width - 1) = sqcvt_qs8_f32(1.0f, _input->info()->fixed_point_position()); } else if(std::is_same<T, qint16_t>::value) { - *(reinterpret_cast<T *>(out_ptr) + out_width - 1) = scvt_qs16_f32(1.0f, _input->info()->fixed_point_position()); + *(reinterpret_cast<T *>(out_ptr) + out_width - 1) = sqcvt_qs16_f32(1.0f, _input->info()->fixed_point_position()); } else { diff --git a/src/runtime/NEON/functions/NEDepthConvert.cpp b/src/runtime/NEON/functions/NEDepthConvert.cpp index 011e366bf3..24b51493c6 100644 --- a/src/runtime/NEON/functions/NEDepthConvert.cpp +++ b/src/runtime/NEON/functions/NEDepthConvert.cpp @@ -23,9 +23,7 @@ */ #include "arm_compute/runtime/NEON/functions/NEDepthConvert.h" -#include "arm_compute/core/Error.h" #include "arm_compute/core/NEON/kernels/NEDepthConvertKernel.h" -#include "arm_compute/core/Validate.h" #include "support/ToolchainSupport.h" #include <utility> @@ -34,11 +32,6 @@ using namespace arm_compute; void NEDepthConvert::configure(const ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QS8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F32); - ARM_COMPUTE_ERROR_ON(input == output); - ARM_COMPUTE_ERROR_ON(input->info()->data_type() == output->info()->data_type()); 
- auto k = arm_compute::support::cpp14::make_unique<NEDepthConvertKernel>(); k->configure(input, output, policy, shift); _kernel = std::move(k); diff --git a/tests/TensorLibrary.h b/tests/TensorLibrary.h index 5b2c5b6fd5..3fb593c886 100644 --- a/tests/TensorLibrary.h +++ b/tests/TensorLibrary.h @@ -560,6 +560,7 @@ void TensorLibrary::fill_tensor_uniform(T &&tensor, std::random_device::result_t break; } case DataType::S16: + case DataType::QS16: { ARM_COMPUTE_ERROR_ON(!(std::is_same<int16_t, D>::value)); std::uniform_int_distribution<int16_t> distribution_s16(low, high); diff --git a/tests/validation/FixedPoint.h b/tests/validation/FixedPoint.h index 53f532cd8c..261fcd6df6 100644 --- a/tests/validation/FixedPoint.h +++ b/tests/validation/FixedPoint.h @@ -333,7 +333,7 @@ struct constant_expr */ static constexpr T to_fixed(float val, uint8_t p) { - return static_cast<T>(val * fixed_one(p) + ((val >= 0) ? 0.5 : -0.5)); + return static_cast<T>(saturate_cast<float>(val * fixed_one(p) + ((val >= 0) ? 0.5 : -0.5))); } /** Clamp value between two ranges * diff --git a/tests/validation/NEON/DepthConvert.cpp b/tests/validation/NEON/DepthConvert.cpp index 4a37d98990..65d3ab1be7 100644 --- a/tests/validation/NEON/DepthConvert.cpp +++ b/tests/validation/NEON/DepthConvert.cpp @@ -208,6 +208,89 @@ BOOST_DATA_TEST_CASE(RunLarge, LargeShapes() * boost::unit_test::data::make({ Co } BOOST_AUTO_TEST_SUITE_END() +BOOST_AUTO_TEST_SUITE(QS16_to_F32) +BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly")) +BOOST_DATA_TEST_CASE(Configuration, (SmallShapes() + LargeShapes()) * boost::unit_test::data::make({ ConvertPolicy::SATURATE }) + * boost::unit_test::data::xrange(1, 15, 1), + shape, policy, fixed_point_position) +{ + // Compute configure and validate region/padding + compute_configure_validate(shape, DataType::QS16, DataType::F32, policy, 0, fixed_point_position); +} + +BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit")) +BOOST_DATA_TEST_CASE(RunSmall, SmallShapes() * boost::unit_test::data::make({ ConvertPolicy::SATURATE }) + * boost::unit_test::data::xrange(1, 15, 1), + shape, policy, fixed_point_position) +{ + // Compute function + Tensor dst = compute_depth_convert(shape, DataType::QS16, DataType::F32, policy, 0, fixed_point_position); + + // Compute reference + RawTensor ref_dst = Reference::compute_reference_depth_convert(shape, DataType::QS16, DataType::F32, policy, 0, fixed_point_position); + + // Validate output + validate(NEAccessor(dst), ref_dst); +} + +BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly")) +BOOST_DATA_TEST_CASE(RunLarge, LargeShapes() * boost::unit_test::data::make({ ConvertPolicy::SATURATE }) + * boost::unit_test::data::xrange(1, 15, 1), + shape, policy, fixed_point_position) +{ + // Compute function + Tensor dst = compute_depth_convert(shape, DataType::QS16, DataType::F32, policy, 0, fixed_point_position); + + // Compute reference + RawTensor ref_dst = Reference::compute_reference_depth_convert(shape, DataType::QS16, DataType::F32, policy, 0, fixed_point_position); + + // Validate output + validate(NEAccessor(dst), ref_dst); +} + +BOOST_AUTO_TEST_SUITE_END() + +BOOST_AUTO_TEST_SUITE(F32_to_QS16) +BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly")) +BOOST_DATA_TEST_CASE(Configuration, (SmallShapes() + LargeShapes()) * boost::unit_test::data::make({ ConvertPolicy::SATURATE }) + * boost::unit_test::data::xrange(1, 7, 1), + shape, policy, fixed_point_position) +{ + // Compute configure 
and validate region/padding + compute_configure_validate(shape, DataType::F32, DataType::QS16, policy, 0, fixed_point_position); +} + +BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit")) +BOOST_DATA_TEST_CASE(RunSmall, SmallShapes() * boost::unit_test::data::make({ ConvertPolicy::SATURATE }) + * boost::unit_test::data::xrange(1, 15, 1), + shape, policy, fixed_point_position) +{ + // Compute function + Tensor dst = compute_depth_convert(shape, DataType::F32, DataType::QS16, policy, 0, fixed_point_position); + + // Compute reference + RawTensor ref_dst = Reference::compute_reference_depth_convert(shape, DataType::F32, DataType::QS16, policy, 0, fixed_point_position); + + // Validate output + validate(NEAccessor(dst), ref_dst); +} + +BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly")) +BOOST_DATA_TEST_CASE(RunLarge, LargeShapes() * boost::unit_test::data::make({ ConvertPolicy::SATURATE }) + * boost::unit_test::data::xrange(1, 15, 1), + shape, policy, fixed_point_position) +{ + // Compute function + Tensor dst = compute_depth_convert(shape, DataType::F32, DataType::QS16, policy, 0, fixed_point_position); + + // Compute reference + RawTensor ref_dst = Reference::compute_reference_depth_convert(shape, DataType::F32, DataType::QS16, policy, 0, fixed_point_position); + + // Validate output + validate(NEAccessor(dst), ref_dst); +} +BOOST_AUTO_TEST_SUITE_END() + BOOST_AUTO_TEST_SUITE(U8_to_U16) BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly")) diff --git a/tests/validation/TensorOperations.h b/tests/validation/TensorOperations.h index 0502f53186..e90635f0d4 100644 --- a/tests/validation/TensorOperations.h +++ b/tests/validation/TensorOperations.h @@ -518,94 +518,58 @@ void box3x3(const Tensor<T> &in, Tensor<T> &out, BorderMode border_mode, T const } // Depth conversion -template <typename T1, typename T2> +template < typename T1, typename T2, typename std::enable_if < std::is_integral<T1>::value &&std::is_floating_point<T2>::value, int >::type = 0 > void depth_convert(const Tensor<T1> &in, Tensor<T2> &out, ConvertPolicy policy, uint32_t shift) { - ARM_COMPUTE_ERROR("The conversion is not supported"); -} - -template <> -void depth_convert<int8_t, float>(const Tensor<int8_t> &in, Tensor<float> &out, ConvertPolicy policy, uint32_t shift) -{ - const int8_t fixed_point_position = static_cast<int8_t>(in.fixed_point_position()); - for(int i = 0; i < in.num_elements(); ++i) - { - out[i] = static_cast<float>(in[i]) * (1.0f / (1 << fixed_point_position)); - } -} - -template <> -void depth_convert<float, int8_t>(const Tensor<float> &in, Tensor<int8_t> &out, ConvertPolicy policy, uint32_t shift) -{ - const int8_t fixed_point_position = static_cast<int8_t>(in.fixed_point_position()); - for(int i = 0; i < in.num_elements(); ++i) - { - float val = in[i] * (1 << fixed_point_position) + 0.5f; - out[i] = ((policy == ConvertPolicy::SATURATE) ? 
saturate_cast<int8_t>(val) : static_cast<int8_t>(val)); - } -} + using namespace fixed_point_arithmetic; -template <> -void depth_convert<uint8_t, uint16_t>(const Tensor<uint8_t> &in, Tensor<uint16_t> &out, ConvertPolicy policy, uint32_t shift) -{ + const int fixed_point_position = in.fixed_point_position(); for(int i = 0; i < in.num_elements(); ++i) { - out[i] = static_cast<uint16_t>(in[i]) << shift; + out[i] = static_cast<float>(fixed_point<T1>(in[i], fixed_point_position, true)); } } -template <> -void depth_convert<uint8_t, int16_t>(const Tensor<uint8_t> &in, Tensor<int16_t> &out, ConvertPolicy policy, uint32_t shift) +template < typename T1, typename T2, typename std::enable_if < std::is_floating_point<T1>::value &&std::is_integral<T2>::value, int >::type = 0 > +void depth_convert(const Tensor<T1> &in, Tensor<T2> &out, ConvertPolicy policy, uint32_t shift) { - for(int i = 0; i < in.num_elements(); ++i) - { - out[i] = static_cast<int16_t>(in[i]) << shift; - } -} + using namespace fixed_point_arithmetic; -template <> -void depth_convert<uint8_t, int32_t>(const Tensor<uint8_t> &in, Tensor<int32_t> &out, ConvertPolicy policy, uint32_t shift) -{ + const int fixed_point_position = out.fixed_point_position(); for(int i = 0; i < in.num_elements(); ++i) { - out[i] = static_cast<int32_t>(in[i]) << shift; + out[i] = fixed_point<T2>(in[i], fixed_point_position).raw(); } } -template <> -void depth_convert<uint16_t, uint8_t>(const Tensor<uint16_t> &in, Tensor<uint8_t> &out, ConvertPolicy policy, uint32_t shift) +template < typename T1, typename T2, typename std::enable_if < std::is_integral<T1>::value &&std::is_integral<T2>::value, int >::type = 0 > +void depth_convert(const Tensor<T1> &in, Tensor<T2> &out, ConvertPolicy policy, uint32_t shift) { - for(int i = 0; i < in.num_elements(); ++i) + // Up-casting + if(std::numeric_limits<T1>::digits <= std::numeric_limits<T2>::digits) { - uint16_t val = in[i] >> shift; - out[i] = ((policy == ConvertPolicy::SATURATE) ? saturate_cast<uint8_t>(val) : static_cast<uint8_t>(val)); + for(int i = 0; i < in.num_elements(); ++i) + { + out[i] = static_cast<T2>(in[i]) << shift; + } } -} - -template <> -void depth_convert<uint16_t, uint32_t>(const Tensor<uint16_t> &in, Tensor<uint32_t> &out, ConvertPolicy policy, uint32_t shift) -{ - for(int i = 0; i < in.num_elements(); ++i) + // Down-casting + else { - out[i] = static_cast<uint32_t>(in[i]) << shift; + for(int i = 0; i < in.num_elements(); ++i) + { + T1 val = in[i] >> shift; + out[i] = ((policy == ConvertPolicy::SATURATE) ? saturate_cast<T2>(val) : static_cast<T2>(val)); + } } } -template <> -void depth_convert<int16_t, uint8_t>(const Tensor<int16_t> &in, Tensor<uint8_t> &out, ConvertPolicy policy, uint32_t shift) -{ - for(int i = 0; i < in.num_elements(); ++i) - { - int16_t val = in[i] >> shift; - out[i] = ((policy == ConvertPolicy::SATURATE) ? saturate_cast<uint8_t>(val) : static_cast<uint8_t>(val)); - } -} -template <> -void depth_convert<int16_t, int32_t>(const Tensor<int16_t> &in, Tensor<int32_t> &out, ConvertPolicy policy, uint32_t shift) +template < typename T1, typename T2, typename std::enable_if < std::is_floating_point<T1>::value &&std::is_floating_point<T2>::value, int >::type = 0 > +void depth_convert(const Tensor<T1> &in, Tensor<T2> &out, ConvertPolicy policy, uint32_t shift) { for(int i = 0; i < in.num_elements(); ++i) { - out[i] = static_cast<int32_t>(in[i]) << shift; + out[i] = static_cast<T2>(in[i]); } } |
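
The last hunk above replaces the long list of explicit depth_convert specialisations in the test reference with four enable_if-dispatched templates (fixed point -> float, float -> fixed point, integer -> integer, float -> float). Below is a minimal standalone sketch of that dispatch idea, reduced to the integer -> integer overload with its up-cast/down-cast split; ConvertPolicy and saturate_to are redefined locally so the example compiles on its own and are not the library's implementations.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <limits>
#include <type_traits>
#include <vector>

enum class ConvertPolicy { WRAP, SATURATE };

// Clamp a wide intermediate into the destination integer range.
template <typename T>
T saturate_to(int64_t v)
{
    const int64_t lo = static_cast<int64_t>(std::numeric_limits<T>::lowest());
    const int64_t hi = static_cast<int64_t>(std::numeric_limits<T>::max());
    return static_cast<T>(std::min(hi, std::max(lo, v)));
}

// Selected only when both element types are integral, mirroring the
// integral/integral overload introduced in TensorOperations.h.
template <typename T1, typename T2,
          typename std::enable_if<std::is_integral<T1>::value && std::is_integral<T2>::value, int>::type = 0>
void depth_convert(const std::vector<T1> &in, std::vector<T2> &out, ConvertPolicy policy, uint32_t shift)
{
    if(std::numeric_limits<T1>::digits <= std::numeric_limits<T2>::digits)
    {
        // Up-casting: widen first, then apply the left shift.
        for(size_t i = 0; i < in.size(); ++i)
        {
            out[i] = static_cast<T2>(static_cast<T2>(in[i]) << shift);
        }
    }
    else
    {
        // Down-casting: shift right in the source type, then narrow,
        // saturating when the policy asks for it.
        for(size_t i = 0; i < in.size(); ++i)
        {
            const T1 val = in[i] >> shift;
            out[i] = (policy == ConvertPolicy::SATURATE) ? saturate_to<T2>(val) : static_cast<T2>(val);
        }
    }
}

int main()
{
    const std::vector<int16_t> in{ 300, -400, 20 };
    std::vector<uint8_t>       out(in.size());
    depth_convert(in, out, ConvertPolicy::SATURATE, 0);   // S16 -> U8, saturating
    for(unsigned v : out)
    {
        std::cout << v << ' ';   // prints: 255 0 20
    }
    std::cout << '\n';
}
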