aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGeorgios Pinitas <georgios.pinitas@arm.com>2017-07-04 12:47:17 +0100
committerAnthony Barbier <anthony.barbier@arm.com>2018-09-17 14:15:39 +0100
commit21efeb4491feab09dc246f4da0023d7ca79b1d32 (patch)
tree2fbacd7676d13a3ec7024e517acf2e462355275a
parent368da83fdd7406d629e8cca64f3eb0af05437419 (diff)
downloadComputeLibrary-21efeb4491feab09dc246f4da0023d7ca79b1d32.tar.gz
COMPMID-417: DepthConvert NEON for QS8/QS16.
Change-Id: Ieb120bccf146045b3a0001ceb3893d4e67fd19df Reviewed-on: http://mpd-gerrit.cambridge.arm.com/79763 Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com> Reviewed-by: Steven Niu <steven.niu@arm.com>
-rw-r--r--arm_compute/core/FixedPoint.h4
-rw-r--r--arm_compute/core/FixedPoint.inl8
-rw-r--r--arm_compute/core/NEON/NEFixedPoint.h16
-rw-r--r--arm_compute/core/NEON/NEFixedPoint.inl28
-rw-r--r--arm_compute/runtime/NEON/functions/NEDepthConvert.h7
-rw-r--r--src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp4
-rw-r--r--src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp4
-rw-r--r--src/core/NEON/kernels/NEActivationLayerKernel.cpp6
-rw-r--r--src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp2
-rw-r--r--src/core/NEON/kernels/NEDepthConvertKernel.cpp83
-rw-r--r--src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp4
-rw-r--r--src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp8
-rw-r--r--src/core/NEON/kernels/NEIm2ColKernel.cpp8
-rw-r--r--src/runtime/NEON/functions/NEDepthConvert.cpp7
-rw-r--r--tests/TensorLibrary.h1
-rw-r--r--tests/validation/FixedPoint.h2
-rw-r--r--tests/validation/NEON/DepthConvert.cpp83
-rw-r--r--tests/validation/TensorOperations.h90
18 files changed, 237 insertions, 128 deletions
diff --git a/arm_compute/core/FixedPoint.h b/arm_compute/core/FixedPoint.h
index da304c6329..5eb4c55c41 100644
--- a/arm_compute/core/FixedPoint.h
+++ b/arm_compute/core/FixedPoint.h
@@ -296,7 +296,7 @@ float scvt_f32_qs8(qint8_t a, int fixed_point_position);
*
* @return The result of the conversion float -> 8 bit fixed point
*/
-qint8_t scvt_qs8_f32(float a, int fixed_point_position);
+qint8_t sqcvt_qs8_f32(float a, int fixed_point_position);
/** Convert a 16 bit fixed point to float
*
@@ -314,7 +314,7 @@ float scvt_f32_qs16(qint16_t a, int fixed_point_position);
*
* @return The result of the conversion float -> 16 bit fixed point
*/
-qint16_t scvt_qs16_f32(float a, int fixed_point_position);
+qint16_t sqcvt_qs16_f32(float a, int fixed_point_position);
/** Scalar saturating move and narrow.
*
diff --git a/arm_compute/core/FixedPoint.inl b/arm_compute/core/FixedPoint.inl
index fab91d6699..fdbc3f0c06 100644
--- a/arm_compute/core/FixedPoint.inl
+++ b/arm_compute/core/FixedPoint.inl
@@ -366,10 +366,10 @@ inline float scvt_f32_qs8(qint8_t a, int fixed_point_position)
return static_cast<float>(a) / (1 << fixed_point_position);
}
-inline qint8_t scvt_qs8_f32(float a, int fixed_point_position)
+inline qint8_t sqcvt_qs8_f32(float a, int fixed_point_position)
{
// round_nearest_integer(a * 2^(fixed_point_position))
- return static_cast<qint8_t>(a * (1 << fixed_point_position) + 0.5f);
+ return saturate_convert<float, qint8_t>(a * (1 << fixed_point_position) + ((a >= 0) ? 0.5 : -0.5));
}
inline float scvt_f32_qs16(qint16_t a, int fixed_point_position)
@@ -377,10 +377,10 @@ inline float scvt_f32_qs16(qint16_t a, int fixed_point_position)
return static_cast<float>(a) / (1 << fixed_point_position);
}
-inline qint16_t scvt_qs16_f32(float a, int fixed_point_position)
+inline qint16_t sqcvt_qs16_f32(float a, int fixed_point_position)
{
// round_nearest_integer(a * 2^(fixed_point_position))
- return static_cast<qint16_t>(a * (1 << fixed_point_position) + 0.5f);
+ return saturate_convert<float, qint16_t>(a * (1 << fixed_point_position) + ((a >= 0) ? 0.5 : -0.5));
}
inline qint8_t sqmovn_qs16(qint16_t a)
diff --git a/arm_compute/core/NEON/NEFixedPoint.h b/arm_compute/core/NEON/NEFixedPoint.h
index 660464eb62..e3eb5d4638 100644
--- a/arm_compute/core/NEON/NEFixedPoint.h
+++ b/arm_compute/core/NEON/NEFixedPoint.h
@@ -788,36 +788,36 @@ qint32x4_t vqmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point
* @param[in] a Float input vector
* @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
*
- * @return The result of the conversion float -> 8 bit fixed point
+ * @return The result of the conversion float -> 8 bit fixed point. The result is saturated in case of overflow
*/
-qint8x8_t vcvt_qs8_f32(const float32x4x2_t a, int fixed_point_position);
+qint8x8_t vqcvt_qs8_f32(const float32x4x2_t a, int fixed_point_position);
/** Convert a float vector with 4 elements to 16 bit fixed point vector with 4 elements
*
* @param[in] a Float input vector
* @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
*
- * @return The result of the conversion float -> 16 bit fixed point
+ * @return The result of the conversion float -> 16 bit fixed point. The result is saturated in case of overflow
*/
-qint16x4_t vcvt_qs16_f32(const float32x4_t a, int fixed_point_position);
+qint16x4_t vqcvt_qs16_f32(const float32x4_t a, int fixed_point_position);
/** Convert a float vector with 4x4 elements to 8 bit fixed point vector with 16 elements
*
* @param[in] a Float input vector
* @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
*
- * @return The result of the conversion float -> 8 bit fixed point
+ * @return The result of the conversion float -> 8 bit fixed point. The result is saturated in case of overflow
*/
-qint8x16_t vcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position);
+qint8x16_t vqcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position);
/** Convert a float vector with 4x2 elements to 16 bit fixed point vector with 8 elements
*
* @param[in] a Float input vector
* @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
*
- * @return The result of the conversion float -> 16 bit fixed point
+ * @return The result of the conversion float -> 16 bit fixed point. The result is saturated in case of overflow
*/
-qint16x8_t vcvtq_qs16_f32(const float32x4x2_t &a, int fixed_point_position);
+qint16x8_t vqcvtq_qs16_f32(const float32x4x2_t &a, int fixed_point_position);
/** Convert a 8 bit fixed point vector with 8 elements to a float vector with 4x2 elements
*
diff --git a/arm_compute/core/NEON/NEFixedPoint.inl b/arm_compute/core/NEON/NEFixedPoint.inl
index 4f7f44ab3b..92af82cf71 100644
--- a/arm_compute/core/NEON/NEFixedPoint.inl
+++ b/arm_compute/core/NEON/NEFixedPoint.inl
@@ -236,7 +236,7 @@ inline qint8x16_t vdupq_n_qs8_f32(float a, int fixed_point_position)
vdupq_n_f32(a),
}
};
- return vcvtq_qs8_f32(res, fixed_point_position);
+ return vqcvtq_qs8_f32(res, fixed_point_position);
}
inline qint16x8_t vdupq_n_qs16(qint16_t a)
@@ -809,15 +809,15 @@ inline qint32x4_t vqmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixe
return vqaddq_s32(a, tmp);
}
-inline qint8x8_t vcvt_qs8_f32(const float32x4x2_t &a, int fixed_point_position)
+inline qint8x8_t vqcvt_qs8_f32(const float32x4x2_t &a, int fixed_point_position)
{
const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
float32x4x2_t res_f32 =
{
{
- vdupq_n_f32(0.5f),
- vdupq_n_f32(0.5f)
+ vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
+ vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
}
};
@@ -837,11 +837,11 @@ inline qint8x8_t vcvt_qs8_f32(const float32x4x2_t &a, int fixed_point_position)
return vqmovn_s16(res_s16);
}
-inline qint16x4_t vcvt_qs16_f32(const float32x4_t a, int fixed_point_position)
+inline qint16x4_t vqcvt_qs16_f32(const float32x4_t a, int fixed_point_position)
{
const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
- float32x4_t res_f32 = vdupq_n_f32(0.5f);
+ float32x4_t res_f32 = vbslq_f32(vcgeq_f32(a, vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f));
res_f32 = vmlaq_f32(res_f32, a, pow2);
@@ -850,17 +850,17 @@ inline qint16x4_t vcvt_qs16_f32(const float32x4_t a, int fixed_point_position)
return vqmovn_s32(res_s32);
}
-inline qint8x16_t vcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position)
+inline qint8x16_t vqcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position)
{
const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
float32x4x4_t res_f32 =
{
{
- vdupq_n_f32(0.5f),
- vdupq_n_f32(0.5f),
- vdupq_n_f32(0.5f),
- vdupq_n_f32(0.5f)
+ vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
+ vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
+ vbslq_f32(vcgeq_f32(a.val[2], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
+ vbslq_f32(vcgeq_f32(a.val[3], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
}
};
@@ -890,15 +890,15 @@ inline qint8x16_t vcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position
return vcombine_s8(vqmovn_s16(res_s16.val[0]), vqmovn_s16(res_s16.val[1]));
}
-inline qint16x8_t vcvtq_qs16_f32(const float32x4x2_t &a, int fixed_point_position)
+inline qint16x8_t vqcvtq_qs16_f32(const float32x4x2_t &a, int fixed_point_position)
{
const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
float32x4x2_t res_f32 =
{
{
- vdupq_n_f32(0.5f),
- vdupq_n_f32(0.5f)
+ vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
+ vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
}
};
diff --git a/arm_compute/runtime/NEON/functions/NEDepthConvert.h b/arm_compute/runtime/NEON/functions/NEDepthConvert.h
index 7c59ce432d..47b3a7e6f6 100644
--- a/arm_compute/runtime/NEON/functions/NEDepthConvert.h
+++ b/arm_compute/runtime/NEON/functions/NEDepthConvert.h
@@ -52,11 +52,12 @@ public:
* U8 -> U16, S16, S32
* U16 -> U8, U32
* S16 -> U8, S32
- * F32 -> QS8
+ * QS16 -> F32
+ * F32 -> QS8, QS16
*
*
- * @param[in] input The input tensor to convert. Data type supported: QS8/U8/U16/S16/F32.
- * @param[out] output The output tensor. Data type supported: QS8/U8/U16/S16/U32/S32/F32.
+ * @param[in] input The input tensor to convert. Data type supported: QS8/U8/U16/S16/QS16/F32.
+ * @param[out] output The output tensor. Data type supported: QS8/U8/U16/S16/QS16/U32/S32/F32.
* @param[in] policy Conversion policy.
* @param[in] shift Value for down/up conversions. Must be 0 <= shift < 8.
* It is not used on fixed point conversion.
diff --git a/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp
index d1cdd7dc61..1499df0bec 100644
--- a/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp
@@ -56,8 +56,8 @@ void CLGEMMMatrixAdditionKernel::configure(const ICLTensor *input, ICLTensor *ou
if(is_data_type_fixed_point(input->info()->data_type()))
{
ma_arguments << "-DBETA=" << (input->info()->data_type() == DataType::QS8 ?
- scvt_qs8_f32(beta, input->info()->fixed_point_position()) :
- scvt_qs16_f32(beta, input->info()->fixed_point_position()))
+ sqcvt_qs8_f32(beta, input->info()->fixed_point_position()) :
+ sqcvt_qs16_f32(beta, input->info()->fixed_point_position()))
<< " ";
ma_arguments << "-DFIXED_POINT_POSITION=" << input->info()->fixed_point_position();
}
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
index 2d6b83a97d..c65b9e0e1a 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
@@ -77,8 +77,8 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen
if(is_data_type_fixed_point(input0->info()->data_type()))
{
mm_arguments << "-DALPHA=" << (input0->info()->data_type() == DataType::QS8 ?
- scvt_qs8_f32(alpha, input0->info()->fixed_point_position()) :
- scvt_qs16_f32(alpha, input0->info()->fixed_point_position()))
+ sqcvt_qs8_f32(alpha, input0->info()->fixed_point_position()) :
+ sqcvt_qs16_f32(alpha, input0->info()->fixed_point_position()))
<< " ";
mm_arguments << "-DFIXED_POINT_POSITION=" << input0->info()->fixed_point_position() << " ";
}
diff --git a/src/core/NEON/kernels/NEActivationLayerKernel.cpp b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
index 1bd0353b93..492d197925 100644
--- a/src/core/NEON/kernels/NEActivationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
@@ -267,9 +267,9 @@ typename std::enable_if<std::is_same<T, int8_t>::value, void>::type NEActivation
int fixed_point_position = _input->info()->fixed_point_position();
static const qint8x16_t CONST_0 = vdupq_n_qs8(0);
- const qint8x16_t CONST_1 = vdupq_n_qs8(scvt_qs8_f32(1.f, fixed_point_position));
- const qint8x16_t a = vdupq_n_qs8(scvt_qs8_f32(_act_info.a(), fixed_point_position));
- const qint8x16_t b = vdupq_n_qs8(scvt_qs8_f32(_act_info.b(), fixed_point_position));
+ const qint8x16_t CONST_1 = vdupq_n_qs8(sqcvt_qs8_f32(1.f, fixed_point_position));
+ const qint8x16_t a = vdupq_n_qs8(sqcvt_qs8_f32(_act_info.a(), fixed_point_position));
+ const qint8x16_t b = vdupq_n_qs8(sqcvt_qs8_f32(_act_info.b(), fixed_point_position));
execute_window_loop(window, [&](const Coordinates & id)
{
diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
index e6f233cf5d..d0aec6965c 100644
--- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
@@ -58,7 +58,7 @@ void batch_normalization_q8(const ITensor *in, ITensor *out, const ITensor *mean
qint8x16_t gamma_vec = vdupq_n_qs8(0);
qint8x16_t beta_vec = vdupq_n_qs8(0);
qint8x16_t denominator = vdupq_n_qs8(0);
- const qint8x16_t epsilon_vec = vdupq_n_qs8(scvt_qs8_f32(epsilon, fixed_point_position));
+ const qint8x16_t epsilon_vec = vdupq_n_qs8(sqcvt_qs8_f32(epsilon, fixed_point_position));
execute_window_loop(window, [&](const Coordinates & id)
{
if(slice != id.z())
diff --git a/src/core/NEON/kernels/NEDepthConvertKernel.cpp b/src/core/NEON/kernels/NEDepthConvertKernel.cpp
index 56612a7703..3c1a94df74 100644
--- a/src/core/NEON/kernels/NEDepthConvertKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthConvertKernel.cpp
@@ -46,27 +46,35 @@ NEDepthConvertKernel::NEDepthConvertKernel()
void NEDepthConvertKernel::configure(const ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::U16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::U16, DataType::U32, DataType::S32, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::U16, DataType::QS16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::U16, DataType::QS16, DataType::U32, DataType::S32, DataType::F32);
ARM_COMPUTE_ERROR_ON(shift >= 8);
ARM_COMPUTE_ERROR_ON(input == output);
ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == output->info()->data_type(), "Input and output data_types must be different");
- ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::QS8 && (output->info()->data_type() != DataType::F32),
- "Only data_types supported [in] QS8 -> [out] F32");
-
ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U8 && (output->info()->data_type() != DataType::S16 && output->info()->data_type() != DataType::U16
&& output->info()->data_type() != DataType::S32),
"Only data_types supported [in] U8 -> [out] U16, S16, S32");
+ ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::QS8 && output->info()->data_type() != DataType::F32,
+ "Only data_types supported [in] QS8 -> [out] F32");
+
ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U32),
"Only data_types supported [in] U16 -> [out] U8, U32");
ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::S16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::S32),
"Only data_types supported [in] S16 -> [out] U8, S32");
- ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::F32 && (output->info()->data_type() != DataType::QS8),
- "Only data_types supported [in] F32 -> [out] QS8");
+ ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::QS16 && output->info()->data_type() != DataType::F32,
+ "Only data_types supported [in] QS16 -> [out] F32");
+
+ ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::F32 && (output->info()->data_type() != DataType::QS8 && output->info()->data_type() != DataType::QS16),
+ "Only data_types supported [in] F32 -> [out] QS8, QS16");
+
+ // Auto initialize output shape if not initialized (We can only auto-configure the shape, datatype must be given)
+ set_shape_if_empty(*output->info(), input->info()->tensor_shape());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
_policy = policy;
_shift = shift;
@@ -346,6 +354,38 @@ void NEDepthConvertKernel::run(const Window &window)
}
break;
}
+ case DataType::QS16:
+ {
+ const int fixed_point_position = _input->info()->fixed_point_position();
+
+ switch(_output->info()->data_type())
+ {
+ case DataType::F32:
+ {
+ /* Up-conversion QS16 -> F32 */
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const int16x8x2_t texels =
+ {
+ {
+ vld1q_s16(reinterpret_cast<qint16_t *>(input.ptr())),
+ vld1q_s16(reinterpret_cast<qint16_t *>(input.ptr()) + 8)
+ }
+ };
+
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()), vcvt_f32_qs16(vget_low_s16(texels.val[0]), fixed_point_position));
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 4, vcvt_f32_qs16(vget_high_s16(texels.val[0]), fixed_point_position));
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 8, vcvt_f32_qs16(vget_low_s16(texels.val[1]), fixed_point_position));
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 12, vcvt_f32_qs16(vget_high_s16(texels.val[1]), fixed_point_position));
+ },
+ input, output);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Output data type not supported");
+ }
+ break;
+ }
case DataType::F32:
{
switch(_output->info()->data_type())
@@ -366,13 +406,40 @@ void NEDepthConvertKernel::run(const Window &window)
}
};
- const qint8x16_t texels_s8 = vcvtq_qs8_f32(texels_f32, fixed_point_position);
+ const qint8x16_t texels_s8 = vqcvtq_qs8_f32(texels_f32, fixed_point_position);
vst1q_s8(reinterpret_cast<int8_t *>(output.ptr()), texels_s8);
},
input, output);
break;
}
+ case DataType::QS16:
+ {
+ const int fixed_point_position = _output->info()->fixed_point_position();
+ /* Down-conversion F32 -> QS16 */
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const float32x4x2_t texels_f32_1 =
+ {
+ {
+ vld1q_f32(reinterpret_cast<const float *>(input.ptr())),
+ vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 4),
+ }
+ };
+ const float32x4x2_t texels_f32_2 =
+ {
+ {
+ vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 8),
+ vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 12)
+ }
+ };
+
+ vst1q_s16(reinterpret_cast<qint16_t *>(output.ptr()), vqcvtq_qs16_f32(texels_f32_1, fixed_point_position));
+ vst1q_s16(reinterpret_cast<qint16_t *>(output.ptr()) + 8, vqcvtq_qs16_f32(texels_f32_2, fixed_point_position));
+ },
+ input, output);
+ break;
+ }
default:
ARM_COMPUTE_ERROR("Output data type not supported");
}
diff --git a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
index 91fbe6f962..f2cd18d827 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
@@ -94,7 +94,7 @@ void matrix_addition_f16(const ITensor *input, ITensor *output, const Window &wi
void matrix_addition_qs8(const ITensor *input, ITensor *output, const Window &window, float beta)
{
const int fixed_point_position = input->info()->fixed_point_position();
- const qint8x16_t beta_qs8 = vdupq_n_qs8(scvt_qs8_f32(beta, fixed_point_position));
+ const qint8x16_t beta_qs8 = vdupq_n_qs8(sqcvt_qs8_f32(beta, fixed_point_position));
Iterator in(input, window);
Iterator out(output, window);
@@ -118,7 +118,7 @@ void matrix_addition_qs8(const ITensor *input, ITensor *output, const Window &wi
void matrix_addition_qs16(const ITensor *input, ITensor *output, const Window &window, float beta)
{
const int fixed_point_position = input->info()->fixed_point_position();
- const qint16x8_t beta_qs16 = vdupq_n_qs16(scvt_qs16_f32(beta, fixed_point_position));
+ const qint16x8_t beta_qs16 = vdupq_n_qs16(sqcvt_qs16_f32(beta, fixed_point_position));
Iterator in(input, window);
Iterator out(output, window);
diff --git a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
index b81be6cee9..8381dd8a73 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
@@ -456,7 +456,7 @@ void vector_matrix_multiply_qs8(const ITensor *input0, const ITensor *input1, IT
// Multiply by the weight of the matrix product (alpha)
if(multiply_alpha)
{
- const qint8x8_t alpha_qs8 = vdup_n_qs8(scvt_qs8_f32(alpha, fixed_point_position));
+ const qint8x8_t alpha_qs8 = vdup_n_qs8(sqcvt_qs8_f32(alpha, fixed_point_position));
acc00_qs8 = vqmul_qs8(acc00_qs8, alpha_qs8, fixed_point_position);
acc01_qs8 = vqmul_qs8(acc01_qs8, alpha_qs8, fixed_point_position);
acc02_qs8 = vqmul_qs8(acc02_qs8, alpha_qs8, fixed_point_position);
@@ -585,7 +585,7 @@ void vector_matrix_multiply_qs16(const ITensor *input0, const ITensor *input1, I
// Multiply by the weight of the matrix product (alpha)
if(multiply_alpha)
{
- const qint16x4_t alpha_qs16 = vdup_n_qs16(scvt_qs16_f32(alpha, fixed_point_position));
+ const qint16x4_t alpha_qs16 = vdup_n_qs16(sqcvt_qs16_f32(alpha, fixed_point_position));
acc00_qs16 = vqmul_qs16(acc00_qs16, alpha_qs16, fixed_point_position);
acc01_qs16 = vqmul_qs16(acc01_qs16, alpha_qs16, fixed_point_position);
acc02_qs16 = vqmul_qs16(acc02_qs16, alpha_qs16, fixed_point_position);
@@ -1058,7 +1058,7 @@ void matrix_matrix_multiply_qs8(const ITensor *input0, const ITensor *input1, IT
const size_t out_stride3 = out_stride1 * 3;
const int num_elems_matrix_b_x = input1->info()->dimension(0);
const int fixed_point_position = input0->info()->fixed_point_position();
- const qint8x8_t alpha_qs8 = vdup_n_qs8(scvt_qs8_f32(alpha, fixed_point_position));
+ const qint8x8_t alpha_qs8 = vdup_n_qs8(sqcvt_qs8_f32(alpha, fixed_point_position));
ARM_COMPUTE_UNUSED(alpha_qs8);
// Set step_x and step_y for matrix A. Scale by a factor of 4 the Y range as the input interleaved matrix A has 4 times less the rows of the output matrix
@@ -1291,7 +1291,7 @@ void matrix_matrix_multiply_qs16(const ITensor *input0, const ITensor *input1, I
const size_t out_stride3 = out_stride1 * 3;
const int num_elems_matrix_b_x = input1->info()->dimension(0);
const int fixed_point_position = input0->info()->fixed_point_position();
- const qint16x4_t alpha_qs16 = vdup_n_qs16(scvt_qs16_f32(alpha, fixed_point_position));
+ const qint16x4_t alpha_qs16 = vdup_n_qs16(sqcvt_qs16_f32(alpha, fixed_point_position));
ARM_COMPUTE_UNUSED(alpha_qs16);
// Set step_x and step_y for matrix A. Scale by a factor of 4 the Y range as the input interleaved matrix A has 4 times less the rows of the output matrix
diff --git a/src/core/NEON/kernels/NEIm2ColKernel.cpp b/src/core/NEON/kernels/NEIm2ColKernel.cpp
index 5bb8b1c22a..e4de60df80 100644
--- a/src/core/NEON/kernels/NEIm2ColKernel.cpp
+++ b/src/core/NEON/kernels/NEIm2ColKernel.cpp
@@ -136,11 +136,11 @@ inline void linearize_volume(const uint8_t *const in_ptr,
{
if(std::is_same<T, qint8_t>::value)
{
- *out_ptr = scvt_qs8_f32(1.0f, fixed_point_position);
+ *out_ptr = sqcvt_qs8_f32(1.0f, fixed_point_position);
}
else if(std::is_same<T, qint16_t>::value)
{
- *out_ptr = scvt_qs16_f32(1.0f, fixed_point_position);
+ *out_ptr = sqcvt_qs16_f32(1.0f, fixed_point_position);
}
else
{
@@ -255,11 +255,11 @@ void NEIm2ColKernel::run_reduced(const Window &window)
{
if(std::is_same<T, qint8_t>::value)
{
- *(reinterpret_cast<T *>(out_ptr) + out_width - 1) = scvt_qs8_f32(1.0f, _input->info()->fixed_point_position());
+ *(reinterpret_cast<T *>(out_ptr) + out_width - 1) = sqcvt_qs8_f32(1.0f, _input->info()->fixed_point_position());
}
else if(std::is_same<T, qint16_t>::value)
{
- *(reinterpret_cast<T *>(out_ptr) + out_width - 1) = scvt_qs16_f32(1.0f, _input->info()->fixed_point_position());
+ *(reinterpret_cast<T *>(out_ptr) + out_width - 1) = sqcvt_qs16_f32(1.0f, _input->info()->fixed_point_position());
}
else
{
diff --git a/src/runtime/NEON/functions/NEDepthConvert.cpp b/src/runtime/NEON/functions/NEDepthConvert.cpp
index 011e366bf3..24b51493c6 100644
--- a/src/runtime/NEON/functions/NEDepthConvert.cpp
+++ b/src/runtime/NEON/functions/NEDepthConvert.cpp
@@ -23,9 +23,7 @@
*/
#include "arm_compute/runtime/NEON/functions/NEDepthConvert.h"
-#include "arm_compute/core/Error.h"
#include "arm_compute/core/NEON/kernels/NEDepthConvertKernel.h"
-#include "arm_compute/core/Validate.h"
#include "support/ToolchainSupport.h"
#include <utility>
@@ -34,11 +32,6 @@ using namespace arm_compute;
void NEDepthConvert::configure(const ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QS8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F32);
- ARM_COMPUTE_ERROR_ON(input == output);
- ARM_COMPUTE_ERROR_ON(input->info()->data_type() == output->info()->data_type());
-
auto k = arm_compute::support::cpp14::make_unique<NEDepthConvertKernel>();
k->configure(input, output, policy, shift);
_kernel = std::move(k);
diff --git a/tests/TensorLibrary.h b/tests/TensorLibrary.h
index 5b2c5b6fd5..3fb593c886 100644
--- a/tests/TensorLibrary.h
+++ b/tests/TensorLibrary.h
@@ -560,6 +560,7 @@ void TensorLibrary::fill_tensor_uniform(T &&tensor, std::random_device::result_t
break;
}
case DataType::S16:
+ case DataType::QS16:
{
ARM_COMPUTE_ERROR_ON(!(std::is_same<int16_t, D>::value));
std::uniform_int_distribution<int16_t> distribution_s16(low, high);
diff --git a/tests/validation/FixedPoint.h b/tests/validation/FixedPoint.h
index 53f532cd8c..261fcd6df6 100644
--- a/tests/validation/FixedPoint.h
+++ b/tests/validation/FixedPoint.h
@@ -333,7 +333,7 @@ struct constant_expr
*/
static constexpr T to_fixed(float val, uint8_t p)
{
- return static_cast<T>(val * fixed_one(p) + ((val >= 0) ? 0.5 : -0.5));
+ return static_cast<T>(saturate_cast<float>(val * fixed_one(p) + ((val >= 0) ? 0.5 : -0.5)));
}
/** Clamp value between two ranges
*
diff --git a/tests/validation/NEON/DepthConvert.cpp b/tests/validation/NEON/DepthConvert.cpp
index 4a37d98990..65d3ab1be7 100644
--- a/tests/validation/NEON/DepthConvert.cpp
+++ b/tests/validation/NEON/DepthConvert.cpp
@@ -208,6 +208,89 @@ BOOST_DATA_TEST_CASE(RunLarge, LargeShapes() * boost::unit_test::data::make({ Co
}
BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE(QS16_to_F32)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Configuration, (SmallShapes() + LargeShapes()) * boost::unit_test::data::make({ ConvertPolicy::SATURATE })
+ * boost::unit_test::data::xrange(1, 15, 1),
+ shape, policy, fixed_point_position)
+{
+ // Compute configure and validate region/padding
+ compute_configure_validate(shape, DataType::QS16, DataType::F32, policy, 0, fixed_point_position);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall, SmallShapes() * boost::unit_test::data::make({ ConvertPolicy::SATURATE })
+ * boost::unit_test::data::xrange(1, 15, 1),
+ shape, policy, fixed_point_position)
+{
+ // Compute function
+ Tensor dst = compute_depth_convert(shape, DataType::QS16, DataType::F32, policy, 0, fixed_point_position);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_depth_convert(shape, DataType::QS16, DataType::F32, policy, 0, fixed_point_position);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunLarge, LargeShapes() * boost::unit_test::data::make({ ConvertPolicy::SATURATE })
+ * boost::unit_test::data::xrange(1, 15, 1),
+ shape, policy, fixed_point_position)
+{
+ // Compute function
+ Tensor dst = compute_depth_convert(shape, DataType::QS16, DataType::F32, policy, 0, fixed_point_position);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_depth_convert(shape, DataType::QS16, DataType::F32, policy, 0, fixed_point_position);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+
+BOOST_AUTO_TEST_SUITE_END()
+
+BOOST_AUTO_TEST_SUITE(F32_to_QS16)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(Configuration, (SmallShapes() + LargeShapes()) * boost::unit_test::data::make({ ConvertPolicy::SATURATE })
+ * boost::unit_test::data::xrange(1, 7, 1),
+ shape, policy, fixed_point_position)
+{
+ // Compute configure and validate region/padding
+ compute_configure_validate(shape, DataType::F32, DataType::QS16, policy, 0, fixed_point_position);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(RunSmall, SmallShapes() * boost::unit_test::data::make({ ConvertPolicy::SATURATE })
+ * boost::unit_test::data::xrange(1, 15, 1),
+ shape, policy, fixed_point_position)
+{
+ // Compute function
+ Tensor dst = compute_depth_convert(shape, DataType::F32, DataType::QS16, policy, 0, fixed_point_position);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_depth_convert(shape, DataType::F32, DataType::QS16, policy, 0, fixed_point_position);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunLarge, LargeShapes() * boost::unit_test::data::make({ ConvertPolicy::SATURATE })
+ * boost::unit_test::data::xrange(1, 15, 1),
+ shape, policy, fixed_point_position)
+{
+ // Compute function
+ Tensor dst = compute_depth_convert(shape, DataType::F32, DataType::QS16, policy, 0, fixed_point_position);
+
+ // Compute reference
+ RawTensor ref_dst = Reference::compute_reference_depth_convert(shape, DataType::F32, DataType::QS16, policy, 0, fixed_point_position);
+
+ // Validate output
+ validate(NEAccessor(dst), ref_dst);
+}
+BOOST_AUTO_TEST_SUITE_END()
+
BOOST_AUTO_TEST_SUITE(U8_to_U16)
BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
diff --git a/tests/validation/TensorOperations.h b/tests/validation/TensorOperations.h
index 0502f53186..e90635f0d4 100644
--- a/tests/validation/TensorOperations.h
+++ b/tests/validation/TensorOperations.h
@@ -518,94 +518,58 @@ void box3x3(const Tensor<T> &in, Tensor<T> &out, BorderMode border_mode, T const
}
// Depth conversion
-template <typename T1, typename T2>
+template < typename T1, typename T2, typename std::enable_if < std::is_integral<T1>::value &&std::is_floating_point<T2>::value, int >::type = 0 >
void depth_convert(const Tensor<T1> &in, Tensor<T2> &out, ConvertPolicy policy, uint32_t shift)
{
- ARM_COMPUTE_ERROR("The conversion is not supported");
-}
-
-template <>
-void depth_convert<int8_t, float>(const Tensor<int8_t> &in, Tensor<float> &out, ConvertPolicy policy, uint32_t shift)
-{
- const int8_t fixed_point_position = static_cast<int8_t>(in.fixed_point_position());
- for(int i = 0; i < in.num_elements(); ++i)
- {
- out[i] = static_cast<float>(in[i]) * (1.0f / (1 << fixed_point_position));
- }
-}
-
-template <>
-void depth_convert<float, int8_t>(const Tensor<float> &in, Tensor<int8_t> &out, ConvertPolicy policy, uint32_t shift)
-{
- const int8_t fixed_point_position = static_cast<int8_t>(in.fixed_point_position());
- for(int i = 0; i < in.num_elements(); ++i)
- {
- float val = in[i] * (1 << fixed_point_position) + 0.5f;
- out[i] = ((policy == ConvertPolicy::SATURATE) ? saturate_cast<int8_t>(val) : static_cast<int8_t>(val));
- }
-}
+ using namespace fixed_point_arithmetic;
-template <>
-void depth_convert<uint8_t, uint16_t>(const Tensor<uint8_t> &in, Tensor<uint16_t> &out, ConvertPolicy policy, uint32_t shift)
-{
+ const int fixed_point_position = in.fixed_point_position();
for(int i = 0; i < in.num_elements(); ++i)
{
- out[i] = static_cast<uint16_t>(in[i]) << shift;
+ out[i] = static_cast<float>(fixed_point<T1>(in[i], fixed_point_position, true));
}
}
-template <>
-void depth_convert<uint8_t, int16_t>(const Tensor<uint8_t> &in, Tensor<int16_t> &out, ConvertPolicy policy, uint32_t shift)
+template < typename T1, typename T2, typename std::enable_if < std::is_floating_point<T1>::value &&std::is_integral<T2>::value, int >::type = 0 >
+void depth_convert(const Tensor<T1> &in, Tensor<T2> &out, ConvertPolicy policy, uint32_t shift)
{
- for(int i = 0; i < in.num_elements(); ++i)
- {
- out[i] = static_cast<int16_t>(in[i]) << shift;
- }
-}
+ using namespace fixed_point_arithmetic;
-template <>
-void depth_convert<uint8_t, int32_t>(const Tensor<uint8_t> &in, Tensor<int32_t> &out, ConvertPolicy policy, uint32_t shift)
-{
+ const int fixed_point_position = out.fixed_point_position();
for(int i = 0; i < in.num_elements(); ++i)
{
- out[i] = static_cast<int32_t>(in[i]) << shift;
+ out[i] = fixed_point<T2>(in[i], fixed_point_position).raw();
}
}
-template <>
-void depth_convert<uint16_t, uint8_t>(const Tensor<uint16_t> &in, Tensor<uint8_t> &out, ConvertPolicy policy, uint32_t shift)
+template < typename T1, typename T2, typename std::enable_if < std::is_integral<T1>::value &&std::is_integral<T2>::value, int >::type = 0 >
+void depth_convert(const Tensor<T1> &in, Tensor<T2> &out, ConvertPolicy policy, uint32_t shift)
{
- for(int i = 0; i < in.num_elements(); ++i)
+ // Up-casting
+ if(std::numeric_limits<T1>::digits <= std::numeric_limits<T2>::digits)
{
- uint16_t val = in[i] >> shift;
- out[i] = ((policy == ConvertPolicy::SATURATE) ? saturate_cast<uint8_t>(val) : static_cast<uint8_t>(val));
+ for(int i = 0; i < in.num_elements(); ++i)
+ {
+ out[i] = static_cast<T2>(in[i]) << shift;
+ }
}
-}
-
-template <>
-void depth_convert<uint16_t, uint32_t>(const Tensor<uint16_t> &in, Tensor<uint32_t> &out, ConvertPolicy policy, uint32_t shift)
-{
- for(int i = 0; i < in.num_elements(); ++i)
+ // Down-casting
+ else
{
- out[i] = static_cast<uint32_t>(in[i]) << shift;
+ for(int i = 0; i < in.num_elements(); ++i)
+ {
+ T1 val = in[i] >> shift;
+ out[i] = ((policy == ConvertPolicy::SATURATE) ? saturate_cast<T2>(val) : static_cast<T2>(val));
+ }
}
}
-template <>
-void depth_convert<int16_t, uint8_t>(const Tensor<int16_t> &in, Tensor<uint8_t> &out, ConvertPolicy policy, uint32_t shift)
-{
- for(int i = 0; i < in.num_elements(); ++i)
- {
- int16_t val = in[i] >> shift;
- out[i] = ((policy == ConvertPolicy::SATURATE) ? saturate_cast<uint8_t>(val) : static_cast<uint8_t>(val));
- }
-}
-template <>
-void depth_convert<int16_t, int32_t>(const Tensor<int16_t> &in, Tensor<int32_t> &out, ConvertPolicy policy, uint32_t shift)
+template < typename T1, typename T2, typename std::enable_if < std::is_floating_point<T1>::value &&std::is_floating_point<T2>::value, int >::type = 0 >
+void depth_convert(const Tensor<T1> &in, Tensor<T2> &out, ConvertPolicy policy, uint32_t shift)
{
for(int i = 0; i < in.num_elements(); ++i)
{
- out[i] = static_cast<int32_t>(in[i]) << shift;
+ out[i] = static_cast<T2>(in[i]);
}
}