aboutsummaryrefslogtreecommitdiff
path: root/src/core/NEON
diff options
context:
space:
mode:
authorGeorgios Pinitas <georgios.pinitas@arm.com>2017-07-04 12:47:17 +0100
committerAnthony Barbier <anthony.barbier@arm.com>2018-09-17 14:15:39 +0100
commit21efeb4491feab09dc246f4da0023d7ca79b1d32 (patch)
tree2fbacd7676d13a3ec7024e517acf2e462355275a /src/core/NEON
parent368da83fdd7406d629e8cca64f3eb0af05437419 (diff)
downloadComputeLibrary-21efeb4491feab09dc246f4da0023d7ca79b1d32.tar.gz
COMPMID-417: DepthConvert NEON for QS8/QS16.
Change-Id: Ieb120bccf146045b3a0001ceb3893d4e67fd19df Reviewed-on: http://mpd-gerrit.cambridge.arm.com/79763 Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com> Reviewed-by: Steven Niu <steven.niu@arm.com>
Diffstat (limited to 'src/core/NEON')
-rw-r--r--src/core/NEON/kernels/NEActivationLayerKernel.cpp6
-rw-r--r--src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp2
-rw-r--r--src/core/NEON/kernels/NEDepthConvertKernel.cpp83
-rw-r--r--src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp4
-rw-r--r--src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp8
-rw-r--r--src/core/NEON/kernels/NEIm2ColKernel.cpp8
6 files changed, 89 insertions, 22 deletions
diff --git a/src/core/NEON/kernels/NEActivationLayerKernel.cpp b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
index 1bd0353b93..492d197925 100644
--- a/src/core/NEON/kernels/NEActivationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
@@ -267,9 +267,9 @@ typename std::enable_if<std::is_same<T, int8_t>::value, void>::type NEActivation
int fixed_point_position = _input->info()->fixed_point_position();
static const qint8x16_t CONST_0 = vdupq_n_qs8(0);
- const qint8x16_t CONST_1 = vdupq_n_qs8(scvt_qs8_f32(1.f, fixed_point_position));
- const qint8x16_t a = vdupq_n_qs8(scvt_qs8_f32(_act_info.a(), fixed_point_position));
- const qint8x16_t b = vdupq_n_qs8(scvt_qs8_f32(_act_info.b(), fixed_point_position));
+ const qint8x16_t CONST_1 = vdupq_n_qs8(sqcvt_qs8_f32(1.f, fixed_point_position));
+ const qint8x16_t a = vdupq_n_qs8(sqcvt_qs8_f32(_act_info.a(), fixed_point_position));
+ const qint8x16_t b = vdupq_n_qs8(sqcvt_qs8_f32(_act_info.b(), fixed_point_position));
execute_window_loop(window, [&](const Coordinates & id)
{
diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
index e6f233cf5d..d0aec6965c 100644
--- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
@@ -58,7 +58,7 @@ void batch_normalization_q8(const ITensor *in, ITensor *out, const ITensor *mean
qint8x16_t gamma_vec = vdupq_n_qs8(0);
qint8x16_t beta_vec = vdupq_n_qs8(0);
qint8x16_t denominator = vdupq_n_qs8(0);
- const qint8x16_t epsilon_vec = vdupq_n_qs8(scvt_qs8_f32(epsilon, fixed_point_position));
+ const qint8x16_t epsilon_vec = vdupq_n_qs8(sqcvt_qs8_f32(epsilon, fixed_point_position));
execute_window_loop(window, [&](const Coordinates & id)
{
if(slice != id.z())
diff --git a/src/core/NEON/kernels/NEDepthConvertKernel.cpp b/src/core/NEON/kernels/NEDepthConvertKernel.cpp
index 56612a7703..3c1a94df74 100644
--- a/src/core/NEON/kernels/NEDepthConvertKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthConvertKernel.cpp
@@ -46,27 +46,35 @@ NEDepthConvertKernel::NEDepthConvertKernel()
void NEDepthConvertKernel::configure(const ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::U16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::U16, DataType::U32, DataType::S32, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::U16, DataType::QS16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::U16, DataType::QS16, DataType::U32, DataType::S32, DataType::F32);
ARM_COMPUTE_ERROR_ON(shift >= 8);
ARM_COMPUTE_ERROR_ON(input == output);
ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == output->info()->data_type(), "Input and output data_types must be different");
- ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::QS8 && (output->info()->data_type() != DataType::F32),
- "Only data_types supported [in] QS8 -> [out] F32");
-
ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U8 && (output->info()->data_type() != DataType::S16 && output->info()->data_type() != DataType::U16
&& output->info()->data_type() != DataType::S32),
"Only data_types supported [in] U8 -> [out] U16, S16, S32");
+ ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::QS8 && output->info()->data_type() != DataType::F32,
+ "Only data_types supported [in] QS8 -> [out] F32");
+
ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U32),
"Only data_types supported [in] U16 -> [out] U8, U32");
ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::S16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::S32),
"Only data_types supported [in] S16 -> [out] U8, S32");
- ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::F32 && (output->info()->data_type() != DataType::QS8),
- "Only data_types supported [in] F32 -> [out] QS8");
+ ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::QS16 && output->info()->data_type() != DataType::F32,
+ "Only data_types supported [in] QS16 -> [out] F32");
+
+ ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::F32 && (output->info()->data_type() != DataType::QS8 && output->info()->data_type() != DataType::QS16),
+ "Only data_types supported [in] F32 -> [out] QS8, QS16");
+
+ // Auto initialize output shape if not initialized (We can only auto-configure the shape, datatype must be given)
+ set_shape_if_empty(*output->info(), input->info()->tensor_shape());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
_policy = policy;
_shift = shift;
@@ -346,6 +354,38 @@ void NEDepthConvertKernel::run(const Window &window)
}
break;
}
+ case DataType::QS16:
+ {
+ const int fixed_point_position = _input->info()->fixed_point_position();
+
+ switch(_output->info()->data_type())
+ {
+ case DataType::F32:
+ {
+ /* Up-conversion QS16 -> F32 */
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const int16x8x2_t texels =
+ {
+ {
+ vld1q_s16(reinterpret_cast<qint16_t *>(input.ptr())),
+ vld1q_s16(reinterpret_cast<qint16_t *>(input.ptr()) + 8)
+ }
+ };
+
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()), vcvt_f32_qs16(vget_low_s16(texels.val[0]), fixed_point_position));
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 4, vcvt_f32_qs16(vget_high_s16(texels.val[0]), fixed_point_position));
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 8, vcvt_f32_qs16(vget_low_s16(texels.val[1]), fixed_point_position));
+ vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 12, vcvt_f32_qs16(vget_high_s16(texels.val[1]), fixed_point_position));
+ },
+ input, output);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Output data type not supported");
+ }
+ break;
+ }
case DataType::F32:
{
switch(_output->info()->data_type())
@@ -366,13 +406,40 @@ void NEDepthConvertKernel::run(const Window &window)
}
};
- const qint8x16_t texels_s8 = vcvtq_qs8_f32(texels_f32, fixed_point_position);
+ const qint8x16_t texels_s8 = vqcvtq_qs8_f32(texels_f32, fixed_point_position);
vst1q_s8(reinterpret_cast<int8_t *>(output.ptr()), texels_s8);
},
input, output);
break;
}
+ case DataType::QS16:
+ {
+ const int fixed_point_position = _output->info()->fixed_point_position();
+ /* Down-conversion F32 -> QS16 */
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const float32x4x2_t texels_f32_1 =
+ {
+ {
+ vld1q_f32(reinterpret_cast<const float *>(input.ptr())),
+ vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 4),
+ }
+ };
+ const float32x4x2_t texels_f32_2 =
+ {
+ {
+ vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 8),
+ vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 12)
+ }
+ };
+
+ vst1q_s16(reinterpret_cast<qint16_t *>(output.ptr()), vqcvtq_qs16_f32(texels_f32_1, fixed_point_position));
+ vst1q_s16(reinterpret_cast<qint16_t *>(output.ptr()) + 8, vqcvtq_qs16_f32(texels_f32_2, fixed_point_position));
+ },
+ input, output);
+ break;
+ }
default:
ARM_COMPUTE_ERROR("Output data type not supported");
}
diff --git a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
index 91fbe6f962..f2cd18d827 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
@@ -94,7 +94,7 @@ void matrix_addition_f16(const ITensor *input, ITensor *output, const Window &wi
void matrix_addition_qs8(const ITensor *input, ITensor *output, const Window &window, float beta)
{
const int fixed_point_position = input->info()->fixed_point_position();
- const qint8x16_t beta_qs8 = vdupq_n_qs8(scvt_qs8_f32(beta, fixed_point_position));
+ const qint8x16_t beta_qs8 = vdupq_n_qs8(sqcvt_qs8_f32(beta, fixed_point_position));
Iterator in(input, window);
Iterator out(output, window);
@@ -118,7 +118,7 @@ void matrix_addition_qs8(const ITensor *input, ITensor *output, const Window &wi
void matrix_addition_qs16(const ITensor *input, ITensor *output, const Window &window, float beta)
{
const int fixed_point_position = input->info()->fixed_point_position();
- const qint16x8_t beta_qs16 = vdupq_n_qs16(scvt_qs16_f32(beta, fixed_point_position));
+ const qint16x8_t beta_qs16 = vdupq_n_qs16(sqcvt_qs16_f32(beta, fixed_point_position));
Iterator in(input, window);
Iterator out(output, window);
diff --git a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
index b81be6cee9..8381dd8a73 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
@@ -456,7 +456,7 @@ void vector_matrix_multiply_qs8(const ITensor *input0, const ITensor *input1, IT
// Multiply by the weight of the matrix product (alpha)
if(multiply_alpha)
{
- const qint8x8_t alpha_qs8 = vdup_n_qs8(scvt_qs8_f32(alpha, fixed_point_position));
+ const qint8x8_t alpha_qs8 = vdup_n_qs8(sqcvt_qs8_f32(alpha, fixed_point_position));
acc00_qs8 = vqmul_qs8(acc00_qs8, alpha_qs8, fixed_point_position);
acc01_qs8 = vqmul_qs8(acc01_qs8, alpha_qs8, fixed_point_position);
acc02_qs8 = vqmul_qs8(acc02_qs8, alpha_qs8, fixed_point_position);
@@ -585,7 +585,7 @@ void vector_matrix_multiply_qs16(const ITensor *input0, const ITensor *input1, I
// Multiply by the weight of the matrix product (alpha)
if(multiply_alpha)
{
- const qint16x4_t alpha_qs16 = vdup_n_qs16(scvt_qs16_f32(alpha, fixed_point_position));
+ const qint16x4_t alpha_qs16 = vdup_n_qs16(sqcvt_qs16_f32(alpha, fixed_point_position));
acc00_qs16 = vqmul_qs16(acc00_qs16, alpha_qs16, fixed_point_position);
acc01_qs16 = vqmul_qs16(acc01_qs16, alpha_qs16, fixed_point_position);
acc02_qs16 = vqmul_qs16(acc02_qs16, alpha_qs16, fixed_point_position);
@@ -1058,7 +1058,7 @@ void matrix_matrix_multiply_qs8(const ITensor *input0, const ITensor *input1, IT
const size_t out_stride3 = out_stride1 * 3;
const int num_elems_matrix_b_x = input1->info()->dimension(0);
const int fixed_point_position = input0->info()->fixed_point_position();
- const qint8x8_t alpha_qs8 = vdup_n_qs8(scvt_qs8_f32(alpha, fixed_point_position));
+ const qint8x8_t alpha_qs8 = vdup_n_qs8(sqcvt_qs8_f32(alpha, fixed_point_position));
ARM_COMPUTE_UNUSED(alpha_qs8);
// Set step_x and step_y for matrix A. Scale by a factor of 4 the Y range as the input interleaved matrix A has 4 times less the rows of the output matrix
@@ -1291,7 +1291,7 @@ void matrix_matrix_multiply_qs16(const ITensor *input0, const ITensor *input1, I
const size_t out_stride3 = out_stride1 * 3;
const int num_elems_matrix_b_x = input1->info()->dimension(0);
const int fixed_point_position = input0->info()->fixed_point_position();
- const qint16x4_t alpha_qs16 = vdup_n_qs16(scvt_qs16_f32(alpha, fixed_point_position));
+ const qint16x4_t alpha_qs16 = vdup_n_qs16(sqcvt_qs16_f32(alpha, fixed_point_position));
ARM_COMPUTE_UNUSED(alpha_qs16);
// Set step_x and step_y for matrix A. Scale by a factor of 4 the Y range as the input interleaved matrix A has 4 times less the rows of the output matrix
diff --git a/src/core/NEON/kernels/NEIm2ColKernel.cpp b/src/core/NEON/kernels/NEIm2ColKernel.cpp
index 5bb8b1c22a..e4de60df80 100644
--- a/src/core/NEON/kernels/NEIm2ColKernel.cpp
+++ b/src/core/NEON/kernels/NEIm2ColKernel.cpp
@@ -136,11 +136,11 @@ inline void linearize_volume(const uint8_t *const in_ptr,
{
if(std::is_same<T, qint8_t>::value)
{
- *out_ptr = scvt_qs8_f32(1.0f, fixed_point_position);
+ *out_ptr = sqcvt_qs8_f32(1.0f, fixed_point_position);
}
else if(std::is_same<T, qint16_t>::value)
{
- *out_ptr = scvt_qs16_f32(1.0f, fixed_point_position);
+ *out_ptr = sqcvt_qs16_f32(1.0f, fixed_point_position);
}
else
{
@@ -255,11 +255,11 @@ void NEIm2ColKernel::run_reduced(const Window &window)
{
if(std::is_same<T, qint8_t>::value)
{
- *(reinterpret_cast<T *>(out_ptr) + out_width - 1) = scvt_qs8_f32(1.0f, _input->info()->fixed_point_position());
+ *(reinterpret_cast<T *>(out_ptr) + out_width - 1) = sqcvt_qs8_f32(1.0f, _input->info()->fixed_point_position());
}
else if(std::is_same<T, qint16_t>::value)
{
- *(reinterpret_cast<T *>(out_ptr) + out_width - 1) = scvt_qs16_f32(1.0f, _input->info()->fixed_point_position());
+ *(reinterpret_cast<T *>(out_ptr) + out_width - 1) = sqcvt_qs16_f32(1.0f, _input->info()->fixed_point_position());
}
else
{