From 21efeb4491feab09dc246f4da0023d7ca79b1d32 Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Tue, 4 Jul 2017 12:47:17 +0100 Subject: COMPMID-417: DepthConvert NEON for QS8/QS16. Change-Id: Ieb120bccf146045b3a0001ceb3893d4e67fd19df Reviewed-on: http://mpd-gerrit.cambridge.arm.com/79763 Tested-by: Kaizen Reviewed-by: Steven Niu --- src/core/NEON/kernels/NEActivationLayerKernel.cpp | 6 +- .../kernels/NEBatchNormalizationLayerKernel.cpp | 2 +- src/core/NEON/kernels/NEDepthConvertKernel.cpp | 83 +++++++++++++++++++--- .../NEON/kernels/NEGEMMMatrixAdditionKernel.cpp | 4 +- .../NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp | 8 +-- src/core/NEON/kernels/NEIm2ColKernel.cpp | 8 +-- 6 files changed, 89 insertions(+), 22 deletions(-) (limited to 'src/core/NEON/kernels') diff --git a/src/core/NEON/kernels/NEActivationLayerKernel.cpp b/src/core/NEON/kernels/NEActivationLayerKernel.cpp index 1bd0353b93..492d197925 100644 --- a/src/core/NEON/kernels/NEActivationLayerKernel.cpp +++ b/src/core/NEON/kernels/NEActivationLayerKernel.cpp @@ -267,9 +267,9 @@ typename std::enable_if::value, void>::type NEActivation int fixed_point_position = _input->info()->fixed_point_position(); static const qint8x16_t CONST_0 = vdupq_n_qs8(0); - const qint8x16_t CONST_1 = vdupq_n_qs8(scvt_qs8_f32(1.f, fixed_point_position)); - const qint8x16_t a = vdupq_n_qs8(scvt_qs8_f32(_act_info.a(), fixed_point_position)); - const qint8x16_t b = vdupq_n_qs8(scvt_qs8_f32(_act_info.b(), fixed_point_position)); + const qint8x16_t CONST_1 = vdupq_n_qs8(sqcvt_qs8_f32(1.f, fixed_point_position)); + const qint8x16_t a = vdupq_n_qs8(sqcvt_qs8_f32(_act_info.a(), fixed_point_position)); + const qint8x16_t b = vdupq_n_qs8(sqcvt_qs8_f32(_act_info.b(), fixed_point_position)); execute_window_loop(window, [&](const Coordinates & id) { diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp index e6f233cf5d..d0aec6965c 100644 --- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp +++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp @@ -58,7 +58,7 @@ void batch_normalization_q8(const ITensor *in, ITensor *out, const ITensor *mean qint8x16_t gamma_vec = vdupq_n_qs8(0); qint8x16_t beta_vec = vdupq_n_qs8(0); qint8x16_t denominator = vdupq_n_qs8(0); - const qint8x16_t epsilon_vec = vdupq_n_qs8(scvt_qs8_f32(epsilon, fixed_point_position)); + const qint8x16_t epsilon_vec = vdupq_n_qs8(sqcvt_qs8_f32(epsilon, fixed_point_position)); execute_window_loop(window, [&](const Coordinates & id) { if(slice != id.z()) diff --git a/src/core/NEON/kernels/NEDepthConvertKernel.cpp b/src/core/NEON/kernels/NEDepthConvertKernel.cpp index 56612a7703..3c1a94df74 100644 --- a/src/core/NEON/kernels/NEDepthConvertKernel.cpp +++ b/src/core/NEON/kernels/NEDepthConvertKernel.cpp @@ -46,27 +46,35 @@ NEDepthConvertKernel::NEDepthConvertKernel() void NEDepthConvertKernel::configure(const ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::U16, DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::U16, DataType::U32, DataType::S32, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::U16, DataType::QS16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::U16, DataType::QS16, DataType::U32, DataType::S32, DataType::F32); ARM_COMPUTE_ERROR_ON(shift >= 8); ARM_COMPUTE_ERROR_ON(input == output); ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == output->info()->data_type(), "Input and output data_types must be different"); - ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::QS8 && (output->info()->data_type() != DataType::F32), - "Only data_types supported [in] QS8 -> [out] F32"); - ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U8 && (output->info()->data_type() != DataType::S16 && output->info()->data_type() != DataType::U16 && output->info()->data_type() != DataType::S32), "Only data_types supported [in] U8 -> [out] U16, S16, S32"); + ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::QS8 && output->info()->data_type() != DataType::F32, + "Only data_types supported [in] QS8 -> [out] F32"); + ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U32), "Only data_types supported [in] U16 -> [out] U8, U32"); ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::S16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::S32), "Only data_types supported [in] S16 -> [out] U8, S32"); - ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::F32 && (output->info()->data_type() != DataType::QS8), - "Only data_types supported [in] F32 -> [out] QS8"); + ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::QS16 && output->info()->data_type() != DataType::F32, + "Only data_types supported [in] QS16 -> [out] F32"); + + ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::F32 && (output->info()->data_type() != DataType::QS8 && output->info()->data_type() != DataType::QS16), + "Only data_types supported [in] F32 -> [out] QS8, QS16"); + + // Auto initialize output shape if not initialized (We can only auto-configure the shape, datatype must be given) + set_shape_if_empty(*output->info(), input->info()->tensor_shape()); + + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output); _policy = policy; _shift = shift; @@ -346,6 +354,38 @@ void NEDepthConvertKernel::run(const Window &window) } break; } + case DataType::QS16: + { + const int fixed_point_position = _input->info()->fixed_point_position(); + + switch(_output->info()->data_type()) + { + case DataType::F32: + { + /* Up-conversion QS16 -> F32 */ + execute_window_loop(window, [&](const Coordinates & id) + { + const int16x8x2_t texels = + { + { + vld1q_s16(reinterpret_cast(input.ptr())), + vld1q_s16(reinterpret_cast(input.ptr()) + 8) + } + }; + + vst1q_f32(reinterpret_cast(output.ptr()), vcvt_f32_qs16(vget_low_s16(texels.val[0]), fixed_point_position)); + vst1q_f32(reinterpret_cast(output.ptr()) + 4, vcvt_f32_qs16(vget_high_s16(texels.val[0]), fixed_point_position)); + vst1q_f32(reinterpret_cast(output.ptr()) + 8, vcvt_f32_qs16(vget_low_s16(texels.val[1]), fixed_point_position)); + vst1q_f32(reinterpret_cast(output.ptr()) + 12, vcvt_f32_qs16(vget_high_s16(texels.val[1]), fixed_point_position)); + }, + input, output); + break; + } + default: + ARM_COMPUTE_ERROR("Output data type not supported"); + } + break; + } case DataType::F32: { switch(_output->info()->data_type()) @@ -366,13 +406,40 @@ void NEDepthConvertKernel::run(const Window &window) } }; - const qint8x16_t texels_s8 = vcvtq_qs8_f32(texels_f32, fixed_point_position); + const qint8x16_t texels_s8 = vqcvtq_qs8_f32(texels_f32, fixed_point_position); vst1q_s8(reinterpret_cast(output.ptr()), texels_s8); }, input, output); break; } + case DataType::QS16: + { + const int fixed_point_position = _output->info()->fixed_point_position(); + /* Down-conversion F32 -> QS16 */ + execute_window_loop(window, [&](const Coordinates & id) + { + const float32x4x2_t texels_f32_1 = + { + { + vld1q_f32(reinterpret_cast(input.ptr())), + vld1q_f32(reinterpret_cast(input.ptr()) + 4), + } + }; + const float32x4x2_t texels_f32_2 = + { + { + vld1q_f32(reinterpret_cast(input.ptr()) + 8), + vld1q_f32(reinterpret_cast(input.ptr()) + 12) + } + }; + + vst1q_s16(reinterpret_cast(output.ptr()), vqcvtq_qs16_f32(texels_f32_1, fixed_point_position)); + vst1q_s16(reinterpret_cast(output.ptr()) + 8, vqcvtq_qs16_f32(texels_f32_2, fixed_point_position)); + }, + input, output); + break; + } default: ARM_COMPUTE_ERROR("Output data type not supported"); } diff --git a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp index 91fbe6f962..f2cd18d827 100644 --- a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp @@ -94,7 +94,7 @@ void matrix_addition_f16(const ITensor *input, ITensor *output, const Window &wi void matrix_addition_qs8(const ITensor *input, ITensor *output, const Window &window, float beta) { const int fixed_point_position = input->info()->fixed_point_position(); - const qint8x16_t beta_qs8 = vdupq_n_qs8(scvt_qs8_f32(beta, fixed_point_position)); + const qint8x16_t beta_qs8 = vdupq_n_qs8(sqcvt_qs8_f32(beta, fixed_point_position)); Iterator in(input, window); Iterator out(output, window); @@ -118,7 +118,7 @@ void matrix_addition_qs8(const ITensor *input, ITensor *output, const Window &wi void matrix_addition_qs16(const ITensor *input, ITensor *output, const Window &window, float beta) { const int fixed_point_position = input->info()->fixed_point_position(); - const qint16x8_t beta_qs16 = vdupq_n_qs16(scvt_qs16_f32(beta, fixed_point_position)); + const qint16x8_t beta_qs16 = vdupq_n_qs16(sqcvt_qs16_f32(beta, fixed_point_position)); Iterator in(input, window); Iterator out(output, window); diff --git a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp index b81be6cee9..8381dd8a73 100644 --- a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp @@ -456,7 +456,7 @@ void vector_matrix_multiply_qs8(const ITensor *input0, const ITensor *input1, IT // Multiply by the weight of the matrix product (alpha) if(multiply_alpha) { - const qint8x8_t alpha_qs8 = vdup_n_qs8(scvt_qs8_f32(alpha, fixed_point_position)); + const qint8x8_t alpha_qs8 = vdup_n_qs8(sqcvt_qs8_f32(alpha, fixed_point_position)); acc00_qs8 = vqmul_qs8(acc00_qs8, alpha_qs8, fixed_point_position); acc01_qs8 = vqmul_qs8(acc01_qs8, alpha_qs8, fixed_point_position); acc02_qs8 = vqmul_qs8(acc02_qs8, alpha_qs8, fixed_point_position); @@ -585,7 +585,7 @@ void vector_matrix_multiply_qs16(const ITensor *input0, const ITensor *input1, I // Multiply by the weight of the matrix product (alpha) if(multiply_alpha) { - const qint16x4_t alpha_qs16 = vdup_n_qs16(scvt_qs16_f32(alpha, fixed_point_position)); + const qint16x4_t alpha_qs16 = vdup_n_qs16(sqcvt_qs16_f32(alpha, fixed_point_position)); acc00_qs16 = vqmul_qs16(acc00_qs16, alpha_qs16, fixed_point_position); acc01_qs16 = vqmul_qs16(acc01_qs16, alpha_qs16, fixed_point_position); acc02_qs16 = vqmul_qs16(acc02_qs16, alpha_qs16, fixed_point_position); @@ -1058,7 +1058,7 @@ void matrix_matrix_multiply_qs8(const ITensor *input0, const ITensor *input1, IT const size_t out_stride3 = out_stride1 * 3; const int num_elems_matrix_b_x = input1->info()->dimension(0); const int fixed_point_position = input0->info()->fixed_point_position(); - const qint8x8_t alpha_qs8 = vdup_n_qs8(scvt_qs8_f32(alpha, fixed_point_position)); + const qint8x8_t alpha_qs8 = vdup_n_qs8(sqcvt_qs8_f32(alpha, fixed_point_position)); ARM_COMPUTE_UNUSED(alpha_qs8); // Set step_x and step_y for matrix A. Scale by a factor of 4 the Y range as the input interleaved matrix A has 4 times less the rows of the output matrix @@ -1291,7 +1291,7 @@ void matrix_matrix_multiply_qs16(const ITensor *input0, const ITensor *input1, I const size_t out_stride3 = out_stride1 * 3; const int num_elems_matrix_b_x = input1->info()->dimension(0); const int fixed_point_position = input0->info()->fixed_point_position(); - const qint16x4_t alpha_qs16 = vdup_n_qs16(scvt_qs16_f32(alpha, fixed_point_position)); + const qint16x4_t alpha_qs16 = vdup_n_qs16(sqcvt_qs16_f32(alpha, fixed_point_position)); ARM_COMPUTE_UNUSED(alpha_qs16); // Set step_x and step_y for matrix A. Scale by a factor of 4 the Y range as the input interleaved matrix A has 4 times less the rows of the output matrix diff --git a/src/core/NEON/kernels/NEIm2ColKernel.cpp b/src/core/NEON/kernels/NEIm2ColKernel.cpp index 5bb8b1c22a..e4de60df80 100644 --- a/src/core/NEON/kernels/NEIm2ColKernel.cpp +++ b/src/core/NEON/kernels/NEIm2ColKernel.cpp @@ -136,11 +136,11 @@ inline void linearize_volume(const uint8_t *const in_ptr, { if(std::is_same::value) { - *out_ptr = scvt_qs8_f32(1.0f, fixed_point_position); + *out_ptr = sqcvt_qs8_f32(1.0f, fixed_point_position); } else if(std::is_same::value) { - *out_ptr = scvt_qs16_f32(1.0f, fixed_point_position); + *out_ptr = sqcvt_qs16_f32(1.0f, fixed_point_position); } else { @@ -255,11 +255,11 @@ void NEIm2ColKernel::run_reduced(const Window &window) { if(std::is_same::value) { - *(reinterpret_cast(out_ptr) + out_width - 1) = scvt_qs8_f32(1.0f, _input->info()->fixed_point_position()); + *(reinterpret_cast(out_ptr) + out_width - 1) = sqcvt_qs8_f32(1.0f, _input->info()->fixed_point_position()); } else if(std::is_same::value) { - *(reinterpret_cast(out_ptr) + out_width - 1) = scvt_qs16_f32(1.0f, _input->info()->fixed_point_position()); + *(reinterpret_cast(out_ptr) + out_width - 1) = sqcvt_qs16_f32(1.0f, _input->info()->fixed_point_position()); } else { -- cgit v1.2.1