diff options
author | Usama Arif <usama.arif@arm.com> | 2019-05-14 17:10:40 +0100 |
---|---|---|
committer | Usama Arif <usama.arif@arm.com> | 2019-05-23 13:26:03 +0000 |
commit | 9e631c204444e7b095510c54819e944f9be8d342 (patch) | |
tree | a6ebd7168d206cf18c46e1ceef29365024751767 /src/core | |
parent | dd0bf484a3a34dff17757b5e7a4b6be3b1682a29 (diff) | |
download | ComputeLibrary-9e631c204444e7b095510c54819e944f9be8d342.tar.gz |
COMPMID-2252 NECast.
Change-Id: I7532aea6827a325eb8457132d4787ac527e93cd4
Signed-off-by: Usama Arif <usama.arif@arm.com>
Reviewed-on: https://review.mlplatform.org/c/1149
Reviewed-by: Pablo Marquez <pablo.tello@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Diffstat (limited to 'src/core')
-rw-r--r-- | src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp | 306 |
1 files changed, 214 insertions, 92 deletions
diff --git a/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp b/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp index cbc90a058f..531873e49e 100644 --- a/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp +++ b/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp @@ -44,16 +44,17 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, C ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(output); ARM_COMPUTE_UNUSED(policy); ARM_COMPUTE_RETURN_ERROR_ON(input == output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::U8, DataType::S16, DataType::U16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::U8, DataType::S16, DataType::U16, DataType::F16, DataType::F32, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::U8, DataType::S16, DataType::U16, DataType::U32, DataType::S32, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(shift >= 8); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QASYMM8 && (output->data_type() != DataType::F16 && output->data_type() != DataType::F32), - "Only data_types supported [in] QASYMM8 -> [out] F16, F32"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QASYMM8 && (output->data_type() != DataType::S16 && output->data_type() != DataType::U16 + && output->data_type() != DataType::S32 && output->data_type() != DataType::F16 && output->data_type() != DataType::F32), + "Only data_types supported [in] QASYMM8 -> [out] U16, S16, S32, F16, F32"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::U8 && (output->data_type() != DataType::S16 && output->data_type() != DataType::U16 - && output->data_type() != DataType::S32), - "Only data_types supported [in] U8 -> [out] U16, S16, S32"); + && output->data_type() != DataType::S32 && output->data_type() != DataType::F16 && output->data_type() != DataType::F32), + "Only data_types supported [in] U8 -> [out] U16, S16, S32, F16, F32"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::U16 && (output->data_type() != DataType::U8 && output->data_type() != DataType::U32), "Only data_types supported [in] U16 -> [out] U8, U32"); @@ -61,11 +62,14 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, C ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::S16 && (output->data_type() != DataType::U8 && output->data_type() != DataType::S32), "Only data_types supported [in] S16 -> [out] U8, S32"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::F16 && (output->data_type() != DataType::QASYMM8 && output->data_type() != DataType::F32), - "Only data_types supported [in] F16 -> [out] QASYMM8, F32"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::F16 && (output->data_type() != DataType::QASYMM8 && output->data_type() != DataType::U8 && output->data_type() != DataType::F32 && output->data_type() != DataType::S32), + "Only data_types supported [in] F16 -> [out] QASYMM8, F32, S32, U8"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::F32 && (output->data_type() != DataType::QASYMM8 && output->data_type() != DataType::F16), - "Only data_types supported [in] F32 -> [out] QASYMM8, F16"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::F32 && (output->data_type() != DataType::QASYMM8 && output->data_type() != DataType::F16 && output->data_type() != DataType::S32 && output->data_type() != DataType::U8), + "Only data_types supported [in] F32 -> [out] QASYMM8, F16, S32, U8"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::S32 && (output->data_type() != DataType::QASYMM8 && output->data_type() != DataType::F16 && output->data_type() != DataType::F32 && output->data_type() != DataType::U8), + "Only data_types supported [in] S32 -> [out] QASYMM8, F16, F32, U8"); // Validate in case of configured output if(output->total_size() > 0) @@ -139,83 +143,59 @@ void NEDepthConvertLayerKernel::run(const Window &window, const ThreadInfo &info switch(_input->info()->data_type()) { case DataType::QASYMM8: + case DataType::U8: { + const int16x8_t b = vdupq_n_s16(_shift); + switch(_output->info()->data_type()) { - /* Up-conversion QASYMM8 -> F32 */ - case DataType::F32: + case DataType::S16: { - const float32x4_t scale = vdupq_n_f32(_input->info()->quantization_info().scale); - const int32x4_t offset = vdupq_n_s32(_input->info()->quantization_info().offset); - + /* Up-conversion U8 -> S16 */ execute_window_loop(window, [&](const Coordinates &) { - const uint8x16_t texels_u8 = vld1q_u8(input.ptr()); - const uint16x8x2_t texels_u16 = - { - { - vmovl_u8(vget_low_u8(texels_u8)), - vmovl_u8(vget_high_u8(texels_u8)) - } - }; + const uint8x16_t texels_u8 = vld1q_u8(input.ptr()); - const int32x4x4_t texels_s32 = + const int16x8x2_t texels = { { - vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(texels_u16.val[0]))), - vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(texels_u16.val[0]))), - vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(texels_u16.val[1]))), - vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(texels_u16.val[1]))) + vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), b), + vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))), b) } }; - vst1q_f32(reinterpret_cast<float *>(output.ptr()), vmulq_f32(vcvtq_f32_s32(vsubq_s32(texels_s32.val[0], offset)), scale)); - vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 4, vmulq_f32(vcvtq_f32_s32(vsubq_s32(texels_s32.val[1], offset)), scale)); - vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 8, vmulq_f32(vcvtq_f32_s32(vsubq_s32(texels_s32.val[2], offset)), scale)); - vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 12, vmulq_f32(vcvtq_f32_s32(vsubq_s32(texels_s32.val[3], offset)), scale)); + vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), texels.val[0]); + vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, texels.val[1]); }, input, output); break; } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - /* Up-conversion QASYMM8 -> F16 */ - case DataType::F16: + case DataType::S32: { - const float16x8_t scale = vdupq_n_f16(static_cast<float16_t>(_input->info()->quantization_info().scale)); - const int16x8_t offset = vdupq_n_s16(static_cast<int16_t>(_input->info()->quantization_info().offset)); - + /* Up-conversion U8 -> S32 */ execute_window_loop(window, [&](const Coordinates &) { - const uint8x16_t texels_u8 = vld1q_u8(input.ptr()); - const int16x8x2_t texels_s16 = + const uint8x16_t texels_u8 = vld1q_u8(input.ptr()); + + const int16x8x2_t texels = { { - vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), - vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))) + vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), b), + vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))), b) } }; - vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), vmulq_f16(vcvtq_f16_s16(vsubq_s16(texels_s16.val[0], offset)), scale)); - vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()) + 8, vmulq_f16(vcvtq_f16_s16(vsubq_s16(texels_s16.val[1], offset)), scale)); + vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), vmovl_s16(vget_low_s16(texels.val[0]))); + vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, vmovl_s16(vget_high_s16(texels.val[0]))); + vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 8, vmovl_s16(vget_low_s16(texels.val[1]))); + vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 12, vmovl_s16(vget_high_s16(texels.val[1]))); }, input, output); break; } -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - default: - ARM_COMPUTE_ERROR("Output data type not supported"); - } - break; - } - case DataType::U8: - { - const int16x8_t b = vdupq_n_s16(_shift); - - switch(_output->info()->data_type()) - { - case DataType::S16: + case DataType::F32: { - /* Up-conversion U8 -> S16 */ + /* Up-conversion U8 -> F32 */ execute_window_loop(window, [&](const Coordinates &) { const uint8x16_t texels_u8 = vld1q_u8(input.ptr()); @@ -227,16 +207,18 @@ void NEDepthConvertLayerKernel::run(const Window &window, const ThreadInfo &info vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))), b) } }; - - vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), texels.val[0]); - vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, texels.val[1]); + vst1q_f32(reinterpret_cast<float *>(output.ptr()), vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0])))); + vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0])))); + vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1])))); + vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1])))); }, input, output); break; } - case DataType::S32: +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: { - /* Up-conversion U8 -> S32 */ + /* Up-conversion U8 -> F16 */ execute_window_loop(window, [&](const Coordinates &) { const uint8x16_t texels_u8 = vld1q_u8(input.ptr()); @@ -248,15 +230,14 @@ void NEDepthConvertLayerKernel::run(const Window &window, const ThreadInfo &info vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))), b) } }; - - vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), vmovl_s16(vget_low_s16(texels.val[0]))); - vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, vmovl_s16(vget_high_s16(texels.val[0]))); - vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 8, vmovl_s16(vget_low_s16(texels.val[1]))); - vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 12, vmovl_s16(vget_high_s16(texels.val[1]))); + vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), vcvtq_f16_s16(texels.val[0])); + vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()) + 8, vcvtq_f16_s16(texels.val[1])); }, input, output); break; } +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::U16: { /* Up-conversion U8 -> U16 */ @@ -441,13 +422,11 @@ void NEDepthConvertLayerKernel::run(const Window &window, const ThreadInfo &info switch(_output->info()->data_type()) { case DataType::QASYMM8: + case DataType::U8: { - const float16x8_t scale = vinvq_f16(vdupq_n_f16(static_cast<float16_t>(_output->info()->quantization_info().scale))); - const int16x8_t offset = vdupq_n_s16(static_cast<int16_t>(_output->info()->quantization_info().offset)); - const int16x8_t max_val_vec = vdupq_n_s16(255); - const int16x8_t zero_val_vec = vdupq_n_s16(0); + const float16x8_t scale = vdupq_n_f16(1 << _shift); - /* Down-conversion F16 -> QASYMM8 */ + /* Up-conversion F16 -> U8 */ execute_window_loop(window, [&](const Coordinates & id) { const float16x8x2_t texels = @@ -458,9 +437,7 @@ void NEDepthConvertLayerKernel::run(const Window &window, const ThreadInfo &info } }; - const auto texel_quantized_0 = vmaxq_s16(vminq_s16(vaddq_s16(vcvtq_s16_f16(texels.val[0]), offset), max_val_vec), zero_val_vec); - const auto texel_quantized_1 = vmaxq_s16(vminq_s16(vaddq_s16(vcvtq_s16_f16(texels.val[1]), offset), max_val_vec), zero_val_vec); - vst1q_u8(reinterpret_cast<uint8_t *>(output.ptr()), vcombine_u8(vqmovun_s16(texel_quantized_0), vqmovun_s16(texel_quantized_1))); + vst1q_u8(reinterpret_cast<uint8_t *>(output.ptr()), vcombine_u8(vqmovun_s16(vcvtq_s16_f16(texels.val[0])), vqmovun_s16(vcvtq_s16_f16(texels.val[1])))); }, input, output); break; @@ -488,6 +465,29 @@ void NEDepthConvertLayerKernel::run(const Window &window, const ThreadInfo &info input, output); break; } + case DataType::S32: + { + const float32x4_t scale = vdupq_n_f32(1 << _shift); + + /* Up-conversion F16 -> S32 */ + execute_window_loop(window, [&](const Coordinates & id) + { + const float16x8x2_t texels = + { + { + vld1q_f16(reinterpret_cast<float16_t *>(input.ptr())), + vld1q_f16(reinterpret_cast<float16_t *>(input.ptr()) + 8) + } + }; + + vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), vcvtq_s32_f32(vmulq_f32(vcvt_f32_f16(vget_low_f16(texels.val[0])), scale))); + vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, vcvtq_s32_f32(vmulq_f32(vcvt_f32_f16(vget_high_f16(texels.val[0])), scale))); + vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 8, vcvtq_s32_f32(vmulq_f32(vcvt_f32_f16(vget_low_f16(texels.val[1])), scale))); + vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 12, vcvtq_s32_f32(vmulq_f32(vcvt_f32_f16(vget_high_f16(texels.val[1])), scale))); + }, + input, output); + break; + } default: ARM_COMPUTE_ERROR("Output data type not supported"); } @@ -496,14 +496,12 @@ void NEDepthConvertLayerKernel::run(const Window &window, const ThreadInfo &info case DataType::F32: switch(_output->info()->data_type()) { - case DataType::QASYMM8: +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: { - const float32x4_t scale = vinvq_f32(vdupq_n_f32(_output->info()->quantization_info().scale)); - const int32x4_t offset = vdupq_n_s32(_output->info()->quantization_info().offset); - const int32x4_t max_val_vec = vdupq_n_s32(255); - const int32x4_t zero_val_vec = vdupq_n_s32(0); + const float32x4_t scale = vdupq_n_f32(1.f / (1 << _shift)); - /* Down-conversion F32 -> QASYMM8 */ + /* Down-conversion F32 -> F16 */ execute_window_loop(window, [&](const Coordinates &) { const float32x4x4_t texels = @@ -516,25 +514,44 @@ void NEDepthConvertLayerKernel::run(const Window &window, const ThreadInfo &info } }; - const auto texel_quantized_0 = vmaxq_s32(vminq_s32(vaddq_s32(vcvtq_s32_f32(texels.val[0]), offset), max_val_vec), zero_val_vec); - const auto texel_quantized_1 = vmaxq_s32(vminq_s32(vaddq_s32(vcvtq_s32_f32(texels.val[1]), offset), max_val_vec), zero_val_vec); - const auto texel_quantized_2 = vmaxq_s32(vminq_s32(vaddq_s32(vcvtq_s32_f32(texels.val[2]), offset), max_val_vec), zero_val_vec); - const auto texel_quantized_3 = vmaxq_s32(vminq_s32(vaddq_s32(vcvtq_s32_f32(texels.val[3]), offset), max_val_vec), zero_val_vec); + vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1]))); + vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()) + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3]))); + }, + input, output); + break; + } +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + case DataType::S32: + { + const float32x4_t scale = vdupq_n_f32(1.f / (1 << _shift)); - const auto converted_0 = vqmovn_u16(vcombine_u16(vqmovun_s32(texel_quantized_0), vqmovun_s32(texel_quantized_1))); - const auto converted_1 = vqmovn_u16(vcombine_u16(vqmovun_s32(texel_quantized_2), vqmovun_s32(texel_quantized_3))); + /* Conversion F32 -> S32 */ + execute_window_loop(window, [&](const Coordinates &) + { + const float32x4x4_t texels = + { + { + vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr())), scale), + vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr()) + 4), scale), + vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr()) + 8), scale), + vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr()) + 12), scale), + } + }; - vst1q_u8(reinterpret_cast<uint8_t *>(output.ptr()), vcombine_u8(converted_0, converted_1)); + vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), vcvtq_s32_f32(texels.val[0])); + vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, vcvtq_s32_f32(texels.val[1])); + vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 8, vcvtq_s32_f32(texels.val[2])); + vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 12, vcvtq_s32_f32(texels.val[3])); }, input, output); break; } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: + case DataType::QASYMM8: + case DataType::U8: { const float32x4_t scale = vdupq_n_f32(1.f / (1 << _shift)); - /* Down-conversion F32 -> F16 */ + /* Down-conversion F32 -> U8 */ execute_window_loop(window, [&](const Coordinates &) { const float32x4x4_t texels = @@ -543,7 +560,41 @@ void NEDepthConvertLayerKernel::run(const Window &window, const ThreadInfo &info vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr())), scale), vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr()) + 4), scale), vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr()) + 8), scale), - vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr()) + 12), scale) + vmulq_f32(vld1q_f32(reinterpret_cast<float *>(input.ptr()) + 12), scale), + } + }; + + vst1_u8(reinterpret_cast<uint8_t *>(output.ptr()), vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[0])), vqmovun_s32(vcvtq_s32_f32(texels.val[1]))))); + vst1_u8(reinterpret_cast<uint8_t *>(output.ptr())+8, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[2])), vqmovun_s32(vcvtq_s32_f32(texels.val[3]))))); + }, + input, output); + break; + } + + + default: + ARM_COMPUTE_ERROR("Output data type not supported"); + } + break; + + case DataType::S32: + switch(_output->info()->data_type()) + { +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + { + const float32x4_t scale = vdupq_n_f32(1.f / (1 << _shift)); + + /* Down-conversion S32 -> F16 */ + execute_window_loop(window, [&](const Coordinates &) + { + const float32x4x4_t texels = + { + { + vmulq_f32(vcvtq_f32_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr()))), scale), + vmulq_f32(vcvtq_f32_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr()) + 4)), scale), + vmulq_f32(vcvtq_f32_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr()) + 8)), scale), + vmulq_f32(vcvtq_f32_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr()) + 12)), scale) } }; @@ -554,6 +605,77 @@ void NEDepthConvertLayerKernel::run(const Window &window, const ThreadInfo &info break; } #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + case DataType::F32: + { + const int32x4_t scale = vdupq_n_s32(1.f / (1 << _shift)); + + /* Conversion S32 -> F32 */ + execute_window_loop(window, [&](const Coordinates &) + { + const int32x4x4_t texels = + { + { + vmulq_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr())), scale), + vmulq_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr()) + 4), scale), + vmulq_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr()) + 8), scale), + vmulq_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr()) + 12), scale), + } + }; + + vst1q_f32(reinterpret_cast<float *>(output.ptr()), vcvtq_f32_s32(texels.val[0])); + vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 4, vcvtq_f32_s32(texels.val[1])); + vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 8, vcvtq_f32_s32(texels.val[2])); + vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 12, vcvtq_f32_s32(texels.val[3])); + }, + input, output); + break; + } + case DataType::QASYMM8: + case DataType::U8: + { + const int32x4_t b = vdupq_n_s32(-static_cast<int32_t>(_shift)); + + /* Down-conversion S32 -> U8 */ + if(ConvertPolicy::SATURATE == _policy) + { + execute_window_loop(window, [&](const Coordinates &) + { + const int32x4x4_t texels = + { + { + vqshlq_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr())), b), + vqshlq_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr()) + 4), b), + vqshlq_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr()) + 8), b), + vqshlq_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr()) + 12), b) + } + }; + + vst1_u8(reinterpret_cast<uint8_t *>(output.ptr()), vmovn_u16(vcombine_u16(vqmovun_s32(texels.val[0]), vqmovun_s32(texels.val[1])))); + vst1_u8(reinterpret_cast<uint8_t *>(output.ptr())+8, vmovn_u16(vcombine_u16(vqmovun_s32(texels.val[2]), vqmovun_s32(texels.val[3])))); + }, + input, output); + } + else + { + execute_window_loop(window, [&](const Coordinates &) + { + const int32x4x4_t texels = + { + { + vshlq_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr())), b), + vshlq_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr()) + 4), b), + vshlq_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr()) + 8), b), + vshlq_s32(vld1q_s32(reinterpret_cast<int32_t *>(input.ptr()) + 12), b) + } + }; + + vst1_u8(reinterpret_cast<uint8_t *>(output.ptr()), vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[0])),vmovn_u32(vreinterpretq_u32_s32(texels.val[1]))))); + vst1_u8(reinterpret_cast<uint8_t *>(output.ptr())+8, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[2])),vmovn_u32(vreinterpretq_u32_s32(texels.val[3]))))); + }, + input, output); + } + break; + } default: ARM_COMPUTE_ERROR("Output data type not supported"); } |