From e2588184240b4850f62859ca9f3c5e95c9d8e129 Mon Sep 17 00:00:00 2001 From: Michalis Spyrou Date: Thu, 13 Dec 2018 18:31:18 +0000 Subject: COMPMID-1755 NEON: Extend DepthConvert to support Cast Change-Id: I8e2ed9e97cbe86d8caf162bd84ecfd9b43b0bd3b Reviewed-on: https://review.mlplatform.org/401 Tested-by: Arm Jenkins Reviewed-by: Giuseppe Rossini Reviewed-by: Georgios Pinitas --- .../NEON/kernels/NEDepthConvertLayerKernel.cpp | 147 ++++++++++++++++++++- 1 file changed, 140 insertions(+), 7 deletions(-) (limited to 'src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp') diff --git a/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp b/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp index 158f401084..54337551a7 100644 --- a/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp +++ b/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/NEON/NEFixedPoint.h" +#include "arm_compute/core/NEON/NEMath.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" @@ -43,10 +44,13 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, C ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(output); ARM_COMPUTE_UNUSED(policy); ARM_COMPUTE_RETURN_ERROR_ON(input == output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::U16, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::U16, DataType::U32, DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::U8, DataType::S16, DataType::U16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::U8, DataType::S16, DataType::U16, DataType::U32, DataType::S32, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(shift >= 8); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QASYMM8 && (output->data_type() != DataType::F16 && output->data_type() != DataType::F32), + "Only data_types supported [in] QASYMM8 -> [out] F16, F32"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::U8 && (output->data_type() != DataType::S16 && output->data_type() != DataType::U16 && output->data_type() != DataType::S32), "Only data_types supported [in] U8 -> [out] U16, S16, S32"); @@ -57,11 +61,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, C ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::S16 && (output->data_type() != DataType::U8 && output->data_type() != DataType::S32), "Only data_types supported [in] S16 -> [out] U8, S32"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::F16 && output->data_type() != DataType::F32, - "Only data_types supported [in] F16 -> [out] F32"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::F16 && (output->data_type() != DataType::QASYMM8 && output->data_type() != DataType::F32), + "Only data_types supported [in] F16 -> [out] QASYMM8, F32"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::F32 && output->data_type() != DataType::F16, - "Only data_types supported [in] F32 -> [out] F16"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::F32 && (output->data_type() != DataType::QASYMM8 && output->data_type() != DataType::F16), + "Only data_types supported [in] F32 -> [out] QASYMM8, F16"); // Validate in case of configured output if(output->total_size() > 0) @@ -134,6 +138,75 @@ void NEDepthConvertLayerKernel::run(const Window &window, const ThreadInfo &info switch(_input->info()->data_type()) { + case DataType::QASYMM8: + { + switch(_output->info()->data_type()) + { + /* Up-conversion QASYMM8 -> F32 */ + case DataType::F32: + { + const float32x4_t scale = vdupq_n_f32(_input->info()->quantization_info().scale); + const int32x4_t offset = vdupq_n_s32(_input->info()->quantization_info().offset); + + execute_window_loop(window, [&](const Coordinates & id) + { + const uint8x16_t texels_u8 = vld1q_u8(input.ptr()); + const uint16x8x2_t texels_u16 = + { + { + vmovl_u8(vget_low_u8(texels_u8)), + vmovl_u8(vget_high_u8(texels_u8)) + } + }; + + const int32x4x4_t texels_s32 = + { + { + vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(texels_u16.val[0]))), + vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(texels_u16.val[0]))), + vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(texels_u16.val[1]))), + vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(texels_u16.val[1]))) + } + }; + + vst1q_f32(reinterpret_cast(output.ptr()), vmulq_f32(vcvtq_f32_s32(vsubq_s32(texels_s32.val[0], offset)), scale)); + vst1q_f32(reinterpret_cast(output.ptr()) + 4, vmulq_f32(vcvtq_f32_s32(vsubq_s32(texels_s32.val[1], offset)), scale)); + vst1q_f32(reinterpret_cast(output.ptr()) + 8, vmulq_f32(vcvtq_f32_s32(vsubq_s32(texels_s32.val[2], offset)), scale)); + vst1q_f32(reinterpret_cast(output.ptr()) + 12, vmulq_f32(vcvtq_f32_s32(vsubq_s32(texels_s32.val[3], offset)), scale)); + }, + input, output); + break; + } +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + /* Up-conversion QASYMM8 -> F16 */ + case DataType::F16: + { + const float16x8_t scale = vdupq_n_f16(static_cast(_input->info()->quantization_info().scale)); + const int16x8_t offset = vdupq_n_s16(static_cast(_input->info()->quantization_info().offset)); + + execute_window_loop(window, [&](const Coordinates & id) + { + const uint8x16_t texels_u8 = vld1q_u8(input.ptr()); + const int16x8x2_t texels_s16 = + { + { + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))) + } + }; + + vst1q_f16(reinterpret_cast(output.ptr()), vmulq_f16(vcvtq_f16_s16(vsubq_s16(texels_s16.val[0], offset)), scale)); + vst1q_f16(reinterpret_cast(output.ptr()) + 8, vmulq_f16(vcvtq_f16_s16(vsubq_s16(texels_s16.val[1], offset)), scale)); + }, + input, output); + break; + } +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + default: + ARM_COMPUTE_ERROR("Output data type not supported"); + } + break; + } case DataType::U8: { const int16x8_t b = vdupq_n_s16(_shift); @@ -367,6 +440,31 @@ void NEDepthConvertLayerKernel::run(const Window &window, const ThreadInfo &info case DataType::F16: switch(_output->info()->data_type()) { + case DataType::QASYMM8: + { + const float16x8_t scale = vinvq_f16(vdupq_n_f16(static_cast(_output->info()->quantization_info().scale))); + const int16x8_t offset = vdupq_n_s16(static_cast(_output->info()->quantization_info().offset)); + const int16x8_t max_val_vec = vdupq_n_s16(255); + const int16x8_t zero_val_vec = vdupq_n_s16(0); + + /* Down-conversion F16 -> QASYMM8 */ + execute_window_loop(window, [&](const Coordinates & id) + { + const float16x8x2_t texels = + { + { + vmulq_f16(vld1q_f16(reinterpret_cast(input.ptr())), scale), + vmulq_f16(vld1q_f16(reinterpret_cast(input.ptr()) + 8), scale), + } + }; + + const auto texel_quantized_0 = vmaxq_s16(vminq_s16(vaddq_s16(vcvtq_s16_f16(texels.val[0]), offset), max_val_vec), zero_val_vec); + const auto texel_quantized_1 = vmaxq_s16(vminq_s16(vaddq_s16(vcvtq_s16_f16(texels.val[1]), offset), max_val_vec), zero_val_vec); + vst1q_u8(reinterpret_cast(output.ptr()), vcombine_u8(vqmovun_s16(texel_quantized_0), vqmovun_s16(texel_quantized_1))); + }, + input, output); + break; + } case DataType::F32: { const float32x4_t scale = vdupq_n_f32(1 << _shift); @@ -394,9 +492,44 @@ void NEDepthConvertLayerKernel::run(const Window &window, const ThreadInfo &info ARM_COMPUTE_ERROR("Output data type not supported"); } break; +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ case DataType::F32: switch(_output->info()->data_type()) { + case DataType::QASYMM8: + { + const float32x4_t scale = vinvq_f32(vdupq_n_f32(_output->info()->quantization_info().scale)); + const int32x4_t offset = vdupq_n_s32(_output->info()->quantization_info().offset); + const int32x4_t max_val_vec = vdupq_n_s32(255); + const int32x4_t zero_val_vec = vdupq_n_s32(0); + + /* Down-conversion F32 -> QASYMM8 */ + execute_window_loop(window, [&](const Coordinates & id) + { + const float32x4x4_t texels = + { + { + vmulq_f32(vld1q_f32(reinterpret_cast(input.ptr())), scale), + vmulq_f32(vld1q_f32(reinterpret_cast(input.ptr()) + 4), scale), + vmulq_f32(vld1q_f32(reinterpret_cast(input.ptr()) + 8), scale), + vmulq_f32(vld1q_f32(reinterpret_cast(input.ptr()) + 12), scale) + } + }; + + const auto texel_quantized_0 = vmaxq_s32(vminq_s32(vaddq_s32(vcvtq_s32_f32(texels.val[0]), offset), max_val_vec), zero_val_vec); + const auto texel_quantized_1 = vmaxq_s32(vminq_s32(vaddq_s32(vcvtq_s32_f32(texels.val[1]), offset), max_val_vec), zero_val_vec); + const auto texel_quantized_2 = vmaxq_s32(vminq_s32(vaddq_s32(vcvtq_s32_f32(texels.val[2]), offset), max_val_vec), zero_val_vec); + const auto texel_quantized_3 = vmaxq_s32(vminq_s32(vaddq_s32(vcvtq_s32_f32(texels.val[3]), offset), max_val_vec), zero_val_vec); + + const auto converted_0 = vqmovn_u16(vcombine_u16(vqmovun_s32(texel_quantized_0), vqmovun_s32(texel_quantized_1))); + const auto converted_1 = vqmovn_u16(vcombine_u16(vqmovun_s32(texel_quantized_2), vqmovun_s32(texel_quantized_3))); + + vst1q_u8(reinterpret_cast(output.ptr()), vcombine_u8(converted_0, converted_1)); + }, + input, output); + break; + } +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: { const float32x4_t scale = vdupq_n_f32(1.f / (1 << _shift)); @@ -420,11 +553,11 @@ void NEDepthConvertLayerKernel::run(const Window &window, const ThreadInfo &info input, output); break; } +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ default: ARM_COMPUTE_ERROR("Output data type not supported"); } break; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ default: ARM_COMPUTE_ERROR("Not supported"); } -- cgit v1.2.1