From e8291acc1d9e89c9274d31f0d5bb4779eb95588c Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Wed, 26 Feb 2020 09:58:13 +0000 Subject: COMPMID-3152: Initial Bfloat16 support Signed-off-by: Georgios Pinitas Change-Id: Ie6959e37e13731c86b2ee29392a99a293450a1b4 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/2824 Tested-by: Arm Jenkins Comments-Addressed: Arm Jenkins Reviewed-by: Michalis Spyrou --- .../core/NEON/kernels/NEDepthConvertLayerKernel.h | 25 +++++++++++----------- arm_compute/core/NEON/wrapper/intrinsics/cvt.h | 19 ++++++++++++++++ 2 files changed, 32 insertions(+), 12 deletions(-) (limited to 'arm_compute/core/NEON') diff --git a/arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h b/arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h index df4102cb86..5cda3203ed 100644 --- a/arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 ARM Limited. + * Copyright (c) 2016-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -55,24 +55,25 @@ public: * Valid conversions Input -> Output : * * - QASYMM8_SIGNED -> S16, S32, F32, F16 - * - QASYMM8 -> U16, S16, S32, F32, F16 - * - U8 -> U16, S16, S32, F32, F16 - * - U16 -> U8, U32 - * - S16 -> QASYMM8_SIGNED, U8, S32 - * - F16 -> QASYMM8_SIGNED, QASYMM8, F32, S32, U8 - * - S32 -> QASYMM8_SIGNED, QASYMM8, F16, F32, U8 - * - F32 -> QASYMM8_SIGNED, QASYMM8, F16, S32, U8 + * - QASYMM8 -> U16, S16, S32, F32, F16 + * - U8 -> U16, S16, S32, F32, F16 + * - U16 -> U8, U32 + * - S16 -> QASYMM8_SIGNED, U8, S32 + * - BFLOAT16 -> F32 + * - F16 -> QASYMM8_SIGNED, QASYMM8, F32, S32, U8 + * - S32 -> QASYMM8_SIGNED, QASYMM8, F16, F32, U8 + * - F32 -> QASYMM8_SIGNED, QASYMM8, BFLOAT16, F16, S32, U8 * - * @param[in] input The input tensor to convert. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/F16/F32. - * @param[out] output The output tensor. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/U32/S32/F16/F32. + * @param[in] input The input tensor to convert. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/BFLOAT16/F16/F32. + * @param[out] output The output tensor. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/U32/S32/BFLOAT16/F16/F32. * @param[in] policy Conversion policy. * @param[in] shift (Optional) Value for down/up conversions. Must be 0 <= shift < 8. */ void configure(const ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift = 0); /** Static function to check if given info will lead to a valid configuration of @ref NEDepthConvertLayerKernel * - * @param[in] input Source tensor info. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/F16/F32. - * @param[in] output Destination tensor info. Data type supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/U32/S32/F16/F32. + * @param[in] input Source tensor info. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/BFLOAT16/F16/F32. + * @param[in] output Destination tensor info. Data type supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/U32/S32/BFLOAT16/F16/F32. * @param[in] policy Conversion policy * @param[in] shift (Optional) Value for down/up conversions. Must be 0 <= shift < 8. * diff --git a/arm_compute/core/NEON/wrapper/intrinsics/cvt.h b/arm_compute/core/NEON/wrapper/intrinsics/cvt.h index 1f22e09a11..5ea9a5dedd 100644 --- a/arm_compute/core/NEON/wrapper/intrinsics/cvt.h +++ b/arm_compute/core/NEON/wrapper/intrinsics/cvt.h @@ -56,6 +56,25 @@ vcvt(const float32x4_t &a) return vcvtq_s32_f32(a); } +#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) +/** Convert 2x128-bit floating point vectors into 1x128-bit bfloat16 vector + * + * @param[in] inptr Pointer to the input memory to load values from + * @param[in,out] outptr Pointer to the output memory to store values to + */ +inline void vcvt_bf16_f32(const float *inptr, uint16_t *outptr) +{ + __asm __volatile( + "ldp q0, q1, [%[inptr]]\n" + ".inst 0xea16800\n" // BFCVTN v0, v0 + ".inst 0x4ea16820\n" // BFCVTN2 v0, v1 + "str q0, [%[outptr]]\n" + : [inptr] "+r"(inptr) + : [outptr] "r"(outptr) + : "v0", "v1", "memory"); +} +#endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */ + } // namespace wrapper } // namespace arm_compute #endif /* ARM_COMPUTE_WRAPPER_CVT_H */ -- cgit v1.2.1