ArmNN
 23.08
FloatingPointConverter.cpp
Go to the documentation of this file.
1 //
2 // Copyright © 2017 Arm Ltd. All rights reserved.
3 // SPDX-License-Identifier: MIT
4 //
5 
7 
8 #include "BFloat16.hpp"
9 #include "Half.hpp"
10 
11 #include <armnn/utility/Assert.hpp>
12 
13 namespace armnnUtils
14 {
15 
16 void FloatingPointConverter::ConvertFloat32To16(const float* srcFloat32Buffer,
17  size_t numElements,
18  void* dstFloat16Buffer)
19 {
20  ARMNN_ASSERT(srcFloat32Buffer != nullptr);
21  ARMNN_ASSERT(dstFloat16Buffer != nullptr);
22 
23  armnn::Half* pHalf = static_cast<armnn::Half*>(dstFloat16Buffer);
24 
25  for (size_t i = 0; i < numElements; i++)
26  {
27  pHalf[i] = armnn::Half(srcFloat32Buffer[i]);
28  if (isinf(pHalf[i]))
29  {
30  // If the value of converted Fp16 is infinity, round to the closest finite Fp16 value.
31  pHalf[i] = copysign(std::numeric_limits<armnn::Half>::max(), pHalf[i]);
32  }
33  }
34 }
35 
36 void FloatingPointConverter::ConvertFloat16To32(const void* srcFloat16Buffer,
37  size_t numElements,
38  float* dstFloat32Buffer)
39 {
40  ARMNN_ASSERT(srcFloat16Buffer != nullptr);
41  ARMNN_ASSERT(dstFloat32Buffer != nullptr);
42 
43  const armnn::Half* pHalf = static_cast<const armnn::Half*>(srcFloat16Buffer);
44 
45  for (size_t i = 0; i < numElements; i++)
46  {
47  dstFloat32Buffer[i] = pHalf[i];
48  }
49 }
50 
51 } //namespace armnnUtils
ARMNN_ASSERT
#define ARMNN_ASSERT(COND)
Definition: Assert.hpp:14
armnn::Half
half_float::half Half
Definition: Half.hpp:22
Assert.hpp
armnnUtils::FloatingPointConverter::ConvertFloat16To32
static void ConvertFloat16To32(const void *srcFloat16Buffer, size_t numElements, float *dstFloat32Buffer)
Definition: FloatingPointConverter.cpp:36
armnnUtils
Definition: CompatibleTypes.hpp:10
Half.hpp
armnnUtils::FloatingPointConverter::ConvertFloat32To16
static void ConvertFloat32To16(const float *srcFloat32Buffer, size_t numElements, void *dstFloat16Buffer)
Converts a buffer of FP32 values to FP16 and stores the results in the given dstFloat16Buffer.
Definition: FloatingPointConverter.cpp:16
FloatingPointConverter.hpp
BFloat16.hpp