// Test case: checks conversion of a buffer of FP32 values to FP16 (armnn::Half).
// NOTE(review): this listing is an excerpt. The braces, the call that fills
// convertedBuffer, and the declarations of `expected` / `actual` are not visible
// here; confirm against the full file before relying on these notes.
17 TEST_CASE(
"TestConvertFp32ToFp16")
// Presumably supplies the `_h` literal suffix used for 0.0_h below.
19 using namespace half_float::literal;
// Ten FP32 input values for the conversion under test.
21 float floatArray[] = { 1.0f, 2.0f, 0.5f, 3.1f, 2.4f,
22 5.666f, 6.444f, 7.1f, 432.121f, 12.22f };
// Element count derived from the array's size so it tracks the initialiser list.
23 size_t numFloats =
sizeof(floatArray) /
sizeof(floatArray[0]);
// Destination buffer: one armnn::Half per input element, initialised to 0.0_h.
24 std::vector<armnn::Half> convertedBuffer(numFloats, 0.0_h);
// Per-element verification over all numFloats entries.
28 for (
size_t i = 0; i < numFloats; i++)
// Exact comparison of the two half values (their definitions are outside this excerpt).
32 CHECK_EQ(expected, actual);
// Converts the half result to float for the tolerance-based comparison below.
34 float convertedHalf = actual;
// Approximate comparison against the original float, using a doctest::Approx
// epsilon of 0.07. Presumably this allows for FP16 precision loss (TODO confirm).
35 CHECK_EQ(floatArray[i], doctest::Approx(convertedHalf).epsilon(0.07));
// Test case: checks conversion of a buffer of FP16 (armnn::Half) values to FP32.
// NOTE(review): this listing is an excerpt. The braces and the call that fills
// convertedBuffer are not visible here; confirm against the full file.
39 TEST_CASE(
"TestConvertFp16ToFp32")
// Presumably supplies the `_h` literal suffix used in the array below.
41 using namespace half_float::literal;
// Ten half-precision input values written with the `_h` suffix.
43 armnn::Half halfArray[] = { 1.0_h, 2.0_h, 0.5_h, 3.1_h, 2.4_h,
44 5.666_h, 6.444_h, 7.1_h, 432.121_h, 12.22_h };
// Element count derived from the array's size (named numFloats although the
// source elements are armnn::Half).
45 size_t numFloats =
sizeof(halfArray) /
sizeof(halfArray[0]);
// Destination buffer: one float per input element, initialised to 0.0f.
46 std::vector<float> convertedBuffer(numFloats, 0.0f);
// Per-element verification over all numFloats entries.
50 for (
size_t i = 0; i < numFloats; i++)
// Expected value: the half input converted to float by direct initialisation.
52 float expected(halfArray[i]);
// Actual value: the corresponding entry of the destination buffer.
53 float actual = convertedBuffer[i];
// Exact equality check with no tolerance.
54 CHECK_EQ(expected, actual);
// Test case: checks conversion of a buffer of FP32 values to armnn::BFloat16.
// NOTE(review): this listing is an excerpt. Both array initialisers are truncated,
// and the conversion call, the braces and the declaration of `actual` are not
// visible here; confirm against the full file.
58 TEST_CASE(
"TestConvertFloat32ToBFloat16")
// FP32 inputs. Only the first entry and the two numeric_limits entries are visible.
60 float floatArray[] = { 1.704735E38f,
// The inputs include the largest and the lowest finite float values.
74 std::numeric_limits<float>::max(),
75 std::numeric_limits<float>::lowest(),
// Expected 16-bit results, one per input (only the first entry is visible).
79 uint16_t expectedResult[] = { 0x7F00,
// Element count derived from the input array's size.
98 size_t numFloats =
sizeof(floatArray) /
sizeof(floatArray[0]);
// Destination buffer: numFloats BFloat16 elements, constructed without an explicit fill value.
100 std::vector<armnn::BFloat16> convertedBuffer(numFloats);
// Per-element verification over all numFloats entries.
104 for (
size_t i = 0; i < numFloats; i++)
// Compares the expected uint16_t entry with actual.Val(). Presumably Val() returns
// the raw 16-bit representation (TODO confirm against armnn::BFloat16).
107 CHECK_EQ(expectedResult[i], actual.
Val());
// Test case: checks conversion of a buffer of BFloat16 values to FP32.
// NOTE(review): this listing is an excerpt. The braces and the call that fills
// convertedBuffer are not visible here; confirm against the full file.
111 TEST_CASE(
"TestConvertBFloat16ToFloat32")
// Six inputs given as uint16_t values. These look like raw BFloat16 bit patterns
// (e.g. 16256 == 0x3F80, paired with 1.0f below); verify against the converter.
113 uint16_t bf16Array[] = { 16256, 16320, 38699, 16384, 49156, 32639 };
// Element count derived from the input array's size.
114 size_t numFloats =
sizeof(bf16Array) /
sizeof(bf16Array[0]);
// Expected FP32 values, index-aligned with bf16Array.
115 float expectedResult[] = { 1.0f, 1.5f, -5.525308E-25f, 2.0f, -2.0625f, 3.3895314E38f };
// Destination buffer: one float per input element, initialised to 0.0f.
116 std::vector<float> convertedBuffer(numFloats, 0.0f);
// Per-element verification over all numFloats entries.
120 for (
size_t i = 0; i < numFloats; i++)
// Actual value: the corresponding entry of the destination buffer.
122 float actual = convertedBuffer[i];
// Exact equality check with no tolerance.
123 CHECK_EQ(expectedResult[i], actual);
static void ConvertBFloat16ToFloat32(const void *srcBFloat16Buffer, size_t numElements, float *dstFloat32Buffer)
static void ConvertFloat32To16(const float *srcFloat32Buffer, size_t numElements, void *dstFloat16Buffer)
Converts a buffer of FP32 values to FP16 and stores the result in the given dstFloat16Buffer.
static void ConvertFloat16To32(const void *srcFloat16Buffer, size_t numElements, float *dstFloat32Buffer)
static void ConvertFloat32ToBFloat16(const float *srcFloat32Buffer, size_t numElements, void *dstBFloat16Buffer)