aboutsummaryrefslogtreecommitdiff
path: root/src/armnn/test/FloatingPointConverterTest.cpp
blob: 21a16a3cc08a053be39566443203946764c6d4de (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
//
// Copyright © 2017 Arm Ltd. All rights reserved.
// SPDX-License-Identifier: MIT
//

#include <armnnUtils/FloatingPointConverter.hpp>

#include <BFloat16.hpp>
#include <Half.hpp>

#include <cstddef>
#include <cstdint>
#include <limits>
#include <vector>

#include <doctest/doctest.h>

TEST_SUITE("TestFPConversion")
{
// Converts a batch of floats to half precision and checks each result twice:
// exactly against a directly constructed armnn::Half, and approximately against
// the original float after widening the half back to float.
TEST_CASE("TestConvertFp32ToFp16")
{
    using namespace half_float::literal;

    // All inputs lie well inside the normal range of half precision.
    float floatArray[] = { 1.0f, 2.0f, 0.5f, 3.1f, 2.4f,
                           5.666f, 6.444f, 7.1f, 432.121f, 12.22f };
    const size_t numFloats = sizeof(floatArray) / sizeof(floatArray[0]);
    std::vector<armnn::Half> convertedBuffer(numFloats, 0.0_h);

    armnnUtils::FloatingPointConverter::ConvertFloat32To16(floatArray, numFloats, convertedBuffer.data());

    for (size_t i = 0; i < numFloats; ++i)
    {
        // The bulk converter must agree exactly with building a Half from the same float.
        armnn::Half expected(floatArray[i]);
        armnn::Half actual = convertedBuffer[i];
        CHECK_EQ(expected, actual);

        // Round-trip sanity check. doctest::Approx::epsilon() is a relative
        // fraction, not a percentage. 0.001 sits just above 2^-10, the largest
        // relative error a single float -> half conversion of a normal value
        // can introduce (11-bit significand, truncation or round-to-nearest),
        // so the check stays safe while still catching real precision loss.
        float convertedHalf = actual;
        CHECK_EQ(floatArray[i], doctest::Approx(convertedHalf).epsilon(0.001));
    }
}

// Widens a batch of half-precision values to float and checks that every
// element equals the float obtained by converting the same half directly.
TEST_CASE("TestConvertFp16ToFp32")
{
    using namespace half_float::literal;

    armnn::Half halfArray[] = { 1.0_h, 2.0_h, 0.5_h, 3.1_h, 2.4_h,
                                5.666_h, 6.444_h, 7.1_h, 432.121_h, 12.22_h };
    size_t elementCount = sizeof(halfArray) / sizeof(*halfArray);

    // Pre-fill with zeros so an element the converter skips shows up as a mismatch.
    std::vector<float> widened(elementCount, 0.0f);

    armnnUtils::FloatingPointConverter::ConvertFloat16To32(halfArray, elementCount, widened.data());

    for (size_t idx = 0; idx < elementCount; ++idx)
    {
        // Reference value: the same half converted to float on its own.
        float expected = static_cast<float>(halfArray[idx]);
        float actual = widened[idx];
        CHECK_EQ(expected, actual);
    }
}

// Converts a table of floats to BFloat16 and compares the raw 16-bit result
// with a hand-computed expectation. Each input is annotated with its 32-bit
// pattern; the expected values are consistent with rounding the discarded low
// 16 bits to nearest, ties to even.
TEST_CASE("TestConvertFloat32ToBFloat16")
{
    float floatArray[] = { 1.704735E38f,   // 0x7F004000 round down
                           0.0f,           // 0x00000000 exact
                           2.2959E-41f,    // 0x00004000 round down
                           1.7180272E38f,  // 0x7F014000 round down
                           9.18355E-41f,   // 0x00010000 exact (low 16 bits are zero)
                           1.14794E-40f,   // 0x00014000 round down
                           4.5918E-41f,    // 0x00008000 tie, even result -> round down
                           -1.708058E38f,  // 0xFF008000 tie, even result -> round down
                           -4.3033756E37f, // 0xFE018000 tie, odd result -> round up
                           1.60712E-40f,   // 0x0001C000 round up
                           -2.0234377f,    // 0xC0018001 round up
                           -1.1800863E-38f,// 0x80808001 round up
                           4.843037E-35f,  // 0x0680C000 round up
                           3.9999998f,     // 0x407FFFFF round up (carries into the exponent)
                           std::numeric_limits<float>::max(),    // 0x7F7FFFFF largest finite, rounds up to 0x7F80
                           std::numeric_limits<float>::lowest(), // 0xFF7FFFFF most negative finite, rounds to 0xFF80
                           1.1754942E-38f, // 0x007FFFFF largest positive subnormal, rounds up to 0x0080
                           -1.1754942E-38f // 0x807FFFFF largest-magnitude negative subnormal, rounds to 0x8080
                          };
    uint16_t expectedResult[] = { 0x7F00,
                                  0x0000,
                                  0x0000,
                                  0x7F01,
                                  0x0001,
                                  0x0001,
                                  0x0000,
                                  0xFF00,
                                  0xFE02,
                                  0x0002,
                                  0xC002,
                                  0x8081,
                                  0x0681,
                                  0x4080,
                                  0x7F80,
                                  0xFF80,
                                  0x0080,
                                  0x8080
                                 };
    constexpr size_t numFloats = sizeof(floatArray) / sizeof(floatArray[0]);

    // The two tables are indexed in lock-step below; fail the build rather than
    // read past the end of expectedResult if they ever get out of sync.
    static_assert(sizeof(expectedResult) / sizeof(expectedResult[0]) == numFloats,
                  "floatArray and expectedResult must have the same number of entries");

    std::vector<armnn::BFloat16> convertedBuffer(numFloats);

    armnnUtils::FloatingPointConverter::ConvertFloat32ToBFloat16(floatArray, numFloats, convertedBuffer.data());

    for (size_t i = 0; i < numFloats; ++i)
    {
        // Compare raw bit patterns so rounding differences cannot be hidden by a float comparison.
        armnn::BFloat16 actual = convertedBuffer[i];
        CHECK_EQ(expectedResult[i], actual.Val());
    }
}

// Widens a table of raw BFloat16 bit patterns to float and compares each
// result with the expected float value using exact equality.
TEST_CASE("TestConvertBFloat16ToFloat32")
{
    // Raw 16-bit BFloat16 encodings; the decimal equivalents are noted alongside.
    uint16_t bf16Array[] = { 0x3F80,   // 16256
                             0x3FC0,   // 16320
                             0x972B,   // 38699
                             0x4000,   // 16384
                             0xC004,   // 49156
                             0x7F7F }; // 32639
    float expectedResult[] = { 1.0f, 1.5f, -5.525308E-25f, 2.0f, -2.0625f, 3.3895314E38f };
    constexpr size_t numFloats = sizeof(bf16Array) / sizeof(bf16Array[0]);

    // The two tables are indexed in lock-step below; fail the build rather than
    // read past the end of expectedResult if they ever get out of sync.
    static_assert(sizeof(expectedResult) / sizeof(expectedResult[0]) == numFloats,
                  "bf16Array and expectedResult must have the same number of entries");

    // Pre-fill with zeros so an element the converter skips shows up as a mismatch.
    std::vector<float> convertedBuffer(numFloats, 0.0f);

    armnnUtils::FloatingPointConverter::ConvertBFloat16ToFloat32(bf16Array, numFloats, convertedBuffer.data());

    for (size_t i = 0; i < numFloats; ++i)
    {
        float actual = convertedBuffer[i];
        CHECK_EQ(expectedResult[i], actual);
    }
}

}