diff options
author | Narumol Prangnawarat <narumol.prangnawarat@arm.com> | 2020-03-05 17:27:45 +0000 |
---|---|---|
committer | Narumol Prangnawarat <narumol.prangnawarat@arm.com> | 2020-03-06 13:52:28 +0000 |
commit | e66448491b836049df62e63e1e5151eefe3bfcf8 (patch) | |
tree | 222fbe238f3b594015728fef1f4ba98fc847f3c7 /src/armnnUtils | |
parent | e32c8440e4f777e48cff4f7a09bdac6f76ad773d (diff) | |
download | armnn-e66448491b836049df62e63e1e5151eefe3bfcf8.tar.gz |
IVGCVSW-4517 Add BFloat16 class and unit tests
Signed-off-by: Narumol Prangnawarat <narumol.prangnawarat@arm.com>
Change-Id: Ie2e9e617b9210d79a26e7ba58ecc874d1202e599
Diffstat (limited to 'src/armnnUtils')
-rw-r--r-- | src/armnnUtils/BFloat16.hpp | 110 |
1 files changed, 109 insertions, 1 deletions
#pragma once

#include <cmath>
#include <cstring>
#include <ostream>
#include <math.h>
#include <stdint.h>

namespace armnn
{

/// 16-bit floating point value that stores the upper half of a 32-bit float:
/// 1 sign bit, 8 exponent bits and 7 mantissa bits.
class BFloat16
{
public:
    /// Constructs a value whose bit pattern is all zeros.
    BFloat16()
        : value(0)
    {}

    /// Constructs from a raw 16-bit pattern; no conversion is performed.
    explicit BFloat16(uint16_t v)
        : value(v)
    {}

    /// Constructs from a float, rounding to the nearest representable value (ties to even).
    explicit BFloat16(float v)
        : value(float32ToBFloat16(v).val())
    {}

    /// Assigns from a float using the same rounding as the float constructor.
    BFloat16& operator=(float v)
    {
        value = float32ToBFloat16(v).val();
        return *this;
    }

    /// Bitwise comparison of the stored patterns.
    bool operator==(const BFloat16& r) const
    {
        return value == r.val();
    }

    /// Compares the widened float value against r.
    bool operator==(const float& r) const
    {
        return toFloat32() == r;
    }

    /// Converts a 32-bit float to BFloat16.
    /// Every NaN input maps to the single pattern returned by nan(); all other inputs
    /// are rounded to nearest, with ties going to the even mantissa.
    static BFloat16 float32ToBFloat16(const float v)
    {
        static_assert(sizeof(float) == sizeof(uint32_t), "float is expected to be 32 bits wide");

        if (std::isnan(v))
        {
            return nan();
        }

        // Round value to the nearest even
        // Float32
        // S EEEEEEEE MMMMMMLRMMMMMMMMMMMMMMM
        // BFloat16
        // S EEEEEEEE MMMMMML
        // LSB (L): Least significant bit of BFloat16 (last bit of the Mantissa of BFloat16)
        // R: Rounding bit (most significant of the 16 discarded bits)
        // R = 0                               -> round down
        // R = 1, any other discarded bit set  -> round up
        // R = 1, rest = 0 (tie), LSB = 0      -> round down
        // R = 1, rest = 0 (tie), LSB = 1      -> round up

        // memcpy is the well-defined way to read the float's bit pattern.
        uint32_t bits = 0;
        std::memcpy(&bits, &v, sizeof(bits));

        uint16_t upper = static_cast<uint16_t>(bits >> 16u);
        // Mark the LSB
        const uint16_t lsb = static_cast<uint16_t>(upper & 0x0001u);
        // The part that is truncated (the low 16 bits of the FP32 pattern)
        const uint16_t error = static_cast<uint16_t>(bits & 0x0000FFFFu);
        if (error > 0x8000 || (error == 0x8000 && lsb == 1))
        {
            // A carry out of the mantissa moves into the exponent, so the largest
            // finite pattern rounds up to the infinity pattern.
            ++upper;
        }
        return BFloat16(upper);
    }

    /// Widens to a 32-bit float by placing the stored bits in the upper half
    /// and zero-filling the lower half.
    float toFloat32() const
    {
        // Widen before shifting so the shift is done on an unsigned 32-bit value
        // rather than on a promoted signed int.
        const uint32_t bits = static_cast<uint32_t>(value) << 16u;
        float result = 0.0f;
        std::memcpy(&result, &bits, sizeof(result));
        return result;
    }

    /// Returns the raw 16-bit pattern.
    uint16_t val() const
    {
        return value;
    }

    /// Pattern 0x7F7F.
    static BFloat16 max()
    {
        uint16_t max = 0x7F7F;
        return BFloat16(max);
    }

    /// Pattern 0x7FC0, the value produced for every NaN input.
    static BFloat16 nan()
    {
        uint16_t nan = 0x7FC0;
        return BFloat16(nan);
    }

    /// Pattern 0x7F80.
    static BFloat16 inf()
    {
        uint16_t infVal = 0x7F80;
        return BFloat16(infVal);
    }

private:
    uint16_t value;
};

/// Writes the value as "<float>(0x<hex bits>)".
/// The stream's format flags are restored afterwards so the hex mode used for
/// the bit pattern does not affect later output by the caller.
inline std::ostream& operator<<(std::ostream& os, const BFloat16& b)
{
    const std::ios_base::fmtflags savedFlags = os.flags();
    os << b.toFloat32() << "(0x" << std::hex << b.val() << ")";
    os.flags(savedFlags);
    return os;
}

} //namespace armnn