pseudocode/library/numeric_accuracy_helpers.tosac


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72

//
// This confidential and proprietary software may be used only as
// authorised by a licensing agreement from ARM Limited
// (C) COPYRIGHT 2020-2024 ARM Limited
// ALL RIGHTS RESERVED
// The entire notice above must be reproduced on all authorised
// copies and copies may only be made to the extent permitted
// by a licensing agreement from ARM Limited.

// Returns 2 raised to the integer power n as an fp64_t.
// The value is built from 1.0 by doubling (n > 0) or halving (n < 0)
// once per unit of |n|.
//   n < -1075 : returns 0.0 (smaller than the smallest denormal)
//   n > 1023  : not permitted; checked with REQUIRE
fp64_t exp2(int n) {
    // Far below the denormal range: no need to iterate.
    if (n < -1075) {
        return 0.0;
    }
    REQUIRE(n <= 1023);

    fp64_t result = 1.0;
    if (n >= 0) {
        for (int step = 0; step < n; step++) {
            result = result * 2.0;
        }
    } else {
        for (int step = 0; step > n; step--) {
            result = result / 2.0;
        }
    }
    return result;
}

// Returns the integer n such that exp2(n) <= v < exp2(n + 1), i.e. the
// floor of the base-2 logarithm of v.
// v must be strictly positive and finite; this is checked with REQUIRE.
// The value is scaled by factors of two until it lies in [1.0, 2.0), and
// n counts the net number of halvings.
int ilog2(fp64_t v) {
    REQUIRE(0 < v && v < infinity);
    int n = 0;
    // Values of 2.0 or more: halve until below 2.0, counting up.
    while (v >= 2.0) { v = v/2.0; n++; }
    // Values below 1.0: double until at least 1.0, counting down.
    while (v <  1.0) { v = v*2.0; n--; }
    return n;
}

// Returns, as an fp64_t, the minimum normal value of the floating-point
// type in_t. Each result is a power of two produced by exp2().
// Types that share the same minimum exponent are grouped together.
fp64_t normal_min<in_t>() {
  switch (in_t) {
    case fp32_t:
    case bf16_t:
      return exp2(-126);
    case fp16_t:
    case fp8e5m2_t:
      return exp2(-14);
    case fp8e4m3_t:
      return exp2(-6);
  }
}

// Returns, as an fp64_t, the maximum normal value of the floating-point
// type in_t. Each result is formed as a power of two just above the type's
// range minus one smaller power of two.
// NOTE(review): fp8e4m3_t subtracts exp2(8-2) although normal_frac reports
// 3 fraction bits for it; presumably the top mantissa code at the largest
// exponent is reserved - confirm against the format definition.
fp64_t normal_max<in_t>() {
  switch (in_t) {
    case fp32_t: {
      fp64_t above_range = exp2(128);
      fp64_t last_step = exp2(127-23);
      return above_range - last_step;
    }
    case bf16_t: {
      fp64_t above_range = exp2(128);
      fp64_t last_step = exp2(127- 7);
      return above_range - last_step;
    }
    case fp16_t: {
      fp64_t above_range = exp2( 16);
      fp64_t last_step = exp2( 15-10);
      return above_range - last_step;
    }
    case fp8e4m3_t: {
      fp64_t above_range = exp2( 9);
      fp64_t last_step = exp2( 8-2);
      return above_range - last_step;
    }
    case fp8e5m2_t: {
      fp64_t above_range = exp2( 16);
      fp64_t last_step = exp2( 15-2);
      return above_range - last_step;
    }
  }
}

// Returns the number of fractional (mantissa) bits of the floating-point
// type in_t.
int normal_frac<in_t> () {
  switch (in_t) {
    case fp32_t:
      return 23;
    case fp16_t:
      return 10;
    case bf16_t:
      return 7;
    case fp8e4m3_t:
      return 3;
    case fp8e5m2_t:
      return 2;
  }
}

// Computes an absolute error bound for a value of type in_t.
//
//   bound_magnitude : magnitude term; only its absolute value is used
//   bounds_value    : factor multiplied with abs(bound_magnitude)
//   lower_bound     : when greater than zero, the minimum allowed value for
//                     the product above before scaling
//   normal_divisor  : divides the negated fraction-bit count of in_t to form
//                     the exponent of the power-of-two scale factor
//
// Returns exp2(-normal_frac<in_t>() / normal_divisor) multiplied by the
// (possibly clamped) product, or 0.0 when bounds_value is not finite and
// bound_magnitude is zero.
double calcAbsErrorBound<in_t>(double bound_magnitude, double bounds_value,
                               double lower_bound, double normal_divisor) {
    double error_bound = 0.0;
    // Avoid cases where we generate an error_bound of NaN by multiplying inf * 0
    if (is_finite(bounds_value) || abs(bound_magnitude) != 0.0) {
        double value_bound = abs(bound_magnitude) * bounds_value;
        if (lower_bound > 0) {
            value_bound = max(lower_bound, value_bound);
        }
        // normal_frac is a function and must be called to obtain the bit count.
        // NOTE(review): exp2 as declared in this file takes an int, while this
        // exponent is a double quotient - confirm the intended rounding when
        // normal_divisor does not evenly divide the fraction-bit count.
        error_bound = exp2(-normal_frac<in_t>() / normal_divisor) * value_bound;
    }
    return error_bound;
}