diff options
Diffstat (limited to 'chapters/introduction.adoc')
-rw-r--r-- | chapters/introduction.adoc | 53 |
1 files changed, 40 insertions, 13 deletions
diff --git a/chapters/introduction.adoc b/chapters/introduction.adoc index d6f7bf9..26fef0e 100644 --- a/chapters/introduction.adoc +++ b/chapters/introduction.adoc @@ -338,11 +338,11 @@ fp64_t acc_prec; // 1<<(M+1) where M is the number of mantissa bits fp64_t acc_min_normal; // accumulator minimum normal greater than zero fp64_t two_m63 = -1.0/(fp64)((int64_t)-1<<63); // pow(2, -63) switch (acc_t) { - case fp32_t: acc_prec = (fp64_t)(1<<24); // pow(2, 24) + case fp32_t: acc_prec = static_cast<fp64_t>(1<<24); // pow(2, 24) acc_min_normal = two_m63 * two_m63; // pow(2, -126) break; - case fp16_t: acc_prec = (fp64_t)(1<<11); // pow(2, 11) - acc_min_normal = 1.0/(fp64_t)(1<<14); // pow(2, -14) + case fp16_t: acc_prec = static_cast<fp64_t>(1<<11); // pow(2, 11) + acc_min_normal = 1.0/static_cast<fp64_t>(1<<14); // pow(2, -14) break; default: ERROR_IF(true); } @@ -359,7 +359,7 @@ for_each(index in output_shape) { out_err = 0.0; } else { // 0.0 < out_bnd < infinity out_bnd = max(out_bnd, acc_min_normal); - out_err = ((fp64_t)out_imp - out_ref)*acc_prec/out_bnd; + out_err = (static_cast<fp64_t>(out_imp) - out_ref) * acc_prec / out_bnd; REQUIRE(abs(out_err) <= ksb); } out_err_sum += out_err; @@ -457,11 +457,21 @@ The number formats supported by a given operator are listed in its table of supp | - |Boolean value. Size implementation defined. The TOSA reference model implements this as int8_t with 0 for false and 1 for true. All non-zero values are accepted on input as true. +|i4_t +| - +| - +|Signless 4-bit integer type. Will be interpreted as int4_t by all operators. + |int4_t | -7 | +7 |Signed 4-bit two's-complement value. Excludes -8 to maintain a symmetric about zero range for weights. +|i8_t +| - +| - +|Signless 8-bit integer value. Will be interpreted as int8_t unless otherwise specified by an operator. 
+ |int8_t | -128 | +127 @@ -470,7 +480,12 @@ The number formats supported by a given operator are listed in its table of supp |uint8_t | 0 | 255 -|Unsigned 8-bit value. +|Unsigned 8-bit integer value. + +|i16_t +| - +| - +|Signless 16-bit integer type. Will be interpreted as int16_t unless otherwise specified by an operator. |int16_t | -32768 @@ -482,11 +497,21 @@ The number formats supported by a given operator are listed in its table of supp | 65535 |Unsigned 16-bit value. +|i32_t +| - +| - +|Signless 32-bit integer value. Will be interpreted as int32_t by all operators. + |int32_t | -(1<<31) | (1<<31)-1 |Signed 32-bit two's-complement value. +|i48_t +| - +| - +|Signless 48-bit integer value. Will be interpreted as int48_t by all operators. + |int48_t | -(1<<47) | (1<<47)-1 @@ -542,7 +567,9 @@ This ensures that a Base Inference profile TOSA implementation can calculate the === Integer Behavior -Integer calculations must be standard two's-complement or unsigned calculations. +TOSA integer inputs and outputs are specified by signless values with the given number of bits. +Unless otherwise specified, these values will be interpreted as signed two's-complement. +The pseudocode will use int*_t to indicate use as a signed value and uint*_t to indicate use as an unsigned value. If overflow occurs doing integer calculation, the result is unpredictable, as indicated by the REQUIRE checks in the pseudocode for the operators. Unsigned 8 and 16-bit values are only allowed in the RESCALE operation, to allow for compatibility with networks which expect unsigned 8-bit or 16-bit tensors for input and output. 
@@ -598,20 +625,20 @@ int32_t apply_scale_32(int32_t value, int32_t multiplier, int8_t shift, bool_t d if (shift > 31 && value >= 0) round += 1<<30; if (shift > 31 && value < 0) round -= 1<<30; } - int64_t result = (int64_t)value * multiplier + round; + int64_t result = static_cast<int64_t>(value) * multiplier + round; result = result >> shift; // result will fit a 32-bit range due to the REQUIRE on value - return (int32_t)result; + return static_cast<int32_t>(result); } int32_t apply_scale_16(int48_t value, int16_t multipler, int8_t shift) { REQUIRE(multiplier >= 0); REQUIRE(2 <= shift && shift <= 62); int64_t round = (1 << (shift - 1)); - int64_t result = (int64_t)value * multiplier + round; + int64_t result = static_cast<int64_t>(value) * multiplier + round; result = result >> shift; REQUIRE(result >= minimum<int32_t> && result <= maximum<int32_t>); - return (int32_t)result; + return static_cast<int32_t>(result); } ---- @@ -665,9 +692,9 @@ All table lookups are based on the following reference lookup function that take [source,c++] ---- -int32_t apply_lookup(int16_t *table, int32_t value) +int32_t apply_lookup_s(int16_t *table, int32_t value) { - int16_t clipped_value = (int16_t)apply_clip<int32_t>(value, -32768, +32767); + int16_t clipped_value = static_cast<int16_t>(apply_clip_s<int32_t>(value, -32768, +32767)); int32_t index = (clipped_value + 32768) >> 7; int32_t fraction = clipped_value & 0x7f; int16_t base = table[index]; @@ -688,7 +715,7 @@ void generate_lookup_table(int16_t *table, int32_t (*reference)(int32_t)) { for (int i = -256; i <= 256; i++) { int32_t value = (*reference)(i); - table[i + 256] = (int16_t)apply_clip<int32_t>(value, -32768, +32767) + table[i + 256] = static_cast<int16_t>(apply_clip<int32_t>(value, -32768, +32767)); } } ---- |