aboutsummaryrefslogtreecommitdiff
path: root/chapters/introduction.adoc
diff options
context:
space:
mode:
Diffstat (limited to 'chapters/introduction.adoc')
-rw-r--r--chapters/introduction.adoc53
1 files changed, 40 insertions, 13 deletions
diff --git a/chapters/introduction.adoc b/chapters/introduction.adoc
index d6f7bf9..26fef0e 100644
--- a/chapters/introduction.adoc
+++ b/chapters/introduction.adoc
@@ -338,11 +338,11 @@ fp64_t acc_prec; // 1<<(M+1) where M is the number of mantissa bits
fp64_t acc_min_normal; // accumulator minimum normal greater than zero
fp64_t two_m63 = -1.0/(fp64_t)((int64_t)-1<<63); // pow(2, -63)
switch (acc_t) {
- case fp32_t: acc_prec = (fp64_t)(1<<24); // pow(2, 24)
+ case fp32_t: acc_prec = static_cast<fp64_t>(1<<24); // pow(2, 24)
acc_min_normal = two_m63 * two_m63; // pow(2, -126)
break;
- case fp16_t: acc_prec = (fp64_t)(1<<11); // pow(2, 11)
- acc_min_normal = 1.0/(fp64_t)(1<<14); // pow(2, -14)
+ case fp16_t: acc_prec = static_cast<fp64_t>(1<<11); // pow(2, 11)
+ acc_min_normal = 1.0/static_cast<fp64_t>(1<<14); // pow(2, -14)
break;
default: ERROR_IF(true);
}
@@ -359,7 +359,7 @@ for_each(index in output_shape) {
out_err = 0.0;
} else { // 0.0 < out_bnd < infinity
out_bnd = max(out_bnd, acc_min_normal);
- out_err = ((fp64_t)out_imp - out_ref)*acc_prec/out_bnd;
+ out_err = (static_cast<fp64_t>(out_imp) - out_ref) * acc_prec / out_bnd;
REQUIRE(abs(out_err) <= ksb);
}
out_err_sum += out_err;
@@ -457,11 +457,21 @@ The number formats supported by a given operator are listed in its table of supp
| -
|Boolean value. Size implementation defined. The TOSA reference model implements this as int8_t with 0 for false and 1 for true. All non-zero values are accepted on input as true.
+|i4_t
+| -
+| -
|Signless 4-bit integer type. Will be interpreted as int4_t by all operators.
+
|int4_t
| -7
| +7
|Signed 4-bit two's-complement value. Excludes -8 to maintain a symmetric about zero range for weights.
+|i8_t
+| -
+| -
+|Signless 8-bit integer value. Will be interpreted as int8_t unless otherwise specified by an operator.
+
|int8_t
| -128
| +127
@@ -470,7 +480,12 @@ The number formats supported by a given operator are listed in its table of supp
|uint8_t
| 0
| 255
-|Unsigned 8-bit value.
+|Unsigned 8-bit integer value.
+
+|i16_t
+| -
+| -
+|Signless 16-bit integer type. Will be interpreted as int16_t unless otherwise specified by an operator.
|int16_t
| -32768
@@ -482,11 +497,21 @@ The number formats supported by a given operator are listed in its table of supp
| 65535
|Unsigned 16-bit value.
+|i32_t
+| -
+| -
+|Signless 32-bit integer value. Will be interpreted as int32_t by all operators.
+
|int32_t
| -(1<<31)
| (1<<31)-1
|Signed 32-bit two's-complement value.
+|i48_t
+| -
+| -
|Signless 48-bit integer value. Will be interpreted as int48_t by all operators.
+
|int48_t
| -(1<<47)
| (1<<47)-1
@@ -542,7 +567,9 @@ This ensures that a Base Inference profile TOSA implementation can calculate the
=== Integer Behavior
-Integer calculations must be standard two's-complement or unsigned calculations.
+TOSA integer inputs and outputs are specified by signless values with the given number of bits.
+Unless otherwise specified, these values will be interpreted as signed two's-complement.
+The pseudocode will use int*_t to indicate use as a signed value and uint*_t to indicate use as an unsigned value.
If overflow occurs doing integer calculation, the result is unpredictable, as indicated by the REQUIRE checks in the pseudocode for the operators.
Unsigned 8 and 16-bit values are only allowed in the RESCALE operation, to allow for compatibility with networks which expect unsigned 8-bit or 16-bit tensors for input and output.
@@ -598,20 +625,20 @@ int32_t apply_scale_32(int32_t value, int32_t multiplier, int8_t shift, bool_t d
if (shift > 31 && value >= 0) round += 1<<30;
if (shift > 31 && value < 0) round -= 1<<30;
}
- int64_t result = (int64_t)value * multiplier + round;
+ int64_t result = static_cast<int64_t>(value) * multiplier + round;
result = result >> shift;
// result will fit a 32-bit range due to the REQUIRE on value
- return (int32_t)result;
+ return static_cast<int32_t>(result);
}
int32_t apply_scale_16(int48_t value, int16_t multiplier, int8_t shift) {
REQUIRE(multiplier >= 0);
REQUIRE(2 <= shift && shift <= 62);
int64_t round = (1 << (shift - 1));
- int64_t result = (int64_t)value * multiplier + round;
+ int64_t result = static_cast<int64_t>(value) * multiplier + round;
result = result >> shift;
REQUIRE(result >= minimum<int32_t> && result <= maximum<int32_t>);
- return (int32_t)result;
+ return static_cast<int32_t>(result);
}
----
@@ -665,9 +692,9 @@ All table lookups are based on the following reference lookup function that take
[source,c++]
----
-int32_t apply_lookup(int16_t *table, int32_t value)
+int32_t apply_lookup_s(int16_t *table, int32_t value)
{
- int16_t clipped_value = (int16_t)apply_clip<int32_t>(value, -32768, +32767);
+ int16_t clipped_value = static_cast<int16_t>(apply_clip_s<int32_t>(value, -32768, +32767));
int32_t index = (clipped_value + 32768) >> 7;
int32_t fraction = clipped_value & 0x7f;
int16_t base = table[index];
@@ -688,7 +715,7 @@ void generate_lookup_table(int16_t *table, int32_t (*reference)(int32_t))
{
for (int i = -256; i <= 256; i++) {
int32_t value = (*reference)(i);
- table[i + 256] = (int16_t)apply_clip<int32_t>(value, -32768, +32767)
+ table[i + 256] = static_cast<int16_t>(apply_clip<int32_t>(value, -32768, +32767));
}
}
----