aboutsummaryrefslogtreecommitdiff
path: root/chapters/introduction.adoc
diff options
context:
space:
mode:
Diffstat (limited to 'chapters/introduction.adoc')
-rw-r--r--chapters/introduction.adoc33
1 files changed, 14 insertions, 19 deletions
diff --git a/chapters/introduction.adoc b/chapters/introduction.adoc
index b72472a..fd9ec25 100644
--- a/chapters/introduction.adoc
+++ b/chapters/introduction.adoc
@@ -204,11 +204,19 @@ A compliant implementation must satisfy the following:
===== Main Inference precision requirements
-In a compliant implementation, individual-floating point operations within the graph must meet the following accuracy bounds
-listed in the table below. In the table _ulp_ means unit of the last place.
+In a compliant implementation, individual floating-point operations within the graph must meet the accuracy bounds listed in the following table.
+In the table _ulp_ means unit of the last place.
+The function tosa_reference_check_fp() defines the error range permitted by a given number of units of last place in this specification.
NOTE: The error criteria in this section are at an early draft stage and are likely to change during conformance test development.
+Error criteria are specified for a single operator.
+For a sequence of n operators, A[0] to A[n-1], there must be corresponding implementations, I[0] to I[n-1], such that:
+
+* Each I[k] implements A[k] with the same or higher precision datatypes
+* Each I[k] meets the accuracy defined in this specification for the A[k] precision
+* The accuracy of the sequence A[0] to A[n-1] is no worse than the accuracy of the sequence I[0] to I[n-1]
+
The following criteria apply to all operations:
* If any input is a NaN and the result is floating-point then the result must be a NaN
@@ -241,15 +249,14 @@ Floating-point result underflows must be set to zero of the correct sign. +
Addition of infinities of different signs must produce a NaN. +
Subtraction of infinities of the same sign must produce a NaN. +
Multiplication of an infinity by a zero must produce a NaN. +
-Otherwise for fp32_t the result must be rounded to the nearest representable value using the round to nearest, ties to even rounding mode. +
-Otherwise for fp16_t and bf16_t the result must be within 0.5 ulp of the mathematical result.
+Otherwise the result must be within 0.5 ulp of the mathematical result.
| <<CAST>>
| Floating-point result overflows must be set to infinity of the correct sign. +
Floating-point result underflows must be set to zero of the correct sign. +
Cast from floating-point to integer result overflows must be saturated. +
-Otherwise for fp32_t the result must be rounded to the nearest representable value using the round to nearest, ties to even rounding mode. +
-Otherwise for fp16_t and bf16_t the result must be within 0.5 ulp of the mathematical result.
+Cast from floating-point to integer must be rounded using the round to nearest, ties to even rounding mode. +
+Otherwise cast to floating-point must be within 0.5 ulp of the mathematical result.
| <<RECIPROCAL>>
| If the input is a zero or the result overflows, the output must be an infinity of the same sign. +
@@ -334,18 +341,6 @@ size_t T = tensor_size(output_shape) // number dot product results
size_t ksb = (max_value(bias_abs) > 0) ? (KS + 1) : KS; // kernel size and bias
fp64_t out_err_sum = 0.0;
fp64_t out_err_sumsq = 0.0;
-fp64_t acc_prec; // 1<<(M+1) where M is the number of mantissa bits
-fp64_t acc_min_normal; // accumulator minimum normal greater than zero
-fp64_t two_m42 = 1.0/static_cast<fp64_t>((int64_t)1<<42); // pow(2, -42)
-switch (acc_t) {
- case fp32_t: acc_prec = static_cast<fp64_t>(1<<24); // pow(2, 24)
- acc_min_normal = two_m42 * two_m42 * two_m42; // pow(2, -126)
- break;
- case fp16_t: acc_prec = static_cast<fp64_t>(1<<11); // pow(2, 11)
- acc_min_normal = 1.0/static_cast<fp64_t>(1<<14); // pow(2, -14)
- break;
- default: ERROR_IF(true);
-}
for_each(index in output_shape) {
fp64_t out_bnd = tensor_read<fp64_t>(output_bnd, output_shape, index);
fp64_t out_ref = tensor_read<fp64_t>(output_ref, output_shape, index);
@@ -358,7 +353,7 @@ for_each(index in output_shape) {
REQUIRE(out_ref == 0.0 && out_imp == 0.0);
out_err = 0.0;
} else { // 0.0 < out_bnd < infinity
- fp64_t out_err_bnd = max(out_bnd / acc_prec, acc_min_normal);
+ fp64_t out_err_bnd = max(out_bnd * exp2(-1-normal_frac<acc_t>()), normal_min<acc_t>());
out_err = (static_cast<fp64_t>(out_imp) - out_ref) / out_err_bnd;
REQUIRE(abs(out_err) <= ksb);
}