From c237b7e430cddfb6036b3c6474a57f29ef2cdf64 Mon Sep 17 00:00:00 2001
From: Dominic Symes
Date: Wed, 20 Sep 2023 15:08:53 +0100
Subject: Main conformance: clarify floating point accuracy

Clarify how to compare with an error range of a given number of ulp in this specification.
Denormals may be flushed to zero.

Signed-off-by: Dominic Symes
Change-Id: I1158e777030c4770d74f1acff84ab98e4a3420ac
---
 chapters/introduction.adoc | 33 ++++++++----------
 chapters/pseudocode.adoc   | 85 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 99 insertions(+), 19 deletions(-)

diff --git a/chapters/introduction.adoc b/chapters/introduction.adoc
index b72472a..fd9ec25 100644
--- a/chapters/introduction.adoc
+++ b/chapters/introduction.adoc
@@ -204,11 +204,19 @@ A compliant implementation must satisfy the following:

 ===== Main Inference precision requirements

-In a compliant implementation, individual-floating point operations within the graph must meet the following accuracy bounds
-listed in the table below. In the table _ulp_ means unit of the last place.
+In a compliant implementation, individual floating-point operations within the graph must meet the accuracy bounds listed in the following table.
+In the table _ulp_ means unit of the last place.
+The function tosa_reference_check_fp() defines the error range permitted in this specification for a given number of units of the last place.

 NOTE: The error criteria in this section are at an early draft stage and are likely to change during conformance test development.

+Error criteria are specified for a single operator.
+For a sequence of n operators, A[0] to A[n-1], there must be corresponding implementations, I[0] to I[n-1], such that:
+
+* Each I[k] implements A[k] with the same or higher precision data types
+* Each I[k] meets the accuracy defined in this specification for the A[k] precision
+* The accuracy of the sequence A[0] to A[n-1] is no worse than the accuracy of the sequence I[0] to I[n-1]
+
 The following criteria apply to all operations:

 * If any input is a NaN and the result is floating-point then the result must be a NaN
@@ -241,15 +249,14 @@ Floating-point result underflows must be set to zero of the correct sign. +
 Addition of infinities of different signs must produce a NaN. +
 Subtraction of infinities of the same sign must produce a NaN. +
 Multiplication of an infinity by a zero must produce a NaN. +
-Otherwise for fp32_t the result must be rounded to the nearest representable value using the round to nearest, ties to even rounding mode. +
-Otherwise for fp16_t and bf16_t the result must be within 0.5 ulp of the mathematical result.
+Otherwise the result must be within 0.5 ulp of the mathematical result.

 | <> | Floating-point result overflows must be set to infinity of the correct sign. +
 Floating-point result underflows must be set to zero of the correct sign. +
 Cast from floating-point to integer result overflows must be saturated. +
-Otherwise for fp32_t the result must be rounded to the nearest representable value using the round to nearest, ties to even rounding mode. +
-Otherwise for fp16_t and bf16_t the result must be within 0.5 ulp of the mathematical result.
+Cast from floating-point to integer must be rounded using the round to nearest, ties to even rounding mode. +
+Otherwise cast to floating-point must be within 0.5 ulp of the mathematical result.

 | <> | If the input is a zero or the result overflows the output must be an infinity of the same sign. +
@@ -334,18 +341,6 @@ size_t T = tensor_size(output_shape)  // number dot product results
 size_t ksb = (max_value(bias_abs) > 0) ? (KS + 1) : KS; // kernel size and bias
 fp64_t out_err_sum = 0.0;
 fp64_t out_err_sumsq = 0.0;
-fp64_t acc_prec;       // 1<<(M+1) where M is the number of mantissa bits
-fp64_t acc_min_normal; // accumulator minimum normal greater than zero
-fp64_t two_m42 = 1.0/static_cast<fp64_t>((int64_t)1<<42); // pow(2, -42)
-switch (acc_t) {
-    case fp32_t: acc_prec       = static_cast<fp64_t>(1<<24);     // pow(2, 24)
-                 acc_min_normal = two_m42 * two_m42 * two_m42;    // pow(2, -126)
-                 break;
-    case fp16_t: acc_prec       = static_cast<fp64_t>(1<<11);     // pow(2, 11)
-                 acc_min_normal = 1.0/static_cast<fp64_t>(1<<14); // pow(2, -14)
-                 break;
-    default: ERROR_IF(true);
-}
 for_each(index in output_shape) {
     fp64_t out_bnd = tensor_read(output_bnd, output_shape, index);
     fp64_t out_ref = tensor_read(output_ref, output_shape, index);
@@ -358,7 +353,7 @@ for_each(index in output_shape) {
         REQUIRE(out_ref == 0.0 && out_imp == 0.0);
         out_err = 0.0;
     } else { // 0.0 < out_bnd < infinity
-        fp64_t out_err_bnd = max(out_bnd / acc_prec, acc_min_normal);
+        fp64_t out_err_bnd = max(out_bnd * exp2(-1-normal_frac()), normal_min());
         out_err = (static_cast<fp64_t>(out_imp) - out_ref) / out_err_bnd;
         REQUIRE(abs(out_err) <= ksb);
     }
diff --git a/chapters/pseudocode.adoc b/chapters/pseudocode.adoc
index d674c9c..efb50a0 100644
--- a/chapters/pseudocode.adoc
+++ b/chapters/pseudocode.adoc
@@ -412,6 +412,91 @@ out_t bitcast(in_t value)
 }
 ----

+==== Numeric Accuracy Helpers
+
+For a floating-point number of type in_t, a normal value is of the form (1.x * 2^e).
+The fractional part 'x' has a number of fractional (mantissa) bits that depends on the type.
+The exponent 'e' has a normal range that depends on the type.
+The functions below return these ranges according to the type.
+
+[source,c++]
+----
+fp64_t exp2(int n) {
+    REQUIRE(-1022 <= n && n <= 1023);
+    fp64_t v = 1.0;
+    while (n > 0) { v = v*2.0; n--; }
+    while (n < 0) { v = v/2.0; n++; }
+    return v;
+}
+
+int ilog2(fp64_t v) {
+    REQUIRE(0 < v && v < infinity);
+    int n = 0;
+    while (v >= 2.0) { v = v/2.0; n++; }
+    while (v <  1.0) { v = v*2.0; n--; }
+    return n;
+}
+
+fp64_t normal_min() {
+    switch (in_t) {
+        case fp32_t: return exp2(-126);
+        case fp16_t: return exp2( -14);
+    }
+}
+
+fp64_t normal_max() {
+    switch (in_t) {
+        case fp32_t: return exp2(128) - exp2(127-23);
+        case fp16_t: return exp2( 16) - exp2( 15-10);
+    }
+}
+
+// Number of fractional (mantissa) bits
+int normal_frac() {
+    switch (in_t) {
+        case fp32_t: return 23;
+        case fp16_t: return 10;
+    }
+}
+----
+
+The following function checks whether a test value in floating-point format in_t is within an error range of a reference value.
+The function assumes that denormal values may be flushed to zero.
+The permitted error is specified as num_ulp units of the last place; the resulting permitted range is defined by the following function.
+
+[source,c++]
+----
+bool tosa_reference_check_fp(in_t test_value, fp64_t ref_value, fp64_t num_ulp) {
+    if (is_a_NaN(ref_value)) {
+        return is_a_NaN(test_value);
+    }
+    if (ref_value < 0) {
+        ref_value  = -ref_value;
+        test_value = -test_value;
+    }
+    fp64_t ref_min, ref_max;
+    if (ref_value == infinity) {
+        ref_min = infinity;
+        ref_max = infinity;
+    } else if (ref_value == 0) {
+        ref_min = 0;
+        ref_max = 0;
+    } else {
+        int ref_exp = ilog2(ref_value);
+        fp64_t ref_pow2 = max(exp2(ref_exp), normal_min());
+        fp64_t val_ulp  = ref_pow2 * exp2(-normal_frac());
+        ref_max = ref_value + val_ulp * num_ulp;
+        ref_min = ref_value - val_ulp * num_ulp;
+        if (ref_max > normal_max()) ref_max = infinity;
+        if (ref_min > normal_max()) ref_min = infinity;
+        if (ref_max < normal_min()) ref_max = normal_min();
+        if (ref_min < normal_min()) ref_min = 0;
+    }
+    return (static_cast<fp64_t>(test_value) >= ref_min &&
+            static_cast<fp64_t>(test_value) <= ref_max);
+}
+----
+
 ==== Numeric Conversion Helpers

 The following definitions are used in pseudocode to do numeric conversions.
--
cgit v1.2.1
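
As an illustration of how the ulp-based range check above behaves, the following standalone C++ sketch (not part of the patch) mirrors the tosa_reference_check_fp() logic for in_t = fp32_t using the standard library. The function name check_fp32_ulp, the use of std::ilogb and std::ldexp, and the constants taken from <cfloat> are assumptions of this sketch, not part of the specification.

[source,c++]
----
// Standalone illustration only; check_fp32_ulp is a hypothetical helper that
// mirrors tosa_reference_check_fp() for in_t = fp32_t (float).
#include <algorithm>
#include <cfloat>
#include <cmath>
#include <cstdio>
#include <limits>

bool check_fp32_ulp(float test_value, double ref_value, double num_ulp) {
    const double inf        = std::numeric_limits<double>::infinity();
    const double normal_min = FLT_MIN;  // 2^-126, smallest positive normal fp32 value
    const double normal_max = FLT_MAX;  // (2 - 2^-23) * 2^127
    const int    frac_bits  = 23;       // fp32 fractional (mantissa) bits

    if (std::isnan(ref_value)) return std::isnan(test_value);
    if (ref_value < 0) { ref_value = -ref_value; test_value = -test_value; }

    double ref_min, ref_max;
    if (std::isinf(ref_value)) {
        ref_min = ref_max = inf;
    } else if (ref_value == 0.0) {
        ref_min = ref_max = 0.0;
    } else {
        // One ulp of the reference value; references below normal_min use the
        // ulp of normal_min, since denormals may be flushed to zero.
        double ref_pow2 = std::max(std::ldexp(1.0, std::ilogb(ref_value)), normal_min);
        double val_ulp  = std::ldexp(ref_pow2, -frac_bits);
        ref_max = ref_value + val_ulp * num_ulp;
        ref_min = ref_value - val_ulp * num_ulp;
        if (ref_max > normal_max) ref_max = inf;
        if (ref_min > normal_max) ref_min = inf;
        if (ref_max < normal_min) ref_max = normal_min;
        if (ref_min < normal_min) ref_min = 0.0;
    }
    double t = static_cast<double>(test_value);
    return t >= ref_min && t <= ref_max;
}

int main() {
    // 1.0f + FLT_EPSILON is exactly one ulp above 1.0.
    std::printf("%d\n", check_fp32_ulp(1.0f + FLT_EPSILON, 1.0, 1.0)); // 1
    std::printf("%d\n", check_fp32_ulp(1.0f + FLT_EPSILON, 1.0, 0.5)); // 0
    // Reference in the fp32 denormal range: a test value flushed to zero is accepted.
    std::printf("%d\n", check_fp32_ulp(0.0f, FLT_MIN / 4.0, 1.0));     // 1
}
----

The three checks in main() print 1, 0 and 1: a value one ulp from the reference passes a 1 ulp bound but not a 0.5 ulp bound, and a reference in the denormal range accepts a test value that has been flushed to zero.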