From c237b7e430cddfb6036b3c6474a57f29ef2cdf64 Mon Sep 17 00:00:00 2001
From: Dominic Symes
Date: Wed, 20 Sep 2023 15:08:53 +0100
Subject: Main conformance: clarify floating point accuracy

Clarify how to compare with an error range of a given number of ulp in this specification.
Denormals may be flushed to zero.

Signed-off-by: Dominic Symes
Change-Id: I1158e777030c4770d74f1acff84ab98e4a3420ac
---
 chapters/introduction.adoc | 33 ++++++++----------
 chapters/pseudocode.adoc   | 85 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 99 insertions(+), 19 deletions(-)

diff --git a/chapters/introduction.adoc b/chapters/introduction.adoc
index b72472a..fd9ec25 100644
--- a/chapters/introduction.adoc
+++ b/chapters/introduction.adoc
@@ -204,11 +204,19 @@ A compliant implementation must satisfy the following:

 ===== Main Inference precision requirements

-In a compliant implementation, individual-floating point operations within the graph must meet the following accuracy bounds
-listed in the table below. In the table _ulp_ means unit of the last place.
+In a compliant implementation, individual floating-point operations within the graph must meet the accuracy bounds listed in the following table.
+In the table _ulp_ means unit of the last place.
+The function tosa_reference_check_fp() defines the error range permitted in this specification for a given number of units of the last place.

 NOTE: The error criteria in this section are at an early draft stage and are likely to change during conformance test development.

+Error criteria are specified for a single operator.
+For a sequence of n operators, A[0] to A[n-1], there must be corresponding implementations, I[0] to I[n-1], such that:
+
+* Each I[k] implements A[k] with the same or higher precision data types
+* Each I[k] meets the accuracy defined in this specification for the A[k] precision
+* The accuracy of the sequence A[0] to A[n-1] is no worse than the accuracy of the sequence I[0] to I[n-1]
+
 The following criteria apply to all operations:

 * If any input is a NaN and the result is floating-point then the result must be a NaN
@@ -241,15 +249,14 @@ Floating-point result underflows must be set to zero of the correct sign. +
 Addition of infinities of different signs must produce a NaN. +
 Subtraction of infinities of the same sign must produce a NaN. +
 Multiplication of an infinity by a zero must produce a NaN. +
-Otherwise for fp32_t the result must be rounded to the nearest representable value using the round to nearest, ties to even rounding mode. +
-Otherwise for fp16_t and bf16_t the result must be within 0.5 ulp of the mathematical result.
+Otherwise the result must be within 0.5 ulp of the mathematical result.

 | <> | Floating-point result overflows must be set to infinity of the correct sign. +
 Floating-point result underflows must be set to zero of the correct sign. +
 Cast from floating-point to integer result overflows must be saturated. +
-Otherwise for fp32_t the result must be rounded to the nearest representable value using the round to nearest, ties to even rounding mode. +
-Otherwise for fp16_t and bf16_t the result must be within 0.5 ulp of the mathematical result.
+Cast from floating-point to integer must be rounded using the round to nearest, ties to even rounding mode. +
+Otherwise cast to floating-point must be within 0.5 ulp of the mathematical result.

 | <> | If the input is a zero or the result overflows the output must be an infinity of the same sign. +
@@ -334,18 +341,6 @@ size_t T = tensor_size(output_shape)  // number dot product results
 size_t ksb = (max_value(bias_abs) > 0) ? (KS + 1) : KS; // kernel size and bias
 fp64_t out_err_sum = 0.0;
 fp64_t out_err_sumsq = 0.0;
-fp64_t acc_prec;       // 1<<(M+1) where M is the number of mantissa bits
-fp64_t acc_min_normal; // accumulator minimum normal greater than zero
-fp64_t two_m42 = 1.0/static_cast<fp64_t>((int64_t)1<<42); // pow(2, -42)
-switch (acc_t) {
-    case fp32_t: acc_prec       = static_cast<fp64_t>(1<<24);     // pow(2, 24)
-                 acc_min_normal = two_m42 * two_m42 * two_m42;    // pow(2, -126)
-                 break;
-    case fp16_t: acc_prec       = static_cast<fp64_t>(1<<11);     // pow(2, 11)
-                 acc_min_normal = 1.0/static_cast<fp64_t>(1<<14); // pow(2, -14)
-                 break;
-    default: ERROR_IF(true);
-}
 for_each(index in output_shape) {
     fp64_t out_bnd = tensor_read(output_bnd, output_shape, index);
     fp64_t out_ref = tensor_read(output_ref, output_shape, index);
@@ -358,7 +353,7 @@ for_each(index in output_shape) {
         REQUIRE(out_ref == 0.0 && out_imp == 0.0);
         out_err = 0.0;
     } else { // 0.0 < out_bnd < infinity
-        fp64_t out_err_bnd = max(out_bnd / acc_prec, acc_min_normal);
+        fp64_t out_err_bnd = max(out_bnd * exp2(-1-normal_frac()), normal_min());
         out_err = (static_cast<fp64_t>(out_imp) - out_ref) / out_err_bnd;
         REQUIRE(abs(out_err) <= ksb);
     }
diff --git a/chapters/pseudocode.adoc b/chapters/pseudocode.adoc
index d674c9c..efb50a0 100644
--- a/chapters/pseudocode.adoc
+++ b/chapters/pseudocode.adoc
@@ -412,6 +412,91 @@ out_t bitcast(in_t value)
 }
 ----

+==== Numeric Accuracy Helpers
+
+For a floating-point number of type in_t, a normal value is of the form (1.x * 2^e).
+The fractional part 'x' has a number of fractional (mantissa) bits that depends on the type.
+The exponent 'e' has a normal range that depends on the type.
+The functions below return these ranges according to the type.
+
+[source,c++]
+----
+fp64_t exp2(int n) {
+    REQUIRE(-1022 <= n && n <= 1023);
+    fp64_t v = 1.0;
+    while (n > 0) { v = v*2.0; n--; }
+    while (n < 0) { v = v/2.0; n++; }
+    return v;
+}
+
+int ilog2(fp64_t v) {
+    REQUIRE(0 < v && v < infinity);
+    int n = 0;
+    while (v >= 2.0) { v = v/2.0; n++; }
+    while (v <  1.0) { v = v*2.0; n--; }
+    return n;
+}
+
+fp64_t normal_min() {
+    switch (in_t) {
+        case fp32_t: return exp2(-126);
+        case fp16_t: return exp2( -14);
+    }
+}
+
+fp64_t normal_max() {
+    switch (in_t) {
+        case fp32_t: return exp2(128) - exp2(127-23);
+        case fp16_t: return exp2( 16) - exp2( 15-10);
+    }
+}
+
+// Number of fractional (mantissa) bits
+int normal_frac() {
+    switch (in_t) {
+        case fp32_t: return 23;
+        case fp16_t: return 10;
+    }
+}
+----
+
+The following function checks whether a test value in floating-point format in_t is within an error range of a reference value.
+The function assumes that denormal values may be flushed to zero.
+The permitted error is specified as num_ulp units of the last place; the resulting permitted range is defined by the following function.
+
+[source,c++]
+----
+bool tosa_reference_check_fp(in_t test_value, fp64_t ref_value, fp64_t num_ulp) {
+    if (is_a_NaN(ref_value)) {
+        return is_a_NaN(test_value);
+    }
+    if (ref_value < 0) {
+        ref_value  = -ref_value;
+        test_value = -test_value;
+    }
+    fp64_t ref_min, ref_max;
+    if (ref_value == infinity) {
+        ref_min = infinity;
+        ref_max = infinity;
+    } else if (ref_value == 0) {
+        ref_min = 0;
+        ref_max = 0;
+    } else {
+        int ref_exp = ilog2(ref_value);
+        fp64_t ref_pow2 = max(exp2(ref_exp), normal_min());
+        fp64_t val_ulp  = ref_pow2 * exp2(-normal_frac());
+        ref_max = ref_value + val_ulp * num_ulp;
+        ref_min = ref_value - val_ulp * num_ulp;
+        if (ref_max > normal_max()) ref_max = infinity;
+        if (ref_min > normal_max()) ref_min = infinity;
+        if (ref_max < normal_min()) ref_max = normal_min();
+        if (ref_min < normal_min()) ref_min = 0;
+    }
+    return (static_cast<fp64_t>(test_value) >= ref_min &&
+            static_cast<fp64_t>(test_value) <= ref_max);
+}
+----
+
 ==== Numeric Conversion Helpers

 The following definitions are used in pseudocode to do numeric conversions.
--
cgit v1.2.1
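
As an illustration of how the ulp-based range check above behaves, the following standalone C++ sketch (not part of the patch) mirrors the tosa_reference_check_fp() logic for in_t = fp32_t using the standard library. The function name check_fp32_ulp, the use of std::ilogb and std::ldexp, and the constants taken from <cfloat> are assumptions of this sketch, not part of the specification.

[source,c++]
----
// Standalone illustration only; check_fp32_ulp is a hypothetical helper that
// mirrors tosa_reference_check_fp() for in_t = fp32_t (float).
#include <algorithm>
#include <cfloat>
#include <cmath>
#include <cstdio>
#include <limits>

bool check_fp32_ulp(float test_value, double ref_value, double num_ulp) {
    const double inf        = std::numeric_limits<double>::infinity();
    const double normal_min = FLT_MIN;  // 2^-126, smallest positive normal fp32 value
    const double normal_max = FLT_MAX;  // (2 - 2^-23) * 2^127
    const int    frac_bits  = 23;       // fp32 fractional (mantissa) bits

    if (std::isnan(ref_value)) return std::isnan(test_value);
    if (ref_value < 0) { ref_value = -ref_value; test_value = -test_value; }

    double ref_min, ref_max;
    if (std::isinf(ref_value)) {
        ref_min = ref_max = inf;
    } else if (ref_value == 0.0) {
        ref_min = ref_max = 0.0;
    } else {
        // One ulp of the reference value; references below normal_min use the
        // ulp of normal_min, since denormals may be flushed to zero.
        double ref_pow2 = std::max(std::ldexp(1.0, std::ilogb(ref_value)), normal_min);
        double val_ulp  = std::ldexp(ref_pow2, -frac_bits);
        ref_max = ref_value + val_ulp * num_ulp;
        ref_min = ref_value - val_ulp * num_ulp;
        if (ref_max > normal_max) ref_max = inf;
        if (ref_min > normal_max) ref_min = inf;
        if (ref_max < normal_min) ref_max = normal_min;
        if (ref_min < normal_min) ref_min = 0.0;
    }
    double t = static_cast<double>(test_value);
    return t >= ref_min && t <= ref_max;
}

int main() {
    // 1.0f + FLT_EPSILON is exactly one ulp above 1.0.
    std::printf("%d\n", check_fp32_ulp(1.0f + FLT_EPSILON, 1.0, 1.0)); // 1
    std::printf("%d\n", check_fp32_ulp(1.0f + FLT_EPSILON, 1.0, 0.5)); // 0
    // Reference in the fp32 denormal range: a test value flushed to zero is accepted.
    std::printf("%d\n", check_fp32_ulp(0.0f, FLT_MIN / 4.0, 1.0));     // 1
}
----

The three checks in main() print 1, 0 and 1: a value one ulp from the reference passes a 1 ulp bound but not a 0.5 ulp bound, and a reference in the denormal range accepts a test value that has been flushed to zero.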