Switch fp8 to use non-saturating mode when converting

Implementations should use non-saturating mode and call CLAMP if saturation is needed.

Signed-off-by: Eric Kunze <eric.kunze@arm.com>
Change-Id: I7a79931552dd6c3ab5fc247a963e3e7ba1e38ae2
author: Eric Kunze <eric.kunze@arm.com> 2024-04-12 16:19:55 -0700
committer: Eric Kunze <eric.kunze@arm.com> 2024-04-17 23:56:37 +0000
commit: aa162aa6d2287bcc7bfb7b976b3daabc84b62af4 (patch)
tree: 2bcb24fe65343dd6cb43c16dbed518eeb19d3141
parent: 7ad78d37a51f8b333367effe62d596ac89cdcdb5 (diff)
download: specification-aa162aa6d2287bcc7bfb7b976b3daabc84b62af4.tar.gz
4 files changed, 5 insertions, 21 deletions
diff --git a/chapters/introduction.adoc b/chapters/introduction.adoc
index 64d34e9..0030757 100644
--- a/chapters/introduction.adoc
+++ b/chapters/introduction.adoc
@@ -254,7 +254,8 @@ Otherwise the result must be within 0.5 ulp of the mathematical result.
 
 | <<CAST>>
 | Result overflows when converting between fp32_t, bf16_t and fp16_t must be set to infinity of the correct sign. +
-fp8e4m3_t and fp8e5m2_t must use the saturation mode rules defined in <<IEEE-754,IEEE-754>> when converting from the wider floating-point types. +
+fp8e4m3_t and fp8e5m2_t must use the non-saturating mode defined in <<OCP-OFP8,OCP-OFP8>> when converting from the wider floating-point types. +
+If saturation of the fp8 types is desired, a <<CLAMP>> operation with the appropriate parameters should be used before the cast. +
 Floating-point result underflows must be set to zero of the correct sign. +
 Cast from floating-point to integer result overflows must be saturated. +
 Cast from floating-point to integer must be rounded using round to nearest, ties to even, rounding mode. +
diff --git a/pseudocode/library/generic_helpers.tosac b/pseudocode/library/generic_helpers.tosac
index eabc9b2..6dc2755 100644
--- a/pseudocode/library/generic_helpers.tosac
+++ b/pseudocode/library/generic_helpers.tosac
@@ -13,15 +13,6 @@ bool_t is_floating_point<type>() {
     return false;
 }
 
-bool_t is_saturating_float_type<type>() {
-    // Saturate for the fp8 formats, all other floats do not saturate
-    if (type == fp8e4m3_t || type == fp8e5m2_t) {
-        return true;
-    }
-    return false;
-}
-
-
 int32_t idiv(int32_t input1, int32_t input2) {
     return input1 / input2; // Integer divide that truncates towards zero
 }
diff --git a/pseudocode/library/numeric_conversion_helpers.tosac b/pseudocode/library/numeric_conversion_helpers.tosac
index 0073a66..ae5d9fb 100644
--- a/pseudocode/library/numeric_conversion_helpers.tosac
+++ b/pseudocode/library/numeric_conversion_helpers.tosac
@@ -13,13 +13,9 @@ int round_to_nearest_int(float_t f);
 
 // Converts the input value into floating-point, rounding to the nearest representable value.
 // Values that are not NaN outside of the representable range of the destination type must be set to infinity of the correct sign.
+// If the destination floating point type does not have an infinity representation, values outside of the representable range must be set to NaN.
 // For the required precision see the section: Main inference precision requirements.
-float_t round_to_nearest_float_nonsaturating(in_t f);
-
-// Converts the input value into floating-point, rounding to the nearest representable normal value.
-// Values that are not NaN outside of the representable range must return the maximum representable normal value of the correct sign.
-// For the required precision see the section: Main inference precision requirements.
-float_t round_to_nearest_float_saturating(in_t f);
+float_t round_to_nearest_float(in_t f);
 
 // Floating point values are unchanged.
 // For two's complement integer values where out_t has more bits than in_t, replicate the top bit of input for all bits between the top bit of input and the top bit of output.
diff --git a/pseudocode/operators/CAST.tosac b/pseudocode/operators/CAST.tosac
index b8fdea9..64af40a 100644
--- a/pseudocode/operators/CAST.tosac
+++ b/pseudocode/operators/CAST.tosac
@@ -17,11 +17,7 @@ for_each(index in shape) {
         if (in_t == bool_t) {
             out = (in) ? 1.0 : 0.0;
         }
-        if (is_saturating_float_type<out_t>()) {
-            out = round_to_nearest_float_saturating(in);
-        } else {
-            out = round_to_nearest_float_nonsaturating(in);
-        }
+        out = round_to_nearest_float(in);
     } else {
         // Conversion to integer cases
         if (in_t == bool_t) {
author	Eric Kunze <eric.kunze@arm.com>	2024-04-12 16:19:55 -0700
committer	Eric Kunze <eric.kunze@arm.com>	2024-04-17 23:56:37 +0000
commit	aa162aa6d2287bcc7bfb7b976b3daabc84b62af4 (patch)
tree	2bcb24fe65343dd6cb43c16dbed518eeb19d3141
parent	7ad78d37a51f8b333367effe62d596ac89cdcdb5 (diff)
download	specification-aa162aa6d2287bcc7bfb7b976b3daabc84b62af4.tar.gz