From aa162aa6d2287bcc7bfb7b976b3daabc84b62af4 Mon Sep 17 00:00:00 2001 From: Eric Kunze Date: Fri, 12 Apr 2024 16:19:55 -0700 Subject: Switch fp8 to use non-saturating mode when converting Implementations should use non-saturating mode and call CLAMP if saturation is needed. Signed-off-by: Eric Kunze Change-Id: I7a79931552dd6c3ab5fc247a963e3e7ba1e38ae2 --- chapters/introduction.adoc | 3 ++- pseudocode/library/generic_helpers.tosac | 9 --------- pseudocode/library/numeric_conversion_helpers.tosac | 8 ++------ pseudocode/operators/CAST.tosac | 6 +----- 4 files changed, 5 insertions(+), 21 deletions(-) diff --git a/chapters/introduction.adoc b/chapters/introduction.adoc index 64d34e9..0030757 100644 --- a/chapters/introduction.adoc +++ b/chapters/introduction.adoc @@ -254,7 +254,8 @@ Otherwise the result must be within 0.5 ulp of the mathematical result. | <<CAST>> | Result overflows when converting between fp32_t, bf16_t and fp16_t must be set to infinity of the correct sign. + -fp8e4m3_t and fp8e5m2_t must use the saturation mode rules defined in <<OCP-OFP8,OCP-OFP8>> when converting from the wider floating-point types. + +fp8e4m3_t and fp8e5m2_t must use the non-saturating mode defined in <<OCP-OFP8,OCP-OFP8>> when converting from the wider floating-point types. + +If saturation of the fp8 types is desired, a <<CLAMP>> operation with the appropriate parameters should be used before the cast. + Floating-point result underflows must be set to zero of the correct sign. + Cast from floating-point to integer result overflows must be saturated. + Cast from floating-point to integer must be rounded using round to nearest, ties to even, rounding mode. 
+ diff --git a/pseudocode/library/generic_helpers.tosac b/pseudocode/library/generic_helpers.tosac index eabc9b2..6dc2755 100644 --- a/pseudocode/library/generic_helpers.tosac +++ b/pseudocode/library/generic_helpers.tosac @@ -13,15 +13,6 @@ bool_t is_floating_point() { return false; } -bool_t is_saturating_float_type() { - // Saturate for the fp8 formats, all other floats do not saturate - if (type == fp8e4m3_t || type == fp8e5m2_t) { - return true; - } - return false; -} - - int32_t idiv(int32_t input1, int32_t input2) { return input1 / input2; // Integer divide that truncates towards zero } diff --git a/pseudocode/library/numeric_conversion_helpers.tosac b/pseudocode/library/numeric_conversion_helpers.tosac index 0073a66..ae5d9fb 100644 --- a/pseudocode/library/numeric_conversion_helpers.tosac +++ b/pseudocode/library/numeric_conversion_helpers.tosac @@ -13,13 +13,9 @@ int round_to_nearest_int(float_t f); // Converts the input value into floating-point, rounding to the nearest representable value. // Values that are not NaN outside of the representable range of the destination type must be set to infinity of the correct sign. +// If the destination floating point type does not have an infinity representation, values outside of the representable range must be set to NaN. // For the required precision see the section: Main inference precision requirements. -float_t round_to_nearest_float_nonsaturating(in_t f); - -// Converts the input value into floating-point, rounding to the nearest representable normal value. -// Values that are not NaN outside of the representable range must return the maximum representable normal value of the correct sign. -// For the required precision see the section: Main inference precision requirements. -float_t round_to_nearest_float_saturating(in_t f); +float_t round_to_nearest_float(in_t f); // Floating point values are unchanged. 
// For two's complement integer values where out_t has more bits than in_t, replicate the top bit of input for all bits between the top bit of input and the top bit of output. diff --git a/pseudocode/operators/CAST.tosac b/pseudocode/operators/CAST.tosac index b8fdea9..64af40a 100644 --- a/pseudocode/operators/CAST.tosac +++ b/pseudocode/operators/CAST.tosac @@ -17,11 +17,7 @@ for_each(index in shape) { if (in_t == bool_t) { out = (in) ? 1.0 : 0.0; } - if (is_saturating_float_type()) { - out = round_to_nearest_float_saturating(in); - } else { - out = round_to_nearest_float_nonsaturating(in); - } + out = round_to_nearest_float(in); } else { // Conversion to integer cases if (in_t == bool_t) { -- cgit v1.2.1