pseudocode/library/numeric_conversion_helpers.tosac


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34

//
// This confidential and proprietary software may be used only as
// authorised by a licensing agreement from ARM Limited
// (C) COPYRIGHT 2020-2024 ARM Limited
// ALL RIGHTS RESERVED
// The entire notice above must be reproduced on all authorised
// copies and copies may only be made to the extent permitted
// by a licensing agreement from ARM Limited.

// Converts the floating-point value f to an integer, rounding to the nearest integer value.
// NOTE(review): the handling of ties (values exactly halfway between two integers), NaN,
// and values outside the range of int is not stated here - confirm against the precision section.
// For the required precision see the section: Main inference precision requirements.
int round_to_nearest_int(float_t f);

// Converts the input value f into the floating-point type float_t, rounding to the nearest representable value.
// Non-saturating: any non-NaN value that lies outside the representable range of the destination type
// must be set to infinity of the correct sign.
// NOTE(review): the result for a NaN input is not stated here - confirm.
// For the required precision see the section: Main inference precision requirements.
float_t round_to_nearest_float_nonsaturating(in_t f);

// Converts the input value f into the floating-point type float_t, rounding to the nearest representable normal value.
// Saturating: any non-NaN value that lies outside the representable range must return the maximum
// representable normal value of the correct sign (rather than infinity).
// NOTE(review): the result for a NaN input is not stated here - confirm.
// For the required precision see the section: Main inference precision requirements.
float_t round_to_nearest_float_saturating(in_t f);

// Widens input to out_t while preserving its sign.
// Floating-point values are returned unchanged.
// For two's complement integer values where out_t has more bits than in_t, the top bit of input is
// replicated into all bits between the top bit of input and the top bit of output.
// NOTE(review): the behavior when out_t does not have more bits than in_t is not stated here - confirm.
out_t sign_extend<out_t>(in_t input);

// Widens input to out_t by padding with zero bits.
// Floating-point values are returned unchanged.
// For two's complement integer values where out_t has more bits than in_t, zero bits are inserted
// for all bits between the top bit of input and the top bit of output.
// NOTE(review): the behavior when out_t does not have more bits than in_t is not stated here - confirm.
out_t zero_extend<out_t>(in_t input);

// Narrows input to out_t: the output is the sizeof(out_t) least significant bits of input.
// NOTE(review): this reads sizeof(out_t) as the bit width of out_t (a count of bits, not bytes) - confirm.
// No-op for floating-point types.
// NOTE(review): unlike sign_extend<out_t> and zero_extend<out_t>, this declaration carries no explicit
// <out_t> parameter - confirm whether that is intentional.
out_t truncate(in_t input);