// // This confidential and proprietary software may be used only as // authorised by a licensing agreement from ARM Limited // (C) COPYRIGHT 2020-2021 ARM Limited // ALL RIGHTS RESERVED // The entire notice above must be reproduced on all authorised // copies and copies may only be made to the extent permitted // by a licensing agreement from ARM Limited. === Elementwise Binary Operators ==== ADD Elementwise addition of input1 and input2. Axis of size 1 will be broadcast, as necessary. Rank of input tensors must match. *Arguments:* |=== |Argument|Type|Name|Shape|Description |Input|in_t*|input1|shape1|Input tensor |Input|in_t*|input2|shape2|Input tensor with the same rank as input1 |Output|in_t*|output|shape|Output tensor with broadcast shape if necessary |=== *Operation Function:* [source,c++] ---- for_each(index in shape) { index1 = apply_broadcast(shape, shape1, index); index2 = apply_broadcast(shape, shape2, index); in_t value1 = tensor_read(input1, shape1, index1); in_t value2 = tensor_read(input2, shape2, index2); in_t acc = apply_add(value1, value2); tensor_write(output, shape, index, acc); } ---- *Supported Data Types:* |=== |Profile|Mode|in_t |Any|signed 32|int32_t |MI, MT|floating-point|float_t |=== ==== ARITHMETIC_RIGHT_SHIFT Elementwise arithmetic right shift of input1 by the amount specified in input2. Axis of size 1 will be broadcast, as necessary. Rank of input tensors must match. 
*Arguments:* |=== |Argument|Type|Name|Shape|Description |Input|in_t*|input1|shape1|Input tensor |Input|in_t*|input2|shape2|Input tensor with the same rank as input1 |Input|bool_t |round |- | If true then the shift is rounded |Output|in_t*|output|shape|Output tensor with broadcast shape if necessary |=== *Operation Function:* [source,c++] ---- for_each(index in shape) { index1 = apply_broadcast(shape, shape1, index); index2 = apply_broadcast(shape, shape2, index); in_t value1 = tensor_read(input1, shape1, index1); in_t value2 = tensor_read(input2, shape2, index2); assert(0 <= value2 && value2 <= 31); in_t acc = value1 >> value2; if (round == true && value2 > 0 && ((value1 >> (value2 - 1)) & 1) != 0) { acc = acc + 1; } acc = apply_clip(acc, minimum, maximum); tensor_write(output, shape, index, acc); } ---- *Supported Data Types:* |=== |Profile|Mode|in_t |Any|signed 8|int8_t |Any|signed 16|int16_t |Any|signed 32|int32_t |=== ==== BITWISE_AND Elementwise bitwise AND of input1 and input2. Axis of size 1 will be broadcast as necessary. Rank of input tensors must match. *Arguments:* |=== |Argument|Type|Name|Shape|Description |Input|in_t*|input1|shape1|Input tensor |Input|in_t*|input2|shape2|Input tensor with the same rank as input1 |Output|in_t*|output|shape|Output tensor of same type as the input tensors, with broadcast shape if necessary |=== *Operation Function:* [source,c++] ---- for_each(index in shape) { index1 = apply_broadcast(shape, shape1, index); index2 = apply_broadcast(shape, shape2, index); in_t value1 = tensor_read(input1, shape1, index1); in_t value2 = tensor_read(input2, shape2, index2); in_t acc = value1 & value2; tensor_write(output, shape, index, acc); } ---- *Supported Data Types:* |=== |Profile|Mode|in_t |Any|signed 8|int8_t |Any|signed 16|int16_t |Any|signed 32|int32_t |=== ==== BITWISE_OR Elementwise bitwise OR of input1 and input2. Axis of size 1 will be broadcast as necessary. Rank of input tensors must match. 
*Arguments:* |=== |Argument|Type|Name|Shape|Description |Input|in_t*|input1|shape1|Input tensor |Input|in_t*|input2|shape2|Input tensor with the same rank as input1 |Output|in_t*|output|shape|Output tensor with broadcast shape if necessary |=== *Operation Function:* [source,c++] ---- for_each(index in shape) { index1 = apply_broadcast(shape, shape1, index); index2 = apply_broadcast(shape, shape2, index); in_t value1 = tensor_read(input1, shape1, index1); in_t value2 = tensor_read(input2, shape2, index2); in_t acc = value1 | value2; tensor_write(output, shape, index, acc); } ---- *Supported Data Types:* |=== |Profile|Mode|in_t |Any|signed 8|int8_t |Any|signed 16|int16_t |Any|signed 32|int32_t |=== ==== BITWISE_XOR Elementwise bitwise XOR of input1 and input2. Axis of size 1 will be broadcast as necessary. Rank of input tensors must match. *Arguments:* |=== |Argument|Type|Name|Shape|Description |Input|in_t*|input1|shape1|Input tensor |Input|in_t*|input2|shape2|Input tensor with the same rank as input1 |Output|in_t*|output|shape|Output tensor with broadcast shape if necessary |=== *Operation Function:* [source,c++] ---- for_each(index in shape) { index1 = apply_broadcast(shape, shape1, index); index2 = apply_broadcast(shape, shape2, index); in_t value1 = tensor_read(input1, shape1, index1); in_t value2 = tensor_read(input2, shape2, index2); in_t acc = value1 ^ value2; tensor_write(output, shape, index, acc); } ---- *Supported Data Types:* |=== |Profile|Mode|in_t |Any|signed 8|int8_t |Any|signed 16|int16_t |Any|signed 32|int32_t |=== ==== LOGICAL_AND Elementwise logical AND of input1 and input2. Axis of size 1 will be broadcast, as necessary. Rank of input tensors must match. 
*Arguments:* |=== |Argument|Type|Name|Shape|Description |Input|in_t*|input1|shape1|Input tensor |Input|in_t*|input2|shape2|Input tensor with the same rank as input1 |Output|in_t*|output|shape|Output tensor with broadcast shape if necessary |=== *Quantization Parameters:* None *Operation Function:* [source,c++] ---- for_each(index in shape) { index1 = apply_broadcast(shape, shape1, index); index2 = apply_broadcast(shape, shape2, index); in_t value1 = tensor_read(input1, shape1, index1); in_t value2 = tensor_read(input2, shape2, index2); in_t acc = value1 && value2; tensor_write(output, shape, index, acc); } ---- *Supported Data Types:* |=== |Profile|Mode|in_t |Any|Bool|bool_t |=== ==== LOGICAL_LEFT_SHIFT Elementwise left shift of input1 and input2. Axis of size 1 will be broadcast, as necessary. Rank of input tensors must match. *Arguments:* |=== |Argument|Type|Name|Shape|Description |Input|in_t*|input1|shape1|Input tensor |Input|in_t*|input2|shape2|Input tensor with the same rank as input1 |Output|in_t*|output|shape|Output tensor with broadcast shape if necessary |=== *Operation Function:* [source,c++] ---- for_each(index in shape) { index1 = apply_broadcast(shape, shape1, index); index2 = apply_broadcast(shape, shape2, index); in_t value1 = tensor_read(input1, shape1, index1); in_t value2 = tensor_read(input2, shape2, index2); assert(0 <= value2 && value2 <= 31); in_t acc = value1 << value2; tensor_write(output, shape, index, acc); } ---- *Supported Data Types:* |=== |Profile|Mode|in_t |Any|signed 8|int8_t |Any|signed 16|int16_t |Any|signed 32|int32_t |=== ==== LOGICAL_RIGHT_SHIFT Elementwise logical right shift of input1 by the amount specified in input2. Axis of size 1 will be broadcast, as necessary. Rank of input tensors must match. 
*Arguments:* |=== |Argument|Type|Name|Shape|Description |Input|in_t*|input1|shape1|Input tensor |Input|in_t*|input2|shape2|Input tensor with the same rank as input1 |Output|in_t*|output|shape|Output tensor with broadcast shape if necessary |=== *Operation Function:* [source,c++] ---- for_each(index in shape) { index1 = apply_broadcast(shape, shape1, index); index2 = apply_broadcast(shape, shape2, index); in_t value1 = tensor_read(input1, shape1, index1); in_t value2 = tensor_read(input2, shape2, index2); assert(0 <= value2 && value2 <= 31); in_t acc = (unsigned in_t)value1 >> value2; tensor_write(output, shape, index, acc); } ---- *Supported Data Types:* |=== |Profile|Mode|in_t |Any|signed 8|int8_t |Any|signed 16|int16_t |Any|signed 32|int32_t |=== ==== LOGICAL_OR Elementwise logical OR of input1 and input2. Axis of size 1 will be broadcast as necessary. Rank of input tensors must match. *Arguments:* |=== |Argument|Type|Name|Shape|Description |Input|in_t*|input1|shape1|Input tensor |Input|in_t*|input2|shape2|Input tensor with the same rank as input1 |Output|in_t*|output|shape|Output tensor with broadcast shape if necessary |=== *Operation Function:* [source,c++] ---- for_each(index in shape) { index1 = apply_broadcast(shape, shape1, index); index2 = apply_broadcast(shape, shape2, index); in_t value1 = tensor_read(input1, shape1, index1); in_t value2 = tensor_read(input2, shape2, index2); in_t acc = value1 || value2; tensor_write(output, shape, index, acc); } ---- *Supported Data Types:* |=== |Profile|Mode|in_t |Any|Bool|bool_t |=== ==== LOGICAL_XOR Elementwise logical XOR of input1 and input2. Axis of size 1 will be broadcast as necessary. Rank of input tensors must match. 
*Arguments:* |=== |Argument|Type|Name|Shape|Description |Input|in_t*|input1|shape1|Input tensor |Input|in_t*|input2|shape2|Input tensor with the same rank as input1 |Output|in_t*|output|shape|Output tensor of same type as the input tensors, with broadcast shape if necessary |=== *Operation Function:* [source,c++] ---- for_each(index in shape) { index1 = apply_broadcast(shape, shape1, index); index2 = apply_broadcast(shape, shape2, index); in_t value1 = tensor_read(input1, shape1, index1); in_t value2 = tensor_read(input2, shape2, index2); in_t acc = value1 != value2; tensor_write(output, shape, index, acc); } ---- *Supported Data Types:* |=== |Profile|Mode|in_t |Any|Bool|bool_t |=== ==== MAXIMUM Elementwise max of input1 and input2. Axis of size 1 will be broadcast, as necessary. Rank of input tensors must match. *Arguments:* |=== |Argument|Type|Name|Shape|Description |Input|in_t*|input1|shape1|Input tensor |Input|in_t*|input2|shape2|Input tensor with the same rank as input1 |Output|in_t*|output|shape|Output tensor with broadcast shape if necessary |=== *Operation Function:* [source,c++] ---- for_each(index in shape) { index1 = apply_broadcast(shape, shape1, index); index2 = apply_broadcast(shape, shape2, index); in_t value1 = tensor_read(input1, shape1, index1); in_t value2 = tensor_read(input2, shape2, index2); in_t acc = apply_max(value1, value2); tensor_write(output, shape, index, acc); } ---- *Supported Data Types:* |=== |Profile|Mode|in_t |Any|signed 32|int32_t |MI, MT|floating-point|float_t |=== ==== MINIMUM Elementwise minimum of input1 and input2. Axis of size 1 will be broadcast, as necessary. Rank of input tensors must match. 
*Arguments:* |=== |Argument|Type|Name|Shape|Description |Input|in_t*|input1|shape1|Input tensor |Input|in_t*|input2|shape2|Input tensor with the same rank as input1 |Output|in_t*|output|shape|Output tensor with broadcast shape if necessary |=== *Operation Function:* [source,c++] ---- for_each(index in shape) { index1 = apply_broadcast(shape, shape1, index); index2 = apply_broadcast(shape, shape2, index); in_t value1 = tensor_read(input1, shape1, index1); in_t value2 = tensor_read(input2, shape2, index2); in_t acc = apply_min(value1, value2); tensor_write(output, shape, index, acc); } ---- *Supported Data Types:* |=== |Profile|Mode|in_t |Any|signed 32|int32_t |MI, MT|floating-point|float_t |=== ==== MUL Elementwise multiplication (Hadamard product) of input1 and input2. Axis of size 1 will be broadcast, as necessary. Rank of input tensors must match. *Arguments:* |=== |Argument|Type|Name|Shape|Description |Input|in_t*|input1|shape1|Input tensor |Input|in_t*|input2|shape2|Input tensor with the same rank as input1 |Input (MT profile) Attribute (BI/MI profiles)|uint6_t|shift|-|Result right shift (int32_t data type only) |Output|out_t*|output|shape|Output tensor with broadcast shape if necessary |=== *Operation Function:* [source,c++] ---- assert(in_t == int32_t || shift == 0); for_each(index in shape) { index1 = apply_broadcast(shape, shape1, index); index2 = apply_broadcast(shape, shape2, index); in_t value1 = tensor_read(input1, shape1, index1); in_t value2 = tensor_read(input2, shape2, index2); out_t acc; if (shift>0) { acc = apply_scale_32(value1, value2, shift); } else { acc = value1 * value2; // low 32-bits of result for int32_t } tensor_write(output, shape, index, acc); } ---- *Supported Data Types:* |=== |Profile|Mode|in_t|out_t |Any|signed 8|int8_t|int32_t |Any|signed 16|int16_t|int32_t |Any|signed 32|int32_t|int32_t |MI, MT|floating-point|float_t|float_t |=== ==== POW Elementwise input1 value raised to the power of input2. 
Axis of size 1 will be broadcast, as necessary. Rank of input tensors must match. *Arguments:* |=== |Argument|Type|Name|Shape|Description |Input|in_t*|input1|shape1|Input tensor from 1 to 4 dims |Input|in_t*|input2|shape2|Input tensor with the same rank as input1 |Output|in_t*|output|shape|Output tensor of same type as the input tensors, with broadcast shape if necessary |=== *Quantization Parameters:* Only supported with floating-point values. *Supported Data Types:* |=== |Profile|Mode|in_t |MI, MT|floating-point|float_t |=== ==== SUB Elementwise subtraction of input1 and input2. Axis of size 1 will be broadcast as necessary. Rank of input tensors must match. *Arguments:* |=== |Argument|Type|Name|Shape|Description |Input|in_t*|input1|shape1|Input tensor |Input|in_t*|input2|shape2|Input tensor with the same rank as input1 |Output|in_t*|output|shape|Output tensor with broadcast shape if necessary |=== *Operation Function:* [source,c++] ---- for_each(index in shape) { index1 = apply_broadcast(shape, shape1, index); index2 = apply_broadcast(shape, shape2, index); in_t value1 = tensor_read(input1, shape1, index1); in_t value2 = tensor_read(input2, shape2, index2); in_t acc = apply_sub(value1, value2); tensor_write(output, shape, index, acc); } ---- *Supported Data Types:* |=== |Profile|Mode|in_t |Any|signed 32|int32_t |MI, MT|floating-point|float_t |=== ==== TABLE Interpolated table lookup operation. The int16_t input is treated as a fixed-point 9.7 value. The high 9 bits are used to index into the table. The fractional 7 bits are used to interpolate based on table[index] and table[index+1]. The TABLE operator returns a 16.7 interpolated value which can then be input to the RESCALE operator to scale to the required output data type. Note that table has 513 values to handle table[index+1] when index=511. 
An int8_t to int8_t table lookup can be constructed in TOSA as follows: * Use RESCALE (in_t=int8_t, out_t=int16_t, input_zp=0, scale=1<<14, shift=7) to perform a shift left of 7 and convert to int16_t * Use the TABLE operator to produce a fixed point 16.7 result. The lower 7 bits will be zero and only the central 256 table entries will be used. * Use RESCALE (in_t=int32_t, out_t=int8_t, scale=1<<14, shift=28) to scale the output to int8_t range (or alternate scale as required) * Note that this TOSA sequence can be implemented in software as a 256 entry 8-bit lookup table. An int16_t to int16_t table lookup can be constructed in TOSA as follows: * Use the TABLE operator to produce a fixed point 16.7 interpolated result * Use RESCALE (in_t=int32_t, out_t=int16_t, scale=1<<14, shift=21) to scale the output to int16_t range (or alternate scale as required) *Arguments:* |=== |Argument|Type|Name|Shape|Description |Input|in_t*|input|shape|Input tensor |Input|table_t*|table|[513]|Lookup table tensor |Output|out_t*|output|shape|Output tensor |=== *Quantization Parameters:* None *Operation Function:* [source,c++] ---- for_each(index in shape) { in_t value = tensor_read(input, shape, index); out_t acc = apply_lookup(table, value); tensor_write(output, shape, index, acc); } ---- *Supported Data Types:* |=== |Profile|Mode|in_t|table_t|out_t |Any|signed 16|int16_t|int16_t|int32_t |===