//
// This confidential and proprietary software may be used only as
// authorised by a licensing agreement from ARM Limited
// (C) COPYRIGHT 2020-2023 ARM Limited
// ALL RIGHTS RESERVED
// The entire notice above must be reproduced on all authorised
// copies and copies may only be made to the extent permitted
// by a licensing agreement from ARM Limited.

=== Elementwise Binary Operators

==== ADD

Elementwise addition of input1 and input2.
Axis of size 1 will be broadcast, as necessary.
Rank of input tensors must match.

include::{generated}/operators/ADD.adoc[]

[source,c++]
----
for_each(index in shape) {
    dim_t index1 = apply_broadcast(shape, shape1, index);
    dim_t index2 = apply_broadcast(shape, shape2, index);
    in_out_t value1 = tensor_read(input1, shape1, index1);
    in_out_t value2 = tensor_read(input2, shape2, index2);
    in_out_t result = apply_add(value1, value2);
    tensor_write(output, shape, index, result);
}
----

==== ARITHMETIC_RIGHT_SHIFT

Elementwise arithmetic right shift of input1 by the amount specified in input2.
Axis of size 1 will be broadcast, as necessary.
Rank of input tensors must match.

include::{generated}/operators/ARITHMETIC_RIGHT_SHIFT.adoc[]

[source,c++]
----
for_each(index in shape) {
    dim_t index1 = apply_broadcast(shape, shape1, index);
    dim_t index2 = apply_broadcast(shape, shape2, index);
    in_out_t value1 = tensor_read(input1, shape1, index1);
    in_out_t value2 = tensor_read(input2, shape2, index2);

    // Ensure that shift amount is appropriate for the data type
    REQUIRE((in_out_t == int32_t && 0 <= value2 && value2 <= 31) ||
            (in_out_t == int16_t && 0 <= value2 && value2 <= 15) ||
            (in_out_t == int8_t  && 0 <= value2 && value2 <=  7));

    in_out_t result = value1 >> value2;
    if (round == true && value2 > 0 && ((value1 >> (value2 - 1)) & 1) != 0) {
        result = result + 1;
    }
    result = apply_clip(result, minimum, maximum);
    tensor_write(output, shape, index, result);
}
----

==== BITWISE_AND

Elementwise bitwise AND of input1 and input2.
Axis of size 1 will be broadcast, as necessary.
Rank of input tensors must match.

include::{generated}/operators/BITWISE_AND.adoc[]

[source,c++]
----
for_each(index in shape) {
    dim_t index1 = apply_broadcast(shape, shape1, index);
    dim_t index2 = apply_broadcast(shape, shape2, index);
    in_out_t value1 = tensor_read(input1, shape1, index1);
    in_out_t value2 = tensor_read(input2, shape2, index2);
    in_out_t result = value1 & value2;
    tensor_write(output, shape, index, result);
}
----

==== BITWISE_OR

Elementwise bitwise OR of input1 and input2.
Axis of size 1 will be broadcast, as necessary.
Rank of input tensors must match.

include::{generated}/operators/BITWISE_OR.adoc[]

[source,c++]
----
for_each(index in shape) {
    dim_t index1 = apply_broadcast(shape, shape1, index);
    dim_t index2 = apply_broadcast(shape, shape2, index);
    in_out_t value1 = tensor_read(input1, shape1, index1);
    in_out_t value2 = tensor_read(input2, shape2, index2);
    in_out_t result = value1 | value2;
    tensor_write(output, shape, index, result);
}
----
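All of the elementwise operators in this section map each output index back to an input index through the `apply_broadcast` helper, whose normative definition lives outside this section. As a rough illustration only, assuming `dim_t` is a fixed-size array of dimension indices and that both shapes have the same rank (as these operators require), the mapping could be sketched as follows; `MAX_RANK`, the `rank` parameter, and the signature are placeholders, not spec definitions:

[source,c++]
----
#include <array>
#include <cstdint>

constexpr int MAX_RANK = 6;                   // placeholder, not a spec value
using dim_t = std::array<int32_t, MAX_RANK>;  // assumed index representation

// For each axis, an input axis of size 1 always maps to index 0, so the
// single element is reused; otherwise the output index is used directly.
// out_shape is kept only to mirror the pseudocode call signature.
dim_t apply_broadcast(const dim_t& out_shape, const dim_t& in_shape,
                      const dim_t& out_index, int rank) {
    dim_t in_index{};
    for (int axis = 0; axis < rank; axis++) {
        in_index[axis] = (in_shape[axis] == 1) ? 0 : out_index[axis];
    }
    return in_index;
}
----

Because a size-1 axis always maps to index 0, the same element of the smaller input is re-read for every output position along that axis.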
==== BITWISE_XOR

Elementwise bitwise XOR of input1 and input2.
Axis of size 1 will be broadcast, as necessary.
Rank of input tensors must match.

include::{generated}/operators/BITWISE_XOR.adoc[]

[source,c++]
----
for_each(index in shape) {
    dim_t index1 = apply_broadcast(shape, shape1, index);
    dim_t index2 = apply_broadcast(shape, shape2, index);
    in_out_t value1 = tensor_read(input1, shape1, index1);
    in_out_t value2 = tensor_read(input2, shape2, index2);
    in_out_t result = value1 ^ value2;
    tensor_write(output, shape, index, result);
}
----

==== INTDIV

Elementwise integer divide of input1 by input2.
The result of the divide is truncated towards zero.
Expected use is for operations on non-scaled integers.
Floating point divide should use RECIPROCAL and MUL.
Quantized integer divide should use TABLE (for 1/x) and MUL.

include::{generated}/operators/INTDIV.adoc[]

[source,c++]
----
for_each(index in shape) {
    dim_t index1 = apply_broadcast(shape, shape1, index);
    dim_t index2 = apply_broadcast(shape, shape2, index);
    in_out_t value1 = tensor_read(input1, shape1, index1);
    in_out_t value2 = tensor_read(input2, shape2, index2);
    REQUIRE(value2 != 0);
    // This catches the case where we divide minimum by -1
    // which is not representable in two's complement
    REQUIRE((int64_t)value1 / value2 <= maximum);
    in_out_t result = value1 / value2;
    tensor_write(output, shape, index, result);
}
----

==== LOGICAL_AND

Elementwise logical AND of input1 and input2.
Axis of size 1 will be broadcast, as necessary.
Rank of input tensors must match.

include::{generated}/operators/LOGICAL_AND.adoc[]

[source,c++]
----
for_each(index in shape) {
    dim_t index1 = apply_broadcast(shape, shape1, index);
    dim_t index2 = apply_broadcast(shape, shape2, index);
    in_out_t value1 = tensor_read(input1, shape1, index1);
    in_out_t value2 = tensor_read(input2, shape2, index2);
    in_out_t result = value1 && value2;
    tensor_write(output, shape, index, result);
}
----

==== LOGICAL_LEFT_SHIFT

Elementwise logical left shift of input1 by the amount specified in input2.
Axis of size 1 will be broadcast, as necessary.
Rank of input tensors must match.

include::{generated}/operators/LOGICAL_LEFT_SHIFT.adoc[]

[source,c++]
----
for_each(index in shape) {
    dim_t index1 = apply_broadcast(shape, shape1, index);
    dim_t index2 = apply_broadcast(shape, shape2, index);
    in_out_t value1 = tensor_read(input1, shape1, index1);
    in_out_t value2 = tensor_read(input2, shape2, index2);
    REQUIRE(0 <= value2 && value2 <= 31);
    in_out_t result = value1 << value2;
    tensor_write(output, shape, index, result);
}
----

==== LOGICAL_RIGHT_SHIFT

Elementwise logical right shift of input1 by the amount specified in input2.
Axis of size 1 will be broadcast, as necessary.
Rank of input tensors must match.

include::{generated}/operators/LOGICAL_RIGHT_SHIFT.adoc[]

[source,c++]
----
for_each(index in shape) {
    dim_t index1 = apply_broadcast(shape, shape1, index);
    dim_t index2 = apply_broadcast(shape, shape2, index);
    in_out_t value1 = tensor_read(input1, shape1, index1);
    in_out_t value2 = tensor_read(input2, shape2, index2);
    REQUIRE(0 <= value2 && value2 <= 31);
    // Shift as an unsigned value so that zeros are shifted in at the top
    in_out_t result = (in_out_t)((unsigned in_out_t)value1 >> value2);
    tensor_write(output, shape, index, result);
}
----
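The difference between the two right-shift operators is easy to miss in pseudocode. The following self-contained example (an illustration, not spec pseudocode) shows both behaviours on a negative int8_t value, including the effect of the round attribute on ARITHMETIC_RIGHT_SHIFT:

[source,c++]
----
// Standalone illustration of arithmetic versus logical right shift.
#include <cstdint>
#include <cstdio>

int main() {
    int8_t value1 = -7;  // bit pattern 0b11111001 (0xF9)
    int8_t shift  = 1;

    // ARITHMETIC_RIGHT_SHIFT preserves the sign bit: -7 >> 1 == -4.
    int8_t arith = value1 >> shift;
    // With round == true, bit (shift - 1) of value1 is set, so add 1: -3.
    int8_t arith_round = arith + ((((value1 >> (shift - 1)) & 1) != 0) ? 1 : 0);

    // LOGICAL_RIGHT_SHIFT shifts in zeros instead:
    // 0b11111001 (0xF9) becomes 0b01111100 (0x7C) == 124.
    int8_t logical = (int8_t)((uint8_t)value1 >> shift);

    printf("arithmetic: %d, rounded: %d, logical: %d\n",
           arith, arith_round, logical);
    return 0;
}
----

Arithmetic shift floors towards negative infinity, so -7 >> 1 gives -4; rounding uses the discarded bit to produce -3, while the logical shift reinterprets the bit pattern as unsigned and produces 124.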
==== LOGICAL_OR

Elementwise logical OR of input1 and input2.
Axis of size 1 will be broadcast, as necessary.
Rank of input tensors must match.

include::{generated}/operators/LOGICAL_OR.adoc[]

[source,c++]
----
for_each(index in shape) {
    dim_t index1 = apply_broadcast(shape, shape1, index);
    dim_t index2 = apply_broadcast(shape, shape2, index);
    in_out_t value1 = tensor_read(input1, shape1, index1);
    in_out_t value2 = tensor_read(input2, shape2, index2);
    in_out_t result = value1 || value2;
    tensor_write(output, shape, index, result);
}
----

==== LOGICAL_XOR

Elementwise logical XOR of input1 and input2.
Axis of size 1 will be broadcast, as necessary.
Rank of input tensors must match.

include::{generated}/operators/LOGICAL_XOR.adoc[]

[source,c++]
----
for_each(index in shape) {
    dim_t index1 = apply_broadcast(shape, shape1, index);
    dim_t index2 = apply_broadcast(shape, shape2, index);
    in_out_t value1 = tensor_read(input1, shape1, index1);
    in_out_t value2 = tensor_read(input2, shape2, index2);
    // For boolean values, inequality is equivalent to logical XOR
    in_out_t result = value1 != value2;
    tensor_write(output, shape, index, result);
}
----

==== MAXIMUM

Elementwise maximum of input1 and input2.
Axis of size 1 will be broadcast, as necessary.
Rank of input tensors must match.

include::{generated}/operators/MAXIMUM.adoc[]

[source,c++]
----
for_each(index in shape) {
    dim_t index1 = apply_broadcast(shape, shape1, index);
    dim_t index2 = apply_broadcast(shape, shape2, index);
    in_out_t value1 = tensor_read(input1, shape1, index1);
    in_out_t value2 = tensor_read(input2, shape2, index2);
    in_out_t result = apply_max(value1, value2);
    tensor_write(output, shape, index, result);
}
----

==== MINIMUM

Elementwise minimum of input1 and input2.
Axis of size 1 will be broadcast, as necessary.
Rank of input tensors must match.

include::{generated}/operators/MINIMUM.adoc[]

[source,c++]
----
for_each(index in shape) {
    dim_t index1 = apply_broadcast(shape, shape1, index);
    dim_t index2 = apply_broadcast(shape, shape2, index);
    in_out_t value1 = tensor_read(input1, shape1, index1);
    in_out_t value2 = tensor_read(input2, shape2, index2);
    in_out_t result = apply_min(value1, value2);
    tensor_write(output, shape, index, result);
}
----

==== MUL

Elementwise multiplication (Hadamard product) of input1 and input2.
Axis of size 1 will be broadcast, as necessary.
Rank of input tensors must match.

include::{generated}/operators/MUL.adoc[]

[source,c++]
----
ERROR_IF(in_t != int32_t && shift > 0);
for_each(index in shape) {
    dim_t index1 = apply_broadcast(shape, shape1, index);
    dim_t index2 = apply_broadcast(shape, shape2, index);
    in_t value1 = tensor_read(input1, shape1, index1);
    in_t value2 = tensor_read(input2, shape2, index2);
    out_t result;
    if (in_t == int32_t && shift > 0) {
        int64_t product = (int64_t)value1 * (int64_t)value2;
        int64_t round   = (int64_t)1 << (shift - 1);
        product = (product + round) >> shift;
        REQUIRE(product >= minimum && product <= maximum);
        result = product;
    } else {
        result = value1 * value2; // low 32-bits of result for int32_t
    }
    tensor_write(output, shape, index, result);
}
----

==== POW

Elementwise input1 value raised to the power of input2.
Axis of size 1 will be broadcast, as necessary.
Rank of input tensors must match.

include::{generated}/operators/POW.adoc[]

[source,c++]
----
for_each(index in shape) {
    dim_t index1 = apply_broadcast(shape, shape1, index);
    dim_t index2 = apply_broadcast(shape, shape2, index);
    in_out_t value1 = tensor_read(input1, shape1, index1);
    in_out_t value2 = tensor_read(input2, shape2, index2);
    in_out_t result = apply_pow(value1, value2);
    tensor_write(output, shape, index, result);
}
----
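The rounding term in MUL's scaled int32_t path is worth a concrete example. The following standalone snippet (an illustration, not spec pseudocode) shows how adding `1 << (shift - 1)` before the shift turns truncation into round-half-up:

[source,c++]
----
// Standalone illustration of MUL's int32_t path with a non-zero shift.
#include <cstdint>
#include <cstdio>

int main() {
    int32_t value1 = 5, value2 = 3, shift = 2;

    int64_t product = (int64_t)value1 * (int64_t)value2; // 15
    int64_t round   = (int64_t)1 << (shift - 1);         // 2
    product = (product + round) >> shift;                // (15 + 2) >> 2 == 4

    // Plain truncation would give 15 >> 2 == 3; the rounding term
    // makes 15 / 4 = 3.75 round to 4 instead.
    printf("rounded product: %lld\n", (long long)product);
    return 0;
}
----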
==== SUB

Elementwise subtraction of input1 and input2.
Axis of size 1 will be broadcast, as necessary.
Rank of input tensors must match.

include::{generated}/operators/SUB.adoc[]

[source,c++]
----
for_each(index in shape) {
    dim_t index1 = apply_broadcast(shape, shape1, index);
    dim_t index2 = apply_broadcast(shape, shape2, index);
    in_out_t value1 = tensor_read(input1, shape1, index1);
    in_out_t value2 = tensor_read(input2, shape2, index2);
    in_out_t result = apply_sub(value1, value2);
    tensor_write(output, shape, index, result);
}
----

==== TABLE

Table lookup operation.
For the int8_t TABLE operation, perform a 256-entry table lookup returning an int8_t value.
For int16_t tables, the int16_t input is treated as a fixed-point 9.7 value.
The most significant 9 bits are used to index into the table.
The fractional 7 bits are used to interpolate based on table[index] and table[index+1].
For int16_t inputs, the TABLE operator returns a 16.7 interpolated value in an int32_t.
This value can then be input to the RESCALE operator to scale to the required output data type.
Note that the int16_t table has 513 values to handle table[index+1] when index=511.

An int16_t to int16_t table lookup can be constructed in TOSA as follows:

* Use the TABLE operator to produce a fixed point 16.7 interpolated result
* Use RESCALE (in_t=int32_t, out_t=int16_t, scale=1<<14, shift=21) to scale the output to int16_t range (or alternate scale as required)

include::{generated}/operators/TABLE.adoc[]

[source,c++]
----
REQUIRE(length(table) == TABLE_SIZE);
for_each(index in shape) {
    in_t value = tensor_read(input, shape, index);
    out_t result;
    if (in_t == int8_t) {
        // value is a signed int, convert to a 0 based index
        result = table[value + 128];
    } else {
        result = apply_lookup(table, value);
    }
    tensor_write(output, shape, index, result);
}
----
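The int16_t path above delegates to `apply_lookup`, whose normative definition lives elsewhere in the specification. As a non-normative sketch of the 9.7 fixed-point scheme the prose describes (513-entry table, 9-bit index, 7-bit fraction), the interpolation could look like the following; the function name and signature are illustrative only:

[source,c++]
----
// Illustrative sketch (not the normative definition) of the int16_t
// lookup-and-interpolate scheme described above.
#include <cstdint>

int32_t apply_lookup_sketch(const int16_t table[513], int16_t value) {
    // Shift the signed input to an unsigned range, then split it into
    // a 9-bit table index and a 7-bit fraction.
    int32_t unsigned_value = (int32_t)value + 32768; // 0 .. 65535
    int32_t index    = unsigned_value >> 7;          // 0 .. 511
    int32_t fraction = unsigned_value & 0x7F;        // 0 .. 127

    // Linear interpolation between adjacent entries; index+1 is always
    // valid because the table holds 513 values.
    int32_t base = table[index];
    int32_t next = table[index + 1];
    return (base << 7) + (next - base) * fraction;   // 16.7 fixed point
}
----

The `base << 7` term keeps 7 fraction bits in the result, which is why the output is described as a 16.7 value and why the suggested RESCALE uses scale=1<<14 with shift=21: a net right shift of 7 that removes the fraction bits.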