1 files changed, 126 insertions, 126 deletions
diff --git a/chapters/ewise_binary.adoc b/chapters/ewise_binary.adoc
index c624437..aa1c86c 100644
--- a/chapters/ewise_binary.adoc
+++ b/chapters/ewise_binary.adoc
@@ -19,9 +19,9 @@ Axis of size 1 will be broadcast, as necessary. Rank of input tensors must match
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input1|shape1|Input tensor
-|Input|in_t*|input2|shape2|Input tensor with the same rank as input1
-|Output|in_t*|output|shape|Output tensor with broadcast shape if necessary
+|Input|in_out_t*|input1|shape1|Input tensor
+|Input|in_out_t*|input2|shape2|Input tensor with the same rank as input1
+|Output|in_out_t*|output|shape|Output tensor with broadcast shape if necessary
 |===
 
 *Operation Function:*
@@ -31,17 +31,17 @@ Axis of size 1 will be broadcast, as necessary. Rank of input tensors must match
 for_each(index in shape) {
     index1 = apply_broadcast(shape, shape1, index);
     index2 = apply_broadcast(shape, shape2, index);
-    in_t value1 = tensor_read<in_t>(input1, shape1, index1);
-    in_t value2 = tensor_read<in_t>(input2, shape2, index2);
-    in_t result = apply_add<in_t>(value1, value2);
-    tensor_write<in_t>(output, shape, index, result);
+    in_out_t value1 = tensor_read<in_out_t>(input1, shape1, index1);
+    in_out_t value2 = tensor_read<in_out_t>(input2, shape2, index2);
+    in_out_t result = apply_add<in_out_t>(value1, value2);
+    tensor_write<in_out_t>(output, shape, index, result);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |Any|signed 32|int32_t
 |MI, MT|floating-point|float_t
@@ -57,10 +57,10 @@ Axis of size 1 will be broadcast, as necessary. Rank of input tensors must match
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input1|shape1|Input tensor
-|Input|in_t*|input2|shape2|Input tensor with the same rank as input1
+|Input|in_out_t*|input1|shape1|Input tensor
+|Input|in_out_t*|input2|shape2|Input tensor with the same rank as input1
 |Input|bool_t|round|-|If true then the shift is rounded
-|Output|in_t*|output|shape|Output tensor with broadcast shape if necessary
+|Output|in_out_t*|output|shape|Output tensor with broadcast shape if necessary
 |===
 
 *Operation Function:*
@@ -70,27 +70,27 @@ Axis of size 1 will be broadcast, as necessary. Rank of input tensors must match
 for_each(index in shape) {
     index1 = apply_broadcast(shape, shape1, index);
     index2 = apply_broadcast(shape, shape2, index);
-    in_t value1 = tensor_read<in_t>(input1, shape1, index1);
-    in_t value2 = tensor_read<in_t>(input2, shape2, index2);
+    in_out_t value1 = tensor_read<in_out_t>(input1, shape1, index1);
+    in_out_t value2 = tensor_read<in_out_t>(input2, shape2, index2);
 
     // Ensure that shift amount is appropriate for the data type
-    REQUIRE((in_t == int32_t && 0 <= value2 && value2 <= 31) ||
-            (in_t == int16_t && 0 <= value2 && value2 <= 15) ||
-            (in_t == int8_t && 0 <= value2 && value2 <= 7));
+    REQUIRE((in_out_t == int32_t && 0 <= value2 && value2 <= 31) ||
+            (in_out_t == int16_t && 0 <= value2 && value2 <= 15) ||
+            (in_out_t == int8_t && 0 <= value2 && value2 <= 7));
 
-    in_t result = value1 >> value2;
+    in_out_t result = value1 >> value2;
     if (round == true && value2 > 0 && ((value1 >> (value2 - 1)) & 1) != 0) {
         result = result + 1;
     }
-    result = apply_clip<in_t>(result, minimum<in_t>, maximum<in_t>);
-    tensor_write<in_t>(output, shape, index, result);
+    result = apply_clip<in_out_t>(result, minimum<in_out_t>, maximum<in_out_t>);
+    tensor_write<in_out_t>(output, shape, index, result);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |Any|signed 8|int8_t
 |Any|signed 16|int16_t
@@ -107,9 +107,9 @@ Axis of size 1 will be broadcast as necessary. Rank of input tensors must match.
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input1|shape1|Input tensor
-|Input|in_t*|input2|shape2|Input tensor with the same rank as input1
-|Output|in_t*|output|shape|Output tensor of same type as the input tensors, with broadcast shape if necessary
+|Input|in_out_t*|input1|shape1|Input tensor
+|Input|in_out_t*|input2|shape2|Input tensor with the same rank as input1
+|Output|in_out_t*|output|shape|Output tensor of same type as the input tensors, with broadcast shape if necessary
 |===
 
 *Operation Function:*
@@ -119,17 +119,17 @@ Axis of size 1 will be broadcast as necessary. Rank of input tensors must match.
 for_each(index in shape) {
     index1 = apply_broadcast(shape, shape1, index);
     index2 = apply_broadcast(shape, shape2, index);
-    in_t value1 = tensor_read<in_t>(input1, shape1, index1);
-    in_t value2 = tensor_read<in_t>(input2, shape2, index2);
-    in_t result = value1 & value2;
-    tensor_write<in_t>(output, shape, index, result);
+    in_out_t value1 = tensor_read<in_out_t>(input1, shape1, index1);
+    in_out_t value2 = tensor_read<in_out_t>(input2, shape2, index2);
+    in_out_t result = value1 & value2;
+    tensor_write<in_out_t>(output, shape, index, result);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |Any|signed 8|int8_t
 |Any|signed 16|int16_t
@@ -146,9 +146,9 @@ Axis of size 1 will be broadcast as necessary. Rank of input tensors must match.
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input1|shape1|Input tensor
-|Input|in_t*|input2|shape2|Input tensor with the same rank as input1
-|Output|in_t*|output|shape|Output tensor with broadcast shape if necessary
+|Input|in_out_t*|input1|shape1|Input tensor
+|Input|in_out_t*|input2|shape2|Input tensor with the same rank as input1
+|Output|in_out_t*|output|shape|Output tensor with broadcast shape if necessary
 |===
 
 *Operation Function:*
@@ -158,17 +158,17 @@ Axis of size 1 will be broadcast as necessary. Rank of input tensors must match.
 for_each(index in shape) {
     index1 = apply_broadcast(shape, shape1, index);
     index2 = apply_broadcast(shape, shape2, index);
-    in_t value1 = tensor_read<in_t>(input1, shape1, index1);
-    in_t value2 = tensor_read<in_t>(input2, shape2, index2);
-    in_t result = value1 | value2;
-    tensor_write<in_t>(output, shape, index, result);
+    in_out_t value1 = tensor_read<in_out_t>(input1, shape1, index1);
+    in_out_t value2 = tensor_read<in_out_t>(input2, shape2, index2);
+    in_out_t result = value1 | value2;
+    tensor_write<in_out_t>(output, shape, index, result);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |Any|signed 8|int8_t
 |Any|signed 16|int16_t
@@ -185,9 +185,9 @@ Axis of size 1 will be broadcast as necessary. Rank of input tensors must match.
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input1|shape1|Input tensor
-|Input|in_t*|input2|shape2|Input tensor with the same rank as input1
-|Output|in_t*|output|shape|Output tensor with broadcast shape if necessary
+|Input|in_out_t*|input1|shape1|Input tensor
+|Input|in_out_t*|input2|shape2|Input tensor with the same rank as input1
+|Output|in_out_t*|output|shape|Output tensor with broadcast shape if necessary
 |===
 
 *Operation Function:*
@@ -197,17 +197,17 @@ Axis of size 1 will be broadcast as necessary. Rank of input tensors must match.
 for_each(index in shape) {
     index1 = apply_broadcast(shape, shape1, index);
     index2 = apply_broadcast(shape, shape2, index);
-    in_t value1 = tensor_read<in_t>(input1, shape1, index1);
-    in_t value2 = tensor_read<in_t>(input2, shape2, index2);
-    in_t result = value1 ^ value2;
-    tensor_write<in_t>(output, shape, index, result);
+    in_out_t value1 = tensor_read<in_out_t>(input1, shape1, index1);
+    in_out_t value2 = tensor_read<in_out_t>(input2, shape2, index2);
+    in_out_t result = value1 ^ value2;
+    tensor_write<in_out_t>(output, shape, index, result);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |Any|signed 8|int8_t
 |Any|signed 16|int16_t
@@ -227,9 +227,9 @@ Quantized integer divide should use TABLE (for 1/x) and MUL.
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input1|shape1|Input tensor
-|Input|in_t*|input2|shape2|Input tensor with the same rank as input1
-|Output|in_t*|output|shape|Output tensor with broadcast shape if necessary
+|Input|in_out_t*|input1|shape1|Input tensor
+|Input|in_out_t*|input2|shape2|Input tensor with the same rank as input1
+|Output|in_out_t*|output|shape|Output tensor with broadcast shape if necessary
 |===
 
 *Operation Function:*
@@ -239,20 +239,20 @@ Quantized integer divide should use TABLE (for 1/x) and MUL.
 for_each(index in shape) {
     index1 = apply_broadcast(shape, shape1, index);
     index2 = apply_broadcast(shape, shape2, index);
-    in_t value1 = tensor_read<in_t>(input1, shape1, index1);
-    in_t value2 = tensor_read<in_t>(input2, shape2, index2);
+    in_out_t value1 = tensor_read<in_out_t>(input1, shape1, index1);
+    in_out_t value2 = tensor_read<in_out_t>(input2, shape2, index2);
     REQUIRE(value2 != 0);
-    // This catches the case where we divide minimum<in_t> by -1
+    // This catches the case where we divide minimum<in_out_t> by -1
     // which is not representable in two's complement
-    REQUIRE((int64_t)value1 / value2 <= maximum<in_t>);
-    in_t result = value1 / value2;
-    tensor_write<in_t>(output, shape, index, result);
+    REQUIRE((int64_t)value1 / value2 <= maximum<in_out_t>);
+    in_out_t result = value1 / value2;
+    tensor_write<in_out_t>(output, shape, index, result);
 }
 ----
 
 *Supported Data Types:*
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |Any|signed 32|int32_t
 |===
@@ -267,9 +267,9 @@ Axis of size 1 will be broadcast, as necessary. Rank of input tensors must match
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input1|shape1|Input tensor
-|Input|in_t*|input2|shape2|Input tensor with the same rank as input1
-|Output|in_t*|output|shape|Output tensor with broadcast shape if necessary
+|Input|in_out_t*|input1|shape1|Input tensor
+|Input|in_out_t*|input2|shape2|Input tensor with the same rank as input1
+|Output|in_out_t*|output|shape|Output tensor with broadcast shape if necessary
 |===
 
 *Operation Function:*
@@ -279,17 +279,17 @@ Axis of size 1 will be broadcast, as necessary. Rank of input tensors must match
 for_each(index in shape) {
     index1 = apply_broadcast(shape, shape1, index);
     index2 = apply_broadcast(shape, shape2, index);
-    in_t value1 = tensor_read<in_t>(input1, shape1, index1);
-    in_t value2 = tensor_read<in_t>(input2, shape2, index2);
-    in_t result = value1 && value2;
-    tensor_write<in_t>(output, shape, index, result);
+    in_out_t value1 = tensor_read<in_out_t>(input1, shape1, index1);
+    in_out_t value2 = tensor_read<in_out_t>(input2, shape2, index2);
+    in_out_t result = value1 && value2;
+    tensor_write<in_out_t>(output, shape, index, result);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |Any|Bool|bool_t
 |===
@@ -304,9 +304,9 @@ Axis of size 1 will be broadcast, as necessary. Rank of input tensors must match
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input1|shape1|Input tensor
-|Input|in_t*|input2|shape2|Input tensor with the same rank as input1
-|Output|in_t*|output|shape|Output tensor with broadcast shape if necessary
+|Input|in_out_t*|input1|shape1|Input tensor
+|Input|in_out_t*|input2|shape2|Input tensor with the same rank as input1
+|Output|in_out_t*|output|shape|Output tensor with broadcast shape if necessary
 |===
 
 *Operation Function:*
@@ -316,18 +316,18 @@ Axis of size 1 will be broadcast, as necessary. Rank of input tensors must match
 for_each(index in shape) {
     index1 = apply_broadcast(shape, shape1, index);
     index2 = apply_broadcast(shape, shape2, index);
-    in_t value1 = tensor_read<in_t>(input1, shape1, index1);
-    in_t value2 = tensor_read<in_t>(input2, shape2, index2);
+    in_out_t value1 = tensor_read<in_out_t>(input1, shape1, index1);
+    in_out_t value2 = tensor_read<in_out_t>(input2, shape2, index2);
     REQUIRE(0 <= value2 && value2 <= 31);
-    in_t result = value1 << value2;
-    tensor_write<in_t>(output, shape, index, result);
+    in_out_t result = value1 << value2;
+    tensor_write<in_out_t>(output, shape, index, result);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |Any|signed 8|int8_t
 |Any|signed 16|int16_t
@@ -344,9 +344,9 @@ Axis of size 1 will be broadcast, as necessary. Rank of input tensors must match
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input1|shape1|Input tensor
-|Input|in_t*|input2|shape2|Input tensor with the same rank as input1
-|Output|in_t*|output|shape|Output tensor with broadcast shape if necessary
+|Input|in_out_t*|input1|shape1|Input tensor
+|Input|in_out_t*|input2|shape2|Input tensor with the same rank as input1
+|Output|in_out_t*|output|shape|Output tensor with broadcast shape if necessary
 |===
 
 *Operation Function:*
@@ -356,18 +356,18 @@ Axis of size 1 will be broadcast, as necessary. Rank of input tensors must match
 for_each(index in shape) {
     index1 = apply_broadcast(shape, shape1, index);
     index2 = apply_broadcast(shape, shape2, index);
-    in_t value1 = tensor_read<in_t>(input1, shape1, index1);
-    in_t value2 = tensor_read<in_t>(input2, shape2, index2);
+    in_out_t value1 = tensor_read<in_out_t>(input1, shape1, index1);
+    in_out_t value2 = tensor_read<in_out_t>(input2, shape2, index2);
     REQUIRE(0 <= value2 && value2 <= 31);
-    in_t result = (in_t)((unsigned in_t)value1 >> value2);
-    tensor_write<in_t>(output, shape, index, result);
+    in_out_t result = (in_out_t)((unsigned in_out_t)value1 >> value2);
+    tensor_write<in_out_t>(output, shape, index, result);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |Any|signed 8|int8_t
 |Any|signed 16|int16_t
@@ -384,9 +384,9 @@ Axis of size 1 will be broadcast as necessary. Rank of input tensors must match.
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input1|shape1|Input tensor
-|Input|in_t*|input2|shape2|Input tensor with the same rank as input1
-|Output|in_t*|output|shape|Output tensor with broadcast shape if necessary
+|Input|in_out_t*|input1|shape1|Input tensor
+|Input|in_out_t*|input2|shape2|Input tensor with the same rank as input1
+|Output|in_out_t*|output|shape|Output tensor with broadcast shape if necessary
 |===
 
 *Operation Function:*
@@ -396,17 +396,17 @@ Axis of size 1 will be broadcast as necessary. Rank of input tensors must match.
 for_each(index in shape) {
     index1 = apply_broadcast(shape, shape1, index);
     index2 = apply_broadcast(shape, shape2, index);
-    in_t value1 = tensor_read<in_t>(input1, shape1, index1);
-    in_t value2 = tensor_read<in_t>(input2, shape2, index2);
-    in_t result = value1 || value2;
-    tensor_write<in_t>(output, shape, index, result);
+    in_out_t value1 = tensor_read<in_out_t>(input1, shape1, index1);
+    in_out_t value2 = tensor_read<in_out_t>(input2, shape2, index2);
+    in_out_t result = value1 || value2;
+    tensor_write<in_out_t>(output, shape, index, result);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |Any|Bool|bool_t
 |===
@@ -421,9 +421,9 @@ Axis of size 1 will be broadcast as necessary. Rank of input tensors must match.
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input1|shape1|Input tensor
-|Input|in_t*|input2|shape2|Input tensor with the same rank as input1
-|Output|in_t*|output|shape|Output tensor of same type as the input tensors, with broadcast shape if necessary
+|Input|in_out_t*|input1|shape1|Input tensor
+|Input|in_out_t*|input2|shape2|Input tensor with the same rank as input1
+|Output|in_out_t*|output|shape|Output tensor of same type as the input tensors, with broadcast shape if necessary
 |===
 
 *Operation Function:*
@@ -433,17 +433,17 @@ Axis of size 1 will be broadcast as necessary. Rank of input tensors must match.
 for_each(index in shape) {
     index1 = apply_broadcast(shape, shape1, index);
     index2 = apply_broadcast(shape, shape2, index);
-    in_t value1 = tensor_read<in_t>(input1, shape1, index1);
-    in_t value2 = tensor_read<in_t>(input2, shape2, index2);
-    in_t result = value1 != value2;
-    tensor_write<in_t>(output, shape, index, result);
+    in_out_t value1 = tensor_read<in_out_t>(input1, shape1, index1);
+    in_out_t value2 = tensor_read<in_out_t>(input2, shape2, index2);
+    in_out_t result = value1 != value2;
+    tensor_write<in_out_t>(output, shape, index, result);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |Any|Bool|bool_t
 |===
@@ -458,9 +458,9 @@ Axis of size 1 will be broadcast, as necessary. Rank of input tensors must match
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input1|shape1|Input tensor
-|Input|in_t*|input2|shape2|Input tensor with the same rank as input1
-|Output|in_t*|output|shape|Output tensor with broadcast shape if necessary
+|Input|in_out_t*|input1|shape1|Input tensor
+|Input|in_out_t*|input2|shape2|Input tensor with the same rank as input1
+|Output|in_out_t*|output|shape|Output tensor with broadcast shape if necessary
 |===
 
 *Operation Function:*
@@ -470,17 +470,17 @@ Axis of size 1 will be broadcast, as necessary. Rank of input tensors must match
 for_each(index in shape) {
     index1 = apply_broadcast(shape, shape1, index);
     index2 = apply_broadcast(shape, shape2, index);
-    in_t value1 = tensor_read<in_t>(input1, shape1, index1);
-    in_t value2 = tensor_read<in_t>(input2, shape2, index2);
-    in_t result = apply_max(value1, value2);
-    tensor_write<in_t>(output, shape, index, result);
+    in_out_t value1 = tensor_read<in_out_t>(input1, shape1, index1);
+    in_out_t value2 = tensor_read<in_out_t>(input2, shape2, index2);
+    in_out_t result = apply_max(value1, value2);
+    tensor_write<in_out_t>(output, shape, index, result);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |Any|signed 32|int32_t
 |MI, MT|floating-point|float_t
@@ -496,9 +496,9 @@ Axis of size 1 will be broadcast, as necessary. Rank of input tensors must match
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input1|shape1|Input tensor
-|Input|in_t*|input2|shape2|Input tensor with the same rank as input1
-|Output|in_t*|output|shape|Output tensor with broadcast shape if necessary
+|Input|in_out_t*|input1|shape1|Input tensor
+|Input|in_out_t*|input2|shape2|Input tensor with the same rank as input1
+|Output|in_out_t*|output|shape|Output tensor with broadcast shape if necessary
 |===
 
 *Operation Function:*
@@ -508,17 +508,17 @@ Axis of size 1 will be broadcast, as necessary. Rank of input tensors must match
 for_each(index in shape) {
     index1 = apply_broadcast(shape, shape1, index);
     index2 = apply_broadcast(shape, shape2, index);
-    in_t value1 = tensor_read<in_t>(input1, shape1, index1);
-    in_t value2 = tensor_read<in_t>(input2, shape2, index2);
-    in_t result = apply_min(value1, value2);
-    tensor_write<in_t>(output, shape, index, result);
+    in_out_t value1 = tensor_read<in_out_t>(input1, shape1, index1);
+    in_out_t value2 = tensor_read<in_out_t>(input2, shape2, index2);
+    in_out_t result = apply_min(value1, value2);
+    tensor_write<in_out_t>(output, shape, index, result);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |Any|signed 32|int32_t
 |MI, MT|floating-point|float_t
@@ -579,9 +579,9 @@ Axis of size 1 will be broadcast, as necessary. Rank of input tensors must match
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input1|shape1|Input tensor from 1 to 4 dims
-|Input|in_t*|input2|shape2|Input tensor with the same rank as input1
-|Output|in_t*|output|shape|Output tensor of same type as the input tensors, with broadcast shape if necessary
+|Input|in_out_t*|input1|shape1|Input tensor from 1 to 4 dims
+|Input|in_out_t*|input2|shape2|Input tensor with the same rank as input1
+|Output|in_out_t*|output|shape|Output tensor of same type as the input tensors, with broadcast shape if necessary
 |===
 
 *Operation Function:*
@@ -591,17 +591,17 @@ Axis of size 1 will be broadcast, as necessary. Rank of input tensors must match
 for_each(index in shape) {
     index1 = apply_broadcast(shape, shape1, index);
     index2 = apply_broadcast(shape, shape2, index);
-    in_t value1 = tensor_read<in_t>(input1, shape1, index1);
-    in_t value2 = tensor_read<in_t>(input2, shape2, index2);
-    in_t result = apply_pow<in_t>(value1, value2);
-    tensor_write<in_t>(output, shape, index, result);
+    in_out_t value1 = tensor_read<in_out_t>(input1, shape1, index1);
+    in_out_t value2 = tensor_read<in_out_t>(input2, shape2, index2);
+    in_out_t result = apply_pow<in_out_t>(value1, value2);
+    tensor_write<in_out_t>(output, shape, index, result);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |MI, MT|floating-point|float_t
 |===
@@ -616,9 +616,9 @@ Axis of size 1 will be broadcast as necessary. Rank of input tensors must match.
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input1|shape1|Input tensor
-|Input|in_t*|input2|shape2|Input tensor with the same rank as input1
-|Output|in_t*|output|shape|Output tensor with broadcast shape if necessary
+|Input|in_out_t*|input1|shape1|Input tensor
+|Input|in_out_t*|input2|shape2|Input tensor with the same rank as input1
+|Output|in_out_t*|output|shape|Output tensor with broadcast shape if necessary
 |===
 
 *Operation Function:*
@@ -628,17 +628,17 @@ Axis of size 1 will be broadcast as necessary. Rank of input tensors must match.
 for_each(index in shape) {
     index1 = apply_broadcast(shape, shape1, index);
     index2 = apply_broadcast(shape, shape2, index);
-    in_t value1 = tensor_read<in_t>(input1, shape1, index1);
-    in_t value2 = tensor_read<in_t>(input2, shape2, index2);
-    in_t result = apply_sub<in_t>(value1, value2);
-    tensor_write<in_t>(output, shape, index, result);
+    in_out_t value1 = tensor_read<in_out_t>(input1, shape1, index1);
+    in_out_t value2 = tensor_read<in_out_t>(input2, shape2, index2);
+    in_out_t result = apply_sub<in_out_t>(value1, value2);
+    tensor_write<in_out_t>(output, shape, index, result);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |Any|signed 32|int32_t
 |MI, MT|floating-point|float_t