From a0e9a523fcee7f25d4a81289cf10e9f9082ee878 Mon Sep 17 00:00:00 2001
From: Eric Kunze <eric.kunze@arm.com>
Date: Fri, 12 Nov 2021 16:15:47 -0800
Subject: Use in_out_t when a type is used for input and output

Also change acc_t to out_t when the value is being used as an output.

This should make the argument tables easier to follow.

Change-Id: I2a57f3c4eaf937f29da785ff5c11576663a39494
---
 chapters/activation_funcs.adoc |  28 ++---
 chapters/data_layout.adoc      |  68 +++++------
 chapters/data_nodes.adoc       |   6 +-
 chapters/ewise_binary.adoc     | 252 ++++++++++++++++++++---------------------
 chapters/ewise_ternary.adoc    |  16 +--
 chapters/ewise_unary.adoc      | 140 +++++++++++------------
 chapters/reduction.adoc        |  90 +++++++--------
 chapters/tensor_ops.adoc       | 140 +++++++++++------------
 8 files changed, 370 insertions(+), 370 deletions(-)

diff --git a/chapters/activation_funcs.adoc b/chapters/activation_funcs.adoc
index a58a1fc..84a1039 100644
--- a/chapters/activation_funcs.adoc
+++ b/chapters/activation_funcs.adoc
@@ -20,10 +20,10 @@ No zero point subtraction is done to the values, thus to clamp to the zero point
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|Input|shape|Input tensor
-|Attribute|in_t|min_val|-|minimum clip value
-|Attribute|in_t|max_val|-|maximum clip value
-|Output|in_t*|Output|shape|Output tensor of same type and shape as input
+|Input|in_out_t*|Input|shape|Input tensor
+|Attribute|in_out_t|min_val|-|minimum clip value
+|Attribute|in_out_t|max_val|-|maximum clip value
+|Output|in_out_t*|Output|shape|Output tensor of same type and shape as input
 |===
 
 *Operation Function:*
@@ -31,16 +31,16 @@ No zero point subtraction is done to the values, thus to clamp to the zero point
 ----
 ERROR_IF(max_val < min_val);
 for_each(index in shape) {
-    in_t value = tensor_read<in_t>(input, shape, index);
-    value = apply_clip<in_t>(value, min_val, max_val);
-    tensor_write<in_t>(output, shape, index, value);
+    in_out_t value = tensor_read<in_out_t>(input, shape, index);
+    value = apply_clip<in_out_t>(value, min_val, max_val);
+    tensor_write<in_out_t>(output, shape, index, value);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |Any|signed 8|int8_t
 |Any|signed 16|int16_t
@@ -72,14 +72,14 @@ generate_lookup_table(&sigmoid_table, &sigmoid_reference);
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|Input|shape|Input tensor
-|Output|in_t*|Output|shape|Output tensor of same type and shape as input
+|Input|in_out_t*|Input|shape|Input tensor
+|Output|in_out_t*|Output|shape|Output tensor of same type and shape as input
 |===
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |MI, MT|floating-point|float_t
 |===
@@ -110,14 +110,14 @@ generate_lookup_table(&tanh_table, &tanh_reference);
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|Input|shape|Input tensor
-|Output|in_t*|Output|shape|Output tensor of same type and shape as input
+|Input|in_out_t*|Input|shape|Input tensor
+|Output|in_out_t*|Output|shape|Output tensor of same type and shape as input
 |===
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |MI, MT|floating-point|float_t
 |===
diff --git a/chapters/data_layout.adoc b/chapters/data_layout.adoc
index 54221f6..e50a14e 100644
--- a/chapters/data_layout.adoc
+++ b/chapters/data_layout.adoc
@@ -18,9 +18,9 @@ No data conversion happens during a concat operation.
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input1|shapes1[]|List of input tensors. All inputs must have the same rank and data type
+|Input|in_out_t*|input1|shapes1[]|List of input tensors. All inputs must have the same rank and data type
 |Attribute|int|axis|-|Axis along which concatenation is to occur, in range from 0 to rank(shape)-1
-|Output|in_t*|output|shape|Output tensor
+|Output|in_out_t*|output|shape|Output tensor
 |===
 
 *Operation Function:*
@@ -43,8 +43,8 @@ for_each(index1 in shape) {
         // For each output location, we are looking for the
         // appropriate input tensor
         if (index2[axis] >= 0 && index2[axis] < shapes1[t][axis]) {
-            in_t value = tensor_read<in_t>(input1[t], shapes1[t], index2);
-            tensor_write<in_t>(output, shape, index1, value);
+            in_out_t value = tensor_read<in_out_t>(input1[t], shapes1[t], index2);
+            tensor_write<in_out_t>(output, shape, index1, value);
         }
         index2[axis] = index2[axis] - shapes1[t][axis];
     }
@@ -55,7 +55,7 @@ for_each(index1 in shape) {
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |Any|Boolean|bool_t
 |Any|signed 8|int8_t
@@ -75,10 +75,10 @@ The pad_const value includes the zero point if the tensor uses a zero point.
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input1|shape1|Input tensor
+|Input|in_out_t*|input1|shape1|Input tensor
 |Attribute|int|padding|[rank(input1),2]|Amount of padding to be done
-|Attribute|in_t|pad_const|-|Constant value to be used as padding
-|Output|in_t*|output|shape|Output tensor of same type as the input tensor
+|Attribute|in_out_t|pad_const|-|Constant value to be used as padding
+|Output|in_out_t*|output|shape|Output tensor of same type as the input tensor
 |===
 
 *Operation Function:*
@@ -98,15 +98,15 @@ for_each(index in shape) {
             is_pad = true;
         }
     }
-    in_t value = is_pad ? pad_const : tensor_read<in_t>(input1, shape1, index1);
-    tensor_write<in_t>(output, shape, index, value);
+    in_out_t value = is_pad ? pad_const : tensor_read<in_out_t>(input1, shape1, index1);
+    tensor_write<in_out_t>(output, shape, index, value);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |Any|Boolean|bool_t
 |Any|signed 8|int8_t
@@ -124,9 +124,9 @@ Returns a tensor with the same type/values as the input, with a new shape specif
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input1|shape1|Input tensor
+|Input|in_out_t*|input1|shape1|Input tensor
 |Attribute|int|new_shape|[rank(output)]|List of values, with each element giving the size of the result tensor for the given dimension. At most one dimension may be given as -1 to automatically calculate the dimension size.
-|Output|in_t*|output|shape|Output tensor of same type, size as the input tensor
+|Output|in_out_t*|output|shape|Output tensor of same type, size as the input tensor
 |===
 
 *Operation Function:*
@@ -142,7 +142,7 @@ for(i = 0; i < tensor_size(shape); i++) {
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |Any|Boolean|bool_t
 |Any|signed 8|int8_t
@@ -160,9 +160,9 @@ Returns a tensor with the same type/values as the input, with the data reversed
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input|shape|Input tensor from 1 to 4 dims
+|Input|in_out_t*|input|shape|Input tensor from 1 to 4 dims
 |Attribute|int|axis|-|Axis to reverse, in range from 0 to rank(shape)-1
-|Output|in_t*|output|shape|Output tensor. Same shape as input tensor.
+|Output|in_out_t*|output|shape|Output tensor. Same shape as input tensor.
 |===
 
 *Operation Function:*
@@ -173,15 +173,15 @@ ERROR_IF(axis < 0 || axis >= rank(shape));
 for_each(index in shape) {
     tmp_index = index;
     tmp_index[axis] = shape[axis] - 1 - index[axis];
-    in_t value = tensor_read<in_t>(input, shape, tmp_index);
-    tensor_write<in_t>(output, shape, index, value);
+    in_out_t value = tensor_read<in_out_t>(input, shape, tmp_index);
+    tensor_write<in_out_t>(output, shape, index, value);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |Any|Boolean|bool_t
 |Any|signed 8|int8_t
@@ -199,10 +199,10 @@ No data conversion happens during a slice operation.
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input1|shape1|Input tensor with rank from 1 to 4
+|Input|in_out_t*|input1|shape1|Input tensor with rank from 1 to 4
 |Attribute|int|start|[rank(input1)]|List of integer coordinates, of length equal to the rank of input1. Start coordinate for slicing.
 |Attribute|int|size|[rank(input1)]|List of integer size values, of length equal to the rank of input1. Size of the input to be used.
-|Output|in_t*|output|shape|Output tensor of same type as the input tensor
+|Output|in_out_t*|output|shape|Output tensor of same type as the input tensor
 |===
 
 *Operation Function:*
@@ -225,15 +225,15 @@ for_each(index in shape) {
     for(i = 0; i < rank(shape); i++) {
        tmp_index[i] = index[i] + start[i];
     }
-    in_t value = tensor_read<in_t>(input, shape1, tmp_index);
-    tensor_write<in_t>(output, shape, index, value);
+    in_out_t value = tensor_read<in_out_t>(input1, shape1, tmp_index);
+    tensor_write<in_out_t>(output, shape, index, value);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |Any|Boolean|bool_t
 |Any|signed 8|int8_t
@@ -251,9 +251,9 @@ Replicates input1 multiplies times along each dimension.
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input1|shape1|Input tensor with rank from 1 to 4
+|Input|in_out_t*|input1|shape1|Input tensor with rank from 1 to 4
 |Attribute|int32_t|multiplies|[rank(shape1)]|Number of times to replicate input1 in each dimension
-|Output|in_t*|output|shape|Output tensor of same type, rank as the input tensor
+|Output|in_out_t*|output|shape|Output tensor of same type, rank as the input tensor
 |===
 
 *Operation Function:*
@@ -266,15 +266,15 @@ for_each(index in shape) {
         REQUIRE(shape1[i] * multiplies[i] == shape[i]);
         tmp_index[i] = index[i] % shape1[i];
     }
-    in_t value = tensor_read<in_t>(input, shape1, tmp_index);
-    tensor_write<in_t>(output, shape, index, value);
+    in_out_t value = tensor_read<in_out_t>(input1, shape1, tmp_index);
+    tensor_write<in_out_t>(output, shape, index, value);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |Any|Boolean|bool_t
 |Any|signed 8|int8_t
@@ -292,9 +292,9 @@ Permutes the dimensions based on perm.
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input1|shape1|Input tensor with minimum rank of one.
+|Input|in_out_t*|input1|shape1|Input tensor with minimum rank of one.
 |Attribute|int32_t|perms|[rank(input1)]|List of integers of length equal to the rank of input1. Values must be valid dimensions within shape1, and may not be repeated.
-|Output|in_t*|output|shape|Output tensor of same type, rank as the input tensor
+|Output|in_out_t*|output|shape|Output tensor of same type, rank as the input tensor
 |===
 
 *Operation Function:*
@@ -315,15 +315,15 @@ for_each(index in shape) {
         REQUIRE(shape1[perm[i]] == shape[i])
         tmp_index[perm[i]] = index[i]
     }
-    in_t value = tensor_read<in_t>(input, shape1, tmp_index);
-    tensor_write<in_t>(output, shape, index, value);
+    in_out_t value = tensor_read<in_out_t>(input1, shape1, tmp_index);
+    tensor_write<in_out_t>(output, shape, index, value);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |Any|Boolean|bool_t
 |Any|signed 8|int8_t
diff --git a/chapters/data_nodes.adoc b/chapters/data_nodes.adoc
index e4a3e88..9d32a62 100644
--- a/chapters/data_nodes.adoc
+++ b/chapters/data_nodes.adoc
@@ -43,14 +43,14 @@ Returns a tensor with the same shape, type, and contents as the input.
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input1|shape|Input tensor
-|Output|in_t*|output|shape|Output tensor of same type, size as the input tensor
+|Input|in_out_t*|input1|shape|Input tensor
+|Output|in_out_t*|output|shape|Output tensor of same type, size as the input tensor
 |===
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |Any|Boolean|bool_t
 |Any|signed 8|int8_t
diff --git a/chapters/ewise_binary.adoc b/chapters/ewise_binary.adoc
index c624437..aa1c86c 100644
--- a/chapters/ewise_binary.adoc
+++ b/chapters/ewise_binary.adoc
@@ -19,9 +19,9 @@ Axis of size 1 will be broadcast, as necessary. Rank of input tensors must match
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input1|shape1|Input tensor
-|Input|in_t*|input2|shape2|Input tensor with the same rank as input1
-|Output|in_t*|output|shape|Output tensor with broadcast shape if necessary
+|Input|in_out_t*|input1|shape1|Input tensor
+|Input|in_out_t*|input2|shape2|Input tensor with the same rank as input1
+|Output|in_out_t*|output|shape|Output tensor with broadcast shape if necessary
 |===
 
 *Operation Function:*
@@ -31,17 +31,17 @@ Axis of size 1 will be broadcast, as necessary. Rank of input tensors must match
 for_each(index in shape) {
     index1 = apply_broadcast(shape, shape1, index);
     index2 = apply_broadcast(shape, shape2, index);
-    in_t value1 = tensor_read<in_t>(input1, shape1, index1);
-    in_t value2 = tensor_read<in_t>(input2, shape2, index2);
-    in_t result = apply_add<in_t>(value1, value2);
-    tensor_write<in_t>(output, shape, index, result);
+    in_out_t value1 = tensor_read<in_out_t>(input1, shape1, index1);
+    in_out_t value2 = tensor_read<in_out_t>(input2, shape2, index2);
+    in_out_t result = apply_add<in_out_t>(value1, value2);
+    tensor_write<in_out_t>(output, shape, index, result);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |Any|signed 32|int32_t
 |MI, MT|floating-point|float_t
@@ -57,10 +57,10 @@ Axis of size 1 will be broadcast, as necessary. Rank of input tensors must match
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input1|shape1|Input tensor
-|Input|in_t*|input2|shape2|Input tensor with the same rank as input1
+|Input|in_out_t*|input1|shape1|Input tensor
+|Input|in_out_t*|input2|shape2|Input tensor with the same rank as input1
 |Input|bool_t|round|-|If true then the shift is rounded
-|Output|in_t*|output|shape|Output tensor with broadcast shape if necessary
+|Output|in_out_t*|output|shape|Output tensor with broadcast shape if necessary
 |===
 
 *Operation Function:*
@@ -70,27 +70,27 @@ Axis of size 1 will be broadcast, as necessary. Rank of input tensors must match
 for_each(index in shape) {
     index1 = apply_broadcast(shape, shape1, index);
     index2 = apply_broadcast(shape, shape2, index);
-    in_t value1 = tensor_read<in_t>(input1, shape1, index1);
-    in_t value2 = tensor_read<in_t>(input2, shape2, index2);
+    in_out_t value1 = tensor_read<in_out_t>(input1, shape1, index1);
+    in_out_t value2 = tensor_read<in_out_t>(input2, shape2, index2);
 
     // Ensure that shift amount is appropriate for the data type
-    REQUIRE((in_t == int32_t && 0 <= value2 && value2 <= 31) ||
-            (in_t == int16_t && 0 <= value2 && value2 <= 15) ||
-            (in_t == int8_t && 0 <= value2 && value2 <= 7));
+    REQUIRE((in_out_t == int32_t && 0 <= value2 && value2 <= 31) ||
+            (in_out_t == int16_t && 0 <= value2 && value2 <= 15) ||
+            (in_out_t == int8_t && 0 <= value2 && value2 <= 7));
 
-    in_t result = value1 >> value2;
+    in_out_t result = value1 >> value2;
     if (round == true && value2 > 0 && (value1 >> (value2 - 1)) & 1 != 0) {
         result = result + 1;
     }
-    result = apply_clip<in_t>(result, minimum<in_t>, maximum<in_t>);
-    tensor_write<in_t>(output, shape, index, result);
+    result = apply_clip<in_out_t>(result, minimum<in_out_t>, maximum<in_out_t>);
+    tensor_write<in_out_t>(output, shape, index, result);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |Any|signed 8|int8_t
 |Any|signed 16|int16_t
@@ -107,9 +107,9 @@ Axis of size 1 will be broadcast as necessary. Rank of input tensors must match.
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input1|shape1|Input tensor
-|Input|in_t*|input2|shape2|Input tensor with the same rank as input1
-|Output|in_t*|output|shape|Output tensor of same type as the input tensors, with broadcast shape if necessary
+|Input|in_out_t*|input1|shape1|Input tensor
+|Input|in_out_t*|input2|shape2|Input tensor with the same rank as input1
+|Output|in_out_t*|output|shape|Output tensor of same type as the input tensors, with broadcast shape if necessary
 |===
 
 *Operation Function:*
@@ -119,17 +119,17 @@ Axis of size 1 will be broadcast as necessary. Rank of input tensors must match.
 for_each(index in shape) {
     index1 = apply_broadcast(shape, shape1, index);
     index2 = apply_broadcast(shape, shape2, index);
-    in_t value1 = tensor_read<in_t>(input1, shape1, index1);
-    in_t value2 = tensor_read<in_t>(input2, shape2, index2);
-    in_t result = value1 & value2;
-    tensor_write<in_t>(output, shape, index, result);
+    in_out_t value1 = tensor_read<in_out_t>(input1, shape1, index1);
+    in_out_t value2 = tensor_read<in_out_t>(input2, shape2, index2);
+    in_out_t result = value1 & value2;
+    tensor_write<in_out_t>(output, shape, index, result);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |Any|signed 8|int8_t
 |Any|signed 16|int16_t
@@ -146,9 +146,9 @@ Axis of size 1 will be broadcast as necessary. Rank of input tensors must match.
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input1|shape1|Input tensor
-|Input|in_t*|input2|shape2|Input tensor with the same rank as input1
-|Output|in_t*|output|shape|Output tensor with broadcast shape if necessary
+|Input|in_out_t*|input1|shape1|Input tensor
+|Input|in_out_t*|input2|shape2|Input tensor with the same rank as input1
+|Output|in_out_t*|output|shape|Output tensor with broadcast shape if necessary
 |===
 
 *Operation Function:*
@@ -158,17 +158,17 @@ Axis of size 1 will be broadcast as necessary. Rank of input tensors must match.
 for_each(index in shape) {
     index1 = apply_broadcast(shape, shape1, index);
     index2 = apply_broadcast(shape, shape2, index);
-    in_t value1 = tensor_read<in_t>(input1, shape1, index1);
-    in_t value2 = tensor_read<in_t>(input2, shape2, index2);
-    in_t result = value1 | value2;
-    tensor_write<in_t>(output, shape, index, result);
+    in_out_t value1 = tensor_read<in_out_t>(input1, shape1, index1);
+    in_out_t value2 = tensor_read<in_out_t>(input2, shape2, index2);
+    in_out_t result = value1 | value2;
+    tensor_write<in_out_t>(output, shape, index, result);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |Any|signed 8|int8_t
 |Any|signed 16|int16_t
@@ -185,9 +185,9 @@ Axis of size 1 will be broadcast as necessary. Rank of input tensors must match.
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input1|shape1|Input tensor
-|Input|in_t*|input2|shape2|Input tensor with the same rank as input1
-|Output|in_t*|output|shape|Output tensor with broadcast shape if necessary
+|Input|in_out_t*|input1|shape1|Input tensor
+|Input|in_out_t*|input2|shape2|Input tensor with the same rank as input1
+|Output|in_out_t*|output|shape|Output tensor with broadcast shape if necessary
 |===
 
 *Operation Function:*
@@ -197,17 +197,17 @@ Axis of size 1 will be broadcast as necessary. Rank of input tensors must match.
 for_each(index in shape) {
     index1 = apply_broadcast(shape, shape1, index);
     index2 = apply_broadcast(shape, shape2, index);
-    in_t value1 = tensor_read<in_t>(input1, shape1, index1);
-    in_t value2 = tensor_read<in_t>(input2, shape2, index2);
-    in_t result = value1 ^ value2;
-    tensor_write<in_t>(output, shape, index, result);
+    in_out_t value1 = tensor_read<in_out_t>(input1, shape1, index1);
+    in_out_t value2 = tensor_read<in_out_t>(input2, shape2, index2);
+    in_out_t result = value1 ^ value2;
+    tensor_write<in_out_t>(output, shape, index, result);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |Any|signed 8|int8_t
 |Any|signed 16|int16_t
@@ -227,9 +227,9 @@ Quantized integer divide should use TABLE (for 1/x) and MUL.
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input1|shape1|Input tensor
-|Input|in_t*|input2|shape2|Input tensor with the same rank as input1
-|Output|in_t*|output|shape|Output tensor with broadcast shape if necessary
+|Input|in_out_t*|input1|shape1|Input tensor
+|Input|in_out_t*|input2|shape2|Input tensor with the same rank as input1
+|Output|in_out_t*|output|shape|Output tensor with broadcast shape if necessary
 |===
 
 *Operation Function:*
@@ -239,20 +239,20 @@ Quantized integer divide should use TABLE (for 1/x) and MUL.
 for_each(index in shape) {
     index1 = apply_broadcast(shape, shape1, index);
     index2 = apply_broadcast(shape, shape2, index);
-    in_t value1 = tensor_read<in_t>(input1, shape1, index1);
-    in_t value2 = tensor_read<in_t>(input2, shape2, index2);
+    in_out_t value1 = tensor_read<in_out_t>(input1, shape1, index1);
+    in_out_t value2 = tensor_read<in_out_t>(input2, shape2, index2);
     REQUIRE(value2 != 0);
-    // This catches the case where we divide minimum<in_t> by -1
+    // This catches the case where we divide minimum<in_out_t> by -1
     // which is not representable in two's complement
-    REQUIRE((int64_t)value1 / value2 <= maximum<in_t>);
-    in_t result = value1 / value2;
-    tensor_write<in_t>(output, shape, index, result);
+    REQUIRE((int64_t)value1 / value2 <= maximum<in_out_t>);
+    in_out_t result = value1 / value2;
+    tensor_write<in_out_t>(output, shape, index, result);
 }
 ----
 
 *Supported Data Types:*
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |Any|signed 32|int32_t
 |===
@@ -267,9 +267,9 @@ Axis of size 1 will be broadcast, as necessary. Rank of input tensors must match
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input1|shape1|Input tensor
-|Input|in_t*|input2|shape2|Input tensor with the same rank as input1
-|Output|in_t*|output|shape|Output tensor with broadcast shape if necessary
+|Input|in_out_t*|input1|shape1|Input tensor
+|Input|in_out_t*|input2|shape2|Input tensor with the same rank as input1
+|Output|in_out_t*|output|shape|Output tensor with broadcast shape if necessary
 |===
 
 *Operation Function:*
@@ -279,17 +279,17 @@ Axis of size 1 will be broadcast, as necessary. Rank of input tensors must match
 for_each(index in shape) {
     index1 = apply_broadcast(shape, shape1, index);
     index2 = apply_broadcast(shape, shape2, index);
-    in_t value1 = tensor_read<in_t>(input1, shape1, index1);
-    in_t value2 = tensor_read<in_t>(input2, shape2, index2);
-    in_t result = value1 && value2;
-    tensor_write<in_t>(output, shape, index, result);
+    in_out_t value1 = tensor_read<in_out_t>(input1, shape1, index1);
+    in_out_t value2 = tensor_read<in_out_t>(input2, shape2, index2);
+    in_out_t result = value1 && value2;
+    tensor_write<in_out_t>(output, shape, index, result);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |Any|Bool|bool_t
 |===
@@ -304,9 +304,9 @@ Axis of size 1 will be broadcast, as necessary. Rank of input tensors must match
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input1|shape1|Input tensor
-|Input|in_t*|input2|shape2|Input tensor with the same rank as input1
-|Output|in_t*|output|shape|Output tensor with broadcast shape if necessary
+|Input|in_out_t*|input1|shape1|Input tensor
+|Input|in_out_t*|input2|shape2|Input tensor with the same rank as input1
+|Output|in_out_t*|output|shape|Output tensor with broadcast shape if necessary
 |===
 
 *Operation Function:*
@@ -316,18 +316,18 @@ Axis of size 1 will be broadcast, as necessary. Rank of input tensors must match
 for_each(index in shape) {
     index1 = apply_broadcast(shape, shape1, index);
     index2 = apply_broadcast(shape, shape2, index);
-    in_t value1 = tensor_read<in_t>(input1, shape1, index1);
-    in_t value2 = tensor_read<in_t>(input2, shape2, index2);
+    in_out_t value1 = tensor_read<in_out_t>(input1, shape1, index1);
+    in_out_t value2 = tensor_read<in_out_t>(input2, shape2, index2);
     REQUIRE(0 <= value2 && value2 <= 31);
-    in_t result = value1 << value2;
-    tensor_write<in_t>(output, shape, index, result);
+    in_out_t result = value1 << value2;
+    tensor_write<in_out_t>(output, shape, index, result);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |Any|signed 8|int8_t
 |Any|signed 16|int16_t
@@ -344,9 +344,9 @@ Axis of size 1 will be broadcast, as necessary. Rank of input tensors must match
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input1|shape1|Input tensor
-|Input|in_t*|input2|shape2|Input tensor with the same rank as input1
-|Output|in_t*|output|shape|Output tensor with broadcast shape if necessary
+|Input|in_out_t*|input1|shape1|Input tensor
+|Input|in_out_t*|input2|shape2|Input tensor with the same rank as input1
+|Output|in_out_t*|output|shape|Output tensor with broadcast shape if necessary
 |===
 
 *Operation Function:*
@@ -356,18 +356,18 @@ Axis of size 1 will be broadcast, as necessary. Rank of input tensors must match
 for_each(index in shape) {
     index1 = apply_broadcast(shape, shape1, index);
     index2 = apply_broadcast(shape, shape2, index);
-    in_t value1 = tensor_read<in_t>(input1, shape1, index1);
-    in_t value2 = tensor_read<in_t>(input2, shape2, index2);
+    in_out_t value1 = tensor_read<in_out_t>(input1, shape1, index1);
+    in_out_t value2 = tensor_read<in_out_t>(input2, shape2, index2);
     REQUIRE(0 <= value2 && value2 <= 31);
-    in_t result = (in_t)((unsigned in_t)value1 >> value2);
-    tensor_write<in_t>(output, shape, index, result);
+    in_out_t result = (in_out_t)((unsigned in_out_t)value1 >> value2);
+    tensor_write<in_out_t>(output, shape, index, result);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |Any|signed 8|int8_t
 |Any|signed 16|int16_t
@@ -384,9 +384,9 @@ Axis of size 1 will be broadcast as necessary. Rank of input tensors must match.
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input1|shape1|Input tensor
-|Input|in_t*|input2|shape2|Input tensor with the same rank as input1
-|Output|in_t*|output|shape|Output tensor with broadcast shape if necessary
+|Input|in_out_t*|input1|shape1|Input tensor
+|Input|in_out_t*|input2|shape2|Input tensor with the same rank as input1
+|Output|in_out_t*|output|shape|Output tensor with broadcast shape if necessary
 |===
 
 *Operation Function:*
@@ -396,17 +396,17 @@ Axis of size 1 will be broadcast as necessary. Rank of input tensors must match.
 for_each(index in shape) {
     index1 = apply_broadcast(shape, shape1, index);
     index2 = apply_broadcast(shape, shape2, index);
-    in_t value1 = tensor_read<in_t>(input1, shape1, index1);
-    in_t value2 = tensor_read<in_t>(input2, shape2, index2);
-    in_t result = value1 || value2;
-    tensor_write<in_t>(output, shape, index, result);
+    in_out_t value1 = tensor_read<in_out_t>(input1, shape1, index1);
+    in_out_t value2 = tensor_read<in_out_t>(input2, shape2, index2);
+    in_out_t result = value1 || value2;
+    tensor_write<in_out_t>(output, shape, index, result);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |Any|Bool|bool_t
 |===
@@ -421,9 +421,9 @@ Axis of size 1 will be broadcast as necessary. Rank of input tensors must match.
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input1|shape1|Input tensor
-|Input|in_t*|input2|shape2|Input tensor with the same rank as input1
-|Output|in_t*|output|shape|Output tensor of same type as the input tensors, with broadcast shape if necessary
+|Input|in_out_t*|input1|shape1|Input tensor
+|Input|in_out_t*|input2|shape2|Input tensor with the same rank as input1
+|Output|in_out_t*|output|shape|Output tensor of same type as the input tensors, with broadcast shape if necessary
 |===
 
 *Operation Function:*
@@ -433,17 +433,17 @@ Axis of size 1 will be broadcast as necessary. Rank of input tensors must match.
 for_each(index in shape) {
     index1 = apply_broadcast(shape, shape1, index);
     index2 = apply_broadcast(shape, shape2, index);
-    in_t value1 = tensor_read<in_t>(input1, shape1, index1);
-    in_t value2 = tensor_read<in_t>(input2, shape2, index2);
-    in_t result = value1 != value2;
-    tensor_write<in_t>(output, shape, index, result);
+    in_out_t value1 = tensor_read<in_out_t>(input1, shape1, index1);
+    in_out_t value2 = tensor_read<in_out_t>(input2, shape2, index2);
+    in_out_t result = value1 != value2;
+    tensor_write<in_out_t>(output, shape, index, result);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |Any|Bool|bool_t
 |===
@@ -458,9 +458,9 @@ Axis of size 1 will be broadcast, as necessary. Rank of input tensors must match
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input1|shape1|Input tensor
-|Input|in_t*|input2|shape2|Input tensor with the same rank as input1
-|Output|in_t*|output|shape|Output tensor with broadcast shape if necessary
+|Input|in_out_t*|input1|shape1|Input tensor
+|Input|in_out_t*|input2|shape2|Input tensor with the same rank as input1
+|Output|in_out_t*|output|shape|Output tensor with broadcast shape if necessary
 |===
 
 *Operation Function:*
@@ -470,17 +470,17 @@ Axis of size 1 will be broadcast, as necessary. Rank of input tensors must match
 for_each(index in shape) {
     index1 = apply_broadcast(shape, shape1, index);
     index2 = apply_broadcast(shape, shape2, index);
-    in_t value1 = tensor_read<in_t>(input1, shape1, index1);
-    in_t value2 = tensor_read<in_t>(input2, shape2, index2);
-    in_t result = apply_max(value1, value2);
-    tensor_write<in_t>(output, shape, index, result);
+    in_out_t value1 = tensor_read<in_out_t>(input1, shape1, index1);
+    in_out_t value2 = tensor_read<in_out_t>(input2, shape2, index2);
+    in_out_t result = apply_max(value1, value2);
+    tensor_write<in_out_t>(output, shape, index, result);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |Any|signed 32|int32_t
 |MI, MT|floating-point|float_t
@@ -496,9 +496,9 @@ Axis of size 1 will be broadcast, as necessary. Rank of input tensors must match
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input1|shape1|Input tensor
-|Input|in_t*|input2|shape2|Input tensor with the same rank as input1
-|Output|in_t*|output|shape|Output tensor with broadcast shape if necessary
+|Input|in_out_t*|input1|shape1|Input tensor
+|Input|in_out_t*|input2|shape2|Input tensor with the same rank as input1
+|Output|in_out_t*|output|shape|Output tensor with broadcast shape if necessary
 |===
 
 *Operation Function:*
@@ -508,17 +508,17 @@ Axis of size 1 will be broadcast, as necessary. Rank of input tensors must match
 for_each(index in shape) {
     index1 = apply_broadcast(shape, shape1, index);
     index2 = apply_broadcast(shape, shape2, index);
-    in_t value1 = tensor_read<in_t>(input1, shape1, index1);
-    in_t value2 = tensor_read<in_t>(input2, shape2, index2);
-    in_t result = apply_min(value1, value2);
-    tensor_write<in_t>(output, shape, index, result);
+    in_out_t value1 = tensor_read<in_out_t>(input1, shape1, index1);
+    in_out_t value2 = tensor_read<in_out_t>(input2, shape2, index2);
+    in_out_t result = apply_min(value1, value2);
+    tensor_write<in_out_t>(output, shape, index, result);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |Any|signed 32|int32_t
 |MI, MT|floating-point|float_t
@@ -579,9 +579,9 @@ Axis of size 1 will be broadcast, as necessary. Rank of input tensors must match
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input1|shape1|Input tensor from 1 to 4 dims
-|Input|in_t*|input2|shape2|Input tensor with the same rank as input1
-|Output|in_t*|output|shape|Output tensor of same type as the input tensors, with broadcast shape if necessary
+|Input|in_out_t*|input1|shape1|Input tensor from 1 to 4 dims
+|Input|in_out_t*|input2|shape2|Input tensor with the same rank as input1
+|Output|in_out_t*|output|shape|Output tensor of same type as the input tensors, with broadcast shape if necessary
 |===
 
 *Operation Function:*
@@ -591,17 +591,17 @@ Axis of size 1 will be broadcast, as necessary. Rank of input tensors must match
 for_each(index in shape) {
     index1 = apply_broadcast(shape, shape1, index);
     index2 = apply_broadcast(shape, shape2, index);
-    in_t value1 = tensor_read<in_t>(input1, shape1, index1);
-    in_t value2 = tensor_read<in_t>(input2, shape2, index2);
-    in_t result = apply_pow<in_t>(value1, value2);
-    tensor_write<in_t>(output, shape, index, result);
+    in_out_t value1 = tensor_read<in_out_t>(input1, shape1, index1);
+    in_out_t value2 = tensor_read<in_out_t>(input2, shape2, index2);
+    in_out_t result = apply_pow<in_out_t>(value1, value2);
+    tensor_write<in_out_t>(output, shape, index, result);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |MI, MT|floating-point|float_t
 |===
@@ -616,9 +616,9 @@ Axis of size 1 will be broadcast as necessary. Rank of input tensors must match.
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input1|shape1|Input tensor
-|Input|in_t*|input2|shape2|Input tensor with the same rank as input1
-|Output|in_t*|output|shape|Output tensor with broadcast shape if necessary
+|Input|in_out_t*|input1|shape1|Input tensor
+|Input|in_out_t*|input2|shape2|Input tensor with the same rank as input1
+|Output|in_out_t*|output|shape|Output tensor with broadcast shape if necessary
 |===
 
 *Operation Function:*
@@ -628,17 +628,17 @@ Axis of size 1 will be broadcast as necessary. Rank of input tensors must match.
 for_each(index in shape) {
     index1 = apply_broadcast(shape, shape1, index);
     index2 = apply_broadcast(shape, shape2, index);
-    in_t value1 = tensor_read<in_t>(input1, shape1, index1);
-    in_t value2 = tensor_read<in_t>(input2, shape2, index2);
-    in_t result = apply_sub<in_t>(value1, value2);
-    tensor_write<in_t>(output, shape, index, result);
+    in_out_t value1 = tensor_read<in_out_t>(input1, shape1, index1);
+    in_out_t value2 = tensor_read<in_out_t>(input2, shape2, index2);
+    in_out_t result = apply_sub<in_out_t>(value1, value2);
+    tensor_write<in_out_t>(output, shape, index, result);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |Any|signed 32|int32_t
 |MI, MT|floating-point|float_t
diff --git a/chapters/ewise_ternary.adoc b/chapters/ewise_ternary.adoc
index 751c093..e61e1c2 100644
--- a/chapters/ewise_ternary.adoc
+++ b/chapters/ewise_ternary.adoc
@@ -19,9 +19,9 @@ Elementwise select of the output based on a condition.
 |Argument|Type|Name|Shape|Description
 
 |Input|cmp_t|input1|shape1|Input selector tensor
-|Input|in_t*|input2|shape2|Input value tensor if input1 is True
-|Input|in_t*|input3|shape3|Input value tensor if input1 is False
-|Output|in_t*|output|shape|Output tensor of same type as input2 and input3, with broadcast shape if necessary
+|Input|in_out_t*|input2|shape2|Input value tensor if input1 is True
+|Input|in_out_t*|input3|shape3|Input value tensor if input1 is False
+|Output|in_out_t*|output|shape|Output tensor of same type as input2 and input3, with broadcast shape if necessary
 |===
 
 *Operation Function:*
@@ -33,21 +33,21 @@ for_each(index in shape) {
     index2 = apply_broadcast(shape, shape2, index);
     index3 = apply_broadcast(shape, shape3, index);
     cmp_t value1 = tensor_read<cmp_t>(input1, shape1, index1);
-    in_t value2 = tensor_read<in_t>(input2, shape2, index2);
-    in_t value3 = tensor_read<in_t>(input3, shape3, index3);
-    in_t result;
+    in_out_t value2 = tensor_read<in_out_t>(input2, shape2, index2);
+    in_out_t value3 = tensor_read<in_out_t>(input3, shape3, index3);
+    in_out_t result;
     if (value1) {
          result = value2;
     } else {
          result = value3;
     }
-    tensor_write<in_t>(output, shape, index, result);
+    tensor_write<in_out_t>(output, shape, index, result);
 }
 ----
 
 *Supported Data Types:*
 |===
-|Profile|Mode|cmp_t|in_t
+|Profile|Mode|cmp_t|in_out_t
 
 |Any|Boolean|bool_t|bool_t
 |Any|signed 8|bool_t|int8_t
diff --git a/chapters/ewise_unary.adoc b/chapters/ewise_unary.adoc
index 633b8ac..326cc3c 100644
--- a/chapters/ewise_unary.adoc
+++ b/chapters/ewise_unary.adoc
@@ -18,8 +18,8 @@ Elementwise absolute value operation
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input1|shape|Input tensor
-|Output|in_t*|output|shape|Output tensor of same type, size as the input tensor
+|Input|in_out_t*|input1|shape|Input tensor
+|Output|in_out_t*|output|shape|Output tensor of same type, size as the input tensor
 |===
 
 *Operation Function:*
@@ -27,17 +27,17 @@ Elementwise absolute value operation
 [source,c++]
 ----
 for_each(index in shape) {
-    in_t value1 = tensor_read<in_t>(input1, shape, index);
+    in_out_t value1 = tensor_read<in_out_t>(input1, shape, index);
     if (value1 < 0)
-        value1 = apply_sub<in_t>(0, value1);
-    tensor_write<in_t>(output, shape, index, value1);
+        value1 = apply_sub<in_out_t>(0, value1);
+    tensor_write<in_out_t>(output, shape, index, value1);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |Any|signed 32|int32_t
 |MI, MT|floating-point|float_t
@@ -52,8 +52,8 @@ Elementwise bitwise NOT of input tensor.
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input1|shape|Input tensor
-|Output|in_t*|output|shape|Output tensor of same type, size as the input tensor
+|Input|in_out_t*|input1|shape|Input tensor
+|Output|in_out_t*|output|shape|Output tensor of same type, size as the input tensor
 |===
 
 *Operation Function:*
@@ -61,16 +61,16 @@ Elementwise bitwise NOT of input tensor.
 [source,c++]
 ----
 for_each(index in shape) {
-    in_t value1 = tensor_read<in_t>(input1, shape, index);
-    in_t result = ~value1;
-    tensor_write<in_t>(output, shape, index, result);
+    in_out_t value1 = tensor_read<in_out_t>(input1, shape, index);
+    in_out_t result = ~value1;
+    tensor_write<in_out_t>(output, shape, index, result);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |Any|signed 8|int8_t
 |Any|signed 16|int16_t
@@ -86,8 +86,8 @@ Elementwise ceiling operation
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input1|shape|Input tensor
-|Output|in_t*|output|shape|Output tensor of same type, size as the input tensor
+|Input|in_out_t*|input1|shape|Input tensor
+|Output|in_out_t*|output|shape|Output tensor of same type, size as the input tensor
 |===
 
 *Operation Function:*
@@ -95,16 +95,16 @@ Elementwise ceiling operation
 [source,c++]
 ----
 for_each(index in shape) {
-    in_t value1 = tensor_read<in_t>(input1, shape, index);
-    in_t result = apply_ceil<in_t>(value1);
-    tensor_write<in_t>(output, shape, index, result);
+    in_out_t value1 = tensor_read<in_out_t>(input1, shape, index);
+    in_out_t result = apply_ceil<in_out_t>(value1);
+    tensor_write<in_out_t>(output, shape, index, result);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |MI, MT|floating-point|float_t
 |===
@@ -118,8 +118,8 @@ Elementwise count leading zeros operation
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input1|shape|Input tensor
-|Output|in_t*|output|shape|Output tensor of same type, size as the input tensor
+|Input|in_out_t*|input1|shape|Input tensor
+|Output|in_out_t*|output|shape|Output tensor of same type, size as the input tensor
 |===
 
 *Operation Function:*
@@ -127,15 +127,15 @@ Elementwise count leading zeros operation
 [source,c++]
 ----
 for_each(index in shape) {
-    in_t value1 = tensor_read<in_t>(input1, shape, index);
-    in_t result = count_leading_zeros(value1);
-    tensor_write<in_t>(output, shape, index, result);
+    in_out_t value1 = tensor_read<in_out_t>(input1, shape, index);
+    in_out_t result = count_leading_zeros(value1);
+    tensor_write<in_out_t>(output, shape, index, result);
 }
 ----
 
 *Supported Data Types:*
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |Any|signed 32|int32_t
 |===
@@ -149,8 +149,8 @@ Elementwise e to the x operation
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input1|shape|Input tensor
-|Output|in_t*|output|shape|Output tensor of same type, size as the input tensor
+|Input|in_out_t*|input1|shape|Input tensor
+|Output|in_out_t*|output|shape|Output tensor of same type, size as the input tensor
 |===
 
 *Operation Function:*
@@ -158,16 +158,16 @@ Elementwise e to the x operation
 [source,c++]
 ----
 for_each(index in shape) {
-    in_t value1 = tensor_read<in_t>(input1, shape, index);
-    in_t result = apply_exp<in_t>(value1);
-    tensor_write<in_t>(output, shape, index, result);
+    in_out_t value1 = tensor_read<in_out_t>(input1, shape, index);
+    in_out_t result = apply_exp<in_out_t>(value1);
+    tensor_write<in_out_t>(output, shape, index, result);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |MI, MT|floating-point|float_t
 |===
@@ -181,8 +181,8 @@ Elementwise floor operation
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input1|shape|Input tensor
-|Output|in_t*|output|shape|Output tensor of same type, size as the input tensor
+|Input|in_out_t*|input1|shape|Input tensor
+|Output|in_out_t*|output|shape|Output tensor of same type, size as the input tensor
 |===
 
 *Operation Function:*
@@ -190,16 +190,16 @@ Elementwise floor operation
 [source,c++]
 ----
 for_each(index in shape) {
-    in_t value1 = tensor_read<in_t>(input1, shape, index);
-    in_t result = apply_floor<in_t>(value1);
-    tensor_write<in_t>(output, shape, index, result);
+    in_out_t value1 = tensor_read<in_out_t>(input1, shape, index);
+    in_out_t result = apply_floor<in_out_t>(value1);
+    tensor_write<in_out_t>(output, shape, index, result);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |MI, MT|floating-point|float_t
 |===
@@ -213,8 +213,8 @@ Elementwise natural logarithm operation
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input1|shape|Input tensor
-|Output|in_t*|output|shape|Output tensor of same type, size as the input tensor
+|Input|in_out_t*|input1|shape|Input tensor
+|Output|in_out_t*|output|shape|Output tensor of same type, size as the input tensor
 |===
 
 *Operation Function:*
@@ -222,16 +222,16 @@ Elementwise natural logarithm operation
 [source,c++]
 ----
 for_each(index in shape) {
-    in_t value1 = tensor_read<in_t>(input1, shape, index);
-    in_t result = apply_log<in_t>(value1);
-    tensor_write<in_t>(output, shape, index, result);
+    in_out_t value1 = tensor_read<in_out_t>(input1, shape, index);
+    in_out_t result = apply_log<in_out_t>(value1);
+    tensor_write<in_out_t>(output, shape, index, result);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |MI, MT|floating-point|float_t
 |===
@@ -245,8 +245,8 @@ Elementwise logical NOT of input.
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input1|shape|Input tensor
-|Output|in_t*|output|shape|Output tensor of same type, size as the input tensor
+|Input|in_out_t*|input1|shape|Input tensor
+|Output|in_out_t*|output|shape|Output tensor of same type, size as the input tensor
 |===
 
 *Operation Function:*
@@ -254,16 +254,16 @@ Elementwise logical NOT of input.
 [source,c++]
 ----
 for_each(index in shape) {
-    in_t value1 = tensor_read<in_t>(input1, shape1, index);
-    in_t result = !value1;
-    tensor_write<in_t>(output, shape, index, result);
+    in_out_t value1 = tensor_read<in_out_t>(input1, shape, index);
+    in_out_t result = !value1;
+    tensor_write<in_out_t>(output, shape, index, result);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |Any|bool|bool_t
 |===
@@ -277,31 +277,31 @@ Elementwise negation operation
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input1|shape|Input tensor
-|Attribute|in_t|input1_zp|-|Input 1 zero point. Must be zero for non-int8 types.
-|Attribute|in_t|output_zp|-|Output zero point. Must be zero for non-int8 types.
-|Output|in_t*|output|shape|Output tensor of same type, size as the input tensor
+|Input|in_out_t*|input1|shape|Input tensor
+|Attribute|in_out_t|input1_zp|-|Input 1 zero point. Must be zero for non-int8 types.
+|Attribute|in_out_t|output_zp|-|Output zero point. Must be zero for non-int8 types.
+|Output|in_out_t*|output|shape|Output tensor of same type, size as the input tensor
 |===
 
 *Operation Function:*
 
 [source,c++]
 ----
-ERROR_IF(in_t != int8_t && input1_zp != 0) // Zero point only for int8_t
-ERROR_IF(in_t != int8_t && output_zp != 0) // Zero point only for int8_t
+ERROR_IF(in_out_t != int8_t && input1_zp != 0); // Zero point only for int8_t
+ERROR_IF(in_out_t != int8_t && output_zp != 0); // Zero point only for int8_t
 for_each(index in shape) {
-    in_t value1 = tensor_read<in_t>(input1, shape, index);
+    in_out_t value1 = tensor_read<in_out_t>(input1, shape, index);
     acc_t value = (acc_t)value1 - input1_zp;
     value = apply_sub<acc_t>(0, value);
-    in_t result = (in_t)apply_clip<acc_t>(value + output_zp, minimum<in_t>, maximum<in_t>);
-    tensor_write<in_t>(output, shape, index, result);
+    in_out_t result = (in_out_t)apply_clip<acc_t>(value + output_zp, minimum<in_out_t>, maximum<in_out_t>);
+    tensor_write<in_out_t>(output, shape, index, result);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t|acc_t
+|Profile|Mode|in_out_t|acc_t
 
 |Any|signed 8|int8_t|int32_t
 |Any|signed 16|int16_t|int32_t
@@ -318,8 +318,8 @@ Elementwise reciprocal operation. For integer operation, a TABLE should be used
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input1|shape|Input tensor
-|Output|in_t*|output|shape|Output tensor of same type, size as the input tensor
+|Input|in_out_t*|input1|shape|Input tensor
+|Output|in_out_t*|output|shape|Output tensor of same type, size as the input tensor
 |===
 
 *Operation Function:*
@@ -327,16 +327,16 @@ Elementwise reciprocal operation. For integer operation, a TABLE should be used
 [source,c++]
 ----
 for_each(index in shape) {
-    in_t value1 = tensor_read<in_t>(input1, shape1, index);
-    in_t result = 1.0 / value1;
-    tensor_write<in_t>(output, shape, index, result);
+    in_out_t value1 = tensor_read<in_out_t>(input1, shape, index);
+    in_out_t result = 1.0 / value1;
+    tensor_write<in_out_t>(output, shape, index, result);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |MI, MT|floating-point|float_t
 |===
@@ -350,8 +350,8 @@ Elementwise reciprocal square root operation. For integer operation, a TABLE sho
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input1|shape|Input tensor
-|Output|in_t*|output|shape|Output tensor of same type, size as the input tensor
+|Input|in_out_t*|input1|shape|Input tensor
+|Output|in_out_t*|output|shape|Output tensor of same type, size as the input tensor
 |===
 
 *Operation Function:*
@@ -359,16 +359,16 @@ Elementwise reciprocal square root operation. For integer operation, a TABLE sho
 [source,c++]
 ----
 for_each(index in shape) {
-    in_t value1 = tensor_read<in_t>(input1, shape1, index);
-    in_t result = 1.0 / apply_sqrt<in_t>(value1);
-    tensor_write<in_t>(output, shape, index, result);
+    in_out_t value1 = tensor_read<in_out_t>(input1, shape, index);
+    in_out_t result = 1.0 / apply_sqrt<in_out_t>(value1);
+    tensor_write<in_out_t>(output, shape, index, result);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |MI, MT|floating-point|float_t
 |===
diff --git a/chapters/reduction.adoc b/chapters/reduction.adoc
index 11db960..fdf30df 100644
--- a/chapters/reduction.adoc
+++ b/chapters/reduction.adoc
@@ -18,9 +18,9 @@ Reduce a tensor along the given axis with a logical AND operation
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input|shape1|Input tensor with rank from 1 to 4
+|Input|in_out_t*|input|shape1|Input tensor with rank from 1 to 4
 |Attribute|int32_t|axis|-|Axis to reduce, in range from 0 to rank(shape1)-1
-|Output|in_t*|output|shape|Output tensor. Same rank as the input tensor.
+|Output|in_out_t*|output|shape|Output tensor. Same rank as the input tensor.
 |===
 
 *Operation Function:*
@@ -32,22 +32,22 @@ ERROR_IF(shape[axis] != 1);
 
 // Initialize output state to true
 for_each(index in shape) {
-    tensor_write<in_t>(output, shape, index, true);
+    tensor_write<in_out_t>(output, shape, index, true);
 }
 for_each(index in shape1) {
     out_index = index;
     out_index[axis] = 0;
-    in_t value = tensor_read<in_t>(input, shape1, index);
-    in_t state = tensor_read<in_t>(output, shape, out_index);
+    in_out_t value = tensor_read<in_out_t>(input, shape1, index);
+    in_out_t state = tensor_read<in_out_t>(output, shape, out_index);
     state      = state && value;
-    tensor_write<in_t>(output, shape, out_index, state);
+    tensor_write<in_out_t>(output, shape, out_index, state);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |Any|Boolean|bool_t
 |===
@@ -61,9 +61,9 @@ Reduce a tensor along the given axis with a logical OR operation
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input|shape1|Input tensor with rank from 1 to 4
+|Input|in_out_t*|input|shape1|Input tensor with rank from 1 to 4
 |Attribute|int32_t|axis|-|Axis to reduce, in range from 0 to rank(shape1)-1
-|Output|in_t*|output|shape|Output tensor. Same rank as the input tensor.
+|Output|in_out_t*|output|shape|Output tensor. Same rank as the input tensor.
 |===
 
 *Operation Function:*
@@ -75,22 +75,22 @@ ERROR_IF(shape[axis] != 1);
 
 // Initialize output state to false
 for_each(index in shape) {
-    tensor_write<in_t>(output, shape, index, false);
+    tensor_write<in_out_t>(output, shape, index, false);
 }
 for_each(index in shape1) {
     out_index = index;
     out_index[axis] = 0;
-    in_t value = tensor_read<in_t>(input, shape1, index);
-    in_t state = tensor_read<in_t>(output, shape, out_index);
+    in_out_t value = tensor_read<in_out_t>(input, shape1, index);
+    in_out_t state = tensor_read<in_out_t>(output, shape, out_index);
     state      = state || value;
-    tensor_write<in_t>(output, shape, out_index, state);
+    tensor_write<in_out_t>(output, shape, out_index, state);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |Any|Boolean|bool_t
 |===
@@ -104,9 +104,9 @@ Reduce a tensor along the given axis with a maximum operation
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input|shape1|Input tensor with rank from 1 to 4
+|Input|in_out_t*|input|shape1|Input tensor with rank from 1 to 4
 |Attribute|int32_t|axis|-|Axis to reduce, in range from 0 to rank(shape1)-1
-|Output|in_t*|output|shape|Output tensor. Same rank as the input tensor.
+|Output|in_out_t*|output|shape|Output tensor. Same rank as the input tensor.
 |===
 
 *Operation Function:*
@@ -116,22 +116,22 @@ Reduce a tensor along the given axis with a maximum operation
 ERROR_IF(axis < 0  || axis >= rank(shape1));
 ERROR_IF(shape[axis] != 1);
 for_each(index in shape) {
-    tensor_write<in_t>(output, shape, index, minimum<in_t>);
+    tensor_write<in_out_t>(output, shape, index, minimum<in_out_t>);
 }
 for_each(index in shape1) {
     out_index = index;
     out_index[axis] = 0;
-    in_t value = tensor_read<in_t>(input, shape1, index);
-    in_t state = tensor_read<in_t>(output, shape, out_index);
-    state      = apply_max<in_t>(state, value);
-    tensor_write<in_t>(output, shape, out_index, state);
+    in_out_t value = tensor_read<in_out_t>(input, shape1, index);
+    in_out_t state = tensor_read<in_out_t>(output, shape, out_index);
+    state      = apply_max<in_out_t>(state, value);
+    tensor_write<in_out_t>(output, shape, out_index, state);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |Any|signed 8|int8_t
 |Any|signed 16|int16_t
@@ -147,9 +147,9 @@ Reduce a tensor along the given axis with a minimum operation
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input|shape1|Input tensor with rank from 1 to 4
+|Input|in_out_t*|input|shape1|Input tensor with rank from 1 to 4
 |Attribute|int32_t|axis|-|Axis to reduce, in range from 0 to rank(shape1)-1
-|Output|in_t*|output|shape|Output tensor. Same rank as the input tensor.
+|Output|in_out_t*|output|shape|Output tensor. Same rank as the input tensor.
 |===
 
 *Operation Function:*
@@ -159,22 +159,22 @@ Reduce a tensor along the given axis with a minimum operation
 ERROR_IF(axis < 0  || axis >= rank(shape1));
 ERROR_IF(shape[axis] != 1);
 for_each(index in shape) {
-    tensor_write<in_t>(output, shape, index, maximum<in_t>);
+    tensor_write<in_out_t>(output, shape, index, maximum<in_out_t>);
 }
 for_each(index in shape1) {
     out_index = index;
     out_index[axis] = 0;
-    in_t value = tensor_read<in_t>(input, shape1, index);
-    in_t state = tensor_read<in_t>(output, shape, out_index);
-    state      = apply_min<in_t>(state, value);
-    tensor_write<in_t>(output, shape, out_index, state);
+    in_out_t value = tensor_read<in_out_t>(input, shape1, index);
+    in_out_t state = tensor_read<in_out_t>(output, shape, out_index);
+    state      = apply_min<in_out_t>(state, value);
+    tensor_write<in_out_t>(output, shape, out_index, state);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |Any|signed 8|int8_t
 |Any|signed 16|int16_t
@@ -191,9 +191,9 @@ Reduce a tensor along the given axis by computing the product of the axis.
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input|shape1|Input tensor with rank from 1 to 4
+|Input|in_out_t*|input|shape1|Input tensor with rank from 1 to 4
 |Attribute|int32_t|axis|-|Axis to reduce, in range from 0 to rank(shape1)-1
-|Output|in_t*|output|shape|Output tensor. Same rank as the input tensor.
+|Output|in_out_t*|output|shape|Output tensor. Same rank as the input tensor.
 |===
 
 *Operation Function:*
@@ -203,22 +203,22 @@ Reduce a tensor along the given axis by computing the product of the axis.
 ERROR_IF(axis < 0  || axis >= rank(shape1));
 ERROR_IF(shape[axis] != 1);
 for_each(index in shape) {
-    tensor_write<in_t>(output, shape, index, 1.0);
+    tensor_write<in_out_t>(output, shape, index, 1.0);
 }
 for_each(index in shape1) {
     out_index = index;
     out_index[axis] = 0;
-    in_t value = tensor_read<in_t>(input, shape1, index);
-    in_t state = tensor_read<in_t>(output, shape, out_index);
+    in_out_t value = tensor_read<in_out_t>(input, shape1, index);
+    in_out_t state = tensor_read<in_out_t>(output, shape, out_index);
     state      = state * value;
-    tensor_write<in_t>(output, shape, out_index, state);
+    tensor_write<in_out_t>(output, shape, out_index, state);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |MI, MT|floating-point|float_t
 |===
@@ -232,9 +232,9 @@ Reduce a tensor along the given axis by computing the sum of the axis.
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input|shape1|Input tensor with rank from 1 to 4
+|Input|in_out_t*|input|shape1|Input tensor with rank from 1 to 4
 |Attribute|int32_t|axis|-|Axis to reduce, in range from 0 to rank(shape1)-1
-|Output|in_t*|output|shape|Output tensor. Same rank as the input tensor.
+|Output|in_out_t*|output|shape|Output tensor. Same rank as the input tensor.
 |===
 
 *Operation Function:*
@@ -244,22 +244,22 @@ Reduce a tensor along the given axis by computing the sum of the axis.
 ERROR_IF(axis < 0  || axis >= rank(shape1));
 ERROR_IF(shape[axis] != 1);
 for_each(index in shape) {
-    tensor_write<in_t>(output, shape, index, 0);
+    tensor_write<in_out_t>(output, shape, index, 0);
 }
 for_each(index in shape1) {
     out_index = index;
     out_index[axis] = 0;
-    in_t value = tensor_read<in_t>(input, shape1, index);
-    in_t state = tensor_read<in_t>(output, shape, out_index);
-    state      = apply_add<in_t>(state, value);
-    tensor_write<in_t>(output, shape, out_index, state);
+    in_out_t value = tensor_read<in_out_t>(input, shape1, index);
+    in_out_t state = tensor_read<in_out_t>(output, shape, out_index);
+    state      = apply_add<in_out_t>(state, value);
+    tensor_write<in_out_t>(output, shape, out_index, state);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |Any|signed 32|int32_t
 |MI, MT|floating-point|float_t
diff --git a/chapters/tensor_ops.adoc b/chapters/tensor_ops.adoc
index cfab5ba..b2f0754 100644
--- a/chapters/tensor_ops.adoc
+++ b/chapters/tensor_ops.adoc
@@ -74,21 +74,21 @@ When calculating the average, only the number of valid input tensor values, but
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input|[N,IH,IW,C]|Input tensor 4D
+|Input|in_out_t*|input|[N,IH,IW,C]|Input tensor 4D
 |Attribute|int*|kernel|[2]|[kernel_y, kernel_x]
 |Attribute|int*|stride|[2]|[stride_y, stride_x]
 |Attribute|int*|pad|[4]|[pad_top, pad_bottom, pad_left, pad_right]
-|Attribute|in_t|input_zp|-|Input tensor zero point. Must be zero for non-int8 types.
-|Attribute|in_t|output_zp|-|Output tensor zero point. Must be zero for non-int8 types.
-|Output|in_t*|output|[N,H,W,C]|Output tensor 4D
+|Attribute|in_out_t|input_zp|-|Input tensor zero point. Must be zero for non-int8 types.
+|Attribute|in_out_t|output_zp|-|Output tensor zero point. Must be zero for non-int8 types.
+|Output|in_out_t*|output|[N,H,W,C]|Output tensor 4D
 |===
 
 *Operation Function:*
 
 [source,c++]
 ----
-ERROR_IF(in_t != int8_t && input_zp != 0); // Zero point only for int8_t
-ERROR_IF(in_t != int8_t && output_zp != 0); // Zero point only for int8_t
+ERROR_IF(in_out_t != int8_t && input_zp != 0); // Zero point only for int8_t
+ERROR_IF(in_out_t != int8_t && output_zp != 0); // Zero point only for int8_t
 ERROR_IF(kernel_y < 1 || kernel_x < 1); // kernel size must be >= 1
 ERROR_IF(stride_y < 1 || stride_x < 1);
 ERROR_IF(pad_top < 0 || pad_bottom < 0 || pad_left < 0 || pad_right < 0);
@@ -102,7 +102,7 @@ ERROR_IF(H != idiv((IH + pad_top + pad_bottom + stride_y - kernel_y), stride_y))
 ERROR_IF(W != idiv((IW + pad_left + pad_right + stride_x - kernel_x), stride_x));
 
 for_each(0 <= n < N, 0 <= oy < H, 0 <= ox < W, 0 <= c < C ) {
-    in_t output_val;
+    in_out_t output_val;
     acc_t acc = 0;
     int count = 0;
     iy = oy * stride_y - pad_top;
@@ -114,25 +114,25 @@ for_each(0 <= n < N, 0 <= oy < H, 0 <= ox < W, 0 <= c < C ) {
         // average, padding does not count
         if (0 <= y < IH and 0 <= x < IW) {
             count++;
-            acc_t value = tensor_read<in_t>(input, [N,IH,IW,C], [n,y,x,c]);
+            acc_t value = tensor_read<in_out_t>(input, [N,IH,IW,C], [n,y,x,c]);
             value = value - input_zp;
             acc = apply_add<acc_t>(acc, value);
         }
     }
-    if (is_float(in_t)) {
+    if (is_float(in_out_t)) {
         output_val = acc / (float)count;
     } else {
         scale_t scale = reciprocal_scale(count);
         acc = apply_scale_32(acc, scale.multiplier, scale.shift, false);
-        output_val = (in_t)apply_clip<acc_t>(acc + output_zp, minimum<in_t>, maximum<in_t>)
+        output_val = (in_out_t)apply_clip<acc_t>(acc + output_zp, minimum<in_out_t>, maximum<in_out_t>);
     }
-    tensor_write<in_t>(output, [N,H,W,C], [n,oy,ox,c], output_val);
+    tensor_write<in_out_t>(output, [N,H,W,C], [n,oy,ox,c], output_val);
 }
 ----
 
 *Supported Data Types:*
 |===
-|Profile|Mode|in_t|acc_t
+|Profile|Mode|in_out_t|acc_t
 
 |Any|signed 8|int8_t|int32_t
 |Any|signed 16|int16_t|int32_t
@@ -150,13 +150,13 @@ Performs a 2D convolution over the given tensor input, using the weight tensor.
 
 |Input|in_t*|input|[N,IH,IW,IC]|Input tensor
 |Input (MT profile) Attribute (BI/MI profiles)|weight_t*|weight|[OC,KH,KW,IC]|Weight kernel size KH x KW
-|Input (MT profile) Attribute (BI/MI profiles)|acc_t*|bias|[OC]|Per output channel bias data.
+|Input (MT profile) Attribute (BI/MI profiles)|out_t*|bias|[OC]|Per output channel bias data.
 |Attribute|int*|pad|[4]|[pad_top, pad_bottom, pad_left, pad_right]
 |Attribute|int*|stride|[2]|[stride_y, stride_x]
 |Attribute|int*|dilation|[2]|[dilation_y, dilation_x]
 |Attribute|in_t|input_zp|-|Input tensor zero point. Must be zero for non-int8 types.
 |Attribute|weight_t|weight_zp|-|Weight zero point. Must be zero for non-int8 types.
-|Output|acc_t*|output|[N,H,W,OC]|Output tensor
+|Output|out_t*|output|[N,H,W,OC]|Output tensor
 |===
 
 *Operation Function*
@@ -170,29 +170,29 @@ ERROR_IF(stride_y < 1 || stride_x < 1);
 ERROR_IF(dilation_y < 1 || dilation_x < 1);
 pad = flatten([0,0], pad, [0,0]);
 for_each(0 <= n < N, 0 <= oy < H, 0 <= ox < W; 0 <= oc < OC) {
-    acc_t acc = 0;
+    out_t acc = 0;
     iy = oy * stride_y - pad_top;
     ix = ox * stride_x - pad_left;
     for_each(0 <= ky < KH, 0 <= kx < KW, 0 <= ic < IC) {
         y = iy + ky * dilation_y;
         x = ix + kx * dilation_x;
         if (0 <= y < IH && 0 <= x < IW) {
-            acc_t value  = tensor_read<in_t>(input, [N,IH,IW,IC], [n,y,x,ic]);
-            acc_t weight = tensor_read<weight_t>(weight, [OC,KH,KW,IC], [oc,ky,kx,ic]);
+            out_t value  = tensor_read<in_t>(input, [N,IH,IW,IC], [n,y,x,ic]);
+            out_t weight = tensor_read<weight_t>(weight, [OC,KH,KW,IC], [oc,ky,kx,ic]);
             value  = value - input_zp;
             weight = weight - weight_zp;
-            acc = apply_add<acc_t>(acc, value * weight);
+            acc = apply_add<out_t>(acc, value * weight);
         }
     }
-    acc = apply_add<acc_t>(acc, bias[oc]);
-    tensor_write<acc_t>(output, [N,H,W,OC], [n,oy,ox,oc], acc);
+    acc = apply_add<out_t>(acc, bias[oc]);
+    tensor_write<out_t>(output, [N,H,W,OC], [n,oy,ox,oc], acc);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t|weight_t|acc_t
+|Profile|Mode|in_t|weight_t|out_t
 
 |Any|signed 8x8|int8_t|int8_t|int32_t
 |Any|signed 8x4|int8_t|int4_t|int32_t
@@ -211,13 +211,13 @@ Performs a 3D convolution over the given input tensor.
 
 |Input|in_t*|input|[N,ID,IH,IW,IC]|Input tensor
 |Input (MT profile) Attribute (BI/MI profiles)|weight_t*|weight|[OC,KD,KH,KW,IC]|Weight kernel size KDxKHxKW
-|Input (MT profile) Attribute (BI/MI profiles)|acc_t*|bias|[OC]|Per output channel bias data.
+|Input (MT profile) Attribute (BI/MI profiles)|out_t*|bias|[OC]|Per output channel bias data.
 |Attribute|int*|pad|[6]|[pad_d0, pad_d1, pad_top, pad_bottom, pad_left, pad_right]
 |Attribute|int*|stride|[3]|[stride_d, stride_y, stride_x]
 |Attribute|int*|dilation|[3]|[dilation_d, dilation_y, dilation_x]
 |Attribute|in_t|input_zp|-|Input tensor zero point. Must be zero for non-int8 types.
 |Attribute|weight_t|weight_zp|-|Weight zero point. Must be zero for non-int8 types.
-|Output|acc_t*|output|[N,D,H,W,OC]|Output tensor
+|Output|out_t*|output|[N,D,H,W,OC]|Output tensor
 |===
 
 *Operation Function*
@@ -231,7 +231,7 @@ ERROR_IF(stride_d < 1 || stride_y < 1 || stride_x < 1);
 ERROR_IF(dilation_d < 1 || dilation_y < 1 || dilation_x < 1);
 pad = flatten([0,0], pad, [0,0]);
 for_each(0 <= n < N, 0 <= od < D, 0 <= oy < H, 0 <= ox < W; 0 <= oc < OC) {
-    acc_t acc = 0;
+    out_t acc = 0;
     id = od * stride_d - pad_d0;
     iy = oy * stride_y - pad_top;
     ix = ox * stride_x - pad_left;
@@ -240,22 +240,22 @@ for_each(0 <= n < N, 0 <= od < D, 0 <= oy < H, 0 <= ox < W; 0 <= oc < OC) {
         y = iy + ky * dilation_y;
         x = ix + kx * dilation_x;
         if (0 <= x < IW && 0 <= y < IH && 0 <= d <= ID) {
-            acc_t value  = tensor_read<in_t>(input, [N,ID,IH,IW,IC], [n,d,y,x,ic]);
-            acc_t weight = tensor_read<weight_t>(weight,[OC,KD,KH,KW,IC],[oc,kd,ky,kx,ic]);
+            out_t value  = tensor_read<in_t>(input, [N,ID,IH,IW,IC], [n,d,y,x,ic]);
+            out_t weight = tensor_read<weight_t>(weight,[OC,KD,KH,KW,IC],[oc,kd,ky,kx,ic]);
             value  = value - input_zp;
             weight = weight - weight_zp;
-            acc = apply_add<acc_t>(acc, value * weight);
+            acc = apply_add<out_t>(acc, value * weight);
         }
     }
-    acc = apply_add<acc_t>(acc, bias[oc]);
-    tensor_write<acc_t>(output, [N,D,H,W,OC], [n,od,oy,ox,oc], acc);
+    acc = apply_add<out_t>(acc, bias[oc]);
+    tensor_write<out_t>(output, [N,D,H,W,OC], [n,od,oy,ox,oc], acc);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t|weight_t|acc_t
+|Profile|Mode|in_t|weight_t|out_t
 
 |Any|signed 8x8|int8_t|int8_t|int32_t
 |Any|signed 8x4|int8_t|int4_t|int32_t
@@ -275,13 +275,13 @@ Performs 2D convolutions separately over each channel of the given tensor input,
 
 |Input|in_t*|input|[N,H,W,C]|Input tensor
 |Input (MT profile) Attribute (BI/MI profiles)|weight_t*|weight|[KH,KW,C,M]|Weight kernel size KH x KW
-|Input (MT profile) Attribute (BI/MI profiles)|acc_t*|bias|[C*M]|Per output channel bias data.
+|Input (MT profile) Attribute (BI/MI profiles)|out_t*|bias|[C*M]|Per output channel bias data.
 |Attribute|int*|pad|[4]|[pad_top, pad_bottom, pad_left, pad_right]
 |Attribute|int*|stride|[2]|[stride_y, stride_x]
 |Attribute|int*|dilation|[2]|[dilation_y, dilation_x]
 |Attribute|in_t|input_zp|-|Input tensor zero point. Must be zero for non-int8 types.
 |Attribute|weight_t|weight_zp|-|Weight zero point. Must be zero for non-int8 types.
-|Output|acc_t*|output|[N,H,W,C*M]|Output tensor
+|Output|out_t*|output|[N,H,W,C*M]|Output tensor
 |===
 
 *Operation Function*
@@ -295,29 +295,29 @@ ERROR_IF(stride_y < 1 || stride_x < 1);
 ERROR_IF(dilation_y < 1 || dilation_x < 1);
 pad = flatten([0,0], pad, [0,0]);
 for_each(0 <= n<N, 0 <= oy < H, 0 <= ox < W; 0 <= c < (C * M), 0 <= m < M) {
-    acc_t acc = 0;
+    out_t acc = 0;
     iy = oy * stride_y - pad_top;
     ix = ox * stride_x - pad_left;
     for_each(0 <= ky < KH, 0 <= kx < KW) {
         y = iy + ky * dilation_y;
         x = ix + kx * dilation_x;
         if (0 <= y < IH && 0 <= x < IW) {
-            acc_t value  = tensor_read<in_t>(input, [N,H,W,C], [n,y,x,c]);
-            acc_t weight = tensor_read<weight_t>(weight, [KH,KW,C,M], [ky,kx,c,m]);
+            out_t value  = tensor_read<in_t>(input, [N,H,W,C], [n,y,x,c]);
+            out_t weight = tensor_read<weight_t>(weight, [KH,KW,C,M], [ky,kx,c,m]);
             value  = value - input_zp;
             weight = weight - weight_zp;
-            acc = apply_add<acc_t>(acc, value * weight);
+            acc = apply_add<out_t>(acc, value * weight);
         }
     }
-    acc = apply_add<acc_t>(acc, bias[(c * M) + m]);
-    tensor_write<acc_t>(output, [N,H,W,C * M], [n,oy,ox,c * M + m], acc);
+    acc = apply_add<out_t>(acc, bias[(c * M) + m]);
+    tensor_write<out_t>(output, [N,H,W,C * M], [n,oy,ox,c * M + m], acc);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t|weight_t|acc_t
+|Profile|Mode|in_t|weight_t|out_t
 
 |Any|signed 8x8|int8_t|int8_t|int32_t
 |Any|signed 8x4|int8_t|int4_t|int32_t
@@ -336,10 +336,10 @@ Performs a fully connected network.
 
 |Input|in_t*|input|[N,IC]|Input tensor
 |Attribute|weight_t*|weight|[OC,IC]|Weights
-|Attribute|acc_t*|bias|[OC]|Per output channel bias data.
+|Attribute|out_t*|bias|[OC]|Per output channel bias data.
 |Attribute|in_t|input_zp|-|Input tensor zero point. Must be zero for non-int8 types.
 |Attribute|weight_t|weight_zp|-|Weight zero point. Must be zero for non-int8 types.
-|Output|acc_t*|output|[N,OC]|Output tensor
+|Output|out_t*|output|[N,OC]|Output tensor
 |===
 
 *Operation Function*
@@ -349,23 +349,23 @@ Performs a fully connected network.
 ERROR_IF(in_t != int8_t && input_zp != 0); // Zero point only for int8_t
 ERROR_IF(weight_t != int8_t && weight_zp != 0);
 for_each(0 <= n < N, 0 <= oc < OC) {
-    acc_t acc = 0;
+    out_t acc = 0;
     for_each(0 <= ic < IC) {
-        acc_t value  = tensor_read<in_t>(input, [N,IC], [n,ic]);
-        acc_t weight = tensor_read<weight_t>(weight, [OC,IC], [oc,ic]);
+        out_t value  = tensor_read<in_t>(input, [N,IC], [n,ic]);
+        out_t weight = tensor_read<weight_t>(weight, [OC,IC], [oc,ic]);
         value  = value - input_zp;
         weight = weight - weight_zp;
-        acc = apply_add<acc_t>(acc, value * weight);
+        acc = apply_add<out_t>(acc, value * weight);
     }
-    acc = apply_add<acc_t>(acc, bias[oc]);
-    tensor_write<acc_t>(output, [N,OC], [n,oc], acc);
+    acc = apply_add<out_t>(acc, bias[oc]);
+    tensor_write<out_t>(output, [N,OC], [n,oc], acc);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t|weight_t|acc_t
+|Profile|Mode|in_t|weight_t|out_t
 
 |Any|signed 8x8|int8_t|int8_t|int32_t
 |Any|signed 8x4|int8_t|int4_t|int32_t
@@ -385,7 +385,7 @@ Performs two dimensional matrix multiplications. This allows both inputs to be a
 |Input|in_t*|B|[N,C,W]|Input tensor B, N matrices of size CxW
 |Attribute|in_t|A_zp|-|Input tensor A zero point. Must be zero for non-int8 types.
 |Attribute|in_t|B_zp|-|Input tensor B zero point. Must be zero for non-int8 types.
-|Output|acc_t*|output|[N,H,W]|Output tensor, N matrices of size HxW
+|Output|out_t*|output|[N,H,W]|Output tensor, N matrices of size HxW
 |===
 
 *Operation Function*
@@ -394,22 +394,22 @@ Performs two dimensional matrix multiplications. This allows both inputs to be a
 ----
 ERROR_IF(in_t != int8_t && (A_zp != 0 || B_zp != 0)); // Zero point only for int8_t
 for_each(0 <= n < N, 0 <= h < H, 0 <= w < W) {
-    acc_t acc = 0;
+    out_t acc = 0;
     for_each(0 <= c < C) {
-        acc_t value1 = tensor_read<in_t>(A, [N,H,C], [n,h,c]);
-        acc_t value2 = tensor_read<in_t>(B, [N,C,W], [n,c,w]);
+        out_t value1 = tensor_read<in_t>(A, [N,H,C], [n,h,c]);
+        out_t value2 = tensor_read<in_t>(B, [N,C,W], [n,c,w]);
         value1 = value1 - A_zp;
         value2 = value2 - B_zp;
-        acc = apply_add<acc_t>(acc, value1 * value2);
+        acc = apply_add<out_t>(acc, value1 * value2);
     }
-    tensor_write<acc_t>(output, [N,H,W], [n,h,w], acc);
+    tensor_write<out_t>(output, [N,H,W], [n,h,w], acc);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t|acc_t
+|Profile|Mode|in_t|out_t
 
 |Any|signed 8x8|int8_t|int32_t
 |Any|signed 16x16|int16_t|int48_t
@@ -424,11 +424,11 @@ This performs a max pooling over the given input tensor. A sliding window of siz
 |===
 |Argument|Type|Name|Shape|Description
 
-|Input|in_t*|input|[N,IH,IW,C]|Input tensor 4D
+|Input|in_out_t*|input|[N,IH,IW,C]|Input tensor 4D
 |Attribute|int*|kernel|[2]|[kernel_y, kernel_x]
 |Attribute|int*|stride|[2]|[stride_y, stride_x]
 |Attribute|int*|pad|[4]|[pad_top, pad_bottom, pad_left, pad_right]
-|Output|in_t*|output|[N,H,W,C]|Output tensor 4D
+|Output|in_out_t*|output|[N,H,W,C]|Output tensor 4D
 |===
 
 *Operation Function:*
@@ -448,25 +448,25 @@ ERROR_IF(H != idiv((IH + pad_top + pad_bottom + stride_y - kernel_y), stride_y))
 ERROR_IF(W != idiv((IW + pad_left + pad_right + stride_x - kernel_x), stride_x));
 
 for_each(0 <= n < N, 0 <= oy < H, 0 <= ox < W, 0 <= c < C ) {
-    in_t acc = minimum_value<in_t>;
+    in_out_t acc = minimum_value<in_out_t>;
     iy = oy * stride_y - pad_top;
     ix = ox * stride_x - pad_left;
     for_each( 0 <= ky < kernel_y, 0 <= kx < kernel_x ) {
         y = iy + ky;
         x = ix + kx;
         if (y >= 0 && y < IH && x >= 0 && x < IW) {
-            in_t value = tensor_read<in_t>(input, [N,IH,IW,C], [n,y,x,c]);
+            in_out_t value = tensor_read<in_out_t>(input, [N,IH,IW,C], [n,y,x,c]);
             acc = apply_max(acc, value);
         }
     }
-    tensor_write<in_t>(output, [N,H,W,C], [n,oy,ox,c], acc);
+    tensor_write<in_out_t>(output, [N,H,W,C], [n,oy,ox,c], acc);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t
+|Profile|Mode|in_out_t
 
 |Any|signed 8|int8_t
 |Any|16-bit|int16_t
@@ -484,13 +484,13 @@ Performs a 2D transposed convolution over the given tensor input, using the weig
 
 |Input|in_t*|input|[N,IH,IW,IC]|Input tensor
 |Input (MT profile) Attribute (BI/MI profiles)|weight_t*|weight|[OC,KH,KW,IC]|Weight kernel size KH x KW
-|Input (MT profile) Attribute (BI/MI profiles)|acc_t*|bias|[OC]|Per output channel bias data.
+|Input (MT profile) Attribute (BI/MI profiles)|out_t*|bias|[OC]|Per output channel bias data.
 |Attribute|int*|out_pad|[2]|[out_pad_top, out_pad_left]
 |Attribute|int*|stride|[2]|[stride_y, stride_x]
 |Attribute|int*|out_shape|[4]|[N,OH,OW,OC]
 |Attribute|in_t|input_zp|-|Input tensor zero point. Must be zero for non-int8 types.
 |Attribute|weight_t|weight_zp|-|Weight zero point. Must be zero for non-int8 types.
-|Output|acc_t*|output|[N,OH,OW,OC]|Output tensor
+|Output|out_t*|output|[N,OH,OW,OC]|Output tensor
 |===
 
 *Operation Function*
@@ -502,20 +502,20 @@ ERROR_IF(weight_t != int8_t && weight_zp != 0);
 ERROR_IF(out_pad_top < 0 || out_pad_left < 0);
 ERROR_IF(stride_y < 1 || stride_x < 1);
 for_each(index in out_shape) {
-    tensor_write<acc_t>(output, [N,OH,OW,OC], index, bias[index[3]])
+    tensor_write<out_t>(output, [N,OH,OW,OC], index, bias[index[3]]);
 }
 for_each(0 <= n < N, 0 <= iy < IH, 0 <= ix < IW, 0 <= oc < OC,
           0 <= ic < IC, 0 <= ky < KH,  0 <= kx < KW) {
     oy = iy * stride_y - out_pad_top  + ky;
     ox = ix * stride_x - out_pad_left + kx;
     if (oy >= 0 && oy < OH && ox >= 0 && ox < OW) {
-        acc_t acc = tensor_read<acc_t>(output, [N,OH,OW,OC], [n,oy,ox,oc]);
-        acc_t value = tensor_read<in_t>(input, [N,IH,IW,IC], [n,iy,ix,ic]);
-        acc_t weight = tensor_read<weight_t>(weight, [OC,KH,KW,IC], [oc,ky,kx,ic]);
+        out_t acc = tensor_read<out_t>(output, [N,OH,OW,OC], [n,oy,ox,oc]);
+        out_t value = tensor_read<in_t>(input, [N,IH,IW,IC], [n,iy,ix,ic]);
+        out_t weight = tensor_read<weight_t>(weight, [OC,KH,KW,IC], [oc,ky,kx,ic]);
         value = value - input_zp;
         weight = weight - weight_zp;
-        acc = apply_add<acc_t>(acc, value * weight);
-        tensor_write<acc_t>(output, [N,OH,OW,OC], [n,oy,ox,oc], acc);
+        acc = apply_add<out_t>(acc, value * weight);
+        tensor_write<out_t>(output, [N,OH,OW,OC], [n,oy,ox,oc], acc);
     }
 }
 ----
@@ -523,7 +523,7 @@ for_each(0 <= n < N, 0 <= iy < IH, 0 <= ix < IW, 0 <= oc < OC,
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t|weight_t|acc_t
+|Profile|Mode|in_t|weight_t|out_t
 
 |Any|signed 8x8|int8_t|int8_t|int32_t
 |Any|signed 8x4|int8_t|int4_t|int32_t
-- 
cgit v1.2.1