author    Eric Kunze <eric.kunze@arm.com>  2021-02-17 19:23:39 -0800
committer Eric Kunze <eric.kunze@arm.com>  2021-03-08 10:06:31 -0800
commit    1e9ba65f263a15f1f9cf9b9484047ea51237187a (patch)
tree      bf1eb0f43d24b207612e6e8a87799a211ba155a4
parent    54ff87d31637c97958ac49e40312e9b6de0a8f1a (diff)
download  specification-1e9ba65f263a15f1f9cf9b9484047ea51237187a.tar.gz
Consistency cleanup
Attempt to get consistent across the pseudocode. Change the data types to
all be intN_t instead of some cases of intN. Use float_t as the general
floating point data type. Be consistent on use of the term "floating-point".
Move general pseudocode helpers to their own section.

Change-Id: Ie77666cd3ee438c71f39c62b9c424fe687b0bb51
Signed-off-by: Eric Kunze <eric.kunze@arm.com>
-rw-r--r--  chapters/activation_funcs.adoc    61
-rw-r--r--  chapters/comparison.adoc          54
-rw-r--r--  chapters/control_flow.adoc        18
-rw-r--r--  chapters/data_layout.adoc        115
-rw-r--r--  chapters/data_nodes.adoc          42
-rw-r--r--  chapters/ewise_binary.adoc       302
-rw-r--r--  chapters/ewise_ternary.adoc       34
-rw-r--r--  chapters/ewise_unary.adoc         84
-rw-r--r--  chapters/image.adoc               60
-rw-r--r--  chapters/introduction.adoc       240
-rw-r--r--  chapters/reduction.adoc          182
-rw-r--r--  chapters/scatter_gather.adoc      48
-rw-r--r--  chapters/tensor_ops.adoc         319
-rw-r--r--  chapters/type_conversion.adoc    104
14 files changed, 851 insertions, 812 deletions
diff --git a/chapters/activation_funcs.adoc b/chapters/activation_funcs.adoc
index 3fc8bc0..5af849d 100644
--- a/chapters/activation_funcs.adoc
+++ b/chapters/activation_funcs.adoc
@@ -18,30 +18,29 @@ Clamp to an arbitrary minimum and maximum value. Note that the maximum and minim
|===
|Argument|Type|Name|Shape|Description
-|Input|in_t*|Input|shape|Input tensor with rank from 1 to 4
+|Input|in_t*|Input|shape|Input tensor
|Attribute|in_t|min_val|-|minimum clip value
|Attribute|in_t|max_val|-|maximum clip value
-|Output|out_t*|Output|shape|Output tensor of same type and shape as input
+|Output|in_t*|Output|shape|Output tensor of same type and shape as input
|===
*Operation Function:*
....
-assert(rank(shape)<=4)
-for_each (index in shape) {
- value = tensor_read<in_t>(input, shape, index)
- acc = apply_clip(value, min_val, max_val)
- tensor_write<out_t>(output, shape, index, acc)
+for_each(index in shape) {
+ value = tensor_read<in_t>(input, shape, index);
+ acc = apply_clip<in_t>(value, min_val, max_val);
+ tensor_write<in_t>(output, shape, index, acc);
}
....
*Supported Data Types:*
|===
-|Profile|Mode|in_t|out_t
+|Profile|Mode|in_t
-|Any|signed 8|int8 |int8
-|Any|signed 16|int16|int16
-|MI, MT|float|float|float
+|Any|signed 8|int8_t
+|Any|signed 16|int16_t
+|MI, MT|floating-point|float_t
|===
==== RELUN
@@ -53,19 +52,19 @@ ReLU with a scalar maximum value.
|===
|Argument|Type|Name|Shape|Description
-|Input|in_t*|Input|shape|Input tensor with rank from 1 to 4
+|Input|in_t*|Input|shape|Input tensor
|Attribute|in_t|max_val|-|maximum clip value
-|Output|out_t*|Output|shape|Output tensor of same type and shape as input
+|Output|in_t*|Output|shape|Output tensor of same type and shape as input
|===
*Operation Function:*
[source,c]
----
-for_each (index in shape) {
- in_t value = tensor_read<in_t>(input, shape, index)
- acc = apply_clip<in_t>(value, 0, max_val)
- tensor_write<in_t>(output, shape, index, acc)
+for_each(index in shape) {
+ in_t value = tensor_read<in_t>(input, shape, index);
+ acc = apply_clip<in_t>(value, 0, max_val);
+ tensor_write<in_t>(output, shape, index, acc);
}
----
@@ -74,8 +73,8 @@ for_each (index in shape) {
|===
|Profile|Mode|in_t
-|Any|signed 32|int32
-|MI, MT|float|float
+|Any|signed 32|int32_t
+|MI, MT|floating-point|float_t
|===
==== SIGMOID
@@ -90,9 +89,9 @@ The sigmoid table has 513 entries each of 16-bit precision and covering the inpu
[source,c]
....
int sigmoid_reference(int x) { // input x range is -256 to +256 inclusive
- F64 v = (double)x/(double)16;
- v = 1.0/(1.0+exp(-v));
- return round_to_nearest(32768.0 * v);
+ F64 v = (double)x / (double)16;
+ v = 1.0/(1.0 + exp(-v));
+ return round_to_nearest_int(32768.0 * v);
}
generate_lookup_table(&sigmoid_table, &sigmoid_reference);
@@ -103,16 +102,16 @@ generate_lookup_table(&sigmoid_table, &sigmoid_reference);
|===
|Argument|Type|Name|Shape|Description
-|Input|in_t*|Input|shape|Input tensor with rank from 1 to 4
-|Output|out_t*|Output|shape|Output tensor of same type and shape as input
+|Input|in_t*|Input|shape|Input tensor
+|Output|in_t*|Output|shape|Output tensor of same type and shape as input
|===
*Supported Data Types:*
|===
-|Profile|Mode|in_t|out_t
+|Profile|Mode|in_t
-|MI, MT|float|float|float
+|MI, MT|floating-point|float_t
|===
==== TANH
@@ -130,7 +129,7 @@ int tanh_reference(int x) { // input x range is -256 to +256 inclusive
F64 v = (double)x/(double)32;
v = exp(-2.0*v);
v = (1.0-v)/(1.0+v);
- return round_to_nearest(32768.0 * v);
+ return round_to_nearest_int(32768.0 * v);
}
generate_lookup_table(&tanh_table, &tanh_reference);
@@ -141,14 +140,14 @@ generate_lookup_table(&tanh_table, &tanh_reference);
|===
|Argument|Type|Name|Shape|Description
-|Input|in_t*|Input|shape|Input tensor with rank from 1 to 4
-|Output|out_t*|Output|shape|Output tensor of same type and shape as input
+|Input|in_t*|Input|shape|Input tensor
+|Output|in_t*|Output|shape|Output tensor of same type and shape as input
|===
*Supported Data Types:*
|===
-|Profile|Mode|in_t|out_t
+|Profile|Mode|in_t
-|MI, MT|float|float|float
+|MI, MT|floating-point|float_t
|===
diff --git a/chapters/comparison.adoc b/chapters/comparison.adoc
index f9439fc..289196f 100644
--- a/chapters/comparison.adoc
+++ b/chapters/comparison.adoc
@@ -27,13 +27,13 @@ Elementwise comparison operation
[source,c]
----
-for_each (index in shape) {
- index1 = apply_broadcast(shape, shape1, index)
- index2 = apply_broadcast(shape, shape2, index)
- int32_t value1 = tensor_read<in_t>(input1, shape1, index1)
- int32_t value2 = tensor_read<in_t>(input2, shape2, index2)
- bool_t acc = (value1 == value2) ? True : False
- tensor_write<out_t>(output, shape, index, acc)
+for_each(index in shape) {
+ index1 = apply_broadcast(shape, shape1, index);
+ index2 = apply_broadcast(shape, shape2, index);
+ int32_t value1 = tensor_read<in_t>(input1, shape1, index1);
+ int32_t value2 = tensor_read<in_t>(input2, shape2, index2);
+ bool_t acc = (value1 == value2) ? True : False;
+ tensor_write<out_t>(output, shape, index, acc);
}
----
@@ -42,8 +42,8 @@ for_each (index in shape) {
|===
|Profile|Mode|in_t|out_t
-|Any|signed 32|int32|bool
-|MI, MT|float|float|bool
+|Any|signed 32|int32_t|bool_t
+|MI, MT|floating-point|float_t|bool_t
|===
==== GREATER
@@ -63,13 +63,13 @@ Elementwise greater than comparison operation
[source,c]
----
-for_each (index in shape) {
- index1 = apply_broadcast(shape, shape1, index)
- index2 = apply_broadcast(shape, shape2, index)
- int32_t value1 = tensor_read<in_t>(input1, shape1, index1)
- int32_t value2 = tensor_read<in_t>(input2, shape2, index2)
- bool_t acc = (value1 > value2) ? True : False
- tensor_write<out_t>(output, shape, index, acc)
+for_each(index in shape) {
+ index1 = apply_broadcast(shape, shape1, index);
+ index2 = apply_broadcast(shape, shape2, index);
+ int32_t value1 = tensor_read<in_t>(input1, shape1, index1);
+ int32_t value2 = tensor_read<in_t>(input2, shape2, index2);
+ bool_t acc = (value1 > value2) ? True : False;
+ tensor_write<out_t>(output, shape, index, acc);
}
----
@@ -77,8 +77,8 @@ for_each (index in shape) {
|===
|Profile|Mode|in_t|out_t
-|Any|signed 32|int32|bool
-|MI, MT|float|float|bool
+|Any|signed 32|int32_t|bool_t
+|MI, MT|floating-point|float_t|bool_t
|===
==== GREATER_EQUAL
@@ -99,13 +99,13 @@ Elementwise comparison operation
[source,c]
----
-for_each (index in shape) {
- index1 = apply_broadcast(shape, shape1, index)
- index2 = apply_broadcast(shape, shape2, index)
- int32_t value1 = tensor_read<in_t>(input1, shape1, index1)
- int32_t value2 = tensor_read<in_t>(input2, shape2, index2)
- bool_t acc = (value1 >= value2) ? True : False
- tensor_write<out_t>(output, shape, index, acc)
+for_each(index in shape) {
+ index1 = apply_broadcast(shape, shape1, index);
+ index2 = apply_broadcast(shape, shape2, index);
+ int32_t value1 = tensor_read<in_t>(input1, shape1, index1);
+ int32_t value2 = tensor_read<in_t>(input2, shape2, index2);
+ bool_t acc = (value1 >= value2) ? True : False;
+ tensor_write<out_t>(output, shape, index, acc);
}
----
@@ -114,6 +114,6 @@ for_each (index in shape) {
|===
|Profile|Mode|in_t|out_t
-|Any|signed 32|int32|bool
-|MI, MT|float|float|bool
+|Any|signed 32|int32_t|bool_t
+|MI, MT|floating-point|float_t|bool_t
|===
diff --git a/chapters/control_flow.adoc b/chapters/control_flow.adoc
index b5e305d..31bdf3c 100644
--- a/chapters/control_flow.adoc
+++ b/chapters/control_flow.adoc
@@ -37,9 +37,9 @@ assert(tensor_list_shape(output_list)==tosa_output_shape(then_graph));
assert(tensor_list_shape(output_list)==tosa_output_shape(else_graph));
if (condition) {
- tosa_execute_graph(then_graph, input_list, output_list)
+ tosa_execute_graph(then_graph, input_list, output_list);
} else {
- tosa_execute_graph(else_graph, input_list, output_list)
+ tosa_execute_graph(else_graph, input_list, output_list);
}
----
@@ -70,13 +70,13 @@ assert(tosa_output_shape(cond_graph)==tosa_list_shape([bool_t]));
// The iteration number 'i' is included to give unique names to variables
// in each iteration of the loop and is not required by implementations
-int i=0 // iteration number
-list[i] = input_list // copy input data as list[0]
-tosa_execute_graph(cond_graph, list[i], [condition[i]]) // initial condition
+int i=0; // iteration number
+list[i] = input_list; // copy input data as list[0]
+tosa_execute_graph(cond_graph, list[i], [condition[i]]); // initial condition
while (condition[i]) {
- tosa_execute_graph(body_graph, list[i], list[i+1])
- i = i+1
- tosa_execute_graph(cond_graph, list[i], [condition[i]])
+ tosa_execute_graph(body_graph, list[i], list[i+1]);
+ i = i+1;
+ tosa_execute_graph(cond_graph, list[i], [condition[i]]);
}
-output_list = list[i]
+output_list = list[i];
----
diff --git a/chapters/data_layout.adoc b/chapters/data_layout.adoc
index 9d01e71..67484cb 100644
--- a/chapters/data_layout.adoc
+++ b/chapters/data_layout.adoc
@@ -48,11 +48,11 @@ for_each(index1 in shape) {
|===
|Profile|Mode|in_t
-|Any|Boolean|bool
-|Any|signed 8|int8
-|Any|signed 16|int16
-|Any|signed 32|int32
-|MI, MT|float|float
+|Any|Boolean|bool_t
+|Any|signed 8|int8_t
+|Any|signed 16|int16_t
+|Any|signed 32|int32_t
+|MI, MT|floating-point|float_t
|===
==== PAD
@@ -81,12 +81,12 @@ Zero-pads a tensor along borders of each dimension.
[source,c]
----
-for_each (index in shape) {
- index1 = index
- for (i=0; i<rank(shape); i++) {
- index1[i] = index1[i] - padding[i,0]
+for_each(index in shape) {
+ index1 = index;
+ for(i = 0; i < rank(shape); i++) {
+ index1[i] = index1[i] - padding[i,0];
}
- in_t value = tensor_read<in_t>(input1, shape1, index1, input1_zp, padding)
+ in_t value = tensor_read<in_t>(input1, shape1, index1, input1_zp, padding);
tensor_write<in_t>(output, shape, index, value + input1_zp);
}
----
@@ -96,11 +96,11 @@ for_each (index in shape) {
|===
|Profile|Mode|in_t
-|Any|Boolean|bool
-|Any|signed 8|int8
-|Any|signed 16|int16
-|Any|signed 32|int32
-|MI, MT|float|float
+|Any|Boolean|bool_t
+|Any|signed 8|int8_t
+|Any|signed 16|int16_t
+|Any|signed 32|int32_t
+|MI, MT|floating-point|float_t
|===
==== RESHAPE
@@ -121,9 +121,9 @@ Returns a tensor with the same type/values as the input, with a new shape specif
[source,c]
----
-assert(tensor_size(shape1)==tensor_size(shape))
-for (i=0; i<tensor_size(shape); i++) {
- output[i] = input[i]
+assert(tensor_size(shape1) == tensor_size(shape));
+for(i = 0; i < tensor_size(shape); i++) {
+ output[i] = input[i];
}
----
@@ -132,11 +132,11 @@ for (i=0; i<tensor_size(shape); i++) {
|===
|Profile|Mode|in_t
-|Any|Boolean|bool
-|Any|signed 8|int8
-|Any|signed 16|int16
-|Any|signed 32|int32
-|MI, MT|float|float
+|Any|Boolean|bool_t
+|Any|signed 8|int8_t
+|Any|signed 16|int16_t
+|Any|signed 32|int32_t
+|MI, MT|floating-point|float_t
|===
==== REVERSE
@@ -157,10 +157,10 @@ Returns a tensor with the same type/values as the input, with the data reversed
[source,c]
----
-assert(0<=axis && axis<rank(shape))
-for_each (index in shape) {
+assert(0 <= axis && axis < rank(shape));
+for_each(index in shape) {
tmp_index = index;
- tmp_index[axis] = shape[axis]-1-index[axis];
+ tmp_index[axis] = shape[axis] - 1 - index[axis];
in_t value = tensor_read<in_t>(input, shape, tmp_index);
tensor_write<in_t>(output, shape, index, value);
}
@@ -171,16 +171,17 @@ for_each (index in shape) {
|===
|Profile|Mode|in_t
-|Any|Boolean|bool
-|Any|signed 8|int8
-|Any|signed 16|int16
-|Any|signed 32|int32
-|MI, MT|float|float
+|Any|Boolean|bool_t
+|Any|signed 8|int8_t
+|Any|signed 16|int16_t
+|Any|signed 32|int32_t
+|MI, MT|floating-point|float_t
|===
==== SLICE
-Extracts a slice of the input1 on the given axis, beginning at the start coordinates, and extending for size elements in each direction. No data conversion happens during a slice operation.
+Extracts a slice of the input1 on the given axis, beginning at the start coordinates, and extending for size elements in each direction.
+No data conversion happens during a slice operation.
*Arguments:*
|===
@@ -196,9 +197,9 @@ Extracts a slice of the input1 on the given axis, beginning at the start coordin
[source,c]
----
-for_each (index in shape) {
+for_each(index in shape) {
tmp_index = index;
- for (i=0; i<rank(shape); i++) {
+ for(i = 0; i < rank(shape); i++) {
tmp_index[i] = index[i] + start[i];
}
in_t value = tensor_read<in_t>(input, shape1, tmp_index);
@@ -211,11 +212,11 @@ for_each (index in shape) {
|===
|Profile|Mode|in_t
-|Any|Boolean|bool
-|Any|signed 8|int8
-|Any|signed 16|int16
-|Any|signed 32|int32
-|MI, MT|float|float
+|Any|Boolean|bool_t
+|Any|signed 8|int8_t
+|Any|signed 16|int16_t
+|Any|signed 32|int32_t
+|MI, MT|floating-point|float_t
|===
==== TILE
@@ -228,7 +229,7 @@ Replicates input1 multiplies times along each dimension.
|Argument|Type|Name|Shape|Description
|Input|in_t*|input1|shape1|Input tensor with rank from 1 to 4
-|Attribute|int|multiplies|[rank(shape1)]|Number of times to replicate input1 in each dimension
+|Attribute|int32_t|multiplies|[rank(shape1)]|Number of times to replicate input1 in each dimension
|Output|in_t*|output|shape|Output tensor of same type, rank as the input tensor
|===
@@ -236,11 +237,11 @@ Replicates input1 multiplies times along each dimension.
[source,c]
----
-for_each (index in shape) {
+for_each(index in shape) {
tmp_index = index;
- for (i=0; i<rank(shape); i++) {
- assert(shape1[i] * multiplies[i] == shape[i])
- tmp_index[i] = index[i] % shape1[i]
+ for(i = 0; i < rank(shape); i++) {
+ assert(shape1[i] * multiplies[i] == shape[i]);
+ tmp_index[i] = index[i] % shape1[i];
}
in_t value = tensor_read<in_t>(input, shape1, tmp_index);
tensor_write<in_t>(output, shape, index, value);
@@ -252,11 +253,11 @@ for_each (index in shape) {
|===
|Profile|Mode|in_t
-|Any|Boolean|bool
-|Any|signed 8|int8
-|Any|signed 16|int16
-|Any|signed 32|int32
-|MI, MT|float|float
+|Any|Boolean|bool_t
+|Any|signed 8|int8_t
+|Any|signed 16|int16_t
+|Any|signed 32|int32_t
+|MI, MT|floating-point|float_t
|===
==== TRANSPOSE
@@ -269,7 +270,7 @@ Permutes the dimensions based on perm.
|Argument|Type|Name|Shape|Description
|Input|in_t*|input1|shape1|Input tensor with rank from 1 to 4
-|Attribute|int|perms|[rank(input1)]|List of integers of length equal to the rank of input1.
+|Attribute|int32_t|perms|[rank(input1)]|List of integers of length equal to the rank of input1.
|Output|in_t*|output|shape|Output tensor of same type, rank as the input tensor
|===
@@ -277,9 +278,9 @@ Permutes the dimensions based on perm.
[source,c]
----
-for_each (index in shape) {
+for_each(index in shape) {
tmp_index = index;
- for (i=0; i<rank(shape); i++) {
+ for(i = 0; i < rank(shape); i++) {
assert(shape1[perm[i]] == shape[i])
tmp_index[perm[i]] = index[i]
}
@@ -293,9 +294,9 @@ for_each (index in shape) {
|===
|Profile|Mode|in_t
-|Any|Boolean|bool
-|Any|signed 8|int8
-|Any|signed 16|int16
-|Any|signed 32|int32
-|MI, MT|float|float
+|Any|Boolean|bool_t
+|Any|signed 8|int8_t
+|Any|signed 16|int16_t
+|Any|signed 32|int32_t
+|MI, MT|floating-point|float_t
|===
diff --git a/chapters/data_nodes.adoc b/chapters/data_nodes.adoc
index 87b0b9b..7afa984 100644
--- a/chapters/data_nodes.adoc
+++ b/chapters/data_nodes.adoc
@@ -27,11 +27,11 @@ A node containing constant data for use as the input to an operation. May hold d
|===
|Profile|Mode|out_t
-|Any|Boolean|bool
-|Any|signed 8|int8
-|Any|signed 16|int16
-|Any|signed 32|int32
-|MI, MT|float|float
+|Any|Boolean|bool_t
+|Any|signed 8|int8_t
+|Any|signed 16|int16_t
+|Any|signed 32|int32_t
+|MI, MT|floating-point|float_t
|===
==== IDENTITY
@@ -52,11 +52,11 @@ Returns a tensor with the same shape, type, and contents as the input.
|===
|Profile|Mode|in_t
-|Any|Boolean|bool
-|Any|signed 8|int8
-|Any|signed 16|int16
-|Any|signed 32|int32
-|MI, MT|float|float
+|Any|Boolean|bool_t
+|Any|signed 8|int8_t
+|Any|signed 16|int16_t
+|Any|signed 32|int32_t
+|MI, MT|floating-point|float_t
|===
==== IDENTITYN
@@ -77,11 +77,11 @@ Returns a list of tensors with the same shape, type, and contents as the input l
|===
|Profile|Mode|in_t
-|Any|Boolean|bool
-|Any|signed 8|int8
-|Any|signed 16|int16
-|Any|signed 32|int32
-|MI, MT|float|float
+|Any|Boolean|bool_t
+|Any|signed 8|int8_t
+|Any|signed 16|int16_t
+|Any|signed 32|int32_t
+|MI, MT|floating-point|float_t
|===
==== PLACEHOLDER
@@ -100,10 +100,10 @@ A node where data will be inserted into the network at runtime. Generally used f
|===
|Profile|Mode|out_t
-|Any|Boolean|bool
-|Any|unsigned 8|uint8
-|Any|signed 8|int8
-|Any|signed 16|int16
-|Any|signed 32|int32
-|MI, MT|float|float
+|Any|Boolean|bool_t
+|Any|unsigned 8|uint8_t
+|Any|signed 8|int8_t
+|Any|signed 16|int16_t
+|Any|signed 32|int32_t
+|MI, MT|floating-point|float_t
|===
diff --git a/chapters/ewise_binary.adoc b/chapters/ewise_binary.adoc
index e9d76f8..2b8d321 100644
--- a/chapters/ewise_binary.adoc
+++ b/chapters/ewise_binary.adoc
@@ -28,14 +28,13 @@ Axis of size 1 will be broadcast, as necessary. Rank of input tensors must match
[source,c]
----
-for_each (index in shape) {
- index1 = apply_broadcast(shape, shape1, index)
- index2 = apply_broadcast(shape, shape2, index)
- in_t value1 = tensor_read<in_t>(input1, shape1, index1)
- in_t value2 = tensor_read<in_t>(input2, shape2, index2)
- in_t acc = apply_add<in_t>(value1, value2)
- tensor_write<in_t>(output, shape, index, acc)
-}
+for_each(index in shape) {
+ index1 = apply_broadcast(shape, shape1, index);
+ index2 = apply_broadcast(shape, shape2, index);
+ in_t value1 = tensor_read<in_t>(input1, shape1, index1);
+ in_t value2 = tensor_read<in_t>(input2, shape2, index2);
+ in_t acc = apply_add<in_t>(value1, value2);
+ tensor_write<in_t>(output, shape, index, acc);
----
*Supported Data Types:*
@@ -43,8 +42,8 @@ for_each (index in shape) {
|===
|Profile|Mode|in_t
-|Any|signed 32|int32
-|MI, MT|float|float
+|Any|signed 32|int32_t
+|MI, MT|floating-point|float_t
|===
==== ARITHMETIC_RIGHT_SHIFT
@@ -59,7 +58,7 @@ Axis of size 1 will be broadcast, as necessary. Rank of input tensors must match
|Input|in_t*|input1|shape1|Input tensor
|Input|in_t*|input2|shape2|Input tensor with the same rank as input1
-|Input|bool |round |- | If true then the shift is rounded
+|Input|bool_t |round |- | If true then the shift is rounded
|Output|in_t*|output|shape|Output tensor with broadcast shape if necessary
|===
@@ -67,17 +66,17 @@ Axis of size 1 will be broadcast, as necessary. Rank of input tensors must match
[source,c]
----
-for_each (index in shape) {
- index1 = apply_broadcast(shape, shape1, index)
- index2 = apply_broadcast(shape, shape2, index)
- in_t value1 = tensor_read<in_t>(input1, shape1, index1)
- in_t value2 = tensor_read<in_t>(input2, shape2, index2)
- assert(0 <= value2 && value2 <= 31)
- in_t acc = value1 >> value2
- if (round==true && value2>0 && (value1>>(value2-1))&1!=0) {
+for_each(index in shape) {
+ index1 = apply_broadcast(shape, shape1, index);
+ index2 = apply_broadcast(shape, shape2, index);
+ in_t value1 = tensor_read<in_t>(input1, shape1, index1);
+ in_t value2 = tensor_read<in_t>(input2, shape2, index2);
+ assert(0 <= value2 && value2 <= 31);
+ in_t acc = value1 >> value2;
+ if (round == true && value2 > 0 && (value1 >> (value2 - 1)) & 1 != 0) {
acc = acc + 1;
}
- acc = apply_clip(acc, minimum<in_t>, maximum<in_t>)
+ acc = apply_clip<in_t>(acc, minimum<in_t>, maximum<in_t>)
tensor_write<in_t>(output, shape, index, acc)
}
----
@@ -87,9 +86,9 @@ for_each (index in shape) {
|===
|Profile|Mode|in_t
-|Any|signed 8|int8
-|Any|signed 16|int16
-|Any|signed 32|int32
+|Any|signed 8|int8_t
+|Any|signed 16|int16_t
+|Any|signed 32|int32_t
|===
==== BITWISE_AND
@@ -111,13 +110,13 @@ Axis of size 1 will be broadcast as necessary. Rank of input tensors must match.
[source,c]
----
-for_each (index in shape) {
- index1 = apply_broadcast(shape, shape1, index)
- index2 = apply_broadcast(shape, shape2, index)
- in_t value1 = tensor_read<in_t>(input1, shape1, index1)
- in_t value2 = tensor_read<in_t>(input2, shape2, index2)
- in_t acc = value1 & value2
- tensor_write<in_t>(output, shape, index, acc)
+for_each(index in shape) {
+ index1 = apply_broadcast(shape, shape1, index);
+ index2 = apply_broadcast(shape, shape2, index);
+ in_t value1 = tensor_read<in_t>(input1, shape1, index1);
+ in_t value2 = tensor_read<in_t>(input2, shape2, index2);
+ in_t acc = value1 & value2;
+ tensor_write<in_t>(output, shape, index, acc);
}
----
@@ -126,9 +125,9 @@ for_each (index in shape) {
|===
|Profile|Mode|in_t
-|Any|signed 8|int8
-|Any|signed 16|int16
-|Any|signed 32|int32
+|Any|signed 8|int8_t
+|Any|signed 16|int16_t
+|Any|signed 32|int32_t
|===
==== BITWISE_OR
@@ -150,13 +149,13 @@ Axis of size 1 will be broadcast as necessary. Rank of input tensors must match.
[source,c]
----
-for_each (index in shape) {
- index1 = apply_broadcast(shape, shape1, index)
- index2 = apply_broadcast(shape, shape2, index)
- in_t value1 = tensor_read<in_t>(input1, shape1, index1)
- in_t value2 = tensor_read<in_t>(input2, shape2, index2)
- in_t acc = value1 | value2
- tensor_write<in_t>(output, shape, index, acc)
+for_each(index in shape) {
+ index1 = apply_broadcast(shape, shape1, index);
+ index2 = apply_broadcast(shape, shape2, index);
+ in_t value1 = tensor_read<in_t>(input1, shape1, index1);
+ in_t value2 = tensor_read<in_t>(input2, shape2, index2);
+ in_t acc = value1 | value2;
+ tensor_write<in_t>(output, shape, index, acc);
}
----
@@ -165,9 +164,9 @@ for_each (index in shape) {
|===
|Profile|Mode|in_t
-|Any|signed 8|int8
-|Any|signed 16|int16
-|Any|signed 32|int32
+|Any|signed 8|int8_t
+|Any|signed 16|int16_t
+|Any|signed 32|int32_t
|===
==== BITWISE_XOR
@@ -189,13 +188,13 @@ Axis of size 1 will be broadcast as necessary. Rank of input tensors must match.
[source,c]
----
-for_each (index in shape) {
- index1 = apply_broadcast(shape, shape1, index)
- index2 = apply_broadcast(shape, shape2, index)
- in_t value1 = tensor_read<in_t>(input1, shape1, index1)
- in_t value2 = tensor_read<in_t>(input2, shape2, index2)
- in_t acc = value1 ^ value2
- tensor_write<in_t>(output, shape, index, acc)
+for_each(index in shape) {
+ index1 = apply_broadcast(shape, shape1, index);
+ index2 = apply_broadcast(shape, shape2, index);
+ in_t value1 = tensor_read<in_t>(input1, shape1, index1);
+ in_t value2 = tensor_read<in_t>(input2, shape2, index2);
+ in_t acc = value1 ^ value2;
+ tensor_write<in_t>(output, shape, index, acc);
}
----
@@ -204,9 +203,9 @@ for_each (index in shape) {
|===
|Profile|Mode|in_t
-|Any|signed 8|int8
-|Any|signed 16|int16
-|Any|signed 32|int32
+|Any|signed 8|int8_t
+|Any|signed 16|int16_t
+|Any|signed 32|int32_t
|===
==== LOGICAL_AND
@@ -232,13 +231,13 @@ None
[source,c]
----
-for_each (index in shape) {
- index1 = apply_broadcast(shape, shape1, index)
- index2 = apply_broadcast(shape, shape2, index)
- in_t value1 = tensor_read<in_t>(input1, shape1, index1)
- in_t value2 = tensor_read<in_t>(input2, shape2, index2)
- in_t acc = value1 && value2
- tensor_write<in_t>(output, shape, index, acc)
+for_each(index in shape) {
+ index1 = apply_broadcast(shape, shape1, index);
+ index2 = apply_broadcast(shape, shape2, index);
+ in_t value1 = tensor_read<in_t>(input1, shape1, index1);
+ in_t value2 = tensor_read<in_t>(input2, shape2, index2);
+ in_t acc = value1 && value2;
+ tensor_write<in_t>(output, shape, index, acc);
}
----
@@ -247,7 +246,7 @@ for_each (index in shape) {
|===
|Profile|Mode|in_t
-|Any|Bool|Bool
+|Any|Bool|bool_t
|===
==== LOGICAL_LEFT_SHIFT
@@ -269,14 +268,14 @@ Axis of size 1 will be broadcast, as necessary. Rank of input tensors must match
[source,c]
----
-for_each (index in shape) {
- index1 = apply_broadcast(shape, shape1, index)
- index2 = apply_broadcast(shape, shape2, index)
- in_t value1 = tensor_read<in_t>(input1, shape1, index1)
- in_t value2 = tensor_read<in_t>(input2, shape2, index2)
- assert(0 <= value2 && value2 <= 31)
- in_t acc = value1 << value2
- tensor_write<in_t>(output, shape, index, acc)
+for_each(index in shape) {
+ index1 = apply_broadcast(shape, shape1, index);
+ index2 = apply_broadcast(shape, shape2, index);
+ in_t value1 = tensor_read<in_t>(input1, shape1, index1);
+ in_t value2 = tensor_read<in_t>(input2, shape2, index2);
+ assert(0 <= value2 && value2 <= 31);
+ in_t acc = value1 << value2;
+ tensor_write<in_t>(output, shape, index, acc);
}
----
@@ -285,9 +284,9 @@ for_each (index in shape) {
|===
|Profile|Mode|in_t
-|Any|signed 8|int8
-|Any|signed 16|int16
-|Any|signed 32|int32
+|Any|signed 8|int8_t
+|Any|signed 16|int16_t
+|Any|signed 32|int32_t
|===
==== LOGICAL_RIGHT_SHIFT
@@ -309,14 +308,14 @@ Axis of size 1 will be broadcast, as necessary. Rank of input tensors must match
[source,c]
----
-for_each (index in shape) {
- index1 = apply_broadcast(shape, shape1, index)
- index2 = apply_broadcast(shape, shape2, index)
- in_t value1 = tensor_read<in_t>(input1, shape1, index1)
- in_t value2 = tensor_read<in_t>(input2, shape2, index2)
- assert(0 <= value2 && value2 <= 31)
- in_t acc = (unsigned in_t)value1 >> value2
- tensor_write<in_t>(output, shape, index, acc)
+for_each(index in shape) {
+ index1 = apply_broadcast(shape, shape1, index);
+ index2 = apply_broadcast(shape, shape2, index);
+ in_t value1 = tensor_read<in_t>(input1, shape1, index1);
+ in_t value2 = tensor_read<in_t>(input2, shape2, index2);
+ assert(0 <= value2 && value2 <= 31);
+ in_t acc = (unsigned in_t)value1 >> value2;
+ tensor_write<in_t>(output, shape, index, acc);
}
----
@@ -325,9 +324,9 @@ for_each (index in shape) {
|===
|Profile|Mode|in_t
-|Any|signed 8|int8
-|Any|signed 16|int16
-|Any|signed 32|int32
+|Any|signed 8|int8_t
+|Any|signed 16|int16_t
+|Any|signed 32|int32_t
|===
==== LOGICAL_OR
@@ -349,13 +348,13 @@ Axis of size 1 will be broadcast as necessary. Rank of input tensors must match.
[source,c]
----
-for_each (index in shape) {
- index1 = apply_broadcast(shape, shape1, index)
- index2 = apply_broadcast(shape, shape2, index)
- in_t value1 = tensor_read<in_t>(input1, shape1, index1)
- in_t value2 = tensor_read<in_t>(input2, shape2, index2)
- in_t acc = value1 || value2
- tensor_write<in_t>(output, shape, index, acc)
+for_each(index in shape) {
+ index1 = apply_broadcast(shape, shape1, index);
+ index2 = apply_broadcast(shape, shape2, index);
+ in_t value1 = tensor_read<in_t>(input1, shape1, index1);
+ in_t value2 = tensor_read<in_t>(input2, shape2, index2);
+ in_t acc = value1 || value2;
+ tensor_write<in_t>(output, shape, index, acc);
}
----
@@ -364,7 +363,7 @@ for_each (index in shape) {
|===
|Profile|Mode|in_t
-|Any|Bool|Bool
+|Any|Bool|bool_t
|===
==== LOGICAL_XOR
@@ -377,7 +376,7 @@ Axis of size 1 will be broadcast as necessary. Rank of input tensors must match.
|===
|Argument|Type|Name|Shape|Description
-|Input|in_t*|input1|shape1|Input tensor from 1 to 4 dims
+|Input|in_t*|input1|shape1|Input tensor
|Input|in_t*|input2|shape2|Input tensor with the same rank as input1
|Output|in_t*|output|shape|Output tensor of same type as the input tensors, with broadcast shape if necessary
|===
@@ -386,13 +385,13 @@ Axis of size 1 will be broadcast as necessary. Rank of input tensors must match.
[source,c]
----
-for_each (index in shape) {
- index1 = apply_broadcast(shape, shape1, index)
- index2 = apply_broadcast(shape, shape2, index)
- in_t value1 = tensor_read<in_t>(input1, shape1, index1)
- in_t value2 = tensor_read<in_t>(input2, shape2, index2)
- in_t acc = value1 != value2
- tensor_write<in_t>(output, shape, index, acc)
+for_each(index in shape) {
+ index1 = apply_broadcast(shape, shape1, index);
+ index2 = apply_broadcast(shape, shape2, index);
+ in_t value1 = tensor_read<in_t>(input1, shape1, index1);
+ in_t value2 = tensor_read<in_t>(input2, shape2, index2);
+ in_t acc = value1 != value2;
+ tensor_write<in_t>(output, shape, index, acc);
}
----
@@ -401,7 +400,7 @@ for_each (index in shape) {
|===
|Profile|Mode|in_t
-|Any|Bool|Bool
+|Any|Bool|bool_t
|===
==== MAXIMUM
@@ -423,13 +422,13 @@ Axis of size 1 will be broadcast, as necessary. Rank of input tensors must match
[source,c]
----
-for_each (index in shape) {
- index1 = apply_broadcast(shape, shape1, index)
- index2 = apply_broadcast(shape, shape2, index)
- in_t value1 = tensor_read<in_t>(input1, shape1, index1)
- in_t value2 = tensor_read<in_t>(input2, shape2, index2)
- in_t acc = apply_max(value1, value2)
- tensor_write<in_t>(output, shape, index, acc)
+for_each(index in shape) {
+ index1 = apply_broadcast(shape, shape1, index);
+ index2 = apply_broadcast(shape, shape2, index);
+ in_t value1 = tensor_read<in_t>(input1, shape1, index1);
+ in_t value2 = tensor_read<in_t>(input2, shape2, index2);
+ in_t acc = apply_max(value1, value2);
+ tensor_write<in_t>(output, shape, index, acc);
}
----
@@ -438,8 +437,8 @@ for_each (index in shape) {
|===
|Profile|Mode|in_t
-|Any|signed 32|int32
-|MI, MT|float|float
+|Any|signed 32|int32_t
+|MI, MT|floating-point|float_t
|===
==== MINIMUM
@@ -461,13 +460,13 @@ Axis of size 1 will be broadcast, as necessary. Rank of input tensors must match
[source,c]
----
-for_each (index in shape) {
- index1 = apply_broadcast(shape, shape1, index)
- index2 = apply_broadcast(shape, shape2, index)
- in_t value1 = tensor_read<in_t>(input1, shape1, index1)
- in_t value2 = tensor_read<in_t>(input2, shape2, index2)
- in_t acc = apply_min(value1, value2)
- tensor_write<in_t>(output, shape, index, acc)
+for_each(index in shape) {
+ index1 = apply_broadcast(shape, shape1, index);
+ index2 = apply_broadcast(shape, shape2, index);
+ in_t value1 = tensor_read<in_t>(input1, shape1, index1);
+ in_t value2 = tensor_read<in_t>(input2, shape2, index2);
+ in_t acc = apply_min(value1, value2);
+ tensor_write<in_t>(output, shape, index, acc);
}
----
@@ -476,8 +475,8 @@ for_each (index in shape) {
|===
|Profile|Mode|in_t
-|Any|signed 32|int32
-|MI, MT|float|float
+|Any|signed 32|int32_t
+|MI, MT|floating-point|float_t
|===
==== MUL
@@ -492,7 +491,7 @@ Axis of size 1 will be broadcast, as necessary. Rank of input tensors must match
|Input|in_t*|input1|shape1|Input tensor
|Input|in_t*|input2|shape2|Input tensor with the same rank as input1
-|Input (MT profile) Attribute (BI/MI profiles)|uint6_t|shift|-|Result right shift (int32 data type only)
+|Input (MT profile) Attribute (BI/MI profiles)|uint6_t|shift|-|Result right shift (int32_t data type only)
|Output|out_t*|output|shape|Output tensor with broadcast shape if necessary
|===
@@ -500,18 +499,18 @@ Axis of size 1 will be broadcast, as necessary. Rank of input tensors must match
[source,c]
----
-assert(in_t==int32_t || shift==0);
-for_each (index in shape) {
- index1 = apply_broadcast(shape, shape1, index)
- index2 = apply_broadcast(shape, shape2, index)
- in_t value1 = tensor_read<in_t>(input1, shape1, index1)
- in_t value2 = tensor_read<in_t>(input2, shape2, index2)
+assert(in_t == int32_t || shift == 0);
+for_each(index in shape) {
+ index1 = apply_broadcast(shape, shape1, index);
+ index2 = apply_broadcast(shape, shape2, index);
+ in_t value1 = tensor_read<in_t>(input1, shape1, index1);
+ in_t value2 = tensor_read<in_t>(input2, shape2, index2);
if (shift>0) {
- out_t acc = apply_scale_32(value1, value2, shift)
+ out_t acc = apply_scale_32(value1, value2, shift);
} else {
out_t acc = value1 * value2; // low 32-bits of result for int32_t
}
- tensor_write<out_t>(output, shape, index, acc)
+ tensor_write<out_t>(output, shape, index, acc);
}
----
@@ -519,10 +518,10 @@ for_each (index in shape) {
|===
|Profile|Mode|in_t|out_t
-|Any|signed 8|int8|int32
-|Any|signed 16|int16|int32
-|Any|signed 32|int32|int32
-|MI, MT|float|float|float
+|Any|signed 8|int8_t|int32_t
+|Any|signed 16|int16_t|int32_t
+|Any|signed 32|int32_t|int32_t
+|MI, MT|floating-point|float_t|float_t
|===
==== POW
@@ -542,14 +541,14 @@ Axis of size 1 will be broadcast, as necessary. Rank of input tensors must match
*Quantization Parameters:*
-Only supported with floating point values.
+Only supported with floating-point values.
*Supported Data Types:*
|===
|Profile|Mode|in_t
-|MI, MT|float|float
+|MI, MT|floating-point|float_t
|===
==== SUB
@@ -571,13 +570,13 @@ Axis of size 1 will be broadcast as necessary. Rank of input tensors must match.
[source,c]
----
-for_each (index in shape) {
- index1 = apply_broadcast(shape, shape1, index)
- index2 = apply_broadcast(shape, shape2, index)
- in_t value1 = tensor_read<in_t>(input1, shape1, index1)
- in_t value2 = tensor_read<in_t>(input2, shape2, index2)
- in_t acc = apply_sub<out_t>(value1, value2);
- tensor_write<in_t>(output, shape, index, acc)
+for_each(index in shape) {
+ index1 = apply_broadcast(shape, shape1, index);
+ index2 = apply_broadcast(shape, shape2, index);
+ in_t value1 = tensor_read<in_t>(input1, shape1, index1);
+ in_t value2 = tensor_read<in_t>(input2, shape2, index2);
+ in_t acc = apply_sub<in_t>(value1, value2);
+ tensor_write<in_t>(output, shape, index, acc);
}
----
@@ -586,25 +585,25 @@ for_each (index in shape) {
|===
|Profile|Mode|in_t
-|Any|signed 32|int32
-|MI, MT|float|float
+|Any|signed 32|int32_t
+|MI, MT|floating-point|float_t
|===
==== TABLE
-Interpolated table lookup operation. The int16 input is treated as a fixed-point 9.7 value. The high 9 bits are used to index into the table. The fractional 7 bits are used to interpolate based on table[index] and table[index+1]. The TABLE operator returns a 16.7 interpolated value which can then be input to the RESCALE operator to scale to the required output data type. Note that table has 513 values to handle table[index+1] when index=511.
+Interpolated table lookup operation. The int16_t input is treated as a fixed-point 9.7 value. The high 9 bits are used to index into the table. The fractional 7 bits are used to interpolate based on table[index] and table[index+1]. The TABLE operator returns a 16.7 interpolated value which can then be input to the RESCALE operator to scale to the required output data type. Note that table has 513 values to handle table[index+1] when index=511.
An int8_t to int8_t table lookup can be constructed in TOSA as follows:
-* Use RESCALE (in_t=int8, out_t=int16, input_zp=0, scale=1<<14, shift=7) to perform a shift left of 7 and convert to int16
+* Use RESCALE (in_t=int8_t, out_t=int16_t, input_zp=0, scale=1<<14, shift=7) to perform a shift left of 7 and convert to int16_t
* Use the TABLE operator to produce a fixed point 16.7 result. The lower 7 bits will be zero and only the central 256 table entries will be used.
-* Use RESCALE (in_t=int32, out_t=int8, scale=1<<14, shift=28) to scale the output to int8_t range (or alternate scale as required)
+* Use RESCALE (in_t=int32_t, out_t=int8_t, scale=1<<14, shift=28) to scale the output to int8_t range (or alternate scale as required)
* Note that this TOSA sequence can be implemented in software as a 256 entry 8-bit lookup table.
An int16_t to int16_t table lookup can be constructed in TOSA as follows:
* Use the TABLE operator to produce a fixed point 16.7 interpolated result
-* Use RESCALE (in_t=int32, out_t=int16, scale=1<<14, shift=21) to scale the output to int16_t range (or alternate scale as required)
+* Use RESCALE (in_t=int32_t, out_t=int16_t, scale=1<<14, shift=21) to scale the output to int16_t range (or alternate scale as required)
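For illustration, the 9.7 fixed-point arithmetic described above can be traced with a small worked example; the table entries used below are hypothetical values chosen for the example, not values from this specification.

[source,c]
----
// Worked example of the 9.7 interpolation (hypothetical table contents).
// An int16_t input of 200 represents 200/128 = 1.5625 in 9.7 fixed point.
int16_t value    = 200;
int32_t index    = (value + 32768) >> 7;  // = 257, selects table[257] and table[258]
int32_t fraction = value & 0x7f;          // = 72, the low 7 fractional bits
// Assuming table[257] = 1000 and table[258] = 1100, the interpolated
// 16.7 result is (1000 << 7) + (1100 - 1000) * 72 = 135200.
----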
*Arguments:*
@@ -624,11 +623,10 @@ None
[source,c]
----
-assert(rank(shape)<=4)
-for_each (index in shape) {
- in_t value = tensor_read<in_t>(input, shape, index)
- out_t acc = apply_lookup(table, value)
- tensor_write<out_t>(output, shape, index, acc)
+for_each(index in shape) {
+ in_t value = tensor_read<in_t>(input, shape, index);
+ out_t acc = apply_lookup(table, value);
+ tensor_write<out_t>(output, shape, index, acc);
}
----
@@ -637,6 +635,6 @@ for_each (index in shape) {
|===
|Profile|Mode|in_t|table_t|out_t
-|Any|signed 16|int16|int16|int32
+|Any|signed 16|int16_t|int16_t|int32_t
|===
diff --git a/chapters/ewise_ternary.adoc b/chapters/ewise_ternary.adoc
index 17e966c..4d20316 100644
--- a/chapters/ewise_ternary.adoc
+++ b/chapters/ewise_ternary.adoc
@@ -32,20 +32,20 @@ None
[source,c]
----
-for_each (index in shape) {
- index1 = apply_broadcast(shape, shape1, index)
- index2 = apply_broadcast(shape, shape2, index)
- index3 = apply_broadcast(shape, shape3, index)
- bool_t value1 = tensor_read<in_t>(input1, shape1, index1)
- in_t value2 = tensor_read<in_t>(input2, shape2, index2)
- in_t value3 = tensor_read<in_t>(input3, shape3, index3)
- in_t acc = 0
- if (value1 == True){
- acc = value2
+for_each(index in shape) {
+ index1 = apply_broadcast(shape, shape1, index);
+ index2 = apply_broadcast(shape, shape2, index);
+ index3 = apply_broadcast(shape, shape3, index);
+ bool_t value1 = tensor_read<in_t>(input1, shape1, index1);
+ in_t value2 = tensor_read<in_t>(input2, shape2, index2);
+ in_t value3 = tensor_read<in_t>(input3, shape3, index3);
+ in_t acc = 0;
+ if (value1 == True) {
+ acc = value2;
} else {
- acc = value3
+ acc = value3;
}
- tensor_write<in_t>(output, shape, index, acc)
+ tensor_write<in_t>(output, shape, index, acc);
}
----
@@ -53,9 +53,9 @@ for_each (index in shape) {
|===
|Profile|Mode|bool_t|in_t
-|Any|Boolean|bool|bool
-|Any|signed 8|bool|int8
-|Any|signed 16|bool|int16
-|Any|signed 32|bool|int32
-|MI, MT|float|bool|float
+|Any|Boolean|bool_t|bool_t
+|Any|signed 8|bool_t|int8_t
+|Any|signed 16|bool_t|int16_t
+|Any|signed 32|bool_t|int32_t
+|MI, MT|floating-point|bool_t|float_t
|===
diff --git a/chapters/ewise_unary.adoc b/chapters/ewise_unary.adoc
index c9b0922..d852fa4 100644
--- a/chapters/ewise_unary.adoc
+++ b/chapters/ewise_unary.adoc
@@ -26,21 +26,21 @@ Elementwise absolute value operation
[source,c]
----
-for_each (index in shape) {
- in_t value1 = tensor_read<in_t>(input1, shape, index)
+for_each(index in shape) {
+ in_t value1 = tensor_read<in_t>(input1, shape, index);
if (value1 < 0)
- value1 = apply_sub<in_t>(0, value1)
- tensor_write<in_t>(output, shape, index, value1)
+ value1 = apply_sub<in_t>(0, value1);
+ tensor_write<in_t>(output, shape, index, value1);
}
----
*Supported Data Types:*
|===
-|Profile|Mode|in_t|out_t
+|Profile|Mode|in_t
-|Any|signed 32|int32
-|MI, MT|float|float
+|Any|signed 32|int32_t
+|MI, MT|floating-point|float_t
|===
==== BITWISE_NOT
@@ -64,10 +64,10 @@ None
[source,c]
----
-for_each (index in shape) {
- in_t value1 = tensor_read<in_t>(input1, shape, index)
- in_t acc = ~value1
- tensor_write<in_t>(output, shape, index, acc)
+for_each(index in shape) {
+ in_t value1 = tensor_read<in_t>(input1, shape, index);
+ in_t acc = ~value1;
+ tensor_write<in_t>(output, shape, index, acc);
}
----
@@ -76,9 +76,9 @@ for_each (index in shape) {
|===
|Profile|Mode|in_t
-|Any|signed 8|int8
-|Any|signed 16|int16
-|Any|signed 32|int32
+|Any|signed 8|int8_t
+|Any|signed 16|int16_t
+|Any|signed 32|int32_t
|===
==== CEIL
@@ -99,7 +99,7 @@ Elementwise ceiling operation
|===
|Profile|Mode|in_t
-|MI, MT|float|float
+|MI, MT|floating-point|float_t
|===
==== CLZ
@@ -119,11 +119,11 @@ Elementwise count leading zeros operation
[source,c]
----
-for_each (index in shape) {
- in_t acc = 0
- in_t value1 = tensor_read<in_t>(input1, shape, index)
- acc = count_leading_zeros(value1)
- tensor_write<in_t>(output, shape, index, acc)
+for_each(index in shape) {
+ in_t acc = 0;
+ in_t value1 = tensor_read<in_t>(input1, shape, index);
+ acc = count_leading_zeros(value1);
+ tensor_write<in_t>(output, shape, index, acc);
}
----
@@ -131,7 +131,7 @@ for_each (index in shape) {
|===
|Profile|Mode|in_t
-|Any|signed 32|int32
+|Any|signed 32|int32_t
|===
==== EXP
@@ -152,7 +152,7 @@ Elementwise e to the x operation
|===
|Profile|Mode|in_t
-|MI, MT|float|float
+|MI, MT|floating-point|float_t
|===
==== FLOOR
@@ -173,7 +173,7 @@ Elementwise floor operation
|===
|Profile|Mode|in_t
-|MI, MT|float|float
+|MI, MT|floating-point|float_t
|===
==== LOG
@@ -194,7 +194,7 @@ Elementwise natural logarithm operation
|===
|Profile|Mode|in_t
-|MI, MT|float|float
+|MI, MT|floating-point|float_t
|===
==== LOGICAL_NOT
@@ -218,10 +218,10 @@ None
[source,c]
----
-for_each (index in shape) {
- in_t value1 = tensor_read<in_t>(input1, shape1, index)
- in_t acc = !value1
- tensor_write<in_t>(output, shape, index, acc)
+for_each(index in shape) {
+ in_t value1 = tensor_read<in_t>(input1, shape1, index);
+ in_t acc = !value1;
+ tensor_write<in_t>(output, shape, index, acc);
}
----
@@ -230,7 +230,7 @@ for_each (index in shape) {
|===
|Profile|Mode|in_t
-|Any|bool|bool
+|Any|bool|bool_t
|===
==== NEGATE
@@ -259,13 +259,13 @@ Elementwise negation operation
[source,c]
----
-assert(in_t == int8_t || input_zp == 0) // Zero point only for int8
-assert(in_t == int8_t || output_zp == 0) // Zero point only for int8
-for_each (index in shape) {
- in_t value1 = tensor_read<in_t>(input1, shape, index)
- in_t acc = appl_sub<in_t>(0, value1 - input1_zp)
- acc = apply_clip(acc, minimum<in_t>, maximum<in_t>)
- tensor_write<in_t>(output + output_zp, shape, index, acc)
+assert(in_t == int8_t || input1_zp == 0) // Zero point only for int8_t
+assert(in_t == int8_t || output_zp == 0) // Zero point only for int8_t
+for_each(index in shape) {
+ in_t value1 = tensor_read<in_t>(input1, shape, index, input1_zp);
+ in_t acc = apply_sub<in_t>(0, value1);
+ acc = apply_clip<in_t>(acc + output_zp, minimum<in_t>, maximum<in_t>);
+ tensor_write<in_t>(output, shape, index, acc);
}
----
@@ -274,10 +274,10 @@ for_each (index in shape) {
|===
|Profile|Mode|in_t
-|Any|signed 8|int8
-|Any|signed 16|int16
-|Any|signed 32|int32
-|MI, MT|float|float
+|Any|signed 8|int8_t
+|Any|signed 16|int16_t
+|Any|signed 32|int32_t
+|MI, MT|floating-point|float_t
|===
==== RECIPROCAL
@@ -298,7 +298,7 @@ Elementwise reciprocal operation. For integer operation, a TABLE should be used
|===
|Profile|Mode|in_t
-|MI, MT|float|float
+|MI, MT|floating-point|float_t
|===
==== RSQRT
@@ -319,5 +319,5 @@ Elementwise reciprocal square root operation. For integer operation, a TABLE sho
|===
|Profile|Mode|in_t
-|MI, MT|float|float
+|MI, MT|floating-point|float_t
|===
diff --git a/chapters/image.adoc b/chapters/image.adoc
index a8e0219..b84cf63 100644
--- a/chapters/image.adoc
+++ b/chapters/image.adoc
@@ -14,9 +14,9 @@
Resizes a tensor. Resize is only allowed in the H and W dimensions.
The NEAREST_NEIGHBOR mode returns the value of the input tensor closest to the
-calculated sample position for both floating point and integer data formats.
+calculated sample position for both floating-point and integer data formats.
-Floating point BILINEAR mode returns a bilinearly interpolated output value
+Floating-point BILINEAR mode returns a bilinearly interpolated output value
based on the four closest input sample positions.
For integer BILINEAR interpolation mode, the output value is calculated by using
@@ -25,7 +25,7 @@ factor for each input. These values are then summed to create the value for
output, which has 2 * shift fractional bits. To convert back to the original
integer size, the output value must be rescaled.
-For floating point stride, stride_y should be set to IH/OH, stride_x should be
+For floating-point stride, stride_y should be set to IH/OH, stride_x should be
set to IW/OW. When using integer stride, stride_y is approximately
(IH<<shift)/OH and stride_x is approximately (IW<<shift)/OW. OH and OW are also
supplied as inputs since there may be off by one errors if calculating OH and OW
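As an illustrative sketch, the integer strides might be chosen as follows; the concrete sizes and shift value here are assumptions for the example only.

[source,c]
----
// Hypothetical stride selection for integer RESIZE.
int32_t IH = 64, IW = 64;                // input height/width
int32_t OH = 32, OW = 32;                // output height/width
int32_t shift = 10;                      // fractional bits, 0 < shift <= 11
int32_t stride_y = (IH << shift) / OH;   // (64 << 10) / 32 = 2048, i.e. 2.0
int32_t stride_x = (IW << shift) / OW;   // (64 << 10) / 32 = 2048, i.e. 2.0
// OH and OW are still supplied to the operator explicitly, since recomputing
// them from the strides could be off by one.
----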
@@ -54,37 +54,37 @@ None
[source,c]
----
// scale assert prevents int32_t accumulator overflow for in_t==int8_t
-assert((resize_t==float && shift==0)||(0<shift && shift<=11));
-assert(stride_x>0 && stride_y>0);
-for_each (0<=n<N, 0<=oy<OH, 0<=ox<OW; 0<=c<C) {
- unit = (resize_t==float) ? 1.0 : (1<<shift);
- y = oy * stride_y + offset_y
- x = ox * stride_x + offset_x
- if (resize_t==float) {
- iy = (int)floor(y); dy = y - (float)iy;
- ix = (int)floor(x); dx = x - (float)ix;
+assert((resize_t == float_t && shift == 0)||(0 < shift && shift <= 11));
+assert(stride_x > 0 && stride_y > 0);
+for_each(0 <= n < N, 0 <= oy < OH, 0 <= ox < OW; 0 <= c < C) {
+ unit = (resize_t == float_t) ? 1.0 : (1 << shift);
+ y = oy * stride_y + offset_y;
+ x = ox * stride_x + offset_x;
+ if (resize_t == float_t) {
+ iy = (int)floor(y); dy = y - (float_t)iy;
+ ix = (int)floor(x); dx = x - (float_t)ix;
} else {
iy = y >> shift; dy = y - (iy<<shift);
ix = x >> shift; dx = x - (ix<<shift);
}
- iy0 = apply_max(iy,0);
- iy1 = apply_min(iy+1,IH-1);
- ix0 = apply_max(ix,0);
- ix1 = apply_min(ix+1,IW-1);
- assert(ix0<=ix1 && iy0<=iy1);
+ iy0 = apply_max(iy, 0);
+ iy1 = apply_min(iy+1, IH-1);
+ ix0 = apply_max(ix, 0);
+ ix1 = apply_min(ix+1, IW-1);
+ assert(ix0 <= ix1 && iy0 <= iy1);
if (mode==BILINEAR) {
- v00 = tensor_read<in_t>(input, [N,IH,IW,C], [n,iy0,ix0,c])
- v01 = tensor_read<in_t>(input, [N,IH,IW,C], [n,iy0,ix1,c])
- v10 = tensor_read<in_t>(input, [N,IH,IW,C], [n,iy1,ix0,c])
- v11 = tensor_read<in_t>(input, [N,IH,IW,C], [n,iy1,ix1,c])
- out_t acc = v00*(unit-dy)*(unit-dx) + v01*(unit-dy)*dx
- acc = acc + v10*dy*(unit-dx) + v11*dy*dx;
- tensor_write<out_t>(output, [N,OH,OW,C], [n,oy,ox,c], acc)
+ v00 = tensor_read<in_t>(input, [N,IH,IW,C], [n,iy0,ix0,c]);
+ v01 = tensor_read<in_t>(input, [N,IH,IW,C], [n,iy0,ix1,c]);
+ v10 = tensor_read<in_t>(input, [N,IH,IW,C], [n,iy1,ix0,c]);
+ v11 = tensor_read<in_t>(input, [N,IH,IW,C], [n,iy1,ix1,c]);
+ out_t acc = v00 * (unit - dy) * (unit - dx) + v01 * (unit - dy) * dx;
+ acc = acc + v10 * dy * (unit-dx) + v11 * dy * dx;
+ tensor_write<out_t>(output, [N,OH,OW,C], [n,oy,ox,c], acc);
} else if (mode==NEAREST) {
iy = (dy >= unit/2) ? iy1 : iy0;
ix = (dx >= unit/2) ? ix1 : ix0;
v = tensor_read<in_t>(input, [N,IH,IW,C], [n,iy,ix,c]);
- tensor_write<out_t>(output, [N,OH,OW,C], [n,oy,ox,c], v)
+ tensor_write<out_t>(output, [N,OH,OW,C], [n,oy,ox,c], v);
}
}
----
@@ -94,11 +94,11 @@ for_each (0<=n<N, 0<=oy<OH, 0<=ox<OW; 0<=c<C) {
|===
|Profile|Mode|resize_t|in_t|out_t
-|Any|signed 8, bilinear|int16|int8|int32
-|Any|signed 8, nearest |int16|int8|int8
-|Any|signed 16, bilinear|int16|int16|int48
-|Any|signed 16, nearest |int16|int16|int16
-|MI,MT|float |float|float|float
+|Any|signed 8, bilinear|int16_t|int8_t|int32_t
+|Any|signed 8, nearest |int16_t|int8_t|int8_t
+|Any|signed 16, bilinear|int16_t|int16_t|int48_t
+|Any|signed 16, nearest |int16_t|int16_t|int16_t
+|MI,MT|floating-point |float_t|float_t|float_t
|===
*Resize Modes:*
diff --git a/chapters/introduction.adoc b/chapters/introduction.adoc
index ef81d29..3257ab0 100644
--- a/chapters/introduction.adoc
+++ b/chapters/introduction.adoc
@@ -119,63 +119,61 @@ The following data layouts are supported in TOSA. Data layouts are specified suc
|DOHWI|Depth, Output Channels, Filter Height, Filter Width, Input Channels|Weights for 3D convolution
|===
-==== Floating point
+==== Floating-point
-The base inference profile of TOSA requires support for the quantized integer operations. Floating point support is included in the main inference profile.
+The base inference profile of TOSA requires support for the quantized integer operations. Floating-point support is included in the main inference profile.
-==== Number formats
+==== Number Formats
-The following number formats are defined in TOSA. See section 2.3 for details on
-quantization within TOSA. The number formats supported by an operator are listed
-in a per-operator table of supported types. The integer types may be used to
-represent quantized data. For details of interpreting the quantized data, see
-the <<Quantization Scaling>> section.
+The following number formats are defined in TOSA.
+The number formats supported by an operator are listed in a per-operator table of supported types.
+The integer types may be used to represent quantized data.
+For details of interpreting the quantized data, see the <<Quantization Scaling>> section.
.Number formats
-[cols="1,1,1,6"]
+[cols="1,1,1,5"]
|===
|Format|Minimum|Maximum|Description
-|bool
+|bool_t
| -
| -
|Boolean value. Size implementation defined.
-|int4
+|int4_t
| -7
| +7
-|Signed 4-bit values.
+|Signed 4-bit twos-complement values.
-|int8
+|int8_t
| -128
| +127
|Signed 8-bit twos-complement values.
-|uint8
+|uint8_t
| 0
| 255
-|Unsigned 8-bit value. This data type is only used for input/output conversion by the
-RESCALE operator and not supported by other operators.
+|Unsigned 8-bit value.
-|int16
+|int16_t
| -32768
| +32767
-|Signed 16-bit twos-complement values.
+|Signed 16-bit twos-complement values.
-|int32
+|int32_t
| -(1<<31)
| (1<<31)-1
-|32-bit twos-complement value.
+|Signed 32-bit twos-complement value.
-|int48
+|int48_t
| -(1<<47)
| (1<<47)-1
-|48-bit twos-complement value.
+|Signed 48-bit twos-complement value.
-|float
+|float_t
| -infinity
| +infinity
-|floating point number. Must have features defined in the section <<Floating Point>>. (Main inference profile)
+|floating-point number. Must have features defined in the section <<Floating-point>>.
|===
Note: In this specification minimum<type> and maximum<type> will denote the minimum and maximum values of the data as stored in memory (ignoring the zero point). The minimum and maximum values for each type are given in the preceding table.
@@ -194,7 +192,10 @@ The following pseudocode represents the operations that will happen to data elem
*Functionality of tensor read*
If in_t is 8-bit then out_t=int16_t. Otherwise out_t is set to the same as in_t.
+If padding is specified, the size of the padding array should be 2 times the size of the shape.
+The padding array represents the before and after pair for each dimension.
....
+assert((pad == NULL) || size(pad) == 2 * size(shape));
out_t tensor_read<in_t>(in_t *address, dim_t shape, dim_t index, in_t zero_point=0, dim_t pad=NULL) {
assert(in_t == int8_t || zero_point == 0)
unsigned offset = 0;
@@ -248,8 +249,11 @@ dim_t apply_broadcast(dim_t out_shape, dim_t in_shape, dim_t index) {
When converting the floating-point values used in training to quantized integer values used on devices for inference, we need to know the range of values to be represented by the integers. The frameworks use slightly different parameters and data types to do this conversion. For example, TensorFlow passes a min and max floating-point values for quantization. TensorFlow Lite and PyTorch use a floating-point scale factor, and an integer zero point. TFLite and PyTorch also allow for symmetric quantization where the zero point value is not used.
In the initial quantization work, tensors were quantized with a single set of parameters for the entire tensor. Recently, frameworks have added support for different quantization parameters on a per channel basis. This per channel quantization thus carries a vector of scales and zero points to be used on each channel. TOSA will support per channel quantization, but only for the weight tensor used in convolution operators.
-Quantization parameters in floating point cause imprecision. In some instances, the software may need to calculate post-op scaling values on hardware that does not have a floating-point unit. Arm NPUs have fixed output scaling hardware that uses fixed point arithmetic to calculate the output values. When calculating these multiplicands and shift amounts, different floating-point precisions may cause results to differ.
-To remove this dependency on floating point values, there are two design choices being made:
+Quantization parameters in floating-point cause imprecision.
+In some instances, the software may need to calculate post-op scaling values on hardware that does not have a floating-point unit.
+Arm NPUs have fixed output scaling hardware that uses fixed point arithmetic to calculate the output values.
+When calculating these multiplicands and shift amounts, different floating-point precisions may cause results to differ.
+To remove this dependency on floating-point values, there are two design choices being made:
* Quantization parameters will be associated with operations rather than tensors. The operations are where the scaling is taking place, and thus can be specified such that the hardware fixed point calculations can be represented exactly, such that any hardware designed to the TOSA specification will return the same quantized values.
* Quantization parameters will be given in integer values, as multiplicands and shifts. Specific bit widths and signed/unsignedness will be provided with each operator.
@@ -269,7 +273,7 @@ Most operations in TOSA do not contain quantization scaling in the operation, bu
The apply_scale functions provide a scaling of approximately (multiplier * 2^-shift^). The shift range is limited to allow a variety of implementations. The upper limit of 62 allows it to be decomposed as two right shifts of 31. The lower limit removes special cases in the rounding. These restrictions have little practical impact since the shift value to achieve a scaling of 1.0 is 30 for apply_scale_32 with multiplier=1<<30 and 14 for apply_scale_16 with scale=1<<14. It follows that a scaling range of 2^+12^ down to 2^-32^ is supported for both functions with normalized multiplier. (Smaller scales can be obtained by denormalizing the multiplier).
....
-int32_t apply_scale_32(int32_t value, int32_t multiplier, uint6_t shift, bool double_round=false) {
+int32_t apply_scale_32(int32_t value, int32_t multiplier, uint6_t shift, bool_t double_round=false) {
assert(multiplier >= 0);
assert(2 <= shift && shift <= 62);
int64_t round = 1 << (shift - 1);
@@ -317,18 +321,105 @@ scale_t reciprocal_scale(uint32_t value) {
}
....
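As a worked illustration of the relation above (the input values are chosen for the example only):

....
// apply_scale_32 computes approximately value * multiplier * 2^-shift,
// with a rounding term of 1 << (shift - 1) as defined above.
// multiplier = 1<<30, shift = 30 gives a scaling of 1.0:
//   apply_scale_32(1000, 1<<30, 30) = (1000 * 2^30 + 2^29) >> 30 = 1000
// multiplier = 1<<30, shift = 31 gives a scaling of 0.5:
//   apply_scale_32(1001, 1<<30, 31) = (1001 * 2^30 + 2^30) >> 31 = 501
....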
+==== Quantized Convolutions
+
+For convolution, the input is not required to be scaled before the convolution occurs.
+The convolution produces an accumulator output of type int32_t or int48_t.
+This accumulator output is then scaled to the final output range using the RESCALE operator.
+The scale applied in the RESCALE operator should be set to multiplier and shift values such that: multiplier * 2^-shift^ = (input_scale * weight_scale) / output_scale.
+Here, input_scale, weight_scale and output_scale are the conversion factors from integer to floating-point for the input, weight and output tensor values respectively.
+If per-channel scaling is needed then the per-channel option of the RESCALE operation should be used.
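For illustration only, a sketch of how a compiler might derive such a multiplier and shift pair from the floating-point scales; the helper name and the use of frexp are assumptions made by the example, not part of this specification.

....
// Hypothetical derivation of a normalized multiplier and shift such that
// multiplier * 2^-shift ~= (input_scale * weight_scale) / output_scale.
void derive_conv_rescale(float_t input_scale, float_t weight_scale, float_t output_scale,
                         int32_t *multiplier, uint6_t *shift) {
    float_t scale = (input_scale * weight_scale) / output_scale;
    int exponent;
    float_t mantissa = frexp(scale, &exponent);          // scale = mantissa * 2^exponent, 0.5 <= mantissa < 1
    *multiplier = (int32_t)round(mantissa * (1 << 30));  // normalized multiplier, at most 1<<30
    *shift      = 30 - exponent;                         // apply_scale_32 requires 2 <= shift <= 62
}
....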
+
+==== Quantized Elementwise Operators
+
+When two quantized tensors are used in an operation, they must represent the same numeric range for the result to be valid.
+In this case, TOSA expects that RESCALE operators will be used as necessary to generate 32-bit integer values in a common range.
+There are many valid choices for scale factors and options for the common range.
+TOSA does not impose a requirement on which scale factors and range should be used.
+Compilers generating TOSA sequences should choose a range that allows the operation to be computed without overflow, while allowing the highest possible accuracy of the output.
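A sketch of one possible lowering of a quantized int8_t addition using the helpers in this specification; the particular multiplier, shift, and zero-point names are assumptions chosen by the example, not mandated values.

....
// Hypothetical lowering: RESCALE both int8_t inputs into a common int32_t
// range, add, then RESCALE back to the int8_t output range.
int32_t wide1 = apply_scale_32((int32_t)value1 - input1_zp, multiplier1, shift1);
int32_t wide2 = apply_scale_32((int32_t)value2 - input2_zp, multiplier2, shift2);
int32_t sum   = apply_add<int32_t>(wide1, wide2);
int8_t  out   = (int8_t)apply_clip<int32_t>(apply_scale_32(sum, output_multiplier, output_shift) + output_zp, -128, 127);
....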
+
+==== General Unary Functions
+General unary functions such as sigmoid(), tanh(), exp() for integer inputs are expressed using a lookup table and interpolation to enable efficient implementation.
+This also allows for other operations with the addition of user-supplied tables (the TABLE operation).
+All table lookups are based on the following reference lookup function that takes as input a table of 513 entries of 16 bits each.
+
+....
+int32_t apply_lookup(int16_t *table, int32_t value)
+{
+ int16_t clipped_value = apply_clip<int16_t>(value, -32768, +32767);
+ int32_t index = (clipped_value + 32768) >> 7;
+ int32_t fraction = clipped_value & 0x7f;
+ int16_t base = table[index];
+ int16_t next = table[index+1];
+ int32_t return_value = (base << 7) + (next - base) * fraction;
+ return return_value; // return interpolated value of 16 + 7 = 23 bits
+}
+....
+
+Note that although the table lookup defined here has 16-bit precision, for 8-bit only operations an 8-bit table can be derived by applying the reference function to each of the possible 256 input values.
+The following code constructs a 513-entry table based on a reference function.
+
+....
+void generate_lookup_table(int16_t *table, int32_t (*reference)(int32_t))
+{
+ for (int i = -256; i <= 256; i++) {
+ int32_t value = (*reference)(i);
+        table[i + 256] = apply_clip<int16_t>(value, -32768, +32767);
+ }
+}
+....
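+
+As a usage sketch only, the fragment below builds a table from an identity-like reference function and calls apply_lookup; away from the clipped top entry the interpolated result is the clipped input shifted left by 7 bits, which illustrates the 23-bit output format. The example function names are assumptions of this sketch.
+
+[source,c]
+----
+#include <assert.h>
+#include <stdint.h>
+
+// Assumed to follow the pseudocode definitions above.
+int32_t apply_lookup(int16_t *table, int32_t value);
+void generate_lookup_table(int16_t *table, int32_t (*reference)(int32_t));
+
+// Identity-like reference: table entry i (for i in [-256, 256]) holds i * 128.
+static int32_t identity_reference(int32_t i) { return i * 128; }
+
+void lookup_identity_example(void) {
+    int16_t table[513];
+    generate_lookup_table(table, identity_reference);
+    // Away from the clipped top entry, the result is the clipped input value << 7.
+    assert(apply_lookup(table, 1000) == (1000 << 7));
+}
+----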
+
+=== Floating-point
+
+TOSA does not define bit-exact behaviour of the floating-point type, since floating-point operation results can vary according to operation order (floating-point addition is not associative in general) and rounding behaviour.
+If a bit-exact answer is required then integer operations should be used.
+TOSA does define that the floating-point type must support the following list of features.
+These features ensure that detection of overflow and other exceptional conditions can be handled consistently.
+
+* The floating-point type must have at least 16 total bits including the sign bit
+* The floating-point type must support positive and negative infinity values
+* The floating-point type must support at least one Not-a-Number encoding (NaN)
+* The floating-point type must support signed zero
+* The floating-point type must support handling of infinities, NaNs, zeros as in the following table
+
+.Floating-point behaviour
+|===
+|Case|Result
+
+|Any input operand is a NaN | a NaN
+
+|(&#177; 0) &#215; (&#177; infinity), (&#177; infinity) &#215; (&#177; 0) | a NaN
+
+|(&#177; 0) / (&#177; 0), (&#177; infinity) / (&#177; infinity) | a NaN
+
+| (+infinity) - (+infinity), (+infinity) + (-infinity) | a NaN
+
+| Any positive overflow | + infinity
+
+| Any negative overflow | - infinity
+
+| Any positive underflow | + 0
+
+| Any negative underflow | - 0
+
+|===
+
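+The following C sketch is a non-normative spot check of these cases; it assumes the host float type is IEEE 754 binary32, which TOSA itself does not require.
+
+[source,c]
+----
+#include <assert.h>
+#include <float.h>
+#include <math.h>
+
+// Illustrative only: check the special-case behaviour on the host float type.
+void check_float_behaviour(void) {
+    volatile float zero = 0.0f, inf = INFINITY, max = FLT_MAX, tiny = FLT_MIN;
+    assert(isnan(zero * inf));               // (+-0) x (+-infinity) is a NaN
+    assert(isnan(zero / zero));              // (+-0) / (+-0) is a NaN
+    assert(isnan(inf - inf));                // (+infinity) - (+infinity) is a NaN
+    volatile float overflow = max * 2.0f;    // positive overflow
+    assert(isinf(overflow) && overflow > 0.0f);
+    volatile float underflow = -tiny * tiny; // negative underflow
+    assert(signbit(underflow));              // result is -0
+}
+----
+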
+=== General Pseudocode Helpers
+
+This section contains general pseudocode utility functions used throughout the specification.
+
The following functions provide basic arithmetic with asserts that values stay in the valid range supported by TOSA.
....
acc_t apply_add<acc_t>(acc_t a, acc_t b) {
- if (acc_t == float) return a + b;
+ if (acc_t == float_t) return a + b;
int64_t c = (int64_t)a + (int64_t)b;
assert(c >= minimum<acc_t> && c <= maximum<acc_t>);
return (acc_t)c;
}
acc_t apply_sub<acc_t>(acc_t a, acc_t b) {
- if (acc_t == float) return a - b;
+ if (acc_t == float_t) return a - b;
int64_t c = (int64_t)a - (int64_t)b;
assert(c >= minimum<acc_t> && c <= maximum<acc_t>);
return (acc_t)c;
@@ -369,83 +460,32 @@ int32_t count_leading_zeros(int32_t a) {
}
....
-==== Quantized Convolutions
-
-For convolution, the input is not required to be scaled before the convolution occurs. The convolution produces an accumulator output of type int32_t or int48_t. This accumulator output is then scaled to the final output range using the RESCALE operator. The scale applied in the RESCALE operator should be set to multiplier and shift values such that: multiplier * 2^-shift^ = (input scale * weight scale) / output_scale. Here, input_scale, weight_scale and output_scale are the conversion factors from integer to floating point for the input, weight and output tensor values respectively. If per-channel scaling is needed then the per-channel option of the RESCALE operation should be used.
-
-==== Elementwise operators
+The following definitions are used in pseudocode to do numeric conversions.
+....
+int round_to_nearest_int(float_t f)
+    Converts the floating-point value f to an integer, with rounding to the nearest integer value.
-When two quantized tensors are used in an operation, they must represent the
-same numeric range for the result to be valid. In this case, TOSA expects that
-RESCALE operations will be used as necessary to generate 32-bit integer values
-in a common range. There are many valid choices for scale factors and options
-for the common range. TOSA does not impose a requirement on which scale factors
-and range should be used. Compilers generating TOSA sequences should choose a
-range that allows the operation to be computed without overflow, while allowing
-the highest possible accuracy of the output.
+float_t round_to_nearest_float(in_t f)
+ Converts the input value into floating-point, rounding to the nearest representable value.
+ The behavior for ties is implementation dependent.
-==== General unary functions
-General unary functions such as sigmoid(), tanh(), exp() for integer inputs are
-expressed using a lookup table and interpolation to enable efficient
-implementation. This also allows for other operations with the addition of
-user-supplied tables (the TABLE operation). All table lookups are based on the
-following reference lookup function that takes as input a table of 513 entries
-of 16 bits each.
+out_t sign_extend(in_t input)
+ Only valid for twos complement integer values where out_t has more bits than in_t.
+ Output = input
+ Replicate the top bit of input for all bits between the top bit of input and the top bit of output.
-....
-int32_t apply_lookup(int16_t *table, int value)
-{
- value = apply_clip(value, -32768, +32767)
- index = (value + 32768) >> 7
- fraction = value & 0x7f
- base = table[index]
- next = table[index+1]
- value = (base << 7) + (next - base) * fraction
- return value; // return interpolated value of 16 + 7 = 23 bits
-}
+out_t truncate(in_t input)
+    Output is the sizeof(out_t) least significant bits of input.
....
-Note that although the table lookup defined here has 16-bit precision, for 8-bit only operations an 8-bit table can be derived by applying the reference function to each of the possible 256 input values.
-The following code constructs a 513-entry table based on a reference function.
-
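+
+For illustration, a minimal non-normative C sketch of sign_extend and truncate for one concrete pair of widths (int8_t and int32_t) is given below; a twos complement representation is assumed, matching the restriction stated above.
+
+[source,c]
+----
+#include <stdint.h>
+
+// Illustrative only: sign_extend for in_t = int8_t, out_t = int32_t.
+// The integer conversion replicates the top (sign) bit into the upper 24 bits.
+int32_t sign_extend_i8_to_i32(int8_t input) {
+    return (int32_t)input;
+}
+
+// Illustrative only: truncate for in_t = int32_t, out_t = int8_t.
+// Only the 8 least significant bits of the input are kept (twos complement assumed).
+int8_t truncate_i32_to_i8(int32_t input) {
+    return (int8_t)(input & 0xff);
+}
+----
+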
+The following definition is used to flatten a list of lists into a single list.
....
-void generate_lookup_table(int16_t *table, int (*reference)(int))
-{
- for (int i = -256; i <= 256; i++) {
- value = (*reference)(i);
- table[i + 256] = clip(value, -32768, +32767)
+in_t* flatten(in_t lists[]) {
+ in_t output = [];
+ for_each(list in lists) {
+ for_each(element in list) {
+ output.append(element);
}
+    }
+    return output;
}
....
-
-=== Floating Point
-
-TOSA does not define bit-exact behaviour of the floating point type, since floating point operation results can vary according to operation order (floating point addition is not associative in general) and rounding behaviour. If a bit defined answer is required then integer operations should be used. TOSA does define that the floating point type must support the following list of features. These features ensure that detection of overflow and other exceptional conditions can be handled consistently.
-
-* The floating point type must have at least 16 total bits including the sign bit
-* The floating point type must support positive and negative infinity values
-* The floating point type must support at least one Not-a-Number encoding (NaN)
-* The floating point type must support signed zero
-* The floating point type must support handling of infinities, NaNs, zeros as in the following table
-
-.Floating point behaviour
-|===
-|Case|Result
-
-|Any input operand is a NaN | a NaN
-
-|(&#177; 0) &#215; (&#177; infinity), (&#177; infinity) &#215; (&#177; 0) | a NaN
-
-|(&#177; 0) / (&#177; 0), (&#177; infinity) / (&#177; infinity) | a NaN
-
-| (+infinity) - (+infinity), (+infinity) + (-infinity) | a NaN
-
-| Any positive overflow | + infinity
-
-| Any negative overflow | - infinity
-
-| Any positive underflow | + 0
-
-| Any negative underflow | - 0
-
-|===
diff --git a/chapters/reduction.adoc b/chapters/reduction.adoc
index 21cdec5..e605386 100644
--- a/chapters/reduction.adoc
+++ b/chapters/reduction.adoc
@@ -18,36 +18,36 @@ Reduce a tensor along the given axis with a logical AND operation
|===
|Argument|Type|Name|Shape|Description
-|Input|in_t*|input|in_shape|Input tensor with rank from 1 to 4
-|Attribute|int|axis|-|Axis to reduce
-|Output|out_t*|output|out_shape|Output tensor. Same rank as the input tensor.
+|Input|in_t*|input|shape1|Input tensor with rank from 1 to 4
+|Attribute|int32_t|axis|-|Axis to reduce
+|Output|in_t*|output|shape|Output tensor. Same rank as the input tensor.
|===
*Operation Function:*
[source,c]
----
-assert(0<=axis && axis<rank(in_shape))
-assert(out_shape[axis]==1)
-for_each (index in out_shape) {
- tensor_write<in_t>(output, out_shape, index, true)
+assert(0 <= axis && axis < rank(shape1));
+assert(shape[axis] == 1);
+for_each(index in shape) {
+ tensor_write<in_t>(output, shape, index, true);
}
-for_each (index in in_shape) {
+for_each(index in shape1) {
tmp_index = index;
tmp_index[axis]=0;
- value = tensor_read<in_t>(input, in_shape, index)
- acc = tensor_read<in_t>(output, out_shape, tmp_index)
- acc = acc && value
- tensor_write<in_t>(output, out_shape, tmp_index, acc)
+ value = tensor_read<in_t>(input, shape1, index);
+ acc = tensor_read<in_t>(output, shape, tmp_index);
+ acc = acc && value;
+ tensor_write<in_t>(output, shape, tmp_index, acc);
}
----
*Supported Data Types:*
|===
-|Profile|Mode|in_t|out_t
+|Profile|Mode|in_t
-|Any|Boolean|bool|bool
+|Any|Boolean|bool_t
|===
==== REDUCE_ANY
@@ -59,36 +59,36 @@ Reduce a tensor along the given axis with a logical OR operation
|===
|Argument|Type|Name|Shape|Description
-|Input|in_t*|input|in_shape|Input tensor with rank from 1 to 4
-|Attribute|int|axis|-|Axis to reduce
-|Output|out_t*|output|out_shape|Output tensor. Same rank as the input tensor.
+|Input|in_t*|input|shape1|Input tensor with rank from 1 to 4
+|Attribute|int32_t|axis|-|Axis to reduce
+|Output|in_t*|output|shape|Output tensor. Same rank as the input tensor.
|===
*Operation Function:*
[source,c]
----
-assert(0<=axis && axis<rank(in_shape))
-assert(out_shape[axis]==1)
-for_each (index in out_shape) {
- tensor_write<in_t>(output, out_shape, index, false)
+assert(0 <= axis && axis < rank(shape1));
+assert(shape[axis] == 1);
+for_each(index in shape) {
+ tensor_write<in_t>(output, shape, index, false);
}
-for_each (index in in_shape) {
+for_each(index in shape1) {
tmp_index = index;
tmp_index[axis]=0;
- value = tensor_read<in_t>(input, in_shape, index)
- acc = tensor_read<in_t>(output, out_shape, tmp_index)
- acc = acc || value
- tensor_write<in_t>(output, out_shape, tmp_index, acc)
+ value = tensor_read<in_t>(input, shape1, index);
+ acc = tensor_read<in_t>(output, shape, tmp_index);
+ acc = acc || value;
+ tensor_write<in_t>(output, shape, tmp_index, acc);
}
----
*Supported Data Types:*
|===
-|Profile|Mode|in_t|out_t
+|Profile|Mode|in_t
-|Any|Boolean|bool|bool
+|Any|Boolean|bool_t
|===
==== REDUCE_MAX
@@ -100,39 +100,39 @@ Reduce a tensor along the given axis with a maximum operation
|===
|Argument|Type|Name|Shape|Description
-|Input|in_t*|input|in_shape|Input tensor with rank from 1 to 4
-|Attribute|int|axis|-|Axis to reduce
-|Output|out_t*|output|out_shape|Output tensor. Same rank as the input tensor.
+|Input|in_t*|input|shape1|Input tensor with rank from 1 to 4
+|Attribute|int32_t|axis|-|Axis to reduce
+|Output|in_t*|output|shape|Output tensor. Same rank as the input tensor.
|===
*Operation Function:*
[source,c]
----
-assert(0<=axis && axis<rank(in_shape))
-assert(out_shape[axis]==1)
-for_each (index in out_shape) {
- tensor_write<in_t>(output, out_shape, index, minimum<in_t>)
+assert(0 <= axis && axis < rank(shape1));
+assert(shape[axis] == 1);
+for_each(index in shape) {
+ tensor_write<in_t>(output, shape, index, minimum<in_t>);
}
-for_each (index in in_shape) {
+for_each(index in shape1) {
tmp_index = index;
tmp_index[axis]=0;
- value = tensor_read<in_t>(input, in_shape, index)
- acc = tensor_read<in_t>(output, out_shape, tmp_index)
- acc = apply_max<in_t>(acc, value)
- tensor_write<in_t>(output, out_shape, tmp_index, acc)
+ value = tensor_read<in_t>(input, shape1, index);
+ acc = tensor_read<in_t>(output, shape, tmp_index);
+ acc = apply_max<in_t>(acc, value);
+ tensor_write<in_t>(output, shape, tmp_index, acc);
}
----
*Supported Data Types:*
|===
-|Profile|Mode|in_t|out_t
+|Profile|Mode|in_t
-|Any|signed 8|int8|int8
-|Any|signed 16|int16|int16
-|Any|signed 32|int32|int32
-|MI, MT|float|float|float
+|Any|signed 8|int8_t
+|Any|signed 16|int16_t
+|Any|signed 32|int32_t
+|MI, MT|floating-point|float_t
|===
==== REDUCE_MIN
@@ -143,9 +143,9 @@ Reduce a tensor along the given axis with a minimum operation
|===
|Argument|Type|Name|Shape|Description
-|Input|in_t*|input|in_shape|Input tensor with rank from 1 to 4
-|Attribute|int|axis|-|Axis to reduce
-|Output|out_t*|output|out_shape|Output tensor. Same rank as the input tensor.
+|Input|in_t*|input|shape1|Input tensor with rank from 1 to 4
+|Attribute|int32_t|axis|-|Axis to reduce
+|Output|in_t*|output|shape|Output tensor. Same rank as the input tensor.
|===
*Quantization Parameters:*
@@ -156,30 +156,30 @@ Quantization is ignored when doing the REDUCE_MIN operation. The input and outpu
[source,c]
----
-assert(0<=axis && axis<rank(in_shape))
-assert(out_shape[axis]==1)
-for_each (index in out_shape) {
- tensor_write<in_t>(output, out_shape, index, maximum<in_t>)
+assert(0 <= axis && axis < rank(shape1));
+assert(shape[axis] == 1);
+for_each(index in shape) {
+ tensor_write<in_t>(output, shape, index, maximum<in_t>);
}
-for_each (index in in_shape) {
+for_each(index in shape1) {
tmp_index = index;
tmp_index[axis]=0;
- value = tensor_read<in_t>(input, in_shape, index)
- acc = tensor_read<in_t>(output, out_shape, tmp_index)
- acc = apply_min<in_t>(acc, value)
- tensor_write<in_t>(output, out_shape, tmp_index, acc)
+ value = tensor_read<in_t>(input, shape1, index);
+ acc = tensor_read<in_t>(output, shape, tmp_index);
+ acc = apply_min<in_t>(acc, value);
+ tensor_write<in_t>(output, shape, tmp_index, acc);
}
----
*Supported Data Types:*
|===
-|Profile|Mode|in_t|out_t
+|Profile|Mode|in_t
-|Any|signed 8|int8|int8
-|Any|signed 16|int16|int16
-|Any|signed 32|int32|int32
-|MI, MT|float|float|float
+|Any|signed 8|int8_t
+|Any|signed 16|int16_t
+|Any|signed 32|int32_t
+|MI, MT|floating-point|float_t
|===
==== REDUCE_PRODUCT
@@ -191,36 +191,36 @@ Reduce a tensor along the given axis by computing the product of the axis.
|===
|Argument|Type|Name|Shape|Description
-|Input|in_t*|input|in_shape|Input tensor with rank from 1 to 4
-|Attribute|int|axis|-|Axis to reduce
-|Output|out_t*|output|out_shape|Output tensor. Same rank as the input tensor.
+|Input|in_t*|input|shape1|Input tensor with rank from 1 to 4
+|Attribute|int32_t|axis|-|Axis to reduce
+|Output|in_t*|output|shape|Output tensor. Same rank as the input tensor.
|===
*Operation Function:*
[source,c]
----
-assert(0<=axis && axis<rank(in_shape))
-assert(out_shape[axis]==1)
-for_each (index in out_shape) {
- tensor_write<in_t>(output, out_shape, index, 1.0)
+assert(0 <= axis && axis < rank(shape1));
+assert(shape[axis] == 1);
+for_each(index in shape) {
+ tensor_write<in_t>(output, shape, index, 1.0);
}
-for_each (index in in_shape) {
+for_each(index in shape1) {
tmp_index = index;
tmp_index[axis]=0;
- value = tensor_read<in_t>(input, in_shape, index)
- acc = tensor_read<in_t>(output, out_shape, tmp_index)
- acc = acc * value
- tensor_write<in_t>(output, out_shape, tmp_index, acc)
+ value = tensor_read<in_t>(input, shape1, index);
+ acc = tensor_read<in_t>(output, shape, tmp_index);
+ acc = acc * value;
+ tensor_write<in_t>(output, shape, tmp_index, acc);
}
----
*Supported Data Types:*
|===
-|Profile|Mode|in_t|out_t
+|Profile|Mode|in_t
-|MI, MT|float|float|float
+|MI, MT|floating-point|float_t
|===
==== REDUCE_SUM
@@ -232,36 +232,36 @@ Reduce a tensor along the given axis by computing the sum of the axis.
|===
|Argument|Type|Name|Shape|Description
-|Input|in_t*|input|in_shape|Input tensor with rank from 1 to 4
-|Attribute|int|axis|-|Axis to reduce
-|Output|out_t*|output|out_shape|Output tensor. Same rank as the input tensor.
+|Input|in_t*|input|shape1|Input tensor with rank from 1 to 4
+|Attribute|int32_t|axis|-|Axis to reduce
+|Output|in_t*|output|shape|Output tensor. Same rank as the input tensor.
|===
*Operation Function:*
[source,c]
----
-assert(0<=axis && axis<rank(in_shape))
-assert(out_shape[axis]==1)
-for_each (index in out_shape) {
- tensor_write<in_t>(output, out_shape, index, 0)
+assert(0 <= axis && axis < rank(shape1));
+assert(shape[axis] == 1);
+for_each(index in shape) {
+ tensor_write<in_t>(output, shape, index, 0);
}
-for_each (index in in_shape) {
+for_each(index in shape1) {
tmp_index = index;
tmp_index[axis]=0;
- value = tensor_read<in_t>(input, in_shape, index)
- acc = tensor_read<in_t>(output, out_shape, tmp_index)
- acc = apply_add<in_t>(acc, value)
- tensor_write<in_t>(output, out_shape, tmp_index, acc)
+ value = tensor_read<in_t>(input, shape1, index);
+ acc = tensor_read<in_t>(output, shape, tmp_index);
+ acc = apply_add<in_t>(acc, value);
+ tensor_write<in_t>(output, shape, tmp_index, acc);
}
----
*Supported Data Types:*
|===
-|Profile|Mode|in_t|out_t
+|Profile|Mode|in_t
-|Any|signed 32|int32|int32
-|MI, MT|float|float|float
+|Any|signed 32|int32_t
+|MI, MT|floating-point|float_t
|===
diff --git a/chapters/scatter_gather.adoc b/chapters/scatter_gather.adoc
index a026335..c2fcbe4 100644
--- a/chapters/scatter_gather.adoc
+++ b/chapters/scatter_gather.adoc
@@ -32,11 +32,11 @@ None
[source,c]
----
-for_each(0<=n<N, 0<=w<W, 0<=c<C) {
- index_t k = tensor_read<index_t>(indices, [N,W], [n,w])
- assert(0<=k && k<K)
- value_t value = tensor_read<value_t>(values, [N,K,C], [n, k, c])
- tensor_write<value_t>(output, [N,W,C], [n,w,c], value)
+for_each(0 <= n < N, 0 <= w < W, 0 <= c < C) {
+ index_t k = tensor_read<index_t>(indices, [N,W], [n,w]);
+ assert(0 <= k && k < K);
+ value_t value = tensor_read<value_t>(values, [N,K,C], [n,k,c]);
+ tensor_write<value_t>(output, [N,W,C], [n,w,c], value);
}
----
@@ -45,10 +45,10 @@ for_each(0<=n<N, 0<=w<W, 0<=c<C) {
|===
|Profile|Mode|index_t|value_t
-|Any|signed 8|int32|int8
-|Any|signed 16|int32|int16
-|Any|signed 32|int32|int32
-|MI,MT|float|int32|float
+|Any|signed 8|int32_t|int8_t
+|Any|signed 16|int32_t|int16_t
+|Any|signed 32|int32_t|int32_t
+|MI,MT|floating-point|int32_t|float_t
|===
==== SCATTER
@@ -80,24 +80,24 @@ None
// The following array is used to check compliance that an output position
// is modified at most once.
-bool output_modified[N,K,C];
+bool_t output_modified[N,K,C];
// Copy the values_in tensor to the values_out tensor.
// Values not written by the scatter operation are unchanged in the output.
-for_each(0<=n<N, 0<=k<K, 0<=c<C) {
- value_t value = tensor_read<value_t>(values_in, [N,K,C], [n,k,c])
- tensor_write<value_t>(values_out, [N,K,C], [n, k, c], value)
+for_each(0 <= n < N, 0 <= k < K, 0 <= c < C) {
+ value_t value = tensor_read<value_t>(values_in, [N,K,C], [n,k,c]);
+ tensor_write<value_t>(values_out, [N,K,C], [n, k, c], value);
output_modified[n,k,c]=false;
}
// Now perform the SCATTER operation, modifying the positions from the indices tensor
-for_each(0<=n<N, 0<=w<W, 0<=c<C) {
- index_t k = tensor_read<index_t>(indices, [N,W], [n,w])
- assert(0<=k && k<K)
- assert(output_modified[n,k,c]==false);
- value_t value = tensor_read<value_t>(input, [N,W,C], [n,w,c])
- tensor_write<value_t>(values_out, [N,K,C], [n, k, c], value)
- output_modified[n,k,c]=true;
+for_each(0 <= n < N, 0 <= w < W, 0 <= c < C) {
+ index_t k = tensor_read<index_t>(indices, [N,W], [n,w]);
+ assert(0 <= k && k < K);
+ assert(output_modified[n,k,c] == false);
+ value_t value = tensor_read<value_t>(input, [N,W,C], [n,w,c]);
+ tensor_write<value_t>(values_out, [N,K,C], [n, k, c], value);
+ output_modified[n,k,c] = true;
}
----
@@ -106,8 +106,8 @@ for_each(0<=n<N, 0<=w<W, 0<=c<C) {
|===
|Profile|Mode|index_t|value_t
-|Any|signed 8|int32|int8
-|Any|signed 16|int32|int16
-|Any|signed 32|int32|int32
-|MI,MT|float|int32|float
+|Any|signed 8|int32_t|int8_t
+|Any|signed 16|int32_t|int16_t
+|Any|signed 32|int32_t|int32_t
+|MI,MT|floating-point|int32_t|float_t
|===
diff --git a/chapters/tensor_ops.adoc b/chapters/tensor_ops.adoc
index 571b9aa..341f51d 100644
--- a/chapters/tensor_ops.adoc
+++ b/chapters/tensor_ops.adoc
@@ -18,9 +18,9 @@ This returns the index with the largest value across the given axis of the input
|===
|Argument|Type|Name|Shape|Description
-|Input|in_t*|input|input_shape|Input tensor dimension k \<=4
+|Input|in_t*|input|shape1|Input tensor dimension k \<=4
|Attribute|int|axis|-|Axis in range 0 to k-1
-|Output|out_t*|output|output_shape|Output tensor dimension k-1
+|Output|out_t*|output|shape|Output tensor dimension k-1
|===
*Quantization Parameters:*
@@ -31,20 +31,20 @@ None
[source,c]
----
-assert(axis >= 0 && axis < k && k <=4)
-left_shape = input_shape[0:axis-1]
-right_shape = input_shape[axis+1:k-1]
-assert( concat(left_shape, right_shape) == output_shape )
-for_each ( left_index in left_shape, right_index in right_shape )
- in_t max_value = minimum_value<in_t>
- int32 max_index = 0;
- for (i=0; i<shape[axis]; i++) {
- index = concat(left_index, [i], right_index)
- in_t value = tensor_read<in_t>(input, input_shape, index)
+assert(axis >= 0 && axis < k && k <= 4);
+left_shape = shape1[0:axis-1];
+right_shape = shape1[axis+1:k-1];
+assert(flatten(left_shape, right_shape) == shape);
+for_each(left_index in left_shape, right_index in right_shape) {
+ in_t max_value = minimum_value<in_t>;
+ int32_t max_index = 0;
+ for (i = 0; i < shape[axis]; i++) {
+ index = flatten(left_index, [i], right_index);
+ in_t value = tensor_read<in_t>(input, shape1, index);
if (value > max_value) { max_value = value; max_index=i; }
}
- index = concat(left_index, right_index)
- tensor_write<int32_t>(output, output_shape, index, max_index)
+ index = flatten(left_index, right_index);
+ tensor_write<int32_t>(output, shape, index, max_index);
}
----
@@ -53,9 +53,9 @@ for_each ( left_index in left_shape, right_index in right_shape )
|===
|Profile|Mode|in_t|out_t
-|Any|signed 8|int8|int32
-|Any|signed 16|int16|int32
-|MI, MT|float|float|int32
+|Any|signed 8|int8_t|int32_t
+|Any|signed 16|int16_t|int32_t
+|MI, MT|floating-point|float_t|int32_t
|===
==== AVG_POOL2D
@@ -67,11 +67,11 @@ This performs an average pooling over the given input tensor. A sliding window o
|===
|Argument|Type|Name|Shape|Description
-|Input|in_t *|input|[N,H,W,C]|Input tensor 4D
-|Attribute|int *|kernel|[2]|[kernel_y, kernel_x]
-|Attribute|int *|stride|[2]|[stride_y, stride_x]
-|Attribute|int *|pad|[4]|[pad_top, pad_bottom, pad_left, pad_right]
-|Output|out_t *|output|[N,H,W,C]|Output tensor 4D
+|Input|in_t*|input|[N,H,W,C]|Input tensor 4D
+|Attribute|int*|kernel|[2]|[kernel_y, kernel_x]
+|Attribute|int*|stride|[2]|[stride_y, stride_x]
+|Attribute|int*|pad|[4]|[pad_top, pad_bottom, pad_left, pad_right]
+|Output|in_t*|output|[N,H,W,C]|Output tensor 4D
|===
*Quantization Parameters:*
@@ -80,46 +80,47 @@ This performs an average pooling over the given input tensor. A sliding window o
|Argument|Type|Name|Shape|Description
|Attribute|in_t|input_zp|-|Input tensor zero point
-|Attribute|out_t|output_zp|-|Output tensor zero point
+|Attribute|in_t|output_zp|-|Output tensor zero point
|===
*Operation Function:*
[source,c]
----
-assert(in_t == int8_t || input_zp == 0) // Zero point only for int8
-assert(out_t == int8_t || output_zp == 0) // Zero point only for int8
-pad=concat([0,0],pad,[0,0])
-for_each ( 0 <= n < N, 0 <= oy < H, 0 <= ox < W, 0 <= c < C ) {
+assert(in_t == int8_t || input_zp == 0); // Zero point only for int8_t
+assert(in_t == int8_t || output_zp == 0); // Zero point only for int8_t
+pad = flatten([0,0], pad, [0,0]);
+for_each(0 <= n < N, 0 <= oy < H, 0 <= ox < W, 0 <= c < C ) {
+ in_t output_val;
acc_t acc = 0;
int count = 0;
- iy = oy * stride_y - pad_top
- ix = ox * stride_x - pad_left
- for_each ( 0 <= ky < kernel_y, 0 <= kx < kernel_x) {
- y = iy + ky
- x = ix + kx
- in_t value = tensor_read<in_t>(input, [N,IH,IW,IC], [n,y,x,c], input_zp, pad)
- acc = apply_add<acc_t>(acc, value)
- if (0<=y<IH and 0<=x<IW) count++
+ iy = oy * stride_y - pad_top;
+ ix = ox * stride_x - pad_left;
+ for_each(0 <= ky < kernel_y, 0 <= kx < kernel_x) {
+ y = iy + ky;
+ x = ix + kx;
+ in_t value = tensor_read<in_t>(input, [N,IH,IW,IC], [n,y,x,c], input_zp, pad);
+ acc = apply_add<acc_t>(acc, value);
+        if (0 <= y < IH && 0 <= x < IW) count++;
}
-    if (is_float(out_t)) {
+    if (is_float(in_t)) {
- value = value / (float)count;
+        output_val = acc / (float_t)count;
} else {
- scale_t scale = reciprocal_scale(count)
- acc = apply_scale_32(acc, scale.multiplier, scale.shift, false)
- acc = apply_clip(acc + output_zp, output_min, output_max)
+ scale_t scale = reciprocal_scale(count);
+ acc = apply_scale_32(acc, scale.multiplier, scale.shift, false);
+        output_val = apply_clip<in_t>(acc + output_zp, minimum<in_t>, maximum<in_t>);
}
- tensor_write<out_t>(output, [N,H,W,OC], [n,oy,ox,oc], acc)
+    tensor_write<in_t>(output, [N,H,W,C], [n,oy,ox,c], output_val);
}
----
*Supported Data Types:*
|===
-|Profile|Mode|in_t|acc_t|out_t
+|Profile|Mode|in_t|acc_t
-|Any|signed 8|int8|int32_t|int8
-|Any|signed 16|int16|int32_t|int16
-|MI, MT|float|float|float|float
+|Any|signed 8|int8_t|int32_t
+|Any|signed 16|int16_t|int32_t
+|MI, MT|floating-point|float_t|float_t
|===
==== CONV2D
@@ -153,22 +154,22 @@ Performs a 2D convolution over the given tensor input, using the weight tensor.
[source,c]
----
-assert(in_t == int8_t || input_zp == 0) // Zero point only for int8
-assert(weight_t == int8_t || weight_zp == 0)
-pad=concat([0,0], pad, [0,0])
-for_each (0 <= n < N, 0 <= oy < H, 0 <= ox < W; 0 <= oc < OC) {
- acc_t acc = 0
- iy = oy * stride_y - pad_top
- ix = ox * stride_x - pad_left
- for_each (0 <= ky < KH, 0 <= kx < KW, 0 <= ic < IC) {
- y = iy + ky * dilation_y
- x = ix + kx * dilation_x
- in_t value = tensor_read<in_t>(input, [N,IH,IW,IC], [n,y,x,ic], input_zp, pad)
- weight_t weight = tensor_read<weight_t>(weight, [OC,KH,KW,IC], [oc,ky,kx,ic], weight_zp)
- acc = apply_add<acc_t>(acc, value * weight)
+assert(in_t == int8_t || input_zp == 0); // Zero point only for int8_t
+assert(weight_t == int8_t || weight_zp == 0);
+pad = flatten([0,0], pad, [0,0]);
+for_each(0 <= n < N, 0 <= oy < H, 0 <= ox < W, 0 <= oc < OC) {
+ acc_t acc = 0;
+ iy = oy * stride_y - pad_top;
+ ix = ox * stride_x - pad_left;
+ for_each(0 <= ky < KH, 0 <= kx < KW, 0 <= ic < IC) {
+ y = iy + ky * dilation_y;
+ x = ix + kx * dilation_x;
+ in_t value = tensor_read<in_t>(input, [N,IH,IW,IC], [n,y,x,ic], input_zp, pad);
+ weight_t weight = tensor_read<weight_t>(weight, [OC,KH,KW,IC], [oc,ky,kx,ic], weight_zp);
+ acc = apply_add<acc_t>(acc, value * weight);
}
- acc = apply_add<acc_t>(acc, bias[oc])
- tensor_write<acc_t>(output, [N,H,W,OC], [n,oy,ox,oc], acc)
+ acc = apply_add<acc_t>(acc, bias[oc]);
+ tensor_write<acc_t>(output, [N,H,W,OC], [n,oy,ox,oc], acc);
}
----
@@ -177,10 +178,10 @@ for_each (0 <= n < N, 0 <= oy < H, 0 <= ox < W; 0 <= oc < OC) {
|===
|Profile|Mode|in_t|weight_t|acc_t
-|Any|signed 8x8|int8|int8|int32
-|Any|signed 8x4|int8|int4|int32
-|Any|signed 16x8|int16|int8|int48
-|MI, MT|float|float|float|float
+|Any|signed 8x8|int8_t|int8_t|int32_t
+|Any|signed 8x4|int8_t|int4_t|int32_t
+|Any|signed 16x8|int16_t|int8_t|int48_t
+|MI, MT|floating-point|float_t|float_t|float_t
|===
==== CONV3D
@@ -214,24 +215,24 @@ Performs a 3D convolution over the given input tensor.
[source,c]
----
-assert(in_t == int8_t || input_zp == 0) // Zero point only for int8
-assert(weight_t == int8_t || weight_zp == 0)
-pad=concat([0,0], pad, [0,0])
-for_each (0 <= n < N, 0 <= od < D, 0 <= oy < H, 0 <= ox < W; 0 <= oc < OC) {
- acc_t acc = 0
- id = od * stride_d - pad_d0
- iy = oy * stride_y - pad_top
- ix = ox * stride_x - pad_left
- for_each (0 <= kd < KD, 0 <= ky < KH, 0 <= kx < KW, 0 <= ic < IC) {
- d = id + kd * dilation_d
- y = iy + ky * dilation_y
- x = ix + kx * dilation_x
- in_t value = tensor_read<in_t>(input, [N,ID,IH,IW,IC], [n,d,y,x,ic], input_zp, pad)
- weight_t weight = tensor_read<weight_t>(weight,[OC,KD,KH,KW,IC],[oc,kd,ky,kx,ic], weight_zp)
- acc = apply_add<acc_t>(acc, value * weight)
+assert(in_t == int8_t || input_zp == 0); // Zero point only for int8_t
+assert(weight_t == int8_t || weight_zp == 0);
+pad = flatten([0,0], pad, [0,0]);
+for_each(0 <= n < N, 0 <= od < D, 0 <= oy < H, 0 <= ox < W, 0 <= oc < OC) {
+ acc_t acc = 0;
+ id = od * stride_d - pad_d0;
+ iy = oy * stride_y - pad_top;
+ ix = ox * stride_x - pad_left;
+ for_each(0 <= kd < KD, 0 <= ky < KH, 0 <= kx < KW, 0 <= ic < IC) {
+ d = id + kd * dilation_d;
+ y = iy + ky * dilation_y;
+ x = ix + kx * dilation_x;
+ in_t value = tensor_read<in_t>(input, [N,ID,IH,IW,IC], [n,d,y,x,ic], input_zp, pad);
+ weight_t weight = tensor_read<weight_t>(weight,[OC,KD,KH,KW,IC],[oc,kd,ky,kx,ic], weight_zp);
+ acc = apply_add<acc_t>(acc, value * weight);
}
- acc = apply_add<acc_t>(acc, bias[oc])
- tensor_write<acc_t>(output, [N,D,H,W,OC], [n,od,oy,ox,oc], acc)
+ acc = apply_add<acc_t>(acc, bias[oc]);
+ tensor_write<acc_t>(output, [N,D,H,W,OC], [n,od,oy,ox,oc], acc);
}
----
@@ -240,10 +241,10 @@ for_each (0 <= n < N, 0 <= od < D, 0 <= oy < H, 0 <= ox < W; 0 <= oc < OC) {
|===
|Profile|Mode|in_t|weight_t|acc_t
-|Any|signed 8x8|int8|int8|int32
-|Any|signed 8x4|int8|int4|int32
-|Any|signed 16x8|int16|int8|int48
-|MI, MT|float|float|float|float
+|Any|signed 8x8|int8_t|int8_t|int32_t
+|Any|signed 8x4|int8_t|int4_t|int32_t
+|Any|signed 16x8|int16_t|int8_t|int48_t
+|MI, MT|floating-point|float_t|float_t|float_t
|===
@@ -278,22 +279,22 @@ Performs 2D convolutions separately over each channel of the given tensor input,
[source,c]
----
-assert(in_t == int8_t || input_zp == 0) // Zero point only for int8
-assert(weight_t == int8_t || weight_zp == 0)
-pad=concat([0,0], pad, [0,0])
-for_each (0 <= n<N, 0 <= oy < H, 0 <= ox < W; 0 <= c < (C * M), 0 <= m < M) {
- acc_t acc = 0
- iy = oy * stride_y - pad_top
- ix = ox * stride_x - pad_left
- for_each (0 <= ky < KH, 0 <= kx < KW) {
- y = iy + ky * dilation_y
- x = ix + kx * dilation_x
- in_t value = tensor_read<in_t>(input, [N,H,W,C], [n,y,x,c], input_zp, pad)
- weight_t weight = tensor_read<weight_t>(weight, [KH,KW,C,M], [ky,kx,c,m], weight_zp)
- acc = apply_add<acc_t>(acc, value * weight)
+assert(in_t == int8_t || input_zp == 0); // Zero point only for int8_t
+assert(weight_t == int8_t || weight_zp == 0);
+pad = flatten([0,0], pad, [0,0]);
+for_each(0 <= n < N, 0 <= oy < H, 0 <= ox < W, 0 <= c < (C * M), 0 <= m < M) {
+ acc_t acc = 0;
+ iy = oy * stride_y - pad_top;
+ ix = ox * stride_x - pad_left;
+ for_each(0 <= ky < KH, 0 <= kx < KW) {
+ y = iy + ky * dilation_y;
+ x = ix + kx * dilation_x;
+ in_t value = tensor_read<in_t>(input, [N,H,W,C], [n,y,x,c], input_zp, pad);
+ weight_t weight = tensor_read<weight_t>(weight, [KH,KW,C,M], [ky,kx,c,m], weight_zp);
+ acc = apply_add<acc_t>(acc, value * weight);
}
- acc = apply_add<acc_t>(acc, bias[(c*M) + m])
- tensor_write<acc_t>(output, [N,H,W,C*M], [n,oy,ox,c*M+m], acc)
+ acc = apply_add<acc_t>(acc, bias[(c * M) + m]);
+ tensor_write<acc_t>(output, [N,H,W,C * M], [n,oy,ox,c * M + m], acc);
}
----
@@ -302,10 +303,10 @@ for_each (0 <= n<N, 0 <= oy < H, 0 <= ox < W; 0 <= c < (C * M), 0 <= m < M) {
|===
|Profile|Mode|in_t|weight_t|acc_t
-|Any|signed 8x8|int8|int8|int32
-|Any|signed 8x4|int8|int4|int32
-|Any|signed 16x8|int16|int8|int48
-|MI, MT|float|float|float|float
+|Any|signed 8x8|int8_t|int8_t|int32_t
+|Any|signed 8x4|int8_t|int4_t|int32_t
+|Any|signed 16x8|int16_t|int8_t|int48_t
+|MI, MT|floating-point|float_t|float_t|float_t
|===
==== FULLY_CONNECTED
@@ -336,17 +337,17 @@ Performs a fully connected network.
[source,c]
----
-assert(in_t == int8_t || input_zp == 0) // Zero point only for int8
-assert(weight_t == int8_t || weight_zp == 0)
-for_each (0 <= n < N, 0 <= oc < OC) {
- acc_t acc = 0
- for_each (0 <= ic < IC) {
- in_t value = tensor_read<in_t>(input, [N,IC], [n,ic], input_zp)
- weight_t weight = tensor_read<weight_t>(weight, [OC,IC], [oc,ic], weight_zp)
- acc = apply_add<acc_t>(acc, value * weight)
+assert(in_t == int8_t || input_zp == 0); // Zero point only for int8_t
+assert(weight_t == int8_t || weight_zp == 0);
+for_each(0 <= n < N, 0 <= oc < OC) {
+ acc_t acc = 0;
+ for_each(0 <= ic < IC) {
+ in_t value = tensor_read<in_t>(input, [N,IC], [n,ic], input_zp);
+ weight_t weight = tensor_read<weight_t>(weight, [OC,IC], [oc,ic], weight_zp);
+ acc = apply_add<acc_t>(acc, value * weight);
}
- acc = apply_add<acc_t>(acc, bias[oc])
- tensor_write<acc_t>(output, [N,OC], [n,oc], acc)
+ acc = apply_add<acc_t>(acc, bias[oc]);
+ tensor_write<acc_t>(output, [N,OC], [n,oc], acc);
}
----
@@ -355,10 +356,10 @@ for_each (0 <= n < N, 0 <= oc < OC) {
|===
|Profile|Mode|in_t|weight_t|acc_t
-|Any|signed 8x8|int8|int8|int32
-|Any|signed 8x4|int8|int4|int32
-|Any|signed 16x8 |int16|int8|int48
-|MI, MT|float|float|float|float
+|Any|signed 8x8|int8_t|int8_t|int32_t
+|Any|signed 8x4|int8_t|int4_t|int32_t
+|Any|signed 16x8|int16_t|int8_t|int48_t
+|MI, MT|floating-point|float_t|float_t|float_t
|===
==== MATMUL
@@ -387,15 +388,15 @@ Performs two dimensional matrix multiplications. This allows both inputs to be a
[source,c]
----
-assert(in_t == int8_t || (A_zp == 0 && B_zp == 0)) // Zero point only for int8
-for_each (0 <= n < N, 0 <= h < H, 0 <= w < W) {
- acc_t acc = 0
- for_each (0 <= c < C) {
- in_t value1 = tensor_read<in_t>(A, [N,H,C], [n,h,c], A_zp)
- in_t value2 = tensor_read<in_t>(B, [N,C,W], [n,c,w], B_zp)
- acc = apply_add<acc_t>(acc, value1 * value2)
+assert(in_t == int8_t || (A_zp == 0 && B_zp == 0)); // Zero point only for int8_t
+for_each(0 <= n < N, 0 <= h < H, 0 <= w < W) {
+ acc_t acc = 0;
+ for_each(0 <= c < C) {
+ in_t value1 = tensor_read<in_t>(A, [N,H,C], [n,h,c], A_zp);
+ in_t value2 = tensor_read<in_t>(B, [N,C,W], [n,c,w], B_zp);
+ acc = apply_add<acc_t>(acc, value1 * value2);
}
- tensor_write<acc_t>(output, [N,H,W], [n,h,w], acc)
+ tensor_write<acc_t>(output, [N,H,W], [n,h,w], acc);
}
----
@@ -404,9 +405,9 @@ for_each (0 <= n < N, 0 <= h < H, 0 <= w < W) {
|===
|Profile|Mode|in_t|acc_t
-|Any|signed 8x8|int8|int32
-|Any|signed 16x16|int16|int48
-|MI, MT|float|float|float
+|Any|signed 8x8|int8_t|int32_t
+|Any|signed 16x16|int16_t|int48_t
+|MI, MT|floating-point|float_t|float_t
|===
==== MAX_POOL2D
@@ -421,7 +422,7 @@ This performs a max pooling over the given input tensor. A sliding window of siz
|Attribute|int*|kernel|[2]|[kernel_y, kernel_x]
|Attribute|int*|stride|[2]|[stride_y, stride_x]
|Attribute|int*|pad|[4]|[pad_top, pad_bottom, pad_left, pad_right]
-|Output|out_t*|output|[N,H,W,C]|Output tensor 4D
+|Output|in_t*|output|[N,H,W,C]|Output tensor 4D
|===
*Quantization Parameters:*
@@ -432,29 +433,29 @@ None
[source,c]
----
-pad=concat([0,0], pad, [0,0])
-for_each (0 <= n < N, 0 <= oy < H, 0 <= ox < W, 0 <= c < C ) {
+pad = flatten([0,0], pad, [0,0]);
+for_each(0 <= n < N, 0 <= oy < H, 0 <= ox < W, 0 <= c < C ) {
in_t acc = minimum_value<in_t>;
- iy = oy * stride_y - pad_top
- ix = ox * stride_x - pad_left
- for_each ( 0<=ky<kernel_y, 0<=kx<kernel_x ) {
- y = iy + ky
- x = ix + kx
- in_t value = tensor_read<in_t>(input, [N,IH,IW,IC], [n,y,x,c], pad)
- acc = apply_max(acc, value)
+ iy = oy * stride_y - pad_top;
+ ix = ox * stride_x - pad_left;
+    for_each(0 <= ky < kernel_y, 0 <= kx < kernel_x) {
+ y = iy + ky;
+ x = ix + kx;
+ in_t value = tensor_read<in_t>(input, [N,IH,IW,IC], [n,y,x,c], pad);
+ acc = apply_max(acc, value);
}
- tensor_write<out_t>(output, [N,H,W,OC], [n,oy,ox,oc], acc)
+    tensor_write<in_t>(output, [N,H,W,C], [n,oy,ox,c], acc);
}
----
*Supported Data Types:*
|===
-|Profile|Mode|in_t|out_t
+|Profile|Mode|in_t
-|Any|signed 8|int8|int8
-|Any|16-bit|int16|int16
-|MI, MT|float|float|float
+|Any|signed 8|int8_t
+|Any|16-bit|int16_t
+|MI, MT|floating-point|float_t
|===
==== TRANSPOSE_CONV2D
@@ -488,21 +489,21 @@ Performs a 2D transposed convolution over the given tensor input, using the weig
[source,c]
----
-assert(in_t == int8_t || input_zp == 0) // Zero point only allowed for int8
-assert(weight_t == int8_t || weight_zp == 0)
-for_each (index in out_shape) {
+assert(in_t == int8_t || input_zp == 0); // Zero point only allowed for int8_t
+assert(weight_t == int8_t || weight_zp == 0);
+for_each(index in out_shape) {
tensor_write<acc_t>(output, [N,OH,OW,OC], index, bias[index[3]])
}
-for_each (0 <= n < N, 0 <= iy < IH, 0 <= ix < IW, 0 <= oc < OC,
+for_each(0 <= n < N, 0 <= iy < IH, 0 <= ix < IW, 0 <= oc < OC,
0 <= ic < IC, 0 <= ky < KH, 0 <= kx < KW) {
- oy = iy * stride_y - out_pad_top + ky
- ox = ix * stride_x - out_pad_left + kx
- if (oy>=0 && oy<OH && ox>=0 && ox<OW) {
- acc_t acc = tensor_read<acc_t>(output, [N,OH,OW,OC], [n,oy,ox,oc])
- in_t value = tensor_read<in_t>(input, [N,IH,IW,IC], [n,iy,ix,ic], input_zp)
- weight_t weight = tensor_read<weight_t>(weight, [OC,KH,KW,IC], [oc,ky,kx,ic], weight_zp)
- acc = apply_add<acc_t>(acc, value * weight)
- tensor_write<acc_t>(output, [N,OH,OW,OC], [n,oy,ox,oc], acc)
+ oy = iy * stride_y - out_pad_top + ky;
+ ox = ix * stride_x - out_pad_left + kx;
+ if (oy >= 0 && oy < OH && ox >= 0 && ox < OW) {
+ acc_t acc = tensor_read<acc_t>(output, [N,OH,OW,OC], [n,oy,ox,oc]);
+ in_t value = tensor_read<in_t>(input, [N,IH,IW,IC], [n,iy,ix,ic], input_zp);
+ weight_t weight = tensor_read<weight_t>(weight, [OC,KH,KW,IC], [oc,ky,kx,ic], weight_zp);
+ acc = apply_add<acc_t>(acc, value * weight);
+ tensor_write<acc_t>(output, [N,OH,OW,OC], [n,oy,ox,oc], acc);
}
}
----
@@ -512,8 +513,8 @@ for_each (0 <= n < N, 0 <= iy < IH, 0 <= ix < IW, 0 <= oc < OC,
|===
|Profile|Mode|in_t|weight_t|acc_t
-|Any|signed 8x8|int8|int8|int32
-|Any|signed 8x4|int8|int4|int32
-|Any|signed 16x8|int16|int8|int48
-|MI, MT|float|float|float|float
+|Any|signed 8x8|int8_t|int8_t|int32_t
+|Any|signed 8x4|int8_t|int4_t|int32_t
+|Any|signed 16x8|int16_t|int8_t|int48_t
+|MI, MT|floating-point|float_t|float_t|float_t
|===
diff --git a/chapters/type_conversion.adoc b/chapters/type_conversion.adoc
index 37630e6..8f9e255 100644
--- a/chapters/type_conversion.adoc
+++ b/chapters/type_conversion.adoc
@@ -26,18 +26,18 @@ Casts a tensor from one data type to another.
[source,c]
....
-for_each (index in shape) {
+for_each(index in shape) {
in_t in = tensor_read<in_t>(input, shape, index);
out_t out;
- if (out_t==bool) {
- out = (in!=0) ? true : false;
- } else if (in_t==bool) {
+ if (out_t == bool_t) {
+ out = (in != 0) ? true : false;
+ } else if (in_t == bool_t) {
out = (in) ? 1 : 0;
- } else if (out_t==float) {
+ } else if (out_t == float_t) {
out = round_to_nearest_float(in);
- } else if (in_t==float) {
- out = apply_clip(round_to_nearest_int(in), minimum<out_t>, maximum<out_t>);
- } else if (sizeof(out_t)>=sizeof(in_t)) {
+ } else if (in_t == float_t) {
+ out = apply_clip<out_t>(round_to_nearest_int(in), minimum<out_t>, maximum<out_t>);
+ } else if (sizeof(out_t) >= sizeof(in_t)) {
out = sign_extend(in);
} else {
out = truncate(in);
@@ -51,24 +51,24 @@ for_each (index in shape) {
|===
|Profile|Mode|in_t|out_t
-|Any|bool to signed 8|bool|int8
-|Any|bool to signed 16|bool|int16
-|Any|bool to signed 32|bool|int32
-|Any|signed 8 to bool|int8|bool
-|Any|signed 8 to signed 16|int8|int16
-|Any|signed 8 to signed 32|int8|int32
-|Any|signed 8 to float|int8|float
-|Any|signed 16 to bool|int16|bool
-|Any|signed 16 to signed 8|int16|int8
-|Any|signed 16 to signed 32|int16|int32
-|Any|signed 16 to float|int16|float
-|Any|signed 32 to bool|int32|bool
-|Any|signed 32 to signed 8|int32|int8
-|Any|signed 32 to signed 16|int32|int16
-|Any|signed 32 to float|int32|float
-|Any|float to signed 8|float|int8
-|Any|float to signed 16|float|int16
-|Any|float to signed 32|float|int32
+|Any|bool to signed 8|bool_t|int8_t
+|Any|bool to signed 16|bool_t|int16_t
+|Any|bool to signed 32|bool_t|int32_t
+|Any|signed 8 to bool|int8_t|bool_t
+|Any|signed 8 to signed 16|int8_t|int16_t
+|Any|signed 8 to signed 32|int8_t|int32_t
+|Any|signed 8 to floating-point|int8_t|float_t
+|Any|signed 16 to bool|int16_t|bool_t
+|Any|signed 16 to signed 8|int16_t|int8_t
+|Any|signed 16 to signed 32|int16_t|int32_t
+|Any|signed 16 to floating-point|int16_t|float_t
+|Any|signed 32 to bool|int32_t|bool_t
+|Any|signed 32 to signed 8|int32_t|int8_t
+|Any|signed 32 to signed 16|int32_t|int16_t
+|Any|signed 32 to floating-point|int32_t|float_t
+|Any|floating-point to signed 8|float_t|int8_t
+|Any|floating-point to signed 16|float_t|int16_t
+|Any|floating-point to signed 32|float_t|int32_t
|===
==== RESCALE
@@ -93,26 +93,26 @@ Rescale quantized values into a new domain. This function scales by factor: mult
|Attribute|out_t|output_zp|-|Output tensor zero point
|Input (MT profile) Attribute (BI/MI profiles)|mul_t|multiplier[NC]|-|Scaling multiplier array
|Input (MT profile) Attribute (BI/MI profiles)|uint6_t|shift[NC] |-|Scaling shift array
-|Input (MT profile) Attribute (BI/MI profiles)|bool|scale32|-|if (scale32) mul_t=int32_t else mul_t=int16_t
-|Attribute|bool|double_round|-|Select double round mode
-|Attribute|bool|per_channel|-|if (per_channel) NC=shape[dims-1] else NC=1
+|Input (MT profile) Attribute (BI/MI profiles)|bool_t|scale32|-|if (scale32) mul_t=int32_t else mul_t=int16_t
+|Attribute|bool_t|double_round|-|Select double round mode
+|Attribute|bool_t|per_channel|-|if (per_channel) NC=shape[dims-1] else NC=1
|===
*Operation Function:*
[source,c]
....
-for_each (index in shape) {
- assert(in_t == int8 || in_t == uint8 || input_zp == 0);
- assert(out_t == int8 || out_t == uint8 || output_zp == 0);
- assert((scale32 && in_t!=int48_t) || (!scale32 && !double_round));
- int48_t value = tensor_read<in_t>(input, shape, index, input_zp);
+for_each(index in shape) {
+ assert(in_t == int8_t || in_t == uint8_t || input_zp == 0);
+ assert(out_t == int8_t || out_t == uint8_t || output_zp == 0);
+    assert((scale32 && in_t != int48_t) || (!scale32 && !double_round));
+    int48_t value = tensor_read<in_t>(input, shape, index, input_zp);
int c = (per_channel) ? index[dims-1] : 0;
int32_t result = (scale32) ?
- apply_scale_32(value, multiplier[c], shift[c], double_round) :
- apply_scale_16(value, multiplier[c], shift[c]);
- result = apply_clip(result + output_zp, minimum<out_t>, maximum<out_t>)
- tensor_write<out_t>(output, shape, index, result)
+ apply_scale_32(value, multiplier[c], shift[c], double_round) :
+ apply_scale_16(value, multiplier[c], shift[c]);
+ result = apply_clip<out_t>(result + output_zp, minimum<out_t>, maximum<out_t>);
+ tensor_write<out_t>(output, shape, index, result);
}
....
@@ -121,18 +121,18 @@ for_each (index in shape) {
|===
|Profile|Mode|in_t|out_t
-|Any|signed 8 to signed 8|int8|int8
-|Any|signed 8 to signed 16|int8|int16
-|Any|signed 8 to signed 32|int8|int32
-|Any|signed 16 to signed 8|int16|int8
-|Any|signed 16 to signed 16|int16|int16
-|Any|signed 16 to signed 32|int16|int32
-|Any|signed 32 to signed 8|int32|int8
-|Any|signed 32 to signed 16|int32|int16
-|Any|signed 32 to signed 32|int32|int32
-|Any|signed 48 to signed 8|int48|int8
-|Any|signed 48 to signed 16|int48|int16
-|Any|signed 48 to signed 32|int48|int32
-|Any|unsigned 8 to signed 8|uint8|int8
-|Any|signed 8 to unsigned 8|int8|uint8
+|Any|signed 8 to signed 8|int8_t|int8_t
+|Any|signed 8 to signed 16|int8_t|int16_t
+|Any|signed 8 to signed 32|int8_t|int32_t
+|Any|signed 16 to signed 8|int16_t|int8_t
+|Any|signed 16 to signed 16|int16_t|int16_t
+|Any|signed 16 to signed 32|int16_t|int32_t
+|Any|signed 32 to signed 8|int32_t|int8_t
+|Any|signed 32 to signed 16|int32_t|int16_t
+|Any|signed 32 to signed 32|int32_t|int32_t
+|Any|signed 48 to signed 8|int48_t|int8_t
+|Any|signed 48 to signed 16|int48_t|int16_t
+|Any|signed 48 to signed 32|int48_t|int32_t
+|Any|unsigned 8 to signed 8|uint8_t|int8_t
+|Any|signed 8 to unsigned 8|int8_t|uint8_t
|===