From 82f19e2ad25bcbdde8e7f8b6bd6a6064a207fe36 Mon Sep 17 00:00:00 2001 From: Eric Kunze Date: Mon, 25 Oct 2021 16:13:22 -0700 Subject: Readability fixes for pseudocode Avoid use of acc for variables when they are not convolution accumulators. Use argument types appropriately. Add missing pseudocode for some MI operators Change-Id: I9113f9228dbcafb85206bcc39310e9599cb12c08 --- chapters/comparison.adoc | 26 +++++++------- chapters/ewise_binary.adoc | 88 ++++++++++++++++++++++++++------------------- chapters/ewise_ternary.adoc | 16 ++++----- chapters/ewise_unary.adoc | 79 ++++++++++++++++++++++++++++++++++++---- chapters/image.adoc | 4 +-- chapters/pseudocode.adoc | 74 ++++++++++++++++++++++++-------------- chapters/reduction.adoc | 76 ++++++++++++++++++++------------------- chapters/tensor_ops.adoc | 8 ++--- 8 files changed, 238 insertions(+), 133 deletions(-) diff --git a/chapters/comparison.adoc b/chapters/comparison.adoc index 43f0787..ad574fb 100644 --- a/chapters/comparison.adoc +++ b/chapters/comparison.adoc @@ -1,7 +1,7 @@ // // This confidential and proprietary software may be used only as // authorised by a licensing agreement from ARM Limited -// (C) COPYRIGHT 2020 ARM Limited +// (C) COPYRIGHT 2020-2021 ARM Limited // ALL RIGHTS RESERVED // The entire notice above must be reproduced on all authorised // copies and copies may only be made to the extent permitted @@ -30,10 +30,10 @@ Elementwise comparison operation for_each(index in shape) { index1 = apply_broadcast(shape, shape1, index); index2 = apply_broadcast(shape, shape2, index); - int32_t value1 = tensor_read(input1, shape1, index1); - int32_t value2 = tensor_read(input2, shape2, index2); - bool_t acc = (value1 == value2) ? True : False; - tensor_write(output, shape, index, acc); + in_t value1 = tensor_read(input1, shape1, index1); + in_t value2 = tensor_read(input2, shape2, index2); + out_t result = (value1 == value2) ? 
True : False; + tensor_write(output, shape, index, result); } ---- @@ -67,10 +67,10 @@ Elementwise greater than comparison operation for_each(index in shape) { index1 = apply_broadcast(shape, shape1, index); index2 = apply_broadcast(shape, shape2, index); - int32_t value1 = tensor_read(input1, shape1, index1); - int32_t value2 = tensor_read(input2, shape2, index2); - bool_t acc = (value1 > value2) ? True : False; - tensor_write(output, shape, index, acc); + in_t value1 = tensor_read(input1, shape1, index1); + in_t value2 = tensor_read(input2, shape2, index2); + out_t result = (value1 > value2) ? True : False; + tensor_write(output, shape, index, result); } ---- @@ -103,10 +103,10 @@ Elementwise comparison operation for_each(index in shape) { index1 = apply_broadcast(shape, shape1, index); index2 = apply_broadcast(shape, shape2, index); - int32_t value1 = tensor_read(input1, shape1, index1); - int32_t value2 = tensor_read(input2, shape2, index2); - bool_t acc = (value1 >= value2) ? True : False; - tensor_write(output, shape, index, acc); + in_t value1 = tensor_read(input1, shape1, index1); + in_t value2 = tensor_read(input2, shape2, index2); + out_t result = (value1 >= value2) ? 
True : False; + tensor_write(output, shape, index, result); } ---- diff --git a/chapters/ewise_binary.adoc b/chapters/ewise_binary.adoc index f44f7f5..4173aab 100644 --- a/chapters/ewise_binary.adoc +++ b/chapters/ewise_binary.adoc @@ -33,8 +33,8 @@ for_each(index in shape) { index2 = apply_broadcast(shape, shape2, index); in_t value1 = tensor_read(input1, shape1, index1); in_t value2 = tensor_read(input2, shape2, index2); - in_t acc = apply_add(value1, value2); - tensor_write(output, shape, index, acc); + in_t result = apply_add(value1, value2); + tensor_write(output, shape, index, result); ---- *Supported Data Types:* @@ -77,12 +77,12 @@ for_each(index in shape) { (in_t == int16_t && 0 <= value2 && value2 <= 15) || (in_t == int8_t && 0 <= value2 && value2 <= 7)); - in_t acc = value1 >> value2; + in_t result = value1 >> value2; if (round == true && value2 > 0 && (value1 >> (value2 - 1)) & 1 != 0) { - acc = acc + 1; + result = result + 1; } - acc = apply_clip(acc, minimum, maximum) - tensor_write(output, shape, index, acc) + result = apply_clip(result, minimum, maximum); + tensor_write(output, shape, index, result); } ---- @@ -120,8 +120,8 @@ for_each(index in shape) { index2 = apply_broadcast(shape, shape2, index); in_t value1 = tensor_read(input1, shape1, index1); in_t value2 = tensor_read(input2, shape2, index2); - in_t acc = value1 & value2; - tensor_write(output, shape, index, acc); + in_t result = value1 & value2; + tensor_write(output, shape, index, result); } ---- @@ -159,8 +159,8 @@ for_each(index in shape) { index2 = apply_broadcast(shape, shape2, index); in_t value1 = tensor_read(input1, shape1, index1); in_t value2 = tensor_read(input2, shape2, index2); - in_t acc = value1 | value2; - tensor_write(output, shape, index, acc); + in_t result = value1 | value2; + tensor_write(output, shape, index, result); } ---- @@ -198,8 +198,8 @@ for_each(index in shape) { index2 = apply_broadcast(shape, shape2, index); in_t value1 = tensor_read(input1, shape1, index1); 
in_t value2 = tensor_read(input2, shape2, index2); - in_t acc = value1 ^ value2; - tensor_write(output, shape, index, acc); + in_t result = value1 ^ value2; + tensor_write(output, shape, index, result); } ---- @@ -244,8 +244,8 @@ for_each(index in shape) { // This catches the case where we divide minimum by -1 // which is not representable in two's complement REQUIRE((int64_t)value1 / value2 <= maximum); - in_t acc = value1 / value2; - tensor_write(output, shape, index, acc); + in_t result = value1 / value2; + tensor_write(output, shape, index, result); } ---- @@ -280,8 +280,8 @@ for_each(index in shape) { index2 = apply_broadcast(shape, shape2, index); in_t value1 = tensor_read(input1, shape1, index1); in_t value2 = tensor_read(input2, shape2, index2); - in_t acc = value1 && value2; - tensor_write(output, shape, index, acc); + in_t result = value1 && value2; + tensor_write(output, shape, index, result); } ---- @@ -318,8 +318,8 @@ for_each(index in shape) { in_t value1 = tensor_read(input1, shape1, index1); in_t value2 = tensor_read(input2, shape2, index2); REQUIRE(0 <= value2 && value2 <= 31); - in_t acc = value1 << value2; - tensor_write(output, shape, index, acc); + in_t result = value1 << value2; + tensor_write(output, shape, index, result); } ---- @@ -358,8 +358,8 @@ for_each(index in shape) { in_t value1 = tensor_read(input1, shape1, index1); in_t value2 = tensor_read(input2, shape2, index2); REQUIRE(0 <= value2 && value2 <= 31); - in_t acc = (unsigned in_t)value1 >> value2; - tensor_write(output, shape, index, acc); + in_t result = (in_t)((unsigned in_t)value1 >> value2); + tensor_write(output, shape, index, result); } ---- @@ -397,8 +397,8 @@ for_each(index in shape) { index2 = apply_broadcast(shape, shape2, index); in_t value1 = tensor_read(input1, shape1, index1); in_t value2 = tensor_read(input2, shape2, index2); - in_t acc = value1 || value2; - tensor_write(output, shape, index, acc); + in_t result = value1 || value2; + tensor_write(output, shape, 
index, result); } ---- @@ -434,8 +434,8 @@ for_each(index in shape) { index2 = apply_broadcast(shape, shape2, index); in_t value1 = tensor_read(input1, shape1, index1); in_t value2 = tensor_read(input2, shape2, index2); - in_t acc = value1 != value2; - tensor_write(output, shape, index, acc); + in_t result = value1 != value2; + tensor_write(output, shape, index, result); } ---- @@ -471,8 +471,8 @@ for_each(index in shape) { index2 = apply_broadcast(shape, shape2, index); in_t value1 = tensor_read(input1, shape1, index1); in_t value2 = tensor_read(input2, shape2, index2); - in_t acc = apply_max(value1, value2); - tensor_write(output, shape, index, acc); + in_t result = apply_max(value1, value2); + tensor_write(output, shape, index, result); } ---- @@ -509,8 +509,8 @@ for_each(index in shape) { index2 = apply_broadcast(shape, shape2, index); in_t value1 = tensor_read(input1, shape1, index1); in_t value2 = tensor_read(input2, shape2, index2); - in_t acc = apply_min(value1, value2); - tensor_write(output, shape, index, acc); + in_t result = apply_min(value1, value2); + tensor_write(output, shape, index, result); } ---- @@ -548,12 +548,13 @@ for_each(index in shape) { index2 = apply_broadcast(shape, shape2, index); in_t value1 = tensor_read(input1, shape1, index1); in_t value2 = tensor_read(input2, shape2, index2); + out_t result; if (in_t == int32_t && shift > 0) { - out_t acc = apply_scale_32(value1, value2, shift); + result = apply_scale_32(value1, value2, shift); } else { - out_t acc = value1 * value2; // low 32-bits of result for int32_t + result = value1 * value2; // low 32-bits of result for int32_t } - tensor_write(output, shape, index, acc); + tensor_write(output, shape, index, result); } ---- @@ -582,6 +583,20 @@ Axis of size 1 will be broadcast, as necessary. 
Rank of input tensors must match |Output|in_t*|output|shape|Output tensor of same type as the input tensors, with broadcast shape if necessary |=== +*Operation Function:* + +[source,c++] +---- +for_each(index in shape) { + index1 = apply_broadcast(shape, shape1, index); + index2 = apply_broadcast(shape, shape2, index); + in_t value1 = tensor_read(input1, shape1, index1); + in_t value2 = tensor_read(input2, shape2, index2); + in_t result = apply_pow(value1, value2); + tensor_write(output, shape, index, result); +} +---- + *Supported Data Types:* |=== @@ -614,8 +629,8 @@ for_each(index in shape) { index2 = apply_broadcast(shape, shape2, index); in_t value1 = tensor_read(input1, shape1, index1); in_t value2 = tensor_read(input2, shape2, index2); - in_t acc = apply_sub(value1, value2); - tensor_write(output, shape, index, acc); + in_t result = apply_sub(value1, value2); + tensor_write(output, shape, index, result); } ---- @@ -661,13 +676,14 @@ An int16_t to int16_t table lookup can be constructed in TOSA as follows: REQUIRE(length(table) == TABLE_SIZE); for_each(index in shape) { in_t value = tensor_read(input, shape, index); + out_t result; if (in_t == int8_t) { // value is a signed int, convert to a 0 based index - out_t acc = table[value + 128]; + result = table[value + 128]; } else { - out_t acc = apply_lookup(table, value); + result = apply_lookup(table, value); } - tensor_write(output, shape, index, acc); + tensor_write(output, shape, index, result); } ---- diff --git a/chapters/ewise_ternary.adoc b/chapters/ewise_ternary.adoc index ecf40d1..c6babbc 100644 --- a/chapters/ewise_ternary.adoc +++ b/chapters/ewise_ternary.adoc @@ -18,7 +18,7 @@ Elementwise select of the output based on a condition. 
|=== |Argument|Type|Name|Shape|Description -|Input|bool_t|input1|shape1|Input selector tensor +|Input|cmp_t|input1|shape1|Input selector tensor |Input|in_t*|input2|shape2|Input value tensor if input1 is True |Input|in_t*|input3|shape3|Input value tensor if input1 is False |Output|in_t*|output|shape|Output tensor of same type as input2 and input3, with broadcast shape if necessary |=== @@ -32,22 +32,22 @@ for_each(index in shape) { index1 = apply_broadcast(shape, shape1, index); index2 = apply_broadcast(shape, shape2, index); index3 = apply_broadcast(shape, shape3, index); - bool_t value1 = tensor_read(input1, shape1, index1); + cmp_t value1 = tensor_read(input1, shape1, index1); in_t value2 = tensor_read(input2, shape2, index2); in_t value3 = tensor_read(input3, shape3, index3); - in_t acc = 0; - if (value1 == True) { - acc = value2; + in_t result; + if (value1) { + result = value2; } else { - acc = value3; + result = value3; } - tensor_write(output, shape, index, acc); + tensor_write(output, shape, index, result); } ---- *Supported Data Types:* |=== -|Profile|Mode|bool_t|in_t +|Profile|Mode|cmp_t|in_t |Any|Boolean|bool_t|bool_t |Any|signed 8|bool_t|int8_t diff --git a/chapters/ewise_unary.adoc b/chapters/ewise_unary.adoc index e2b754a..633b8ac 100644 --- a/chapters/ewise_unary.adoc +++ b/chapters/ewise_unary.adoc @@ -62,8 +62,8 @@ Elementwise bitwise NOT of input tensor.
---- for_each(index in shape) { in_t value1 = tensor_read(input1, shape, index); - in_t acc = ~value1; - tensor_write(output, shape, index, acc); + in_t result = ~value1; + tensor_write(output, shape, index, result); } ---- @@ -90,6 +90,17 @@ Elementwise ceiling operation |Output|in_t*|output|shape|Output tensor of same type, size as the input tensor |=== +*Operation Function:* + +[source,c++] +---- +for_each(index in shape) { + in_t value1 = tensor_read(input1, shape, index); + in_t result = apply_ceil(value1); + tensor_write(output, shape, index, result); +} +---- + *Supported Data Types:* |=== @@ -116,10 +127,9 @@ Elementwise count leading zeros operation [source,c++] ---- for_each(index in shape) { - in_t acc = 0; in_t value1 = tensor_read(input1, shape, index); - acc = count_leading_zeros(value1); - tensor_write(output, shape, index, acc); + in_t result = count_leading_zeros(value1); + tensor_write(output, shape, index, result); } ---- @@ -143,6 +153,17 @@ Elementwise e to the x operation |Output|in_t*|output|shape|Output tensor of same type, size as the input tensor |=== +*Operation Function:* + +[source,c++] +---- +for_each(index in shape) { + in_t value1 = tensor_read(input1, shape, index); + in_t result = apply_exp(value1); + tensor_write(output, shape, index, result); +} +---- + *Supported Data Types:* |=== @@ -164,6 +185,17 @@ Elementwise floor operation |Output|in_t*|output|shape|Output tensor of same type, size as the input tensor |=== +*Operation Function:* + +[source,c++] +---- +for_each(index in shape) { + in_t value1 = tensor_read(input1, shape, index); + in_t result = apply_floor(value1); + tensor_write(output, shape, index, result); +} +---- + *Supported Data Types:* |=== @@ -185,6 +217,17 @@ Elementwise natural logarithm operation |Output|in_t*|output|shape|Output tensor of same type, size as the input tensor |=== +*Operation Function:* + +[source,c++] +---- +for_each(index in shape) { + in_t value1 = tensor_read(input1, shape, index); + in_t 
result = apply_log(value1); + tensor_write(output, shape, index, result); +} +---- + *Supported Data Types:* |=== @@ -212,8 +255,8 @@ Elementwise logical NOT of input. ---- for_each(index in shape) { in_t value1 = tensor_read(input1, shape1, index); - in_t acc = !value1; - tensor_write(output, shape, index, acc); + in_t result = !value1; + tensor_write(output, shape, index, result); } ---- @@ -279,6 +322,17 @@ Elementwise reciprocal operation. For integer operation, a TABLE should be used |Output|in_t*|output|shape|Output tensor of same type, size as the input tensor |=== +*Operation Function:* + +[source,c++] +---- +for_each(index in shape) { + in_t value1 = tensor_read(input1, shape1, index); + in_t result = 1.0 / value1; + tensor_write(output, shape, index, result); +} +---- + *Supported Data Types:* |=== @@ -300,6 +354,17 @@ Elementwise reciprocal square root operation. For integer operation, a TABLE sho |Output|in_t*|output|shape|Output tensor of same type, size as the input tensor |=== +*Operation Function:* + +[source,c++] +---- +for_each(index in shape) { + in_t value1 = tensor_read(input1, shape1, index); + in_t result = 1.0 / apply_sqrt(value1); + tensor_write(output, shape, index, result); +} +---- + *Supported Data Types:* |=== diff --git a/chapters/image.adoc b/chapters/image.adoc index 2491ea5..7476d8a 100644 --- a/chapters/image.adoc +++ b/chapters/image.adoc @@ -74,8 +74,8 @@ for_each(0 <= n < N, 0 <= oy < OH, 0 <= ox < OW; 0 <= c < C) { y = oy * stride_y + offset_y; x = ox * stride_x + offset_x; if (resize_t == float_t) { - iy = (int)floor(y); dy = y - (float_t)iy; - ix = (int)floor(x); dx = x - (float_t)ix; + iy = (int)apply_floor(y); dy = y - (float_t)iy; + ix = (int)apply_floor(x); dx = x - (float_t)ix; } else { iy = y >> shift; dy = y - (iy<> shift; dx = x - (ix<(acc_t a, acc_t b) { - if (acc_t == float_t) return a + b; +in_t apply_add(in_t a, in_t b) { + if ( == float_t) return a + b; int64_t c = (int64_t)a + (int64_t)b; - REQUIRE(c >= minimum 
&& c <= maximum); - return (acc_t)c; + REQUIRE(c >= minimum && c <= maximum); + return (in_t)c; } -acc_t apply_sub(acc_t a, acc_t b) { - if (acc_t == float_t) return a - b; - int64_t c = (int64_t)a - (int64_t)b; - REQUIRE(c >= minimum && c <= maximum); - return (acc_t)c; +in_t apply_ceil(in_t input) { + return input value rounded up to nearest integer } ----- -The following functions are used in the pseudocode to take maximum, -minimum, clip values to a range, or count leading zeros. -[[count_leading_zeros]] -[source,c++] ----- - apply_max( a, b) { +in_t apply_clip(in_t value, in_t min_val, in_t max_val) { + REQUIRE(min_val <= max_val); + value = apply_max(value, min_val); + value = apply_min(value, max_val); + return value; +} + +in_t apply_exp(in_t input) { + return e to the power input +} + +in_t apply_floor(in_t input) { + return input value rounded down to nearest integer +} + +in_t apply_log(in_t input) { + return the natural logarithm of input +} + +in_t apply_max(in_t a, in_t b) { if (a >= b) return a; else return b; } - apply_min( a, b) { +in_t apply_min(in_t a, in_t b) { if (a < b) return a; else return b; } - apply_clip( value, min_val, max_val) { - REQUIRE(min_val <= max_val); - value = apply_max(value, min_val); - value = apply_min(value, max_val); - return value; +in_t apply_pow(in_t a, in_t b) { + return a ** b; // a raised to the power b +} + +in_t apply_sqrt(in_t input) { + return the square root of input +} + +in_t apply_sub(in_t a, in_t b) { + if (in_t == float_t) return a - b; + int64_t c = (int64_t)a - (int64_t)b; + REQUIRE(c >= minimum && c <= maximum); + return (in_t)c; } int32_t count_leading_zeros(int32_t a) { @@ -146,15 +164,17 @@ Generic helper functions used to keep the pseudocode concise. 
[source,c++] ---- + +int idiv(int input1, int input2) { + return input1 / input2; // Integer divide that truncates towards zero +} + int length(in_t input) return number of elements in input list -int floor(in_t input) - return input value rounded down to nearest integer - int rank(in_t input) return rank of an input tensor int sum(in_t input[]) return the sum of values of an input list ----- \ No newline at end of file +---- diff --git a/chapters/reduction.adoc b/chapters/reduction.adoc index b687896..11db960 100644 --- a/chapters/reduction.adoc +++ b/chapters/reduction.adoc @@ -29,16 +29,18 @@ Reduce a tensor along the given axis with a logical AND operation ---- ERROR_IF(axis < 0 || axis >= rank(shape1)); ERROR_IF(shape[axis] != 1); + +// Initialize output state to true for_each(index in shape) { tensor_write(output, shape, index, true); } for_each(index in shape1) { - tmp_index = index; - tmp_index[axis]=0; - value = tensor_read(input, shape1, index); - acc = tensor_read(output, shape, tmp_index); - acc = acc && value; - tensor_write(output, shape, tmp_index, acc); + out_index = index; + out_index[axis] = 0; + in_t value = tensor_read(input, shape1, index); + in_t state = tensor_read(output, shape, out_index); + state = state && value; + tensor_write(output, shape, out_index, state); } ---- @@ -70,16 +72,18 @@ Reduce a tensor along the given axis with a logical OR operation ---- ERROR_IF(axis < 0 || axis >= rank(shape1)); ERROR_IF(shape[axis] != 1); + +// Initialize output state to false for_each(index in shape) { tensor_write(output, shape, index, false); } for_each(index in shape1) { - tmp_index = index; - tmp_index[axis]=0; - value = tensor_read(input, shape1, index); - acc = tensor_read(output, shape, tmp_index); - acc = acc || value; - tensor_write(output, shape, tmp_index, acc); + out_index = index; + out_index[axis] = 0; + in_t value = tensor_read(input, shape1, index); + in_t state = tensor_read(output, shape, out_index); + state = state || value; + 
tensor_write(output, shape, out_index, state); } ---- @@ -115,12 +119,12 @@ for_each(index in shape) { tensor_write(output, shape, index, minimum); } for_each(index in shape1) { - tmp_index = index; - tmp_index[axis]=0; - value = tensor_read(input, shape1, index); - acc = tensor_read(output, shape, tmp_index); - acc = apply_max(acc, value); - tensor_write(output, shape, tmp_index, acc); + out_index = index; + out_index[axis] = 0; + in_t value = tensor_read(input, shape1, index); + in_t state = tensor_read(output, shape, out_index); + state = apply_max(state, value); + tensor_write(output, shape, out_index, state); } ---- @@ -158,12 +162,12 @@ for_each(index in shape) { tensor_write(output, shape, index, maximum); } for_each(index in shape1) { - tmp_index = index; - tmp_index[axis]=0; - value = tensor_read(input, shape1, index); - acc = tensor_read(output, shape, tmp_index); - acc = apply_min(acc, value); - tensor_write(output, shape, tmp_index, acc); + out_index = index; + out_index[axis] = 0; + in_t value = tensor_read(input, shape1, index); + in_t state = tensor_read(output, shape, out_index); + state = apply_min(state, value); + tensor_write(output, shape, out_index, state); } ---- @@ -202,12 +206,12 @@ for_each(index in shape) { tensor_write(output, shape, index, 1.0); } for_each(index in shape1) { - tmp_index = index; - tmp_index[axis]=0; - value = tensor_read(input, shape1, index); - acc = tensor_read(output, shape, tmp_index); - acc = acc * value; - tensor_write(output, shape, tmp_index, acc); + out_index = index; + out_index[axis] = 0; + in_t value = tensor_read(input, shape1, index); + in_t state = tensor_read(output, shape, out_index); + state = state * value; + tensor_write(output, shape, out_index, state); } ---- @@ -243,12 +247,12 @@ for_each(index in shape) { tensor_write(output, shape, index, 0); } for_each(index in shape1) { - tmp_index = index; - tmp_index[axis]=0; - value = tensor_read(input, shape1, index); - acc = tensor_read(output, shape, 
tmp_index); - acc = apply_add(acc, value); - tensor_write(output, shape, tmp_index, acc); + out_index = index; + out_index[axis] = 0; + in_t value = tensor_read(input, shape1, index); + in_t state = tensor_read(output, shape, out_index); + state = apply_add(state, value); + tensor_write(output, shape, out_index, state); } ---- diff --git a/chapters/tensor_ops.adoc b/chapters/tensor_ops.adoc index d7ced25..cfab5ba 100644 --- a/chapters/tensor_ops.adoc +++ b/chapters/tensor_ops.adoc @@ -98,8 +98,8 @@ ERROR_IF(pad_right >= kernel_x || pad_left >= kernel_x); ERROR_IF(pad_top >= kernel_y || pad_bottom >= kernel_y); // Output shape must match expected shape given the input shape // and arguments provided -ERROR_IF(H != floor((IH + pad_top + pad_bottom + stride_y - kernel_y) / stride_y)) -ERROR_IF(W != floor((IW + pad_left + pad_right + stride_x - kernel_x) / stride_x)) +ERROR_IF(H != idiv((IH + pad_top + pad_bottom + stride_y - kernel_y), stride_y)); +ERROR_IF(W != idiv((IW + pad_left + pad_right + stride_x - kernel_x), stride_x)); for_each(0 <= n < N, 0 <= oy < H, 0 <= ox < W, 0 <= c < C ) { in_t output_val; @@ -444,8 +444,8 @@ ERROR_IF(pad_right >= kernel_x || pad_left >= kernel_x); ERROR_IF(pad_top >= kernel_y || pad_bottom >= kernel_y); // Output shape must match expected shape given the input shape // and arguments provided -ERROR_IF(H != floor((IH + pad_top + pad_bottom + stride_y - kernel_y) / stride_y)) -ERROR_IF(W != floor((IW + pad_left + pad_right + stride_x - kernel_x) / stride_x)) +ERROR_IF(H != idiv((IH + pad_top + pad_bottom + stride_y - kernel_y), stride_y)); +ERROR_IF(W != idiv((IW + pad_left + pad_right + stride_x - kernel_x), stride_x)); for_each(0 <= n < N, 0 <= oy < H, 0 <= ox < W, 0 <= c < C ) { in_t acc = minimum_value; -- cgit v1.2.1