about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r--  chapters/activation_funcs.adoc | 16
-rw-r--r--  chapters/data_layout.adoc      | 14
-rw-r--r--  chapters/ewise_unary.adoc      | 18
-rw-r--r--  chapters/introduction.adoc     | 38
-rw-r--r--  chapters/tensor_ops.adoc       | 28
-rw-r--r--  chapters/type_conversion.adoc  |  4
6 files changed, 62 insertions, 56 deletions
diff --git a/chapters/activation_funcs.adoc b/chapters/activation_funcs.adoc
index 5af849d..7a4a7b6 100644
--- a/chapters/activation_funcs.adoc
+++ b/chapters/activation_funcs.adoc
@@ -27,8 +27,8 @@ Clamp to an arbitrary minimum and maximum value. Note that the maximum and minim
*Operation Function:*
....
for_each(index in shape) {
- value = tensor_read<in_t>(input, shape, index);
- acc = apply_clip<in_t>(value, min_val, max_val);
+ acc_t value = tensor_read<in_t>(input, shape, index);
+ acc = (in_t)apply_clip<acc_t>(value, min_val, max_val);
tensor_write<in_t>(output, shape, index, acc);
}
....
@@ -36,11 +36,11 @@ for_each(index in shape) {
*Supported Data Types:*
|===
-|Profile|Mode|in_t
+|Profile|Mode|in_t|acc_t
-|Any|signed 8|int8_t
-|Any|signed 16|int16_t
-|MI, MT|floating-point|float_t
+|Any|signed 8|int8_t|int16_t
+|Any|signed 16|int16_t|int16_t
+|MI, MT|floating-point|float_t|float_t
|===
==== RELUN
@@ -63,8 +63,8 @@ ReLU with a scalar maximum value.
----
for_each(index in shape) {
in_t value = tensor_read<in_t>(input, shape, index);
- acc = apply_clip<in_t>(value, 0, max_val);
- tensor_write<in_t>(output, shape, index, acc);
+ value = apply_clip<in_t>(value, 0, max_val);
+ tensor_write<in_t>(output, shape, index, value);
}
----
diff --git a/chapters/data_layout.adoc b/chapters/data_layout.adoc
index 67484cb..b5b5112 100644
--- a/chapters/data_layout.adoc
+++ b/chapters/data_layout.adoc
@@ -86,7 +86,7 @@ for_each(index in shape) {
for(i = 0; i < rank(shape); i++) {
index1[i] = index1[i] - padding[i,0];
}
- in_t value = tensor_read<in_t>(input1, shape1, index1, input1_zp, padding);
+ acc_t value = tensor_read<in_t>(input1, shape1, index1, input1_zp, padding);
tensor_write<in_t>(output, shape, index, value + input1_zp);
}
----
@@ -94,13 +94,13 @@ for_each(index in shape) {
*Supported Data Types:*
|===
-|Profile|Mode|in_t
+|Profile|Mode|in_t|acc_t
-|Any|Boolean|bool_t
-|Any|signed 8|int8_t
-|Any|signed 16|int16_t
-|Any|signed 32|int32_t
-|MI, MT|floating-point|float_t
+|Any|Boolean|bool_t|bool_t
+|Any|signed 8|int8_t|int16_t
+|Any|signed 16|int16_t|int16_t
+|Any|signed 32|int32_t|int32_t
+|MI, MT|floating-point|float_t|float_t
|===
==== RESHAPE
diff --git a/chapters/ewise_unary.adoc b/chapters/ewise_unary.adoc
index d852fa4..3784274 100644
--- a/chapters/ewise_unary.adoc
+++ b/chapters/ewise_unary.adoc
@@ -262,22 +262,22 @@ Elementwise negation operation
assert(in_t == int8_t || input1_zp == 0) // Zero point only for int8_t
assert(in_t == int8_t || output_zp == 0) // Zero point only for int8_t
for_each(index in shape) {
- in_t value1 = tensor_read<in_t>(input1, shape, index, input1_zp);
- in_t acc = apply_sub<in_t>(0, value1);
- acc = apply_clip<in_t>(acc + output_zp, minimum<in_t>, maximum<in_t>);
- tensor_write<in_t>(output, shape, index, acc);
+ acc_t acc = tensor_read<in_t>(input1, shape, index, input1_zp);
+ acc = apply_sub<acc_t>(0, acc);
+ in_t value = (in_t)apply_clip<acc_t>(acc + output_zp, minimum<in_t>, maximum<in_t>);
+ tensor_write<in_t>(output, shape, index, value);
}
----
*Supported Data Types:*
|===
-|Profile|Mode|in_t
+|Profile|Mode|in_t|acc_t
-|Any|signed 8|int8_t
-|Any|signed 16|int16_t
-|Any|signed 32|int32_t
-|MI, MT|floating-point|float_t
+|Any|signed 8|int8_t|int32_t
+|Any|signed 16|int16_t|int32_t
+|Any|signed 32|int32_t|int32_t
+|MI, MT|floating-point|float_t|float_t
|===
==== RECIPROCAL
diff --git a/chapters/introduction.adoc b/chapters/introduction.adoc
index 3257ab0..7039e27 100644
--- a/chapters/introduction.adoc
+++ b/chapters/introduction.adoc
@@ -197,14 +197,20 @@ The padding array represents the before and after pair for each dimension.
....
assert((pad == NULL) || size(pad) == 2 * size(shape));
out_t tensor_read<in_t>(in_t *address, dim_t shape, dim_t index, in_t zero_point=0, dim_t pad=NULL) {
- assert(in_t == int8_t || zero_point == 0)
- unsigned offset = 0;
- for (i = 0; i < rank(shape); i++)
- if (index[i] < 0) { assert(pad && pad[2 * i] + index[i] >= 0); return 0; }
- if (index[i] >= shape[i]) { assert(pad && index[i] < shape[i] + pad[2 * i + 1]); return 0; }
- offset = offset * shape[i] + index[i]
- }
- return address[offset] - zero_point;
+ assert(in_t == int8_t || zero_point == 0)
+ unsigned offset = 0;
+ for (i = 0; i < rank(shape); i++) {
+ if (index[i] < 0) {
+ assert(pad && pad[2 * i] + index[i] >= 0);
+ return 0;
+ }
+ if (index[i] >= shape[i]) {
+ assert(pad && index[i] < shape[i] + pad[2 * i + 1]);
+ return 0;
+ }
+ offset = offset * shape[i] + index[i];
+ }
+ return address[offset] - zero_point;
}
....
@@ -212,12 +218,12 @@ out_t tensor_read<in_t>(in_t *address, dim_t shape, dim_t index, in_t zero_point
....
tensor_write<type>(<type> *address, dim_t shape, dim_t index, <type> value) {
- unsigned offset = 0;
- for (i = 0; i < rank(shape); i++)
- assert (index[i] >= 0 && index[i] < shape[i]);
- offset = offset * shape[i] + index[i];
- }
- address[offset] = value;
+ unsigned offset = 0;
+ for (i = 0; i < rank(shape); i++) {
+ assert (index[i] >= 0 && index[i] < shape[i]);
+ offset = offset * shape[i] + index[i];
+ }
+ address[offset] = value;
}
....
@@ -346,7 +352,7 @@ All table lookups are based on the following reference lookup function that take
....
int32_t apply_lookup(int16_t *table, int32_t value)
{
- int16_t clipped_value = apply_clip<int16_t>(value, -32768, +32767);
+ int16_t clipped_value = (int16_t)apply_clip<int32_t>(value, -32768, +32767);
int32_t index = (clipped_value + 32768) >> 7;
int32_t fraction = clipped_value & 0x7f;
int16_t base = table[index];
@@ -364,7 +370,7 @@ void generate_lookup_table(int16_t *table, int32_t (*reference)(int32_t))
{
for (int i = -256; i <= 256; i++) {
int32_t value = (*reference)(i);
- table[i + 256] = apply_clip<int16_t>(value, -32768, +32767)
+ table[i + 256] = (int16_t)apply_clip<int32_t>(value, -32768, +32767)
}
}
....
diff --git a/chapters/tensor_ops.adoc b/chapters/tensor_ops.adoc
index 341f51d..b006c71 100644
--- a/chapters/tensor_ops.adoc
+++ b/chapters/tensor_ops.adoc
@@ -99,7 +99,7 @@ for_each(0 <= n < N, 0 <= oy < H, 0 <= ox < W, 0 <= c < C ) {
for_each(0 <= ky < kernel_y, 0 <= kx < kernel_x) {
y = iy + ky;
x = ix + kx;
- in_t value = tensor_read<in_t>(input, [N,IH,IW,IC], [n,y,x,c], input_zp, pad);
+ acc_t value = tensor_read<in_t>(input, [N,IH,IW,IC], [n,y,x,c], input_zp, pad);
acc = apply_add<acc_t>(acc, value);
if (0 <= y < IH and 0 <= x < IW) count++
}
@@ -108,7 +108,7 @@ for_each(0 <= n < N, 0 <= oy < H, 0 <= ox < W, 0 <= c < C ) {
} else {
scale_t scale = reciprocal_scale(count);
acc = apply_scale_32(acc, scale.multiplier, scale.shift, false);
- output_val = apply_clip<in_t>(acc + output_zp, minimum<in_t>, maximum<in_t>)
+ output_val = (in_t)apply_clip<acc_t>(acc + output_zp, minimum<in_t>, maximum<in_t>)
}
tensor_write<in_t>(output, [N,H,W,OC], [n,oy,ox,oc], output_val);
}
@@ -164,8 +164,8 @@ for_each(0 <= n < N, 0 <= oy < H, 0 <= ox < W; 0 <= oc < OC) {
for_each(0 <= ky < KH, 0 <= kx < KW, 0 <= ic < IC) {
y = iy + ky * dilation_y;
x = ix + kx * dilation_x;
- in_t value = tensor_read<in_t>(input, [N,IH,IW,IC], [n,y,x,ic], input_zp, pad);
- weight_t weight = tensor_read<weight_t>(weight, [OC,KH,KW,IC], [oc,ky,kx,ic], weight_zp);
+ acc_t value = tensor_read<in_t>(input, [N,IH,IW,IC], [n,y,x,ic], input_zp, pad);
+ acc_t weight = tensor_read<weight_t>(weight, [OC,KH,KW,IC], [oc,ky,kx,ic], weight_zp);
acc = apply_add<acc_t>(acc, value * weight);
}
acc = apply_add<acc_t>(acc, bias[oc]);
@@ -227,8 +227,8 @@ for_each(0 <= n < N, 0 <= od < D, 0 <= oy < H, 0 <= ox < W; 0 <= oc < OC) {
d = id + kd * dilation_d;
y = iy + ky * dilation_y;
x = ix + kx * dilation_x;
- in_t value = tensor_read<in_t>(input, [N,ID,IH,IW,IC], [n,d,y,x,ic], input_zp, pad);
- weight_t weight = tensor_read<weight_t>(weight,[OC,KD,KH,KW,IC],[oc,kd,ky,kx,ic], weight_zp);
+ acc_t value = tensor_read<in_t>(input, [N,ID,IH,IW,IC], [n,d,y,x,ic], input_zp, pad);
+ acc_t weight = tensor_read<weight_t>(weight,[OC,KD,KH,KW,IC],[oc,kd,ky,kx,ic], weight_zp);
acc = apply_add<acc_t>(acc, value * weight);
}
acc = apply_add<acc_t>(acc, bias[oc]);
@@ -289,8 +289,8 @@ for_each(0 <= n<N, 0 <= oy < H, 0 <= ox < W; 0 <= c < (C * M), 0 <= m < M) {
for_each(0 <= ky < KH, 0 <= kx < KW) {
y = iy + ky * dilation_y;
x = ix + kx * dilation_x;
- in_t value = tensor_read<in_t>(input, [N,H,W,C], [n,y,x,c], input_zp, pad);
- weight_t weight = tensor_read<weight_t>(weight, [KH,KW,C,M], [ky,kx,c,m], weight_zp);
+ acc_t value = tensor_read<in_t>(input, [N,H,W,C], [n,y,x,c], input_zp, pad);
+ acc_t weight = tensor_read<weight_t>(weight, [KH,KW,C,M], [ky,kx,c,m], weight_zp);
acc = apply_add<acc_t>(acc, value * weight);
}
acc = apply_add<acc_t>(acc, bias[(c * M) + m]);
@@ -342,8 +342,8 @@ assert(weight_t == int8_t || weight_zp == 0);
for_each(0 <= n < N, 0 <= oc < OC) {
acc_t acc = 0;
for_each(0 <= ic < IC) {
- in_t value = tensor_read<in_t>(input, [N,IC], [n,ic], input_zp);
- weight_t weight = tensor_read<weight_t>(weight, [OC,IC], [oc,ic], weight_zp);
+ acc_t value = tensor_read<in_t>(input, [N,IC], [n,ic], input_zp);
+ acc_t weight = tensor_read<weight_t>(weight, [OC,IC], [oc,ic], weight_zp);
acc = apply_add<acc_t>(acc, value * weight);
}
acc = apply_add<acc_t>(acc, bias[oc]);
@@ -392,8 +392,8 @@ assert(in_t == int8_t || (A_zp == 0 && B_zp == 0)); // Zero point only for int8_
for_each(0 <= n < N, 0 <= h < H, 0 <= w < W) {
acc_t acc = 0;
for_each(0 <= c < C) {
- in_t value1 = tensor_read<in_t>(A, [N,H,C], [n,h,c], A_zp);
- in_t value2 = tensor_read<in_t>(B, [N,C,W], [n,c,w], B_zp);
+ acc_t value1 = tensor_read<in_t>(A, [N,H,C], [n,h,c], A_zp);
+ acc_t value2 = tensor_read<in_t>(B, [N,C,W], [n,c,w], B_zp);
acc = apply_add<acc_t>(acc, value1 * value2);
}
tensor_write<acc_t>(output, [N,H,W], [n,h,w], acc);
@@ -500,8 +500,8 @@ for_each(0 <= n < N, 0 <= iy < IH, 0 <= ix < IW, 0 <= oc < OC,
ox = ix * stride_x - out_pad_left + kx;
if (oy >= 0 && oy < OH && ox >= 0 && ox < OW) {
acc_t acc = tensor_read<acc_t>(output, [N,OH,OW,OC], [n,oy,ox,oc]);
- in_t value = tensor_read<in_t>(input, [N,IH,IW,IC], [n,iy,ix,ic], input_zp);
- weight_t weight = tensor_read<weight_t>(weight, [OC,KH,KW,IC], [oc,ky,kx,ic], weight_zp);
+ acc_t value = tensor_read<in_t>(input, [N,IH,IW,IC], [n,iy,ix,ic], input_zp);
+ acc_t weight = tensor_read<weight_t>(weight, [OC,KH,KW,IC], [oc,ky,kx,ic], weight_zp);
acc = apply_add<acc_t>(acc, value * weight);
tensor_write<acc_t>(output, [N,OH,OW,OC], [n,oy,ox,oc], acc);
}
diff --git a/chapters/type_conversion.adoc b/chapters/type_conversion.adoc
index 8f9e255..6701297 100644
--- a/chapters/type_conversion.adoc
+++ b/chapters/type_conversion.adoc
@@ -106,12 +106,12 @@ for_each(index in shape) {
assert(in_t == int8_t || in_t == uint8_t || input_zp == 0);
assert(out_t == int8_t || out_t == uint8_t || output_zp == 0);
assert((scale32 && in_t != int48_t_t) || (!scale32 && !double_round));
- int48_t_t value = tensor_read<in_t>(input, shape, index, input_zp);
+ int48_t value = tensor_read<in_t>(input, shape, index, input_zp);
int c = (per_channel) ? index[dims-1] : 0;
int32_t result = (scale32) ?
apply_scale_32(value, multiplier[c], shift[c], double_round) :
apply_scale_16(value, multiplier[c], shift[c]);
- result = apply_clip<out_t>(result + output_zp, minimum<out_t>, maximum<out_t>);
+ result = (out_t)apply_clip<int32_t>(result + output_zp, minimum<out_t>, maximum<out_t>);
tensor_write<out_t>(output, shape, index, result);
}
....