From 3170439f3938d007e58998d61eed98560c3f026c Mon Sep 17 00:00:00 2001
From: Eric Kunze <eric.kunze@arm.com>
Date: Mon, 25 Oct 2021 16:04:20 -0700
Subject: Remove zp subtraction from tensor_read pseudocode

Operators which use the zero-point functionality for 8-bit integer
processing are updated to do the zero-point subtraction in their
pseudocode.

Note that the PAD operator no longer takes a zero point argument,
and instead requires callers to account for the zero point in the
pad_const argument.

Change-Id: I3bca1cae85aa2093000c420f0433633c347a29de
---
 chapters/data_layout.adoc     | 19 +++++++++---------
 chapters/ewise_unary.adoc     |  9 +++++----
 chapters/introduction.adoc    |  8 ++------
 chapters/tensor_ops.adoc      | 45 ++++++++++++++++++++++++++++---------------
 chapters/type_conversion.adoc |  3 ++-
 5 files changed, 47 insertions(+), 37 deletions(-)

diff --git a/chapters/data_layout.adoc b/chapters/data_layout.adoc
index 205ccaf..54221f6 100644
--- a/chapters/data_layout.adoc
+++ b/chapters/data_layout.adoc
@@ -68,6 +68,7 @@ for_each(index1 in shape) {
 
 Pads a tensor along the borders of each dimension with a supplied value.
 Returns a new tensor with the padding included.
+The pad_const value includes the zero point if the tensor uses a zero point.
 
 *Arguments:*
 
@@ -77,7 +78,6 @@ Returns a new tensor with the padding included.
 |Input|in_t*|input1|shape1|Input tensor
 |Attribute|int|padding|[rank(input1),2]|Amount of padding to be done
 |Attribute|in_t|pad_const|-|Constant value to be used as padding
-|Attribute|in_t|input1_zp|-|Input tensor zero point. Must be zero for non-int8 types.
 |Output|in_t*|output|shape|Output tensor of same type as the input tensor
 |===
 
@@ -85,7 +85,6 @@ Returns a new tensor with the padding included.
 
 [source,c++]
 ----
-ERROR_IF(in_t != int8_t  && input1_zp != 0); // Zero point only allowed for int8_t
 // Padding sizes must be >= 0.
 for_each(pad_size in padding) {
     ERROR_IF(pad_size < 0);
@@ -99,21 +98,21 @@ for_each(index in shape) {
             is_pad = true;
         }
     }
-    acc_t value = is_pad ? pad_const : tensor_read<in_t>(input1, shape1, index1, input1_zp);
-    tensor_write<in_t>(output, shape, index, value + input1_zp);
+    in_t value = is_pad ? pad_const : tensor_read<in_t>(input1, shape1, index1);
+    tensor_write<in_t>(output, shape, index, value);
 }
 ----
 
 *Supported Data Types:*
 
 |===
-|Profile|Mode|in_t|acc_t
+|Profile|Mode|in_t
 
-|Any|Boolean|bool_t|bool_t
-|Any|signed 8|int8_t|int16_t
-|Any|signed 16|int16_t|int16_t
-|Any|signed 32|int32_t|int32_t
-|MI, MT|floating-point|float_t|float_t
+|Any|Boolean|bool_t
+|Any|signed 8|int8_t
+|Any|signed 16|int16_t
+|Any|signed 32|int32_t
+|MI, MT|floating-point|float_t
 |===
 
 ==== RESHAPE
diff --git a/chapters/ewise_unary.adoc b/chapters/ewise_unary.adoc
index 2dc01df..e2b754a 100644
--- a/chapters/ewise_unary.adoc
+++ b/chapters/ewise_unary.adoc
@@ -247,10 +247,11 @@ Elementwise negation operation
 ERROR_IF(in_t != int8_t && input1_zp != 0) // Zero point only for int8_t
 ERROR_IF(in_t != int8_t && output_zp != 0) // Zero point only for int8_t
 for_each(index in shape) {
-    acc_t acc = tensor_read<in_t>(input1, shape, index, input1_zp);
-    acc = apply_sub<acc_t>(0, acc);
-    in_t value = (in_t)apply_clip<acc_t>(acc + output_zp, minimum<in_t>, maximum<in_t>);
-    tensor_write<in_t>(output, shape, index, value);
+    in_t value1 = tensor_read<in_t>(input1, shape, index);
+    acc_t value = (acc_t)value1 - input1_zp;
+    value = apply_sub<acc_t>(0, value);
+    in_t result = (in_t)apply_clip<acc_t>(value + output_zp, minimum<in_t>, maximum<in_t>);
+    tensor_write<in_t>(output, shape, index, result);
 }
 ----
 
diff --git a/chapters/introduction.adoc b/chapters/introduction.adoc
index 93276f1..855be3d 100644
--- a/chapters/introduction.adoc
+++ b/chapters/introduction.adoc
@@ -253,14 +253,10 @@ The following pseudocode represents the operations that will happen to data elem
 tensor_read reads a single data value out of the given tensor.
 The shape argument contains the shape of the tensor.
 Index is the coordinates within the tensor of the value to be read.
-zero_point is the zero point value to be added for int8 values.
-If in_t is 8-bit then out_t=int16_t to account for the zero_point subtraction.
-Otherwise out_t is the same as in_t.
 
 [source,c++]
 ----
-out_t tensor_read<in_t>(in_t *address, dim_t shape, dim_t index, in_t zero_point=0) {
-    ERROR_IF(in_t != int8_t && zero_point != 0);
+in_t tensor_read<in_t>(in_t *address, dim_t shape, dim_t index) {
     // Ensure this is a proper tensor with each dimension having size >= 1
     for_each(dimension_size in shape) {
         REQUIRE(dimension_size >= 1);
@@ -270,7 +266,7 @@ out_t tensor_read<in_t>(in_t *address, dim_t shape, dim_t index, in_t zero_point
         REQUIRE(index[i] >= 0 && index[i] < shape[i]);
         offset = offset * shape[i] + index[i];
     }
-    return address[offset] - zero_point;
+    return address[offset];
 }
 ----
 
diff --git a/chapters/tensor_ops.adoc b/chapters/tensor_ops.adoc
index ad4d75d..d7ced25 100644
--- a/chapters/tensor_ops.adoc
+++ b/chapters/tensor_ops.adoc
@@ -42,14 +42,14 @@ ERROR_IF(flatten(left_shape, right_shape) != shape);
 for_each(left_index in left_shape) {
     for_each(right_index in right_shape) {
         in_t max_value = minimum_value<in_t>;
-        int32_t max_index = 0;
+        out_t max_index = 0;
         for (i = 0; i < shape[axis]; i++) {
             index = flatten(left_index, [i], right_index);
             in_t value = tensor_read<in_t>(input, shape1, index);
             if (value > max_value) { max_value = value; max_index = i; }
         }
         index = flatten(left_index, right_index);
-        tensor_write<int32_t>(output, shape, index, max_index);
+        tensor_write<out_t>(output, shape, index, max_index);
     }
 }
 ----
@@ -114,11 +114,12 @@ for_each(0 <= n < N, 0 <= oy < H, 0 <= ox < W, 0 <= c < C ) {
         // average, padding does not count
         if (0 <= y < IH and 0 <= x < IW) {
             count++;
-            acc_t value = tensor_read<in_t>(input, [N,IH,IW,C], [n,y,x,c], input_zp);
+            acc_t value = tensor_read<in_t>(input, [N,IH,IW,C], [n,y,x,c]);
+            value = value - input_zp;
             acc = apply_add<acc_t>(acc, value);
         }
     }
-    if (is_float(out_t)) {
+    if (is_float(in_t)) {
         output_val = acc / (float)count;
     } else {
         scale_t scale = reciprocal_scale(count);
@@ -176,8 +177,10 @@ for_each(0 <= n < N, 0 <= oy < H, 0 <= ox < W; 0 <= oc < OC) {
         y = iy + ky * dilation_y;
         x = ix + kx * dilation_x;
         if (0 <= y < IH && 0 <= x < IW) {
-            acc_t value  = tensor_read<in_t>(input, [N,IH,IW,IC], [n,y,x,ic], input_zp);
-            acc_t weight = tensor_read<weight_t>(weight, [OC,KH,KW,IC], [oc,ky,kx,ic], weight_zp);
+            acc_t value  = tensor_read<in_t>(input, [N,IH,IW,IC], [n,y,x,ic]);
+            acc_t weight = tensor_read<weight_t>(weight, [OC,KH,KW,IC], [oc,ky,kx,ic]);
+            value  = value - input_zp;
+            weight = weight - weight_zp;
             acc = apply_add<acc_t>(acc, value * weight);
         }
     }
@@ -237,8 +240,10 @@ for_each(0 <= n < N, 0 <= od < D, 0 <= oy < H, 0 <= ox < W; 0 <= oc < OC) {
         y = iy + ky * dilation_y;
         x = ix + kx * dilation_x;
         if (0 <= x < IW && 0 <= y < IH && 0 <= d <= ID) {
-            acc_t value  = tensor_read<in_t>(input, [N,ID,IH,IW,IC], [n,d,y,x,ic], input_zp);
-            acc_t weight = tensor_read<weight_t>(weight,[OC,KD,KH,KW,IC],[oc,kd,ky,kx,ic], weight_zp);
+            acc_t value  = tensor_read<in_t>(input, [N,ID,IH,IW,IC], [n,d,y,x,ic]);
+            acc_t weight = tensor_read<weight_t>(weight,[OC,KD,KH,KW,IC],[oc,kd,ky,kx,ic]);
+            value  = value - input_zp;
+            weight = weight - weight_zp;
             acc = apply_add<acc_t>(acc, value * weight);
         }
     }
@@ -297,8 +302,10 @@ for_each(0 <= n<N, 0 <= oy < H, 0 <= ox < W; 0 <= c < (C * M), 0 <= m < M) {
         y = iy + ky * dilation_y;
         x = ix + kx * dilation_x;
         if (0 <= y < IH && 0 <= x < IW) {
-            acc_t value  = tensor_read<in_t>(input, [N,H,W,C], [n,y,x,c], input_zp);
-            acc_t weight = tensor_read<weight_t>(weight, [KH,KW,C,M], [ky,kx,c,m], weight_zp);
+            acc_t value  = tensor_read<in_t>(input, [N,H,W,C], [n,y,x,c]);
+            acc_t weight = tensor_read<weight_t>(weight, [KH,KW,C,M], [ky,kx,c,m]);
+            value  = value - input_zp;
+            weight = weight - weight_zp;
             acc = apply_add<acc_t>(acc, value * weight);
         }
     }
@@ -344,8 +351,10 @@ ERROR_IF(weight_t != int8_t && weight_zp != 0);
 for_each(0 <= n < N, 0 <= oc < OC) {
     acc_t acc = 0;
     for_each(0 <= ic < IC) {
-        acc_t value  = tensor_read<in_t>(input, [N,IC], [n,ic], input_zp);
-        acc_t weight = tensor_read<weight_t>(weight, [OC,IC], [oc,ic], weight_zp);
+        acc_t value  = tensor_read<in_t>(input, [N,IC], [n,ic]);
+        acc_t weight = tensor_read<weight_t>(weight, [OC,IC], [oc,ic]);
+        value  = value - input_zp;
+        weight = weight - weight_zp;
         acc = apply_add<acc_t>(acc, value * weight);
     }
     acc = apply_add<acc_t>(acc, bias[oc]);
@@ -387,8 +396,10 @@ ERROR_IF(in_t != int8_t && (A_zp != 0 || B_zp != 0)); // Zero point only for int
 for_each(0 <= n < N, 0 <= h < H, 0 <= w < W) {
     acc_t acc = 0;
     for_each(0 <= c < C) {
-        acc_t value1 = tensor_read<in_t>(A, [N,H,C], [n,h,c], A_zp);
-        acc_t value2 = tensor_read<in_t>(B, [N,C,W], [n,c,w], B_zp);
+        acc_t value1 = tensor_read<in_t>(A, [N,H,C], [n,h,c]);
+        acc_t value2 = tensor_read<in_t>(B, [N,C,W], [n,c,w]);
+        value1 = value1 - A_zp;
+        value2 = value2 - B_zp;
         acc = apply_add<acc_t>(acc, value1 * value2);
     }
     tensor_write<acc_t>(output, [N,H,W], [n,h,w], acc);
@@ -499,8 +510,10 @@ for_each(0 <= n < N, 0 <= iy < IH, 0 <= ix < IW, 0 <= oc < OC,
     ox = ix * stride_x - out_pad_left + kx;
     if (oy >= 0 && oy < OH && ox >= 0 && ox < OW) {
         acc_t acc = tensor_read<acc_t>(output, [N,OH,OW,OC], [n,oy,ox,oc]);
-        acc_t value = tensor_read<in_t>(input, [N,IH,IW,IC], [n,iy,ix,ic], input_zp);
-        acc_t weight = tensor_read<weight_t>(weight, [OC,KH,KW,IC], [oc,ky,kx,ic], weight_zp);
+        acc_t value = tensor_read<in_t>(input, [N,IH,IW,IC], [n,iy,ix,ic]);
+        acc_t weight = tensor_read<weight_t>(weight, [OC,KH,KW,IC], [oc,ky,kx,ic]);
+        value = value - input_zp;
+        weight = weight - weight_zp;
         acc = apply_add<acc_t>(acc, value * weight);
         tensor_write<acc_t>(output, [N,OH,OW,OC], [n,oy,ox,oc], acc);
     }
diff --git a/chapters/type_conversion.adoc b/chapters/type_conversion.adoc
index 77af54a..0d84bd6 100644
--- a/chapters/type_conversion.adoc
+++ b/chapters/type_conversion.adoc
@@ -100,7 +100,8 @@ for_each(index in shape) {
     ERROR_IF(out_t != int8_t && out_t != uint8_t && output_zp != 0);
     ERROR_IF(scale32 && in_t == int48_t);
     ERROR_IF(!scale32 && double_round);
-    int48_t value = tensor_read<in_t>(input, shape, index, input_zp);
+    int48_t value = tensor_read<in_t>(input, shape, index);
+    value = value - input_zp;
     int c = (per_channel) ? index[dims-1] : 0;
     int32_t result = (scale32) ?
         apply_scale_32(value, multiplier[c], shift[c], double_round) :
-- 
cgit v1.2.1