1 files changed, 57 insertions, 80 deletions
diff --git a/chapters/tensor_ops.adoc b/chapters/tensor_ops.adoc
index b9d54c1..656af85 100644
--- a/chapters/tensor_ops.adoc
+++ b/chapters/tensor_ops.adoc
@@ -1,7 +1,7 @@
 //
 // This confidential and proprietary software may be used only as
 // authorised by a licensing agreement from ARM Limited
-// (C) COPYRIGHT 2020-2023 ARM Limited
+// (C) COPYRIGHT 2020-2022 ARM Limited
 // ALL RIGHTS RESERVED
 // The entire notice above must be reproduced on all authorised
 // copies and copies may only be made to the extent permitted
@@ -31,15 +31,12 @@ if (axis == rank(shape1)-1) {
 ERROR_IF(flatten(left_shape, right_shape) != shape);
 for_each(left_index in left_shape) {
     for_each(right_index in right_shape) {
-        in_t max_value = minimum_s<in_t>;
+        in_t max_value = minimum_value<in_t>;
         out_t max_index = 0;
         for (i = 0; i < shape[axis]; i++) {
             dim_t index = flatten(left_index, [i], right_index);
             in_t value = tensor_read<in_t>(input, shape1, index);
-            if (apply_max_s<in_t>(value, max_value) != max_value) {
-                max_value = value;
-                max_index = i;
-            }
+            if (value > max_value) { max_value = value; max_index = i; }
         }
         dim_t index = flatten(left_index, right_index);
         tensor_write<out_t>(output, shape, index, max_index);
@@ -57,8 +54,8 @@ include::{generated}/operators/AVG_POOL2D.adoc[]
 
 [source,c++]
 ----
-ERROR_IF(in_out_t != i8_t && input_zp != 0); // Zero point only for int8_t
-ERROR_IF(in_out_t != i8_t && output_zp != 0); // Zero point only for int8_t
+ERROR_IF(in_out_t != int8_t && input_zp != 0); // Zero point only for int8_t
+ERROR_IF(in_out_t != int8_t && output_zp != 0); // Zero point only for int8_t
 ERROR_IF(kernel_y < 1 || kernel_x < 1); // kernel size must be >= 1
 ERROR_IF(stride_y < 1 || stride_x < 1);
 ERROR_IF(pad_top < 0 || pad_bottom < 0 || pad_left < 0 || pad_right < 0);
@@ -82,19 +79,17 @@ for_each(0 <= n < N, 0 <= oy < OH, 0 <= ox < OW, 0 <= c < C ) {
         // average, padding does not count
         if (0 <= y < IH and 0 <= x < IW) {
             count++;
-            acc_t value = sign_extend<acc_t>(tensor_read<in_out_t>(input, [N,IH,IW,C], [n,y,x,c]));
-            value = apply_sub_s<acc_t>(value, sign_extend<acc_t>(input_zp));
-            acc = apply_add_s<acc_t>(acc, value);
+            acc_t value = tensor_read<in_out_t>(input, [N,IH,IW,C], [n,y,x,c]);
+            value = value - input_zp;
+            acc = apply_add<acc_t>(acc, value);
         }
     }
     if (is_float(in_out_t)) {
-        output_val = acc / static_cast<in_out_t>(count);
+        output_val = acc / (float)count;
     } else {
         scale_t scale = reciprocal_scale(count);
         acc = apply_scale_32(acc, scale.multiplier, scale.shift, false);
-        acc = apply_add_s<acc_t>(acc, sign_extend<acc_t>(output_zp));
-        acc = apply_clip_s<acc_t>(acc, minimum_s<in_out_t>, maximum_s<in_out_t>);
-        output_val = static_cast<in_out_t>(acc);
+        output_val = (in_out_t)apply_clip<acc_t>(acc + output_zp, minimum<in_out_t>, maximum<in_out_t>);
     }
     tensor_write<in_out_t>(output, [N,OH,OW,C], [n,oy,ox,c], output_val);
 }
@@ -108,14 +103,13 @@ include::{generated}/operators/CONV2D.adoc[]
 
 [source,c++]
 ----
-ERROR_IF(in_t != i8_t && input_zp != 0); // Zero point only for int8_t
+ERROR_IF(in_t != int8_t && input_zp != 0); // Zero point only for int8_t
 ERROR_IF(weight_t != int8_t && weight_zp != 0);
 ERROR_IF(pad_top < 0 || pad_bottom < 0 || pad_left < 0 || pad_right < 0);
 ERROR_IF(stride_y < 1 || stride_x < 1);
 ERROR_IF(dilation_y < 1 || dilation_x < 1);
 ERROR_IF(OH != idiv_check(IH - 1 + pad_top + pad_bottom - (KH - 1) * dilation_y, stride_y) + 1);
 ERROR_IF(OW != idiv_check(IW - 1 + pad_left + pad_right - (KW - 1) * dilation_x, stride_x) + 1);
-ERROR_IF(BC != OC && BC != 1);
 
 for_each(0 <= n < N, 0 <= oy < OH, 0 <= ox < OW; 0 <= oc < OC) {
     out_t acc = 0;
@@ -125,18 +119,14 @@ for_each(0 <= n < N, 0 <= oy < OH, 0 <= ox < OW; 0 <= oc < OC) {
         index_t y = iy + ky * dilation_y;
         index_t x = ix + kx * dilation_x;
         if (0 <= y < IH && 0 <= x < IW) {
-            out_t value  = static_cast<out_t>(tensor_read<in_t>(input,
-                                                                [N,IH,IW,IC],
-                                                                [n,y,x,ic]));
-            out_t weight = static_cast<out_t>(tensor_read<weight_t>(weight,
-                                                                   [OC,KH,KW,IC],
-                                                                   [oc,ky,kx,ic]));
-            value  = apply_sub_s<out_t>(value, static_cast<out_t>(input_zp));
-            weight = apply_sub_s<out_t>(weight, static_cast<out_t>(weight_zp));
-            acc = apply_add_s<out_t>(acc, apply_mul_s<out_t>(value, weight));
+            out_t value  = tensor_read<in_t>(input, [N,IH,IW,IC], [n,y,x,ic]);
+            out_t weight = tensor_read<weight_t>(weight, [OC,KH,KW,IC], [oc,ky,kx,ic]);
+            value  = value - input_zp;
+            weight = weight - weight_zp;
+            acc = apply_add<out_t>(acc, value * weight);
         }
     }
-    acc = apply_add_s<out_t>(acc, bias[(BC == 1) ? 0 : oc]);
+    acc = apply_add<out_t>(acc, bias[oc]);
     tensor_write<out_t>(output, [N,OH,OW,OC], [n,oy,ox,oc], acc);
 }
 ----
@@ -149,15 +139,14 @@ include::{generated}/operators/CONV3D.adoc[]
 
 [source,c++]
 ----
-ERROR_IF(in_t != i8_t && input_zp != 0); // Zero point only for int8_t
-ERROR_IF(weight_t != i8_t && weight_zp != 0);
+ERROR_IF(in_t != int8_t && input_zp != 0); // Zero point only for int8_t
+ERROR_IF(weight_t != int8_t && weight_zp != 0);
 ERROR_IF(pad_d0 < 0 || pad_d1 < 0 || pad_top < 0 || pad_bottom < 0 || pad_left < 0 || pad_right < 0);
 ERROR_IF(stride_d < 1 || stride_y < 1 || stride_x < 1);
 ERROR_IF(dilation_d < 1 || dilation_y < 1 || dilation_x < 1);
 ERROR_IF(OD != idiv_check(ID - 1 + pad_d0 + pad_d1      - (KD - 1) * dilation_d, stride_d) + 1);
 ERROR_IF(OH != idiv_check(IH - 1 + pad_top + pad_bottom - (KH - 1) * dilation_y, stride_y) + 1);
 ERROR_IF(OW != idiv_check(IW - 1 + pad_left + pad_right - (KW - 1) * dilation_x, stride_x) + 1);
-ERROR_IF(BC != OC && BC != 1);
 
 for_each(0 <= n < N, 0 <= od < OD, 0 <= oy < OH, 0 <= ox < OW; 0 <= oc < OC) {
     out_t acc = 0;
@@ -169,18 +158,14 @@ for_each(0 <= n < N, 0 <= od < OD, 0 <= oy < OH, 0 <= ox < OW; 0 <= oc < OC) {
         index_t y = iy + ky * dilation_y;
         index_t x = ix + kx * dilation_x;
         if (0 <= x < IW && 0 <= y < IH && 0 <= d < ID) {
-            out_t value  = static_cast<out_t>(tensor_read<in_t>(input,
-                                                                [N,ID,IH,IW,IC],
-                                                                [n,d,y,x,ic]));
-            out_t weight = static_cast<out_t>(tensor_read<weight_t>(weight,
-                                                                    [OC,KD,KH,KW,IC],
-                                                                    [oc,kd,ky,kx,ic]));
-            value  = apply_sub_s<out_t>(value, static_cast<out_t>(input_zp));
-            weight = apply_sub_s<out_t>(weight, static_cast<out_t>(weight_zp));
-            acc = apply_add_s<out_t>(acc, apply_mul_s<out_t>(value, weight));
+            out_t value  = tensor_read<in_t>(input, [N,ID,IH,IW,IC], [n,d,y,x,ic]);
+            out_t weight = tensor_read<weight_t>(weight,[OC,KD,KH,KW,IC],[oc,kd,ky,kx,ic]);
+            value  = value - input_zp;
+            weight = weight - weight_zp;
+            acc = apply_add<out_t>(acc, value * weight);
         }
     }
-    acc = apply_add_s<out_t>(acc, bias[(BC == 1) ? 0 : oc]);
+    acc = apply_add<out_t>(acc, bias[oc]);
     tensor_write<out_t>(output, [N,OD,OH,OW,OC], [n,od,oy,ox,oc], acc);
 }
 ----
@@ -193,14 +178,13 @@ include::{generated}/operators/DEPTHWISE_CONV2D.adoc[]
 
 [source,c++]
 ----
-ERROR_IF(in_t != i8_t && input_zp != 0); // Zero point only for int8_t
-ERROR_IF(weight_t != i8_t && weight_zp != 0);
+ERROR_IF(in_t != int8_t && input_zp != 0); // Zero point only for int8_t
+ERROR_IF(weight_t != int8_t && weight_zp != 0);
 ERROR_IF(pad_top < 0 || pad_bottom < 0 || pad_left < 0 || pad_right < 0);
 ERROR_IF(stride_y < 1 || stride_x < 1);
 ERROR_IF(dilation_y < 1 || dilation_x < 1);
 ERROR_IF(OH != idiv_check(IH - 1 + pad_top + pad_bottom - (KH - 1) * dilation_y, stride_y) + 1);
 ERROR_IF(OW != idiv_check(IW - 1 + pad_left + pad_right - (KW - 1) * dilation_x, stride_x) + 1);
-ERROR_IF(BC != C*M && BC != 1);
 
 for_each(0 <= n < N, 0 <= oy < OH, 0 <= ox < OW; 0 <= c < C, 0 <= m < M) {
     out_t acc = 0;
@@ -210,18 +194,14 @@ for_each(0 <= n < N, 0 <= oy < OH, 0 <= ox < OW; 0 <= c < C, 0 <= m < M) {
         index_t y = iy + ky * dilation_y;
         index_t x = ix + kx * dilation_x;
         if (0 <= y < IH && 0 <= x < IW) {
-            out_t value  = static_cast<out_t>(tensor_read<in_t>(input,
-                                                                [N,IH,IW,C],
-                                                                [n,y,x,c]));
-            out_t weight = static_cast<out_t>(tensor_read<weight_t>(weight,
-                                                                    [KH,KW,C,M],
-                                                                    [ky,kx,c,m]));
-            value  = apply_sub_s<out_t>(value, static_cast<out_t>input_zp);
-            weight = apply_sub_s<out_t>(weight, static_cast<out_t>weight_zp);
-            acc = apply_add_s<out_t>(acc, apply_mul_s<out_t>(value, weight));
+            out_t value  = tensor_read<in_t>(input, [N,IH,IW,C], [n,y,x,c]);
+            out_t weight = tensor_read<weight_t>(weight, [KH,KW,C,M], [ky,kx,c,m]);
+            value  = value - input_zp;
+            weight = weight - weight_zp;
+            acc = apply_add<out_t>(acc, value * weight);
         }
     }
-    acc = apply_add_s<out_t>(acc, bias[(BC == 1) ? 0 : (c * M) + m]);
+    acc = apply_add<out_t>(acc, bias[(c * M) + m]);
     tensor_write<out_t>(output, [N,OH,OW,C * M], [n,oy,ox,c * M + m], acc);
 }
 ----
@@ -279,20 +259,18 @@ include::{generated}/operators/FULLY_CONNECTED.adoc[]
 
 [source,c++]
 ----
-ERROR_IF(in_t != i8_t && input_zp != 0); // Zero point only for int8_t
-ERROR_IF(weight_t != i8_t && weight_zp != 0);
-ERROR_IF(BC != OC && BC != 1);
-
+ERROR_IF(in_t != int8_t && input_zp != 0); // Zero point only for int8_t
+ERROR_IF(weight_t != int8_t && weight_zp != 0);
 for_each(0 <= n < N, 0 <= oc < OC) {
     out_t acc = 0;
     for_each(0 <= ic < IC) {
-        out_t value  = static_cast<out_t>(tensor_read<in_t>(input, [N,IC], [n,ic]));
-        out_t weight = static_cast<out_t>(tensor_read<weight_t>(weight, [OC,IC], [oc,ic]));
-        value  = apply_sub_s<out_t>(value, static_cast<out_t>(input_zp));
-        weight = apply_sub_s<out_t>(weight, static_cast<out_t>(weight_zp));
-        acc = apply_add_s<out_t>(acc, apply_mul_s<out_t>(value, weight));
+        out_t value  = tensor_read<in_t>(input, [N,IC], [n,ic]);
+        out_t weight = tensor_read<weight_t>(weight, [OC,IC], [oc,ic]);
+        value  = value - input_zp;
+        weight = weight - weight_zp;
+        acc = apply_add<out_t>(acc, value * weight);
     }
-    acc = apply_add_s<out_t>(acc, bias[(BC == 1) ? 0 : oc]);
+    acc = apply_add<out_t>(acc, bias[oc]);
     tensor_write<out_t>(output, [N,OC], [n,oc], acc);
 }
 ----
@@ -305,15 +283,15 @@ include::{generated}/operators/MATMUL.adoc[]
 
 [source,c++]
 ----
-ERROR_IF(in_t != i8_t && (A_zp != 0 || B_zp != 0)); // Zero point only for int8_t
+ERROR_IF(in_t != int8_t && (A_zp != 0 || B_zp != 0)); // Zero point only for int8_t
 for_each(0 <= n < N, 0 <= h < H, 0 <= w < W) {
     out_t acc = 0;
     for_each(0 <= c < C) {
-        out_t value1 = static_cast<out_t>(tensor_read<in_t>(A, [N,H,C], [n,h,c]));
-        out_t value2 = static_cast<out_t>(tensor_read<in_t>(B, [N,C,W], [n,c,w]));
-        value1 = apply_sub_s<out_t>(value1, static_cast<out_t>(A_zp));
-        value2 = apply_sub_s<out_t>(value2, static_cast<out_t>(B_zp));
-        acc = apply_add_s<out_t>(acc, apply_mul_s<out_t>(value1 * value2));
+        out_t value1 = tensor_read<in_t>(A, [N,H,C], [n,h,c]);
+        out_t value2 = tensor_read<in_t>(B, [N,C,W], [n,c,w]);
+        value1 = value1 - A_zp;
+        value2 = value2 - B_zp;
+        acc = apply_add<out_t>(acc, value1 * value2);
     }
     tensor_write<out_t>(output, [N,H,W], [n,h,w], acc);
 }
@@ -346,7 +324,7 @@ for_each(0 <= n < N, 0 <= oy < H, 0 <= ox < W, 0 <= c < C ) {
         index_t x = ix + kx;
         if (y >= 0 && y < IH && x >= 0 && x < IW) {
             in_out_t value = tensor_read<in_out_t>(input, [N,IH,IW,C], [n,y,x,c]);
-            acc = apply_max_s<in_out_t>(acc, value);
+            acc = apply_max(acc, value);
         }
     }
     tensor_write<in_out_t>(output, [N,OH,OW,C], [n,oy,ox,c], acc);
@@ -393,29 +371,28 @@ include::{generated}/operators/TRANSPOSE_CONV2D.adoc[]
 
 [source,c++]
 ----
-ERROR_IF(in_t != i8_t  && input_zp != 0); // Zero point only allowed for int8_t
-ERROR_IF(weight_t != i8_t && weight_zp != 0);
+ERROR_IF(in_t != int8_t  && input_zp != 0); // Zero point only allowed for int8_t
+ERROR_IF(weight_t != int8_t && weight_zp != 0);
 ERROR_IF(out_pad_top <= -KH || out_pad_bottom <= -KH);
 ERROR_IF(out_pad_left <= -KW || out_pad_right <= -KW);
 ERROR_IF(stride_y < 1 || stride_x < 1);
 ERROR_IF(OH != (IH - 1) * stride_y + out_pad_top + out_pad_bottom + KH);
 ERROR_IF(OW != (IW - 1) * stride_x + out_pad_left + out_pad_right + KW);
-ERROR_IF(BC != OC && BC != 1);
 
 for_each(index in out_shape) {
-    tensor_write<out_t>(output, [N,OH,OW,OC], index, bias[(BC == 1) ? 0 : index[3]])
+    tensor_write<out_t>(output, [N,OH,OW,OC], index, bias[index[3]]);
 }
 for_each(0 <= n < N, 0 <= iy < IH, 0 <= ix < IW, 0 <= oc < OC,
           0 <= ic < IC, 0 <= ky < KH,  0 <= kx < KW) {
     index_t oy = iy * stride_y + out_pad_top + ky;
     index_t ox = ix * stride_x + out_pad_left + kx;
     if (oy >= 0 && oy < OH && ox >= 0 && ox < OW) {
-        out_t acc = static_cast<out_t>(tensor_read<out_t>(output, [N,OH,OW,OC], [n,oy,ox,oc]));
-        out_t value = static_cast<out_t>(tensor_read<in_t>(input, [N,IH,IW,IC], [n,iy,ix,ic]));
-        out_t weight = static_cast<out_t>(tensor_read<weight_t>(weight, [OC,KH,KW,IC], [oc,ky,kx,ic]));
-        value = apply_sub_s<out_t>(value, static_cast<out_t>(input_zp));
-        weight = apply_sub_s<out_t>(weight, static_cast<out_t>(weight_zp));
-        acc = apply_add_s<out_t>(acc, apply_mul_s<out_t>(value, weight));
+        out_t acc = tensor_read<out_t>(output, [N,OH,OW,OC], [n,oy,ox,oc]);
+        out_t value = tensor_read<in_t>(input, [N,IH,IW,IC], [n,iy,ix,ic]);
+        out_t weight = tensor_read<weight_t>(weight, [OC,KH,KW,IC], [oc,ky,kx,ic]);
+        value = value - input_zp;
+        weight = weight - weight_zp;
+        acc = apply_add<out_t>(acc, value * weight);
         tensor_write<out_t>(output, [N,OH,OW,OC], [n,oy,ox,oc], acc);
     }
 }