From 0afe61f88ce3d2f445c5f01ae5567cb1b0b7f303 Mon Sep 17 00:00:00 2001 From: Eric Kunze Date: Wed, 14 Feb 2024 16:33:31 -0800 Subject: Modify convolution operators to improve bias handling Accumulator size moves to an enumerated attribute, out_t for floating-point changes to be the size of the input. Bias for floating-point also becomes the bit width of the input type. Signed-off-by: Eric Kunze Change-Id: I7369417adbb1106ce34a1978e7f511a30272c318 --- pseudocode/operators/CONV2D.tosac | 17 ++++++++------- pseudocode/operators/CONV3D.tosac | 5 +++-- pseudocode/operators/DEPTHWISE_CONV2D.tosac | 5 +++-- pseudocode/operators/TRANSPOSE_CONV2D.tosac | 34 ++++++++++++++++------------- 4 files changed, 34 insertions(+), 27 deletions(-) (limited to 'pseudocode') diff --git a/pseudocode/operators/CONV2D.tosac b/pseudocode/operators/CONV2D.tosac index fe61747..0ae0e81 100644 --- a/pseudocode/operators/CONV2D.tosac +++ b/pseudocode/operators/CONV2D.tosac @@ -17,24 +17,25 @@ ERROR_IF(OW != idiv_check(IW - 1 + pad_left + pad_right - (KW - 1) * dilation_x, ERROR_IF(BC != OC && BC != 1); for_each(0 <= n < N, 0 <= oy < OH, 0 <= ox < OW, 0 <= oc < OC) { - out_t acc = 0; + acc_t acc = 0; index_t iy = oy * stride_y - pad_top; index_t ix = ox * stride_x - pad_left; for_each(0 <= ky < KH, 0 <= kx < KW, 0 <= ic < IC) { index_t y = iy + ky * dilation_y; index_t x = ix + kx * dilation_x; if (0 <= y < IH && 0 <= x < IW) { - out_t value = static_cast(tensor_read(input, + acc_t value = static_cast(tensor_read(input, [N,IH,IW,IC], [n,y,x,ic])); - out_t weight = static_cast(tensor_read(weight, + acc_t weight = static_cast(tensor_read(weight, [OC,KH,KW,IC], [oc,ky,kx,ic])); - value = apply_sub_s(value, static_cast(input_zp)); - weight = apply_sub_s(weight, static_cast(weight_zp)); - acc = apply_add_s(acc, apply_mul_s(value, weight)); + value = apply_sub_s(value, static_cast(input_zp)); + weight = apply_sub_s(weight, static_cast(weight_zp)); + acc = apply_add_s(acc, apply_mul_s(value, weight)); } 
} - acc = apply_add_s(acc, bias[(BC == 1) ? 0 : oc]); - tensor_write(output, [N,OH,OW,OC], [n,oy,ox,oc], acc); + out_t out = static_cast(acc); + out = apply_add_s(out, bias[(BC == 1) ? 0 : oc]); + tensor_write(output, [N,OH,OW,OC], [n,oy,ox,oc], out); } diff --git a/pseudocode/operators/CONV3D.tosac b/pseudocode/operators/CONV3D.tosac index 7568564..e53b7eb 100644 --- a/pseudocode/operators/CONV3D.tosac +++ b/pseudocode/operators/CONV3D.tosac @@ -38,6 +38,7 @@ for_each(0 <= n < N, 0 <= od < OD, 0 <= oy < OH, 0 <= ox < OW, 0 <= oc < OC) { acc = apply_add_s(acc, apply_mul_s(value, weight)); } } - acc = apply_add_s(acc, bias[(BC == 1) ? 0 : oc]); - tensor_write(output, [N,OD,OH,OW,OC], [n,od,oy,ox,oc], acc); + out_t out = static_cast(acc); + out = apply_add_s(out, bias[(BC == 1) ? 0 : oc]); + tensor_write(output, [N,OD,OH,OW,OC], [n,od,oy,ox,oc], out); } diff --git a/pseudocode/operators/DEPTHWISE_CONV2D.tosac b/pseudocode/operators/DEPTHWISE_CONV2D.tosac index a473375..419d2eb 100644 --- a/pseudocode/operators/DEPTHWISE_CONV2D.tosac +++ b/pseudocode/operators/DEPTHWISE_CONV2D.tosac @@ -35,6 +35,7 @@ for_each(0 <= n < N, 0 <= oy < OH, 0 <= ox < OW, 0 <= c < C, 0 <= m < M) { acc = apply_add_s(acc, apply_mul_s(value, weight)); } } - acc = apply_add_s(acc, bias[(BC == 1) ? 0 : (c * M) + m]); - tensor_write(output, [N,OH,OW,C * M], [n,oy,ox,c * M + m], acc); + out_t out = static_cast(acc); + out = apply_add_s(out, bias[(BC == 1) ? 
0 : (c * M) + m]); + tensor_write(output, [N,OH,OW,C * M], [n,oy,ox,c * M + m], out); } diff --git a/pseudocode/operators/TRANSPOSE_CONV2D.tosac b/pseudocode/operators/TRANSPOSE_CONV2D.tosac index ab61348..6713b30 100644 --- a/pseudocode/operators/TRANSPOSE_CONV2D.tosac +++ b/pseudocode/operators/TRANSPOSE_CONV2D.tosac @@ -16,20 +16,24 @@ ERROR_IF(OH != (IH - 1) * stride_y + out_pad_top + out_pad_bottom + KH); ERROR_IF(OW != (IW - 1) * stride_x + out_pad_left + out_pad_right + KW); ERROR_IF(BC != OC && BC != 1); -for_each(index in [N, OH, OW, OC]) { - tensor_write(output, [N,OH,OW,OC], index, bias[(BC == 1) ? 0 : index[3]]); -} -for_each(0 <= n < N, 0 <= iy < IH, 0 <= ix < IW, 0 <= oc < OC, - 0 <= ic < IC, 0 <= ky < KH, 0 <= kx < KW) { - index_t oy = iy * stride_y + out_pad_top + ky; - index_t ox = ix * stride_x + out_pad_left + kx; - if (oy >= 0 && oy < OH && ox >= 0 && ox < OW) { - out_t acc = static_cast(tensor_read(output, [N,OH,OW,OC], [n,oy,ox,oc])); - out_t value = static_cast(tensor_read(input, [N,IH,IW,IC], [n,iy,ix,ic])); - out_t weight = static_cast(tensor_read(weight, [OC,KH,KW,IC], [oc,ky,kx,ic])); - value = apply_sub_s(value, static_cast(input_zp)); - weight = apply_sub_s(weight, static_cast(weight_zp)); - acc = apply_add_s(acc, apply_mul_s(value, weight)); - tensor_write(output, [N,OH,OW,OC], [n,oy,ox,oc], acc); +for_each(0 <= n < N, 0 <= iy < IH, 0 <= ix < IW, 0 <= dy < stride_y, 0 <= dx < stride_x, 0 <= oc < OC) { + acc_t acc = 0; + index_t oy = iy * stride_y + dy + out_pad_top; + index_t ox = ix * stride_x + dx + out_pad_left; + + for_each(0 <= sy * stride_y < KH - dy, 0 <= sx * stride_x < KW - dx, 0 <= ic < IC) { + index_t y = iy - sy; + index_t x = ix - sx; + index_t ky = dy + sy * stride_y; + index_t kx = dx + sx * stride_x; + acc_t value = static_cast(tensor_read(input, [N,IH,IW,IC], [n,y,x,ic])); + acc_t weight_value = static_cast(tensor_read(weight, [OC,KH,KW,IC], [oc,ky,kx,ic])); + value = apply_sub_s(value, static_cast(input_zp)); + 
weight_value = apply_sub_s(weight_value, static_cast(weight_zp)); + acc = apply_add_s(acc, apply_mul_s(value, weight_value)); } + + out_t out = static_cast(acc); + out = apply_add_s(out, bias[(BC == 1) ? 0 : oc]); + tensor_write(output, [N,OH,OW,OC], [n,oy,ox,oc], out); } -- cgit v1.2.1