From 9151dd5b223ea069e4c9526e9e82442edaf525d0 Mon Sep 17 00:00:00 2001
From: Dominic Symes <dominic.symes@arm.com>
Date: Fri, 3 Nov 2023 14:29:51 +0000
Subject: REDUCE_SUM: Specify the accumulator type

For bf16_t data type, REDUCE_SUM is changed to use fp32_t as the
accumulator type to be consistent with CONV2D. For other data types
the accumulator type is the same as in_out_t (and so no change).

Also correct the rank limit text.

Update reduction pseudo-code to be consistent with REDUCE_SUM.

Change-Id: I9923066be7d1b7edb0efd9bcf3365b4af9501beb
Signed-off-by: Dominic Symes <dominic.symes@arm.com>
---
 chapters/reduction.adoc | 143 ++++++++++++++++++++++++++----------------------
 tosa.xml                |  11 ++--
 2 files changed, 85 insertions(+), 69 deletions(-)

diff --git a/chapters/reduction.adoc b/chapters/reduction.adoc
index 8a3ceac..19ff4ed 100644
--- a/chapters/reduction.adoc
+++ b/chapters/reduction.adoc
@@ -19,18 +19,19 @@ include::{generated}/operators/REDUCE_ALL.adoc[]
 ----
 ERROR_IF(axis < 0 || axis >= rank(shape1));
 ERROR_IF(shape[axis] != 1);
-
-// Initialize output state to true
-for_each(index in shape) {
-    tensor_write(output, shape, index, true);
-}
-for_each(index in shape1) {
-    dim_t out_index = index;
-    out_index[axis] = 0;
-    in_out_t value = tensor_read(input, shape1, index);
-    in_out_t state = tensor_read(output, shape, out_index);
-    state = state && value;
-    tensor_write(output, shape, out_index, state);
+left_shape = (axis > 0) ? shape[0:axis-1] : [];
+right_shape = (axis < rank(shape)-1) ? shape[axis+1:rank(shape)-1] : [];
+for_each(left_index in left_shape) {
+    for_each(right_index in right_shape) {
+        in_out_t acc = true;
+        for (i = 0; i < shape1[axis]; i++) {
+            index = flatten(left_index, [i], right_index);
+            in_out_t value = tensor_read(input, shape1, index);
+            acc = acc && value;
+        }
+        out_index = flatten(left_index, [0], right_index);
+        tensor_write(output, shape, out_index, acc);
+    }
 }
 ----
 
@@ -44,18 +45,19 @@ include::{generated}/operators/REDUCE_ANY.adoc[]
 ----
 ERROR_IF(axis < 0 || axis >= rank(shape1));
 ERROR_IF(shape[axis] != 1);
-
-// Initialize output state to false
-for_each(index in shape) {
-    tensor_write(output, shape, index, false);
-}
-for_each(index in shape1) {
-    dim_t out_index = index;
-    out_index[axis] = 0;
-    in_out_t value = tensor_read(input, shape1, index);
-    in_out_t state = tensor_read(output, shape, out_index);
-    state = state || value;
-    tensor_write(output, shape, out_index, state);
+left_shape = (axis > 0) ? shape[0:axis-1] : [];
+right_shape = (axis < rank(shape)-1) ? shape[axis+1:rank(shape)-1] : [];
+for_each(left_index in left_shape) {
+    for_each(right_index in right_shape) {
+        in_out_t acc = false;
+        for (i = 0; i < shape1[axis]; i++) {
+            index = flatten(left_index, [i], right_index);
+            in_out_t value = tensor_read(input, shape1, index);
+            acc = acc || value;
+        }
+        out_index = flatten(left_index, [0], right_index);
+        tensor_write(output, shape, out_index, acc);
+    }
 }
 ----
 
@@ -69,16 +71,19 @@ include::{generated}/operators/REDUCE_MAX.adoc[]
 ----
 ERROR_IF(axis < 0 || axis >= rank(shape1));
 ERROR_IF(shape[axis] != 1);
-for_each(index in shape) {
-    tensor_write(output, shape, index, minimum);
-}
-for_each(index in shape1) {
-    dim_t out_index = index;
-    out_index[axis] = 0;
-    in_out_t value = tensor_read(input, shape1, index);
-    in_out_t state = tensor_read(output, shape, out_index);
-    state = apply_max_s(state, value);
-    tensor_write(output, shape, out_index, state);
+left_shape = (axis > 0) ? shape[0:axis-1] : [];
+right_shape = (axis < rank(shape)-1) ? shape[axis+1:rank(shape)-1] : [];
+for_each(left_index in left_shape) {
+    for_each(right_index in right_shape) {
+        in_out_t acc = minimum;
+        for (i = 0; i < shape1[axis]; i++) {
+            index = flatten(left_index, [i], right_index);
+            in_out_t value = tensor_read(input, shape1, index);
+            acc = apply_max_s(acc, value);
+        }
+        out_index = flatten(left_index, [0], right_index);
+        tensor_write(output, shape, out_index, acc);
+    }
 }
 ----
 
@@ -92,16 +97,19 @@ include::{generated}/operators/REDUCE_MIN.adoc[]
 ----
 ERROR_IF(axis < 0 || axis >= rank(shape1));
 ERROR_IF(shape[axis] != 1);
-for_each(index in shape) {
-    tensor_write(output, shape, index, maximum);
-}
-for_each(index in shape1) {
-    dim_t out_index = index;
-    out_index[axis] = 0;
-    in_out_t value = tensor_read(input, shape1, index);
-    in_out_t state = tensor_read(output, shape, out_index);
-    state = apply_min_s(state, value);
-    tensor_write(output, shape, out_index, state);
+left_shape = (axis > 0) ? shape[0:axis-1] : [];
+right_shape = (axis < rank(shape)-1) ? shape[axis+1:rank(shape)-1] : [];
+for_each(left_index in left_shape) {
+    for_each(right_index in right_shape) {
+        in_out_t acc = maximum;
+        for (i = 0; i < shape1[axis]; i++) {
+            index = flatten(left_index, [i], right_index);
+            in_out_t value = tensor_read(input, shape1, index);
+            acc = apply_min_s(acc, value);
+        }
+        out_index = flatten(left_index, [0], right_index);
+        tensor_write(output, shape, out_index, acc);
+    }
 }
 ----
 
@@ -115,16 +123,19 @@ include::{generated}/operators/REDUCE_PRODUCT.adoc[]
 ----
 ERROR_IF(axis < 0 || axis >= rank(shape1));
 ERROR_IF(shape[axis] != 1);
-for_each(index in shape) {
-    tensor_write(output, shape, index, 1.0);
-}
-for_each(index in shape1) {
-    dim_t out_index = index;
-    out_index[axis] = 0;
-    in_out_t value = tensor_read(input, shape1, index);
-    in_out_t state = tensor_read(output, shape, out_index);
-    state = apply_mul_s(state, value);
-    tensor_write(output, shape, out_index, state);
+left_shape = (axis > 0) ? shape[0:axis-1] : [];
+right_shape = (axis < rank(shape)-1) ? shape[axis+1:rank(shape)-1] : [];
+for_each(left_index in left_shape) {
+    for_each(right_index in right_shape) {
+        in_out_t acc = 1.0;
+        for (i = 0; i < shape1[axis]; i++) {
+            index = flatten(left_index, [i], right_index);
+            in_out_t value = tensor_read(input, shape1, index);
+            acc = apply_mul_s(acc, value);
+        }
+        out_index = flatten(left_index, [0], right_index);
+        tensor_write(output, shape, out_index, acc);
+    }
 }
 ----
 
@@ -138,15 +149,19 @@ include::{generated}/operators/REDUCE_SUM.adoc[]
 ----
 ERROR_IF(axis < 0 || axis >= rank(shape1));
 ERROR_IF(shape[axis] != 1);
-for_each(index in shape) {
-    tensor_write(output, shape, index, 0);
-}
-for_each(index in shape1) {
-    dim_t out_index = index;
-    out_index[axis] = 0;
-    in_out_t value = tensor_read(input, shape1, index);
-    in_out_t state = tensor_read(output, shape, out_index);
-    state = apply_add_s(state, value);
-    tensor_write(output, shape, out_index, state);
+left_shape = (axis > 0) ? shape[0:axis-1] : [];
+right_shape = (axis < rank(shape)-1) ? shape[axis+1:rank(shape)-1] : [];
+for_each(left_index in left_shape) {
+    for_each(right_index in right_shape) {
+        acc_t acc = 0;
+        for (i = 0; i < shape1[axis]; i++) {
+            index = flatten(left_index, [i], right_index);
+            acc_t value = tensor_read(input, shape1, index);
+            acc = apply_add_s(acc, value);
+        }
+        out_index = flatten(left_index, [0], right_index);
+        in_out_t result = static_cast<in_out_t>(acc);
+        tensor_write(output, shape, out_index, result);
+    }
 }
 ----
diff --git a/tosa.xml b/tosa.xml
index fa67c1c..e5a0872 100644
--- a/tosa.xml
+++ b/tosa.xml
@@ -1880,7 +1880,7 @@ REDUCE_SUM
-        <description>Input tensor with rank from 1 to 4</description>
+        <description>Input tensor</description>
@@ -1894,17 +1894,18 @@
+
-
-
+
+
-
+
-
+
--
cgit v1.2.1