From 830b43b1d1bd82edd57dee1f5cac12e2b5cf0e04 Mon Sep 17 00:00:00 2001
From: Dominic Symes <dominic.symes@arm.com>
Date: Tue, 9 May 2023 10:14:49 +0100
Subject: Add DIM operator and operations on shape_t values

Shape inference derives the shape of tensors in
the graph from input shapes. Operations such as RESHAPE
may need calculations to derive the new tensor shape.
This patch:

- Adds a DIM operator to get the size of a tensor in
  a given axis as a rank 0 tensor of type shape_t

- Allows RESHAPE to take a 1D shape tensor as input for
  the new shape

- Allows RESIZE, TILE, PAD to take input sizes based
  on shape tensors.

- Allows ADD, SUB, MUL, INTDIV to operate on rank 0
  shape_t tensors

- Allows CONCAT to concatenate 0D shape_t tensors to
  a 1D shape_t tensor

- Adds CONST support for shape_t tensors

In this version of the specification shape tensors must
be resolvable to constants at backend compile time.

Signed-off-by: Dominic Symes <dominic.symes@arm.com>
Change-Id: I484bd44452453b5e05d0d8a82689564587b224e4
---
 chapters/data_layout.adoc  |  23 +++++++--
 chapters/ewise_binary.adoc | 123 ++++++++++++++++++++++++++++-----------------
 chapters/introduction.adoc |  18 +++++--
 chapters/pseudocode.adoc   |   6 +++
 tosa.xml                   |  60 +++++++++++++++++++---
 tosa.xsd                   |   1 +
 6 files changed, 168 insertions(+), 63 deletions(-)

diff --git a/chapters/data_layout.adoc b/chapters/data_layout.adoc
index 2dc7057..2d48eb1 100644
--- a/chapters/data_layout.adoc
+++ b/chapters/data_layout.adoc
@@ -17,13 +17,14 @@ include::{generated}/operators/CONCAT.adoc[]
 
 [source,c]
 ----
-ERROR_IF(axis < 0 || axis >= rank(shapes1[0]));
-ERROR_IF(shape[axis] != sum(shape1[k][axis] for all k))
+ERROR_IF(axis < 0 || axis >= max(1,rank(shapes1[0])));
+ERROR_IF(shape[axis] != sum(shape_dim(shapes1[k], axis) for all k))
+ERROR_IF(in_out_t == shape_t && rank(shape) > 1);
 // The following checks ensure all inputs are compatible for concatenation
 for_each(input_shape in shapes1) {
     ERROR_IF(rank(input_shape) != rank(shapes1[0]));
     for_each(index in input_shape) {
-        ERROR_IF(input_shape[index] != shapes1[0][index] && index != axis);
+        ERROR_IF(index != axis && input_shape[index] != shapes1[0][index]);
     }
 }
 for_each(index1 in shape) {
@@ -32,11 +33,11 @@ for_each(index1 in shape) {
         // Continue to concatenate along axis from each tensor
         // For each output location, we are looking for the
         // appropriate input tensor
-        if (index2[axis] >= 0 && index2[axis] < shapes1[t][axis]) {
+        if (index2[axis] >= 0 && index2[axis] < shape_dim(shapes1[t], axis)) {
             in_out_t value = tensor_read<in_out_t>(input1[t], shapes1[t], index2);
             tensor_write<in_out_t>(output, shape, index1, value);
         }
-        index2[axis] = index2[axis] - shapes1[t][axis];
+        index2[axis] = index2[axis] - shape_dim(shapes1[t], axis);
     }
 }
 
@@ -72,6 +73,18 @@ for_each(index in shape) {
 }
 ----
 
+==== DIM
+
+Returns a rank 0 tensor containing the size of the input tensor along the given axis.
+
+include::{generated}/operators/DIM.adoc[]
+
+[source,c++]
+----
+ERROR_IF(axis < 0 || axis >= rank(shape));
+tensor_write<shape_t>(output, [], [], shape_dim(shape, axis));
+----
+
 ==== RESHAPE
 
 Returns a tensor with the same type/values as the input, with a new shape specified by the shape argument. Reshape may operate on tensors of any rank. No data conversion happens during a reshape operation.
diff --git a/chapters/ewise_binary.adoc b/chapters/ewise_binary.adoc
index 864cf5b..35e454a 100644
--- a/chapters/ewise_binary.adoc
+++ b/chapters/ewise_binary.adoc
@@ -18,14 +18,22 @@ include::{generated}/operators/ADD.adoc[]
 
 [source,c++]
 ----
-ERROR_IF(shape != broadcast_shape(shape1, shape2));
-for_each(index in shape) {
-    dim_t index1 = apply_broadcast(shape, shape1, index);
-    dim_t index2 = apply_broadcast(shape, shape2, index);
-    in_out_t value1 = tensor_read<in_out_t>(input1, shape1, index1);
-    in_out_t value2 = tensor_read<in_out_t>(input2, shape2, index2);
-    in_out_t result = apply_add<in_out_t>(value1, value2);
-    tensor_write<in_out_t>(output, shape, index, result);
+if (in_out_t == shape_t) {
+    ERROR_IF(rank(shape) != 0 || rank(shape1) != 0 || rank(shape2) != 0);
+    shape_t value1 = tensor_read<shape_t>(input1, [], []);
+    shape_t value2 = tensor_read<shape_t>(input2, [], []);
+    shape_t result = apply_add<shape_t>(value1, value2);
+    tensor_write<shape_t>(output, [], [], result);
+} else {
+    ERROR_IF(shape != broadcast_shape(shape1, shape2));
+    for_each(index in shape) {
+        dim_t index1 = apply_broadcast(shape, shape1, index);
+        dim_t index2 = apply_broadcast(shape, shape2, index);
+        in_out_t value1 = tensor_read<in_out_t>(input1, shape1, index1);
+        in_out_t value2 = tensor_read<in_out_t>(input2, shape2, index2);
+        in_out_t result = apply_add<in_out_t>(value1, value2);
+        tensor_write<in_out_t>(output, shape, index, result);
+    }
 }
 ----
 
@@ -131,18 +139,27 @@ include::{generated}/operators/INTDIV.adoc[]
 
 [source,c++]
 ----
-ERROR_IF(shape != broadcast_shape(shape1, shape2));
-for_each(index in shape) {
-    dim_t index1 = apply_broadcast(shape, shape1, index);
-    dim_t index2 = apply_broadcast(shape, shape2, index);
-    in_out_t value1 = tensor_read<in_out_t>(input1, shape1, index1);
-    in_out_t value2 = tensor_read<in_out_t>(input2, shape2, index2);
+if (in_out_t == shape_t) {
+    ERROR_IF(rank(shape) != 0 || rank(shape1) != 0 || rank(shape2) != 0);
+    shape_t value1 = tensor_read<shape_t>(input1, [], []);
+    shape_t value2 = tensor_read<shape_t>(input2, [], []);
     REQUIRE(value2 != 0);
-    // This catches the case where we divide minimum<in_out_t> by -1
-    // which is not representable in two's complement
-    REQUIRE((int64_t)value1 / value2 <= maximum<in_out_t>);
-    in_out_t result = value1 / value2;
-    tensor_write<in_out_t>(output, shape, index, result);
+    shape_t result = value1 / value2;
+    tensor_write<shape_t>(output, [], [], result);
+} else {
+    ERROR_IF(shape != broadcast_shape(shape1, shape2));
+    for_each(index in shape) {
+        dim_t index1 = apply_broadcast(shape, shape1, index);
+        dim_t index2 = apply_broadcast(shape, shape2, index);
+        in_out_t value1 = tensor_read<in_out_t>(input1, shape1, index1);
+        in_out_t value2 = tensor_read<in_out_t>(input2, shape2, index2);
+        REQUIRE(value2 != 0);
+        // This catches the case where we divide minimum<in_out_t> by -1
+        // which is not representable in two's complement
+        REQUIRE((int64_t)value1 / value2 <= maximum<in_out_t>);
+        in_out_t result = value1 / value2;
+        tensor_write<in_out_t>(output, shape, index, result);
+    }
 }
 ----
 
@@ -297,25 +314,33 @@ include::{generated}/operators/MUL.adoc[]
 
 [source,c++]
 ----
-REQUIRE(0 <= shift && shift <= 63);
-REQUIRE(in_t == int32_t || shift == 0);
-ERROR_IF(shape != broadcast_shape(shape1, shape2));
-for_each(index in shape) {
-    dim_t index1 = apply_broadcast(shape, shape1, index);
-    dim_t index2 = apply_broadcast(shape, shape2, index);
-    in_t value1 = tensor_read<in_t>(input1, shape1, index1);
-    in_t value2 = tensor_read<in_t>(input2, shape2, index2);
-    out_t result;
-    if (in_t == int32_t && shift > 0) {
-        int64_t product = (int64_t)value1 * (int64_t)value2;
-        int64_t round   = (int64_t)1 << (shift-1);
-        product = (product + round) >> shift;
-        REQUIRE(product >= minimum<int32_t> && product <= maximum<int32_t>)
-        result = product;
-    } else {
-        result = value1 * value2;  // low 32-bits of result for int32_t
+if (in_t == shape_t) {
+    ERROR_IF(rank(shape) != 0 || rank(shape1) != 0 || rank(shape2) != 0);
+    shape_t value1 = tensor_read<shape_t>(input1, [], []);
+    shape_t value2 = tensor_read<shape_t>(input2, [], []);
+    shape_t result = value1 * value2;
+    tensor_write<shape_t>(output, [], [], result);
+} else {
+    REQUIRE(0 <= shift && shift <= 63);
+    REQUIRE(in_t == int32_t || shift == 0);
+    ERROR_IF(shape != broadcast_shape(shape1, shape2));
+    for_each(index in shape) {
+        dim_t index1 = apply_broadcast(shape, shape1, index);
+        dim_t index2 = apply_broadcast(shape, shape2, index);
+        in_t value1 = tensor_read<in_t>(input1, shape1, index1);
+        in_t value2 = tensor_read<in_t>(input2, shape2, index2);
+        out_t result;
+        if (in_t == int32_t && shift > 0) {
+            int64_t product = (int64_t)value1 * (int64_t)value2;
+            int64_t round   = (int64_t)1 << (shift-1);
+            product = (product + round) >> shift;
+            REQUIRE(product >= minimum<int32_t> && product <= maximum<int32_t>)
+            result = product;
+        } else {
+            result = value1 * value2;  // low 32-bits of result for int32_t
+        }
+        tensor_write<out_t>(output, shape, index, result);
     }
-    tensor_write<out_t>(output, shape, index, result);
 }
 ----
 
@@ -348,14 +373,22 @@ include::{generated}/operators/SUB.adoc[]
 
 [source,c++]
 ----
-ERROR_IF(shape != broadcast_shape(shape1, shape2));
-for_each(index in shape) {
-    dim_t index1 = apply_broadcast(shape, shape1, index);
-    dim_t index2 = apply_broadcast(shape, shape2, index);
-    in_out_t value1 = tensor_read<in_out_t>(input1, shape1, index1);
-    in_out_t value2 = tensor_read<in_out_t>(input2, shape2, index2);
-    in_out_t result = apply_sub<in_out_t>(value1, value2);
-    tensor_write<in_out_t>(output, shape, index, result);
+if (in_out_t == shape_t) {
+    ERROR_IF(rank(shape) != 0 || rank(shape1) != 0 || rank(shape2) != 0);
+    shape_t value1 = tensor_read<shape_t>(input1, [], []);
+    shape_t value2 = tensor_read<shape_t>(input2, [], []);
+    shape_t result = apply_sub<shape_t>(value1, value2);
+    tensor_write<shape_t>(output, [], [], result);
+} else {
+    ERROR_IF(shape != broadcast_shape(shape1, shape2));
+    for_each(index in shape) {
+        dim_t index1 = apply_broadcast(shape, shape1, index);
+        dim_t index2 = apply_broadcast(shape, shape2, index);
+        in_out_t value1 = tensor_read<in_out_t>(input1, shape1, index1);
+        in_out_t value2 = tensor_read<in_out_t>(input2, shape2, index2);
+        in_out_t result = apply_sub<in_out_t>(value1, value2);
+        tensor_write<in_out_t>(output, shape, index, result);
+    }
 }
 ----
 
diff --git a/chapters/introduction.adoc b/chapters/introduction.adoc
index 62a9b2c..0765e95 100644
--- a/chapters/introduction.adoc
+++ b/chapters/introduction.adoc
@@ -359,14 +359,20 @@ Tensors have metadata associated with them that describe characteristics of the
 
 The number of dimensions in a shape is called the rank.
 A tensor with rank equal to zero is permitted.
-In that case, the tensor has a single entry.
+In that case, the tensor has a single entry and is also known as a scalar.
 A tensor shape is an array of integers of size equal to the rank of the tensor.
 Each element in the tensor shape describes the number of elements in the dimension.
 The tensor shape in each dimension must be greater than or equal to 1.
 For tensor access information, see <<Tensor Access Helpers>>.
-Tensor dimensions are given in the pseudocode as type dim_t.
-dim_t is a vector of index_t values, with the length of the vector defining the rank of the tensor.
-Tensor elements are addressed using dim_t values, where each element of the vector indicates the offset of the requested element within the corresponding dimension.
+
+The shape of a tensor of non-zero rank is itself a tensor of rank 1 with elements of type shape_t.
+The size of its single dimension is the rank of the original tensor.
+In this specification a shape tensor means a rank 1 tensor with elements of type shape_t.
+The components of a shape tensor are rank 0 tensors of type shape_t.
+
+Some operations can process rank 0 or rank 1 tensors of type shape_t.
+For these operations, shape_t is permitted as an input or output tensor data type.
+In this version of the specification, shape_t values must be resolvable to constants at backend compile time.
 
 ==== Tensor size limit
 
@@ -379,6 +385,8 @@ This type must be able to hold integers in the range 0 to (1++<<++MAX_LOG2_SIZE)
 This means that the maximum size of a tensor along each dimension is (1<<MAX_LOG2_SIZE) - 1 and therefore the maximum coordinate value is (1<<MAX_LOG2_SIZE) - 2.
 Indices used to access tensors must be non-negative.
 
+The type shape_t, used in shape tensors, must be able to hold integers in the range -(1++<<++MAX_LOG2_SIZE) to (1++<<++MAX_LOG2_SIZE) - 1.
+
 ==== Data Layouts
 
 The following data layouts are supported in TOSA.
@@ -558,7 +566,7 @@ The values to achieve a scaling of 1.0 are shift=30, multiplier=1<<30 for apply_
 int32_t apply_scale_32(int32_t value, int32_t multiplier, int8_t shift, bool_t double_round=false) {
     REQUIRE(multiplier >= 0);
     REQUIRE(2 <= shift && shift <= 62);
-    REQUIRE(value >= (-1 << (shift - 1)) && value < (1 << (shift - 1));
+    REQUIRE(value >= (-1 << (shift - 1)) && value < (1 << (shift - 1)));
     int64_t round = 1 << (shift - 1);
     if (double_round) {
         if (shift > 31 && value >= 0) round += 1<<30;
diff --git a/chapters/pseudocode.adoc b/chapters/pseudocode.adoc
index 146b5d7..c026089 100644
--- a/chapters/pseudocode.adoc
+++ b/chapters/pseudocode.adoc
@@ -94,6 +94,12 @@ size_t tensor_size(dim_t shape) {
     }
     return size;
 }
+
+// Return the size of the tensor in the given axis
+// Returns 1 for any axis >= rank(shape), so a rank 0 tensor has size 1 in every axis
+size_t shape_dim(dim_t shape, int axis) {
+    return (axis >= rank(shape)) ? 1 : shape[axis];
+}
 ----
 
 ==== Tensor Read
diff --git a/tosa.xml b/tosa.xml
index 40128f7..d3889a2 100644
--- a/tosa.xml
+++ b/tosa.xml
@@ -773,6 +773,7 @@
           <type name='in_out_t'/>
         </types>
         <typesupport mode="signed 32" in_out_t="int32_t"/>
+        <typesupport mode="shape" in_out_t="shape_t"/>
         <typesupport mode="fp16" in_out_t="fp16_t" >
           <profile name="MI"/>
           <profile name="MT"/>
@@ -907,6 +908,7 @@
           <type name='in_out_t'/>
         </types>
         <typesupport mode="signed 32" in_out_t="int32_t"/>
+        <typesupport mode="shape" in_out_t="shape_t"/>
       </operator>
       <operator>
         <name>LOGICAL_AND</name>
@@ -1118,6 +1120,7 @@
         <typesupport mode="signed 8" in_t="int8_t" out_t="int32_t"/>
         <typesupport mode="signed 16" in_t="int16_t" out_t="int32_t"/>
         <typesupport mode="signed 32" in_t="int32_t" out_t="int32_t"/>
+        <typesupport mode="shape" in_t="shape_t" out_t="shape_t"/>
         <typesupport mode="fp16" in_t="fp16_t" out_t="fp16_t">
           <profile name="MI"/>
           <profile name="MT"/>
@@ -1185,6 +1188,7 @@
           <type name='in_out_t'/>
         </types>
         <typesupport mode="signed 32" in_out_t="int32_t"/>
+        <typesupport mode="shape" in_out_t="shape_t"/>
         <typesupport mode="fp16" in_out_t="fp16_t" >
           <profile name="MI"/>
           <profile name="MT"/>
@@ -1863,7 +1867,7 @@
         <arguments>
           <argument category="input" name="input1" type="tensor_list_t" shape="shapes1" tensor-element-type="in_out_t">
             <description>List of input tensors. All inputs must have the same rank and data type</description>
-            <rank min="1" max="MAX_RANK"/>
+            <rank min="0" max="MAX_RANK"/>
           </argument>
           <argument category="attribute" name="axis" type="tensor_t" shape="-" tensor-element-type="int32_t">
             <description>Axis along which concatenation is to occur, in range from 0 to rank(shape)-1</description>
@@ -1882,6 +1886,7 @@
         <typesupport mode="signed 8" in_out_t="int8_t"/>
         <typesupport mode="signed 16" in_out_t="int16_t"/>
         <typesupport mode="signed 32" in_out_t="int32_t"/>
+        <typesupport mode="shape" in_out_t="shape_t"/>
         <typesupport mode="fp16" in_out_t="fp16_t">
           <profile name="MI"/>
           <profile name="MT"/>
@@ -1902,7 +1907,7 @@
             <description>Input tensor</description>
             <rank min="1" max="MAX_RANK"/>
           </argument>
-          <argument category="attribute" name="padding" type="tensor_t" shape="[rank(shape1),2]" tensor-element-type="int32_t">
+          <argument category="input" name="padding" type="tensor_t" shape="[rank(shape1),2]" tensor-element-type="shape_t">
             <description>Number of pad elements at the start and end of each dimension</description>
             <rank min="2" max="2"/>
           </argument>
@@ -1936,6 +1941,43 @@
           <profile name="MT"/>
         </typesupport>
       </operator>
+      <operator>
+        <name>DIM</name>
+        <arguments>
+          <argument category="input" name="input1" type="tensor_t" shape="shape" tensor-element-type="in_t">
+            <description>Input tensor</description>
+            <levellimit value="rank(shape)" limit="MAX_RANK"/>
+            <rank min="1" max="MAX_RANK"/>
+          </argument>
+          <argument category="attribute" name="axis" type="tensor_t" shape="-" tensor-element-type="int32_t">
+            <description>Axis in range from 0 to rank(shape) - 1</description>
+            <rank min="0" max="0"/>
+          </argument>
+          <argument category="output" name="output" type="tensor_t" shape="-" tensor-element-type="shape_t" >
+            <description>Output rank 0 tensor giving the size of the shape for the given axis</description>
+            <rank min="0" max="0"/>
+          </argument>
+        </arguments>
+        <types>
+          <type name='in_t'/>
+        </types>
+        <typesupport mode="boolean" in_t="bool_t"/>
+        <typesupport mode="signed 8" in_t="int8_t"/>
+        <typesupport mode="signed 16" in_t="int16_t"/>
+        <typesupport mode="signed 32" in_t="int32_t"/>
+        <typesupport mode="fp16" in_t="fp16_t">
+          <profile name="MI"/>
+          <profile name="MT"/>
+        </typesupport>
+        <typesupport mode="bf16" in_t="bf16_t">
+          <profile name="MI"/>
+          <profile name="MT"/>
+        </typesupport>
+        <typesupport mode="fp32" in_t="fp32_t">
+          <profile name="MI"/>
+          <profile name="MT"/>
+        </typesupport>
+      </operator>
       <operator>
         <name>RESHAPE</name>
         <arguments>
@@ -1944,8 +1986,8 @@
             <levellimit value="rank(shape1)" limit="MAX_RANK"/>
             <rank min="1" max="MAX_RANK"/>
           </argument>
-          <argument category="attribute" name="new_shape" type="tensor_t" shape="[rank(shape)]" tensor-element-type="int32_t">
-            <description>List of values, with each element giving the size of the result tensor for the given dimension. At most one dimension may be given as -1 to automatically calculate the dimension size.</description>
+          <argument category="input" name="shape" type="tensor_t" shape="[rank(shape)]" tensor-element-type="shape_t">
+            <description>1D shape tensor giving the new shape.</description>
             <rank min="1" max="1"/>
           </argument>
           <argument category="output" name="output" type="tensor_t" shape="shape" tensor-element-type="in_out_t">
@@ -1998,6 +2040,7 @@
         <typesupport mode="signed 8" in_out_t="int8_t"/>
         <typesupport mode="signed 16" in_out_t="int16_t"/>
         <typesupport mode="signed 32" in_out_t="int32_t"/>
+        <typesupport mode="shape" in_out_t="shape_t"/>
         <typesupport mode="fp16" in_out_t="fp16_t">
           <profile name="MI"/>
           <profile name="MT"/>
@@ -2060,7 +2103,7 @@ used.</description>
             <description>Input tensor</description>
             <rank min="1" max="MAX_RANK"/>
           </argument>
-          <argument category="attribute" name="multiples" type="tensor_t" shape="[rank(shape1)]" tensor-element-type="int32_t">
+          <argument category="input" name="multiples" type="tensor_t" shape="[rank(shape1)]" tensor-element-type="shape_t">
             <description>Number of times to replicate input1 in each dimension</description>
             <rank min="1" max="1"/>
           </argument>
@@ -2212,17 +2255,17 @@ used.</description>
             <description>Input tensor</description>
             <rank min="4" max="4"/>
           </argument>
-          <argument category="attribute" name="scale" type="tensor_t" shape="[4]" tensor-element-type="int16_t">
+          <argument category="input" name="scale" type="tensor_t" shape="[4]" tensor-element-type="shape_t">
             <description>[scale_y_n, scale_y_d, scale_x_n, scale_x_d]</description>
             <levellimit value="scale_y_n/scale_y_d" limit="MAX_SCALE"/>
             <levellimit value="scale_x_n/scale_x_d" limit="MAX_SCALE"/>
             <rank min="1" max="1"/>
           </argument>
-          <argument category="attribute" name="offset" type="tensor_t" shape="[2]" tensor-element-type="int16_t">
+          <argument category="input" name="offset" type="tensor_t" shape="[2]" tensor-element-type="shape_t">
             <description>[offset_y, offset_x]</description>
             <rank min="1" max="1"/>
           </argument>
-          <argument category="attribute" name="border" type="tensor_t" shape="[2]" tensor-element-type="int16_t">
+          <argument category="input" name="border" type="tensor_t" shape="[2]" tensor-element-type="shape_t">
             <description>[border_y, border_x]</description>
             <rank min="1" max="1"/>
           </argument>
@@ -2464,6 +2507,7 @@ used.</description>
         <typesupport mode="signed 16" out_t="int16_t" />
         <typesupport mode="signed 32" out_t="int32_t" />
         <typesupport mode="signed 48" out_t="int48_t" />
+        <typesupport mode="shape" out_t="shape_t" />
         <typesupport mode="fp16" out_t="fp16_t" >
           <profile name="MI"/>
           <profile name="MT"/>
diff --git a/tosa.xsd b/tosa.xsd
index b6aa162..440dbbd 100644
--- a/tosa.xsd
+++ b/tosa.xsd
@@ -43,6 +43,7 @@
     <xs:enumeration value="fp16_t"/>
     <xs:enumeration value="bf16_t"/>
     <xs:enumeration value="fp32_t"/>
+    <xs:enumeration value="shape_t"/>
   </xs:restriction>
 </xs:simpleType>
 
-- 
cgit v1.2.1