From 5b936a3c5c335baab178edecf4c1da09b9a86707 Mon Sep 17 00:00:00 2001
From: Dominic Symes <dominic.symes@arm.com>
Date: Wed, 1 Mar 2023 11:34:40 +0000
Subject: Main inference compliance testing updates

- Add additional main inference compliance test
data sets for dot product testing in Appendix A.

- Express dot product test criteria in terms of
tensors rather than individual dot products.

- Add per-operation details on test set generation
in Appendix A.

- Clarify compliance vs conformance wording.

- Clarify that the comment in the table of section 1.8.2
on integer saturation applies to the CAST from
floating point to integer operation.

Change-Id: I1b4c4493b02ed7c8a6eb547656c91ca67d4b0e86
Signed-off-by: Dominic Symes <dominic.symes@arm.com>
---
 chapters/appendix_a.adoc   | 274 ++++++++++++++++++++++++++++++++++++++++-----
 chapters/introduction.adoc | 112 ++++++++++++------
 tools/dictionary.dic       |   2 +
 3 files changed, 324 insertions(+), 64 deletions(-)

diff --git a/chapters/appendix_a.adoc b/chapters/appendix_a.adoc
index 33a4f11..f601d5d 100644
--- a/chapters/appendix_a.adoc
+++ b/chapters/appendix_a.adoc
@@ -30,37 +30,257 @@ float set_data(uint32_t set, uint32_t index)
 }
 ----
 
-=== Dot product floating-point test data sets
+=== Main Inference test data generator
 
-Each test set is indexed by a pair (S, N) where:
+This section describes the function tosa_mi_data(S, KS, p, k, i) that generates test data for main inference compliance.
+This function takes the following arguments:
 
-* S is the test set number
-* N is the number of elements in a single test vector
+* S is the test set number which identifies which generator is used
+* KS is the kernel size
+* p is the parameter number: 0 for the first input (usually data) and 1 for the second input (usually weights)
+* k is the index within the kernel in the range 0 \<= k < KS
+* i is the index within the tensor to write
 
-Each test set (S, N) contains multiple tests that statistics are calculated over.
-The parameter T is the number of tests in a given set.
-In the table below, t is the test number within a set in the range 0 to T-1.
+Some test data values are scaled by the bound parameter B which is defined in the table below.
+B is set to be the largest value that is both representable by the input type and such that B*B does not overflow the accumulator precision.
 
-[cols="1,1,1,5,5"]
 |===
-| Set S | N range | T | x[k] formula for k < N | w[k] formula for k < N
-
-| 0
-| 2-25,50,100,1000
-| 10
-| x[k]=set_data(S, 2*t*N+2*k) < 0 ? 0.0 : set_data(S, 2*t*N+2*k+1)
-| w[k]=set_data(S, 2*t*N+2*k) < 0 ? set_data(S, 2*t*N+2*k+1) : 0.0
-
-| 1
-| 2-25,50,100,1000
-| 1000
-| x[k]=2.0*set_data(S,  2*t*N + k)
-| w[k]=2.0*set_data(S, (2*t+1)*N + k)
-
-| 2
-| 2-25,50,100,1000
-| 1000
-| x[0]=1.0, x[k]=set_data(S, 2*t*N + k)/sqrt(N) for k>0
-| w[0]=1.0, w[k]=set_data(S, (2*t+1)*N + k)/sqrt(N) for k>0
+| inputs type | accumulator type | B value
+| fp16        | fp16             | (1<<8)  - (1/8)  = 255.875
+| fp16        | fp32             | (1<<16) - (1<<5) = 65504
+| bf16        | fp32             | (1<<64) - (1<<56)
+| fp32        | fp32             | (1<<64) - (1<<40)
+|===
+
+==== Test set S=0 generator
+
+The aim of this generator is to check that a sum of products with zero gives a zero result.
+
+[cols="1,9"]
+|===
+| p | tosa_mi_data(S, KS, p, k, i) =
+| 0 | set_data(2*S, i) < 0 ? 0.0 : set_data(2*S+1, i)
+| 1 | set_data(2*S, i) < 0 ? set_data(2*S+1, i) : 0.0
+|===
+
+==== Test set S=1
+
+The aim of this test set is to check values with large exponents.
+
+[cols="1,9"]
+|===
+| p | tosa_mi_data(S, KS, p, k, i) =
+| 0 | (B/sqrt(KS))*(0.75 + 0.25*set_data(2*S+0, i))
+| 1 | (B/sqrt(KS))*(0.75 + 0.25*set_data(2*S+1, i))
+|===
+
+==== Test set S=2
+
+The aim of this test set is to check rounding error when accumulating small values onto a large value.
+In this case the small values are of similar magnitude.
+If the implementation changes the order of the sum, then the test data must also be reordered so that the largest values occur first in the sum.
+
+[cols="1,9"]
+|===
+| p | tosa_mi_data(S, KS, p, k, i) =
+| 0 | (k==0) ? 1.0 : set_data(2*S+0, i)/sqrt(KS)
+| 1 | (k==0) ? 1.0 : set_data(2*S+1, i)/sqrt(KS)
+|===
+
+==== Test set S=3
+
+The aim of this test set is to check rounding error when accumulating small values onto a large value.
+In this case the small values are of varying magnitude.
+If the implementation changes the order of the sum, then the test data must also be reordered so that the largest values occur first in the sum.
+
+[cols="1,9"]
+|===
+| p | tosa_mi_data(S, KS, p, k, i) =
+| 0 | (k==0) ? 16.0 : exp(2*set_data(2*S+0, 2*i+0)) * set_data(2*S+0, 2*i+1)
+| 1 | (k==0) ? 16.0 : exp(2*set_data(2*S+1, 2*i+0)) * set_data(2*S+1, 2*i+1)
+|===
+
+==== Test set S=4
+
+The aim of this test set is to check a mixture of zero and non-zero products.
+
+[cols="1,9"]
+|===
+| p | tosa_mi_data(S, KS, p, k, i) =
+| 0 | (k==KS/2) ? +0.5 : (set_data(2*S, i) < 0 ? 0.0 : B*set_data(2*S+1, i))
+| 1 | (k==KS/2) ? -0.5 : (set_data(2*S, i) < 0 ? B*set_data(2*S+1, i) : 0.0)
+|===
 
+==== Test set S=5
+
+The aim of this test set is to check signed inputs of large range.
+
+[cols="1,9"]
 |===
+| p | tosa_mi_data(S, KS, p, k, i) =
+| 0 | (B/sqrt(KS))*set_data(2*S+0, i)
+| 1 | (B/sqrt(KS))*set_data(2*S+1, i)
+|===
+
+=== Main Inference operator test data
+
+For each operator, this section defines how to generate test data for test set S.
+For the results to be statistically significant the operation must calculate at least MIN_DOT_PRODUCTS dot products.
+For most operations this means that the output tensor must have at least MIN_DOT_PRODUCTS output values.
+For most operations batch size can be increased if necessary so that this holds.
+For this version of the specification, MIN_DOT_PRODUCTS is set to 1000.
+
+==== CONV2D
+
+The following generates input test data for test set S.
+For a compliant implementation, the test must pass whenever the attributes satisfy:
+`N*OH*OW*OC >= MIN_DOT_PRODUCTS`
+
+[source,c++]
+----
+KS = KW*KH*IC;
+for (0 <= n < N, 0 <= iy < IH, 0 <= ix < IW, 0 <= ic < IC) {
+  input [ n, iy, ix, ic] = tosa_mi_data(S, KS, 0, ((iy % KH)*KW+(ix % KW))*IC+ic, ((n*IH+iy)*IW+ix)*IC+ic);
+}
+for (0 <= oc < OC, 0 <= ky < KH, 0 <= kx < KW, 0 <= ic < IC) {
+  weight[oc, ky, kx, ic] = tosa_mi_data(S, KS, 1, (ky*KW+kx)*IC+ic, ((oc*KH+ky)*KW+kx)*IC+ic);
+}
+----
+
+==== CONV3D
+
+The following generates input test data for test set S.
+For a compliant implementation, the test must pass whenever the attributes satisfy:
+`N*OD*OH*OW*OC >= MIN_DOT_PRODUCTS`
+
+[source,c++]
+----
+KS = KD*KW*KH*IC;
+for (0 <= n < N, 0 <= id < ID, 0 <= iy < IH, 0 <= ix < IW, 0 <= ic < IC) {
+  input [ n, id, iy, ix, ic] = tosa_mi_data(S, KS, 0, (((id % KD)*KH+(iy % KH))*KW+(ix % KW))*IC+ic, (((n*ID+id)*IH+iy)*IW+ix)*IC+ic);
+}
+for (0 <= oc < OC, 0 <= kd < KD, 0 <= ky < KH, 0 <= kx < KW, 0 <= ic < IC) {
+  weight[oc, kd, ky, kx, ic] = tosa_mi_data(S, KS, 1, ((kd*KH+ky)*KW+kx)*IC+ic, (((oc*KD+kd)*KH+ky)*KW+kx)*IC+ic);
+}
+----
+
+==== DEPTHWISE_CONV2D
+
+The following generates input test data for test set S.
+For a compliant implementation, the test must pass whenever the attributes satisfy:
+`N*OH*OW*C*M >= MIN_DOT_PRODUCTS`
+
+[source,c++]
+----
+KS = KW*KH*C;
+for (0 <= n < N, 0 <= iy < IH, 0 <= ix < IW, 0 <= c < C) {
+  input [ n, iy, ix, c] = tosa_mi_data(S, KS, 0, ((iy % KH)*KW+(ix % KW))*C+c, ((n*IH+iy)*IW+ix)*C+c);
+}
+for (0 <= ky < KH, 0 <= kx < KW, 0 <= c < C, 0 <= m < M) {
+  weight[ky, kx,  c, m] = tosa_mi_data(S, KS, 1, (ky*KW+kx)*C+c, ((ky*KW+kx)*C+c)*M+m);
+}
+----
+
+==== FULLY_CONNECTED
+
+The following generates input test data for test set S.
+For a compliant implementation, the test must pass whenever the attributes satisfy:
+`N*OC >= MIN_DOT_PRODUCTS`
+
+[source,c++]
+----
+KS = IC;
+for (0 <= n < N, 0 <= ic < IC) {
+  input [ n, ic] = tosa_mi_data(S, KS, 0, ic,  n*IC+ic);
+}
+for (0 <= oc < OC, 0 <= ic < IC) {
+  weight[oc, ic] = tosa_mi_data(S, KS, 1, ic, oc*IC+ic);
+}
+----
+
+==== MATMUL
+
+The following generates input test data for test set S.
+For a compliant implementation, the test must pass whenever the attributes satisfy:
+`N*H*W >= MIN_DOT_PRODUCTS`
+
+[source,c++]
+----
+KS = C;
+for (0 <= n < N, 0 <= y < H, 0 <= c < C) {
+  A[n, y, c] = tosa_mi_data(S, KS, 0, c, (n*H+y)*C+c);
+}
+for (0 <= n < N, 0 <= c < C, 0 <= x < W) {
+  B[n, c, x] = tosa_mi_data(S, KS, 1, c, (n*C+c)*W+x);
+}
+----
+
+==== TRANSPOSE_CONV2D
+
+The following generates input test data for test set S.
+For a compliant implementation, the test must pass whenever the attributes satisfy:
+`N*OH*OW*OC >= MIN_DOT_PRODUCTS`
+
+[source,c++]
+----
+KS = KW*KH*IC;
+for (0 <= n < N, 0 <= iy < IH, 0 <= ix < IW, 0 <= ic < IC) {
+  input [ n, iy, ix, ic] = tosa_mi_data(S, KS, 0, ((iy % KH)*KW+(ix % KW))*IC+ic, ((n*IH+iy)*IW+ix)*IC+ic);
+}
+for (0 <= oc < OC, 0 <= ky < KH, 0 <= kx < KW, 0 <= ic < IC) {
+  weight[oc, ky, kx, ic] = tosa_mi_data(S, KS, 1, (ky*KW+kx)*IC+ic, ((oc*KH+ky)*KW+kx)*IC+ic);
+}
+----
+
+==== FFT2D
+
+The following generates input test data for test set S.
+For a compliant implementation, the test must pass whenever the attributes satisfy:
+`N*H*W >= MIN_DOT_PRODUCTS`
+
+[source,c++]
+----
+KS = 2*H*W;
+for (0 <= n < N, 0 <= y < H, 0 <= x < W) {
+  input_real[n, y, x] = tosa_mi_data(S, KS, 0, y*W+x, ((0*N+n)*H+y)*W+x);
+  input_imag[n, y, x] = tosa_mi_data(S, KS, 0, y*W+x, ((1*N+n)*H+y)*W+x);
+}
+for (0 <= y < H, 0 <= x < W, 0 <= m < H, 0 <= n < W) {
+  weight_real[y, x, m, n] = real(exp(2*pi*i*((m*y/H) + (n*x/W))));
+  weight_imag[y, x, m, n] = imag(exp(2*pi*i*((m*y/H) + (n*x/W))));
+}
+----
+
+==== REDUCE_SUM
+
+The following generates input test data for test set S.
+For a compliant implementation, the test must pass whenever the attributes satisfy:
+`tensor_size(shape) >= MIN_DOT_PRODUCTS`
+
+[source,c++]
+----
+KS = shape1[axis];
+for (index in shape1) {
+  input[index] = tosa_mi_data(S, KS, 0, index[axis], tensor_index_to_offset(index));
+}
+for (0 <= c < KS) {
+  weight[c] = 1;
+}
+----
+
+==== AVG_POOL2D
+
+The following generates input test data for test set S.
+For a compliant implementation, the test must pass whenever the attributes satisfy:
+`N*OH*OW*C >= MIN_DOT_PRODUCTS`
+
+[source,c++]
+----
+KS = KH*KW;
+for (0 <= n < N, 0 <= iy < IH, 0 <= ix < IW, 0 <= c < C) {
+  input [ n, iy, ix, c] = tosa_mi_data(S, KS, 0, ((iy % KH)*KW+(ix % KW))*C+c, ((n*IH+iy)*IW+ix)*C+c);
+}
+for (0 <= ky < KH, 0 <= kx < KW, 0 <= c < C, 0 <= m < M) {
+  weight[ky, kx] = 1/KS;
+}
+----
diff --git a/chapters/introduction.adoc b/chapters/introduction.adoc
index f3a6454..66bc9bf 100644
--- a/chapters/introduction.adoc
+++ b/chapters/introduction.adoc
@@ -135,7 +135,10 @@ The TOSA specification is a work in progress.
 === Compliance
 
 This section defines when a TOSA implementation is compliant to a given TOSA specification profile and level.
-The term conformant will mean the same as compliant.
+To be compliant an implementation must achieve the results and accuracy defined by this specification.
+TOSA also defines a set of conformance tests.
+A compliant implementation must pass the conformance tests.
+The conformance tests are not exhaustive, so an implementation that passes the conformance tests may not be compliant if there is a non-compliance that is undetected by the tests.
 
 ==== Base Inference Profile Compliance
 
@@ -177,7 +180,7 @@ bool tosa_test_compliance(tosa_graph_t graph, tosa_list_t input_list, tosa_level
 }
 ----
 
-==== Main Inference Profile
+==== Main Inference Profile Compliance
 
 A Main Inference compliant implementation must satisfy the following:
 
@@ -216,7 +219,7 @@ The following criteria apply to all operations:
 | Operation | Accuracy bound
 
 | <<ARGMAX>>, <<MAX_POOL2D>>, <<CLAMP>>, <<MAXIMUM>>, <<MINIMUM>>, <<ABS>>, <<NEGATE>>, , <<CONST>>, <<IDENTITY>>
-| The result must be exact.
+| Non-NaN results must be exact.
 
 | <<EQUAL>>, <<GREATER>>, <<GREATER_EQUAL>>
 | The result must be exact with: +
@@ -228,19 +231,25 @@ The following criteria apply to all operations:
 The dot product must meet the <<Dot product accuracy requirements>>
 
 | <<FFT2D>>, <<RFFT2D>>
-| Each output can be expressed as a dot product of an input vector with a costant vector. +
+| Each output can be expressed as a dot product of an input vector with a constant coefficient vector. +
 The dot product must meet the <<Dot product accuracy requirements>>
 
-| <<ADD>>, <<MUL>>, <<SUB>>, <<CEIL>>, <<FLOOR>>, <<CAST>>
+| <<ADD>>, <<MUL>>, <<SUB>>, <<CEIL>>, <<FLOOR>>
 | Floating-point result overflows must be set to infinity of the correct sign. +
 Floating-point result underflows must be set to zero of the correct sign. +
-Integer result overflows must be saturated. +
 Addition of infinites of different signs must produce a NaN. +
 Subtraction of infinities of the same sign must produce a NaN. +
 Multiplication of an infinity by a zero must produce a NaN. +
 Otherwise for fp32_t the result must be rounded to the nearest representable value using the round to nearest, ties to even rounding mode. +
 Otherwise for fp16_t and bf16_t the result must be within 0.5 ulp of the mathematical result.
 
+| <<CAST>>
+| Floating-point result overflows must be set to infinity of the correct sign. +
+Floating-point result underflows must be set to zero of the correct sign. +
+Cast from floating-point to integer result overflows must be saturated. +
+Otherwise for fp32_t the result must be rounded to the nearest representable value using the round to nearest, ties to even rounding mode. +
+Otherwise for fp16_t and bf16_t the result must be within 0.5 ulp of the mathematical result.
+
 | <<RECIPROCAL>>
 | If the input is a zero or the result overlows the output must be an infinity of the same sign. +
 If the input is an infinty or the result underflows the output must be a zero of the same sign. +
@@ -264,7 +273,7 @@ Otherwise the result must be within 5 ulp of the mathematical result.
 This dot product must meet the <<Dot product accuracy requirements>>
 
 | <<AVG_POOL2D>>
-| Each output can be expressed as a dot product of an input vector with a vector with elements 1/d where d is the kernel size. +
+| Each output can be expressed as a dot product of an input vector with a vector with elements 1/KS where KS is the kernel size. +
 This dot product must meet the <<Dot product accuracy requirements>>
 
 | <<REDUCE_PRODUCT>>
@@ -277,36 +286,65 @@ where `E = pow(1 + pow(2, -M-1), N) - 1`. In this expression M is the number of
 
 ===== Dot product accuracy requirements
 
-This section gives accuracy constraints for operations where the result is a sum of products of N floating-point inputs:
-
-`y = x[0] * w[0] + x[1] * w[1] + ... + x[N-1] * w[N-1]`
-
-Let M be the number of mantissa bits in the accumulator.
-So M=23 for an `fp32_t` accumulator and M=10 for an `fp16_t` accumulator.
-
-In this section "fp64 arithmetic" refers to double-precision floating-point arithmetic defined by <<Other publications>>[1].
-
-Appendix A, defines a number of <<Dot product floating-point test data sets>>.
-For each data test set (S, N) consisting of T tests the following must hold:
-
-* For each test t in the range 0 to T-1, calculate:
-** `y_imp[t] = x[0] * w[0] + ... + x[N-1] * w[N-1]` calculated by the implementation
-** `y_ref[t] = x[0] * w[0] + ... + x[N-1] * w[N-1]` calculated using fp64 arithmetic
-** `y_bnd[t] = abs(x[0] * w[0]) + ... + abs(x[N-1] * w[N-1])` calculated using fp64 arithmetic
-* if `y_bnd[t] == 0` then
-** `y_imp[t]` must be zero and set `y_err[t] = 0`
-* if `y_bnd[t] > 0` then set:
-** `y_err[t] = (y_imp[t] - y_ref[t]) * (1<<(M+1)) / y_bnd[t]` calculated using fp64 arithmetic
-* For each test t the following must be satisfied:
-** `y_ref[t], y_bnd[t], y_imp[t]` must be finite
-** `abs(y_err[t]) \<= N`
-* Calculate the sum of y_err using fp64 arithmetic:
-** `y_err_sum   = y_err[0] + .... + y_err[T-1]`
-* Calculate the sum of y_err squared using fp64 arithmetic:
-** `y_err_sumsq = y_err[0] * y_err[0] + ... + y_err[T-1] * y_err[T-1]`
-* The error sum and sum squares must satisfy the following. The first equation bounds the bias and the second the error variance.
-** `abs(y_err_sum) \<= 2*sqrt(N*T)`
-** `y_err_sumsq \<= 0.4*N*T`
+This section assumes an operation acting on two tensors named 'input' and 'weight'.
+Each output tensor element can be expressed as a dot product of elements between the input and weight tensors.
+The dot product has length KS, the kernel size.
+Note: KS is defined for each relevant operator in the appendix section <<Main Inference operator test data>>.
+
+In other words each output element `out` can be expressed as a dot product between input elements `in[k]` and weight elements `w[k]`:
+
+`out = in[0] * w[0] + in[1] * w[1] + ... + in[KS-1] * w[KS-1]`
+
+The positions of `in[k]` and `w[k]` in the input and weight tensors depend on the operation being performed (for example a convolution).
+
+This section defines the accuracy required for these operations.
+The term "fp64 arithmetic" refers to double-precision floating-point arithmetic defined by <<Other publications>>[1].
+
+For an operation with given sizes and attributes to be compliant the following must hold for each data set S defined in <<Appendix A>>:
+
+* Let input be the input tensor generated by <<Main Inference operator test data>> for test set S
+* Let weight be the weight tensor generated by <<Main Inference operator test data>> for test set S
+* Let output_ref be the output tensor calculated by the operation using fp64 arithmetic
+* Let output_imp be the output tensor calculated by the implementation to test
+* Let input_abs  be the input  tensor with each element replaced with its absolute value
+* Let weight_abs be the weight tensor with each element replaced with its absolute value
+* Let output_bnd be the output tensor calculated using fp64 arithmetic on input_abs and weight_abs
+
+The following checks must then pass:
+
+[source,c++]
+----
+size_t T = tensor_size(output_shape);  // number of dot product results
+fp64_t out_err_sum = 0.0;
+fp64_t out_err_sumsq = 0.0;
+fp64_t acc_prec;  // 1<<(M+1) where M is the number of mantissa bits
+switch (acc_t) {
+    case fp32_t: acc_prec = (fp64_t)(1<<24); break;
+    case fp16_t: acc_prec = (fp64_t)(1<<11); break;
+    default: ERROR_IF(true);
+}
+for_each(index in output_shape) {
+    fp64_t out_bnd = tensor_read<fp64_t>(output_bnd, output_shape, index);
+    fp64_t out_ref = tensor_read<fp64_t>(output_ref, output_shape, index);
+    acc_t  out_imp = tensor_read<acc_t> (output_imp, output_shape, index);
+    fp64_t out_err;
+    if (out_bnd == 0.0) {
+        REQUIRE(out_ref == 0.0 && out_imp == 0.0);
+        out_err = 0.0;
+    } else {  // out_bnd > 0.0
+        out_err = ((fp64_t)out_imp - out_ref)*acc_prec/out_bnd;
+        REQUIRE(abs(out_err) <= KS);
+    }
+    out_err_sum   += out_err;
+    out_err_sumsq += out_err * out_err;
+}
+if (S!=1 && S!=2) {
+    // check output error bias magnitude for data sets S which are not positive biased
+    REQUIRE(abs(out_err_sum) <= 2*sqrt(KS*T));
+}
+// check output error variance magnitude
+REQUIRE(out_err_sumsq <= 0.4*KS*T);
+----
 
 === Tensor Definitions
 
diff --git a/tools/dictionary.dic b/tools/dictionary.dic
index 325db9b..da2e28c 100644
--- a/tools/dictionary.dic
+++ b/tools/dictionary.dic
@@ -1,6 +1,7 @@
 personal_ws-1.1 en 500
 activations
 adoc
+acc
 ARGMAX
 AsciiDoc
 BILINEAR
@@ -44,6 +45,7 @@ MUL
 multipler
 NaN
 NPUs
+OC
 pre
 precisions
 pseudocode
-- 
cgit v1.2.1