-rw-r--r--  chapters/appendix_a.adoc   | 34
-rw-r--r--  chapters/introduction.adoc | 76
-rw-r--r--  tosa.xml                   | 40
3 files changed, 120 insertions(+), 30 deletions(-)
diff --git a/chapters/appendix_a.adoc b/chapters/appendix_a.adoc
index d4235e8..ba3b6bb 100644
--- a/chapters/appendix_a.adoc
+++ b/chapters/appendix_a.adoc
@@ -37,7 +37,10 @@ This function takes the following arguments:
* S is the test set number which identifies which generator is used
* KS is the kernel size
-* p is the parameter number of 0 for the first input (usually data) and 1 for the second input (usually weights)
+* p is the parameter number of:
+** 0 for the first input (usually data)
+** 1 for the second input (usually weights)
+** 2 for the third input if present (usually bias)
* k is the index within the kernel in the range 0 \<= k < KS
* i is the index within the tensor to write
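The tables below define `tosa_mi_data` for each test set. As a reading aid, the interface these arguments describe can be sketched as follows (the concrete types are assumptions for illustration, not part of the specification):

[source,c++]
----
#include <cstdint>

// Illustrative declaration only; argument types are assumed.
double tosa_mi_data(int32_t S,   // test set number selecting the generator
                    int32_t KS,  // kernel size
                    int32_t p,   // parameter number: 0 input, 1 weight, 2 bias (if present)
                    int32_t k,   // index within the kernel, 0 <= k < KS
                    int64_t i);  // index within the tensor to write
----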
@@ -61,6 +64,7 @@ The aim of this generator is to check that sum of products with zero gives zero
| p | tosa_mi_data(S, KS, p, k, i) =
| 0 | set_data(2*S, i) < 0 ? 0.0 : set_data(2*S+1, i)
| 1 | set_data(2*S, i) < 0 ? set_data(2*S+1, i) : 0.0
+| 2 | 0.0
|===
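For illustration, the S=0 rule can be written as the following sketch, where `set_data(set, i)` is the pseudo-random helper referenced by the tables (its declaration here is an assumption):

[source,c++]
----
#include <cstdint>

double set_data(int32_t set, int64_t i);  // helper referenced by the tables (declaration assumed)

// Sketch of the S=0 generator only. The sign of set_data(2*S, i) selects
// whether the input-side or the weight-side value is zeroed; the bias is
// all zeros for this test set.
double tosa_mi_data_s0(int32_t S, int32_t p, int64_t i) {
    if (p == 2) return 0.0;                                                // bias
    if (p == 0) return set_data(2*S, i) < 0 ? 0.0 : set_data(2*S + 1, i);  // input
    return set_data(2*S, i) < 0 ? set_data(2*S + 1, i) : 0.0;              // weight
}
----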
==== Test set S=1
@@ -70,8 +74,9 @@ The aim of this test set is to check values with large exponents.
[cols="1,9"]
|===
| p | tosa_mi_data(S, KS, p, k, i) =
-| 0 | (B/sqrt(KS))*(0.75 + 0.25*set_data(2*S+0, i))
-| 1 | (B/sqrt(KS))*(0.75 + 0.25*set_data(2*S+1, i))
+| 0 | (B/sqrt(KS+1))*(0.75 + 0.25*set_data(3*S+0, i))
+| 1 | (B/sqrt(KS+1))*(0.75 + 0.25*set_data(3*S+1, i))
+| 2 | (B*B/(KS+1))*(0.75 + 0.25*set_data(3*S+2, i))
|===
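Assuming `|set_data(s, i)| <= 1`, the factor `0.75 + 0.25*set_data(...)` lies in `[0.5, 1.0]`, so the generated magnitudes are bounded by `|in[k]| <= B/sqrt(KS+1)`, `|w[k]| <= B/sqrt(KS+1)`, and `|bias| <= B*B/(KS+1)`. The accumulation is then bounded as:

`|in[k] * w[k]| <= (B/sqrt(KS+1)) * (B/sqrt(KS+1)) = B*B/(KS+1)`

`|in[0]*w[0] + ... + in[KS-1]*w[KS-1] + bias| <= KS*B*B/(KS+1) + B*B/(KS+1) = B*B`

Dividing by `sqrt(KS+1)` rather than `sqrt(KS)` leaves headroom for the bias term in the sum.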
==== Test set S=2
@@ -85,6 +90,7 @@ If the implementation changes the order of the sum, then the test data must also
| p | tosa_mi_data(S, KS, p, k, i) =
| 0 | (k==0) ? 1.0 : set_data(2*S+0, i)/sqrt(KS)
| 1 | (k==0) ? 1.0 : set_data(2*S+1, i)/sqrt(KS)
+| 2 | 0.0
|===
==== Test set S=3
@@ -98,6 +104,7 @@ If the implementation changes the order of the sum, then the test data must also
| p | tosa_mi_data(S, KS, p, k, i) =
| 0 | (k==0) ? 16.0 : exp(2*set_data(2*S+0, 2*i+0)) * set_data(2*S+0, 2*i+1)
| 1 | (k==0) ? 16.0 : exp(2*set_data(2*S+1, 2*i+0)) * set_data(2*S+1, 2*i+1)
+| 2 | 0.0
|===
==== Test set S=4
@@ -109,6 +116,7 @@ The aim of this test set is to check a mixture of zero and non-zero products.
| p | tosa_mi_data(S, KS, p, k, i) =
| 0 | (k==KS/2) ? +0.5 : (set_data(2*S, i) < 0 ? 0.0 : (B/sqrt(KS))*set_data(2*S+1, i))
| 1 | (k==KS/2) ? -0.5 : (set_data(2*S, i) < 0 ? (B/sqrt(KS))*set_data(2*S+1, i) : 0.0)
+| 2 | 0.0
|===
==== Test set S=5
@@ -118,8 +126,9 @@ The aim of this test set is to check signed inputs of large range.
[cols="1,9"]
|===
| p | tosa_mi_data(S, KS, p, k, i) =
-| 0 | (B/sqrt(KS))*set_data(2*S+0, i)
-| 1 | (B/sqrt(KS))*set_data(2*S+1, i)
+| 0 | (B/sqrt(KS+1))*set_data(3*S+0, i)
+| 1 | (B/sqrt(KS+1))*set_data(3*S+1, i)
+| 2 | (B*B/(KS+1))*set_data(3*S+2, i)
|===
=== Main Inference operator test data
@@ -145,6 +154,9 @@ for (0 <= n < N, 0 <= iy < IH, 0 <= ix < IW, 0 <= ic < IC) {
for (0 <= oc < OC, 0 <= ky < KH, 0 <= kx < KW, 0 <= ic < IC) {
weight[oc, ky, kx, ic] = tosa_mi_data(S, KS, 1, (ky*KW+kx)*IC+ic, ((oc*KH+ky)*KW+kx)*IC+ic);
}
+for (0 <= oc < OC) {
+ bias[oc] = tosa_mi_data(S, KS, 2, oc);
+}
----
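For illustration, the weight and bias loops above can be rendered as plain C++ over flat, row-major buffers. The bias call below passes `oc` for both the kernel index and the tensor index; this is an assumption made for the sketch, since the `p == 2` rows of the test-set tables do not depend on the kernel index.

[source,c++]
----
#include <cstdint>
#include <vector>

double tosa_mi_data(int32_t S, int32_t KS, int32_t p, int32_t k, int64_t i);  // assumed declaration

void generate_conv2d_weight_bias(int32_t S, int32_t KS,
                                 int32_t OC, int32_t KH, int32_t KW, int32_t IC,
                                 std::vector<double>& weight,  // size OC*KH*KW*IC, row-major
                                 std::vector<double>& bias) {  // size OC
    for (int32_t oc = 0; oc < OC; oc++)
        for (int32_t ky = 0; ky < KH; ky++)
            for (int32_t kx = 0; kx < KW; kx++)
                for (int32_t ic = 0; ic < IC; ic++) {
                    int32_t k = (ky*KW + kx)*IC + ic;            // index within the kernel
                    int64_t i = ((oc*KH + ky)*KW + kx)*IC + ic;  // index within the weight tensor
                    weight[i] = tosa_mi_data(S, KS, 1, k, i);
                }
    for (int32_t oc = 0; oc < OC; oc++) {
        bias[oc] = tosa_mi_data(S, KS, 2, oc, oc);  // kernel index taken as oc for the sketch
    }
}
----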
==== CONV3D
@@ -162,6 +174,9 @@ for (0 <= n < N, 0 <= id < UD, 0 <= iy < IH, 0 <= ix < IW, 0 <= ic < IC) {
for (0 <= oc < OC, 0 <= kd < KD, 0 <= ky < KH, 0 <= kx < KW, 0 <= ic < IC) {
weight[oc, kd, ky, kx, ic] = tosa_mi_data(S, KS, 1, ((kd*KH+ky)*KW+kx)*IC+ic, (((oc*KD+kd)*KH+ky)*KW+kx)*IC+ic);
}
+for (0 <= oc < OC) {
+ bias[oc] = tosa_mi_data(S, KS, 2, oc);
+}
----
==== DEPTHWISE_CONV2D
@@ -179,6 +194,9 @@ for (0 <= n < N, 0 <= iy < IH, 0 <= ix < IW, 0 <= c < C) {
for (0 <= ky < KH, 0 <= kx < KW, 0 <= c < C, 0 <= m < M) {
weight[ky, kx, c, m] = tosa_mi_data(S, KS, 1, (ky*KW+kx), ((ky*KW+kx)*C+c)*M+m);
}
+for (0 <= oc < C*M) {
+ bias[oc] = tosa_mi_data(S, KS, 2, oc);
+}
----
==== FULLY_CONNECTED
@@ -196,6 +214,9 @@ for (0 <= n < N, 0 <= ic < IC) {
for (0 <= oc < OC, 0 <= ic < IC) {
weight[oc, ic] = tosa_mi_data(S, KS, 1, ic, oc*IC+ic);
}
+for (0 <= oc < OC) {
+ bias[oc] = tosa_mi_data(S, KS, 2, oc);
+}
----
==== MATMUL
@@ -230,6 +251,9 @@ for (0 <= n < N, 0 <= iy < IH, 0 <= ix < IW, 0 <= ic < IC) {
for (0 <= oc < OC, 0 <= ky < KH, 0 <= kx < KW, 0 <= ic < IC) {
weight[oc, ky, kx, ic] = tosa_mi_data(S, KS, 1, (ky*KW+kx)*IC+ic, ((oc*KH+ky)*KW+kx)*IC+ic);
}
+for (0 <= oc < OC) {
+ bias[oc] = tosa_mi_data(S, KS, 2, oc);
+}
----
==== FFT2D
diff --git a/chapters/introduction.adoc b/chapters/introduction.adoc
index 0765e95..d6f7bf9 100644
--- a/chapters/introduction.adoc
+++ b/chapters/introduction.adoc
@@ -287,64 +287,90 @@ where `E = pow(1 + pow(2, -M-1), N) - 1`. In this expression M is the number of
===== Dot product accuracy requirements
-This section assumes an operation acting on two tensors named 'input' and 'weight'.
-Each output tensor element can be expressed as a dot product of elements between the input and weight tensors.
+This section assumes an operation acting on tensors named 'input', 'weight' and optionally 'bias'.
+Each output tensor element can be expressed as a dot product of elements between the 'input' and 'weight' tensors with optional bias addition.
The dot product has length KS, the kernel size.
+If the operation does not specify a bias then 'bias' is taken to be zero in this section.
Note: KS is defined for each relevant operator in the appendix section <<Main Inference operator test data>>.
-In other words each output element `out` can be expressed as a dot product between input elements `in[k]` and weight elements `w[k]`:
+In other words, each output element `out` can be expressed as a dot product between input elements `in[k]` and weight elements `w[k]`, plus a bias `b`:
-`out = in[0] * w[0] + in[1] * w[1] + ... + in[KS-1] * w[KS-1]`
+`out = in[0] * w[0] + in[1] * w[1] + ... + in[KS-1] * w[KS-1] + b`
-The positions of `in[k]` and `w[k]` in the input and weight tensors depends on the operation being performed (for example a convolution).
+The positions of `in[k]`, `w[k]`, and `b` in the input, weight, and bias tensors depend on the operation being performed (for example, a convolution).
This section defines the accuracy required for these operations.
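As a minimal per-element sketch (the gathering of `in[k]`, `w[k]`, and `b` for a given output position is assumed to have been done already), the fp64 reference value of one output element is:

[source,c++]
----
// fp64 reference for a single output element: dot product of length KS plus bias.
// The fp64_t type and the gathered in[], w[], b are assumptions of this sketch.
fp64_t dot_product_ref(const fp64_t in[], const fp64_t w[], fp64_t b, int32_t KS) {
    fp64_t acc = b;
    for (int32_t k = 0; k < KS; k++) {
        acc += in[k] * w[k];
    }
    return acc;
}
----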
-The term "fp64 arithmetic" refers to double-precision floating-point arithmetic defined by <<Other publications>>[1].
+In this section:
-For an operation with given sizes and attributes to be compliant the following must hold for each data set S defined in <<Appendix A>>:
+* "fp64 arithmetic" refers to double-precision floating-point arithmetic defined by IEEE 754 (<<Other publications>>[1])
+* `operation_fp64()` is an fp64 reference implementation of the operation
+* `operation_imp()` is the implementation under test
+* `local_bound` is defined as follows:
+** For operations with a `local_bound` attribute, it is the value of that optional attribute, which defaults to false
+** For operations without a `local_bound` attribute, the value is true
-* Let input be the input tensor generated by <<Main Inference operator test data>> for test set S
-* Let weight be the weight tensor generated by <<Main Inference operator test data>> for test set S
-* Let output_ref be the output tensor calculated by the operation using fp64 arithemic
-* Let output_imp be the output tensor calculated by the implementation to test
-* Let input_abs be the input tensor with each element replaced with its absolute value
-* Let weight_abs be the weight tensor with each element replaced with its absolute value
-* Let output_bnd be the output tensor calculated using fp64 arithmetic on input_abs and weight_abs
+The checks described in the following code must pass for the following data sets:
-The following checks must then pass:
+* Data sets defined for the operation in Appendix A <<Main Inference operator test data>>.
+* Data sets that have at least MIN_DOT_PRODUCT different output values. For these data sets we take S=-1.
[source,c++]
----
+output_ref = operation_fp64(input, weight, bias);
+output_imp = operation_imp (input, weight, bias);
+input_abs = abs(input); // Element-wise absolute
+weight_abs = abs(weight); // Element-wise absolute
+bias_abs = abs(bias); // Element-wise absolute
+if (!local_bound) {
+ input_abs_max = max_value(input_abs); // maximum over all elements
+ for_each(index in shape(input_abs)) {
+ input_abs[index] = input_abs_max; // set all entries to global maximum
+ }
+}
+output_bnd = operation_fp64(input_abs, weight_abs, bias_abs);
+
size_t T = tensor_size(output_shape) // number dot product results
+size_t ksb = (max_value(bias_abs) > 0) ? (KS + 1) : KS; // kernel size and bias
fp64_t out_err_sum = 0.0;
fp64_t out_err_sumsq = 0.0;
-fp64_t acc_prec; // 1<<(M+1) where M is the number of mantissa bits
+fp64_t acc_prec; // 1<<(M+1) where M is the number of mantissa bits
+fp64_t acc_min_normal; // accumulator minimum normal greater than zero
+fp64_t two_m63 = -1.0/(fp64_t)((int64_t)-1<<63); // pow(2, -63)
switch (acc_t) {
- case fp32_t: acc_prec = (fp64_t)(1<<24); break;
- case fp16_t: acc_prec = (fp64_t)(1<<11); break;
- default: ERROR_IF(true);
+ case fp32_t: acc_prec = (fp64_t)(1<<24); // pow(2, 24)
+ acc_min_normal = two_m63 * two_m63; // pow(2, -126)
+ break;
+ case fp16_t: acc_prec = (fp64_t)(1<<11); // pow(2, 11)
+ acc_min_normal = 1.0/(fp64_t)(1<<14); // pow(2, -14)
+ break;
+ default: ERROR_IF(true);
}
for_each(index in output_shape) {
fp64_t out_bnd = tensor_read<fp64_t>(output_bnd, output_shape, index);
fp64_t out_ref = tensor_read<fp64_t>(output_ref, output_shape, index);
acc_t out_imp = tensor_read<acc_t> (output_imp, output_shape, index);
fp64_t out_err;
- if (out_bnd == 0.0) {
+ if ((acc_t)out_bnd == infinity) {
+ // dot product can overflow and there is no accuracy limit
+ out_err = 0.0;
+ } else if (out_bnd == 0.0) {
REQUIRE(out_ref == 0.0 && out_imp == 0.0);
out_err = 0.0;
- } else { // out_bnd > 0.0
+ } else { // 0.0 < out_bnd < infinity
+ out_bnd = max(out_bnd, acc_min_normal);
out_err = ((fp64_t)out_imp - out_ref)*acc_prec/out_bnd;
- REQUIRE(abs(out_err) <= KS);
+ REQUIRE(abs(out_err) <= ksb);
}
out_err_sum += out_err;
out_err_sumsq += out_err * out_err;
}
-if (S!=1 && S!=2) {
+if (input and weights are data set S with 3 <= S <= 5) {
// check output error bias magnitude for data sets S which are not positive biased
- REQUIRE(abs(out_err_sum) <= 2*sqrt(KS*T));
+ REQUIRE(abs(out_err_sum) <= 2*sqrt(ksb*T));
}
// check output error variance magnitude
-REQUIRE(out_err_sumsq <= 0.4*KS*T)
+REQUIRE(out_err_sumsq <= 0.4*ksb*T);
----
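As a worked illustration with assumed values (not taken from the specification): let `acc_t` be `fp32_t`, so `acc_prec = pow(2, 24)`; let `KS = 16` with a non-zero bias, so `ksb = 17`; and for one output element let `out_bnd = 8.0`, `out_ref = 1.0`, and `out_imp = 1.0 + 8*pow(2, -24)`. Then:

`out_err = ((1.0 + 8*pow(2, -24)) - 1.0) * pow(2, 24) / 8.0 = 1.0`

which satisfies `abs(out_err) <= ksb`, so this element passes the per-element check and contributes `1.0` to `out_err_sum` and `out_err_sumsq`.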
=== Tensor Definitions
diff --git a/tosa.xml b/tosa.xml
index d3889a2..53f8000 100644
--- a/tosa.xml
+++ b/tosa.xml
@@ -159,6 +159,14 @@
<description>Weight zero point. Must be zero for non-int8 types.</description>
<rank min="0" max="0"/>
</argument>
+ <argument category="attribute" name="local_bound" type="tensor_t" shape="-" tensor-element-type="bool_t" optional="true">
+ <description>
+ This optional attribute affects the floating-point compliance error bound.
+ The default of false allows for direct and transform-based fast convolution algorithms.
+ Only set to true if direct dot-product calculation precision is required.
+ </description>
+ <rank min="0" max="0"/>
+ </argument>
<argument category="output" name="output" type="tensor_t" shape="[N,OH,OW,OC]" tensor-element-type="out_t">
<description>Output tensor</description>
<rank min="4" max="4"/>
@@ -236,6 +244,14 @@
<description>Weight zero point. Must be zero for non-int8 types.</description>
<rank min="0" max="0"/>
</argument>
+ <argument category="attribute" name="local_bound" type="tensor_t" shape="-" tensor-element-type="bool_t" optional="true">
+ <description>
+ This optional attribute affects the floating-point compliance error bound.
+ The default of false allows for direct and transform-based fast convolution algorithms.
+ Only set to true if direct dot-product calculation precision is required.
+ </description>
+ <rank min="0" max="0"/>
+ </argument>
<argument category="output" name="output" type="tensor_t" shape="[N,OD,OH,OW,OC]" tensor-element-type="out_t">
<description>Output tensor</description>
<rank min="5" max="5"/>
@@ -309,6 +325,14 @@
<description>Weight zero point. Must be zero for non-int8 types.</description>
<rank min="0" max="0"/>
</argument>
+ <argument category="attribute" name="local_bound" type="tensor_t" shape="-" tensor-element-type="bool_t" optional="true">
+ <description>
+ This optional attribute affects the floating-point compliance error bound.
+ The default of false allows for direct and transform-based fast convolution algorithms.
+ Only set to true if direct dot-product calculation precision is required.
+ </description>
+ <rank min="0" max="0"/>
+ </argument>
<argument category="output" name="output" type="tensor_t" shape="[N,OH,OW,C*M]" tensor-element-type="out_t">
<description>Output tensor</description>
<rank min="4" max="4"/>
@@ -360,6 +384,14 @@
<description>Real part of the complex output.</description>
<rank min="3" max="3"/>
</argument>
+ <argument category="attribute" name="local_bound" type="tensor_t" shape="-" tensor-element-type="bool_t" optional="true">
+ <description>
+ This optional attribute affects the floating-point compliance error bound.
+ The default of false allows for direct and transform-based fast convolution algorithms.
+ Only set to true if direct dot-product calculation precision is required.
+ </description>
+ <rank min="0" max="0"/>
+ </argument>
<argument category="output" name="output_imag" type="tensor_t" shape="[N,H,W]" tensor-element-type="in_out_t">
<description>Imaginary part of the complex output.</description>
<rank min="3" max="3"/>
@@ -592,6 +624,14 @@
<description>Weight zero point. Must be zero for non-int8 types.</description>
<rank min="0" max="0"/>
</argument>
+ <argument category="attribute" name="local_bound" type="tensor_t" shape="-" tensor-element-type="bool_t" optional="true">
+ <description>
+ This optional attribute affects the floating-point compliance error bound.
+ The default of false allows for direct and transform-based fast convolution algorithms.
+ Only set to true if direct dot-product calculation precision is required.
+ </description>
+ <rank min="0" max="0"/>
+ </argument>
<argument category="output" name="output" type="tensor_t" shape="[N,OH,OW,OC]" tensor-element-type="out_t">
<description>Output tensor</description>
<rank min="4" max="4"/>