2 files changed, 8 insertions, 2 deletions
diff --git a/chapters/ewise_binary.adoc b/chapters/ewise_binary.adoc
index 4da63de..241ca32 100644
--- a/chapters/ewise_binary.adoc
+++ b/chapters/ewise_binary.adoc
@@ -479,6 +479,7 @@ Elementwise multiplication (Hadamard product) of input tensor 0 and input tensor
 
 |Input|in_t*|input1|shape1|Input tensor
 |Input|in_t*|input2|shape2|Input tensor with the same rank as Input 0
+|Attribute|uint6_t|shift|-|Result right shift (int32 data type only)
 |Output|out_t*|output|shape|Output tensor with broadcast shape if necessary
 |===
 
@@ -486,12 +487,17 @@ Elementwise multiplication (Hadamard product) of input tensor 0 and input tensor
 
 [source,c]
 ----
+assert(in_t==int32_t || shift==0);
 for_each (index in shape) {
     index1 = apply_broadcast(shape, shape1, index)
     index2 = apply_broadcast(shape, shape2, index)
     in_t value1 = tensor_read<in_t>(input1, shape1, index1)
     in_t value2 = tensor_read<in_t>(input2, shape2, index2)
-    in_t acc = value1 * value2 // takes low bits for int32_t
+    if (shift>0) {
+        out_t acc = apply_scale_32(value1, value2, shift)  // apply_scale_32 asserts value2 >= 0 and 2 <= shift <= 62
+    } else {
+        out_t acc = value1 * value2;  // low 32 bits of the result for int32_t
+    }
     tensor_write<out_t>(output, shape, index, acc)
 }
 ----
diff --git a/chapters/introduction.adoc b/chapters/introduction.adoc
index 09a21dd..5134330 100644
--- a/chapters/introduction.adoc
+++ b/chapters/introduction.adoc
@@ -220,7 +220,7 @@ Most operations in TOSA do not contain quantization scaling in the operation, bu
 The apply_scale functions provide a scaling of approximately (multiplier * 2^-shift^). The shift range is limited to allow a variety of implementations. The upper limit of 62 allows it to be decomposed as two right shifts of 31. The lower limit removes special cases in the rounding. These restrictions have little practical impact since the shift value to achieve a scaling of 1.0 is 30 for apply_scale_32 with multiplier=1<<30 and 14 for apply_scale_16 with scale=1<<14. It follows that a scaling range of 2^+12^ down to 2^-32^ is supported for both functions with normalized multiplier. (Smaller scales can be obtained by denormalizing the multiplier).
 
 ....
-int32_t apply_scale_32(int32_t value, int32_t multipler, uint6_t shift, bool double_round) {
+int32_t apply_scale_32(int32_t value, int32_t multiplier, uint6_t shift, bool double_round=false) {
   assert(multiplier >= 0);
   assert(2 <= shift && shift <= 62);
   int64_t round = 1 << (shift - 1);