Diffstat (limited to 'ethosu/vela')
-rw-r--r--  ethosu/vela/register_command_stream_generator.py |  5 ++++-
-rw-r--r--  ethosu/vela/scaling.py                           |  9 +++++++++
-rw-r--r--  ethosu/vela/weight_compressor.py                 | 11 +++++++----
3 files changed, 20 insertions(+), 5 deletions(-)
diff --git a/ethosu/vela/register_command_stream_generator.py b/ethosu/vela/register_command_stream_generator.py
index faae2cf3..5fa71aa7 100644
--- a/ethosu/vela/register_command_stream_generator.py
+++ b/ethosu/vela/register_command_stream_generator.py
@@ -27,7 +27,7 @@ from .ethos_u55_regs.ethos_u55_regs import *
 from .tensor import MemArea, TensorBlockTraversal
 from .operation import NpuBlockType
 from .numeric_util import quantise_float32, round_up, round_away_zero, round_up_to_int, clamp_sigmoid, clamp_tanh
-from .data_type import BaseType
+from .data_type import BaseType, DataType
 import numpy as np
 from .shared_buffer_allocation import SharedBufferAllocation
 from .architecture_features import SharedBufferArea, SHRAMElements, ArchitectureFeatures
@@ -615,6 +615,9 @@ def generate_register_command_stream(nng, sg, arch, verbose=False):
         else:  # Convolution
             assert cmd.weight_tensor.block_traversal != TensorBlockTraversal.Default
+            # Reduced precision quantization and natural rounding used for int16
+            if cmd.ifm_tensor.dtype == DataType.int16:
+                rounding_mode = rounding.NATURAL
             emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, cmd.weight_tensor.shape[0] - 1)
             emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, cmd.weight_tensor.shape[1] - 1)
             if cmd.weight_tensor.block_traversal == TensorBlockTraversal.PartKernelFirst:
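
Note: the hunk above switches the convolution datapath to natural rounding when the IFM is int16. A minimal illustrative sketch of that rounding, assuming rounding.NATURAL means round-to-nearest with ties resolved upward (add half an LSB, then arithmetic right shift); this is not vela code, just the usual fixed-point idiom:

    def natural_round_shift(value, shift):
        # Add half an LSB, then arithmetic right shift; ties round toward +inf.
        return (value + (1 << (shift - 1))) >> shift

    print(natural_round_shift(5, 1))   # 2.5 -> 3
    print(natural_round_shift(-5, 1))  # -2.5 -> -2 (Python's >> floors)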
diff --git a/ethosu/vela/scaling.py b/ethosu/vela/scaling.py
index b255f938..785cddc6 100644
--- a/ethosu/vela/scaling.py
+++ b/ethosu/vela/scaling.py
@@ -42,6 +42,15 @@ def quantise_scale(scale):
     return significand_q31, shift
 
 
+# Reduced precision quantization for int16
+def reduced_quantise_scale(scale):
+    multiplier, shift = quantise_scale(scale)
+    reduced_multiplier = int((multiplier + (1 << 15)) >> 16)
+    reduced_shift = shift - 16
+
+    return reduced_multiplier, reduced_shift
+
+
 # Calculate global OFM scale for Average Pooling
 def quantise_pooling_scale(nr_kernel_elements, rescale_bits=0):
     _, k = math.frexp(nr_kernel_elements - 1)
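
Note: reduced_quantise_scale narrows the multiplier from Q31 to Q15: the `+ (1 << 15)` rounds the 16 bits being discarded, and subtracting 16 from the shift compensates, so the represented scale is almost unchanged. A runnable sketch of the round trip, using a frexp-based stand-in for quantise_scale (an assumption inferred from the significand_q31 name in the context above, not the actual vela implementation):

    import math

    def quantise_scale_standin(scale):
        # Stand-in: express scale as a Q31 significand plus a right-shift.
        significand, exponent = math.frexp(scale)  # scale = significand * 2**exponent
        significand_q31 = int(round(significand * (1 << 31)))
        return significand_q31, 31 - exponent

    def reduced_quantise_scale_standin(scale):
        multiplier, shift = quantise_scale_standin(scale)
        reduced_multiplier = int((multiplier + (1 << 15)) >> 16)  # Q31 -> Q15, rounded
        reduced_shift = shift - 16                                # 16 fewer fractional bits
        return reduced_multiplier, reduced_shift

    scale = 0.007843137  # e.g. a typical combined ifm * weight / ofm scale
    m31, s31 = quantise_scale_standin(scale)
    m15, s15 = reduced_quantise_scale_standin(scale)
    print(m31, s31, m31 / 2 ** s31)  # Q31 pair; m31 / 2**s31 ~= scale
    print(m15, s15, m15 / 2 ** s15)  # Q15 pair; m15 / 2**s15 ~= scale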
diff --git a/ethosu/vela/weight_compressor.py b/ethosu/vela/weight_compressor.py
index 0b4ac696..92197248 100644
--- a/ethosu/vela/weight_compressor.py
+++ b/ethosu/vela/weight_compressor.py
@@ -25,7 +25,7 @@ import math
 import numpy as np
 from collections import namedtuple
 from .numeric_util import round_up
-from .scaling import quantise_scale
+from .scaling import quantise_scale, reduced_quantise_scale
 from .tensor import TensorPurpose, TensorSubPurpose, TensorFormat, TensorBlockTraversal
 from .operation import NpuBlockType
 from .architecture_features import Block
@@ -287,7 +287,7 @@ def calc_scales_and_pack_biases(tens, arch, oc_quantum, rescale_for_faf=False):
     if not rescale_for_faf:
         if ifm_dtype == DataType.uint8:
             scales = [np.double(ifm_scale * weight_scale) / np.double(ofm_scale) for weight_scale in weight_scales]
-        elif ifm_dtype == DataType.int8:
+        elif ifm_dtype == DataType.int8 or ifm_dtype == DataType.int16:
             scales = [
                 (np.double(ifm_scale) * np.double(weight_scale)) / np.double(ofm_scale)
                 for weight_scale in weight_scales
@@ -297,13 +297,16 @@ def calc_scales_and_pack_biases(tens, arch, oc_quantum, rescale_for_faf=False):
     else:
         if ifm_dtype == DataType.uint8:
             scales = [np.double(ifm_scale * weight_scale * 0x3000) for weight_scale in weight_scales]
-        elif ifm_dtype == DataType.int8:
+        elif ifm_dtype == DataType.int8 or ifm_dtype == DataType.int16:
             scales = [(np.double(ifm_scale * 0x3000) * np.double(weight_scale)) for weight_scale in weight_scales]
         else:
             assert False, str(ifm_dtype) + " not implemented"
 
     # quantise all of the weight scales into (scale_factor, shift)
-    quantised_scales = [quantise_scale(scale) for scale in scales]
+    if ifm_dtype == DataType.int16:
+        quantised_scales = [reduced_quantise_scale(scale) for scale in scales]
+    else:
+        quantised_scales = [quantise_scale(scale) for scale in scales]
 
     for _, shift in quantised_scales:
         assert shift >= 16
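
Note: taken together, the weight_compressor changes put int16 on the same double-precision rescale path as int8 and then quantise with the reduced-precision helper. A small worked example of the combined per-channel scale from the first hunk, with hypothetical quantisation parameters (the values are illustrative only):

    import numpy as np

    ifm_scale, ofm_scale = 0.05, 0.1  # hypothetical IFM/OFM scales
    weight_scales = [0.002, 0.004]    # hypothetical per-channel weight scales

    # int8/int16 path: (ifm_scale * weight_scale) / ofm_scale, in double precision
    scales = [
        (np.double(ifm_scale) * np.double(w)) / np.double(ofm_scale)
        for w in weight_scales
    ]
    print(scales)  # ~[0.001, 0.002]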