From 17e53b5d776109e1bd1073c657ff0453ccf3c09e Mon Sep 17 00:00:00 2001
From: Rickard Bolin <rickard.bolin@arm.com>
Date: Tue, 6 Sep 2022 16:09:01 +0000
Subject: MLBEDSW-6927: Add ofm_stride_multiplier attribute to operation

Allow sparse writing of the OFM by multiplying the C/H/W strides of the
OFM with the corresponding values of ofm_stride_multiplier.

Signed-off-by: Rickard Bolin <rickard.bolin@arm.com>
Change-Id: I65d742ad36ad3154e9914cdd22e2da928ad1f095
---
 ethosu/vela/high_level_command_to_npu_op.py |  31 ++++++--
 ethosu/vela/operation.py                    |   5 ++
 ethosu/vela/tensor.py                       | 119 +++++++++++++++-------------
 3 files changed, 94 insertions(+), 61 deletions(-)

(limited to 'ethosu/vela')

diff --git a/ethosu/vela/high_level_command_to_npu_op.py b/ethosu/vela/high_level_command_to_npu_op.py
index 18919431..6246b37e 100644
--- a/ethosu/vela/high_level_command_to_npu_op.py
+++ b/ethosu/vela/high_level_command_to_npu_op.py
@@ -347,7 +347,13 @@ def get_ofm_quantization(ps, tens: Tensor) -> Optional[NpuQuantization]:
     return NpuQuantization(scale_f32=ofm_quant.scale_f32, zero_point=zero_point)
 
 
-def create_feature_map(tens: Tensor, box: Box, arch: ArchitectureFeatures, op_shape4D: Shape4D) -> NpuFeatureMap:
+def create_feature_map(
+    tens: Tensor,
+    box: Box,
+    arch: ArchitectureFeatures,
+    op_shape4D: Shape4D,
+    stride_multiplier: Optional[List[int]] = None,
+) -> NpuFeatureMap:
     """Creates feature map with common fields populated"""
     fm = NpuFeatureMap()
     fm.region = get_region(tens.mem_type, arch)
@@ -358,13 +364,25 @@ def create_feature_map(tens: Tensor, box: Box, arch: ArchitectureFeatures, op_sh
         fm.layout = NpuLayout.NHCWB16
     else:
         assert 0, "Incorrect tensor format"
+
+    strides = tens.get_strides(op_shape4D)
+    assert strides is not None
+
+    if stride_multiplier and stride_multiplier != [1, 1, 1]:
+        assert (
+            tens.format == TensorFormat.NHWC
+        ), "Only default stride multiplier ([1, 1, 1]) supported for NHCWB16 format"
+        # Multiply strides for C/H/W (in that order) with corresponding stride factor
+        for i, stride_factor in enumerate(stride_multiplier, start=1):
+            strides[i] *= stride_factor
+
     height_0, height_1, width_0, addresses = tens.addresses_for_rolling_buffer(
-        box.start_coord, box.end_coord, op_shape4D
+        box.start_coord, box.end_coord, strides, op_shape4D
     )
+
     fm.tiles = NpuTileBox(
         height_0=height_0, height_1=height_1, width_0=width_0, addresses=[int(addr) for addr in addresses]
     )
-    strides = tens.get_strides(shape4D=op_shape4D)
     fm.strides = NpuShape3D(height=int(strides[2]), width=int(strides[3]), depth=int(strides[1]))
     fm.name = tens.name
     return fm
@@ -462,7 +480,7 @@ def set_common_op_fields(npu_op: NpuBlockOperation, cmd: NpuStripe, arch: Archit
     npu_op.ifm.quantization = get_ifm_or_ifm2_quantization(ps, cmd.ifm_tensor)
 
     out_block = cmd.ofm_box.get_block()
-    npu_op.ofm = create_feature_map(cmd.ofm_tensor, cmd.ofm_box, arch, ps.ofm_shapes[0])
+    npu_op.ofm = create_feature_map(cmd.ofm_tensor, cmd.ofm_box, arch, ps.ofm_shapes[0], op.ofm_stride_multiplier)
     npu_op.ofm.shape = NpuShape3D(height=out_block.height, width=out_block.width, depth=out_block.depth)
     npu_op.ofm.quantization = get_ofm_quantization(ps, cmd.ofm_tensor)
 
@@ -595,9 +613,8 @@ def create_dma_op(cmd: DMA, arch: ArchitectureFeatures) -> NpuDmaOperation:
                     src_addr = cmd.in_tensor.address + weight_range.offset
                     dest_addr = cmd.out_tensor.address
     else:
-        start_coord = cmd.box.start_coord
-        src_addr = cmd.in_tensor.address_for_coordinate(start_coord)
-        dest_addr = cmd.out_tensor.address_for_coordinate(start_coord)
+        src_addr = cmd.in_tensor.address_for_coordinate(cmd.box.start_coord)
+        dest_addr = cmd.out_tensor.address_for_coordinate(cmd.box.start_coord)
         sz = cmd.in_tensor.address_for_coordinate(cmd.box.end_coord, is_top_box=True) - src_addr
     src = NpuAddressRange(src_region, int(src_addr), int(sz))
     dest = NpuAddressRange(dest_region, int(dest_addr), int(sz))
diff --git a/ethosu/vela/operation.py b/ethosu/vela/operation.py
index de68b1d7..e1622049 100644
--- a/ethosu/vela/operation.py
+++ b/ethosu/vela/operation.py
@@ -501,6 +501,7 @@ class Operation:
         "write_offset",
         "write_shape",
         "ifm_resampling_mode",
+        "ofm_stride_multiplier",
     )
 
     def __init__(self, op_type: Op, name: str):
@@ -545,6 +546,9 @@ class Operation:
         # write_offset 0,9,0,0, write_shape 1,1,8,1
         self.write_shape: Optional[Shape4D] = None
         self.ifm_resampling_mode: resampling_mode = resampling_mode.NONE
+        # For interleaved/sparse outputs - stride is multiplied with the stride factor of the corresponding axis
+        # Order is [C, H, W] - default is no multiplication
+        self.ofm_stride_multiplier: List[int] = [1, 1, 1]
 
     def clone(self, suffix="_clone"):
         res = Operation(self.type, self.name + suffix)
@@ -568,6 +572,7 @@ class Operation:
         res.low_precision_scaling = self.low_precision_scaling
         res.rescale = self.rescale
         res.ifm_resampling_mode = self.ifm_resampling_mode
+        res.ofm_stride_multiplier = self.ofm_stride_multiplier.copy()
 
         return res
 
diff --git a/ethosu/vela/tensor.py b/ethosu/vela/tensor.py
index 65473b8d..99970317 100644
--- a/ethosu/vela/tensor.py
+++ b/ethosu/vela/tensor.py
@@ -592,7 +592,9 @@ class Tensor:
         rounding_quantum = full_shape(4, list(self.storage_rounding_quantum), 1)
         return Shape4D(shape_round_to_quantum(op_shape4D.as_list(), rounding_quantum))
 
-    def addresses_for_rolling_buffer(self, start_coord: Shape, end_coord: Shape, op_shape4D: Shape4D) -> Tuple:
+    def addresses_for_rolling_buffer(
+        self, start_coord: Shape, end_coord: Shape, strides: List[int], op_shape4D: Shape4D
+    ) -> Tuple:
         # returns ( box_height0, box_height1, box_width, [address_tl, address_tr, address_bl, address_br] )
 
         if self.storage_shape == []:
@@ -600,7 +602,7 @@ class Tensor:
                 1,
                 1,
                 1,
-                [self.address_for_coordinate(start_coord, op_shape4D=op_shape4D), 0, 0, 0],
+                [self.address_for_coordinate(start_coord, strides, op_shape4D), 0, 0, 0],
             )
 
         if self.is_standard_fm:
@@ -618,89 +620,90 @@ class Tensor:
         box_width = crossing_x - start_coord[2]
 
         addresses: List = [0] * 4
-        addresses[0] = self.address_for_coordinate(start_coord, op_shape4D=op_shape4D)
+        addresses[0] = self.address_for_coordinate(start_coord, strides, op_shape4D)
 
         if end_coord[2] > crossing_x:
             addresses[1] = self.address_for_coordinate(
-                [start_coord[0], start_coord[1], crossing_x, start_coord[3]], op_shape4D=op_shape4D
+                [start_coord[0], start_coord[1], crossing_x, start_coord[3]], strides, op_shape4D
             )
             raise UnsupportedFeatureError("Striping in vertical direction is not supported")
         if end_coord[1] > crossing_y:
             addresses[2] = self.address_for_coordinate(
-                [start_coord[0], crossing_y, start_coord[2], start_coord[3]], op_shape4D=op_shape4D
+                [start_coord[0], crossing_y, start_coord[2], start_coord[3]], strides, op_shape4D
             )
         if end_coord[1] > crossing_y and end_coord[2] > crossing_x:
             addresses[3] = self.address_for_coordinate(
-                [start_coord[0], crossing_y, crossing_x, start_coord[3]], op_shape4D=op_shape4D
+                [start_coord[0], crossing_y, crossing_x, start_coord[3]], strides, op_shape4D
             )
 
         return box_height0, box_height0, box_width, addresses
 
-    def address_for_coordinate(self, coord: Shape, is_top_box: bool = False, op_shape4D: Shape4D = None) -> int:
-        offset = self.address_offset_for_coordinate(coord, op_shape4D=op_shape4D, is_top_box=is_top_box)
-        assert offset is not None
-        return self.address + offset
+    def get_strides(self, shape4D: Optional[Shape4D]) -> List[int]:
 
-    def get_strides_and_coord(
-        self, coord: Optional[Shape] = None, shape4D: Optional[Shape4D] = None
-    ) -> Tuple[Optional[Shape], Optional[Shape]]:
-        if coord is None:
-            coord = [0] * min(len(self.storage_shape), 4)
+        augmented_shape = self.get_augmented_shape(shape4D)
+        assert len(augmented_shape) == 5
+        strides: List = [0] * len(augmented_shape)
+        stride = self.element_size() * self.storage_compression_scale
+
+        if self.format != TensorFormat.NHCWB16:
+            stride_order = [4, 1, 3, 2, 0]
+            for i in stride_order:
+                strides[i] = stride
+                stride *= augmented_shape[i]
+        else:
+            assert len(strides) == 5
+            strides[4] = stride
+            strides[3] = 16 * stride  # STRIDE_X
+            strides[1] = strides[3] * augmented_shape[2]  # STRIDE_C
+            strides[2] = augmented_shape[2] * augmented_shape[3] * stride  # STRIDE_Y
+            strides[0] = strides[2] * augmented_shape[1]  # STRIDE_N
+
+        return strides
+
+    def get_augmented_shape(self, shape4D: Optional[Shape4D] = None) -> Optional[Shape]:
 
         if shape4D and self.is_standard_fm:
             augmented_shape = self.get_4D_storage_shape_for_shape(shape4D).as_list()
         else:
             augmented_shape = full_shape(4, self.storage_shape, 1)
 
-        augmented_coord = coord
+        if self.format == TensorFormat.NHWC:
+            augmented_shape = [augmented_shape[0], augmented_shape[3]] + augmented_shape[1:3] + [1]
+
+        elif self.format == TensorFormat.NHCWB16:
+            augmented_shape = augmented_shape[0:4] + [1]
+
+            if augmented_shape[1] == 0:
+                augmented_shape[1] = 1
 
-        while len(augmented_coord) < 4:
-            augmented_coord = [0] + augmented_coord
+        else:
+            assert self.format in (TensorFormat.Unknown, TensorFormat.WeightsCompressed)
+            return None
 
-        assert len(augmented_coord) == len(augmented_shape)
+        return augmented_shape
+
+    def get_augmented_coord(self, coord: Optional[Shape] = None) -> Optional[Shape]:
+        if coord is None:
+            coord = [0] * min(len(self.storage_shape), 4)
+
+        missing_len = 4 - len(coord)
+        augmented_coord = ([0] * missing_len) + coord
 
         if self.format == TensorFormat.NHWC:
-            augmented_shape = [augmented_shape[0], augmented_shape[3]] + augmented_shape[1:3] + [1]
             augmented_coord = [augmented_coord[0], augmented_coord[3]] + augmented_coord[1:3] + [0]
 
         elif self.format == TensorFormat.NHCWB16:
             channel_divisor = 16
-            augmented_shape = augmented_shape[0:4] + [1]
             augmented_coord = (
                 [augmented_coord[0], augmented_coord[3] // channel_divisor]
                 + augmented_coord[1:3]
                 + [augmented_coord[3] % channel_divisor]
             )
-
-            if augmented_shape[1] == 0:
-                augmented_shape[1] = 1
-
         else:
             assert self.format in (TensorFormat.Unknown, TensorFormat.WeightsCompressed)
-            return None, None
-
-        strides: List = [0] * len(augmented_shape)
-        stride = self.element_size() * self.storage_compression_scale
-
-        if self.format != TensorFormat.NHCWB16:
-            stride_order = [4, 1, 3, 2, 0]
-            for i in stride_order:
-                strides[i] = stride
-                stride *= augmented_shape[i]
-        else:
-            assert len(strides) == 5
-            strides[4] = stride
-            strides[3] = 16 * stride  # STRIDE_X
-            strides[1] = strides[3] * augmented_shape[2]  # STRIDE_C
-            strides[2] = augmented_shape[2] * augmented_shape[3] * stride  # STRIDE_Y
-            strides[0] = strides[2] * augmented_shape[1]  # STRIDE_N
-
-        return strides, augmented_coord
+            return None
 
-    def get_strides(self, shape4D: Optional[Shape4D] = None) -> Shape:
-        strides, _ = self.get_strides_and_coord(shape4D=shape4D)
-        assert strides is not None
-        return strides
+        return augmented_coord
 
     def find_npu_op(self) -> Optional[Operation]:
         # Returns the NPU operator that uses this tensor
@@ -743,8 +746,12 @@ class Tensor:
         assert 0 <= index < len(self.compressed_values)
         return index == len(self.compressed_values) - 1
 
-    def address_offset_for_coordinate(
-        self, orig_coord: Shape, op_shape4D: Optional[Shape4D] = None, is_top_box: bool = False
+    def address_for_coordinate(
+        self,
+        orig_coord: Shape,
+        strides: Optional[List[int]] = None,
+        op_shape4D: Optional[Shape4D] = None,
+        is_top_box: bool = False,
     ) -> Optional[int]:
         address_offset = 0
         assert self.purpose != TensorPurpose.Weights
@@ -771,18 +778,22 @@ class Tensor:
         # handle wraparound for partial buffers. make sure to do this after subtracting top box:
         coord = [c % storage_shape[idx] for idx, c in enumerate(coord)]
 
-        strides, augmented_coord = self.get_strides_and_coord(coord, op_shape4D)
-        if strides is None:
-            return None
+        # Strides may be passed as an argument, for example when creating feature maps as the strides may be modified
+        # by the "ofm_stride_multiplier" operation attribute. If not, they are calculated here.
+        if not strides:
+            strides = self.get_strides(op_shape4D)
 
         if is_top_box:
             address_offset += 1 * strides[-1]  # one element
 
+        augmented_coord = self.get_augmented_coord(coord)
+        assert augmented_coord is not None
+
         address_offset += np.dot(augmented_coord, strides)
 
         assert address_offset >= 0
         assert address_offset <= storage_size
-        return address_offset
+        return self.address + address_offset
 
     def is_allocated_in_tensor_arena(self, scratch_tensor_mem_area: MemArea) -> bool:
         return (self.mem_area == scratch_tensor_mem_area) and (self.mem_type in (MemType.Scratch, MemType.Scratch_fast))
-- 
cgit v1.2.1