From 897cc14968e017b1f48f376f7f7cefc515c5fe88 Mon Sep 17 00:00:00 2001
From: Andreas Nevalainen
Date: Wed, 28 Oct 2020 15:42:08 +0100
Subject: MLBEDSW-3222: Bias tensors in fast storage

For IFM streamed cascades, bias tensors are read several times.
Move these tensors to fast storage and add DMA commands.

Change-Id: I630f6275986c1b5e3f126c925b11e22500fb1128
Signed-off-by: Andreas Nevalainen
---
 OPTIONS.md                                          |  9 +++++
 ethosu/vela/high_level_command_stream.py            |  6 +++
 ethosu/vela/high_level_command_stream_generator.py  |  5 +++
 ethosu/vela/npu_serialisation.py                    |  5 ++-
 ethosu/vela/register_command_stream_generator.py    |  7 +++-
 ethosu/vela/scheduler.py                            | 45 ++++++++++++++++++++++
 ethosu/vela/tensor.py                               |  9 +++--
 ethosu/vela/vela.py                                 |  4 ++
 ethosu/vela/weight_compressor.py                    |  8 +++-
 9 files changed, 90 insertions(+), 8 deletions(-)

diff --git a/OPTIONS.md b/OPTIONS.md
index a7b513c..9220151 100644
--- a/OPTIONS.md
+++ b/OPTIONS.md
@@ -72,6 +72,15 @@ system's SRAM this optimisation is required.
 vela network.tflite --cascading False
 ```
 
+### Keep scale placement
+
+Prevents the scheduler from placing scale tensors for IFM streamed passes in
+SRAM and keeps them in flash.
+
+```bash
+vela network.tflite --keep-scale-placement
+```
+
 ### Force Block Config
 
 Force a specific block configuration in the format HxWxC, where H, W, and C are
diff --git a/ethosu/vela/high_level_command_stream.py b/ethosu/vela/high_level_command_stream.py
index b8a19f5..a5372d7 100644
--- a/ethosu/vela/high_level_command_stream.py
+++ b/ethosu/vela/high_level_command_stream.py
@@ -21,6 +21,7 @@ import numpy as np
 
 from .numeric_util import round_up_divide
 from .operation import NpuBlockType
+from .operation import Op
 from .range_set import AccessDirection
 from .range_set import MemoryAccessSet
 from .range_set import MemoryRangeSet
@@ -236,6 +237,11 @@ class NpuStripe(Command):
                 ),
                 AccessDirection.Read,
             )
+        if self.scale_tensor is not None and self.scale_tensor.ops[0].type == Op.DMA:
+            res.add(
+                self.scale_tensor.get_address_ranges_for_coordinates([0], self.scale_tensor.shape),
+                AccessDirection.Read,
+            )
         # Add read access to SHRAM by any LUT-s
         for tens in self.ps.intermediates:
             if tens.purpose == TensorPurpose.LUT and tens.mem_area == MemArea.Shram:
diff --git a/ethosu/vela/high_level_command_stream_generator.py b/ethosu/vela/high_level_command_stream_generator.py
index 01fab0e..871a048 100644
--- a/ethosu/vela/high_level_command_stream_generator.py
+++ b/ethosu/vela/high_level_command_stream_generator.py
@@ -238,6 +238,7 @@ def generate_high_level_command_stream_for_pass(strat, passes, block_configs, id
             y_step = y_dim
 
         weight_box = None
+        scale_box = None
 
         for start in range(y_start, y_dim, y_step):
             end = min(start + y_step, y_dim)
@@ -299,6 +300,10 @@ def generate_high_level_command_stream_for_pass(strat, passes, block_configs, id
                 if ifm_y_present >= ifm_y_needed:
                     break
 
+            if scale_tensor is not None and scale_tensor.purpose == TensorPurpose.FSBias and scale_box is None:
+                scale_box = Box([0] * len(scale_tensor.shape), list(scale_tensor.shape))
+                yield from dma_if_necessary(ps, scale_box, scale_tensor)
+
             if weight_tensor is not None and weight_box is None:
                 weight_box = Box.make_weight_box(
                     weight_tensor.shape, npu_block_type, weights_transposed=weight_tensor.weight_transpose_depthwise
diff --git a/ethosu/vela/npu_serialisation.py b/ethosu/vela/npu_serialisation.py
index 0bd0300..04534cc 100644
--- a/ethosu/vela/npu_serialisation.py
+++ b/ethosu/vela/npu_serialisation.py
@@ -128,7 +128,10 @@ def serialise_npu_subgraph_into_tensors(nng, sg, arch, scratch_tens, scratch_fas
                     else:
                         copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.weight_tensor)
 
-                    copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.scale_tensor)
+                    if ps.scale_tensor.ops[0].type == Op.DMA:
+                        copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.scale_tensor.ops[0].inputs[0])
+                    else:
+                        copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.scale_tensor)
 
                 if ps.lut_tensor is not None:
                     copy_ifm_values_to_memory_tensor(sg.flash_tensor, ps.lut_tensor)
diff --git a/ethosu/vela/register_command_stream_generator.py b/ethosu/vela/register_command_stream_generator.py
index 0abd882..e5e4fb1 100644
--- a/ethosu/vela/register_command_stream_generator.py
+++ b/ethosu/vela/register_command_stream_generator.py
@@ -388,8 +388,11 @@ def generate_register_command_stream(nng, sg, arch, verbose=False):
             dst_addr = cmd.out_tensor.address_for_coordinate(start_coord)
 
             if cmd.in_tensor.compressed_values is not None:
-                stream_index = cmd.in_tensor.compressed_stream_index_from_coord(start_coord)
-                sz = cmd.in_tensor.size_of_compressed_stream(stream_index)
+                if cmd.out_tensor.purpose == TensorPurpose.FSBias:
+                    sz = cmd.in_tensor.storage_size()
+                else:
+                    stream_index = cmd.in_tensor.compressed_stream_index_from_coord(start_coord)
+                    sz = cmd.in_tensor.size_of_compressed_stream(stream_index)
             else:
                 sz = cmd.in_tensor.address_for_coordinate(cmd.box.end_coord, is_top_box=True) - src_addr
 
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
index 56f4aaa..526cc0e 100644
--- a/ethosu/vela/scheduler.py
+++ b/ethosu/vela/scheduler.py
@@ -38,6 +38,7 @@ from .npu_performance import PassCycles
 from .numeric_util import full_shape
 from .operation import NpuBlockType
 from .operation import Op
+from .operation import Operation
 from .shared_buffer_allocation import find_block_configs_suitable_for_pass_and_shared_buffer
 from .shared_buffer_allocation import shared_buffer_allocation_for_pass_and_block_config
 from .tensor import MemArea
@@ -64,6 +65,7 @@ class SchedulerOptions:
         use_ifm_streaming=True,
         pareto_metric=ParetoMetric.BwCycMem,
         use_nhcwb16_between_cascaded_passes=True,
+        keep_scale_placement=False,
     ):
         self.use_cascading = use_cascading
         self.verbose_schedule = verbose_schedule
@@ -71,6 +73,7 @@ class SchedulerOptions:
         self.use_ifm_streaming = use_ifm_streaming
         self.pareto_metric = pareto_metric
         self.use_nhcwb16_between_cascaded_passes = use_nhcwb16_between_cascaded_passes
+        self.keep_scale_placement = keep_scale_placement
 
     def __str__(self):
         return type(self).__name__ + ": " + str(self.__dict__)
@@ -1022,6 +1025,45 @@ class DynamicProgrammingScheduler:
             # in use_fast_storage_for_feature_maps
             self.sg.scheduling_info["feature_map_rewrites"] = fast_storage_tensor_rewrites
 
+    def move_scales_to_fast_storage(self, sg, arch):
+        # IFM streamed ops read bias tensors several times; move these tensors to fast storage
+        for cp in sg.cascaded_passes:
+            if cp.strategy == SchedulingStrategy.IfmStream:
+                for ps in cp.passes:
+                    if ps.scale_tensor and (cp.sram_used + ps.scale_tensor.storage_size()) <= self.sram_limit:
+                        tens = ps.scale_tensor
+
+                        # Find op using scale tensor
+                        op = next((op for op in ps.ops if tens in op.inputs), None)
+                        assert op
+
+                        # Create fast storage tensor
+                        new_tens = tens.clone_into_fast_storage(arch)
+                        new_tens.consumer_list = tens.consumer_list.copy()
+                        new_tens.purpose = TensorPurpose.FSBias
+
+                        # Create DMA cmd
+                        dma_cmd = Operation(Op.DMA, tens.ops[0].name + "_dma")
+                        dma_cmd.inputs = [tens]
+                        dma_cmd.set_output_tensor(new_tens)
+                        dma_cmd.attrs["source"] = tens.mem_area
+                        dma_cmd.attrs["destination"] = new_tens.mem_area
+                        dma_cmd.run_on_npu = True
+
+                        tens.consumer_list.clear()
+                        tens.consumer_list.append(dma_cmd)
+
+                        # Replace tensor and op
+                        idx = op.inputs.index(tens)
+                        op.inputs[idx] = new_tens
+
+                        ps.ops.insert(0, dma_cmd)
+                        ps.scale_tensor = new_tens
+                        ps.intermediates.append(new_tens)
+                        ps.cascade.intermediates.append(new_tens)
+
+                        cp.sram_used += tens.storage_size()
 
 
 def schedule_passes(nng, arch, options: SchedulerOptions):
@@ -1041,6 +1083,9 @@ def schedule_passes(nng, arch, options: SchedulerOptions):
 
     dps.apply_result(strat_set, arch)
 
+    if not options.keep_scale_placement:
+        dps.move_scales_to_fast_storage(sg, arch)
+
     if options.verbose_schedule:
         sg.print_cascaded_passes()
 
diff --git a/ethosu/vela/tensor.py b/ethosu/vela/tensor.py
index 49f93cd..45518b4 100644
--- a/ethosu/vela/tensor.py
+++ b/ethosu/vela/tensor.py
@@ -81,16 +81,17 @@ class TensorPurpose(enum.IntFlag):
     FeatureMap = 2
     Scratch = 3
     LUT = 4
-    Size = 5
+    FSBias = 5
+    Size = 6
 
     def display_name(self):
-        return ("Unknown", "Weights", "FeatureMap", "Scratch", "LUT", "Size")[self.value]
+        return ("Unknown", "Weights", "FeatureMap", "Scratch", "LUT", "FastStorageBias", "Size")[self.value]
 
     def identifier_name(self):
-        return ("unknown", "weights", "feature_map", "scratch", "lut", "size")[self.value]
+        return ("unknown", "weights", "feature_map", "scratch", "lut", "fast_storage_bias", "size")[self.value]
 
     def all():
-        return (TensorPurpose.Weights, TensorPurpose.FeatureMap)
+        return (TensorPurpose.Weights, TensorPurpose.FeatureMap, TensorPurpose.FSBias)
 
 
 class TensorSubPurpose(enum.Enum):
diff --git a/ethosu/vela/vela.py b/ethosu/vela/vela.py
index 6d54187..4b43751 100644
--- a/ethosu/vela/vela.py
+++ b/ethosu/vela/vela.py
@@ -151,6 +151,9 @@ def main(args=None):
     parser.add_argument(
         "--show-cpu-operations", action="store_true", help="Show the operations that fall back to the CPU"
     )
+    parser.add_argument(
+        "--keep-scale-placement", action="store_true", help="Keep scale tensors' memory placement during scheduling"
+    )
     parser.add_argument(
         "--cascading",
         type=ast.literal_eval,
@@ -311,6 +314,7 @@ def main(args=None):
         use_ifm_streaming=args.ifm_streaming,
         pareto_metric=args.pareto_metric,
         use_nhcwb16_between_cascaded_passes=args.nhcwb16_between_cascaded_passes,
+        keep_scale_placement=args.keep_scale_placement,
     )
 
     model_reader_options = model_reader.ModelReaderOptions()
diff --git a/ethosu/vela/weight_compressor.py b/ethosu/vela/weight_compressor.py
index 9453521..b0187b6 100644
--- a/ethosu/vela/weight_compressor.py
+++ b/ethosu/vela/weight_compressor.py
@@ -404,7 +404,7 @@ def compress_weights(arch, nng, tens, npu_block_type, ofm_block_depth, ofm_depth
 
 
 def calc_scales_and_pack_biases(tens, arch, ofm_depth_step, rescale_for_faf=False):
-    assert tens.purpose == TensorPurpose.FeatureMap
+    assert tens.purpose in [TensorPurpose.FeatureMap, TensorPurpose.FSBias]
     assert tens.format == TensorFormat.NHWC
     # the connected operator should expect a bias input unless it is a FullyConnected
     assert tens.consumer_list[0].type.needs_bias()
@@ -531,3 +531,9 @@ def update_pass_weight_and_scale_tensors(nng, arch):
             if (ps.ops[-1].type in activation_ops) and (ps.npu_block_type != NpuBlockType.ElementWise):
                 rescale_for_faf = True
             calc_scales_and_pack_biases(ps.scale_tensor, arch, ofm_depth_step, rescale_for_faf)
+            if ps.scale_tensor.ops[0].type == Op.DMA:
+                src_tens = ps.scale_tensor.get_dma_src_tensor()
+                src_tens.shape = ps.scale_tensor.shape
+                src_tens.quant_values = ps.scale_tensor.quant_values
+                src_tens.element_size_bytes = ps.scale_tensor.element_size_bytes
+                src_tens.copy_compressed_weight_info(ps.scale_tensor)
-- 
cgit v1.2.1
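
The core of this change is the graph rewrite in `move_scales_to_fast_storage`: for each IFM streamed cascaded pass whose bias (scale) tensor still fits in the remaining SRAM budget, the scheduler clones the tensor into fast storage, inserts a DMA op that copies the flash-resident original into the clone, and re-points the consuming op at the clone. The sketch below illustrates that pattern in isolation; `Tensor`, `Operation` and `move_to_fast_storage` are simplified stand-ins for illustration, not the Vela classes touched by this patch.

```python
# Minimal sketch of the "clone into fast storage + insert DMA" rewrite.
# These classes are illustrative stand-ins, not the Vela API.
from dataclasses import dataclass, field
from typing import List


@dataclass(eq=False)  # graph nodes compare by identity
class Tensor:
    name: str
    mem_area: str  # e.g. "Flash" or "Sram"
    ops: List["Operation"] = field(default_factory=list)  # producing ops
    consumer_list: List["Operation"] = field(default_factory=list)


@dataclass(eq=False)
class Operation:
    type: str
    name: str
    inputs: List[Tensor] = field(default_factory=list)
    outputs: List[Tensor] = field(default_factory=list)


def move_to_fast_storage(scale_tens: Tensor, consumer: Operation) -> Tensor:
    # Clone the flash-resident tensor into SRAM and create the DMA op that fills the clone.
    sram_tens = Tensor(scale_tens.name + "_sram", "Sram")
    dma = Operation("DMA", scale_tens.name + "_dma", inputs=[scale_tens], outputs=[sram_tens])
    sram_tens.ops.append(dma)

    # Re-route: the flash tensor now feeds only the DMA, and the consuming op
    # reads the SRAM copy instead.
    scale_tens.consumer_list[:] = [dma]
    sram_tens.consumer_list.append(consumer)
    consumer.inputs[consumer.inputs.index(scale_tens)] = sram_tens
    return sram_tens


# Usage: a convolution-style op whose bias/scale tensor lives in flash.
bias = Tensor("conv1_bias", "Flash")
conv = Operation("Conv2D", "conv1", inputs=[bias])
bias.consumer_list.append(conv)

new_bias = move_to_fast_storage(bias, conv)
assert conv.inputs == [new_bias]
assert new_bias.mem_area == "Sram" and new_bias.ops[0].type == "DMA"
```

In the patch itself the SRAM copy is charged against the cascaded pass's budget (`cp.sram_used += tens.storage_size()`), which is why the rewrite is only applied when the bias tensor fits under `self.sram_limit`, and why `--keep-scale-placement` is provided as an opt-out.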