author     Johan Alfvén <johan.alfven@arm.com>    2022-07-16 18:27:05 +0200
committer  Rickard Bolin <rickard.bolin@arm.com>  2022-08-17 07:47:09 +0000
commit     255dad78c62c8924a44f78d91a573c0aa719cbde (patch)
tree       0365d094fc2a19cbf819b30ee6292d0d5124f1a2
parent     8e1352a00dcc0198ae2cd0d8380ef560bd3a847c (diff)
MLBEDSW-6645: MLCE: Optimize SRAM usage
- When compiling for shared SRAM, the old scheduler has an option that
  allows it to produce schedules with a smaller SRAM footprint than the
  new scheduler manages, because it was able to create more/longer
  cascades. In order to improve the new scheduler, the following has
  been implemented:
  - Take persistent IFMs into account when creating the min schedule.
  - Choose longer cascades when doing so reduces total SRAM usage
    compared to using shorter cascades.
  - Updated the calculation of estimated SRAM usage for elementwise ops.

Signed-off-by: Johan Alfven <johan.alfven@arm.com>
Change-Id: I209bbf2d94425e4f6aacb1d151b3b2aa65c0870b
-rw-r--r--  ethosu/vela/cascade_builder.py  94
-rw-r--r--  ethosu/vela/scheduler.py        27
2 files changed, 117 insertions, 4 deletions
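
As background for the elementwise change below: when an elementwise op's
output can safely overwrite one of its inputs, the OFM needs no buffer of
its own. A minimal sketch of that estimate, using made-up sizes and
simplified names rather than the real Vela API:

    # Simplified sketch of the updated SRAM estimate (illustrative names,
    # not the actual Vela API). If the elementwise output can overwrite an
    # input buffer, the OFM contributes nothing extra to the estimate.
    def estimate_sram(ifm_size: int, ifm2_size: int, ofm_size: int,
                      output_can_overwrite_input: bool) -> int:
        if output_can_overwrite_input:
            ofm_size = 0  # OFM reuses the IFM buffer
        return ifm_size + ifm2_size + ofm_size

    # Example: 64 KiB elementwise add whose output reuses the input buffer
    assert estimate_sram(65536, 65536, 65536, True) == 131072  # not 196608
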
diff --git a/ethosu/vela/cascade_builder.py b/ethosu/vela/cascade_builder.py
index 09c36b9e..09ee73e8 100644
--- a/ethosu/vela/cascade_builder.py
+++ b/ethosu/vela/cascade_builder.py
@@ -16,6 +16,8 @@
#
# Description:
# Groups Operators in a schedule together to form Cascades.
+from collections import namedtuple
+
from .numeric_util import round_up
from .operation import NpuBlockType
from .operation import Op
@@ -98,6 +100,46 @@ class CascadeBuilder:
and self.element_wise_cascading_conformity(sched_op)
)
+ def _is_mergeable(self, sched_op) -> bool:
+ # Code based on merge_elementwise_op_ranges from live_range.py
+
+ if not sched_op.op_type.is_elementwise_op():
+ return False
+
+ elem_op = sched_op.parent_op
+
+ # Check if overwriting the inputs can be allowed
+ OpShapeTens = namedtuple("OpShapeTens", ["op_shape", "tens"])
+ outp = OpShapeTens(elem_op.ofm_shapes[0], elem_op.ofm)
+
+ # check output tensor only has one producer
+ if len(outp.tens.ops) != 1:
+ return False
+
+ inps = []
+ if elem_op.ifm is not None:
+ inps.append(OpShapeTens(elem_op.ifm_shapes[0], elem_op.ifm))
+ if elem_op.ifm2 is not None:
+ inps.append(OpShapeTens(elem_op.ifm_shapes[1], elem_op.ifm2))
+
+ # find an input tensor that can be overwritten by the output
+ for inp in inps:
+ if (
+ # check op input and output shapes allow overlapping
+ inp.op_shape == outp.op_shape
+ # check input tensor is valid
+ and inp.tens is not None
+ and inp.tens.shape != []
+ # check input and output tensors are compatible
+ and inp.tens.format == outp.tens.format
+ and inp.tens.dtype == outp.tens.dtype
+ # check input tensor only has one consumer
+ and len(inp.tens.consumer_list) == 1
+ ):
+ return True
+
+ return False
+
def _estimate_sram_usage(self, sched_op, cost) -> int:
"""Estimate the SRAM required for the Op if all FeatureMaps are in SRAM"""
ifm2_size = sched_op.ifm2_size_in_bytes()
@@ -115,6 +157,10 @@ class CascadeBuilder:
cost.stripe.with_depth(round_up(cost.stripe.depth, 16)).elements() * sched_op.ofm.dtype.size_in_bytes()
)
+ if self._is_mergeable(sched_op):
+ # ofm will use the ifm buffer to reduce SRAM usage, hence ofm_size = 0
+ ofm_size = 0
+
return ifm_size + ifm2_size + ofm_size + self.non_local_mem_usage.get(sched_op, 0)
@staticmethod
@@ -245,8 +291,52 @@ class CascadeBuilder:
# Both the existing cascade and current Op fits
break
- # Determine if current cascade is the best so far
- if cascade_size < best_cascade_size:
+ """
+ One of two conditions will update the best cascade:
+
+ - cascade_size < best_cascade_size or
+ - cascade_size < uncascaded_sram_usage
+
+ The second condition is illustrated by the example below, where it is
+ better to choose a larger cascade_size (with more OPs) because it
+ results in lower total SRAM usage.
+
+ For simplicity, all feature maps have the same size.
+
+ Cascade OP1-OP2, OP3 is standalone
+
+ -> |OP1| -> roll buffer -> |OP2| -> FM -> |OP3| -> FM
+ /
+ |OP0| -> FM
+ \
+ -> ....
+
+
+ best_cascade_size : FM + roll buffer + FM
+ uncascaded_sram_usage: FM + FM + FM
+
+ compared with:
+
+ Cascade OP1-OP3
+
+ -> |OP1| -> roll buffer -> |OP2| -> roll buffer -> |OP3| -> FM
+ /
+ |OP0| -> FM
+ \
+ -> ....
+
+
+ cascade_size : FM + roll buffer + roll buffer + FM
+
+
+ So, for this use case the comparison will be
+
+ (FM + roll buffer + roll buffer + FM) < (FM + roll buffer + FM) or
+ (FM + roll buffer + roll buffer + FM) < (FM + FM + FM)
+
+ hence it is better to choose Cascade OP1-OP3 in this case.
+ """
+ if cascade_size < best_cascade_size or cascade_size < uncascaded_sram_usage:
best_cascade_size = cascade_ifm_size + cascade_buffers + op_full_ofm
ops_in_best_cascade = [op for op in ops_in_cascade]
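
To make the new acceptance condition concrete, here is a hypothetical
worked example matching the docstring above, with made-up sizes
(FM = 100, roll buffer = 10):

    FM, ROLL = 100, 10  # made-up feature map and rolling-buffer sizes

    # Previous best: cascade OP1-OP2 with OP3 standalone
    best_cascade_size = FM + ROLL + FM        # 210
    uncascaded_sram_usage = FM + FM + FM      # 300

    # Candidate: the longer cascade OP1-OP3
    cascade_size = FM + ROLL + ROLL + FM      # 220

    # Old rule (cascade_size < best_cascade_size): 220 < 210 -> rejected.
    # New rule also accepts a cascade that beats the uncascaded usage:
    # 220 < 300 -> accepted.
    assert cascade_size < best_cascade_size or cascade_size < uncascaded_sram_usage
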
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
index d01942bb..e9f38b4d 100644
--- a/ethosu/vela/scheduler.py
+++ b/ethosu/vela/scheduler.py
@@ -941,6 +941,29 @@ class Scheduler:
return peak_mem_usage
+ def build_cascades_for_min_schedule(self, min_schedule: Schedule, max_template: Schedule, memory_limit: int):
+ # Update memory snapshot
+ self.sg.schedule = min_schedule
+ self.update_op_memory_snapshot(min_schedule)
+
+ # Calculate residual memory for Min schedule
+ non_local_mem_usage = {}
+ for sched_op in self.sched_ops:
+ time_index = min_schedule.cost_map[sched_op].time_index
+
+ if self.arch.is_spilling_enabled():
+ # For Dedicated SRAM only the intermediate buffers are in SRAM, hence op_mem_usage is 0
+ op_mem_usage = 0
+ else:
+ # Min schedule only has ifm and ofm in SRAM (no buffered weight tensors)
+ op_mem_usage = sched_op.ifm_size_in_bytes() + sched_op.ofm_size_in_bytes()
+
+ non_local_mem_usage[sched_op] = min_schedule.memory_snapshot[time_index] - op_mem_usage
+
+ # Create cascades for Min schedule
+ cascade_builder = CascadeBuilder(self.sched_ops, self.arch.is_spilling_enabled(), non_local_mem_usage)
+ cascade_builder.build_cascades(min_schedule, max_template, memory_limit)
+
def optimize_sub_schedule(
self, cascade_info: CascadeInfo, ref_schedule: Schedule, max_template: Schedule, memory_limit: int
) -> Schedule:
@@ -1545,8 +1568,8 @@ def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_o
if scheduler_options.optimization_strategy == OptimizationStrategy.Size:
initial_sram_limit = scheduler.min_memory_req
- cascade_builder = CascadeBuilder(scheduler.sched_ops, arch.is_spilling_enabled())
- cascade_builder.build_cascades(min_schedule, max_schedule_template, initial_sram_limit)
+ # Build cascades for Min schedule
+ scheduler.build_cascades_for_min_schedule(min_schedule, max_schedule_template, initial_sram_limit)
sg.schedule = min_schedule
scheduler.update_op_memory_snapshot(min_schedule)
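
For illustration, a small sketch (assumed numbers and simplified names,
not the real Scheduler types) of the residual-memory calculation that
build_cascades_for_min_schedule performs: subtracting each op's own
IFM/OFM footprint from the memory snapshot at its time index leaves the
persistent tensors that the cascade builder must budget around.

    # Hypothetical snapshot: peak SRAM per time index, including a
    # persistent tensor of 50 bytes that stays live across both ops.
    memory_snapshot = [250, 180]
    op_mem_usage = {"op_a": 200, "op_b": 130}  # ifm_size + ofm_size per op
    time_index = {"op_a": 0, "op_b": 1}

    non_local_mem_usage = {
        op: memory_snapshot[time_index[op]] - usage
        for op, usage in op_mem_usage.items()
    }
    assert non_local_mem_usage == {"op_a": 50, "op_b": 50}
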