aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohan Alfvén <johan.alfven@arm.com>2022-08-17 14:59:58 +0200
committerJohan Alfvén <johan.alfven@arm.com>2022-10-21 11:07:46 +0200
commit2a285fca30d13f6577ef3e8154aea24713d728a5 (patch)
tree00a37a5eaba8c28ee944bf5772b3e96ae59887c0
parent56a71b0108f43a1cb118b1e2fae902c31b2a9969 (diff)
downloadethos-u-vela-2a285fca30d13f6577ef3e8154aea24713d728a5.tar.gz
MLBEDSW-6840: New stripe algo for optimize sub schedule
- The algorithm for trying out different stripes in order to optimize a sub schedule/cascade has a problem: it can split the initial cascade into several smaller cascades. The problem with this is that it will increase IFM/OFM DRAM bandwidth and performance will drop. - Changed the stripe algorithm to prefer long cascades. Signed-off-by: Johan Alfven <johan.alfven@arm.com> Change-Id: I4f38b381597b7094819e9dd463aa1876e4e6bc62
-rw-r--r--ethosu/vela/scheduler.py33
1 files changed, 21 insertions, 12 deletions
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
index 79cd6421..ec7380a6 100644
--- a/ethosu/vela/scheduler.py
+++ b/ethosu/vela/scheduler.py
@@ -1029,14 +1029,19 @@ class Scheduler:
# Generate the possible stripings for the final Op in the sub-schedule
final_ofm_shape = sub_schedule_ops[-1].ofm.shape
+
+ # Skip testing the min stripe used in the MIN schedule since that will be used
+ # anyway if no new cascades are created below
+ last_op = sub_schedule_ops[-1]
+ min_stripe_h = sub_schedule.cost_map[last_op].stripe.height + 1
+
possible_stripes = [
- final_ofm_shape.with_height(stripe_h) for stripe_h in range(1, final_ofm_shape.height // 2 + 1)
+ final_ofm_shape.with_height(stripe_h) for stripe_h in range(min_stripe_h, final_ofm_shape.height // 2 + 1)
]
- # Propose different striping - the possible stripes are proposed similarly to a binary search
+ # Propose different striping
best_schedule = None
- iteration = 0
- while len(possible_stripes) > 1:
- proposed_stripe = possible_stripes[len(possible_stripes) // 2]
+ max_nbr_of_cascades = 0
+ for iteration, proposed_stripe in enumerate(possible_stripes):
proposed_schedule = self.propose_schedule_striping(
proposed_stripe, f"OPTIMIZED_{iteration}", buffered_sub_schedule
)
@@ -1045,18 +1050,22 @@ class Scheduler:
# Check if proposal fits
proposed_schedule_mem_usage = self.estimate_schedule_memory_usage(proposed_schedule, non_local_mem_usage)
- if (proposed_schedule_mem_usage) <= memory_limit:
- # Remove all possible stripes smaller than this
- possible_stripes = possible_stripes[len(possible_stripes) // 2 :]
+
+ nbr_of_cascades = len(proposed_schedule.cascades)
+
+ if iteration == 0:
+ # First iteration - used as limit to prevent splitting up the cascades
+ # Long cascades are better in order to reduce IFM/OFM DRAM bandwidth
+ max_nbr_of_cascades = nbr_of_cascades
+
+ if (proposed_schedule_mem_usage) <= memory_limit and nbr_of_cascades <= max_nbr_of_cascades:
best_schedule = proposed_schedule
+
if not proposed_schedule.cascades:
# No cascading required - early exit
break
else:
- # Proposal doesn't fit within the limit - remove all possible stripes larger than this
- possible_stripes = possible_stripes[: len(possible_stripes) // 2]
-
- iteration += 1
+ break
return best_schedule