From 2a285fca30d13f6577ef3e8154aea24713d728a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johan=20Alfv=C3=A9n?= Date: Wed, 17 Aug 2022 14:59:58 +0200 Subject: MLBEDSW-6840: New stripe algo for optimize sub schedule - The algorithm for trying out different stripes in order to optimize a sub schedule/cascade has a problem that it can split the initial cascade into several smaller cascades. The problem with this is that it will increase IFM/OFM DRAM bandwidth and performance will drop. - Changed the stripe algorithm to prefer long cascades. Signed-off-by: Johan Alfven Change-Id: I4f38b381597b7094819e9dd463aa1876e4e6bc62 --- ethosu/vela/scheduler.py | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) (limited to 'ethosu') diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py index 79cd6421..ec7380a6 100644 --- a/ethosu/vela/scheduler.py +++ b/ethosu/vela/scheduler.py @@ -1029,14 +1029,19 @@ class Scheduler: # Generate the possible stripings for the final Op in the sub-schedule final_ofm_shape = sub_schedule_ops[-1].ofm.shape + + # Skip testing the min stripe used in the MIN schedule since that will be used + # anyway if no new cascades are created below + last_op = sub_schedule_ops[-1] + min_stripe_h = sub_schedule.cost_map[last_op].stripe.height + 1 + possible_stripes = [ - final_ofm_shape.with_height(stripe_h) for stripe_h in range(1, final_ofm_shape.height // 2 + 1) + final_ofm_shape.with_height(stripe_h) for stripe_h in range(min_stripe_h, final_ofm_shape.height // 2 + 1) ] - # Propose different striping - the possible stripes are proposed similarly to a binary search + # Propose different striping best_schedule = None - iteration = 0 - while len(possible_stripes) > 1: - proposed_stripe = possible_stripes[len(possible_stripes) // 2] + max_nbr_of_cascades = 0 + for iteration, proposed_stripe in enumerate(possible_stripes): proposed_schedule = self.propose_schedule_striping( proposed_stripe, f"OPTIMIZED_{iteration}", 
buffered_sub_schedule ) @@ -1045,18 +1050,22 @@ class Scheduler: # Check if proposal fits proposed_schedule_mem_usage = self.estimate_schedule_memory_usage(proposed_schedule, non_local_mem_usage) - if (proposed_schedule_mem_usage) <= memory_limit: - # Remove all possible stripes smaller than this - possible_stripes = possible_stripes[len(possible_stripes) // 2 :] + + nbr_of_cascades = len(proposed_schedule.cascades) + + if iteration == 0: + # First iteration - used as limit to prevent splitting up the cascades + # Long cascades are better in order to reduce IFM/IFM dram bandwidth + max_nbr_of_cascades = nbr_of_cascades + + if (proposed_schedule_mem_usage) <= memory_limit and nbr_of_cascades <= max_nbr_of_cascades: best_schedule = proposed_schedule + if not proposed_schedule.cascades: # No cascading required - early exit break else: - # Proposal doesn't fit within the limit - remove all possible stripes larger than this - possible_stripes = possible_stripes[: len(possible_stripes) // 2] - - iteration += 1 + break return best_schedule -- cgit v1.2.1