From 789e6f3acd1a377dfba80aa18d513579fd33fc93 Mon Sep 17 00:00:00 2001
From: Tim Hall
Date: Thu, 17 Jun 2021 17:02:31 +0100
Subject: vela: Improve block configuration and weight buffering algorithm

 - Update block config selection to take into account partial IFM fetches
   at the edge of non-whole OFM block data.
 - Change to scheduler depth slicing for the networks in MLBEDSW-4637 for
   improved buffering. This helps general performance by buffering larger
   depth slices.
 - Bug fix for opt_max_schedule always being fitted to SRAM, which
   prevented the optimisation step from running in some cases.

Signed-off-by: Tim Hall
Change-Id: I97642c5adec3bb684b1daabf2b81574c27d4eef2
---
 ethosu/vela/architecture_allocator.py | 34 +++++++++++--------------
 ethosu/vela/npu_performance.py        |  1 +
 ethosu/vela/scheduler.py              | 47 +++++++++++++++++++++++------------
 3 files changed, 47 insertions(+), 35 deletions(-)

(limited to 'ethosu/vela')

diff --git a/ethosu/vela/architecture_allocator.py b/ethosu/vela/architecture_allocator.py
index c308a4ae..e43b841d 100644
--- a/ethosu/vela/architecture_allocator.py
+++ b/ethosu/vela/architecture_allocator.py
@@ -279,25 +279,21 @@ def find_block_config(
         )
 
         if layout:
-            # Calculate cost in terms of OFM pixels per IFM+Weights fetch
-            ifm_fetch = ifm_block.elements_wh() * ifm_shape.depth
-            weight_fetch = weight_fetch_wh * ifm_shape.depth * (1 if is_depthwise else ofm_block.depth)
-            relative_fetch = (ifm_fetch * ifm_repeats + weight_fetch) / ofm_block.elements()
-
-            # Bias by the number of blocks we'd need to fill the OFM area (fewer, larger, blocks are better)
-            block_bias = round_up_divide(ofm_shape.height, ofm_block.height)
-            block_bias *= round_up_divide(ofm_shape.width, ofm_block.width)
-            # Check waste on all axes (prefer depth, width then height)
-            waste_ratio = 1 + (1.2 * ((ofm_shape.depth % ofm_block.depth) / ofm_block.depth))
-            waste_ratio *= 1 + (1.1 * ((ofm_shape.width % ofm_block.width) / ofm_block.width))
-            waste_ratio *= 1 + (1.0 * ((ofm_shape.height % ofm_block.height) / ofm_block.height))
-
-            # Bias for larger area coverage (or volume if not depthwise)
-            area_bias = 1 / (ofm_block.height * ofm_block.width)
-            if not (is_depthwise or is_pooling):
-                area_bias = area_bias / ofm_block.depth
-
-            relative_cost = relative_fetch * block_bias * waste_ratio * area_bias
+            full_blocks = Shape4D.div_round_up(ofm_shape, ofm_block)
+            blocks = ofm_shape / ofm_block
+
+            # Weights fetching
+            weight_fetch = weight_fetch_wh * ifm_shape.depth * full_blocks.elements_wh()
+            if not is_depthwise:
+                weight_fetch *= ofm_block.depth * blocks.depth
+
+            # IFM fetching
+            ifm_fetch = ifm_block.elements_wh() * ifm_shape.depth * ifm_repeats * blocks.elements_wh()
+            if not is_equal_depth_op:
+                ifm_fetch *= full_blocks.depth
+
+            # Scale relative to every output OFM element
+            relative_cost = (ifm_fetch + weight_fetch) / ofm_shape.elements()
 
             # If the entire IFM can be encompassed by both buffers, bias to prefer this configuration
             if ifm_shape.elements() < ifm_block.elements() * 2:
diff --git a/ethosu/vela/npu_performance.py b/ethosu/vela/npu_performance.py
index 5c61c7db..21b420bf 100644
--- a/ethosu/vela/npu_performance.py
+++ b/ethosu/vela/npu_performance.py
@@ -410,6 +410,7 @@ def _estimate_conv_cycles(arch, op_type: Op, faf_type: Op, query: PerformanceQue
 
 def measure_mem2mem_cycles(arch, from_mem_area, to_mem_area, to_transfer):
     from_cycles = to_transfer // arch.memory_bandwidths_per_cycle[from_mem_area]
+    from_cycles += arch.memory_latency[from_mem_area][BandwidthDirection.Read]
     to_cycles = to_transfer // arch.memory_bandwidths_per_cycle[to_mem_area]
     return max(from_cycles, to_cycles)
 
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
index dfb8867e..de2189bc 100644
--- a/ethosu/vela/scheduler.py
+++ b/ethosu/vela/scheduler.py
@@ -459,10 +459,9 @@ class Scheduler:
 
         return npu_performance.measure_cycle_cost(self.arch, op.op_type, op.activation and op.activation.op_type, query)
 
-    def propose_schedule_buffering(self, ref_schedule: Schedule):
+    def propose_schedule_buffering(self, ref_schedule: Schedule, staging_limit_bytes):
         """Create a buffered schedule"""
         buffered_schedule = Schedule(self.sg, f"{ref_schedule.label}_BUFFERED")
-        staging_limit_bytes = self.scheduler_options.optimization_sram_limit
 
         prev_op = None
         for sched_op in self.sched_ops:
@@ -588,24 +587,35 @@ class Scheduler:
             prebuffer_bytes = min(prebuffer_ratio * full_weights_bytes, half_buffer_limit)
         else:
             prebuffer_bytes = min(full_weights_bytes, half_buffer_limit)
-            prebuffer_ratio = prebuffer_bytes / full_weights_bytes
+
+        prebuffer_ratio = prebuffer_bytes / full_weights_bytes
 
         # Have to split the weights if the initial buffering can't store
         # all of the compressed weights
         if prebuffer_bytes < full_weights_bytes:
-            prebuffer_depth = int(ref_cost.stripe.depth * prebuffer_ratio)
+            block_depth = cost.block_config.ofm_block.depth
 
-            # Round prebuffering down to nearest valid split depth
+            # Choose initial prebuffering depth (already buffer clamped)
+            prebuffer_depth = ref_cost.stripe.depth * prebuffer_ratio
             prebuffer_depth = int(max(16, round_down(prebuffer_depth, ArchitectureFeatures.OFMSplitDepth)))
 
-            while True:
-                buffering_depth = max(cost.block_config.ofm_block.depth, prebuffer_depth)
+            # Calculate cycles executed during the prebuffer
+            pre_op_cycles = self.estimate_op_performance(sched_op, cost.block_config, prebuffer_depth)
+            buffering_depth = ref_cost.stripe.depth * (pre_op_cycles.op_cycles / full_transfer_cycles)
 
-                # Clamp buffering to the double buffering limit
-                buffering_bytes = (buffering_depth / ref_cost.stripe.depth) * full_weights_bytes
-                if buffering_bytes > half_buffer_limit:
-                    buffering_depth = (half_buffer_limit / full_weights_bytes) * ref_cost.stripe.depth
-                    buffering_depth = int(max(16, round_down(prebuffer_depth, ArchitectureFeatures.OFMSplitDepth)))
+            # Choose initial buffering depth and clamp to the double buffering limit
+            buffering_depth = round_up(buffering_depth, block_depth)
+            buffering_bytes = (buffering_depth / ref_cost.stripe.depth) * full_weights_bytes
+            if buffering_bytes > half_buffer_limit:
+                buffering_depth = (half_buffer_limit / full_weights_bytes) * ref_cost.stripe.depth
+
+            while True:
+                # Attempt to buffer whole blocks
+                if buffering_bytes > block_depth:
+                    buffering_depth = round_down(buffering_depth, block_depth)
+                else:
+                    buffering_depth = round_down(buffering_depth, ArchitectureFeatures.OFMSplitDepth)
+                buffering_depth = int(max(buffering_depth, ArchitectureFeatures.OFMSplitDepth))
 
                 # Create list of depth slices
                 depth_slices = [0]
@@ -633,7 +643,10 @@ class Scheduler:
                ):
                    break
 
-                prebuffer_depth = round_up(prebuffer_depth // 2, ArchitectureFeatures.OFMSplitDepth)
+                if buffering_depth > prebuffer_depth:
+                    buffering_depth = round_up(buffering_depth // 2, ArchitectureFeatures.OFMSplitDepth)
+                else:
+                    prebuffer_depth = round_up(prebuffer_depth // 2, ArchitectureFeatures.OFMSplitDepth)
 
            # Calculate cycles required to run the last op for use as future slack
            tail_cycles = self.estimate_op_performance(
@@ -790,7 +803,9 @@ class Scheduler:
         cascade_builder = CascadeBuilder(sub_schedule_ops, self.arch.is_spilling_enabled(), non_local_mem_usage)
 
         # Start by adding buffering
-        buffered_sub_schedule = self.propose_schedule_buffering(sub_schedule)
+        buffered_sub_schedule = self.propose_schedule_buffering(
+            sub_schedule, self.scheduler_options.optimization_sram_limit
+        )
         # Copy the cascades over from the unbuffered-schedule
         buffered_sub_schedule.cascades = sub_schedule.cascades
 
@@ -852,7 +867,7 @@ class Scheduler:
         self.sg.schedule = schedule
         self.update_op_memory_snapshot(schedule)
         # Propose schedule buffering to the optimized schedule
-        optimized_sched = self.propose_schedule_buffering(schedule)
+        optimized_sched = self.propose_schedule_buffering(schedule, self.scheduler_options.optimization_sram_limit)
         # Copy the cascade's metadata from the unbuffered schedule
         optimized_sched.cascades = schedule.cascades
         return optimized_sched
@@ -1047,7 +1062,7 @@ def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_o
     # Create the optimimised Max schedule
     sg.schedule = max_schedule_template
     scheduler.update_op_memory_snapshot(max_schedule_template)
-    opt_max_schedule = scheduler.propose_schedule_buffering(max_schedule_template)
+    opt_max_schedule = scheduler.propose_schedule_buffering(max_schedule_template, 1 << 32)
    sg.schedule = opt_max_schedule
    scheduler.update_op_memory_snapshot(opt_max_schedule)
-- 
cgit v1.2.1
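
Note: the relative-cost metric added to find_block_config() above can be read in isolation with the short sketch below. This is an illustrative re-expression, not vela code: plain (height, width, depth) tuples stand in for Shape4D, elements_wh() is assumed to mean height * width, and the function name, parameter list, and comments are invented for this sketch.

    # Standalone sketch of the new block-config cost metric (names local to this sketch)
    from math import ceil


    def relative_block_cost(
        ofm_shape,          # (height, width, depth) of the full OFM
        ofm_block,          # (height, width, depth) of the candidate OFM block
        ifm_block_wh,       # (height, width) of the matching IFM block
        ifm_depth,          # IFM depth
        ifm_repeats,        # number of times the IFM is re-read
        weight_fetch_wh,    # weight fetch contribution per W/H block position (computed by the caller)
        is_depthwise,
        is_equal_depth_op,
    ):
        oh, ow, od = ofm_shape
        bh, bw, bd = ofm_block

        # Whole blocks needed to cover the OFM (rounded up) versus fractional
        # coverage; the fractional form credits partial fetches at the OFM edge.
        full_blocks_wh = ceil(oh / bh) * ceil(ow / bw)
        full_blocks_d = ceil(od / bd)
        blocks_wh = (oh / bh) * (ow / bw)
        blocks_d = od / bd

        # Weights are re-fetched for every whole block position in the W/H plane
        weight_fetch = weight_fetch_wh * ifm_depth * full_blocks_wh
        if not is_depthwise:
            weight_fetch *= bd * blocks_d

        # IFM traffic scales with the fractional W/H coverage, so blocks that
        # overhang the OFM edge only account for the IFM data they actually read
        ih, iw = ifm_block_wh
        ifm_fetch = ih * iw * ifm_depth * ifm_repeats * blocks_wh
        if not is_equal_depth_op:
            ifm_fetch *= full_blocks_d

        # Normalise by the OFM element count so different block shapes compare fairly
        return (ifm_fetch + weight_fetch) / (oh * ow * od)

A lower score means fewer IFM and weight bytes fetched per OFM element, which is the quantity the selection loop then minimises across candidate block configurations. Keeping the W/H coverage fractional for the IFM term is what implements the first bullet of the commit message: partial blocks at the OFM edge no longer pay for a full block of IFM data.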