author     Tim Hall <tim.hall@arm.com>  2021-06-17 17:02:31 +0100
committer  Tim Hall <tim.hall@arm.com>  2021-06-17 17:02:31 +0100
commit     789e6f3acd1a377dfba80aa18d513579fd33fc93 (patch)
tree       a3f44fafb91f26032c2d273aef6c602da05d4715
parent     a5e8c1c3470409566723919e878b17297a52c54b (diff)
download   ethos-u-vela-789e6f3acd1a377dfba80aa18d513579fd33fc93.tar.gz
vela: Improve block configuration and weight buffering algorithm
- Update block config selection to take into account partial IFM fetches at
  edge of non-whole OFM block data.
- Change to scheduler depth slicing for networks in MLBEDSW-4637 for improved
  buffering. This helps general performance by buffering larger depth slices.
- Bug fix for opt_max_schedule always being fitted to SRAM which prevented
  the optimisation step running in some cases.

Signed-off-by: Tim Hall <tim.hall@arm.com>
Change-Id: I97642c5adec3bb684b1daabf2b81574c27d4eef2
-rw-r--r--  ethosu/vela/architecture_allocator.py  34
-rw-r--r--  ethosu/vela/npu_performance.py           1
-rw-r--r--  ethosu/vela/scheduler.py                47
3 files changed, 47 insertions, 35 deletions
diff --git a/ethosu/vela/architecture_allocator.py b/ethosu/vela/architecture_allocator.py
index c308a4ae..e43b841d 100644
--- a/ethosu/vela/architecture_allocator.py
+++ b/ethosu/vela/architecture_allocator.py
@@ -279,25 +279,21 @@ def find_block_config(
)
if layout:
- # Calculate cost in terms of OFM pixels per IFM+Weights fetch
- ifm_fetch = ifm_block.elements_wh() * ifm_shape.depth
- weight_fetch = weight_fetch_wh * ifm_shape.depth * (1 if is_depthwise else ofm_block.depth)
- relative_fetch = (ifm_fetch * ifm_repeats + weight_fetch) / ofm_block.elements()
-
- # Bias by the number of blocks we'd need to fill the OFM area (fewer, larger, blocks are better)
- block_bias = round_up_divide(ofm_shape.height, ofm_block.height)
- block_bias *= round_up_divide(ofm_shape.width, ofm_block.width)
- # Check waste on all axes (prefer depth, width then height)
- waste_ratio = 1 + (1.2 * ((ofm_shape.depth % ofm_block.depth) / ofm_block.depth))
- waste_ratio *= 1 + (1.1 * ((ofm_shape.width % ofm_block.width) / ofm_block.width))
- waste_ratio *= 1 + (1.0 * ((ofm_shape.height % ofm_block.height) / ofm_block.height))
-
- # Bias for larger area coverage (or volume if not depthwise)
- area_bias = 1 / (ofm_block.height * ofm_block.width)
- if not (is_depthwise or is_pooling):
- area_bias = area_bias / ofm_block.depth
-
- relative_cost = relative_fetch * block_bias * waste_ratio * area_bias
+ full_blocks = Shape4D.div_round_up(ofm_shape, ofm_block)
+ blocks = ofm_shape / ofm_block
+
+ # Weights fetching
+ weight_fetch = weight_fetch_wh * ifm_shape.depth * full_blocks.elements_wh()
+ if not is_depthwise:
+ weight_fetch *= ofm_block.depth * blocks.depth
+
+ # IFM fetching
+ ifm_fetch = ifm_block.elements_wh() * ifm_shape.depth * ifm_repeats * blocks.elements_wh()
+ if not is_equal_depth_op:
+ ifm_fetch *= full_blocks.depth
+
+ # Scale relative to every output OFM element
+ relative_cost = (ifm_fetch + weight_fetch) / ofm_shape.elements()
# If the entire IFM can be encompassed by both buffers, bias to prefer this configuration
if ifm_shape.elements() < ifm_block.elements() * 2:
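
Reading note: the new cost model counts how many (possibly partial) OFM blocks cover the output, estimates the IFM and weight traffic those blocks imply, and normalises by the total number of OFM elements. The sketch below mirrors the structure of the added hunk using simplified stand-ins; the Shape4D dataclass, the div_round_up helper and the example shapes are illustrative assumptions, not vela's real classes or figures.

```python
# Minimal sketch of the fetch-based block-config cost, assuming a simplified
# Shape4D stand-in (height/width/depth only); vela's real Shape4D is richer.
from dataclasses import dataclass
import math


@dataclass
class Shape4D:
    height: int
    width: int
    depth: int

    def elements_wh(self) -> int:
        return self.height * self.width

    def elements(self) -> int:
        return self.height * self.width * self.depth


def div_round_up(shape: Shape4D, block: Shape4D) -> Shape4D:
    # Number of whole blocks needed to cover the shape on each axis
    return Shape4D(
        math.ceil(shape.height / block.height),
        math.ceil(shape.width / block.width),
        math.ceil(shape.depth / block.depth),
    )


def relative_cost(ofm_shape, ofm_block, ifm_shape, ifm_block,
                  weight_fetch_wh, ifm_repeats,
                  is_depthwise=False, is_equal_depth_op=False):
    full_blocks = div_round_up(ofm_shape, ofm_block)
    # Fractional block counts: partial edge blocks contribute proportionally less
    blocks_wh = (ofm_shape.height / ofm_block.height) * (ofm_shape.width / ofm_block.width)
    blocks_d = ofm_shape.depth / ofm_block.depth

    # Weight traffic: weights are re-fetched for every block position in the W/H plane
    weight_fetch = weight_fetch_wh * ifm_shape.depth * full_blocks.elements_wh()
    if not is_depthwise:
        weight_fetch *= ofm_block.depth * blocks_d

    # IFM traffic: partial OFM blocks at the edge fetch only the IFM they need
    ifm_fetch = ifm_block.elements_wh() * ifm_shape.depth * ifm_repeats * blocks_wh
    if not is_equal_depth_op:
        ifm_fetch *= full_blocks.depth

    # Normalise per OFM element so different block shapes compare fairly
    return (ifm_fetch + weight_fetch) / ofm_shape.elements()


# Example: 56x56x64 OFM, two candidate block shapes for a 3x3 convolution
ofm = Shape4D(56, 56, 64)
ifm = Shape4D(58, 58, 32)
print(relative_cost(ofm, Shape4D(16, 16, 32), ifm, Shape4D(18, 18, 32), 9, 1))
print(relative_cost(ofm, Shape4D(8, 8, 64), ifm, Shape4D(10, 10, 32), 9, 1))
```

Lower is better, so with these invented numbers the larger 16x16x32 block would be preferred over the 8x8x64 one.
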
diff --git a/ethosu/vela/npu_performance.py b/ethosu/vela/npu_performance.py
index 5c61c7db..21b420bf 100644
--- a/ethosu/vela/npu_performance.py
+++ b/ethosu/vela/npu_performance.py
@@ -410,6 +410,7 @@ def _estimate_conv_cycles(arch, op_type: Op, faf_type: Op, query: PerformanceQue
def measure_mem2mem_cycles(arch, from_mem_area, to_mem_area, to_transfer):
from_cycles = to_transfer // arch.memory_bandwidths_per_cycle[from_mem_area]
+ from_cycles += arch.memory_latency[from_mem_area][BandwidthDirection.Read]
to_cycles = to_transfer // arch.memory_bandwidths_per_cycle[to_mem_area]
return max(from_cycles, to_cycles)
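
Reading note: the one-line change adds the source memory's read latency on top of the pure bandwidth term when estimating a memory-to-memory transfer. A self-contained sketch of the resulting estimate; the bandwidth and latency numbers below are made-up placeholders, not real Ethos-U parameters.

```python
# Hedged sketch of the DMA cycle estimate after the change: the read-side
# latency is added to the source bandwidth term, then the slower side wins.
def measure_mem2mem_cycles(to_transfer_bytes,
                           from_bw_bytes_per_cycle, from_read_latency_cycles,
                           to_bw_bytes_per_cycle):
    from_cycles = to_transfer_bytes // from_bw_bytes_per_cycle
    from_cycles += from_read_latency_cycles   # new: account for source read latency
    to_cycles = to_transfer_bytes // to_bw_bytes_per_cycle
    return max(from_cycles, to_cycles)


# Example: 4 KB transfer, DRAM source (8 B/cycle, 250-cycle latency), SRAM sink (16 B/cycle)
print(measure_mem2mem_cycles(4096, 8, 250, 16))   # -> 762
```
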
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
index dfb8867e..de2189bc 100644
--- a/ethosu/vela/scheduler.py
+++ b/ethosu/vela/scheduler.py
@@ -459,10 +459,9 @@ class Scheduler:
return npu_performance.measure_cycle_cost(self.arch, op.op_type, op.activation and op.activation.op_type, query)
- def propose_schedule_buffering(self, ref_schedule: Schedule):
+ def propose_schedule_buffering(self, ref_schedule: Schedule, staging_limit_bytes):
"""Create a buffered schedule"""
buffered_schedule = Schedule(self.sg, f"{ref_schedule.label}_BUFFERED")
- staging_limit_bytes = self.scheduler_options.optimization_sram_limit
prev_op = None
for sched_op in self.sched_ops:
@@ -588,24 +587,35 @@ class Scheduler:
prebuffer_bytes = min(prebuffer_ratio * full_weights_bytes, half_buffer_limit)
else:
prebuffer_bytes = min(full_weights_bytes, half_buffer_limit)
- prebuffer_ratio = prebuffer_bytes / full_weights_bytes
+
+ prebuffer_ratio = prebuffer_bytes / full_weights_bytes
# Have to split the weights if the initial buffering can't store
# all of the compressed weights
if prebuffer_bytes < full_weights_bytes:
- prebuffer_depth = int(ref_cost.stripe.depth * prebuffer_ratio)
+ block_depth = cost.block_config.ofm_block.depth
- # Round prebuffering down to nearest valid split depth
+ # Choose initial prebuffering depth (already buffer clamped)
+ prebuffer_depth = ref_cost.stripe.depth * prebuffer_ratio
prebuffer_depth = int(max(16, round_down(prebuffer_depth, ArchitectureFeatures.OFMSplitDepth)))
- while True:
- buffering_depth = max(cost.block_config.ofm_block.depth, prebuffer_depth)
+ # Calculate cycles executed during the prebuffer
+ pre_op_cycles = self.estimate_op_performance(sched_op, cost.block_config, prebuffer_depth)
+ buffering_depth = ref_cost.stripe.depth * (pre_op_cycles.op_cycles / full_transfer_cycles)
- # Clamp buffering to the double buffering limit
- buffering_bytes = (buffering_depth / ref_cost.stripe.depth) * full_weights_bytes
- if buffering_bytes > half_buffer_limit:
- buffering_depth = (half_buffer_limit / full_weights_bytes) * ref_cost.stripe.depth
- buffering_depth = int(max(16, round_down(prebuffer_depth, ArchitectureFeatures.OFMSplitDepth)))
+ # Choose initial buffering depth and clamp to the double buffering limit
+ buffering_depth = round_up(buffering_depth, block_depth)
+ buffering_bytes = (buffering_depth / ref_cost.stripe.depth) * full_weights_bytes
+ if buffering_bytes > half_buffer_limit:
+ buffering_depth = (half_buffer_limit / full_weights_bytes) * ref_cost.stripe.depth
+
+ while True:
+ # Attempt to buffer whole blocks
+ if buffering_bytes > block_depth:
+ buffering_depth = round_down(buffering_depth, block_depth)
+ else:
+ buffering_depth = round_down(buffering_depth, ArchitectureFeatures.OFMSplitDepth)
+ buffering_depth = int(max(buffering_depth, ArchitectureFeatures.OFMSplitDepth))
# Create list of depth slices
depth_slices = [0]
@@ -633,7 +643,10 @@ class Scheduler:
):
break
- prebuffer_depth = round_up(prebuffer_depth // 2, ArchitectureFeatures.OFMSplitDepth)
+ if buffering_depth > prebuffer_depth:
+ buffering_depth = round_up(buffering_depth // 2, ArchitectureFeatures.OFMSplitDepth)
+ else:
+ prebuffer_depth = round_up(prebuffer_depth // 2, ArchitectureFeatures.OFMSplitDepth)
# Calculate cycles required to run the last op for use as future slack
tail_cycles = self.estimate_op_performance(
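
Reading note: the reworked logic above sizes an initial prebuffer, estimates how many cycles the prebuffered slice takes, and from that derives a buffering depth that is rounded to whole OFM blocks and clamped to the double-buffering limit before being retried at smaller depths if the cascade does not fit. Below is a simplified, self-contained sketch of the depth selection only: the retry loop and the depth-slice list are omitted, and round_up/round_down, OFM_SPLIT_DEPTH and every number in the example are illustrative stand-ins rather than vela's real values.

```python
# Simplified sketch of choosing a weight-buffering depth slice.
def round_up(x, to):
    return ((x + to - 1) // to) * to


def round_down(x, to):
    return (x // to) * to


OFM_SPLIT_DEPTH = 16  # stand-in for ArchitectureFeatures.OFMSplitDepth


def choose_buffering_depth(stripe_depth, full_weights_bytes, half_buffer_limit,
                           block_depth, prebuffer_cycles, full_transfer_cycles):
    # Buffer roughly as much depth as can be fetched while the prebuffered
    # slice executes, expressed as a fraction of the full weight transfer
    buffering_depth = stripe_depth * (prebuffer_cycles / full_transfer_cycles)

    # Prefer whole OFM blocks, then clamp to the double-buffering byte limit
    buffering_depth = round_up(int(buffering_depth), block_depth)
    buffering_bytes = (buffering_depth / stripe_depth) * full_weights_bytes
    if buffering_bytes > half_buffer_limit:
        buffering_depth = (half_buffer_limit / full_weights_bytes) * stripe_depth

    # Fall back to the architecture's split-depth granularity, never below it
    buffering_depth = round_down(int(buffering_depth), OFM_SPLIT_DEPTH)
    return max(buffering_depth, OFM_SPLIT_DEPTH)


# Example: 512-deep stripe, 64 KB of weights, 16 KB half-buffer limit,
# 32-deep OFM blocks, prebuffered slice covers 2000 of 6000 transfer cycles
print(choose_buffering_depth(512, 64 * 1024, 16 * 1024, 32, 2000, 6000))  # -> 128
```
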
@@ -790,7 +803,9 @@ class Scheduler:
cascade_builder = CascadeBuilder(sub_schedule_ops, self.arch.is_spilling_enabled(), non_local_mem_usage)
# Start by adding buffering
- buffered_sub_schedule = self.propose_schedule_buffering(sub_schedule)
+ buffered_sub_schedule = self.propose_schedule_buffering(
+ sub_schedule, self.scheduler_options.optimization_sram_limit
+ )
# Copy the cascades over from the unbuffered-schedule
buffered_sub_schedule.cascades = sub_schedule.cascades
@@ -852,7 +867,7 @@ class Scheduler:
self.sg.schedule = schedule
self.update_op_memory_snapshot(schedule)
# Propose schedule buffering to the optimized schedule
- optimized_sched = self.propose_schedule_buffering(schedule)
+ optimized_sched = self.propose_schedule_buffering(schedule, self.scheduler_options.optimization_sram_limit)
# Copy the cascade's metadata from the unbuffered schedule
optimized_sched.cascades = schedule.cascades
return optimized_sched
@@ -1047,7 +1062,7 @@ def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_o
# Create the optimised Max schedule
sg.schedule = max_schedule_template
scheduler.update_op_memory_snapshot(max_schedule_template)
- opt_max_schedule = scheduler.propose_schedule_buffering(max_schedule_template)
+ opt_max_schedule = scheduler.propose_schedule_buffering(max_schedule_template, 1 << 32)
sg.schedule = opt_max_schedule
scheduler.update_op_memory_snapshot(opt_max_schedule)