From 789e6f3acd1a377dfba80aa18d513579fd33fc93 Mon Sep 17 00:00:00 2001
From: Tim Hall
Date: Thu, 17 Jun 2021 17:02:31 +0100
Subject: vela: Improve block configuration and weight buffering algorithm

- Update block config selection to take into account partial IFM fetches
  at edge of non-whole OFM block data.
- Change to scheduler depth slicing for networks in MLBEDSW-4637 for
  improved buffering. This helps general performance by buffering larger
  depth slices.
- Bug fix for opt_max_schedule always being fitted to SRAM which
  prevented the optimisation step running in some cases.

Signed-off-by: Tim Hall
Change-Id: I97642c5adec3bb684b1daabf2b81574c27d4eef2
---
 ethosu/vela/architecture_allocator.py | 34 +++++++++++++++-------------------
 1 file changed, 15 insertions(+), 19 deletions(-)

(limited to 'ethosu/vela/architecture_allocator.py')

diff --git a/ethosu/vela/architecture_allocator.py b/ethosu/vela/architecture_allocator.py
index c308a4ae..e43b841d 100644
--- a/ethosu/vela/architecture_allocator.py
+++ b/ethosu/vela/architecture_allocator.py
@@ -279,25 +279,21 @@ def find_block_config(
 )

 if layout:
-    # Calculate cost in terms of OFM pixels per IFM+Weights fetch
-    ifm_fetch = ifm_block.elements_wh() * ifm_shape.depth
-    weight_fetch = weight_fetch_wh * ifm_shape.depth * (1 if is_depthwise else ofm_block.depth)
-    relative_fetch = (ifm_fetch * ifm_repeats + weight_fetch) / ofm_block.elements()
-
-    # Bias by the number of blocks we'd need to fill the OFM area (fewer, larger, blocks are better)
-    block_bias = round_up_divide(ofm_shape.height, ofm_block.height)
-    block_bias *= round_up_divide(ofm_shape.width, ofm_block.width)
-    # Check waste on all axes (prefer depth, width then height)
-    waste_ratio = 1 + (1.2 * ((ofm_shape.depth % ofm_block.depth) / ofm_block.depth))
-    waste_ratio *= 1 + (1.1 * ((ofm_shape.width % ofm_block.width) / ofm_block.width))
-    waste_ratio *= 1 + (1.0 * ((ofm_shape.height % ofm_block.height) / ofm_block.height))
-
-    # Bias for larger area coverage (or volume if not depthwise)
-    area_bias = 1 / (ofm_block.height * ofm_block.width)
-    if not (is_depthwise or is_pooling):
-        area_bias = area_bias / ofm_block.depth
-
-    relative_cost = relative_fetch * block_bias * waste_ratio * area_bias
+    full_blocks = Shape4D.div_round_up(ofm_shape, ofm_block)
+    blocks = ofm_shape / ofm_block
+
+    # Weights fetching
+    weight_fetch = weight_fetch_wh * ifm_shape.depth * full_blocks.elements_wh()
+    if not is_depthwise:
+        weight_fetch *= ofm_block.depth * blocks.depth
+
+    # IFM fetching
+    ifm_fetch = ifm_block.elements_wh() * ifm_shape.depth * ifm_repeats * blocks.elements_wh()
+    if not is_equal_depth_op:
+        ifm_fetch *= full_blocks.depth
+
+    # Scale relative to every output OFM element
+    relative_cost = (ifm_fetch + weight_fetch) / ofm_shape.elements()

     # If the entire IFM can be encompassed by both buffers, bias to prefer this configuration
     if ifm_shape.elements() < ifm_block.elements() * 2:
--
cgit v1.2.1