author     Louis Verhaard <louis.verhaard@arm.com>      2022-03-17 15:59:04 +0100
committer  Fredrik Svedberg <fredrik.svedberg@arm.com>  2022-03-21 07:54:54 +0000
commit     d2b5510697e7789f5a416f9d80d3cb640eecc092 (patch)
tree       548e9822e075fd569d75cee73c6d455d945217af
parent     43d275875bb78163604ec116e06153e53d2fcbc1 (diff)
download   ethos-u-vela-d2b5510697e7789f5a416f9d80d3cb640eecc092.tar.gz
MLBEDSW-6312: Find block config improvement
- The number of accumulators is doubled in an Ethos-U configuration with 2 cores
- Likewise, for elementwise, depthwise and pooling operations the IFM buffer
  depth capacity is doubled
- FindBlock: step the search space depth in multiples of ublock * ncores

Change-Id: I923cc347a2f252876d405ed93095d39181103f81
Signed-off-by: Louis Verhaard <louis.verhaard@arm.com>
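For context (not part of the commit), here is a minimal sketch of the per-core IFM
depth scaling idea: dividing the buffered depth by the core count roughly halves the
per-core SHRAM demand on a 2-core configuration. The round_up and round_up_divide
helpers mirror those used in architecture_allocator.py; the 1024-byte bank size, the
omission of granule rounding, and the block dimensions are assumed example values.

def round_up(value: int, alignment: int) -> int:
    return ((value + alignment - 1) // alignment) * alignment

def round_up_divide(value: int, divisor: int) -> int:
    return (value + divisor - 1) // divisor

def ifm_banks_needed(block_wh: int, block_depth: int, ifm_bits: int,
                     ncores: int, bank_size_bytes: int = 1024) -> int:
    # Each core only buffers a slice of the IFM depth, so with 2 cores the
    # per-core depth (rounded up) is half of the full block depth.
    ifm_depth = round_up_divide(block_depth, ncores)
    ifm_bytes = block_wh * round_up((ifm_depth * ifm_bits) // 8, 8)
    return round_up_divide(ifm_bytes, bank_size_bytes) * 2  # double-buffered

print(ifm_banks_needed(block_wh=64, block_depth=32, ifm_bits=8, ncores=1))  # 4 banks
print(ifm_banks_needed(block_wh=64, block_depth=32, ifm_bits=8, ncores=2))  # 2 banks

Run standalone, the example prints 4 banks for a single core and 2 banks for two
cores for the same block, which is the buffer capacity doubling the commit message
describes for elementwise, depthwise and pooling operations.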
-rw-r--r--  ethosu/vela/architecture_allocator.py  54
1 file changed, 49 insertions, 5 deletions
diff --git a/ethosu/vela/architecture_allocator.py b/ethosu/vela/architecture_allocator.py
index 65a684c..84d8354 100644
--- a/ethosu/vela/architecture_allocator.py
+++ b/ethosu/vela/architecture_allocator.py
@@ -51,6 +51,7 @@ class ArchitectureBlockConfig:
self.acc_type = SHRAMElements.Acc32
self.is_partkernel = False
self.bank_size = 0
+ self.ifm_depth_buf_scaling = 0
def get_shram_memory_access_range(self):
# Returns the SHRAM memory access range used by this shared buffer,
@@ -83,12 +84,18 @@ def _try_block_config(
acc_bits: int,
acc_granule: int,
lut_banks: int,
+ ifm_depth_buf_scaling: int,
+ cores: int,
) -> SHRAMLayout:
assert (acc_bits > 0) and (acc_granule > 0)
assert (ifm_bits >= 8) and ((ifm_bits % 8) == 0) and (ifm_granule > 0)
+ # Scale depth with cores
+ ifm_depth = round_up_divide(ifm_block.depth, ifm_depth_buf_scaling)
+ ofm_depth = round_up_divide(ofm_block.depth, cores)
+
# Aways need IFM space
- ifm_bytes = ifm_block.elements_wh() * round_up((ifm_block.depth * ifm_bits) / 8, 8)
+ ifm_bytes = ifm_block.elements_wh() * round_up((ifm_depth * ifm_bits) / 8, 8)
ifm_banks = round_up_divide(ifm_bytes, shram.bank_size_bytes) * 2
ifm_banks = round_up(ifm_banks, ifm_granule)
@@ -100,7 +107,7 @@ def _try_block_config(
# If not elementwise then we need accumulator space
if ew_usage == ElementwiseUsage.No:
- acc_bytes = (ofm_block.elements_wh() * round_up(ofm_block.depth, 8) * acc_bits) // 8
+ acc_bytes = (ofm_block.elements_wh() * round_up(ofm_depth, 8) * acc_bits) // 8
acc_banks = round_up_divide(acc_bytes, shram.bank_size_bytes) * 2
acc_banks = round_up(acc_banks, acc_granule)
acc_start = acc_start - acc_banks
@@ -246,6 +253,14 @@ def find_block_config(
config = ArchitectureBlockConfig()
config.is_partkernel = is_convolution and _choose_kernel_method(ifm_shape, ifm_bits, kernel)
+ # IFM is not broadcasted for pooling and depthwise ops and for elementwise
+ # when there's no elementwise-broadcasting in depth
+ elemwise_buf_scalable = npu_op_type == NpuBlockType.ElementWise and (
+ not ifm2_shape or ifm_shape.depth == ifm2_shape.depth
+ )
+ ifm_depth_buf_scaling = arch.ncores if is_pooling or is_depthwise or elemwise_buf_scalable else 1
+ config.ifm_depth_buf_scaling = ifm_depth_buf_scaling
+
# Accumulator & granule settings
config.acc_type = _acc_type(npu_op_type, ifm_bits, scaled)
@@ -269,8 +284,9 @@ def find_block_config(
# Weights fetch (for operators that have them)
weight_fetch_wh = (kernel.area_width() * kernel.area_height()) if is_convolution else 0
+ ofm_ublock_depth = arch.ofm_ublock.depth * arch.ncores
search_space = Shape4D.min(ofm_shape, Shape4D(arch.ofm_block_max.to_hwc()))
- search_space = Shape4D.round_up(search_space, Shape4D(arch.ofm_ublock.to_hwc()))
+ search_space = Shape4D.round_up(search_space, Shape4D(arch.ofm_ublock.to_hwc()).with_depth(ofm_ublock_depth))
# Block WHC search, loops across the search space looking for best efficiency
best_cost = math.inf
@@ -297,7 +313,17 @@ def find_block_config(
# Test if the IFM/OFM blocks fit into SHRAM
ofm_block = fit_block_for_ofm(arch, ofm_shape, kernel, ofm_block)
layout = _try_block_config(
- arch.shram, ew_usage, ofm_block, ifm_block, ifm_bits, ifm_granule, acc_bits, acc_granule, lut_banks
+ arch.shram,
+ ew_usage,
+ ofm_block,
+ ifm_block,
+ ifm_bits,
+ ifm_granule,
+ acc_bits,
+ acc_granule,
+ lut_banks,
+ ifm_depth_buf_scaling,
+ arch.ncores,
)
if layout:
@@ -395,6 +421,14 @@ def try_block_config(
config = ArchitectureBlockConfig()
config.is_partkernel = is_partkernel
+ # IFM is not broadcasted for pooling and depthwise ops and for elementwise
+ # when there's no elementwise-broadcasting in depth
+ elemwise_buf_scalable = npu_op_type == NpuBlockType.ElementWise and (
+ not ifm2_shape or ifm_shape.depth == ifm2_shape.depth
+ )
+ ifm_depth_buf_scaling = arch.ncores if is_pooling or is_depthwise or elemwise_buf_scalable else 1
+ config.ifm_depth_buf_scaling = ifm_depth_buf_scaling
+
# Accumulator & granule settings
config.acc_type = _acc_type(npu_op_type, ifm_bits, scaled)
@@ -417,7 +451,17 @@ def try_block_config(
block_config_opt = fit_block_for_ofm(arch, ofm_shape, kernel, block_config)
layout = _try_block_config(
- arch.shram, ew_usage, block_config_opt, ifm_block, ifm_bits, ifm_granule, acc_bits, acc_granule, lut_banks
+ arch.shram,
+ ew_usage,
+ block_config_opt,
+ ifm_block,
+ ifm_bits,
+ ifm_granule,
+ acc_bits,
+ acc_granule,
+ lut_banks,
+ ifm_depth_buf_scaling,
+ arch.ncores,
)
if layout is None:
return None
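
As a complementary illustration (again not part of the commit), the FindBlock change
steps the OFM depth search in multiples of ublock depth * ncores, so every candidate
block depth splits evenly across the cores. A rough sketch under assumed example
values; the helper and the printed candidate lists are for illustration only.

def round_up(value: int, alignment: int) -> int:
    return ((value + alignment - 1) // alignment) * alignment

def depth_search_steps(ofm_depth: int, ofm_ublock_depth: int, ncores: int):
    # The search space depth is rounded up to a multiple of ublock depth * ncores,
    # and candidate block depths are stepped by that same amount.
    step = ofm_ublock_depth * ncores
    upper = round_up(ofm_depth, step)
    return list(range(step, upper + 1, step))

print(depth_search_steps(ofm_depth=40, ofm_ublock_depth=8, ncores=1))  # [8, 16, 24, 32, 40]
print(depth_search_steps(ofm_depth=40, ofm_ublock_depth=8, ncores=2))  # [16, 32, 48]

With one core the depth candidates step by the ublock depth (8); with two cores they
step by 16, matching the rounded-up search space built from
Shape4D(arch.ofm_ublock.to_hwc()).with_depth(ofm_ublock_depth) in the diff above.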