about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Louis Verhaard <louis.verhaard@arm.com> 2020-11-20 13:08:55 +0100
committer Louis Verhaard <louis.verhaard@arm.com> 2020-11-26 17:18:48 +0100
commit d2665804871d76a16d5962952ba95500e3977c56 (patch)
tree ea7fc78d7dae7f4258939cd2cfa8cffad92e566d
parent 603016ccaa6cdb1a9b6d4547c561e4b45c90d3d5 (diff)
download ethos-u-vela-d2665804871d76a16d5962952ba95500e3977c56.tar.gz
MLBEDSW-3562: Improve blockdep calculation
Blockdep calculation can now handle different sized IFM/OFM.

Change-Id: I898a3c1c3a6778916802f3dbfa658328e5093096
Signed-off-by: Louis Verhaard <louis.verhaard@arm.com>
-rw-r--r-- ethosu/vela/architecture_features.py 23
-rw-r--r-- ethosu/vela/register_command_stream_generator.py 134
-rw-r--r-- ethosu/vela/test/test_register_command_generator.py 102
3 files changed, 213 insertions, 46 deletions
diff --git a/ethosu/vela/architecture_features.py b/ethosu/vela/architecture_features.py
index 9f27b7e..64005bf 100644
--- a/ethosu/vela/architecture_features.py
+++ b/ethosu/vela/architecture_features.py
@@ -510,12 +510,9 @@ class ArchitectureFeatures:
start_coord[1] + ifm_block.height,
start_coord[2] + ifm_block.depth,
)
-
return (start_coord, end_coord, 1) # start, end, total jobs
- def get_prev_job_output_volume(
- self, ifm: Rect, ofm: Rect, ifm_block_depth, ofm_block: Block, kernel: Kernel, block_offset
- ):
+ def get_prev_job_output_volume(self, ofm: Rect, ofm_block: Block, block_offset):
assert block_offset >= 0
# Get OFM block's volume coordinates
@@ -527,28 +524,20 @@ class ArchitectureFeatures:
start_coord[1] + ofm_block.height,
start_coord[2] + ofm_block.depth,
)
-
- # Calculate how many IFM blocks this OFM block requires (i.e how many jobs)
- ifm_depth_blocks = round_up_divide(ifm.size().depth, ifm_block_depth)
- ifm_depth_blocks = 1 # Overwrite with 1 to force OFM block dependency, not IFM
-
- return (start_coord, end_coord, ifm_depth_blocks) # start, end, total jobs for this OFM block
+ return (start_coord, end_coord, 1) # start, end, total jobs for this OFM block
def calc_block_dep(
self,
- prev_ifm: Rect,
prev_ofm: Rect,
- prev_ifm_block_depth,
prev_ofm_block: Block,
- prev_kernel: Kernel,
ifm: Rect,
ofm: Rect,
ifm_block_depth,
ofm_block: Block,
kernel: Kernel,
padLT,
+ intersects,
):
-
blockdep = ArchitectureFeatures.MAX_BLOCKDEP
# Iterate over the next BLOCKDEP inputs, checking to see if a sliding window
@@ -566,16 +555,14 @@ class ArchitectureFeatures:
outstanding_jobs = 0
for block_offset in range(ArchitectureFeatures.MAX_BLOCKDEP):
# This is the OFM block being generated by the previous op
- out_area = self.get_prev_job_output_volume(
- prev_ifm, prev_ofm, prev_ifm_block_depth, prev_ofm_block, prev_kernel, block_offset
- )
+ out_area = self.get_prev_job_output_volume(prev_ofm, prev_ofm_block, block_offset)
if out_area is None:
break
# Block dependency is the max number of allowed outstanding jobs
# in the pipeline. Selected by determining how many jobs occur
# in between two operators' overlapping OFM->IFM block volumes
- if ArchitectureFeatures.intersects(in_area[0], in_area[1], out_area[0], out_area[1]):
+ if intersects(in_area[0], in_area[1], out_area[0], out_area[1]):
break
# Early exit if no intersections and we've seen enough jobs in the pipeline
elif outstanding_jobs > ArchitectureFeatures.MAX_BLOCKDEP:
diff --git a/ethosu/vela/register_command_stream_generator.py b/ethosu/vela/register_command_stream_generator.py
index 015a8c4..741b09c 100644
--- a/ethosu/vela/register_command_stream_generator.py
+++ b/ethosu/vela/register_command_stream_generator.py
@@ -23,6 +23,7 @@ from enum import Enum
from enum import IntEnum
from typing import List
from typing import Optional
+from typing import Tuple
import numpy as np
@@ -745,6 +746,17 @@ def ranges_overlap(range1: NpuAddressRange, range2: NpuAddressRange) -> bool:
)
+def range_lists_overlap(list1: List[Optional[NpuAddressRange]], list2: List[Optional[NpuAddressRange]]) -> bool:
+ """Checks if there is any address overlap between list1 and list2"""
+ for range1 in list1:
+ if range1 is None:
+ continue
+ for range2 in list2:
+ if range2 is not None and ranges_overlap(range1, range2):
+ return True
+ return False
+
+
def get_strides(fm: NpuFeatureMap) -> NpuShape3D:
"""Calculates STRIDE_C/Y/X"""
if fm.strides is not None:
@@ -785,12 +797,63 @@ def get_address(fm: NpuFeatureMap, strides: NpuShape3D, y: int, x: int, c: int)
def get_address_range(
fm: NpuFeatureMap, strides: NpuShape3D, y0: int, x0: int, c0: int, y1: int, x1: int, c1: int
) -> NpuAddressRange:
- """Gets address range for (y0, x0, c0) - (y1, x1, c1)"""
+ """
+ Gets address range for (y0, x0, c0) - (y1, x1, c1) (inclusive, so the second coordinate is within the fm).
+ The begin and end coordinates must be within the same tile.
+ """
addr0 = get_address(fm, strides, y0, x0, c0)
addr1 = get_address(fm, strides, y1, x1, c1)
return NpuAddressRange(region=fm.region, address=addr0, length=addr1 - addr0 + fm.data_type.size_in_bytes())
+def get_h_ranges(
+ fm: NpuFeatureMap, strides: NpuShape3D, y0: int, x0: int, c0: int, y1: int, x1: int, c1: int
+) -> List[NpuAddressRange]:
+ """
+ Gets address ranges for (y0, x0, c0) - (y1, x1, c1) (inclusive, so the second coordinate is within the fm);
+ the begin and end coordinates must be within the same tile.
+ Divides the area in horizontal "stripes" of height 1, and returns the address ranges for these "stripes".
+ """
+ return [get_address_range(fm, strides, y, x0, c0, y, x1, c1) for y in range(y0, y1 + 1)]
+
+
+def get_address_ranges_for_area(
+ fm: NpuFeatureMap, y0: int, x0: int, c0: int, y1: int, x1: int, c1: int
+) -> List[NpuAddressRange]:
+ """
+ Returns a list of adddress ranges that covers the area (y0, x0, c0) - (y1, x1, c1) (inclusive).
+ Divides the area in horizontal "stripes" of height 1, and returns the address ranges for these "stripes".
+
+ For example, for the area marked with X (in a feature map with 4 tiles) as input, this function would return
+ 6 address ranges: the address ranges for 1-height areas [AAA, BBB, CC, DD, EEE, FF]
+
+ .....|.... .....|....
+ t0 ..XXX|XX.. t1 t0 ..AAA|CC.. t1
+ ..XXX|XX.. ..BBB|DD..
+ -----+---- --> -----+----
+ t2 ..XXX|XX.. t3 t2 ..EEE|FF.. t3
+ .....|.... .....|....
+ """
+ strides = get_strides(fm)
+ height_0, height_1, width_0 = fm.tiles.height_0, fm.tiles.height_1, fm.tiles.width_0
+ h, w, c = fm.shape
+ y2, x2, c2 = min(y1, h - 1), min(x1, w - 1), min(c1, c - 1)
+ ranges = []
+ if x0 < width_0 and y0 < height_0:
+ # Horizontal ranges for tile 0
+ ranges.extend(get_h_ranges(fm, strides, y0, x0, c0, min(y2, height_0 - 1), min(x2, width_0 - 1), c2))
+ if x2 >= width_0 and y0 < height_1:
+ # Horizontal ranges for tile 1
+ ranges.extend(get_h_ranges(fm, strides, y0, max(x0, width_0), c0, min(y2, height_1 - 1), x2, c2))
+ if x0 < width_0 and y2 >= height_0:
+ # Horizontal ranges for tile 2
+ ranges.extend(get_h_ranges(fm, strides, max(y0, height_0), x0, c0, y2, min(x2, width_0 - 1), c2))
+ if x2 >= width_0 and y2 >= height_1:
+ # Horizontal ranges for tile 3
+ ranges.extend(get_h_ranges(fm, strides, max(y0, height_1), max(x0, width_0), c0, y2, x2, c2))
+ return ranges
+
+
def get_address_ranges(fm: NpuFeatureMap) -> List[Optional[NpuAddressRange]]:
"""Returns 4 adddress ranges, one for every tile, None if the tile is not in use"""
strides = get_strides(fm)
@@ -806,7 +869,7 @@ def get_address_ranges(fm: NpuFeatureMap) -> List[Optional[NpuAddressRange]]:
else:
t2 = None
if t1 is not None and t2 is not None:
- t3 = get_address_range(fm, strides, height_0, width_0, 0, height - 1, width - 1, depth - 1)
+ t3 = get_address_range(fm, strides, height_1, width_0, 0, height - 1, width - 1, depth - 1)
else:
t3 = None
return [t0, t1, t2, t3]
@@ -934,22 +997,8 @@ def generate_cmd_waits(emit: CommandStreamEmitter, cmd_waits: Watermark):
# -------------------------------------------------------------------
-def is_dependent_on_prev_op(prev_op: NpuBlockOperation, npu_op: NpuBlockOperation) -> bool:
- """Checks if npu_op's input is dependent on prev_op's output"""
- assert npu_op.ifm is not None
- assert prev_op.ofm is not None
- curr_input_ranges = get_address_ranges(npu_op.ifm)
-
- if has_ifm2(npu_op):
- assert npu_op.ifm2 is not None
- curr_input_ranges.extend(get_address_ranges(npu_op.ifm2))
- for prev_range in get_address_ranges(prev_op.ofm):
- if prev_range is None:
- continue
- for curr_range in curr_input_ranges:
- if curr_range is not None and ranges_overlap(prev_range, curr_range):
- return True
- return False
+def shape3d_size(shape: NpuShape3D) -> int:
+ return shape.width * shape.height * shape.depth
def shape3d_to_rect(shape: NpuShape3D) -> Rect:
@@ -970,35 +1019,66 @@ def calc_blockdep(arch: ArchitectureFeatures, prev_op: Optional[NpuBlockOperatio
"""Calculates the value of the BLOCKDEP register"""
if prev_op is None:
return 0
- if not is_dependent_on_prev_op(prev_op, npu_op):
+ assert npu_op.ifm is not None
+ assert prev_op.ofm is not None
+ # Check if IFM or IFM2 overlaps with prev op's OFM
+ prev_ofm_ranges = get_address_ranges(prev_op.ofm)
+ ifm_ranges = get_address_ranges(npu_op.ifm)
+ ifm_overlaps = range_lists_overlap(prev_ofm_ranges, ifm_ranges)
+ if has_ifm2(npu_op):
+ assert npu_op.ifm2 is not None
+ ifm2_ranges = get_address_ranges(npu_op.ifm2)
+ ifm2_overlaps = range_lists_overlap(prev_ofm_ranges, ifm2_ranges)
+ else:
+ ifm2_overlaps = False
+ if ifm_overlaps and ifm2_overlaps:
+ # Both IFM and IFM2 overlap (should be rare)
+ return 0
+ if not ifm_overlaps and not ifm2_overlaps:
+ # No overlap between prev OFM and IFM/IFM2
return ArchitectureFeatures.MAX_BLOCKDEP
- if prev_op.ofm.shape != npu_op.ifm.shape:
+ if ifm2_overlaps and shape3d_size(npu_op.ifm2.shape) < shape3d_size(npu_op.ifm.shape):
+ # Prev OFM produces IFM2 which is broadcasted (this should be rare)
return 0
prev_block_config = prev_op.block_config
block_config = npu_op.block_config
- prev_ifm_block_depth = get_ifm_ofm_block_depth(arch, prev_op)
+ overlapping_fm = npu_op.ifm if ifm_overlaps else npu_op.ifm2
+ assert overlapping_fm is not None
+
+ def intersects(ifm_start_coord: Tuple, ifm_end_coord: Tuple, ofm_start_coord: Tuple, ofm_end_coord: Tuple) -> bool:
+ """Checks if the given IFM area overlaps with the given OFM area"""
+ if overlapping_fm.shape == prev_op.ofm.shape and overlapping_fm.tiles == prev_op.ofm.tiles:
+ # Common case: prev_op.ofm == op.ifm; in this case it suffices to check
+ # if the xyz coordinates overlap, which is quick and easy
+ return ArchitectureFeatures.intersects(ifm_start_coord, ifm_end_coord, ofm_start_coord, ofm_end_coord)
+ # The OFM produces a part of the IFM (e.g. a stripe), or the IFM consumes part of the OFM.
+ # In this case address comparison is needed between the two areas
+ x0, y0, c0 = ifm_start_coord
+ x1, y1, c1 = ifm_end_coord
+ ifm_ranges = get_address_ranges_for_area(overlapping_fm, y0, x0, c0, y1, x1, c1)
+ x0, y0, c0 = ofm_start_coord
+ x1, y1, c1 = ofm_end_coord
+ prev_ofm_ranges = get_address_ranges_for_area(prev_op.ofm, y0, x0, c0, y1, x1, c1)
+ return range_lists_overlap(ifm_ranges, prev_ofm_ranges)
+
prev_ofm_block = Block(prev_block_config.width, prev_block_config.height, prev_block_config.depth)
prev_ofm_rect = shape3d_to_rect(prev_op.ofm.shape)
- prev_ifm_rect = shape3d_to_rect(prev_op.ifm.shape)
cur_ifm_block_depth = get_ifm_ofm_block_depth(arch, npu_op)
cur_ofm_block = Block(block_config.width, block_config.height, block_config.depth)
cur_ofm_rect = shape3d_to_rect(npu_op.ofm.shape)
cur_ifm_rect = shape3d_to_rect(npu_op.ifm.shape)
cur_padLT = (0, 0) if npu_op.padding is None else (npu_op.padding.left, npu_op.padding.top)
- blockdep = arch.calc_block_dep(
- prev_ifm_rect,
+ return arch.calc_block_dep(
prev_ofm_rect,
- prev_ifm_block_depth,
prev_ofm_block,
- to_kernel(prev_op.kernel),
cur_ifm_rect,
cur_ofm_rect,
cur_ifm_block_depth,
cur_ofm_block,
to_kernel(npu_op.kernel),
cur_padLT,
+ intersects=intersects,
)
- return blockdep
# -------------------------------------------------------------------
diff --git a/ethosu/vela/test/test_register_command_generator.py b/ethosu/vela/test/test_register_command_generator.py
index f2a1609..2760c86 100644
--- a/ethosu/vela/test/test_register_command_generator.py
+++ b/ethosu/vela/test/test_register_command_generator.py
@@ -17,13 +17,24 @@
# Description:
# Contains unit tests for register command stream generator
from ethosu.vela.api import NpuAddressRange
+from ethosu.vela.api import NpuBlockTraversal
+from ethosu.vela.api import NpuConv2DOperation
+from ethosu.vela.api import NpuConvDepthWiseOperation
from ethosu.vela.api import NpuDataType
+from ethosu.vela.api import NpuElementWiseOp
+from ethosu.vela.api import NpuElementWiseOperation
from ethosu.vela.api import NpuFeatureMap
+from ethosu.vela.api import NpuKernel
from ethosu.vela.api import NpuLayout
+from ethosu.vela.api import NpuPadding
from ethosu.vela.api import NpuShape3D
from ethosu.vela.api import NpuTileBox
+from ethosu.vela.architecture_features import Accelerator
+from ethosu.vela.architecture_features import create_default_arch
+from ethosu.vela.register_command_stream_generator import calc_blockdep
from ethosu.vela.register_command_stream_generator import get_address_ranges
from ethosu.vela.register_command_stream_generator import get_strides
+from ethosu.vela.test.extapi.test_extapi_generate_commands import create_feature_map
def test_get_fm_strides():
@@ -39,6 +50,11 @@ def test_get_fm_strides():
assert get_strides(fm) == NpuShape3D(height=240, width=24, depth=1)
+# -------------------------------------------------------------------
+# ADDRESS TESTS
+# -------------------------------------------------------------------
+
+
def test_get_address_ranges_one_tile():
"""Tests calculation of feature map address ranges, with 1 tile used"""
fm = NpuFeatureMap()
@@ -100,5 +116,89 @@ def test_get_address_ranges_4_tiles():
NpuAddressRange(region=6, address=16, length=18952),
NpuAddressRange(region=6, address=32000, length=6280),
NpuAddressRange(region=6, address=8000, length=12552),
- NpuAddressRange(region=6, address=28800, length=12680),
+ NpuAddressRange(region=6, address=16000, length=25480),
]
+
+
+# -------------------------------------------------------------------
+# BLOCKDEP TESTS
+# -------------------------------------------------------------------
+
+
+def test_calc_blockdep0():
+ """
+ Tests blockdep calculation, op1 that produces op2's IFM2.
+ op2 takes 1 block to complete, which results in blockdep 0
+ """
+ op1 = NpuElementWiseOperation(NpuElementWiseOp.CLZ)
+ op1.ifm = create_feature_map(NpuShape3D(height=1, width=1, depth=1), 1, 0x60, layout=NpuLayout.NHCWB16,)
+ intermediate_fm = create_feature_map(NpuShape3D(height=1, width=1, depth=1), 1, 0xA0, layout=NpuLayout.NHCWB16,)
+ op1.ofm = intermediate_fm
+ op1.block_config = NpuShape3D(height=1, width=1, depth=4)
+ op2 = NpuElementWiseOperation(NpuElementWiseOp.SUB)
+ op2.ifm = create_feature_map(NpuShape3D(height=1, width=1, depth=1), 1, 0x39AC0, layout=NpuLayout.NHCWB16,)
+ op2.ifm2 = intermediate_fm
+ op2.ofm = create_feature_map(NpuShape3D(height=1, width=1, depth=1), 1, 0xE0, layout=NpuLayout.NHCWB16,)
+ op2.block_config = NpuShape3D(height=1, width=1, depth=4)
+ arch = create_default_arch(Accelerator.Ethos_U55_128)
+ block_dep = calc_blockdep(arch, op1, op2)
+ assert block_dep == 0
+
+
+def test_calc_blockdep2():
+ """
+ Tests blockdep calculation, op1 produces part of the input of op2,
+ op1 and op2 have different sizes.
+ op2 takes 3 blocks to complete, op1's last block collides with op2's last block
+ which results in blockdep 2
+ """
+ op1 = NpuConv2DOperation()
+ op1.ifm = create_feature_map(NpuShape3D(height=4, width=48, depth=8), 1, 0x4C80, layout=NpuLayout.NHCWB16,)
+ op1.ofm = create_feature_map(NpuShape3D(height=4, width=48, depth=16), 1, 0x6480, layout=NpuLayout.NHCWB16,)
+ op1.kernel = NpuKernel(1, 1)
+ op1.weights = [NpuAddressRange(region=1, address=0x4AE0, length=208)]
+ op1.biases = [NpuAddressRange(region=1, address=0x49A0, length=160)]
+ op1.padding = NpuPadding(top=0, left=0, right=0, bottom=0)
+ op1.block_traversal = NpuBlockTraversal.PART_KERNEL_FIRST
+ op1.block_config = NpuShape3D(height=4, width=6, depth=16)
+ op2 = NpuConvDepthWiseOperation()
+ op2.ifm = create_feature_map(NpuShape3D(height=3, width=48, depth=16), 1, 0, layout=NpuLayout.NHCWB16,)
+ # op2 has two tiles, the lower tile is produced by op1
+ op2.ifm.tiles = NpuTileBox(height_0=2, height_1=2, width_0=48, addresses=[0x7680, 0, 0x6480, 0])
+ op2.ofm = create_feature_map(NpuShape3D(height=1, width=24, depth=16), 1, 0x6480, layout=NpuLayout.NHCWB16,)
+ op2.kernel = NpuKernel(3, 3, stride_x=2, stride_y=2)
+ op2.weights = [NpuAddressRange(region=1, address=0x4BB0, length=208)]
+ op2.biases = [NpuAddressRange(region=1, address=0x4A40, length=160)]
+ op2.padding = NpuPadding(top=0, left=0, right=0, bottom=0)
+ op2.block_config = NpuShape3D(height=1, width=8, depth=16)
+ arch = create_default_arch(Accelerator.Ethos_U55_128)
+ block_dep = calc_blockdep(arch, op1, op2)
+ assert block_dep == 2
+
+
+def test_calc_blockdep3():
+ """
+ Tests blockdep calculation, op2 consumes part of op1, op1 and op2 have different sizes.
+ There is no overlap between the last blocks of op1 and the first jobs of op2,
+ which results in blockdep 3
+ """
+ op1 = NpuConv2DOperation()
+ op1.ifm = create_feature_map(NpuShape3D(height=13, width=96, depth=1), 1, 0, layout=NpuLayout.NHWC,)
+ op1.ofm = create_feature_map(NpuShape3D(height=6, width=48, depth=8), 1, 0x7C80, layout=NpuLayout.NHCWB16,)
+ op1.kernel = NpuKernel(3, 3, stride_x=2, stride_y=2)
+ op1.weights = [NpuAddressRange(region=1, address=0x4AE0, length=144)]
+ op1.biases = [NpuAddressRange(region=1, address=0x49A0, length=80)]
+ op1.padding = NpuPadding(top=0, left=0, right=1, bottom=0)
+ op1.block_traversal = NpuBlockTraversal.PART_KERNEL_FIRST
+ op1.block_config = NpuShape3D(height=6, width=3, depth=8)
+ op2 = NpuConvDepthWiseOperation()
+ op2.ifm = create_feature_map(NpuShape3D(height=5, width=48, depth=8), 1, 0x7C80, layout=NpuLayout.NHCWB16,)
+ op2.ofm = create_feature_map(NpuShape3D(height=4, width=48, depth=8), 1, 0x4C80, layout=NpuLayout.NHCWB16,)
+ op2.kernel = NpuKernel(3, 3)
+ op2.weights = [NpuAddressRange(region=1, address=0x4BB0, length=112)]
+ op2.biases = [NpuAddressRange(region=1, address=0x4A40, length=80)]
+ op2.padding = NpuPadding(top=0, left=0, right=0, bottom=0)
+ op2.block_config = NpuShape3D(height=4, width=6, depth=8)
+ arch = create_default_arch(Accelerator.Ethos_U55_128)
+ block_dep = calc_blockdep(arch, op1, op2)
+ assert block_dep == 3