diff options
author | Louis Verhaard <louis.verhaard@arm.com> | 2020-11-20 13:08:55 +0100 |
---|---|---|
committer | Louis Verhaard <louis.verhaard@arm.com> | 2020-11-26 17:18:48 +0100 |
commit | d2665804871d76a16d5962952ba95500e3977c56 (patch) | |
tree | ea7fc78d7dae7f4258939cd2cfa8cffad92e566d | |
parent | 603016ccaa6cdb1a9b6d4547c561e4b45c90d3d5 (diff) | |
download | ethos-u-vela-d2665804871d76a16d5962952ba95500e3977c56.tar.gz |
MLBEDSW-3562: Improve blockdep calculation

Blockdep calculation can now handle different sized IFM/OFM.

Change-Id: I898a3c1c3a6778916802f3dbfa658328e5093096
Signed-off-by: Louis Verhaard <louis.verhaard@arm.com>
-rw-r--r-- | ethosu/vela/architecture_features.py | 23 | ||||
-rw-r--r-- | ethosu/vela/register_command_stream_generator.py | 134 | ||||
-rw-r--r-- | ethosu/vela/test/test_register_command_generator.py | 102 |
3 files changed, 213 insertions, 46 deletions
diff --git a/ethosu/vela/architecture_features.py b/ethosu/vela/architecture_features.py index 9f27b7ed..64005bf5 100644 --- a/ethosu/vela/architecture_features.py +++ b/ethosu/vela/architecture_features.py @@ -510,12 +510,9 @@ class ArchitectureFeatures: start_coord[1] + ifm_block.height, start_coord[2] + ifm_block.depth, ) - return (start_coord, end_coord, 1) # start, end, total jobs - def get_prev_job_output_volume( - self, ifm: Rect, ofm: Rect, ifm_block_depth, ofm_block: Block, kernel: Kernel, block_offset - ): + def get_prev_job_output_volume(self, ofm: Rect, ofm_block: Block, block_offset): assert block_offset >= 0 # Get OFM block's volume coordinates @@ -527,28 +524,20 @@ class ArchitectureFeatures: start_coord[1] + ofm_block.height, start_coord[2] + ofm_block.depth, ) - - # Calculate how many IFM blocks this OFM block requires (i.e how many jobs) - ifm_depth_blocks = round_up_divide(ifm.size().depth, ifm_block_depth) - ifm_depth_blocks = 1 # Overwrite with 1 to force OFM block dependency, not IFM - - return (start_coord, end_coord, ifm_depth_blocks) # start, end, total jobs for this OFM block + return (start_coord, end_coord, 1) # start, end, total jobs for this OFM block def calc_block_dep( self, - prev_ifm: Rect, prev_ofm: Rect, - prev_ifm_block_depth, prev_ofm_block: Block, - prev_kernel: Kernel, ifm: Rect, ofm: Rect, ifm_block_depth, ofm_block: Block, kernel: Kernel, padLT, + intersects, ): - blockdep = ArchitectureFeatures.MAX_BLOCKDEP # Iterate over the next BLOCKDEP inputs, checking to see if a sliding window @@ -566,16 +555,14 @@ class ArchitectureFeatures: outstanding_jobs = 0 for block_offset in range(ArchitectureFeatures.MAX_BLOCKDEP): # This is the OFM block being generated by the previous op - out_area = self.get_prev_job_output_volume( - prev_ifm, prev_ofm, prev_ifm_block_depth, prev_ofm_block, prev_kernel, block_offset - ) + out_area = self.get_prev_job_output_volume(prev_ofm, prev_ofm_block, block_offset) if out_area is None: break # Block 
dependency is the max number of allowed outstanding jobs # in the pipeline. Selected by determining how many jobs occur # in between two operators' overlapping OFM->IFM block volumes - if ArchitectureFeatures.intersects(in_area[0], in_area[1], out_area[0], out_area[1]): + if intersects(in_area[0], in_area[1], out_area[0], out_area[1]): break # Early exit if no intersections and we've seen enough jobs in the pipeline elif outstanding_jobs > ArchitectureFeatures.MAX_BLOCKDEP: diff --git a/ethosu/vela/register_command_stream_generator.py b/ethosu/vela/register_command_stream_generator.py index 015a8c49..741b09c1 100644 --- a/ethosu/vela/register_command_stream_generator.py +++ b/ethosu/vela/register_command_stream_generator.py @@ -23,6 +23,7 @@ from enum import Enum from enum import IntEnum from typing import List from typing import Optional +from typing import Tuple import numpy as np @@ -745,6 +746,17 @@ def ranges_overlap(range1: NpuAddressRange, range2: NpuAddressRange) -> bool: ) +def range_lists_overlap(list1: List[Optional[NpuAddressRange]], list2: List[Optional[NpuAddressRange]]) -> bool: + """Checks if there is any address overlap between list1 and list2""" + for range1 in list1: + if range1 is None: + continue + for range2 in list2: + if range2 is not None and ranges_overlap(range1, range2): + return True + return False + + def get_strides(fm: NpuFeatureMap) -> NpuShape3D: """Calculates STRIDE_C/Y/X""" if fm.strides is not None: @@ -785,12 +797,63 @@ def get_address(fm: NpuFeatureMap, strides: NpuShape3D, y: int, x: int, c: int) def get_address_range( fm: NpuFeatureMap, strides: NpuShape3D, y0: int, x0: int, c0: int, y1: int, x1: int, c1: int ) -> NpuAddressRange: - """Gets address range for (y0, x0, c0) - (y1, x1, c1)""" + """ + Gets address range for (y0, x0, c0) - (y1, x1, c1) (inclusive, so the second coordinate is within the fm). + The begin and end coordinates must be within the same tile. 
+ """ addr0 = get_address(fm, strides, y0, x0, c0) addr1 = get_address(fm, strides, y1, x1, c1) return NpuAddressRange(region=fm.region, address=addr0, length=addr1 - addr0 + fm.data_type.size_in_bytes()) +def get_h_ranges( + fm: NpuFeatureMap, strides: NpuShape3D, y0: int, x0: int, c0: int, y1: int, x1: int, c1: int +) -> List[NpuAddressRange]: + """ + Gets address ranges for (y0, x0, c0) - (y1, x1, c1) (inclusive, so the second coordinate is within the fm); + the begin and end coordinates must be within the same tile. + Divides the area in horizontal "stripes" of height 1, and returns the address ranges for these "stripes". + """ + return [get_address_range(fm, strides, y, x0, c0, y, x1, c1) for y in range(y0, y1 + 1)] + + +def get_address_ranges_for_area( + fm: NpuFeatureMap, y0: int, x0: int, c0: int, y1: int, x1: int, c1: int +) -> List[NpuAddressRange]: + """ + Returns a list of adddress ranges that covers the area (y0, x0, c0) - (y1, x1, c1) (inclusive). + Divides the area in horizontal "stripes" of height 1, and returns the address ranges for these "stripes". + + For example, for the area marked with X (in a feature map with 4 tiles) as input, this function would return + 6 address ranges: the address ranges for 1-height areas [AAA, BBB, CC, DD, EEE, FF] + + .....|.... .....|.... + t0 ..XXX|XX.. t1 t0 ..AAA|CC.. t1 + ..XXX|XX.. ..BBB|DD.. + -----+---- --> -----+---- + t2 ..XXX|XX.. t3 t2 ..EEE|FF.. t3 + .....|.... .....|.... 
+ """ + strides = get_strides(fm) + height_0, height_1, width_0 = fm.tiles.height_0, fm.tiles.height_1, fm.tiles.width_0 + h, w, c = fm.shape + y2, x2, c2 = min(y1, h - 1), min(x1, w - 1), min(c1, c - 1) + ranges = [] + if x0 < width_0 and y0 < height_0: + # Horizontal ranges for tile 0 + ranges.extend(get_h_ranges(fm, strides, y0, x0, c0, min(y2, height_0 - 1), min(x2, width_0 - 1), c2)) + if x2 >= width_0 and y0 < height_1: + # Horizontal ranges for tile 1 + ranges.extend(get_h_ranges(fm, strides, y0, max(x0, width_0), c0, min(y2, height_1 - 1), x2, c2)) + if x0 < width_0 and y2 >= height_0: + # Horizontal ranges for tile 2 + ranges.extend(get_h_ranges(fm, strides, max(y0, height_0), x0, c0, y2, min(x2, width_0 - 1), c2)) + if x2 >= width_0 and y2 >= height_1: + # Horizontal ranges for tile 3 + ranges.extend(get_h_ranges(fm, strides, max(y0, height_1), max(x0, width_0), c0, y2, x2, c2)) + return ranges + + def get_address_ranges(fm: NpuFeatureMap) -> List[Optional[NpuAddressRange]]: """Returns 4 adddress ranges, one for every tile, None if the tile is not in use""" strides = get_strides(fm) @@ -806,7 +869,7 @@ def get_address_ranges(fm: NpuFeatureMap) -> List[Optional[NpuAddressRange]]: else: t2 = None if t1 is not None and t2 is not None: - t3 = get_address_range(fm, strides, height_0, width_0, 0, height - 1, width - 1, depth - 1) + t3 = get_address_range(fm, strides, height_1, width_0, 0, height - 1, width - 1, depth - 1) else: t3 = None return [t0, t1, t2, t3] @@ -934,22 +997,8 @@ def generate_cmd_waits(emit: CommandStreamEmitter, cmd_waits: Watermark): # ------------------------------------------------------------------- -def is_dependent_on_prev_op(prev_op: NpuBlockOperation, npu_op: NpuBlockOperation) -> bool: - """Checks if npu_op's input is dependent on prev_op's output""" - assert npu_op.ifm is not None - assert prev_op.ofm is not None - curr_input_ranges = get_address_ranges(npu_op.ifm) - - if has_ifm2(npu_op): - assert npu_op.ifm2 is not None - 
curr_input_ranges.extend(get_address_ranges(npu_op.ifm2)) - for prev_range in get_address_ranges(prev_op.ofm): - if prev_range is None: - continue - for curr_range in curr_input_ranges: - if curr_range is not None and ranges_overlap(prev_range, curr_range): - return True - return False +def shape3d_size(shape: NpuShape3D) -> int: + return shape.width * shape.height * shape.depth def shape3d_to_rect(shape: NpuShape3D) -> Rect: @@ -970,35 +1019,66 @@ def calc_blockdep(arch: ArchitectureFeatures, prev_op: Optional[NpuBlockOperatio """Calculates the value of the BLOCKDEP register""" if prev_op is None: return 0 - if not is_dependent_on_prev_op(prev_op, npu_op): + assert npu_op.ifm is not None + assert prev_op.ofm is not None + # Check if IFM or IFM2 overlaps with prev op's OFM + prev_ofm_ranges = get_address_ranges(prev_op.ofm) + ifm_ranges = get_address_ranges(npu_op.ifm) + ifm_overlaps = range_lists_overlap(prev_ofm_ranges, ifm_ranges) + if has_ifm2(npu_op): + assert npu_op.ifm2 is not None + ifm2_ranges = get_address_ranges(npu_op.ifm2) + ifm2_overlaps = range_lists_overlap(prev_ofm_ranges, ifm2_ranges) + else: + ifm2_overlaps = False + if ifm_overlaps and ifm2_overlaps: + # Both IFM and IFM2 overlap (should be rare) + return 0 + if not ifm_overlaps and not ifm2_overlaps: + # No overlap between prev OFM and IFM/IFM2 return ArchitectureFeatures.MAX_BLOCKDEP - if prev_op.ofm.shape != npu_op.ifm.shape: + if ifm2_overlaps and shape3d_size(npu_op.ifm2.shape) < shape3d_size(npu_op.ifm.shape): + # Prev OFM produces IFM2 which is broadcasted (this should be rare) return 0 prev_block_config = prev_op.block_config block_config = npu_op.block_config - prev_ifm_block_depth = get_ifm_ofm_block_depth(arch, prev_op) + overlapping_fm = npu_op.ifm if ifm_overlaps else npu_op.ifm2 + assert overlapping_fm is not None + + def intersects(ifm_start_coord: Tuple, ifm_end_coord: Tuple, ofm_start_coord: Tuple, ofm_end_coord: Tuple) -> bool: + """Checks if the given IFM area overlaps with 
the given OFM area""" + if overlapping_fm.shape == prev_op.ofm.shape and overlapping_fm.tiles == prev_op.ofm.tiles: + # Common case: prev_op.ofm == op.ifm; in this case it suffices to check + # if the xyz coordinates overlap, which is quick and easy + return ArchitectureFeatures.intersects(ifm_start_coord, ifm_end_coord, ofm_start_coord, ofm_end_coord) + # The OFM produces a part of the IFM (e.g. a stripe), or the IFM consumes part of the OFM. + # In this case address comparison is needed between the two areas + x0, y0, c0 = ifm_start_coord + x1, y1, c1 = ifm_end_coord + ifm_ranges = get_address_ranges_for_area(overlapping_fm, y0, x0, c0, y1, x1, c1) + x0, y0, c0 = ofm_start_coord + x1, y1, c1 = ofm_end_coord + prev_ofm_ranges = get_address_ranges_for_area(prev_op.ofm, y0, x0, c0, y1, x1, c1) + return range_lists_overlap(ifm_ranges, prev_ofm_ranges) + prev_ofm_block = Block(prev_block_config.width, prev_block_config.height, prev_block_config.depth) prev_ofm_rect = shape3d_to_rect(prev_op.ofm.shape) - prev_ifm_rect = shape3d_to_rect(prev_op.ifm.shape) cur_ifm_block_depth = get_ifm_ofm_block_depth(arch, npu_op) cur_ofm_block = Block(block_config.width, block_config.height, block_config.depth) cur_ofm_rect = shape3d_to_rect(npu_op.ofm.shape) cur_ifm_rect = shape3d_to_rect(npu_op.ifm.shape) cur_padLT = (0, 0) if npu_op.padding is None else (npu_op.padding.left, npu_op.padding.top) - blockdep = arch.calc_block_dep( - prev_ifm_rect, + return arch.calc_block_dep( prev_ofm_rect, - prev_ifm_block_depth, prev_ofm_block, - to_kernel(prev_op.kernel), cur_ifm_rect, cur_ofm_rect, cur_ifm_block_depth, cur_ofm_block, to_kernel(npu_op.kernel), cur_padLT, + intersects=intersects, ) - return blockdep # ------------------------------------------------------------------- diff --git a/ethosu/vela/test/test_register_command_generator.py b/ethosu/vela/test/test_register_command_generator.py index f2a16097..2760c860 100644 --- a/ethosu/vela/test/test_register_command_generator.py +++ 
b/ethosu/vela/test/test_register_command_generator.py @@ -17,13 +17,24 @@ # Description: # Contains unit tests for register command stream generator from ethosu.vela.api import NpuAddressRange +from ethosu.vela.api import NpuBlockTraversal +from ethosu.vela.api import NpuConv2DOperation +from ethosu.vela.api import NpuConvDepthWiseOperation from ethosu.vela.api import NpuDataType +from ethosu.vela.api import NpuElementWiseOp +from ethosu.vela.api import NpuElementWiseOperation from ethosu.vela.api import NpuFeatureMap +from ethosu.vela.api import NpuKernel from ethosu.vela.api import NpuLayout +from ethosu.vela.api import NpuPadding from ethosu.vela.api import NpuShape3D from ethosu.vela.api import NpuTileBox +from ethosu.vela.architecture_features import Accelerator +from ethosu.vela.architecture_features import create_default_arch +from ethosu.vela.register_command_stream_generator import calc_blockdep from ethosu.vela.register_command_stream_generator import get_address_ranges from ethosu.vela.register_command_stream_generator import get_strides +from ethosu.vela.test.extapi.test_extapi_generate_commands import create_feature_map def test_get_fm_strides(): @@ -39,6 +50,11 @@ def test_get_fm_strides(): assert get_strides(fm) == NpuShape3D(height=240, width=24, depth=1) +# ------------------------------------------------------------------- +# ADDRESS TESTS +# ------------------------------------------------------------------- + + def test_get_address_ranges_one_tile(): """Tests calculation of feature map address ranges, with 1 tile used""" fm = NpuFeatureMap() @@ -100,5 +116,89 @@ def test_get_address_ranges_4_tiles(): NpuAddressRange(region=6, address=16, length=18952), NpuAddressRange(region=6, address=32000, length=6280), NpuAddressRange(region=6, address=8000, length=12552), - NpuAddressRange(region=6, address=28800, length=12680), + NpuAddressRange(region=6, address=16000, length=25480), ] + + +# 
------------------------------------------------------------------- +# BLOCKDEP TESTS +# ------------------------------------------------------------------- + + +def test_calc_blockdep0(): + """ + Tests blockdep calculation, op1 that produces op2's IFM2. + op2 takes 1 block to complete, which results in blockdep 0 + """ + op1 = NpuElementWiseOperation(NpuElementWiseOp.CLZ) + op1.ifm = create_feature_map(NpuShape3D(height=1, width=1, depth=1), 1, 0x60, layout=NpuLayout.NHCWB16,) + intermediate_fm = create_feature_map(NpuShape3D(height=1, width=1, depth=1), 1, 0xA0, layout=NpuLayout.NHCWB16,) + op1.ofm = intermediate_fm + op1.block_config = NpuShape3D(height=1, width=1, depth=4) + op2 = NpuElementWiseOperation(NpuElementWiseOp.SUB) + op2.ifm = create_feature_map(NpuShape3D(height=1, width=1, depth=1), 1, 0x39AC0, layout=NpuLayout.NHCWB16,) + op2.ifm2 = intermediate_fm + op2.ofm = create_feature_map(NpuShape3D(height=1, width=1, depth=1), 1, 0xE0, layout=NpuLayout.NHCWB16,) + op2.block_config = NpuShape3D(height=1, width=1, depth=4) + arch = create_default_arch(Accelerator.Ethos_U55_128) + block_dep = calc_blockdep(arch, op1, op2) + assert block_dep == 0 + + +def test_calc_blockdep2(): + """ + Tests blockdep calculation, op1 produces part of the input of op2, + op1 and op2 have different sizes. 
+ op2 takes 3 blocks to complete, op1's last block collides with op2's last block + which results in blockdep 2 + """ + op1 = NpuConv2DOperation() + op1.ifm = create_feature_map(NpuShape3D(height=4, width=48, depth=8), 1, 0x4C80, layout=NpuLayout.NHCWB16,) + op1.ofm = create_feature_map(NpuShape3D(height=4, width=48, depth=16), 1, 0x6480, layout=NpuLayout.NHCWB16,) + op1.kernel = NpuKernel(1, 1) + op1.weights = [NpuAddressRange(region=1, address=0x4AE0, length=208)] + op1.biases = [NpuAddressRange(region=1, address=0x49A0, length=160)] + op1.padding = NpuPadding(top=0, left=0, right=0, bottom=0) + op1.block_traversal = NpuBlockTraversal.PART_KERNEL_FIRST + op1.block_config = NpuShape3D(height=4, width=6, depth=16) + op2 = NpuConvDepthWiseOperation() + op2.ifm = create_feature_map(NpuShape3D(height=3, width=48, depth=16), 1, 0, layout=NpuLayout.NHCWB16,) + # op2 has two tiles, the lower tile is produced by op1 + op2.ifm.tiles = NpuTileBox(height_0=2, height_1=2, width_0=48, addresses=[0x7680, 0, 0x6480, 0]) + op2.ofm = create_feature_map(NpuShape3D(height=1, width=24, depth=16), 1, 0x6480, layout=NpuLayout.NHCWB16,) + op2.kernel = NpuKernel(3, 3, stride_x=2, stride_y=2) + op2.weights = [NpuAddressRange(region=1, address=0x4BB0, length=208)] + op2.biases = [NpuAddressRange(region=1, address=0x4A40, length=160)] + op2.padding = NpuPadding(top=0, left=0, right=0, bottom=0) + op2.block_config = NpuShape3D(height=1, width=8, depth=16) + arch = create_default_arch(Accelerator.Ethos_U55_128) + block_dep = calc_blockdep(arch, op1, op2) + assert block_dep == 2 + + +def test_calc_blockdep3(): + """ + Tests blockdep calculation, op2 consumes part of op1, op1 and op2 have different sizes. 
+ There is no overlap between the last blocks of op1 and the first jobs of op2, + which results in blockdep 3 + """ + op1 = NpuConv2DOperation() + op1.ifm = create_feature_map(NpuShape3D(height=13, width=96, depth=1), 1, 0, layout=NpuLayout.NHWC,) + op1.ofm = create_feature_map(NpuShape3D(height=6, width=48, depth=8), 1, 0x7C80, layout=NpuLayout.NHCWB16,) + op1.kernel = NpuKernel(3, 3, stride_x=2, stride_y=2) + op1.weights = [NpuAddressRange(region=1, address=0x4AE0, length=144)] + op1.biases = [NpuAddressRange(region=1, address=0x49A0, length=80)] + op1.padding = NpuPadding(top=0, left=0, right=1, bottom=0) + op1.block_traversal = NpuBlockTraversal.PART_KERNEL_FIRST + op1.block_config = NpuShape3D(height=6, width=3, depth=8) + op2 = NpuConvDepthWiseOperation() + op2.ifm = create_feature_map(NpuShape3D(height=5, width=48, depth=8), 1, 0x7C80, layout=NpuLayout.NHCWB16,) + op2.ofm = create_feature_map(NpuShape3D(height=4, width=48, depth=8), 1, 0x4C80, layout=NpuLayout.NHCWB16,) + op2.kernel = NpuKernel(3, 3) + op2.weights = [NpuAddressRange(region=1, address=0x4BB0, length=112)] + op2.biases = [NpuAddressRange(region=1, address=0x4A40, length=80)] + op2.padding = NpuPadding(top=0, left=0, right=0, bottom=0) + op2.block_config = NpuShape3D(height=4, width=6, depth=8) + arch = create_default_arch(Accelerator.Ethos_U55_128) + block_dep = calc_blockdep(arch, op1, op2) + assert block_dep == 3 |