diff options
author | Tim Hall <tim.hall@arm.com> | 2021-05-27 18:49:40 +0100 |
---|---|---|
committer | Tim Hall <tim.hall@arm.com> | 2021-05-27 18:57:39 +0100 |
commit | d8339a75c9b655c0507e34238078fdad068b4023 (patch) | |
tree | 36a14726b30760169a83c0356803b480992fade8 /ethosu/vela/test | |
parent | 64556f32ff7bfca6036a6598034464b13b64a4ef (diff) | |
download | ethos-u-vela-d8339a75c9b655c0507e34238078fdad068b4023.tar.gz |
MLBEDSW-4034: New Scheduler Size or Performance Optimisation
- Merged dev/scheduler at 83639f90e8c828f70de6e29142355a940224959b
Signed-off-by: Tim Hall <tim.hall@arm.com>
Change-Id: I0050529d4b42da93768c7264296434dd877fb5b4
Diffstat (limited to 'ethosu/vela/test')
-rw-r--r-- | ethosu/vela/test/extapi/test_extapi_generate_commands.py | 16 | ||||
-rw-r--r-- | ethosu/vela/test/test_architecture_allocator.py | 123 | ||||
-rw-r--r-- | ethosu/vela/test/test_lut.py | 63 | ||||
-rw-r--r-- | ethosu/vela/test/test_new_performance.py | 78 |
4 files changed, 244 insertions, 36 deletions
diff --git a/ethosu/vela/test/extapi/test_extapi_generate_commands.py b/ethosu/vela/test/extapi/test_extapi_generate_commands.py index 3c9a43db..ee134300 100644 --- a/ethosu/vela/test/extapi/test_extapi_generate_commands.py +++ b/ethosu/vela/test/extapi/test_extapi_generate_commands.py @@ -167,11 +167,13 @@ def test_conv2d(): check_cmd0(cmds, cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, 15) check_cmd0(cmds, cmd0.NPU_SET_OFM_BLK_WIDTH_M1, 3) check_cmd0(cmds, cmd0.NPU_SET_OFM_BLK_DEPTH_M1, 15) - check_cmd0(cmds, cmd0.NPU_SET_IFM_IB_END, 14) - check_cmd0(cmds, cmd0.NPU_SET_AB_START, 14) check_cmd0(cmds, cmd0.NPU_SET_ACC_FORMAT, 0) check_cmd0(cmds, cmd0.NPU_SET_BLOCKDEP, 0) check_cmd0(cmds, cmd0.NPU_OP_CONV, 0) + ib_end = find_cmd0(cmds, cmd0.NPU_SET_IFM_IB_END) + ab_start = find_cmd0(cmds, cmd0.NPU_SET_AB_START) + assert ib_end > 0 + assert ib_end <= ab_start def create_fully_connected_op() -> NpuConv2DOperation: @@ -296,11 +298,13 @@ def test_mul_with_broadcast_and_relu(): check_cmd0(cmds, cmd0.NPU_SET_IFM2_PRECISION, 0) check_cmd0(cmds, cmd0.NPU_SET_IFM2_BROADCAST, 5) check_cmd0(cmds, cmd0.NPU_SET_IFM_IB_END, 16) - check_cmd0(cmds, cmd0.NPU_SET_AB_START, 16) - check_cmd0(cmds, cmd0.NPU_SET_IFM2_IB_START, 9) check_cmd0(cmds, cmd0.NPU_SET_ACC_FORMAT, 0) check_cmd0(cmds, cmd0.NPU_SET_BLOCKDEP, 0) check_cmd0(cmds, cmd0.NPU_OP_ELEMENTWISE, 0) + ab_start = find_cmd0(cmds, cmd0.NPU_SET_AB_START) + assert ab_start > 0 + ifm2_ib_start = find_cmd0(cmds, cmd0.NPU_SET_IFM2_IB_START) + assert 0 < ifm2_ib_start < ab_start # Check that block width/height were generated that fit blk_height = find_cmd0(cmds, cmd0.NPU_SET_OFM_BLK_HEIGHT_M1) blk_width = find_cmd0(cmds, cmd0.NPU_SET_OFM_BLK_WIDTH_M1) @@ -413,11 +417,11 @@ def test_check_sram_limit_spilling(): w, h = op.ofm.shape.width, op.ofm.shape.height op.ofm.tiles = NpuTileBox(width_0=w, height_0=h, height_1=h, addresses=[32 * 1024, 0, 0, 0]) # 384K for spilling should fit - arch.sram_size = 384 * 1024 + arch.arena_cache_size = 384 * 1024 mem_limits = get_mem_limits_for_regions(arch) generate_command_stream([op], arch, verbose=False, mem_limits=mem_limits) # 32K for spilling does not fit, due to the OFM address - arch.sram_size = 32 * 1024 + arch.arena_cache_size = 32 * 1024 mem_limits = get_mem_limits_for_regions(arch) with pytest.raises(VelaError): generate_command_stream([op], arch, verbose=False, mem_limits=mem_limits) diff --git a/ethosu/vela/test/test_architecture_allocator.py b/ethosu/vela/test/test_architecture_allocator.py new file mode 100644 index 00000000..94768fc1 --- /dev/null +++ b/ethosu/vela/test/test_architecture_allocator.py @@ -0,0 +1,123 @@ +# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Description: +# Unit tests for architecture_allocator.py +import pytest + +from ethosu.vela.architecture_allocator import find_block_config +from ethosu.vela.architecture_allocator import try_block_config +from ethosu.vela.architecture_features import Accelerator +from ethosu.vela.architecture_features import Block +from ethosu.vela.architecture_features import create_default_arch +from ethosu.vela.ethos_u55_regs.ethos_u55_regs import resampling_mode +from ethosu.vela.operation import Kernel +from ethosu.vela.operation import NpuBlockType +from ethosu.vela.shape4d import Shape4D + +test_data = [ + { + "block_type": NpuBlockType.ConvolutionDepthWise, + "kernel": Kernel(25, 5, 2, 2, 1, 1), + "ofm_shape": Shape4D(2, 11, 22), + "ifm_shape": Shape4D(27, 25, 22), + }, + { + "block_type": NpuBlockType.Pooling, + "kernel": Kernel(2, 2), + "ofm_shape": Shape4D(53, 49, 22), + "ifm_shape": Shape4D(27, 25, 22), + "ifm_resampling": resampling_mode.NEAREST, + }, + { + "block_type": NpuBlockType.ConvolutionMxN, + "accelerator": Accelerator.Ethos_U55_32, + "kernel": Kernel(2, 5), + "ofm_shape": Shape4D(48, 1, 17), + "ifm_shape": Shape4D(24, 5, 18), + "ifm_resampling": resampling_mode.TRANSPOSE, + }, + { + "block_type": NpuBlockType.ElementWise, + "ofm_shape": Shape4D(27, 2, 22), + "ifm_shape": Shape4D(27, 2, 1), + "ifm2_shape": Shape4D(27, 25, 22), + }, + { + "block_type": NpuBlockType.ElementWise, + "accelerator": Accelerator.Ethos_U55_32, + "ofm_shape": Shape4D(48, 37, 17), + "ifm_shape": Shape4D(48, 37, 17), + "uses_scalar": True, + "lut_banks": 2, + }, + { + "block_type": NpuBlockType.ElementWise, + "ofm_shape": Shape4D(27, 2, 22), + "ifm_shape": Shape4D(27, 2, 22), + "ifm_bits": 16, + }, +] + + +@pytest.mark.parametrize("test_data", test_data) +def test_allocate(test_data): + """Tests that find_block_config and try_block_config produce consistent SHRAM layouts""" + accelerator = test_data.get("accelerator", Accelerator.Ethos_U55_128) + arch = create_default_arch(accelerator) + kernel = test_data.get("kernel", Kernel(1, 1)) + block_type = test_data["block_type"] + ofm_shape = test_data["ofm_shape"] + ifm_shape = test_data["ifm_shape"] + ifm2_shape = test_data.get("ifm2_shape") + uses_scalar = test_data.get("uses_scalar", False) + ifm_bits = test_data.get("ifm_bits", 8) + ifm_resampling = test_data.get("ifm_resampling", resampling_mode.NONE) + scaled = test_data.get("scaled", True) + lut_banks = test_data.get("lut_banks", 0) + config = find_block_config( + arch, + block_type, + ofm_shape, + ifm_shape, + ifm2_shape, + uses_scalar=uses_scalar, + ifm_bits=ifm_bits, + kernel=kernel, + lut_banks=lut_banks, + scaled=scaled, + ifm_resampling=ifm_resampling, + ) + assert config is not None + config2 = try_block_config( + Block.from_shape(config.ofm_block.as_list()), + arch, + block_type, + ifm_shape, + ifm2_shape, + is_partkernel=config.is_partkernel, + uses_scalar=uses_scalar, + ifm_bits=ifm_bits, + kernel=kernel, + lut_banks=lut_banks, + scaled=scaled, + ifm_resampling=ifm_resampling, + ) + assert config2 is not None + assert config.layout.ib_end == config2.layout.ib_end + assert config.layout.ab_start == config2.layout.ab_start + assert config.layout.ib_start2 == config2.layout.ib_start2 + assert config.acc_type == config2.acc_type diff --git a/ethosu/vela/test/test_lut.py b/ethosu/vela/test/test_lut.py index 44ee0afb..4ddc8b95 100644 --- a/ethosu/vela/test/test_lut.py +++ b/ethosu/vela/test/test_lut.py @@ -19,7 +19,6 @@ import random import numpy as np -from ethosu.vela import insert_dma from ethosu.vela import lut from ethosu.vela import mark_tensors from ethosu.vela import pass_packing @@ -27,37 +26,41 @@ from ethosu.vela.data_type import DataType from ethosu.vela.high_level_command_stream import DMA from ethosu.vela.nn_graph import Graph from ethosu.vela.operation import Op +from ethosu.vela.rewrite_graph import rewrite_graph_pre_order from ethosu.vela.rewrite_graph import verify_graph_health from ethosu.vela.tensor import create_const_tensor from ethosu.vela.tensor import TensorPurpose from ethosu.vela.test import testutil -def set_256_lut(op, key): +def set_256_lut(op, key, arch): random.seed(key) values = random.choices(range(256), k=256) lut_tensor = create_const_tensor( op.name + "_lut", [1, 1, 1, 256], DataType.int8, values, np.uint8, TensorPurpose.LUT ) - op.set_activation_lut(lut_tensor) + scratch_lut_tensor = lut_tensor.clone_into_fast_storage(arch) + op.set_activation_lut(scratch_lut_tensor) -def set_1K_lut(op, key): +def set_1K_lut(op, key, arch): random.seed(key) values = random.choices(range(256), k=256) lut_tensor = create_const_tensor( op.name + "_lut", [1, 1, 1, 256], DataType.int32, values, np.uint32, TensorPurpose.LUT ) - op.set_activation_lut(lut_tensor) + scratch_lut_tensor = lut_tensor.clone_into_fast_storage(arch) + op.set_activation_lut(scratch_lut_tensor) -def set_2K_lut(op, key): +def set_2K_lut(op, key, arch): random.seed(key) values = random.choices(range(512), k=512) lut_tensor = create_const_tensor( op.name + "_lut", [1, 1, 1, 512], DataType.int32, values, np.uint32, TensorPurpose.LUT ) - op.set_activation_lut(lut_tensor) + scratch_lut_tensor = lut_tensor.clone_into_fast_storage(arch) + op.set_activation_lut(scratch_lut_tensor) def process(arch, op_list): @@ -68,16 +71,16 @@ def process(arch, op_list): assert verify_graph_health(nng) nng = mark_tensors.mark_tensor_purpose(nng, arch, False) assert verify_graph_health(nng) - nng = insert_dma.insert_dma_commands(nng, arch, False) - assert verify_graph_health(nng) + rewrite_graph_pre_order(nng, sg, arch, [], []) pass_packing.pack_into_passes(nng, arch, False) assert verify_graph_health(nng) # Create a DMA instruction for every op cmd_list = [] for ps in sg.passes: - for intermediate in ps.intermediates: - if intermediate.needs_dma(): - cmd_list.append(DMA(ps, intermediate.get_dma_src_tensor(), intermediate, None)) + for input_tens in ps.inputs: + if input_tens.src_tensor: + cmd_list.append(DMA(ps, input_tens.src_tensor, input_tens, None)) + sg.high_level_command_stream = cmd_list return sg @@ -96,28 +99,28 @@ def test_optimize_high_level_cmd_stream_2K(): shape = [1, 1, 1, 1] # u8 LUT op, should lead to DMA op0 = testutil.create_elemwise_op(Op.Add, "op0", shape, shape, shape) - set_256_lut(op0, "lut0") + set_256_lut(op0, "lut0", arch) # u8 LUT op, should lead to DMA op1 = testutil.create_elemwise_op(Op.Add, "op1", shape, shape, shape) - set_256_lut(op1, "lut1") + set_256_lut(op1, "lut1", arch) # u8 LUT op with different LUT, should lead to DMA op2 = testutil.create_elemwise_op(Op.Add, "op2", shape, shape, shape) - set_256_lut(op2, "lut2") + set_256_lut(op2, "lut2", arch) # u8 LUT op with same LUT as in op1, should not lead to DMA op3 = testutil.create_elemwise_op(Op.Add, "op3", shape, shape, shape) - set_256_lut(op3, "lut1") + set_256_lut(op3, "lut1", arch) # u8 LUT op with same LUT as in op2, should not lead to DMA op4 = testutil.create_elemwise_op(Op.Add, "op4", shape, shape, shape) - set_256_lut(op4, "lut2") + set_256_lut(op4, "lut2", arch) # 2K LUT op, should lead to DMA, and will overwrite all previous LUTs in SHRAM op5_2K = testutil.create_elemwise_op(Op.Add, "op5", shape, shape, shape) - set_2K_lut(op5_2K, "lut5") + set_2K_lut(op5_2K, "lut5", arch) # Another 2K LUT op, should lead to DMA, and will overwrite the previous LUT in SHRAM op6_2K = testutil.create_elemwise_op(Op.Add, "op6", shape, shape, shape) - set_2K_lut(op6_2K, "lut6") + set_2K_lut(op6_2K, "lut6", arch) # u8 LUT op with same LUT as in op1, should lead to DMA op7 = testutil.create_elemwise_op(Op.Add, "op7", shape, shape, shape) - set_256_lut(op7, "lut1") + set_256_lut(op7, "lut1", arch) op_list = [op0, op1, op2, op3, op4, op5_2K, op6_2K, op7] sg = process(arch, op_list) @@ -132,7 +135,7 @@ def test_optimize_high_level_cmd_stream_2K(): orig_cmd_list = filter_lut_cmds(orig_cmd_list) for (cmd, op) in zip(cmd_list, expected_dma_ops): - assert cmd.in_tensor == op.activation_lut + assert cmd.in_tensor == op.activation_lut.src_tensor # Check that lut0, lut1 and lut2 in op0, op1, op2 are stored on different addresses assert orig_cmd_list[0].out_tensor.address != orig_cmd_list[1].out_tensor.address assert orig_cmd_list[0].out_tensor.address != orig_cmd_list[2].out_tensor.address @@ -151,28 +154,28 @@ def test_optimize_high_level_cmd_stream_1K(): shape = [1, 1, 1, 1] # u8 LUT op, should lead to DMA op0 = testutil.create_elemwise_op(Op.Add, "op0", shape, shape, shape) - set_256_lut(op0, "lut0") + set_256_lut(op0, "lut0", arch) # u8 LUT op, should lead to DMA op1 = testutil.create_elemwise_op(Op.Add, "op1", shape, shape, shape) - set_256_lut(op1, "lut1") + set_256_lut(op1, "lut1", arch) # 1K LUT op with different LUT, should lead to DMA op2_1K = testutil.create_elemwise_op(Op.Add, "op2", shape, shape, shape) - set_1K_lut(op2_1K, "lut2") + set_1K_lut(op2_1K, "lut2", arch) # u8 LUT op with same LUT as in op1, should not lead to DMA op3 = testutil.create_elemwise_op(Op.Add, "op3", shape, shape, shape) - set_256_lut(op3, "lut1") + set_256_lut(op3, "lut1", arch) # 1K LUT op with same LUT as in op2, should not lead to DMA op4_1K = testutil.create_elemwise_op(Op.Add, "op4", shape, shape, shape) - set_1K_lut(op4_1K, "lut2") + set_1K_lut(op4_1K, "lut2", arch) # 1K LUT op, should lead to DMA, and will overwrite lut2 op5_2K = testutil.create_elemwise_op(Op.Add, "op5", shape, shape, shape) - set_1K_lut(op5_2K, "lut5") + set_1K_lut(op5_2K, "lut5", arch) # u8 LUT op, lut0 should still be present, should not lead to DMA op6 = testutil.create_elemwise_op(Op.Add, "op6", shape, shape, shape) - set_256_lut(op6, "lut0") + set_256_lut(op6, "lut0", arch) # 1K LUT op with same LUT as in op2, should lead to DMA op7 = testutil.create_elemwise_op(Op.Add, "op7", shape, shape, shape) - set_1K_lut(op7, "lut2") + set_1K_lut(op7, "lut2", arch) op_list = [op0, op1, op2_1K, op3, op4_1K, op5_2K, op6, op7] sg = process(arch, op_list) @@ -187,7 +190,7 @@ def test_optimize_high_level_cmd_stream_1K(): # Check that only the needed DMA commands are left expected_dma_ops = [op0, op1, op2_1K, op5_2K, op7] for (cmd, op) in zip(cmd_list, expected_dma_ops): - assert cmd.in_tensor == op.activation_lut + assert cmd.in_tensor == op.activation_lut.src_tensor # Check that lut0, lut1 and lut2 in op0, op1, op2 are stored on different addresses assert orig_cmd_list[0].out_tensor.address != orig_cmd_list[1].out_tensor.address assert orig_cmd_list[0].out_tensor.address != orig_cmd_list[2].out_tensor.address diff --git a/ethosu/vela/test/test_new_performance.py b/ethosu/vela/test/test_new_performance.py new file mode 100644 index 00000000..a35905b3 --- /dev/null +++ b/ethosu/vela/test/test_new_performance.py @@ -0,0 +1,78 @@ +# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Description: +# Contains unit tests for new performance estimation code +from ethosu.vela import architecture_allocator +from ethosu.vela import architecture_features +from ethosu.vela import npu_performance +from ethosu.vela import operation +from ethosu.vela.architecture_features import resampling_mode +from ethosu.vela.shape4d import Shape4D +from ethosu.vela.shape4d import VolumeIterator +from ethosu.vela.tensor import MemArea + + +def test_new_performance(): + arch = architecture_features.create_default_arch(architecture_features.Accelerator.Ethos_U55_128) + + query = npu_performance.PerformanceQuery(architecture_features.NpuBlockType.ConvolutionMxN) + query.ifm_shape = Shape4D(1, 16, 16, 16) + query.ifm2_shape = Shape4D() + query.ifm_memory_area = MemArea.Sram + query.ifm_bits = 8 + query.ofm_shape = Shape4D(1, 16, 16, 1) + query.ofm_memory_area = MemArea.Sram + query.ofm_bits = 8 + query.const_shape = Shape4D(1, 1, 1, query.ofm_shape.depth) + query.const_memory_area = MemArea.OffChipFlash + query.kernel = operation.Kernel(1, 1, 1, 1, 1, 1, valid_padding=False) + query.config = architecture_allocator.find_block_config( + arch, + architecture_features.NpuBlockType.ConvolutionMxN, + Shape4D(1, 16, 16, 1), + query.ifm_shape, + None, + False, + 8, + query.kernel, + 0, + False, + resampling_mode.NONE, + ) + + print("For block Config = {}".format(query.config)) + + # -s to display output + for sub_shape in [Shape4D(1, 4, 8, 16), Shape4D(1, 8, 8, 16), Shape4D(1, 8, 16, 16), query.ofm_shape]: + print("\n-- Subshape = {}".format(sub_shape)) + iterator = VolumeIterator(query.ofm_shape, sub_shape) + a = npu_performance.ElementAccess() + c = npu_performance.CycleCost() + for pos, shape in iterator: + print("\tpos = {} shape = {}".format(pos, shape)) + ta, tc = npu_performance.measure_performance_cost( + arch, operation.Op.Conv2D, operation.Op.Relu, query, pos, shape + ) + a += ta + c += tc + print("\t\taccess: {}".format(ta)) + print("\t\tcycles: {}".format(tc)) + print("\tAccess: {}".format(a)) + print("\tCycles: {}".format(c)) + assert c.op_macs == 4096 + + assert True # Any successful result is okay |