author    Tim Hall <tim.hall@arm.com>    2021-05-27 18:49:40 +0100
committer Tim Hall <tim.hall@arm.com>    2021-05-27 18:57:39 +0100
commit    d8339a75c9b655c0507e34238078fdad068b4023 (patch)
tree      36a14726b30760169a83c0356803b480992fade8 /ethosu/vela/test
parent    64556f32ff7bfca6036a6598034464b13b64a4ef (diff)
download  ethos-u-vela-d8339a75c9b655c0507e34238078fdad068b4023.tar.gz
MLBEDSW-4034: New Scheduler Size or Performance Optimisation
- Merged dev/scheduler at 83639f90e8c828f70de6e29142355a940224959b

Signed-off-by: Tim Hall <tim.hall@arm.com>
Change-Id: I0050529d4b42da93768c7264296434dd877fb5b4
Diffstat (limited to 'ethosu/vela/test')
-rw-r--r--  ethosu/vela/test/extapi/test_extapi_generate_commands.py |  16
-rw-r--r--  ethosu/vela/test/test_architecture_allocator.py          | 123
-rw-r--r--  ethosu/vela/test/test_lut.py                             |  63
-rw-r--r--  ethosu/vela/test/test_new_performance.py                 |  78
4 files changed, 244 insertions(+), 36 deletions(-)
diff --git a/ethosu/vela/test/extapi/test_extapi_generate_commands.py b/ethosu/vela/test/extapi/test_extapi_generate_commands.py
index 3c9a43db..ee134300 100644
--- a/ethosu/vela/test/extapi/test_extapi_generate_commands.py
+++ b/ethosu/vela/test/extapi/test_extapi_generate_commands.py
@@ -167,11 +167,13 @@ def test_conv2d():
check_cmd0(cmds, cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, 15)
check_cmd0(cmds, cmd0.NPU_SET_OFM_BLK_WIDTH_M1, 3)
check_cmd0(cmds, cmd0.NPU_SET_OFM_BLK_DEPTH_M1, 15)
- check_cmd0(cmds, cmd0.NPU_SET_IFM_IB_END, 14)
- check_cmd0(cmds, cmd0.NPU_SET_AB_START, 14)
check_cmd0(cmds, cmd0.NPU_SET_ACC_FORMAT, 0)
check_cmd0(cmds, cmd0.NPU_SET_BLOCKDEP, 0)
check_cmd0(cmds, cmd0.NPU_OP_CONV, 0)
+ ib_end = find_cmd0(cmds, cmd0.NPU_SET_IFM_IB_END)
+ ab_start = find_cmd0(cmds, cmd0.NPU_SET_AB_START)
+ assert ib_end > 0
+ assert ib_end <= ab_start
def create_fully_connected_op() -> NpuConv2DOperation:
@@ -296,11 +298,13 @@ def test_mul_with_broadcast_and_relu():
check_cmd0(cmds, cmd0.NPU_SET_IFM2_PRECISION, 0)
check_cmd0(cmds, cmd0.NPU_SET_IFM2_BROADCAST, 5)
check_cmd0(cmds, cmd0.NPU_SET_IFM_IB_END, 16)
- check_cmd0(cmds, cmd0.NPU_SET_AB_START, 16)
- check_cmd0(cmds, cmd0.NPU_SET_IFM2_IB_START, 9)
check_cmd0(cmds, cmd0.NPU_SET_ACC_FORMAT, 0)
check_cmd0(cmds, cmd0.NPU_SET_BLOCKDEP, 0)
check_cmd0(cmds, cmd0.NPU_OP_ELEMENTWISE, 0)
+ ab_start = find_cmd0(cmds, cmd0.NPU_SET_AB_START)
+ assert ab_start > 0
+ ifm2_ib_start = find_cmd0(cmds, cmd0.NPU_SET_IFM2_IB_START)
+ assert 0 < ifm2_ib_start < ab_start
# Check that block width/height were generated that fit
blk_height = find_cmd0(cmds, cmd0.NPU_SET_OFM_BLK_HEIGHT_M1)
blk_width = find_cmd0(cmds, cmd0.NPU_SET_OFM_BLK_WIDTH_M1)
@@ -413,11 +417,11 @@ def test_check_sram_limit_spilling():
w, h = op.ofm.shape.width, op.ofm.shape.height
op.ofm.tiles = NpuTileBox(width_0=w, height_0=h, height_1=h, addresses=[32 * 1024, 0, 0, 0])
# 384K for spilling should fit
- arch.sram_size = 384 * 1024
+ arch.arena_cache_size = 384 * 1024
mem_limits = get_mem_limits_for_regions(arch)
generate_command_stream([op], arch, verbose=False, mem_limits=mem_limits)
# 32K for spilling does not fit, due to the OFM address
- arch.sram_size = 32 * 1024
+ arch.arena_cache_size = 32 * 1024
mem_limits = get_mem_limits_for_regions(arch)
with pytest.raises(VelaError):
generate_command_stream([op], arch, verbose=False, mem_limits=mem_limits)
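
Note on the hunks above: the exact NPU_SET_IFM_IB_END and NPU_SET_AB_START payloads are no longer asserted verbatim, because the new scheduler is free to pick a different SHRAM layout per block config. The test instead reads the values back and checks only the invariants 0 < ib_end <= ab_start (and, for the elementwise case, 0 < ifm2_ib_start < ab_start). The spilling limit likewise now comes from arch.arena_cache_size rather than arch.sram_size. A minimal sketch of what a find_cmd0 helper can look like, assuming the cmd0 word encoding that check_cmd0 in this file also relies on (16-bit opcode in the low half-word, 16-bit payload in the high half-word):

    def find_cmd0(cmd_stream, cmd) -> int:
        """Return the payload of the first cmd0 word whose opcode matches cmd."""
        for word in cmd_stream:
            if (word & 0xFFFF) == cmd.value:
                return (word >> 16) & 0xFFFF
        assert False, f"Not in command stream: {cmd}"
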
diff --git a/ethosu/vela/test/test_architecture_allocator.py b/ethosu/vela/test/test_architecture_allocator.py
new file mode 100644
index 00000000..94768fc1
--- /dev/null
+++ b/ethosu/vela/test/test_architecture_allocator.py
@@ -0,0 +1,123 @@
+# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Description:
+# Unit tests for architecture_allocator.py
+import pytest
+
+from ethosu.vela.architecture_allocator import find_block_config
+from ethosu.vela.architecture_allocator import try_block_config
+from ethosu.vela.architecture_features import Accelerator
+from ethosu.vela.architecture_features import Block
+from ethosu.vela.architecture_features import create_default_arch
+from ethosu.vela.ethos_u55_regs.ethos_u55_regs import resampling_mode
+from ethosu.vela.operation import Kernel
+from ethosu.vela.operation import NpuBlockType
+from ethosu.vela.shape4d import Shape4D
+
+test_data = [
+ {
+ "block_type": NpuBlockType.ConvolutionDepthWise,
+ "kernel": Kernel(25, 5, 2, 2, 1, 1),
+ "ofm_shape": Shape4D(2, 11, 22),
+ "ifm_shape": Shape4D(27, 25, 22),
+ },
+ {
+ "block_type": NpuBlockType.Pooling,
+ "kernel": Kernel(2, 2),
+ "ofm_shape": Shape4D(53, 49, 22),
+ "ifm_shape": Shape4D(27, 25, 22),
+ "ifm_resampling": resampling_mode.NEAREST,
+ },
+ {
+ "block_type": NpuBlockType.ConvolutionMxN,
+ "accelerator": Accelerator.Ethos_U55_32,
+ "kernel": Kernel(2, 5),
+ "ofm_shape": Shape4D(48, 1, 17),
+ "ifm_shape": Shape4D(24, 5, 18),
+ "ifm_resampling": resampling_mode.TRANSPOSE,
+ },
+ {
+ "block_type": NpuBlockType.ElementWise,
+ "ofm_shape": Shape4D(27, 2, 22),
+ "ifm_shape": Shape4D(27, 2, 1),
+ "ifm2_shape": Shape4D(27, 25, 22),
+ },
+ {
+ "block_type": NpuBlockType.ElementWise,
+ "accelerator": Accelerator.Ethos_U55_32,
+ "ofm_shape": Shape4D(48, 37, 17),
+ "ifm_shape": Shape4D(48, 37, 17),
+ "uses_scalar": True,
+ "lut_banks": 2,
+ },
+ {
+ "block_type": NpuBlockType.ElementWise,
+ "ofm_shape": Shape4D(27, 2, 22),
+ "ifm_shape": Shape4D(27, 2, 22),
+ "ifm_bits": 16,
+ },
+]
+
+
+@pytest.mark.parametrize("test_data", test_data)
+def test_allocate(test_data):
+ """Tests that find_block_config and try_block_config produce consistent SHRAM layouts"""
+ accelerator = test_data.get("accelerator", Accelerator.Ethos_U55_128)
+ arch = create_default_arch(accelerator)
+ kernel = test_data.get("kernel", Kernel(1, 1))
+ block_type = test_data["block_type"]
+ ofm_shape = test_data["ofm_shape"]
+ ifm_shape = test_data["ifm_shape"]
+ ifm2_shape = test_data.get("ifm2_shape")
+ uses_scalar = test_data.get("uses_scalar", False)
+ ifm_bits = test_data.get("ifm_bits", 8)
+ ifm_resampling = test_data.get("ifm_resampling", resampling_mode.NONE)
+ scaled = test_data.get("scaled", True)
+ lut_banks = test_data.get("lut_banks", 0)
+ config = find_block_config(
+ arch,
+ block_type,
+ ofm_shape,
+ ifm_shape,
+ ifm2_shape,
+ uses_scalar=uses_scalar,
+ ifm_bits=ifm_bits,
+ kernel=kernel,
+ lut_banks=lut_banks,
+ scaled=scaled,
+ ifm_resampling=ifm_resampling,
+ )
+ assert config is not None
+ config2 = try_block_config(
+ Block.from_shape(config.ofm_block.as_list()),
+ arch,
+ block_type,
+ ifm_shape,
+ ifm2_shape,
+ is_partkernel=config.is_partkernel,
+ uses_scalar=uses_scalar,
+ ifm_bits=ifm_bits,
+ kernel=kernel,
+ lut_banks=lut_banks,
+ scaled=scaled,
+ ifm_resampling=ifm_resampling,
+ )
+ assert config2 is not None
+ assert config.layout.ib_end == config2.layout.ib_end
+ assert config.layout.ab_start == config2.layout.ab_start
+ assert config.layout.ib_start2 == config2.layout.ib_start2
+ assert config.acc_type == config2.acc_type
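
The new test drives the two halves of the block-config allocator against each other: find_block_config searches for a block configuration, and try_block_config must reproduce the same SHRAM layout (ib_end, ab_start, ib_start2) and accumulator type when handed back the block the search chose. A hedged standalone usage sketch follows; the argument order mirrors the calls in the test above, and Kernel taking width and height first is an assumption based on ethosu.vela.operation.Kernel:

    from ethosu.vela.architecture_allocator import find_block_config
    from ethosu.vela.architecture_features import Accelerator, create_default_arch
    from ethosu.vela.ethos_u55_regs.ethos_u55_regs import resampling_mode
    from ethosu.vela.operation import Kernel, NpuBlockType
    from ethosu.vela.shape4d import Shape4D

    arch = create_default_arch(Accelerator.Ethos_U55_128)
    config = find_block_config(
        arch,
        NpuBlockType.ConvolutionMxN,
        Shape4D(1, 16, 16, 16),  # OFM shape
        Shape4D(1, 16, 16, 16),  # IFM shape
        None,                    # no IFM2
        uses_scalar=False,
        ifm_bits=8,
        kernel=Kernel(3, 3),
        lut_banks=0,
        scaled=True,
        ifm_resampling=resampling_mode.NONE,
    )
    # For a feasible query the search returns a config, as the test asserts;
    # config.ofm_block is the chosen OFM block and config.layout records where
    # the IFM and accumulator buffers start/end in SHRAM banks.
    print(config.ofm_block, config.layout.ib_end, config.layout.ab_start)
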
diff --git a/ethosu/vela/test/test_lut.py b/ethosu/vela/test/test_lut.py
index 44ee0afb..4ddc8b95 100644
--- a/ethosu/vela/test/test_lut.py
+++ b/ethosu/vela/test/test_lut.py
@@ -19,7 +19,6 @@ import random
import numpy as np
-from ethosu.vela import insert_dma
from ethosu.vela import lut
from ethosu.vela import mark_tensors
from ethosu.vela import pass_packing
@@ -27,37 +26,41 @@ from ethosu.vela.data_type import DataType
from ethosu.vela.high_level_command_stream import DMA
from ethosu.vela.nn_graph import Graph
from ethosu.vela.operation import Op
+from ethosu.vela.rewrite_graph import rewrite_graph_pre_order
from ethosu.vela.rewrite_graph import verify_graph_health
from ethosu.vela.tensor import create_const_tensor
from ethosu.vela.tensor import TensorPurpose
from ethosu.vela.test import testutil
-def set_256_lut(op, key):
+def set_256_lut(op, key, arch):
random.seed(key)
values = random.choices(range(256), k=256)
lut_tensor = create_const_tensor(
op.name + "_lut", [1, 1, 1, 256], DataType.int8, values, np.uint8, TensorPurpose.LUT
)
- op.set_activation_lut(lut_tensor)
+ scratch_lut_tensor = lut_tensor.clone_into_fast_storage(arch)
+ op.set_activation_lut(scratch_lut_tensor)
-def set_1K_lut(op, key):
+def set_1K_lut(op, key, arch):
random.seed(key)
values = random.choices(range(256), k=256)
lut_tensor = create_const_tensor(
op.name + "_lut", [1, 1, 1, 256], DataType.int32, values, np.uint32, TensorPurpose.LUT
)
- op.set_activation_lut(lut_tensor)
+ scratch_lut_tensor = lut_tensor.clone_into_fast_storage(arch)
+ op.set_activation_lut(scratch_lut_tensor)
-def set_2K_lut(op, key):
+def set_2K_lut(op, key, arch):
random.seed(key)
values = random.choices(range(512), k=512)
lut_tensor = create_const_tensor(
op.name + "_lut", [1, 1, 1, 512], DataType.int32, values, np.uint32, TensorPurpose.LUT
)
- op.set_activation_lut(lut_tensor)
+ scratch_lut_tensor = lut_tensor.clone_into_fast_storage(arch)
+ op.set_activation_lut(scratch_lut_tensor)
def process(arch, op_list):
@@ -68,16 +71,16 @@ def process(arch, op_list):
assert verify_graph_health(nng)
nng = mark_tensors.mark_tensor_purpose(nng, arch, False)
assert verify_graph_health(nng)
- nng = insert_dma.insert_dma_commands(nng, arch, False)
- assert verify_graph_health(nng)
+ rewrite_graph_pre_order(nng, sg, arch, [], [])
pass_packing.pack_into_passes(nng, arch, False)
assert verify_graph_health(nng)
# Create a DMA instruction for every op
cmd_list = []
for ps in sg.passes:
- for intermediate in ps.intermediates:
- if intermediate.needs_dma():
- cmd_list.append(DMA(ps, intermediate.get_dma_src_tensor(), intermediate, None))
+ for input_tens in ps.inputs:
+ if input_tens.src_tensor:
+ cmd_list.append(DMA(ps, input_tens.src_tensor, input_tens, None))
+
sg.high_level_command_stream = cmd_list
return sg
@@ -96,28 +99,28 @@ def test_optimize_high_level_cmd_stream_2K():
shape = [1, 1, 1, 1]
# u8 LUT op, should lead to DMA
op0 = testutil.create_elemwise_op(Op.Add, "op0", shape, shape, shape)
- set_256_lut(op0, "lut0")
+ set_256_lut(op0, "lut0", arch)
# u8 LUT op, should lead to DMA
op1 = testutil.create_elemwise_op(Op.Add, "op1", shape, shape, shape)
- set_256_lut(op1, "lut1")
+ set_256_lut(op1, "lut1", arch)
# u8 LUT op with different LUT, should lead to DMA
op2 = testutil.create_elemwise_op(Op.Add, "op2", shape, shape, shape)
- set_256_lut(op2, "lut2")
+ set_256_lut(op2, "lut2", arch)
# u8 LUT op with same LUT as in op1, should not lead to DMA
op3 = testutil.create_elemwise_op(Op.Add, "op3", shape, shape, shape)
- set_256_lut(op3, "lut1")
+ set_256_lut(op3, "lut1", arch)
# u8 LUT op with same LUT as in op2, should not lead to DMA
op4 = testutil.create_elemwise_op(Op.Add, "op4", shape, shape, shape)
- set_256_lut(op4, "lut2")
+ set_256_lut(op4, "lut2", arch)
# 2K LUT op, should lead to DMA, and will overwrite all previous LUTs in SHRAM
op5_2K = testutil.create_elemwise_op(Op.Add, "op5", shape, shape, shape)
- set_2K_lut(op5_2K, "lut5")
+ set_2K_lut(op5_2K, "lut5", arch)
# Another 2K LUT op, should lead to DMA, and will overwrite the previous LUT in SHRAM
op6_2K = testutil.create_elemwise_op(Op.Add, "op6", shape, shape, shape)
- set_2K_lut(op6_2K, "lut6")
+ set_2K_lut(op6_2K, "lut6", arch)
# u8 LUT op with same LUT as in op1, should lead to DMA
op7 = testutil.create_elemwise_op(Op.Add, "op7", shape, shape, shape)
- set_256_lut(op7, "lut1")
+ set_256_lut(op7, "lut1", arch)
op_list = [op0, op1, op2, op3, op4, op5_2K, op6_2K, op7]
sg = process(arch, op_list)
@@ -132,7 +135,7 @@ def test_optimize_high_level_cmd_stream_2K():
orig_cmd_list = filter_lut_cmds(orig_cmd_list)
for (cmd, op) in zip(cmd_list, expected_dma_ops):
- assert cmd.in_tensor == op.activation_lut
+ assert cmd.in_tensor == op.activation_lut.src_tensor
# Check that lut0, lut1 and lut2 in op0, op1, op2 are stored on different addresses
assert orig_cmd_list[0].out_tensor.address != orig_cmd_list[1].out_tensor.address
assert orig_cmd_list[0].out_tensor.address != orig_cmd_list[2].out_tensor.address
@@ -151,28 +154,28 @@ def test_optimize_high_level_cmd_stream_1K():
shape = [1, 1, 1, 1]
# u8 LUT op, should lead to DMA
op0 = testutil.create_elemwise_op(Op.Add, "op0", shape, shape, shape)
- set_256_lut(op0, "lut0")
+ set_256_lut(op0, "lut0", arch)
# u8 LUT op, should lead to DMA
op1 = testutil.create_elemwise_op(Op.Add, "op1", shape, shape, shape)
- set_256_lut(op1, "lut1")
+ set_256_lut(op1, "lut1", arch)
# 1K LUT op with different LUT, should lead to DMA
op2_1K = testutil.create_elemwise_op(Op.Add, "op2", shape, shape, shape)
- set_1K_lut(op2_1K, "lut2")
+ set_1K_lut(op2_1K, "lut2", arch)
# u8 LUT op with same LUT as in op1, should not lead to DMA
op3 = testutil.create_elemwise_op(Op.Add, "op3", shape, shape, shape)
- set_256_lut(op3, "lut1")
+ set_256_lut(op3, "lut1", arch)
# 1K LUT op with same LUT as in op2, should not lead to DMA
op4_1K = testutil.create_elemwise_op(Op.Add, "op4", shape, shape, shape)
- set_1K_lut(op4_1K, "lut2")
+ set_1K_lut(op4_1K, "lut2", arch)
# 1K LUT op, should lead to DMA, and will overwrite lut2
op5_2K = testutil.create_elemwise_op(Op.Add, "op5", shape, shape, shape)
- set_1K_lut(op5_2K, "lut5")
+ set_1K_lut(op5_2K, "lut5", arch)
# u8 LUT op, lut0 should still be present, should not lead to DMA
op6 = testutil.create_elemwise_op(Op.Add, "op6", shape, shape, shape)
- set_256_lut(op6, "lut0")
+ set_256_lut(op6, "lut0", arch)
# 1K LUT op with same LUT as in op2, should lead to DMA
op7 = testutil.create_elemwise_op(Op.Add, "op7", shape, shape, shape)
- set_1K_lut(op7, "lut2")
+ set_1K_lut(op7, "lut2", arch)
op_list = [op0, op1, op2_1K, op3, op4_1K, op5_2K, op6, op7]
sg = process(arch, op_list)
@@ -187,7 +190,7 @@ def test_optimize_high_level_cmd_stream_1K():
# Check that only the needed DMA commands are left
expected_dma_ops = [op0, op1, op2_1K, op5_2K, op7]
for (cmd, op) in zip(cmd_list, expected_dma_ops):
- assert cmd.in_tensor == op.activation_lut
+ assert cmd.in_tensor == op.activation_lut.src_tensor
# Check that lut0, lut1 and lut2 in op0, op1, op2 are stored on different addresses
assert orig_cmd_list[0].out_tensor.address != orig_cmd_list[1].out_tensor.address
assert orig_cmd_list[0].out_tensor.address != orig_cmd_list[2].out_tensor.address
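
With the insert_dma pass gone, the tests wire LUTs up through a fast-storage clone: the constant LUT tensor is cloned into the architecture's fast storage, the clone becomes the op's activation LUT, and the original constant stays reachable through the clone's src_tensor attribute, which is where the high-level DMA command now reads from (see the src_tensor assertions in the hunks above). A condensed sketch of that wiring, assuming the same helpers the tests use:

    import random

    import numpy as np

    from ethosu.vela.architecture_features import Accelerator, create_default_arch
    from ethosu.vela.data_type import DataType
    from ethosu.vela.operation import Op
    from ethosu.vela.tensor import create_const_tensor, TensorPurpose
    from ethosu.vela.test import testutil

    arch = create_default_arch(Accelerator.Ethos_U55_128)
    shape = [1, 1, 1, 1]
    op = testutil.create_elemwise_op(Op.Add, "op0", shape, shape, shape)
    random.seed("lut0")
    values = random.choices(range(256), k=256)
    lut_tensor = create_const_tensor(
        "op0_lut", [1, 1, 1, 256], DataType.int8, values, np.uint8, TensorPurpose.LUT
    )
    scratch = lut_tensor.clone_into_fast_storage(arch)  # copy bound to fast storage
    op.set_activation_lut(scratch)
    assert scratch.src_tensor is lut_tensor  # the DMA reads from the original const
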
diff --git a/ethosu/vela/test/test_new_performance.py b/ethosu/vela/test/test_new_performance.py
new file mode 100644
index 00000000..a35905b3
--- /dev/null
+++ b/ethosu/vela/test/test_new_performance.py
@@ -0,0 +1,78 @@
+# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Description:
+# Contains unit tests for new performance estimation code
+from ethosu.vela import architecture_allocator
+from ethosu.vela import architecture_features
+from ethosu.vela import npu_performance
+from ethosu.vela import operation
+from ethosu.vela.architecture_features import resampling_mode
+from ethosu.vela.shape4d import Shape4D
+from ethosu.vela.shape4d import VolumeIterator
+from ethosu.vela.tensor import MemArea
+
+
+def test_new_performance():
+ arch = architecture_features.create_default_arch(architecture_features.Accelerator.Ethos_U55_128)
+
+ query = npu_performance.PerformanceQuery(architecture_features.NpuBlockType.ConvolutionMxN)
+ query.ifm_shape = Shape4D(1, 16, 16, 16)
+ query.ifm2_shape = Shape4D()
+ query.ifm_memory_area = MemArea.Sram
+ query.ifm_bits = 8
+ query.ofm_shape = Shape4D(1, 16, 16, 1)
+ query.ofm_memory_area = MemArea.Sram
+ query.ofm_bits = 8
+ query.const_shape = Shape4D(1, 1, 1, query.ofm_shape.depth)
+ query.const_memory_area = MemArea.OffChipFlash
+ query.kernel = operation.Kernel(1, 1, 1, 1, 1, 1, valid_padding=False)
+ query.config = architecture_allocator.find_block_config(
+ arch,
+ architecture_features.NpuBlockType.ConvolutionMxN,
+ Shape4D(1, 16, 16, 1),
+ query.ifm_shape,
+ None,
+ False,
+ 8,
+ query.kernel,
+ 0,
+ False,
+ resampling_mode.NONE,
+ )
+
+ print("For block Config = {}".format(query.config))
+
+ # -s to display output
+ for sub_shape in [Shape4D(1, 4, 8, 16), Shape4D(1, 8, 8, 16), Shape4D(1, 8, 16, 16), query.ofm_shape]:
+ print("\n-- Subshape = {}".format(sub_shape))
+ iterator = VolumeIterator(query.ofm_shape, sub_shape)
+ a = npu_performance.ElementAccess()
+ c = npu_performance.CycleCost()
+ for pos, shape in iterator:
+ print("\tpos = {} shape = {}".format(pos, shape))
+ ta, tc = npu_performance.measure_performance_cost(
+ arch, operation.Op.Conv2D, operation.Op.Relu, query, pos, shape
+ )
+ a += ta
+ c += tc
+ print("\t\taccess: {}".format(ta))
+ print("\t\tcycles: {}".format(tc))
+ print("\tAccess: {}".format(a))
+ print("\tCycles: {}".format(c))
+ assert c.op_macs == 4096
+
+ assert True # Any successful result is okay
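
The op_macs check at the end can be verified by hand: the query convolves a 1x16x16x16 IFM with a 1x1 kernel into a 1x16x16x1 OFM, costing one multiply-accumulate per OFM element, IFM channel, and kernel position, regardless of how the OFM volume is split into sub-shapes:

    # 1x1 convolution: 16x16x1 OFM over a 16-channel IFM
    ofm_elements = 16 * 16 * 1     # OFM height * width * depth
    macs_per_element = 16 * 1 * 1  # IFM depth * kernel height * kernel width
    assert ofm_elements * macs_per_element == 4096
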