aboutsummaryrefslogtreecommitdiff
path: root/ethosu/vela/high_level_command_to_npu_op.py
diff options
context:
space:
mode:
authorRickard Bolin <rickard.bolin@arm.com>2022-09-06 16:09:01 +0000
committerRickard Bolin <rickard.bolin@arm.com>2022-09-15 16:49:21 +0000
commit17e53b5d776109e1bd1073c657ff0453ccf3c09e (patch)
tree15342201d082be81eb1884ce8f0914c884cb9784 /ethosu/vela/high_level_command_to_npu_op.py
parent7f3ccd5500458de0b56f05ed99553360c46e6b41 (diff)
downloadethos-u-vela-17e53b5d776109e1bd1073c657ff0453ccf3c09e.tar.gz
MLBEDSW-6927: Add ofm_stride_multiplier attribute to operation
Allow sparse writing of OFM by multiplying H/W/C of the OFM with the values of ofm_stride_multiplier Signed-off-by: Rickard Bolin <rickard.bolin@arm.com> Change-Id: I65d742ad36ad3154e9914cdd22e2da928ad1f095
Diffstat (limited to 'ethosu/vela/high_level_command_to_npu_op.py')
-rw-r--r--ethosu/vela/high_level_command_to_npu_op.py31
1 files changed, 24 insertions, 7 deletions
diff --git a/ethosu/vela/high_level_command_to_npu_op.py b/ethosu/vela/high_level_command_to_npu_op.py
index 18919431..6246b37e 100644
--- a/ethosu/vela/high_level_command_to_npu_op.py
+++ b/ethosu/vela/high_level_command_to_npu_op.py
@@ -347,7 +347,13 @@ def get_ofm_quantization(ps, tens: Tensor) -> Optional[NpuQuantization]:
return NpuQuantization(scale_f32=ofm_quant.scale_f32, zero_point=zero_point)
-def create_feature_map(tens: Tensor, box: Box, arch: ArchitectureFeatures, op_shape4D: Shape4D) -> NpuFeatureMap:
+def create_feature_map(
+ tens: Tensor,
+ box: Box,
+ arch: ArchitectureFeatures,
+ op_shape4D: Shape4D,
+ stride_multiplier: Optional[List[int]] = None,
+) -> NpuFeatureMap:
"""Creates feature map with common fields populated"""
fm = NpuFeatureMap()
fm.region = get_region(tens.mem_type, arch)
@@ -358,13 +364,25 @@ def create_feature_map(tens: Tensor, box: Box, arch: ArchitectureFeatures, op_sh
fm.layout = NpuLayout.NHCWB16
else:
assert 0, "Incorrect tensor format"
+
+ strides = tens.get_strides(op_shape4D)
+ assert strides is not None
+
+ if stride_multiplier and stride_multiplier != [1, 1, 1]:
+ assert (
+ tens.format == TensorFormat.NHWC
+ ), "Only default stride multiplier ([1, 1, 1]) supported for NHCWB16 format"
+ # Multiply strides for C/H/W (in that order) with corresponding stride factor
+ for i, stride_factor in enumerate(stride_multiplier, start=1):
+ strides[i] *= stride_factor
+
height_0, height_1, width_0, addresses = tens.addresses_for_rolling_buffer(
- box.start_coord, box.end_coord, op_shape4D
+ box.start_coord, box.end_coord, strides, op_shape4D
)
+
fm.tiles = NpuTileBox(
height_0=height_0, height_1=height_1, width_0=width_0, addresses=[int(addr) for addr in addresses]
)
- strides = tens.get_strides(shape4D=op_shape4D)
fm.strides = NpuShape3D(height=int(strides[2]), width=int(strides[3]), depth=int(strides[1]))
fm.name = tens.name
return fm
@@ -462,7 +480,7 @@ def set_common_op_fields(npu_op: NpuBlockOperation, cmd: NpuStripe, arch: Archit
npu_op.ifm.quantization = get_ifm_or_ifm2_quantization(ps, cmd.ifm_tensor)
out_block = cmd.ofm_box.get_block()
- npu_op.ofm = create_feature_map(cmd.ofm_tensor, cmd.ofm_box, arch, ps.ofm_shapes[0])
+ npu_op.ofm = create_feature_map(cmd.ofm_tensor, cmd.ofm_box, arch, ps.ofm_shapes[0], op.ofm_stride_multiplier)
npu_op.ofm.shape = NpuShape3D(height=out_block.height, width=out_block.width, depth=out_block.depth)
npu_op.ofm.quantization = get_ofm_quantization(ps, cmd.ofm_tensor)
@@ -595,9 +613,8 @@ def create_dma_op(cmd: DMA, arch: ArchitectureFeatures) -> NpuDmaOperation:
src_addr = cmd.in_tensor.address + weight_range.offset
dest_addr = cmd.out_tensor.address
else:
- start_coord = cmd.box.start_coord
- src_addr = cmd.in_tensor.address_for_coordinate(start_coord)
- dest_addr = cmd.out_tensor.address_for_coordinate(start_coord)
+ src_addr = cmd.in_tensor.address_for_coordinate(cmd.box.start_coord)
+ dest_addr = cmd.out_tensor.address_for_coordinate(cmd.box.start_coord)
sz = cmd.in_tensor.address_for_coordinate(cmd.box.end_coord, is_top_box=True) - src_addr
src = NpuAddressRange(src_region, int(src_addr), int(sz))
dest = NpuAddressRange(dest_region, int(dest_addr), int(sz))