author    Rickard Bolin <rickard.bolin@arm.com>  2022-09-06 16:09:01 +0000
committer Rickard Bolin <rickard.bolin@arm.com>  2022-09-15 16:49:21 +0000
commit    17e53b5d776109e1bd1073c657ff0453ccf3c09e (patch)
tree      15342201d082be81eb1884ce8f0914c884cb9784
parent    7f3ccd5500458de0b56f05ed99553360c46e6b41 (diff)
download  ethos-u-vela-17e53b5d776109e1bd1073c657ff0453ccf3c09e.tar.gz
MLBEDSW-6927: Add ofm_stride_multiplier attribute to operation
Allow sparse writing of the OFM by multiplying the H/W/C strides of the
OFM by the values of ofm_stride_multiplier.

Signed-off-by: Rickard Bolin <rickard.bolin@arm.com>
Change-Id: I65d742ad36ad3154e9914cdd22e2da928ad1f095
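For context: a stride multiplier greater than one on an axis makes consecutive OFM elements along that axis land further apart than the dense layout requires, leaving gaps that another tensor can fill. A minimal sketch of the effect (illustrative only, not Vela code), assuming an int8 NHWC OFM and an H multiplier of 2:

    import numpy as np

    h, w, c = 4, 4, 8
    buf = np.zeros((2 * h, w, c), dtype=np.int8)  # room for two interleaved OFMs
    buf[0::2] = 1  # "OFM A": H stride doubled, written from row 0
    buf[1::2] = 2  # "OFM B": H stride doubled, written from row 1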
-rw-r--r--  ethosu/vela/high_level_command_to_npu_op.py   31
-rw-r--r--  ethosu/vela/operation.py                        5
-rw-r--r--  ethosu/vela/tensor.py                         119
3 files changed, 94 insertions, 61 deletions
diff --git a/ethosu/vela/high_level_command_to_npu_op.py b/ethosu/vela/high_level_command_to_npu_op.py
index 18919431..6246b37e 100644
--- a/ethosu/vela/high_level_command_to_npu_op.py
+++ b/ethosu/vela/high_level_command_to_npu_op.py
@@ -347,7 +347,13 @@ def get_ofm_quantization(ps, tens: Tensor) -> Optional[NpuQuantization]:
return NpuQuantization(scale_f32=ofm_quant.scale_f32, zero_point=zero_point)
-def create_feature_map(tens: Tensor, box: Box, arch: ArchitectureFeatures, op_shape4D: Shape4D) -> NpuFeatureMap:
+def create_feature_map(
+ tens: Tensor,
+ box: Box,
+ arch: ArchitectureFeatures,
+ op_shape4D: Shape4D,
+ stride_multiplier: Optional[List[int]] = None,
+) -> NpuFeatureMap:
"""Creates feature map with common fields populated"""
fm = NpuFeatureMap()
fm.region = get_region(tens.mem_type, arch)
@@ -358,13 +364,25 @@ def create_feature_map(tens: Tensor, box: Box, arch: ArchitectureFeatures, op_sh
fm.layout = NpuLayout.NHCWB16
else:
assert 0, "Incorrect tensor format"
+
+ strides = tens.get_strides(op_shape4D)
+ assert strides is not None
+
+ if stride_multiplier and stride_multiplier != [1, 1, 1]:
+ assert (
+ tens.format == TensorFormat.NHWC
+ ), "Only default stride multiplier ([1, 1, 1]) supported for NHCWB16 format"
+ # Multiply the C/H/W strides (in that order) by the corresponding stride factor
+ for i, stride_factor in enumerate(stride_multiplier, start=1):
+ strides[i] *= stride_factor
+
height_0, height_1, width_0, addresses = tens.addresses_for_rolling_buffer(
- box.start_coord, box.end_coord, op_shape4D
+ box.start_coord, box.end_coord, strides, op_shape4D
)
+
fm.tiles = NpuTileBox(
height_0=height_0, height_1=height_1, width_0=width_0, addresses=[int(addr) for addr in addresses]
)
- strides = tens.get_strides(shape4D=op_shape4D)
fm.strides = NpuShape3D(height=int(strides[2]), width=int(strides[3]), depth=int(strides[1]))
fm.name = tens.name
return fm
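As a cross-check of the loop above: enumerate(stride_multiplier, start=1) maps the [C, H, W] multiplier onto indices 1..3 of the augmented [N, C, H, W, x] stride list, which is also the order fm.strides reads them back in. A worked sketch with assumed values (int8, NHWC, shape 1x4x4x8):

    strides = [128, 1, 32, 8, 1]   # [N, C, H, W, x] strides in bytes (int8)
    stride_multiplier = [1, 2, 1]  # order is [C, H, W]
    for i, factor in enumerate(stride_multiplier, start=1):
        strides[i] *= factor       # scales indices 1 (C), 2 (H), 3 (W)
    # strides == [128, 1, 64, 8, 1]; NpuShape3D(height=strides[2],
    # width=strides[3], depth=strides[1]) == (height=64, width=8, depth=1)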
@@ -462,7 +480,7 @@ def set_common_op_fields(npu_op: NpuBlockOperation, cmd: NpuStripe, arch: Archit
npu_op.ifm.quantization = get_ifm_or_ifm2_quantization(ps, cmd.ifm_tensor)
out_block = cmd.ofm_box.get_block()
- npu_op.ofm = create_feature_map(cmd.ofm_tensor, cmd.ofm_box, arch, ps.ofm_shapes[0])
+ npu_op.ofm = create_feature_map(cmd.ofm_tensor, cmd.ofm_box, arch, ps.ofm_shapes[0], op.ofm_stride_multiplier)
npu_op.ofm.shape = NpuShape3D(height=out_block.height, width=out_block.width, depth=out_block.depth)
npu_op.ofm.quantization = get_ofm_quantization(ps, cmd.ofm_tensor)
@@ -595,9 +613,8 @@ def create_dma_op(cmd: DMA, arch: ArchitectureFeatures) -> NpuDmaOperation:
src_addr = cmd.in_tensor.address + weight_range.offset
dest_addr = cmd.out_tensor.address
else:
- start_coord = cmd.box.start_coord
- src_addr = cmd.in_tensor.address_for_coordinate(start_coord)
- dest_addr = cmd.out_tensor.address_for_coordinate(start_coord)
+ src_addr = cmd.in_tensor.address_for_coordinate(cmd.box.start_coord)
+ dest_addr = cmd.out_tensor.address_for_coordinate(cmd.box.start_coord)
sz = cmd.in_tensor.address_for_coordinate(cmd.box.end_coord, is_top_box=True) - src_addr
src = NpuAddressRange(src_region, int(src_addr), int(sz))
dest = NpuAddressRange(dest_region, int(dest_addr), int(sz))
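Note on the DMA hunk: address_for_coordinate returns an absolute address (tensor base plus offset), so the base cancels in the size subtraction and sz is a pure byte extent. A hedged sketch with made-up numbers:

    base = 0x1000                # assumed tensor base address
    src_addr = base + 0x40       # address_for_coordinate(box.start_coord)
    end_addr = base + 0x140      # address_for_coordinate(box.end_coord, is_top_box=True)
    sz = end_addr - src_addr     # == 0x100 bytes to copy; base cancels out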
diff --git a/ethosu/vela/operation.py b/ethosu/vela/operation.py
index de68b1d7..e1622049 100644
--- a/ethosu/vela/operation.py
+++ b/ethosu/vela/operation.py
@@ -501,6 +501,7 @@ class Operation:
"write_offset",
"write_shape",
"ifm_resampling_mode",
+ "ofm_stride_multiplier",
)
def __init__(self, op_type: Op, name: str):
@@ -545,6 +546,9 @@ class Operation:
# write_offset 0,9,0,0, write_shape 1,1,8,1
self.write_shape: Optional[Shape4D] = None
self.ifm_resampling_mode: resampling_mode = resampling_mode.NONE
+ # For interleaved/sparse outputs, each stride is multiplied by the stride factor of the corresponding axis
+ # Order is [C, H, W]; the default [1, 1, 1] means no multiplication
+ self.ofm_stride_multiplier: List[int] = [1, 1, 1]
def clone(self, suffix="_clone"):
res = Operation(self.type, self.name + suffix)
@@ -568,6 +572,7 @@ class Operation:
res.low_precision_scaling = self.low_precision_scaling
res.rescale = self.rescale
res.ifm_resampling_mode = self.ifm_resampling_mode
+ res.ofm_stride_multiplier = self.ofm_stride_multiplier.copy()
return res
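The .copy() in clone() matters because ofm_stride_multiplier is a mutable list; without it, a clone would alias the original's list. A generic Python illustration (not Vela code):

    a = [1, 1, 1]
    alias = a               # clone without copy(): one shared list
    alias[0] = 2
    assert a == [2, 1, 1]   # original mutated through the alias

    dup = a.copy()          # clone with copy(): independent list
    dup[1] = 3
    assert a == [2, 1, 1]   # original unchanged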
diff --git a/ethosu/vela/tensor.py b/ethosu/vela/tensor.py
index 65473b8d..99970317 100644
--- a/ethosu/vela/tensor.py
+++ b/ethosu/vela/tensor.py
@@ -592,7 +592,9 @@ class Tensor:
rounding_quantum = full_shape(4, list(self.storage_rounding_quantum), 1)
return Shape4D(shape_round_to_quantum(op_shape4D.as_list(), rounding_quantum))
- def addresses_for_rolling_buffer(self, start_coord: Shape, end_coord: Shape, op_shape4D: Shape4D) -> Tuple:
+ def addresses_for_rolling_buffer(
+ self, start_coord: Shape, end_coord: Shape, strides: List[int], op_shape4D: Shape4D
+ ) -> Tuple:
# returns ( box_height0, box_height1, box_width, [address_tl, address_tr, address_bl, address_br] )
if self.storage_shape == []:
@@ -600,7 +602,7 @@ class Tensor:
1,
1,
1,
- [self.address_for_coordinate(start_coord, op_shape4D=op_shape4D), 0, 0, 0],
+ [self.address_for_coordinate(start_coord, strides, op_shape4D), 0, 0, 0],
)
if self.is_standard_fm:
@@ -618,89 +620,90 @@ class Tensor:
box_width = crossing_x - start_coord[2]
addresses: List = [0] * 4
- addresses[0] = self.address_for_coordinate(start_coord, op_shape4D=op_shape4D)
+ addresses[0] = self.address_for_coordinate(start_coord, strides, op_shape4D)
if end_coord[2] > crossing_x:
addresses[1] = self.address_for_coordinate(
- [start_coord[0], start_coord[1], crossing_x, start_coord[3]], op_shape4D=op_shape4D
+ [start_coord[0], start_coord[1], crossing_x, start_coord[3]], strides, op_shape4D
)
raise UnsupportedFeatureError("Striping in vertical direction is not supported")
if end_coord[1] > crossing_y:
addresses[2] = self.address_for_coordinate(
- [start_coord[0], crossing_y, start_coord[2], start_coord[3]], op_shape4D=op_shape4D
+ [start_coord[0], crossing_y, start_coord[2], start_coord[3]], strides, op_shape4D
)
if end_coord[1] > crossing_y and end_coord[2] > crossing_x:
addresses[3] = self.address_for_coordinate(
- [start_coord[0], crossing_y, crossing_x, start_coord[3]], op_shape4D=op_shape4D
+ [start_coord[0], crossing_y, crossing_x, start_coord[3]], strides, op_shape4D
)
return box_height0, box_height0, box_width, addresses
- def address_for_coordinate(self, coord: Shape, is_top_box: bool = False, op_shape4D: Shape4D = None) -> int:
- offset = self.address_offset_for_coordinate(coord, op_shape4D=op_shape4D, is_top_box=is_top_box)
- assert offset is not None
- return self.address + offset
+ def get_strides(self, shape4D: Optional[Shape4D]) -> List[int]:
- def get_strides_and_coord(
- self, coord: Optional[Shape] = None, shape4D: Optional[Shape4D] = None
- ) -> Tuple[Optional[Shape], Optional[Shape]]:
- if coord is None:
- coord = [0] * min(len(self.storage_shape), 4)
+ augmented_shape = self.get_augmented_shape(shape4D)
+ assert len(augmented_shape) == 5
+ strides: List = [0] * len(augmented_shape)
+ stride = self.element_size() * self.storage_compression_scale
+
+ if self.format != TensorFormat.NHCWB16:
+ stride_order = [4, 1, 3, 2, 0]
+ for i in stride_order:
+ strides[i] = stride
+ stride *= augmented_shape[i]
+ else:
+ assert len(strides) == 5
+ strides[4] = stride
+ strides[3] = 16 * stride # STRIDE_X
+ strides[1] = strides[3] * augmented_shape[2] # STRIDE_C
+ strides[2] = augmented_shape[2] * augmented_shape[3] * stride # STRIDE_Y
+ strides[0] = strides[2] * augmented_shape[1] # STRIDE_N
+
+ return strides
+
+ def get_augmented_shape(self, shape4D: Optional[Shape4D] = None) -> Optional[Shape]:
if shape4D and self.is_standard_fm:
augmented_shape = self.get_4D_storage_shape_for_shape(shape4D).as_list()
else:
augmented_shape = full_shape(4, self.storage_shape, 1)
- augmented_coord = coord
+ if self.format == TensorFormat.NHWC:
+ augmented_shape = [augmented_shape[0], augmented_shape[3]] + augmented_shape[1:3] + [1]
+
+ elif self.format == TensorFormat.NHCWB16:
+ augmented_shape = augmented_shape[0:4] + [1]
+
+ if augmented_shape[1] == 0:
+ augmented_shape[1] = 1
- while len(augmented_coord) < 4:
- augmented_coord = [0] + augmented_coord
+ else:
+ assert self.format in (TensorFormat.Unknown, TensorFormat.WeightsCompressed)
+ return None
- assert len(augmented_coord) == len(augmented_shape)
+ return augmented_shape
+
+ def get_augmented_coord(self, coord: Optional[Shape] = None) -> Optional[Shape]:
+ if coord is None:
+ coord = [0] * min(len(self.storage_shape), 4)
+
+ missing_len = 4 - len(coord)
+ augmented_coord = ([0] * missing_len) + coord
if self.format == TensorFormat.NHWC:
- augmented_shape = [augmented_shape[0], augmented_shape[3]] + augmented_shape[1:3] + [1]
augmented_coord = [augmented_coord[0], augmented_coord[3]] + augmented_coord[1:3] + [0]
elif self.format == TensorFormat.NHCWB16:
channel_divisor = 16
- augmented_shape = augmented_shape[0:4] + [1]
augmented_coord = (
[augmented_coord[0], augmented_coord[3] // channel_divisor]
+ augmented_coord[1:3]
+ [augmented_coord[3] % channel_divisor]
)
-
- if augmented_shape[1] == 0:
- augmented_shape[1] = 1
-
else:
assert self.format in (TensorFormat.Unknown, TensorFormat.WeightsCompressed)
- return None, None
-
- strides: List = [0] * len(augmented_shape)
- stride = self.element_size() * self.storage_compression_scale
-
- if self.format != TensorFormat.NHCWB16:
- stride_order = [4, 1, 3, 2, 0]
- for i in stride_order:
- strides[i] = stride
- stride *= augmented_shape[i]
- else:
- assert len(strides) == 5
- strides[4] = stride
- strides[3] = 16 * stride # STRIDE_X
- strides[1] = strides[3] * augmented_shape[2] # STRIDE_C
- strides[2] = augmented_shape[2] * augmented_shape[3] * stride # STRIDE_Y
- strides[0] = strides[2] * augmented_shape[1] # STRIDE_N
-
- return strides, augmented_coord
+ return None
- def get_strides(self, shape4D: Optional[Shape4D] = None) -> Shape:
- strides, _ = self.get_strides_and_coord(shape4D=shape4D)
- assert strides is not None
- return strides
+ return augmented_coord
def find_npu_op(self) -> Optional[Operation]:
# Returns the NPU operator that uses this tensor
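To make the NHCWB16 branch of get_strides above concrete, here is a worked offset, assuming an int8 feature map of shape (1, 4, 4, 32); the numbers are illustrative:

    h, w, c, elem = 4, 4, 32, 1
    # [N, C//16-brick, Y, X, B16] strides, as computed in get_strides
    strides = [h * w * c * elem, 16 * w * elem, w * c * elem, 16 * elem, elem]
    # == [512, 64, 128, 16, 1]
    coord = [0, 20 // 16, 2, 3, 20 % 16]   # (n=0, y=2, x=3, channel=20)
    offset = sum(s * q for s, q in zip(strides, coord))   # == 372 bytes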
@@ -743,8 +746,12 @@ class Tensor:
assert 0 <= index < len(self.compressed_values)
return index == len(self.compressed_values) - 1
- def address_offset_for_coordinate(
- self, orig_coord: Shape, op_shape4D: Optional[Shape4D] = None, is_top_box: bool = False
+ def address_for_coordinate(
+ self,
+ orig_coord: Shape,
+ strides: Optional[List[int]] = None,
+ op_shape4D: Optional[Shape4D] = None,
+ is_top_box: bool = False,
) -> Optional[int]:
address_offset = 0
assert self.purpose != TensorPurpose.Weights
@@ -771,18 +778,22 @@ class Tensor:
# handle wraparound for partial buffers. make sure to do this after subtracting top box:
coord = [c % storage_shape[idx] for idx, c in enumerate(coord)]
- strides, augmented_coord = self.get_strides_and_coord(coord, op_shape4D)
- if strides is None:
- return None
+ # Strides may be passed as an argument, e.g. when creating feature maps, since they may have been modified
+ # by the "ofm_stride_multiplier" operation attribute. If not, they are calculated here.
+ if not strides:
+ strides = self.get_strides(op_shape4D)
if is_top_box:
address_offset += 1 * strides[-1] # one element
+ augmented_coord = self.get_augmented_coord(coord)
+ assert augmented_coord is not None
+
address_offset += np.dot(augmented_coord, strides)
assert address_offset >= 0
assert address_offset <= storage_size
- return address_offset
+ return self.address + address_offset
def is_allocated_in_tensor_arena(self, scratch_tensor_mem_area: MemArea) -> bool:
return (self.mem_area == scratch_tensor_mem_area) and (self.mem_type in (MemType.Scratch, MemType.Scratch_fast))
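Putting the refactored pieces together, the expected calling pattern now looks roughly like this (a sketch only; tens, op, box and op_shape4D are assumed to be in scope):

    strides = tens.get_strides(op_shape4D)
    for i, factor in enumerate(op.ofm_stride_multiplier, start=1):
        strides[i] *= factor                   # [C, H, W] scaling
    addr = tens.address_for_coordinate(box.start_coord, strides, op_shape4D)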