From 17e53b5d776109e1bd1073c657ff0453ccf3c09e Mon Sep 17 00:00:00 2001 From: Rickard Bolin Date: Tue, 6 Sep 2022 16:09:01 +0000 Subject: MLBEDSW-6927: Add ofm_stride_multiplier attribute to operation Allow sparse writing of OFM by multiplying H/W/C of the OFM with the values of ofm_stride_multiplier Signed-off-by: Rickard Bolin Change-Id: I65d742ad36ad3154e9914cdd22e2da928ad1f095 --- ethosu/vela/high_level_command_to_npu_op.py | 31 ++++++-- ethosu/vela/operation.py | 5 ++ ethosu/vela/tensor.py | 119 +++++++++++++++------------- 3 files changed, 94 insertions(+), 61 deletions(-) (limited to 'ethosu/vela') diff --git a/ethosu/vela/high_level_command_to_npu_op.py b/ethosu/vela/high_level_command_to_npu_op.py index 18919431..6246b37e 100644 --- a/ethosu/vela/high_level_command_to_npu_op.py +++ b/ethosu/vela/high_level_command_to_npu_op.py @@ -347,7 +347,13 @@ def get_ofm_quantization(ps, tens: Tensor) -> Optional[NpuQuantization]: return NpuQuantization(scale_f32=ofm_quant.scale_f32, zero_point=zero_point) -def create_feature_map(tens: Tensor, box: Box, arch: ArchitectureFeatures, op_shape4D: Shape4D) -> NpuFeatureMap: +def create_feature_map( + tens: Tensor, + box: Box, + arch: ArchitectureFeatures, + op_shape4D: Shape4D, + stride_multiplier: Optional[List[int]] = None, +) -> NpuFeatureMap: """Creates feature map with common fields populated""" fm = NpuFeatureMap() fm.region = get_region(tens.mem_type, arch) @@ -358,13 +364,25 @@ def create_feature_map(tens: Tensor, box: Box, arch: ArchitectureFeatures, op_sh fm.layout = NpuLayout.NHCWB16 else: assert 0, "Incorrect tensor format" + + strides = tens.get_strides(op_shape4D) + assert strides is not None + + if stride_multiplier and stride_multiplier != [1, 1, 1]: + assert ( + tens.format == TensorFormat.NHWC + ), "Only default stride multiplier ([1, 1, 1]) supported for NHCWB16 format" + # Multiply strides for C/H/W (in that order) with corresponding stride factor + for i, stride_factor in enumerate(stride_multiplier, start=1): + strides[i] *= stride_factor + height_0, height_1, width_0, addresses = tens.addresses_for_rolling_buffer( - box.start_coord, box.end_coord, op_shape4D + box.start_coord, box.end_coord, strides, op_shape4D ) + fm.tiles = NpuTileBox( height_0=height_0, height_1=height_1, width_0=width_0, addresses=[int(addr) for addr in addresses] ) - strides = tens.get_strides(shape4D=op_shape4D) fm.strides = NpuShape3D(height=int(strides[2]), width=int(strides[3]), depth=int(strides[1])) fm.name = tens.name return fm @@ -462,7 +480,7 @@ def set_common_op_fields(npu_op: NpuBlockOperation, cmd: NpuStripe, arch: Archit npu_op.ifm.quantization = get_ifm_or_ifm2_quantization(ps, cmd.ifm_tensor) out_block = cmd.ofm_box.get_block() - npu_op.ofm = create_feature_map(cmd.ofm_tensor, cmd.ofm_box, arch, ps.ofm_shapes[0]) + npu_op.ofm = create_feature_map(cmd.ofm_tensor, cmd.ofm_box, arch, ps.ofm_shapes[0], op.ofm_stride_multiplier) npu_op.ofm.shape = NpuShape3D(height=out_block.height, width=out_block.width, depth=out_block.depth) npu_op.ofm.quantization = get_ofm_quantization(ps, cmd.ofm_tensor) @@ -595,9 +613,8 @@ def create_dma_op(cmd: DMA, arch: ArchitectureFeatures) -> NpuDmaOperation: src_addr = cmd.in_tensor.address + weight_range.offset dest_addr = cmd.out_tensor.address else: - start_coord = cmd.box.start_coord - src_addr = cmd.in_tensor.address_for_coordinate(start_coord) - dest_addr = cmd.out_tensor.address_for_coordinate(start_coord) + src_addr = cmd.in_tensor.address_for_coordinate(cmd.box.start_coord) + dest_addr = cmd.out_tensor.address_for_coordinate(cmd.box.start_coord) sz = cmd.in_tensor.address_for_coordinate(cmd.box.end_coord, is_top_box=True) - src_addr src = NpuAddressRange(src_region, int(src_addr), int(sz)) dest = NpuAddressRange(dest_region, int(dest_addr), int(sz)) diff --git a/ethosu/vela/operation.py b/ethosu/vela/operation.py index de68b1d7..e1622049 100644 --- a/ethosu/vela/operation.py +++ b/ethosu/vela/operation.py @@ -501,6 +501,7 @@ class Operation: "write_offset", "write_shape", "ifm_resampling_mode", + "ofm_stride_multiplier", ) def __init__(self, op_type: Op, name: str): @@ -545,6 +546,9 @@ class Operation: # write_offset 0,9,0,0, write_shape 1,1,8,1 self.write_shape: Optional[Shape4D] = None self.ifm_resampling_mode: resampling_mode = resampling_mode.NONE + # For interleaved/sparse outputs - stride is multiplied with the stride factor of the corresponding axis + # Order is [C, H, W] - default is no multiplication + self.ofm_stride_multiplier: List[int] = [1, 1, 1] def clone(self, suffix="_clone"): res = Operation(self.type, self.name + suffix) @@ -568,6 +572,7 @@ class Operation: res.low_precision_scaling = self.low_precision_scaling res.rescale = self.rescale res.ifm_resampling_mode = self.ifm_resampling_mode + res.ofm_stride_multiplier = self.ofm_stride_multiplier.copy() return res diff --git a/ethosu/vela/tensor.py b/ethosu/vela/tensor.py index 65473b8d..99970317 100644 --- a/ethosu/vela/tensor.py +++ b/ethosu/vela/tensor.py @@ -592,7 +592,9 @@ class Tensor: rounding_quantum = full_shape(4, list(self.storage_rounding_quantum), 1) return Shape4D(shape_round_to_quantum(op_shape4D.as_list(), rounding_quantum)) - def addresses_for_rolling_buffer(self, start_coord: Shape, end_coord: Shape, op_shape4D: Shape4D) -> Tuple: + def addresses_for_rolling_buffer( + self, start_coord: Shape, end_coord: Shape, strides: List[int], op_shape4D: Shape4D + ) -> Tuple: # returns ( box_height0, box_height1, box_width, [address_tl, address_tr, address_bl, address_br] ) if self.storage_shape == []: @@ -600,7 +602,7 @@ class Tensor: 1, 1, 1, - [self.address_for_coordinate(start_coord, op_shape4D=op_shape4D), 0, 0, 0], + [self.address_for_coordinate(start_coord, strides, op_shape4D), 0, 0, 0], ) if self.is_standard_fm: @@ -618,89 +620,90 @@ class Tensor: box_width = crossing_x - start_coord[2] addresses: List = [0] * 4 - addresses[0] = self.address_for_coordinate(start_coord, op_shape4D=op_shape4D) + addresses[0] = self.address_for_coordinate(start_coord, strides, op_shape4D) if end_coord[2] > crossing_x: addresses[1] = self.address_for_coordinate( - [start_coord[0], start_coord[1], crossing_x, start_coord[3]], op_shape4D=op_shape4D + [start_coord[0], start_coord[1], crossing_x, start_coord[3]], strides, op_shape4D ) raise UnsupportedFeatureError("Striping in vertical direction is not supported") if end_coord[1] > crossing_y: addresses[2] = self.address_for_coordinate( - [start_coord[0], crossing_y, start_coord[2], start_coord[3]], op_shape4D=op_shape4D + [start_coord[0], crossing_y, start_coord[2], start_coord[3]], strides, op_shape4D ) if end_coord[1] > crossing_y and end_coord[2] > crossing_x: addresses[3] = self.address_for_coordinate( - [start_coord[0], crossing_y, crossing_x, start_coord[3]], op_shape4D=op_shape4D + [start_coord[0], crossing_y, crossing_x, start_coord[3]], strides, op_shape4D ) return box_height0, box_height0, box_width, addresses - def address_for_coordinate(self, coord: Shape, is_top_box: bool = False, op_shape4D: Shape4D = None) -> int: - offset = self.address_offset_for_coordinate(coord, op_shape4D=op_shape4D, is_top_box=is_top_box) - assert offset is not None - return self.address + offset + def get_strides(self, shape4D: Optional[Shape4D]) -> List[int]: - def get_strides_and_coord( - self, coord: Optional[Shape] = None, shape4D: Optional[Shape4D] = None - ) -> Tuple[Optional[Shape], Optional[Shape]]: - if coord is None: - coord = [0] * min(len(self.storage_shape), 4) + augmented_shape = self.get_augmented_shape(shape4D) + assert len(augmented_shape) == 5 + strides: List = [0] * len(augmented_shape) + stride = self.element_size() * self.storage_compression_scale + + if self.format != TensorFormat.NHCWB16: + stride_order = [4, 1, 3, 2, 0] + for i in stride_order: + strides[i] = stride + stride *= augmented_shape[i] + else: + assert len(strides) == 5 + strides[4] = stride + strides[3] = 16 * stride # STRIDE_X + strides[1] = strides[3] * augmented_shape[2] # STRIDE_C + strides[2] = augmented_shape[2] * augmented_shape[3] * stride # STRIDE_Y + strides[0] = strides[2] * augmented_shape[1] # STRIDE_N + + return strides + + def get_augmented_shape(self, shape4D: Optional[Shape4D] = None) -> Optional[Shape]: if shape4D and self.is_standard_fm: augmented_shape = self.get_4D_storage_shape_for_shape(shape4D).as_list() else: augmented_shape = full_shape(4, self.storage_shape, 1) - augmented_coord = coord + if self.format == TensorFormat.NHWC: + augmented_shape = [augmented_shape[0], augmented_shape[3]] + augmented_shape[1:3] + [1] + + elif self.format == TensorFormat.NHCWB16: + augmented_shape = augmented_shape[0:4] + [1] + + if augmented_shape[1] == 0: + augmented_shape[1] = 1 - while len(augmented_coord) < 4: - augmented_coord = [0] + augmented_coord + else: + assert self.format in (TensorFormat.Unknown, TensorFormat.WeightsCompressed) + return None - assert len(augmented_coord) == len(augmented_shape) + return augmented_shape + + def get_augmented_coord(self, coord: Optional[Shape] = None) -> Optional[Shape]: + if coord is None: + coord = [0] * min(len(self.storage_shape), 4) + + missing_len = 4 - len(coord) + augmented_coord = ([0] * missing_len) + coord if self.format == TensorFormat.NHWC: - augmented_shape = [augmented_shape[0], augmented_shape[3]] + augmented_shape[1:3] + [1] augmented_coord = [augmented_coord[0], augmented_coord[3]] + augmented_coord[1:3] + [0] elif self.format == TensorFormat.NHCWB16: channel_divisor = 16 - augmented_shape = augmented_shape[0:4] + [1] augmented_coord = ( [augmented_coord[0], augmented_coord[3] // channel_divisor] + augmented_coord[1:3] + [augmented_coord[3] % channel_divisor] ) - - if augmented_shape[1] == 0: - augmented_shape[1] = 1 - else: assert self.format in (TensorFormat.Unknown, TensorFormat.WeightsCompressed) - return None, None - - strides: List = [0] * len(augmented_shape) - stride = self.element_size() * self.storage_compression_scale - - if self.format != TensorFormat.NHCWB16: - stride_order = [4, 1, 3, 2, 0] - for i in stride_order: - strides[i] = stride - stride *= augmented_shape[i] - else: - assert len(strides) == 5 - strides[4] = stride - strides[3] = 16 * stride # STRIDE_X - strides[1] = strides[3] * augmented_shape[2] # STRIDE_C - strides[2] = augmented_shape[2] * augmented_shape[3] * stride # STRIDE_Y - strides[0] = strides[2] * augmented_shape[1] # STRIDE_N - - return strides, augmented_coord + return None - def get_strides(self, shape4D: Optional[Shape4D] = None) -> Shape: - strides, _ = self.get_strides_and_coord(shape4D=shape4D) - assert strides is not None - return strides + return augmented_coord def find_npu_op(self) -> Optional[Operation]: # Returns the NPU operator that uses this tensor @@ -743,8 +746,12 @@ class Tensor: assert 0 <= index < len(self.compressed_values) return index == len(self.compressed_values) - 1 - def address_offset_for_coordinate( - self, orig_coord: Shape, op_shape4D: Optional[Shape4D] = None, is_top_box: bool = False + def address_for_coordinate( + self, + orig_coord: Shape, + strides: Optional[List[int]] = None, + op_shape4D: Optional[Shape4D] = None, + is_top_box: bool = False, ) -> Optional[int]: address_offset = 0 assert self.purpose != TensorPurpose.Weights @@ -771,18 +778,22 @@ class Tensor: # handle wraparound for partial buffers. make sure to do this after subtracting top box: coord = [c % storage_shape[idx] for idx, c in enumerate(coord)] - strides, augmented_coord = self.get_strides_and_coord(coord, op_shape4D) - if strides is None: - return None + # Strides may be passed as an argument, for example when creating feature maps as the strides may be modified + # by the "ofm_stride_multiplier" operation attribute. If not, they are calculated here. + if not strides: + strides = self.get_strides(op_shape4D) if is_top_box: address_offset += 1 * strides[-1] # one element + augmented_coord = self.get_augmented_coord(coord) + assert augmented_coord is not None + address_offset += np.dot(augmented_coord, strides) assert address_offset >= 0 assert address_offset <= storage_size - return address_offset + return self.address + address_offset def is_allocated_in_tensor_arena(self, scratch_tensor_mem_area: MemArea) -> bool: return (self.mem_area == scratch_tensor_mem_area) and (self.mem_type in (MemType.Scratch, MemType.Scratch_fast)) -- cgit v1.2.1