From 897cc14968e017b1f48f376f7f7cefc515c5fe88 Mon Sep 17 00:00:00 2001
From: Andreas Nevalainen
Date: Wed, 28 Oct 2020 15:42:08 +0100
Subject: MLBEDSW-3222: Bias tensors in fast storage

For IFM streamed cascades, bias tensors are read several times.
Move these tensors to fast storage and add DMA commands.

Change-Id: I630f6275986c1b5e3f126c925b11e22500fb1128
Signed-off-by: Andreas Nevalainen
---
 OPTIONS.md                                          |  9 +++++
 ethosu/vela/high_level_command_stream.py            |  6 +++
 ethosu/vela/high_level_command_stream_generator.py  |  5 +++
 ethosu/vela/npu_serialisation.py                    |  5 ++-
 ethosu/vela/register_command_stream_generator.py    |  7 +++-
 ethosu/vela/scheduler.py                            | 45 ++++++++++++++++++++++
 ethosu/vela/tensor.py                               |  9 +++--
 ethosu/vela/vela.py                                 |  4 ++
 ethosu/vela/weight_compressor.py                    |  8 +++-
 9 files changed, 90 insertions(+), 8 deletions(-)

diff --git a/OPTIONS.md b/OPTIONS.md
index a7b513c..9220151 100644
--- a/OPTIONS.md
+++ b/OPTIONS.md
@@ -72,6 +72,15 @@ system's SRAM this optimisation is required.
 vela network.tflite --cascading False
 ```
 
+### Keep scale placement
+
+Prevents the scheduler from placing scale tensors for IFM streamed passes in
+SRAM and keeps them in flash.
+
+```bash
+vela network.tflite --keep-scale-placement
+```
+
 ### Force Block Config
 
 Force a specific block configuration in the format HxWxC, where H, W, and C are
diff --git a/ethosu/vela/high_level_command_stream.py b/ethosu/vela/high_level_command_stream.py
index b8a19f5..a5372d7 100644
--- a/ethosu/vela/high_level_command_stream.py
+++ b/ethosu/vela/high_level_command_stream.py
@@ -21,6 +21,7 @@ import numpy as np
 
 from .numeric_util import round_up_divide
 from .operation import NpuBlockType
+from .operation import Op
 from .range_set import AccessDirection
 from .range_set import MemoryAccessSet
 from .range_set import MemoryRangeSet
@@ -236,6 +237,11 @@ class NpuStripe(Command):
                 ),
                 AccessDirection.Read,
             )
+        if self.scale_tensor is not None and self.scale_tensor.ops[0].type == Op.DMA:
+            res.add(
+                self.scale_tensor.get_address_ranges_for_coordinates([0], self.scale_tensor.shape),
+                AccessDirection.Read,
+            )
         # Add read access to SHRAM by any LUT-s
         for tens in self.ps.intermediates:
             if tens.purpose == TensorPurpose.LUT and tens.mem_area == MemArea.Shram:
diff --git a/ethosu/vela/high_level_command_stream_generator.py b/ethosu/vela/high_level_command_stream_generator.py
index 01fab0e..871a048 100644
--- a/ethosu/vela/high_level_command_stream_generator.py
+++ b/ethosu/vela/high_level_command_stream_generator.py
@@ -238,6 +238,7 @@ def generate_high_level_command_stream_for_pass(strat, passes, block_configs, id
             y_step = y_dim
 
         weight_box = None
+        scale_box = None
 
         for start in range(y_start, y_dim, y_step):
             end = min(start + y_step, y_dim)
@@ -299,6 +300,10 @@ def generate_high_level_command_stream_for_pass(strat, passes, block_configs, id
                 if ifm_y_present >= ifm_y_needed:
                     break
 
+            if scale_tensor is not None and scale_tensor.purpose == TensorPurpose.FSBias and scale_box is None:
+                scale_box = Box([0] * len(scale_tensor.shape), list(scale_tensor.shape))
+                yield from dma_if_necessary(ps, scale_box, scale_tensor)
+
             if weight_tensor is not None and weight_box is None:
                 weight_box = Box.make_weight_box(
                     weight_tensor.shape, npu_block_type, weights_transposed=weight_tensor.weight_transpose_depthwise
diff --git a/ethosu/vela/npu_serialisation.py b/ethosu/vela/npu_serialisation.py
index 0bd0300..04534cc 100644
--- a/ethosu/vela/npu_serialisation.py
+++ b/ethosu/vela/npu_serialisation.py
@@ -128,7 +128,10 @@ def serialise_npu_subgraph_into_tensors(nng, sg, arch, scratch_tens, scratch_fas
                     else:
                         copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.weight_tensor)
 
-                    copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.scale_tensor)
+                    if ps.scale_tensor.ops[0].type == Op.DMA:
+                        copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.scale_tensor.ops[0].inputs[0])
+                    else:
+                        copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.scale_tensor)
 
                 if ps.lut_tensor is not None:
                     copy_ifm_values_to_memory_tensor(sg.flash_tensor, ps.lut_tensor)
diff --git a/ethosu/vela/register_command_stream_generator.py b/ethosu/vela/register_command_stream_generator.py
index 0abd882..e5e4fb1 100644
--- a/ethosu/vela/register_command_stream_generator.py
+++ b/ethosu/vela/register_command_stream_generator.py
@@ -388,8 +388,11 @@ def generate_register_command_stream(nng, sg, arch, verbose=False):
             dst_addr = cmd.out_tensor.address_for_coordinate(start_coord)
 
             if cmd.in_tensor.compressed_values is not None:
-                stream_index = cmd.in_tensor.compressed_stream_index_from_coord(start_coord)
-                sz = cmd.in_tensor.size_of_compressed_stream(stream_index)
+                if cmd.out_tensor.purpose == TensorPurpose.FSBias:
+                    sz = cmd.in_tensor.storage_size()
+                else:
+                    stream_index = cmd.in_tensor.compressed_stream_index_from_coord(start_coord)
+                    sz = cmd.in_tensor.size_of_compressed_stream(stream_index)
             else:
                 sz = cmd.in_tensor.address_for_coordinate(cmd.box.end_coord, is_top_box=True) - src_addr
 
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
index 56f4aaa..526cc0e 100644
--- a/ethosu/vela/scheduler.py
+++ b/ethosu/vela/scheduler.py
@@ -38,6 +38,7 @@ from .npu_performance import PassCycles
 from .numeric_util import full_shape
 from .operation import NpuBlockType
 from .operation import Op
+from .operation import Operation
 from .shared_buffer_allocation import find_block_configs_suitable_for_pass_and_shared_buffer
 from .shared_buffer_allocation import shared_buffer_allocation_for_pass_and_block_config
 from .tensor import MemArea
@@ -64,6 +65,7 @@ class SchedulerOptions:
         use_ifm_streaming=True,
         pareto_metric=ParetoMetric.BwCycMem,
         use_nhcwb16_between_cascaded_passes=True,
+        keep_scale_placement=False,
     ):
         self.use_cascading = use_cascading
         self.verbose_schedule = verbose_schedule
@@ -71,6 +73,7 @@ class SchedulerOptions:
         self.use_ifm_streaming = use_ifm_streaming
         self.pareto_metric = pareto_metric
         self.use_nhcwb16_between_cascaded_passes = use_nhcwb16_between_cascaded_passes
+        self.keep_scale_placement = keep_scale_placement
 
     def __str__(self):
         return type(self).__name__ + ": " + str(self.__dict__)
@@ -1022,6 +1025,45 @@ class DynamicProgrammingScheduler:
             # in use_fast_storage_for_feature_maps
             self.sg.scheduling_info["feature_map_rewrites"] = fast_storage_tensor_rewrites
 
+    def move_scales_to_fast_storage(self, sg, arch):
+        # IFM streamed ops read bias tensors several times; move these tensors to fast storage
+        for cp in sg.cascaded_passes:
+            if cp.strategy == SchedulingStrategy.IfmStream:
+                for ps in cp.passes:
+                    if ps.scale_tensor and (cp.sram_used + ps.scale_tensor.storage_size()) <= self.sram_limit:
+                        tens = ps.scale_tensor
+
+                        # Find op using scale tensor
+                        op = next((op for op in ps.ops if tens in op.inputs), None)
+                        assert op
+
+                        # Create fast storage tensor
+                        new_tens = tens.clone_into_fast_storage(arch)
+                        new_tens.consumer_list = tens.consumer_list.copy()
+                        new_tens.purpose = TensorPurpose.FSBias
+
+                        # Create DMA cmd
+                        dma_cmd = Operation(Op.DMA, tens.ops[0].name + "_dma")
+                        dma_cmd.inputs = [tens]
+                        dma_cmd.set_output_tensor(new_tens)
+                        dma_cmd.attrs["source"] = tens.mem_area
+                        dma_cmd.attrs["destination"] = new_tens.mem_area
+                        dma_cmd.run_on_npu = True
+
+                        tens.consumer_list.clear()
+                        tens.consumer_list.append(dma_cmd)
+
+                        # Replace tensor and op
+                        idx = op.inputs.index(tens)
+                        op.inputs[idx] = new_tens
+
+                        ps.ops.insert(0, dma_cmd)
+                        ps.scale_tensor = new_tens
+                        ps.intermediates.append(new_tens)
+                        ps.cascade.intermediates.append(new_tens)
+
+                        cp.sram_used += tens.storage_size()
 
 
 def schedule_passes(nng, arch, options: SchedulerOptions):
@@ -1041,6 +1083,9 @@ def schedule_passes(nng, arch, options: SchedulerOptions):
 
     dps.apply_result(strat_set, arch)
 
+    if not options.keep_scale_placement:
+        dps.move_scales_to_fast_storage(sg, arch)
+
     if options.verbose_schedule:
         sg.print_cascaded_passes()
 
diff --git a/ethosu/vela/tensor.py b/ethosu/vela/tensor.py
index 49f93cd..45518b4 100644
--- a/ethosu/vela/tensor.py
+++ b/ethosu/vela/tensor.py
@@ -81,16 +81,17 @@ class TensorPurpose(enum.IntFlag):
     FeatureMap = 2
     Scratch = 3
     LUT = 4
-    Size = 5
+    FSBias = 5
+    Size = 6
 
     def display_name(self):
-        return ("Unknown", "Weights", "FeatureMap", "Scratch", "LUT", "Size")[self.value]
+        return ("Unknown", "Weights", "FeatureMap", "Scratch", "LUT", "FastStorageBias", "Size")[self.value]
 
     def identifier_name(self):
-        return ("unknown", "weights", "feature_map", "scratch", "lut", "size")[self.value]
+        return ("unknown", "weights", "feature_map", "scratch", "lut", "fast_storage_bias", "size")[self.value]
 
     def all():
-        return (TensorPurpose.Weights, TensorPurpose.FeatureMap)
+        return (TensorPurpose.Weights, TensorPurpose.FeatureMap, TensorPurpose.FSBias)
 
 
 class TensorSubPurpose(enum.Enum):
diff --git a/ethosu/vela/vela.py b/ethosu/vela/vela.py
index 6d54187..4b43751 100644
--- a/ethosu/vela/vela.py
+++ b/ethosu/vela/vela.py
@@ -151,6 +151,9 @@ def main(args=None):
     parser.add_argument(
         "--show-cpu-operations", action="store_true", help="Show the operations that fall back to the CPU"
     )
+    parser.add_argument(
+        "--keep-scale-placement", action="store_true", help="Keep scale tensors' memory placement during scheduling"
+    )
     parser.add_argument(
         "--cascading",
         type=ast.literal_eval,
@@ -311,6 +314,7 @@ def main(args=None):
         use_ifm_streaming=args.ifm_streaming,
         pareto_metric=args.pareto_metric,
         use_nhcwb16_between_cascaded_passes=args.nhcwb16_between_cascaded_passes,
+        keep_scale_placement=args.keep_scale_placement,
     )
 
     model_reader_options = model_reader.ModelReaderOptions()
diff --git a/ethosu/vela/weight_compressor.py b/ethosu/vela/weight_compressor.py
index 9453521..b0187b6 100644
--- a/ethosu/vela/weight_compressor.py
+++ b/ethosu/vela/weight_compressor.py
@@ -404,7 +404,7 @@ def compress_weights(arch, nng, tens, npu_block_type, ofm_block_depth, ofm_depth
 
 
 def calc_scales_and_pack_biases(tens, arch, ofm_depth_step, rescale_for_faf=False):
-    assert tens.purpose == TensorPurpose.FeatureMap
+    assert tens.purpose in [TensorPurpose.FeatureMap, TensorPurpose.FSBias]
     assert tens.format == TensorFormat.NHWC
     # the connected operator should expect a bias input unless it is a FullyConnected
     assert tens.consumer_list[0].type.needs_bias()
@@ -531,3 +531,9 @@ def update_pass_weight_and_scale_tensors(nng, arch):
             if (ps.ops[-1].type in activation_ops) and (ps.npu_block_type != NpuBlockType.ElementWise):
                 rescale_for_faf = True
             calc_scales_and_pack_biases(ps.scale_tensor, arch, ofm_depth_step, rescale_for_faf)
+            if ps.scale_tensor.ops[0].type == Op.DMA:
+                src_tens = ps.scale_tensor.get_dma_src_tensor()
+                src_tens.shape = ps.scale_tensor.shape
+                src_tens.quant_values = ps.scale_tensor.quant_values
+                src_tens.element_size_bytes = ps.scale_tensor.element_size_bytes
+                src_tens.copy_compressed_weight_info(ps.scale_tensor)
-- 
cgit v1.2.1
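
The core of this change is the graph rewrite in `move_scales_to_fast_storage`: for each IFM streamed cascaded pass whose bias (scale) tensor still fits in the remaining SRAM budget, the scheduler clones the tensor into fast storage, inserts a DMA op that copies the flash-resident original into the clone, and re-points the consuming op at the clone. The sketch below illustrates that pattern in isolation; `Tensor`, `Operation` and `move_to_fast_storage` are simplified stand-ins for illustration, not the Vela classes touched by this patch.

```python
# Minimal sketch of the "clone into fast storage + insert DMA" rewrite.
# These classes are illustrative stand-ins, not the Vela API.
from dataclasses import dataclass, field
from typing import List


@dataclass(eq=False)  # graph nodes compare by identity
class Tensor:
    name: str
    mem_area: str  # e.g. "Flash" or "Sram"
    ops: List["Operation"] = field(default_factory=list)  # producing ops
    consumer_list: List["Operation"] = field(default_factory=list)


@dataclass(eq=False)
class Operation:
    type: str
    name: str
    inputs: List[Tensor] = field(default_factory=list)
    outputs: List[Tensor] = field(default_factory=list)


def move_to_fast_storage(scale_tens: Tensor, consumer: Operation) -> Tensor:
    # Clone the flash-resident tensor into SRAM and create the DMA op that fills the clone.
    sram_tens = Tensor(scale_tens.name + "_sram", "Sram")
    dma = Operation("DMA", scale_tens.name + "_dma", inputs=[scale_tens], outputs=[sram_tens])
    sram_tens.ops.append(dma)

    # Re-route: the flash tensor now feeds only the DMA, and the consuming op
    # reads the SRAM copy instead.
    scale_tens.consumer_list[:] = [dma]
    sram_tens.consumer_list.append(consumer)
    consumer.inputs[consumer.inputs.index(scale_tens)] = sram_tens
    return sram_tens


# Usage: a convolution-style op whose bias/scale tensor lives in flash.
bias = Tensor("conv1_bias", "Flash")
conv = Operation("Conv2D", "conv1", inputs=[bias])
bias.consumer_list.append(conv)

new_bias = move_to_fast_storage(bias, conv)
assert conv.inputs == [new_bias]
assert new_bias.mem_area == "Sram" and new_bias.ops[0].type == "DMA"
```

In the patch itself the SRAM copy is charged against the cascaded pass's budget (`cp.sram_used += tens.storage_size()`), which is why the rewrite is only applied when the bias tensor fits under `self.sram_limit`, and why `--keep-scale-placement` is provided as an opt-out.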