From 8f78ac2ff735b7c0be7787d6423eb96a0d8b5983 Mon Sep 17 00:00:00 2001 From: Dwight Lidman Date: Fri, 13 Aug 2021 14:04:30 +0200 Subject: MLBEDSW-4803: Output diff fix for MobileNetV3 This commit moves a piece of code back into a loop but with a flag to make sure that the code is only executed once per loop rather than potentially every iteration. This solves the issue of an output diff because of LUT DMAs occurring before weight DMAs. Signed-off-by: Dwight Lidman Change-Id: I3e597f0a955154af3d87febacea1b3920d53b7c2 --- ethosu/vela/high_level_command_stream_generator.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/ethosu/vela/high_level_command_stream_generator.py b/ethosu/vela/high_level_command_stream_generator.py index b3ea9d49..3d0a1e58 100644 --- a/ethosu/vela/high_level_command_stream_generator.py +++ b/ethosu/vela/high_level_command_stream_generator.py @@ -130,11 +130,7 @@ def generate_high_level_commands_for_sched_op(sched_op, schedule): for start_width in range(ofm_start.width, ofm_end.width, ofm_step.width): end_width = min(start_width + ofm_step.width, ofm_end.width) - if parent_op.activation_lut: - lut_tensor = [tens for tens in parent_op.inputs if tens.purpose == TensorPurpose.LUT][0] - lut_box = Box([0] * len(lut_tensor.shape), list(lut_tensor.shape)) - yield from dma_if_necessary(sched_op.parent_ps, lut_box, lut_tensor) - + lut_dma_done = False for depth_idx, start_channel in enumerate(ofm_depth_slices[:-1]): start_channel = max(start_channel, ofm_start.depth) end_channel = min(ofm_depth_slices[depth_idx + 1], ofm_end.depth) @@ -203,6 +199,13 @@ def generate_high_level_commands_for_sched_op(sched_op, schedule): else: weight_box = None + # Should only be done once per loop but not before weights above + if parent_op.activation_lut and not lut_dma_done: + lut_tensor = [tens for tens in parent_op.inputs if tens.purpose == TensorPurpose.LUT][0] + lut_box = Box([0] * len(lut_tensor.shape), list(lut_tensor.shape)) + lut_dma_done = True + yield from dma_if_necessary(sched_op.parent_ps, lut_box, lut_tensor) + yield NpuStripe( sched_op.parent_ps, block_config.old_style_representation(), -- cgit v1.2.1