author      Andreas Nevalainen <andreas.nevalainen@arm.com>   2020-10-28 15:42:08 +0100
committer   patrik.gustavsson <patrik.gustavsson@arm.com>     2020-11-11 08:34:16 +0000
commit      897cc14968e017b1f48f376f7f7cefc515c5fe88 (patch)
tree        27d17a59f1529c1ed0b1cc59e60438ee7f5d29d2
parent      73320a48dfa711f5938b0e3d8e03b9858558b899 (diff)
download    ethos-u-vela-897cc14968e017b1f48f376f7f7cefc515c5fe88.tar.gz
MLBEDSW-3222: Bias tensors in fast storage
For IFM streamed cascades, bias tensors are read several times. Moves these
tensors to fast storage and adds DMA commands.

Change-Id: I630f6275986c1b5e3f126c925b11e22500fb1128
Signed-off-by: Andreas Nevalainen <andreas.nevalainen@arm.com>
-rw-r--r--  OPTIONS.md                                           |  9
-rw-r--r--  ethosu/vela/high_level_command_stream.py             |  6
-rw-r--r--  ethosu/vela/high_level_command_stream_generator.py   |  5
-rw-r--r--  ethosu/vela/npu_serialisation.py                     |  5
-rw-r--r--  ethosu/vela/register_command_stream_generator.py     |  7
-rw-r--r--  ethosu/vela/scheduler.py                             | 45
-rw-r--r--  ethosu/vela/tensor.py                                |  9
-rw-r--r--  ethosu/vela/vela.py                                  |  4
-rw-r--r--  ethosu/vela/weight_compressor.py                     |  8
9 files changed, 90 insertions, 8 deletions
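The core of the patch is the new move_scales_to_fast_storage() step in ethosu/vela/scheduler.py, shown in full in the diff below. As a rough orientation before reading the hunks, the rewrite it applies to one qualifying pass can be sketched as follows; this is a condensed, non-authoritative summary that reuses the same Tensor/Operation helpers the patch itself calls, and the standalone helper name is invented for illustration only.

```python
# Condensed sketch of what move_scales_to_fast_storage() does for one pass;
# only runnable against Vela's internals, and dma_bias_into_fast_storage is
# an illustrative name, not part of the patch.
from ethosu.vela.operation import Op, Operation
from ethosu.vela.tensor import TensorPurpose


def dma_bias_into_fast_storage(ps, arch):
    # Bias/scale tensor for this pass, currently placed in flash
    tens = ps.scale_tensor
    # The NPU op in the pass that consumes it
    op = next(op for op in ps.ops if tens in op.inputs)

    # Clone the tensor into fast storage (SRAM) and mark its new purpose
    new_tens = tens.clone_into_fast_storage(arch)
    new_tens.consumer_list = tens.consumer_list.copy()
    new_tens.purpose = TensorPurpose.FSBias

    # A DMA command copies the flash tensor into the SRAM clone at run time
    dma_cmd = Operation(Op.DMA, tens.ops[0].name + "_dma")
    dma_cmd.inputs = [tens]
    dma_cmd.set_output_tensor(new_tens)
    dma_cmd.run_on_npu = True

    # The flash tensor now feeds only the DMA; the NPU op reads the SRAM clone
    tens.consumer_list = [dma_cmd]
    op.inputs[op.inputs.index(tens)] = new_tens
    ps.ops.insert(0, dma_cmd)
    ps.scale_tensor = new_tens
```

The real implementation additionally guards on the cascade's SRAM budget (cp.sram_used + storage_size() must stay within sram_limit) and records the DMA source and destination memory areas on the DMA op.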
diff --git a/OPTIONS.md b/OPTIONS.md
index a7b513c..9220151 100644
--- a/OPTIONS.md
+++ b/OPTIONS.md
@@ -72,6 +72,15 @@ system's SRAM this optimisation is required.
vela network.tflite --cascading False
```

+### Keep scale placement
+
+Prevents scheduler from placing scale tensors for IFM streamed passes in SRAM
+and keeps these in flash.
+
+```bash
+vela network.tflite --keep-scale-placement
+```
+
### Force Block Config

Force a specific block configuration in the format HxWxC, where H, W, and C are
diff --git a/ethosu/vela/high_level_command_stream.py b/ethosu/vela/high_level_command_stream.py
index b8a19f5..a5372d7 100644
--- a/ethosu/vela/high_level_command_stream.py
+++ b/ethosu/vela/high_level_command_stream.py
@@ -21,6 +21,7 @@ import numpy as np
from .numeric_util import round_up_divide
from .operation import NpuBlockType
+from .operation import Op
from .range_set import AccessDirection
from .range_set import MemoryAccessSet
from .range_set import MemoryRangeSet
@@ -236,6 +237,11 @@ class NpuStripe(Command):
                ),
                AccessDirection.Read,
            )
+        if self.scale_tensor is not None and self.scale_tensor.ops[0].type == Op.DMA:
+            res.add(
+                self.scale_tensor.get_address_ranges_for_coordinates([0], self.scale_tensor.shape),
+                AccessDirection.Read,
+            )
        # Add read access to SHRAM by any LUT-s
        for tens in self.ps.intermediates:
            if tens.purpose == TensorPurpose.LUT and tens.mem_area == MemArea.Shram:
diff --git a/ethosu/vela/high_level_command_stream_generator.py b/ethosu/vela/high_level_command_stream_generator.py
index 01fab0e..871a048 100644
--- a/ethosu/vela/high_level_command_stream_generator.py
+++ b/ethosu/vela/high_level_command_stream_generator.py
@@ -238,6 +238,7 @@ def generate_high_level_command_stream_for_pass(strat, passes, block_configs, id
        y_step = y_dim

    weight_box = None
+    scale_box = None

    for start in range(y_start, y_dim, y_step):
        end = min(start + y_step, y_dim)
@@ -299,6 +300,10 @@
            if ifm_y_present >= ifm_y_needed:
                break

+        if scale_tensor is not None and scale_tensor.purpose == TensorPurpose.FSBias and scale_box is None:
+            scale_box = Box([0] * len(scale_tensor.shape), list(scale_tensor.shape))
+            yield from dma_if_necessary(ps, scale_box, scale_tensor)
+
        if weight_tensor is not None and weight_box is None:
            weight_box = Box.make_weight_box(
                weight_tensor.shape, npu_block_type, weights_transposed=weight_tensor.weight_transpose_depthwise
diff --git a/ethosu/vela/npu_serialisation.py b/ethosu/vela/npu_serialisation.py
index 0bd0300..04534cc 100644
--- a/ethosu/vela/npu_serialisation.py
+++ b/ethosu/vela/npu_serialisation.py
@@ -128,7 +128,10 @@ def serialise_npu_subgraph_into_tensors(nng, sg, arch, scratch_tens, scratch_fas
                else:
                    copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.weight_tensor)

-                copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.scale_tensor)
+                if ps.scale_tensor.ops[0].type == Op.DMA:
+                    copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.scale_tensor.ops[0].inputs[0])
+                else:
+                    copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.scale_tensor)

            if ps.lut_tensor is not None:
                copy_ifm_values_to_memory_tensor(sg.flash_tensor, ps.lut_tensor)
diff --git a/ethosu/vela/register_command_stream_generator.py b/ethosu/vela/register_command_stream_generator.py
index 0abd882..e5e4fb1 100644
--- a/ethosu/vela/register_command_stream_generator.py
+++ b/ethosu/vela/register_command_stream_generator.py
@@ -388,8 +388,11 @@ def generate_register_command_stream(nng, sg, arch, verbose=False):
            dst_addr = cmd.out_tensor.address_for_coordinate(start_coord)

            if cmd.in_tensor.compressed_values is not None:
-                stream_index = cmd.in_tensor.compressed_stream_index_from_coord(start_coord)
-                sz = cmd.in_tensor.size_of_compressed_stream(stream_index)
+                if cmd.out_tensor.purpose == TensorPurpose.FSBias:
+                    sz = cmd.in_tensor.storage_size()
+                else:
+                    stream_index = cmd.in_tensor.compressed_stream_index_from_coord(start_coord)
+                    sz = cmd.in_tensor.size_of_compressed_stream(stream_index)
            else:
                sz = cmd.in_tensor.address_for_coordinate(cmd.box.end_coord, is_top_box=True) - src_addr
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
index 56f4aaa..526cc0e 100644
--- a/ethosu/vela/scheduler.py
+++ b/ethosu/vela/scheduler.py
@@ -38,6 +38,7 @@ from .npu_performance import PassCycles
from .numeric_util import full_shape
from .operation import NpuBlockType
from .operation import Op
+from .operation import Operation
from .shared_buffer_allocation import find_block_configs_suitable_for_pass_and_shared_buffer
from .shared_buffer_allocation import shared_buffer_allocation_for_pass_and_block_config
from .tensor import MemArea
@@ -64,6 +65,7 @@ class SchedulerOptions:
        use_ifm_streaming=True,
        pareto_metric=ParetoMetric.BwCycMem,
        use_nhcwb16_between_cascaded_passes=True,
+        keep_scale_placement=False,
    ):
        self.use_cascading = use_cascading
        self.verbose_schedule = verbose_schedule
@@ -71,6 +73,7 @@
        self.use_ifm_streaming = use_ifm_streaming
        self.pareto_metric = pareto_metric
        self.use_nhcwb16_between_cascaded_passes = use_nhcwb16_between_cascaded_passes
+        self.keep_scale_placement = keep_scale_placement

    def __str__(self):
        return type(self).__name__ + ": " + str(self.__dict__)
@@ -1022,6 +1025,45 @@ class DynamicProgrammingScheduler:
        # in use_fast_storage_for_feature_maps
        self.sg.scheduling_info["feature_map_rewrites"] = fast_storage_tensor_rewrites

+    def move_scales_to_fast_storage(self, sg, arch):
+        # IFM streamed ops reads bias tensors several times, move these to fast storage
+        for cp in sg.cascaded_passes:
+            if cp.strategy == SchedulingStrategy.IfmStream:
+                for ps in cp.passes:
+                    if ps.scale_tensor and (cp.sram_used + ps.scale_tensor.storage_size()) <= self.sram_limit:
+                        tens = ps.scale_tensor
+
+                        # Find op using scale tensor
+                        op = next((op for op in ps.ops if tens in op.inputs), None)
+                        assert op
+
+                        # Create fast storage tensor
+                        new_tens = tens.clone_into_fast_storage(arch)
+                        new_tens.consumer_list = tens.consumer_list.copy()
+                        new_tens.purpose = TensorPurpose.FSBias
+
+                        # Create DMA cmd
+                        dma_cmd = Operation(Op.DMA, tens.ops[0].name + "_dma")
+                        dma_cmd.inputs = [tens]
+                        dma_cmd.set_output_tensor(new_tens)
+                        dma_cmd.attrs["source"] = tens.mem_area
+                        dma_cmd.attrs["destination"] = new_tens.mem_area
+                        dma_cmd.run_on_npu = True
+
+                        tens.consumer_list.clear()
+                        tens.consumer_list.append(dma_cmd)
+
+                        # Replace tensor and op
+                        idx = op.inputs.index(tens)
+                        op.inputs[idx] = new_tens
+
+                        ps.ops.insert(0, dma_cmd)
+                        ps.scale_tensor = new_tens
+                        ps.intermediates.append(new_tens)
+                        ps.cascade.intermediates.append(new_tens)
+
+                        cp.sram_used += tens.storage_size()
+

def schedule_passes(nng, arch, options: SchedulerOptions):
@@ -1041,6 +1083,9 @@ def schedule_passes(nng, arch, options: SchedulerOptions):
        dps.apply_result(strat_set, arch)

+        if not options.keep_scale_placement:
+            dps.move_scales_to_fast_storage(sg, arch)
+
        if options.verbose_schedule:
            sg.print_cascaded_passes()
diff --git a/ethosu/vela/tensor.py b/ethosu/vela/tensor.py
index 49f93cd..45518b4 100644
--- a/ethosu/vela/tensor.py
+++ b/ethosu/vela/tensor.py
@@ -81,16 +81,17 @@ class TensorPurpose(enum.IntFlag):
    FeatureMap = 2
    Scratch = 3
    LUT = 4
-    Size = 5
+    FSBias = 5
+    Size = 6

    def display_name(self):
-        return ("Unknown", "Weights", "FeatureMap", "Scratch", "LUT", "Size")[self.value]
+        return ("Unknown", "Weights", "FeatureMap", "Scratch", "LUT", "FastStorageBias", "Size")[self.value]

    def identifier_name(self):
-        return ("unknown", "weights", "feature_map", "scratch", "lut", "size")[self.value]
+        return ("unknown", "weights", "feature_map", "scratch", "lut", "fast_storage_bias", "size")[self.value]

    def all():
-        return (TensorPurpose.Weights, TensorPurpose.FeatureMap)
+        return (TensorPurpose.Weights, TensorPurpose.FeatureMap, TensorPurpose.FSBias)


class TensorSubPurpose(enum.Enum):
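Why Size moves from 5 to 6: display_name() and identifier_name() index a tuple by the member's integer value, so a new purpose has to slot in before the Size sentinel and both name tuples have to grow in step. A minimal standalone reproduction of that indexing pattern (the same idea, not Vela's full class):

```python
# Standalone sketch of the value-indexed name lookup used by TensorPurpose.
import enum


class TensorPurpose(enum.IntFlag):
    Unknown = 0
    Weights = 1
    FeatureMap = 2
    Scratch = 3
    LUT = 4
    FSBias = 5   # new member takes value 5 ...
    Size = 6     # ... so the Size sentinel is bumped to 6

    def display_name(self):
        # Indexing by value is why "FastStorageBias" must appear before "Size"
        return ("Unknown", "Weights", "FeatureMap", "Scratch", "LUT", "FastStorageBias", "Size")[self.value]


print(TensorPurpose.FSBias.display_name())  # -> FastStorageBias
```

The change to all() likewise ensures the new FSBias purpose is included wherever tensor purposes are enumerated.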
diff --git a/ethosu/vela/vela.py b/ethosu/vela/vela.py
index 6d54187..4b43751 100644
--- a/ethosu/vela/vela.py
+++ b/ethosu/vela/vela.py
@@ -152,6 +152,9 @@ def main(args=None):
        "--show-cpu-operations", action="store_true", help="Show the operations that fall back to the CPU"
    )
    parser.add_argument(
+        "--keep-scale-placement", action="store_true", help="Keep scale tensors memory placement during scheduling"
+    )
+    parser.add_argument(
        "--cascading",
        type=ast.literal_eval,
        default=True,
@@ -311,6 +314,7 @@ def main(args=None):
        use_ifm_streaming=args.ifm_streaming,
        pareto_metric=args.pareto_metric,
        use_nhcwb16_between_cascaded_passes=args.nhcwb16_between_cascaded_passes,
+        keep_scale_placement=args.keep_scale_placement,
    )

    model_reader_options = model_reader.ModelReaderOptions()
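The new option is a plain store_true flag, so moving bias tensors into SRAM remains the default and --keep-scale-placement opts out. A standalone sketch of that default/opt-out behaviour (a minimal parser for illustration, not Vela's full CLI):

```python
import argparse

# Minimal parser reproducing only the new option; store_true defaults to False,
# so bias/scale tensors are DMA'd into fast storage unless the user opts out.
parser = argparse.ArgumentParser(prog="vela")
parser.add_argument(
    "--keep-scale-placement", action="store_true", help="Keep scale tensors memory placement during scheduling"
)

assert parser.parse_args([]).keep_scale_placement is False
assert parser.parse_args(["--keep-scale-placement"]).keep_scale_placement is True
```

In vela.py the parsed value is forwarded to SchedulerOptions(keep_scale_placement=...), and schedule_passes() skips move_scales_to_fast_storage() when it is set.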
diff --git a/ethosu/vela/weight_compressor.py b/ethosu/vela/weight_compressor.py
index 9453521..b0187b6 100644
--- a/ethosu/vela/weight_compressor.py
+++ b/ethosu/vela/weight_compressor.py
@@ -404,7 +404,7 @@ def compress_weights(arch, nng, tens, npu_block_type, ofm_block_depth, ofm_depth

def calc_scales_and_pack_biases(tens, arch, ofm_depth_step, rescale_for_faf=False):
-    assert tens.purpose == TensorPurpose.FeatureMap
+    assert tens.purpose in [TensorPurpose.FeatureMap, TensorPurpose.FSBias]
    assert tens.format == TensorFormat.NHWC
    # the connected operator should expect a bias input unless it is a FullyConnected
    assert tens.consumer_list[0].type.needs_bias()
@@ -531,3 +531,9 @@ def update_pass_weight_and_scale_tensors(nng, arch):
                if (ps.ops[-1].type in activation_ops) and (ps.npu_block_type != NpuBlockType.ElementWise):
                    rescale_for_faf = True
                calc_scales_and_pack_biases(ps.scale_tensor, arch, ofm_depth_step, rescale_for_faf)
+                if ps.scale_tensor.ops[0].type == Op.DMA:
+                    src_tens = ps.scale_tensor.get_dma_src_tensor()
+                    src_tens.shape = ps.scale_tensor.shape
+                    src_tens.quant_values = ps.scale_tensor.quant_values
+                    src_tens.element_size_bytes = ps.scale_tensor.element_size_bytes
+                    src_tens.copy_compressed_weight_info(ps.scale_tensor)