diff options
-rw-r--r-- | ethosu/vela/compiler_driver.py | 5 |
-rw-r--r-- | ethosu/vela/live_range.py | 32 |
-rw-r--r-- | ethosu/vela/tflite_reader.py | 26 |
-rw-r--r-- | ethosu/vela/tflite_writer.py | 87 |
4 files changed, 97 insertions, 53 deletions
diff --git a/ethosu/vela/compiler_driver.py b/ethosu/vela/compiler_driver.py index cace0f08..61a3b0b1 100644 --- a/ethosu/vela/compiler_driver.py +++ b/ethosu/vela/compiler_driver.py @@ -233,7 +233,10 @@ def compiler_driver(nng, arch, options, scheduler_options, network_type, output_ sg, arch, scratch_tens, scratch_fast_tens, flash_tens ) - npu_serialisation.rewrite_npu_call_ops(root_sg, arch) + # Create list of CPU subgraphs with same order as the list of all subgraphs + cpu_subgraphs = [sg for sg in nng.subgraphs if sg.placement == PassPlacement.Cpu] + for sg in cpu_subgraphs: + npu_serialisation.rewrite_npu_call_ops(sg, arch) # Set Scratch and Fast_scratch Tensor size if scratch_tens is not None: diff --git a/ethosu/vela/live_range.py b/ethosu/vela/live_range.py index e683f9f5..9b6fe63d 100644 --- a/ethosu/vela/live_range.py +++ b/ethosu/vela/live_range.py @@ -1,4 +1,4 @@ -# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved. +# Copyright (C) 2020-2022 Arm Limited or its affiliates. All rights reserved. # # SPDX-License-Identifier: Apache-2.0 # @@ -224,18 +224,24 @@ def extract_live_ranges_from_cascaded_passes( rng = lr_graph.get_or_create_range(tens, cpu_tensor_alignment) rng.mark_usage(time_for_pass) - cps_primary_op = cps.passes[0].primary_op - - if ( - cps_primary_op - and cps_primary_op.type == Op.CustomNpuOp - and MemType.Permanent_CPU not in target_mem_type_set - ): - # If the primary-op is an NpuOp that means this is where an Npu subgraph - # is called. Go into said subgraph and extract live ranges before continuing. 
- # Use default allocation alignment of 16 for Npu tensors - npu_sg = cps_primary_op.attrs["subgraph"] - lr_graph = _extract_live_ranges_from_schedule(npu_sg, target_mem_area, target_mem_type_set, lr_graph) + op_subgraph = cps.passes[0].ops[0].attrs.get("subgraph", None) + op_type = cps.passes[0].ops[0].type + + if op_subgraph is not None and MemType.Permanent_CPU not in target_mem_type_set: + if op_type == Op.CustomNpuOp: + # If the primary-op is an NpuOp that means this is where an Npu subgraph + # is called. Go into said subgraph and extract live ranges before continuing. + # Use default allocation alignment of 16 for Npu tensors + lr_graph = _extract_live_ranges_from_schedule( + op_subgraph, target_mem_area, target_mem_type_set, lr_graph + ) + else: + # The op has one or more subgraphs in it (a typical op is the While op) + # Go into all subgraphs and extract live ranges before continuing. + for op_sg in op_subgraph: + lr_graph = extract_live_ranges_from_cascaded_passes( + op_sg, target_mem_area, target_mem_type_set, lr_graph, cpu_tensor_alignment + ) # Set the new time after handling the Npu subgraph time_for_pass = lr_graph.current_time cps.time = time_for_pass diff --git a/ethosu/vela/tflite_reader.py b/ethosu/vela/tflite_reader.py index 8dc5efe1..fa90ad9e 100644 --- a/ethosu/vela/tflite_reader.py +++ b/ethosu/vela/tflite_reader.py @@ -1,4 +1,4 @@ -# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved. +# Copyright (C) 2020-2022 Arm Limited or its affiliates. All rights reserved. 
# # SPDX-License-Identifier: Apache-2.0 # @@ -147,6 +147,15 @@ class TFLiteSubgraph: if opt_serializer is not None: op.attrs = opt_serializer.deserialize(op_data) + if op_type == Op.While: + # Attach the actual nng subgraphs to the op + cond_subgraph_index = op.attrs["cond_subgraph_index"] + body_subgraph_index = op.attrs["body_subgraph_index"] + op.attrs["subgraph"] = ( + self.graph.nng.subgraphs[cond_subgraph_index], + self.graph.nng.subgraphs[body_subgraph_index], + ) + if op_type == Op.Reshape and "new_shape" not in op.attrs: # Reshape should have an attrib "new_shape" but if it is missing, add it based on the output shape op.attrs["new_shape"] = outputs[0].shape @@ -223,16 +232,23 @@ class TFLiteGraph: parsing_step = "parsing subgraphs length" self.subgraphs = [] + + # Pre-allocate nng subgraphs - needed when parsing an operator and the operator + # has subgraph attributes. + self.nng = Graph(self.name, self.batch_size) + for idx in range(model.SubgraphsLength()): + sg = Subgraph() + self.nng.subgraphs.append(sg) + for idx in range(model.SubgraphsLength()): parsing_step = f"parsing subgraph {idx}" self.subgraphs.append(TFLiteSubgraph(self, model.Subgraphs(idx))) - self.nng = Graph(self.name, self.batch_size) - for tflite_sg in self.subgraphs: - sg = Subgraph(tflite_sg.name) + for idx, tflite_sg in enumerate(self.subgraphs): + sg = self.nng.subgraphs[idx] + sg.name = tflite_sg.name sg.original_inputs = tflite_sg.inputs # Preserve the original input order sg.output_tensors = tflite_sg.outputs - self.nng.subgraphs.append(sg) parsing_step = "parsing metadata length" # Preserve the original metadata diff --git a/ethosu/vela/tflite_writer.py b/ethosu/vela/tflite_writer.py index 7aab01f2..ce53f9b1 100644 --- a/ethosu/vela/tflite_writer.py +++ b/ethosu/vela/tflite_writer.py @@ -1,4 +1,4 @@ -# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved. +# Copyright (C) 2020-2022 Arm Limited or its affiliates. All rights reserved. 
# # SPDX-License-Identifier: Apache-2.0 # @@ -71,13 +71,19 @@ def make_vector(v): class TFLiteSerialiser: + + BUF_IDX_SCRATCH = 0 # Always assign scratch to buffer 0 + BUF_IDX_SCRATCH_FAST = 1 # Always assign scratch_fast to buffer 1 + BUF_IDX_START = 2 # Unique buffer id for every tensor in all subgraphs + def __init__(self, nng): self.builder = flatbuffers.Builder(0) self.nng = nng - self.scratch_buf_id = 0 # Always assign scratch to buffer 0 - self.scratch_fast_buf_id = 1 # Always assign scratch_fast to buffer 1 + self.buf_idx = TFLiteSerialiser.BUF_IDX_START self.buffers_to_write = [] # have an empty array there + self.tensor_map_all = [] # Keep track of all subgraphs + self.tensor_map_sg = [] # Keep track of one subgraph self.ops_to_ignore = (Op.Const, Op.Placeholder, Op.SubgraphInput) @@ -154,22 +160,20 @@ class TFLiteSerialiser: buffer_map = {} - buf_idx = 2 - for tens in tensors: # Set buffer ids depending on allocation if tens.is_allocated_in_tensor_arena(scratch_tensor_mem_area): - buffer_map[tens] = self.scratch_buf_id + buffer_map[tens] = TFLiteSerialiser.BUF_IDX_SCRATCH elif tens.mem_type == MemType.Scratch_fast: # For Scratch_fast when not co-allocated with scratch in the TensorArena: - buffer_map[tens] = self.scratch_fast_buf_id + buffer_map[tens] = TFLiteSerialiser.BUF_IDX_SCRATCH_FAST else: - buffer_map[tens] = buf_idx - buf_idx += 1 + buffer_map[tens] = self.buf_idx + self.buf_idx += 1 - # Initialize buffers_to_write to a length equal to number of buffers so + # Initialize/extend buffers_to_write to a length equal to number of buffers so # they can be appended at the correct index during tensor serialization - self.buffers_to_write = [None] * (buf_idx) + self.buffers_to_write += [None] * (self.buf_idx) return buffer_map @@ -281,13 +285,13 @@ class TFLiteSerialiser: builder = self.builder inputs_offset = self.write_int_vector( - [self.tensor_map[tens] if tens in self.tensor_map else -1 for tens in op.inputs] + [self.tensor_map_sg[tens] if tens in 
self.tensor_map_sg else -1 for tens in op.inputs] ) outputs_offset = self.write_int_vector( - [self.tensor_map[tens] for tens in op.outputs if tens in self.tensor_map] + [self.tensor_map_sg[tens] for tens in op.outputs if tens in self.tensor_map_sg] ) intermediates_offset = self.write_int_vector( - [self.tensor_map[tens] for tens in op.intermediates if tens in self.tensor_map] + [self.tensor_map_sg[tens] for tens in op.intermediates if tens in self.tensor_map_sg] ) if op.type == Op.Custom: @@ -331,9 +335,8 @@ class TFLiteSerialiser: Operator.OperatorAddMutatingVariableInputs(builder, mutating_variable_inputs_offset) return Operator.OperatorEnd(builder) - def serialise_subgraph(self, sg): + def serialise_subgraph(self, sg, name): builder = self.builder - tensor_set = set() all_ops = [] placeholder_ops = [] @@ -344,6 +347,14 @@ class TFLiteSerialiser: elif op.type == Op.Placeholder: placeholder_ops.append(op) + # Make sure all original tensors are written back, special case for Ops + # with connected subgraphs. Even though not all inputs are used, + # the reference kernel expects all inputs to be in the tflite file. + # Since we traverse the graph starting with all outputs they are + # always added but if an input is not referenced it will not be added + # to an op. + tensor_set = set(sg.original_inputs) + # Add the tensors from all valid ops, as well as the tensors from placeholder ops # This allows us to serialise tensors which arent attached to any specific ops, # e.g. 
due to an empty graph containing no ops @@ -362,18 +373,19 @@ class TFLiteSerialiser: assert len(scratch_tensors) == 1, "Multiple scratch tensors" scratch_tensor = scratch_tensors[0] - self.tensor_map = {tens: idx for idx, tens in enumerate(all_tensors)} + self.tensor_map_sg = {tens: idx for idx, tens in enumerate(all_tensors)} self.buffer_map = self.assign_buffers_to_tensors(all_tensors, scratch_tensor) + self.tensor_map_all.append(self.tensor_map_sg) tensors_offset = self.write_offset_vector([self.serialise_tensor(tens) for tens in all_tensors]) # Make sure the input_tensors haven't been modified assert all(inp in sg.original_inputs for inp in sg.input_tensors) - inputs = [self.tensor_map[tens] for tens in sg.original_inputs if tens in self.tensor_map] + inputs = [self.tensor_map_sg[tens] for tens in sg.original_inputs if tens in self.tensor_map_sg] inputs_offset = self.write_int_vector(inputs) outputs_offset = self.write_int_vector( - [self.tensor_map[tens] for tens in sg.output_tensors if tens in self.tensor_map] + [self.tensor_map_sg[tens] for tens in sg.output_tensors if tens in self.tensor_map_sg] ) operators_offset = self.write_offset_vector([self.serialise_operator(op) for op in all_ops]) @@ -384,6 +396,7 @@ class TFLiteSerialiser: SubGraph.SubGraphAddOutputs(builder, outputs_offset) SubGraph.SubGraphAddOperators(builder, operators_offset) + SubGraph.SubGraphAddName(builder, name) return SubGraph.SubGraphEnd(builder) @@ -427,26 +440,32 @@ class TFLiteSerialiser: description = builder.CreateString("Vela Optimised") - subgraph_offset = self.write_offset_vector([self.serialise_subgraph(sg) for sg in self.subgraphs_to_write]) + subgraph_offset = self.write_offset_vector( + [self.serialise_subgraph(sg, builder.CreateString(sg.name)) for sg in self.subgraphs_to_write] + ) # Fill the metadata buffer version = np.int32(0) - subgraph_idx = np.int32(len(self.subgraphs_to_write)) # Only 1 supported currently - nbr_tensors = np.int32(len(self.tensor_map)) + 
subgraph_idx = np.int32(len(self.subgraphs_to_write)) + + nbr_tensors_all = np.sum([len(tensor_map_sg) for tensor_map_sg in self.tensor_map_all], dtype=np.int32) + + offlineAlloc = [version, subgraph_idx, nbr_tensors_all] if not any([name == b"OfflineMemoryAllocation" for name, _ in self.nng.metadata]): - # An offset of -1 indicates that the tensor will be allocated online by Tensorflow Lite Micro - offsets = [np.int32(-1)] * nbr_tensors - - # Ensure that the order of the offsets match the order of the tensors - for tens, idx in self.tensor_map.items(): - # Set offsets for tensor allocated in Tensor Arena or in the scratch_fast area - if tens.mem_type in (MemType.Scratch, MemType.Scratch_fast): - offsets[idx] = np.int32(tens.address) if tens.address is not None else np.int32(0) - - self.nng.metadata.append( - ("OfflineMemoryAllocation", np.array([version, subgraph_idx, nbr_tensors] + offsets)) - ) + for tensor_map_sg in self.tensor_map_all: + nbr_tensors_sg = np.int32(len(tensor_map_sg)) + # An offset of -1 indicates that the tensor will be allocated online by Tensorflow Lite Micro + offsets = [np.int32(-1)] * nbr_tensors_sg + # Ensure that the order of the offsets match the order of the tensors + for tens, idx in tensor_map_sg.items(): + # Set offsets for tensor allocated in Tensor Arena or in the scratch_fast area + if tens.mem_type in (MemType.Scratch, MemType.Scratch_fast): + offsets[idx] = np.int32(tens.address) if tens.address is not None else np.int32(0) + + offlineAlloc += offsets + + self.nng.metadata.append(("OfflineMemoryAllocation", np.array(offlineAlloc))) metadata_list = [] for name, buffer in self.nng.metadata: |