# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Build a live range graph for tensors in one or more subgraphs. Used for tensor allocation as well as in the scheduler.
# Can work with either a pass packed subgraph or a scheduled subgraph.
from .high_level_command_stream_generator import calc_allowed_ofm_ifm_overlap_for_cascaded_pass
from .nn_graph import PassPlacement
from .tensor import MemType
from .tensor import Tensor
class LiveRange:
    """Lifetime of one or more tensors that share a single allocation.

    Every tensor attached to the same LiveRange is later assigned the same
    address; the range is sized and named after the first tensor added.
    """

    def __init__(self, tens, alignment):
        # Tensors that are assigned to the same LiveRange will be allocated
        # to the same address.
        self.tensors = []
        self.start_time = 99999999999  # effectively +infinity until marked
        self.end_time = -1
        self.size = 0
        self.name = ""
        self.alignment = alignment
        if tens:
            self.add_tensor(tens)

    def __str__(self):
        return "<live_range.LiveRange: '%s' start_time=%s, end_time=%s>" % (self.name, self.start_time, self.end_time)

    __repr__ = __str__

    def add_tensor(self, tens):
        """Attach a tensor; the first tensor added defines size and name."""
        if self.size == 0:
            self.size = tens.storage_size()
            # LiveRange will be named after the first tensor added
            self.name = tens.name
        else:
            assert (
                self.size >= tens.storage_size()
            ), "Tensors assigned to the same LiveRange need to fit the size of the LiveRange."
        self.tensors.append(tens)

    def mark_usage(self, op_time):
        """Extend the live interval to cover the slot [op_time, op_time + 1)."""
        if op_time == -1:
            # Sentinel for "no valid time"; leave the interval untouched.
            return
        self.start_time = min(self.start_time, op_time)
        self.end_time = max(self.end_time, op_time + 1)

    def overlaps_ranges(self, other):
        """Return True when the two live intervals intersect in time."""
        return max(self.start_time, other.start_time) < min(self.end_time, other.end_time)

    def overlaps_address(self, other):
        """Return (True, tens, other_tens) for the first pair of tensors in
        this LiveRange and 'other' whose address spans overlap, else
        (False, None, None)."""
        for own_tens in self.tensors:
            own_start = own_tens.address
            own_end = own_start + self.size
            for other_tens in other.tensors:
                if max(own_start, other_tens.address) < min(own_end, other_tens.address + other.size):
                    return True, own_tens, other_tens
        return False, None, None

    def __lt__(self, other):
        # Order by start time, then end time, then size, then name.
        for mine, theirs in (
            (self.start_time, other.start_time),
            (self.end_time, other.end_time),
            (self.size, other.size),
        ):
            if mine != theirs:
                return mine < theirs
        return self.name < other.name

    def set_address(self, address):
        """Assign 'address' to every tensor in the LiveRange and return it."""
        for tens in self.tensors:
            tens.address = address
        return address

    def get_alignment(self):
        return self.alignment

    def set_alignment(self, alignment):
        # Alignment only grows: keep the strictest requirement seen so far.
        self.alignment = max(self.alignment, alignment)
def merge_memory_op_ranges(sg, lr_graph, tensor_should_be_ignored, target_mem_area):
    """Fuse input and output tensors of memory-only passes (e.g. Reshape)
    into one LiveRange each, so both tensors get the same address."""
    for ps in sg.passes:
        if ps.placement != PassPlacement.MemoryOnly:
            continue
        in_tens = ps.inputs[0]
        out_tens = ps.outputs[0]
        # NOTE: keep the short-circuit — tensor_should_be_ignored may record
        # tensors in lr_graph.ignore_tensors as a side effect.
        if not tensor_should_be_ignored(in_tens, target_mem_area) and not tensor_should_be_ignored(
            out_tens, target_mem_area
        ):
            lr_graph.fuse_ranges(in_tens, out_tens)
class LiveRangeGraph:
    """Collection of LiveRanges for one or more subgraphs, keyed by tensor."""

    def __init__(self):
        self.ranges = {}  # tens -> range
        self.allowed_overlaps = {}  # (tens, tens) -> overlap_int
        self.ignore_tensors = set()
        self.processed_subgraphs = set()
        self.current_time = 0

    def get_or_create_range(self, tens, alignment=Tensor.AllocationQuantum):
        """Return the live range of the tensor (or any of its clones),
        creating a new one if none exists yet."""
        for existing, live_range in self.ranges.items():
            if tens.equivalent(existing):
                # A clone already has a range; tighten its alignment and reuse.
                live_range.set_alignment(alignment)
                return live_range
        # No live range found for the tensor, create a new one
        new_range = LiveRange(tens, alignment)
        self.ranges[tens] = new_range
        return new_range

    def fuse_ranges(self, in_tens, out_tens):
        """Attach out_tens to in_tens' LiveRange so both share an address."""
        live_range = self.get_or_create_range(in_tens)
        assert out_tens not in self.ranges, out_tens
        live_range.add_tensor(out_tens)
        self.ranges[out_tens] = live_range
        return live_range
def extract_live_ranges_from_passes(
    sg,
    target_mem_area,
    mark_output_tensors_overlapping_with_input_tensors=False,
    ignore_subgraph_input_output_tensors=False,
):
    """Build a LiveRangeGraph for a pass-packed subgraph.

    Pass number idx occupies time slot 2 * idx; element-wise outputs may be
    pushed one slot later so they do not overlap their inputs.  Subgraph
    output tensors are kept live until after the last pass.
    """
    lr_graph = LiveRangeGraph()
    if ignore_subgraph_input_output_tensors:
        lr_graph.ignore_tensors.update(sg.input_tensors)
        lr_graph.ignore_tensors.update(sg.output_tensors)

    def tensor_should_be_ignored(tens, target_mem_area):
        if tens.mem_area != target_mem_area:
            return True
        if tens in lr_graph.ignore_tensors:
            return True
        if tens.name.endswith("reshape_shape_npu"):
            # Reshape tensor, no need to allocate
            lr_graph.ignore_tensors.add(tens)
            return True
        return False

    # Merge only memory operations in the NPU subgraphs
    if sg.placement == PassPlacement.Npu:
        merge_memory_op_ranges(sg, lr_graph, tensor_should_be_ignored, target_mem_area)

    for idx, ps in enumerate(sg.passes):
        ps.time = 2 * idx
        pass_time = ps.time

        # Inputs and intermediates are live during the pass itself
        # (inputs first, preserving the original visiting order).
        for tens in list(ps.inputs) + list(ps.intermediates):
            if tensor_should_be_ignored(tens, target_mem_area):
                continue
            lr_graph.get_or_create_range(tens).mark_usage(pass_time)

        for tens in ps.outputs:
            if tensor_should_be_ignored(tens, target_mem_area):
                continue
            output_time = pass_time
            # Element-wise outputs get the next slot so they don't overlap
            # their inputs, unless overlap was explicitly requested.
            if ps.is_element_wise and not mark_output_tensors_overlapping_with_input_tensors:
                output_time += 1
            lr_graph.get_or_create_range(tens).mark_usage(output_time)

    # Subgraph outputs stay live until after the final pass.
    end_time = 2 * len(sg.passes)
    for tens in sg.output_tensors:
        if tensor_should_be_ignored(tens, target_mem_area):
            continue
        lr_graph.get_or_create_range(tens).mark_usage(end_time)
    return lr_graph
def extract_live_ranges_from_cascaded_passes(
    sg,
    target_mem_area,
    target_mem_type_set,
    mark_output_tensors_overlapping_with_input_tensors=False,
    use_ifm_ofm_overlap=True,
    ignore_subgraph_input_output_tensors=False,
    lr_graph=None,
    allocation_alignment=Tensor.AllocationQuantum,
):
    """Build (or extend) a LiveRangeGraph for a scheduled subgraph.

    Walks the cascaded passes in order, advancing lr_graph.current_time by 2
    per pass, and records for every tensor in the targeted memory area/type
    the time interval during which it must be resident.  Recurses into an
    NPU subgraph when a pass' primary op is an "NpuOp".  May be called
    repeatedly with the same lr_graph to merge several subgraphs; an already
    processed subgraph is skipped.

    Args:
        sg: subgraph whose cascaded passes are traversed.
        target_mem_area: only tensors in this memory area are considered.
        target_mem_type_set: only tensors with mem_type in this set are considered.
        mark_output_tensors_overlapping_with_input_tensors: when True,
            element-wise outputs share their inputs' time slot instead of
            being pushed one slot later.
        use_ifm_ofm_overlap: when True, record the allowed IFM/OFM address
            overlap per cascaded pass in lr_graph.allowed_overlaps.
        ignore_subgraph_input_output_tensors: when True, exclude the
            subgraph's own input/output tensors from allocation.
        lr_graph: existing LiveRangeGraph to extend, or None to create one.
        allocation_alignment: address alignment for created live ranges.

    Returns:
        The (possibly newly created) LiveRangeGraph.
    """
    if lr_graph is None:
        lr_graph = LiveRangeGraph()

    if sg in lr_graph.processed_subgraphs:
        # if subgraph has been processed already, return the lr_graph as is
        return lr_graph

    if ignore_subgraph_input_output_tensors:
        lr_graph.ignore_tensors.update(sg.input_tensors)
        lr_graph.ignore_tensors.update(sg.output_tensors)

    def tensor_should_be_ignored(tens, target_mem_area, target_mem_type_set):
        # A tensor is excluded when it lives outside the targeted memory
        # area/type, or has been explicitly marked to be ignored.  Note the
        # side effect: reshape-shape tensors are added to ignore_tensors.
        if tens.mem_area != target_mem_area or tens.mem_type not in target_mem_type_set:
            return True
        if tens in lr_graph.ignore_tensors:
            return True
        if tens.name.endswith("reshape_shape_npu"):
            # Reshape tensor, no need to allocate
            lr_graph.ignore_tensors.add(tens)
            return True
        return False

    def merge_memory_op_ranges(sg, lr_graph, tensor_should_be_ignored, target_mem_area, target_mem_type_set):
        # Local variant of the module-level helper, extended with the
        # mem_type filter; intentionally shadows the module-level name.
        for ps in sg.passes:
            if ps.placement == PassPlacement.MemoryOnly:
                # For memory only passes, e.g. Reshape. Add input and output tensor to the same LiveRange
                input_tensor = ps.inputs[0]
                output_tensor = ps.outputs[0]
                if not tensor_should_be_ignored(input_tensor, target_mem_area, target_mem_type_set) and not (
                    tensor_should_be_ignored(output_tensor, target_mem_area, target_mem_type_set)
                ):
                    lr_graph.fuse_ranges(input_tensor, output_tensor)

    # Merge only memory operations in the NPU subgraphs
    if sg.placement == PassPlacement.Npu:
        merge_memory_op_ranges(sg, lr_graph, tensor_should_be_ignored, target_mem_area, target_mem_type_set)

    for cps in sg.cascaded_passes:
        cps.time = lr_graph.current_time

        time_for_pass = cps.time

        is_element_wise = cps.is_element_wise

        # Inputs are live while the cascaded pass executes.
        for tens in cps.inputs:
            if tensor_should_be_ignored(tens, target_mem_area, target_mem_type_set):
                continue
            rng = lr_graph.get_or_create_range(tens, allocation_alignment)
            rng.mark_usage(time_for_pass)

        cps_primary_op = cps.passes[0].primary_op

        if cps_primary_op and cps_primary_op.type == "NpuOp" and MemType.Permanent_CPU not in target_mem_type_set:
            # If the primary-op is an NpuOp that means this is where an Npu subgraph
            # is called. Go into said subgraph and extract live ranges before continuing.
            # Use default allocation alignment of 16 for Npu tensors
            npu_sg = cps_primary_op.attrs["subgraph"]
            lr_graph = extract_live_ranges_from_cascaded_passes(
                npu_sg,
                target_mem_area,
                target_mem_type_set,
                mark_output_tensors_overlapping_with_input_tensors,
                use_ifm_ofm_overlap,
                False,
                lr_graph,
            )
            # Set the new time after handling the Npu subgraph
            time_for_pass = lr_graph.current_time
            cps.time = time_for_pass

        for tens in cps.intermediates:
            if tensor_should_be_ignored(tens, target_mem_area, target_mem_type_set):
                continue
            rng = lr_graph.get_or_create_range(tens, allocation_alignment)
            rng.mark_usage(time_for_pass)

        for tens in cps.outputs:
            if tensor_should_be_ignored(tens, target_mem_area, target_mem_type_set):
                continue
            rng = lr_graph.get_or_create_range(tens, allocation_alignment)
            output_time = time_for_pass
            # Element-wise outputs are pushed one slot later so they do not
            # overlap their inputs, unless overlap was explicitly requested.
            if not mark_output_tensors_overlapping_with_input_tensors and is_element_wise:
                output_time += 1
            rng.mark_usage(output_time)

        if use_ifm_ofm_overlap:
            # fill allowed overlap for ifm and ofm tensor
            ifm_tensor = cps.passes[0].ifm_tensor
            ofm_tensor = cps.passes[-1].ofm_tensor
            if (
                ifm_tensor is not None
                and ofm_tensor is not None
                and not tensor_should_be_ignored(ifm_tensor, target_mem_area, target_mem_type_set)
                and not tensor_should_be_ignored(ofm_tensor, target_mem_area, target_mem_type_set)
            ):
                lr_graph.allowed_overlaps[(ifm_tensor, ofm_tensor)] = calc_allowed_ofm_ifm_overlap_for_cascaded_pass(
                    cps
                )

        # Each cascaded pass occupies two time slots.
        lr_graph.current_time += 2

    end_time = 0
    for rng in lr_graph.ranges.values():
        # Find the maximum end time of all live-ranges in the graph
        end_time = max(end_time, rng.end_time)

    # Subgraph outputs must stay live until everything has finished.
    for tens in sg.output_tensors:
        if tensor_should_be_ignored(tens, target_mem_area, target_mem_type_set):
            continue
        rng = lr_graph.get_or_create_range(tens, allocation_alignment)
        rng.mark_usage(end_time)

    # Add subgraph to set of processed subgraphs
    lr_graph.processed_subgraphs.add(sg)
    return lr_graph
|