1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
|
# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Vela separates CPU operations and NPU operations into separate internal subgraphs. The CPU operations are left
# untouched in the final output.
#
# Vela does this by identifying NPU passes and pulling them out from the main CPU graph into separate subgraphs, invoked
# by NpuOp operations. Later, Vela generates command streams and compressed weight streams for the NPU subgraphs and
# attaches them to the NpuOp. This encapsulates everything the NPU subgraph is supposed to do.
import numpy as np
from .nn_graph import Pass
from .nn_graph import PassPlacement
from .nn_graph import Subgraph
from .operation import CustomType
from .operation import NpuBlockType
from .operation import Op
from .operation import Operation
def make_npu_call_op_pass(npu_subgraph):
    """Create the pass that holds the op used to invoke ``npu_subgraph``.

    A ``CustomNpuOp`` operation named ``"call_" + npu_subgraph.name`` is created,
    tagged with the subgraph and with ``CustomType.NpuOp``, and made the only
    (and primary) op of a new ``MemoryOnly`` pass named after the op.

    Returns the new pass. Its inputs and outputs are left untouched here; they
    are filled in later as the graphs are cut.
    """
    call_name = "call_" + npu_subgraph.name
    call_op = Operation(Op.CustomNpuOp, call_name)
    call_op.attrs["subgraph"] = npu_subgraph
    call_op.attrs["custom_type"] = CustomType.NpuOp

    call_ps = Pass(call_op.name, PassPlacement.MemoryOnly, False, NpuBlockType.Default)
    call_ps.ops = [call_op]
    call_ps.primary_op = call_op
    # Link the op back to the pass that schedules it.
    call_op.scheduled_pass = call_ps
    return call_ps
def switch_tensor_for_op(op, orig_tens, new_tens):
    """Make ``op`` (and its scheduled pass, if any) refer to ``new_tens`` instead of ``orig_tens``.

    The input and output lists of the op are rebuilt with every entry equal to
    ``orig_tens`` replaced by ``new_tens``. If the op has a scheduled pass, the
    same substitution is applied to the pass's input/output lists and to its
    ``ifm_tensor``, ``ifm2_tensor``, ``ofm_tensor``, ``weight_tensor`` and
    ``scale_tensor`` attributes.

    The tensors' own producer/consumer lists are not modified here.
    """

    def substituted(tensors):
        # Always builds a fresh list; the list passed in is left unmodified.
        return [new_tens if tens == orig_tens else tens for tens in tensors]

    op.inputs = substituted(op.inputs)
    op.outputs = substituted(op.outputs)

    sched_ps = op.scheduled_pass
    if sched_ps is None:
        return

    sched_ps.inputs = substituted(sched_ps.inputs)
    sched_ps.outputs = substituted(sched_ps.outputs)

    # The pass also references individual tensors through dedicated attributes.
    for slot in ("ifm_tensor", "ifm2_tensor", "ofm_tensor", "weight_tensor", "scale_tensor"):
        if getattr(sched_ps, slot) == orig_tens:
            setattr(sched_ps, slot, new_tens)
def rewrite_tensor_cpu_producer_npu_consumers(
    orig_tens, call_ps, startup_init_ps, npu_subgraph, cpu_subgraph, subgraph_for_pass
):
    """Split a tensor produced outside ``npu_subgraph`` but consumed by passes inside it.

    ``orig_tens`` is cloned (suffix ``"_npu"``). The clone is produced by a new
    op appended to ``startup_init_ps``: a ``Const`` op when the first producer of
    ``orig_tens`` is a ``Const``, otherwise a ``SubgraphInput`` op. For the
    non-constant case ``orig_tens`` is also appended as an input of ``call_ps``
    and of its primary op. Every consumer of ``orig_tens`` whose pass maps to
    ``npu_subgraph`` in ``subgraph_for_pass`` is switched over to the clone.
    """
    producer_is_const = orig_tens.ops[0].type == Op.Const
    npu_tens = orig_tens.clone("_npu")

    input_op_type = Op.Const if producer_is_const else Op.SubgraphInput
    input_op = Operation(input_op_type, orig_tens.name + "_input")
    input_op.scheduled_pass = startup_init_ps
    input_op.set_output_tensor(npu_tens)
    startup_init_ps.ops.append(input_op)
    startup_init_ps.outputs.append(npu_tens)

    if not producer_is_const:
        call_ps.inputs.append(orig_tens)
        call_ps.primary_op.inputs.append(orig_tens)

    # Elementwise op can not overwrite ifm if input is used by many consumers
    shared_graph_input = orig_tens in cpu_subgraph.input_tensors and len(orig_tens.consumers()) > 1
    # Elementwise op can not overwrite ifm if tensor is used as output from sub graph
    is_graph_output = orig_tens in cpu_subgraph.output_tensors
    if shared_graph_input or is_graph_output:
        npu_tens.ifm_write_protected = True

    # Iterate over a snapshot because the consumer list is edited inside the loop.
    for consumer in list(orig_tens.consumers()):
        if consumer is None:
            continue  # Subgraph consumers handled separately.
        if subgraph_for_pass[consumer.scheduled_pass] == npu_subgraph:
            switch_tensor_for_op(consumer, orig_tens, npu_tens)
            orig_tens.consumer_list.remove(consumer)
            npu_tens.consumer_list.append(consumer)

    # Deal with output tensors for the NPU graph. These are special.
    npu_subgraph.output_tensors = [
        npu_tens if out_tens == orig_tens else out_tens for out_tens in npu_subgraph.output_tensors
    ]
    for out_tens in npu_subgraph.output_tensors:
        # Enforce output tensor from NPU graph to use normal NHWC output
        out_tens.needs_linear_format = True
def rewrite_tensor_npu_producer_cpu_consumers(
    orig_tens, call_ps, startup_init_ps, npu_subgraph, cpu_subgraph, subgraph_for_pass
):
    """Split a tensor produced inside ``npu_subgraph`` but needed outside it.

    ``orig_tens`` is cloned under its current name, then renamed with a ``"_cpu"``
    suffix and registered as an output of ``npu_subgraph``. The clone becomes an
    output of ``call_ps`` and of its primary op, which is set as the clone's only
    producer. Every consumer of ``orig_tens`` whose pass does not map to
    ``npu_subgraph`` in ``subgraph_for_pass`` is switched over to the clone, and
    ``orig_tens`` is replaced by the clone in ``cpu_subgraph.output_tensors``.

    ``startup_init_ps`` is not used by this function.
    """
    # Clone first so that the clone is created from the tensor's original name.
    outside_tens = orig_tens.clone("")
    orig_tens.name = orig_tens.name + "_cpu"
    npu_subgraph.output_tensors.append(orig_tens)

    call_op = call_ps.primary_op
    call_ps.outputs.append(outside_tens)
    call_op.outputs.append(outside_tens)
    outside_tens.ops = [call_op]

    # Iterate over a snapshot because the consumer list is edited inside the loop.
    for consumer in list(orig_tens.consumers()):
        if consumer is None:
            continue  # Subgraph consumers handled separately.
        if subgraph_for_pass[consumer.scheduled_pass] != npu_subgraph:
            switch_tensor_for_op(consumer, orig_tens, outside_tens)
            orig_tens.consumer_list.remove(consumer)
            outside_tens.consumer_list.append(consumer)

    # Deal with output tensors for the CPU graph. These are special.
    cpu_subgraph.output_tensors = [
        outside_tens if out_tens == orig_tens else out_tens for out_tens in cpu_subgraph.output_tensors
    ]
def extract_subgraph(nng, orig_sg, arch):
    """Move the NPU passes of a CPU subgraph into new NPU subgraphs.

    The passes of ``orig_sg`` are walked in order. Each maximal run of passes
    placed on the NPU is moved into a new ``Subgraph`` (named
    ``<orig name>_split_<n>``) that starts with its own startup-init pass, and is
    replaced in ``orig_sg`` by a call pass created with ``make_npu_call_op_pass``.
    Tensors crossing the boundary between ``orig_sg`` and a new subgraph are then
    rewritten so that each side refers to its own tensor.

    ``orig_sg`` must have CPU placement. ``orig_sg.passes`` is rebuilt in place.
    ``nng`` and ``arch`` are not used by this function.

    Returns the list of newly created NPU subgraphs; the list is empty (and
    ``orig_sg`` is left unmodified) when every pass ends up on the CPU.
    """
    assert orig_sg.placement == PassPlacement.Cpu

    passes = list(orig_sg.passes)
    place_vec = np.array([ps.placement for ps in passes])
    # Keep the startup init pass on the CPU, we'll make new ones to move onto NPU.
    place_vec[place_vec == PassPlacement.StartupInit] = PassPlacement.Cpu

    # MemoryOnly passes that are either squeezed between NPU passes or on the boundary of NPU and CPU
    # passes should be assigned to the NPU, unless they are assigned to run on CPU explicitly.
    # Forward, then backwards
    for is_reversed in (False, True):
        last_place = PassPlacement.Cpu
        seq = enumerate(place_vec)
        if is_reversed:
            seq = reversed(list(seq))
        for idx, place in seq:
            if place == PassPlacement.MemoryOnly and passes[idx].ops[0].run_on_npu:
                if last_place == PassPlacement.Npu:
                    place = PassPlacement.Npu
                    place_vec[idx] = place
            # Unresolved MemoryOnly passes do not change what counts as the neighbouring placement.
            if place != PassPlacement.MemoryOnly:
                last_place = place

    # Anything left, assign to the CPU.
    place_vec[place_vec == PassPlacement.MemoryOnly] = PassPlacement.Cpu

    if np.all(place_vec == PassPlacement.Cpu):
        return []  # Nothing to do

    # Create the subgraphs and split passes between them
    new_subgraphs = []
    split_count = 0
    subgraph_for_pass = {}  # pass -> subgraph that now owns it
    orig_sg.passes = []
    call_pass = {}  # NPU subgraph -> call pass in orig_sg
    startup_init_passes = {}  # NPU subgraph -> its startup init pass

    last_place = PassPlacement.Cpu
    curr_sg = orig_sg
    for idx, place in enumerate(place_vec):
        if place != last_place:
            if place == PassPlacement.Npu:
                # Start of a new run of NPU passes: open a new subgraph for it.
                split_count += 1
                curr_sg = Subgraph("%s_split_%d" % (orig_sg.name, split_count), PassPlacement.Npu)
                new_subgraphs.append(curr_sg)

                call_ps = make_npu_call_op_pass(curr_sg)
                subgraph_for_pass[call_ps] = orig_sg
                orig_sg.passes.append(call_ps)
                call_pass[curr_sg] = call_ps

                startup_init_ps = Pass(
                    curr_sg.name + "_startup_init", PassPlacement.StartupInit, False, NpuBlockType.Default
                )
                curr_sg.passes.append(startup_init_ps)
                startup_init_passes[curr_sg] = startup_init_ps
                subgraph_for_pass[startup_init_ps] = curr_sg
            else:
                curr_sg = orig_sg
            last_place = place
        ps = passes[idx]
        subgraph_for_pass[ps] = curr_sg
        curr_sg.passes.append(ps)

    # Rewrite tensors to fix up graphs.
    for curr_sg in new_subgraphs:
        call_ps = call_pass[curr_sg]
        startup_init_ps = startup_init_passes[curr_sg]
        for ps in curr_sg.passes:
            for tens in ps.inputs:
                source_sgs = [subgraph_for_pass[op.scheduled_pass] for op in tens.ops]
                # Every input tensor needs at least one producer; the first one is read below.
                assert source_sgs, "Input tensor %s has no producer ops" % (tens.name,)
                producer_sg = source_sgs[0]
                assert all(sg == producer_sg for sg in source_sgs)  # All need to be the same.

                if producer_sg != curr_sg:
                    # Because we go in-order, all the producers must be the original graph.
                    assert producer_sg == orig_sg
                    rewrite_tensor_cpu_producer_npu_consumers(
                        tens, call_ps, startup_init_ps, curr_sg, orig_sg, subgraph_for_pass
                    )

            for tens in ps.outputs:
                dest_sgs = [subgraph_for_pass[op.scheduled_pass] for op in tens.consumers() if op is not None]
                # A tensor must be visible outside the NPU subgraph when any consumer lives in
                # another subgraph or when it is an output of the original graph.
                need_rewrite = any(sg != curr_sg for sg in dest_sgs) or tens in orig_sg.output_tensors
                if need_rewrite:
                    rewrite_tensor_npu_producer_cpu_consumers(
                        tens, call_ps, startup_init_ps, curr_sg, orig_sg, subgraph_for_pass
                    )

    return new_subgraphs
def extract_npu_subgraphs(nng, arch):
    """Split every CPU subgraph of ``nng`` into CPU and NPU subgraphs.

    Each subgraph with CPU placement is passed to ``extract_subgraph`` and the
    NPU subgraphs it returns are added to ``nng.subgraphs``. The graph is
    refreshed before and after the extraction, the startup init passes are
    pruned, and the pass links of every subgraph are rebuilt.
    """
    nng.refresh_after_modification()

    # Select the candidates up front: subgraphs added below are never revisited.
    cpu_subgraphs = [sg for sg in nng.subgraphs if sg.placement == PassPlacement.Cpu]
    for cpu_sg in cpu_subgraphs:
        extracted = extract_subgraph(nng, cpu_sg, arch)
        nng.subgraphs += extracted

    nng.refresh_after_modification()
    nng.prune_startup_init_pass()

    for sg in nng.subgraphs:
        sg.build_pass_links()
|