src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453

/*
 * Copyright (c) 2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLFUSEDKERNELGRAPH_H
#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLFUSEDKERNELGRAPH_H
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/experimental/DependencyGraph.h"
#include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h"
#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h"
#include "support/DeepCopy.h"

#include <algorithm>
#include <iterator>
#include <map>
#include <utility>
#include <vector>

namespace arm_compute
{
namespace experimental
{
namespace dynamic_fusion
{
struct ClKernelFusionGroup;

/** A const view of a subgraph of the @ref ClKernelGraph to be fused together
 *
 * The group only stores pointers to the kernels and tensors it refers to, keyed by the ids handed out by its own
 * @ref DependencyGraph; it never deletes them.
 */
struct ClKernelFusionGroup
{
public:
    using Id = DependencyGraph::Id;

    ClKernelFusionGroup() = default;
    /** Construct an empty fusion group
     *
     * @param[in] id Id of the fusion group
     */
    ClKernelFusionGroup(Id id)
        : id{ id }, graph{}, fused_kernels{}, tensors{}
    {
    }
    ~ClKernelFusionGroup() = default;

    /** Overwrite the id of the fusion group
     *
     * @param[in] i New id
     */
    void set_id(Id i)
    {
        id = i;
    }

    /** Add a kernel, together with all of its source and destination tensors, to the fusion group
     *
     * Tensors that are already registered in the group keep their existing entry.
     *
     * @param[in] kernel Kernel to add. Only the pointer is stored
     *
     * @return Id of the kernel within @ref graph
     */
    Id add_fused_kernel(const ClKernel *kernel)
    {
        /// PRE: Acyclicity ensured by DependencyGraph
        /// PRE: Connectedness ensured by DependencyGraph
        /// PRE: Single-rootedness ensured by User

        // Shared by the src and the dst tensor lists: add every tensor to the group graph and remember its pointer
        const auto register_tensors = [this](const auto &kernel_tensors)
        {
            std::vector<Id> tensor_ids;
            for(const auto t : kernel_tensors)
            {
                const auto tensor_id = graph.add_tensor(t->id);
                // emplace leaves the map untouched if the tensor has been registered before
                tensors.emplace(tensor_id, t);
                tensor_ids.push_back(tensor_id);
            }
            return tensor_ids;
        };
        // Source tensors are registered before destination tensors
        const std::vector<Id> src_tensors = register_tensors(kernel->tensors().get_const_src_tensors());
        const std::vector<Id> dst_tensors = register_tensors(kernel->tensors().get_const_dst_tensors());

        const auto new_op = graph.add_operator(src_tensors, dst_tensors);
        // The PREs above promise that adding the operator cannot fail; surface a violation instead of silently
        // storing the kernel under whatever id a failed insertion returns
        ARM_COMPUTE_ERROR_ON(!bool(new_op.first));
        fused_kernels[new_op.second] = kernel;
        return new_op.second;
    }

    /** Get the root kernel of the fusion group
     *
     * The group is expected to have exactly one root operator in @ref graph.
     *
     * @return Pointer to the root kernel
     */
    const ClKernel *get_root_kernel() const
    {
        auto root_kernels = graph.get_root_ops();
        ARM_COMPUTE_ERROR_ON(root_kernels.size() != 1);
        return fused_kernels.at(root_kernels.at(0));
    }

    /** Get the tensors that @ref graph reports as source tensors of the group
     *
     * @return Pointers to the source tensors, in the order returned by @ref graph
     */
    std::vector<const ClKernelTensor *> get_src_tensors() const
    {
        std::vector<const ClKernelTensor *> src_tensors;
        for(auto tensor_id : graph.src_tensors())
        {
            src_tensors.push_back(tensors.at(tensor_id));
        }
        return src_tensors;
    }

    /** Get the tensors that @ref graph reports as destination tensors of the group
     *
     * @return Pointers to the destination tensors, in the order returned by @ref graph
     */
    std::vector<const ClKernelTensor *> get_dst_tensors() const
    {
        std::vector<const ClKernelTensor *> dst_tensors;
        for(auto tensor_id : graph.dst_tensors())
        {
            dst_tensors.push_back(tensors.at(tensor_id));
        }
        return dst_tensors;
    }

    /** Two fusion groups are equal if their ids, graphs, kernel pointers and tensor pointers are all equal */
    friend bool operator==(const ClKernelFusionGroup &fg0, const ClKernelFusionGroup &fg1)
    {
        return fg0.id == fg1.id && fg0.graph == fg1.graph && fg0.fused_kernels == fg1.fused_kernels && fg0.tensors == fg1.tensors;
    }

    Id              id{};    // Id of this fusion group
    DependencyGraph graph{}; // A subgraph of the original ClKernelGraph
    std::map<Id, const ClKernel *>       fused_kernels{}; // Kernels of the group, keyed by their operator id in graph
    std::map<Id, const ClKernelTensor *> tensors{};       // Tensors of the group, keyed by their tensor id in graph
};

/** Collect the kernels of a fusion group into a flat list
 *
 * NOTE(review): the definition is not part of this header; the order of the returned kernels is presumably a
 *               topological order of @p group 's graph - confirm against the implementation.
 *
 * @param[in] group Fusion group to traverse
 *
 * @return Pointers to the kernels of @p group
 */
std::vector<const ClKernel *> traverse(const ClKernelFusionGroup &group);

struct ClFusedKernelGraph
{
public:
    using Id = DependencyGraph::Id;

    using KernelFusionGroupMap = std::map<Id, utils::memory::deep_unique_ptr<ClKernelFusionGroup>>;

    ClFusedKernelGraph()                                = default;
    ~ClFusedKernelGraph()                               = default;
    ClFusedKernelGraph(const ClFusedKernelGraph &graph) = default;
    ClFusedKernelGraph &operator=(const ClFusedKernelGraph &graph) = default;
    ClFusedKernelGraph(ClFusedKernelGraph &&graph)                 = default;
    ClFusedKernelGraph &operator=(ClFusedKernelGraph &&graph) = default;

    friend bool operator==(const ClFusedKernelGraph &graph0, const ClFusedKernelGraph &graph1)
    {
        /// NOTE: fg_dependency may change based on the order of fusion, and thus is omitted in the comparison.
        ///       The fusion groups can already guarantee the equivalence of fusion
        ///       In the future we may want to enforce a stronger equivalence by implementing topological comparison between @ref DependencyGraph s
        return graph0.original_graph == graph1.original_graph && graph0.fusion_groups == graph1.fusion_groups;
    }

    Id add_fusion_group(const std::vector<const ClKernel *> &fused_kernels)
    {
        auto fg = utils::memory::make_deep_unique<ClKernelFusionGroup, ClKernelFusionGroup>();
        for(const auto k : fused_kernels)
        {
            fg->add_fused_kernel(k);
        }
        const auto      src_tensors = fg->get_src_tensors();
        const auto      dst_tensors = fg->get_dst_tensors();
        std::vector<Id> inputs{};
        std::transform(std::begin(src_tensors), std::end(src_tensors), std::back_inserter(inputs), [this](auto kernel)
        {
            return fg_dependency.add_tensor(kernel->id);
        });
        std::vector<Id> outputs{};
        std::transform(std::begin(dst_tensors), std::end(dst_tensors), std::back_inserter(outputs), [this](auto kernel)
        {
            return fg_dependency.add_tensor(kernel->id);
        });
        const auto id = fg_dependency.add_operator(inputs, outputs);
        fg->set_id(id.second);
        fusion_groups[id.second] = std::move(fg);
        return id.second;
    }

    Status fuse(ClKernelFusionGroup &fg0, ClKernelFusionGroup &fg1)
    {
        /// PRE: Already checked by can_fuse, and thus all the INVs and ASSUMPTIONS still hold
        ClKernelFusionGroup *fg_src{};
        ClKernelFusionGroup *fg_dst{};
        // Find fg_src (parent / root) and fg_dst (child / non-root)
        if(is_in(fg1.id, fg_dependency.dst_ops(fg0.id)))
        {
            fg_src = &fg0;
            fg_dst = &fg1;
        }
        else if(is_in(fg0.id, fg_dependency.dst_ops(fg1.id)))
        {
            fg_src = &fg1;
            fg_dst = &fg0;
        }
        else
        {
            return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: Not directly connected fusion groups cannot be fused together" };
        }

        for(const auto &t : fg_dependency.src_tensors(fg_dst->id))
        {
            if(!is_in(t, fg_dependency.dst_tensors(fg_src->id)))
            {
                // Link any incoming tensors of fg_dst, that ARE NOT in between fg_src and fg_dst, to fg_src

                // Before:
                // fg_src
                // |
                // ..          t1
                // |           |
                // -> fg_dst <-
                //
                // After:
                // fg_src <---t1
                //
                const auto st = link_src_tensors(fg_src->id, { t });
                if(!bool(st))
                {
                    return st;
                }
            }
            else
            {
                const auto dst_fgs = fg_dependency.dst_ops_from_tensor(t);
                if(dst_fgs.size() == 1U && dst_fgs.at(0) == fg_dst->id)
                {
                    // Remove any incoming tensors of fg_dst, that ARE in between fg_src and fg_dst
                    // AND that are not connected to any other outgoing fgs (Note that they cannot connect to any other incoming fgs as all tensors can have at most 1 incoming fg (ASSUMPTION 3))

                    // Before:
                    // fg_src
                    // |
                    // t0
                    // |
                    // -> fg_dst
                    //
                    // After:
                    // fg_src
                    //
                    const auto st = remove_fg_tensor(t);
                    if(!bool(st))
                    {
                        return st;
                    }
                }
                else
                {
                    // If the tensors ARE in between fg_src and fg_dst
                    // BUT have any other outgoing fgs than fg_dst, then we leave it as a dst tensor to the fused fg_src

                    // Before:
                    // fg_src
                    // |
                    // t0
                    // |
                    // |-----------
                    // |          |
                    // -> fg_dst  -> fg_other
                    //
                    // After:
                    // fg_src
                    // |
                    // t0
                    // |
                    // -> fg_other
                    //

                    // Note that this may seem like a case we shouldn't fuse. But actually all it means is that t0 is an
                    // intermediate tensor between the fused fg_src and fg_dst, but only that we also STORE it to memory
                    // so that any unfused fg's (fg_other in this case) can read it.
                    // So all this means that we not only can STORE the tensors at the "end" of a fusion group,
                    // but also any other tensors that are not source tensors. And all tensors that are STORED (exported),
                    // can be termed "dst tensors" to a fusion group
                    void();
                }
            }
        }

        for(const auto &t : fg_dependency.dst_tensors(fg_dst->id))
        {
            // Link any outgoing tensors of fg_dst to fg_src

            // Before:
            // fg_src
            // |
            // ..
            // |
            // -> fg_dst
            //    |
            //    |--------
            //    |       |
            //    |-> t0  |-> t1
            //
            // After:
            // fg_src
            // |
            // |--------
            // |       |
            // |-> t0  |-> t1
            //
            const auto st = link_dst_tensors(fg_src->id, { t });
            if(!bool(st))
            {
                return st;
            }
        }

        // Merge fg_dst's graph into fg_src's graph
        for(const auto kernel : traverse(*fg_dst))
        {
            fg_src->add_fused_kernel(kernel);
        }

        const auto st = remove_fg(fg_dst->id);
        return st;
    }
    Status can_fuse(const ClKernelFusionGroup &fg0, const ClKernelFusionGroup &fg1) const
    {
        /// ASSUMPTION0: All tensors have 0 or 1 incoming kernel
        /// ASSUMPTION1: All kernels have exactly 1 dst tensor (Temporary, can be lifted once we start supporting multi-dst kernels)
        ///              Note that this does not apply to fusion groups
        /// ASSUMPTION2: Simple kernels' tile infos can be overriden (share with) that of the root kernel's
        /// ASSUMPTION3: Extension of ASSUMPTION0: All tensors have 0 or 1 incoming fusion group
        /// INV0: All Fusion groups have a single root
        /// INV1: All Fusion groups have no cycles or loops within themselves <- guaranteed by the underlying ClKernelGraph having no cycles or loops; enforced by DependencyGraph
        /// INV2: The ClKernelFusionGroup itself has no cycles or loops <- enforced by DependencyGraph
        /// INV3: All non-roots are Simple kernels
        /// INV4: All non roots' dst tensors have the same shape as that of the root kernel
        /// INV5: All kernels within a fusion group have the same UnitWorkloadStage
        const ClKernelFusionGroup *fg_src {};
        const ClKernelFusionGroup *fg_dst{};

        // Check 0: Ensure fg0 and fg1 are "directly connected": one of them is a direct parent of the other
        // This guarantess INV0
        // This also finds fg_src (parent / root) and fg_dst (child / non-root)
        if(is_in(fg1.id, fg_dependency.dst_ops(fg0.id)))
        {
            fg_src = &fg0;
            fg_dst = &fg1;
        }
        else if(is_in(fg0.id, fg_dependency.dst_ops(fg1.id)))
        {
            fg_src = &fg1;
            fg_dst = &fg0;
        }
        else
        {
            return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: Not directly connected fusion groups cannot be fused together" };
        }

        // Find unconnected tensors between fg_src and fg_dst
        std::vector<Id> unconnected_tensors{};
        for(const auto &t : fg_dependency.dst_tensors(fg_src->id))
        {
            if(!is_in(t, fg_dependency.src_tensors(fg_dst->id)))
            {
                unconnected_tensors.push_back(t);
            }
        }

        // Check 1: Any unconnected tensor cannot be an ancestor of fg_dst
        // This guarantees INV2: That is, the fused graph does not have any cycles or loops between different fusion groups
        for(const auto &t : unconnected_tensors)
        {
            if(fg_dependency.path_exists_from_tensor_to_op(t, fg_dst->id))
            {
                return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: the fusion would result in cycles or loops" };
            }
        }

        // Check 2: All non-root fgs are simple. Ensure INV3
        if(fg_dst->get_root_kernel()->complexity() != Complexity::Simple)
        {
            return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: only root kernel can be a complex kernel" };
        }

        // Check 3: All non roots' dst tensors have the same shape as that of the root kernel. Ensure INV4
        const auto root_kernel_dst_tensors = fg_dependency.dst_tensors(fg_src->id);
        ARM_COMPUTE_ERROR_ON(root_kernel_dst_tensors.size() != 1); // (ASSUMPTION 1: All kernels have exactly 1 dst tensor)
        const auto root_kernel_dst_tensor_info = original_graph->get_tensor(root_kernel_dst_tensors[0])->desc;

        for(const auto &t : fg_dependency.dst_tensors(fg_dst->id))
        {
            const auto t_info = original_graph->get_tensor(t)->desc;
            if(detail::have_different_dimensions(root_kernel_dst_tensor_info->tensor_shape(), t_info->tensor_shape(), 0))
            {
                return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: all non roots' dst tensors should have the same shape as that of the root kernel" };
            }
        }

        // Check 4: All kernels within a fg have the same UnitWorkloadStage. Ensure INV5
        if(!(fg_src->get_root_kernel()->config().stage == fg_dst->get_root_kernel()->config().stage))
        {
            return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: all kernels within a fusion group should have the same UnitWorkloadStage" };
        }

        return Status{};
    }

    const ClKernelGraph *original_graph{};
    DependencyGraph      fg_dependency{};
    KernelFusionGroupMap fusion_groups{};
    // Note: no need to store tensors pointers in the ClFusedKernelGraph, as they are stored in side the individual fusion groups.

private:
    Status link_src_tensors(Id fg, const std::vector<Id> &src_tensors)
    {
        for(auto t : src_tensors)
        {
            fg_dependency.link_input(fg, t);
        }
        return Status{};
    }
    Status link_dst_tensors(Id fg, const std::vector<Id> &dst_tensors)
    {
        for(auto t : dst_tensors)
        {
            fg_dependency.link_output(fg, t);
        }
        return Status{};
    }
    Status remove_fg(Id fg)
    {
        fg_dependency.remove_operator(fg);
        fusion_groups.erase(fg);
        return Status{};
    }
    Status remove_fg_tensor(Id tensor)
    {
        fg_dependency.remove_tensor(tensor);
        return Status{};
    }
};

/** Collect the fusion groups of a fused kernel graph (read-only overload)
 *
 * NOTE(review): the definition is not part of this header; the order of the returned groups is presumably a
 *               topological order of graph.fg_dependency - confirm against the implementation.
 *
 * @param[in] graph Fused kernel graph to traverse
 *
 * @return Pointers to the fusion groups of @p graph
 */
std::vector<const ClKernelFusionGroup *> traverse(const ClFusedKernelGraph &graph);
/** Collect the fusion groups of a fused kernel graph (mutable overload)
 *
 * Same as the const overload, but the returned fusion groups can be modified.
 *
 * @param[in] graph Fused kernel graph to traverse
 *
 * @return Pointers to the fusion groups of @p graph
 */
std::vector<ClKernelFusionGroup *> traverse(ClFusedKernelGraph &graph);

/** Create a fused kernel graph from a kernel graph
 *
 * NOTE(review): presumably every kernel starts out in a fusion group of its own - confirm against the implementation.
 *
 * @param[in] kernel_graph Kernel graph to build the fused kernel graph from
 *
 * @return A status together with the created fused kernel graph
 */
std::pair<Status, ClFusedKernelGraph> init_fusion_graph(const ClKernelGraph &kernel_graph);

/** Fuse the fusion groups of a fused kernel graph in place
 *
 * NOTE(review): which groups get fused, and in which order, is decided by the implementation, which is not visible here.
 *
 * @param[in,out] fused_kernel_graph Fused kernel graph to operate on
 *
 * @return Status of the operation
 */
Status fuse(ClFusedKernelGraph &fused_kernel_graph);

/** Generate the store part of a fusion group into a kernel blueprint
 *
 * NOTE(review): semantics inferred from the name and signature only - confirm against the implementation.
 *
 * @param[in,out] bp                 Kernel blueprint to add to
 * @param[in]     fused_kernel_graph Fused kernel graph that @p fg belongs to
 * @param[in]     fg                 Fusion group to generate the store for
 *
 * @return Status of the operation
 */
Status generate_store(ClKernelBlueprint &bp, const ClFusedKernelGraph &fused_kernel_graph, const ClKernelFusionGroup &fg);

/** Generate a workload from a fused kernel graph
 *
 * NOTE(review): semantics inferred from the name and signature only - confirm against the implementation.
 *
 * @param[in,out] workload           Workload to populate
 * @param[in]     ctx                Workload context
 * @param[in]     fused_kernel_graph Fused kernel graph to generate the workload from
 *
 * @return Status of the operation
 */
Status generate(ClWorkload &workload, const ClWorkloadContext &ctx, const ClFusedKernelGraph &fused_kernel_graph);

} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLFUSEDKERNELGRAPH_H