1 files changed, 453 insertions, 0 deletions
diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.h b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.h
new file mode 100644
index 0000000000..4bd3cd9d8b
--- /dev/null
+++ b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.h
@@ -0,0 +1,453 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
+#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLFUSEDKERNELGRAPH_H
+#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLFUSEDKERNELGRAPH_H
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/experimental/DependencyGraph.h"
+#include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h"
+#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h"
+#include "support/DeepCopy.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+struct ClKernelFusionGroup;
+
+/** A const view of a subgraph of the @ref ClKernelGraph to be fused together
+ *
+ */
+struct ClKernelFusionGroup
+{
+public:
+    using Id = DependencyGraph::Id;
+
+    ClKernelFusionGroup() = default;
+    ClKernelFusionGroup(Id id)
+        : id{ id }, graph{}, fused_kernels{}, tensors{}
+    {
+    }
+    ~ClKernelFusionGroup() = default;
+
+    void set_id(Id i)
+    {
+        id = i;
+    }
+
+    Id add_fused_kernel(const ClKernel *kernel)
+    {
+        /// PRE: Acyclicity ensured by DependencyGraph
+        /// PRE: Connectedness ensured by DependencyGraph
+        /// PRE: Single-rootedness ensured by User
+        std::vector<Id> src_tensors;
+        for(const auto t : kernel->tensors().get_const_src_tensors())
+        {
+            auto id = graph.add_tensor(t->id);
+            if(tensors.find(id) == tensors.end())
+            {
+                tensors[id] = t;
+            }
+            src_tensors.push_back(id);
+        }
+        std::vector<Id> dst_tensors;
+        for(const auto t : kernel->tensors().get_const_dst_tensors())
+        {
+            auto id = graph.add_tensor(t->id);
+            if(tensors.find(id) == tensors.end())
+            {
+                tensors[id] = t;
+            }
+            dst_tensors.push_back(id);
+        }
+        auto id                  = graph.add_operator(src_tensors, dst_tensors);
+        fused_kernels[id.second] = kernel;
+        return id.second;
+    }
+
+    const ClKernel *get_root_kernel() const
+    {
+        auto root_kernels = graph.get_root_ops();
+        ARM_COMPUTE_ERROR_ON(root_kernels.size() != 1);
+        return fused_kernels.at(root_kernels.at(0));
+    }
+
+    std::vector<const ClKernelTensor *> get_src_tensors() const
+    {
+        std::vector<const ClKernelTensor *> src_tensors;
+        for(auto tensor_id : graph.src_tensors())
+        {
+            src_tensors.push_back(tensors.at(tensor_id));
+        }
+        return src_tensors;
+    }
+
+    std::vector<const ClKernelTensor *> get_dst_tensors() const
+    {
+        std::vector<const ClKernelTensor *> dst_tensors;
+        for(auto tensor_id : graph.dst_tensors())
+        {
+            dst_tensors.push_back(tensors.at(tensor_id));
+        }
+        return dst_tensors;
+    }
+
+    friend bool operator==(const ClKernelFusionGroup &fg0, const ClKernelFusionGroup &fg1)
+    {
+        return fg0.id == fg1.id && fg0.graph == fg1.graph && fg0.fused_kernels == fg1.fused_kernels && fg0.tensors == fg1.tensors;
+    }
+
+    Id              id{};
+    DependencyGraph graph{}; // A subgraph of the original ClKernelGraph
+    std::map<Id, const ClKernel *>       fused_kernels{};
+    std::map<Id, const ClKernelTensor *> tensors{};
+};
+
+std::vector<const ClKernel *> traverse(const ClKernelFusionGroup &group);
+
+struct ClFusedKernelGraph
+{
+public:
+    using Id = DependencyGraph::Id;
+
+    using KernelFusionGroupMap = std::map<Id, utils::memory::deep_unique_ptr<ClKernelFusionGroup>>;
+
+    ClFusedKernelGraph()                                = default;
+    ~ClFusedKernelGraph()                               = default;
+    ClFusedKernelGraph(const ClFusedKernelGraph &graph) = default;
+    ClFusedKernelGraph &operator=(const ClFusedKernelGraph &graph) = default;
+    ClFusedKernelGraph(ClFusedKernelGraph &&graph)                 = default;
+    ClFusedKernelGraph &operator=(ClFusedKernelGraph &&graph) = default;
+
+    friend bool operator==(const ClFusedKernelGraph &graph0, const ClFusedKernelGraph &graph1)
+    {
+        /// NOTE: fg_dependency may change based on the order of fusion, and thus is omitted in the comparison.
+        ///       The fusion groups can already guarantee the equivalence of fusion
+        ///       In the future we may want to enforce a stronger equivalence by implementing topological comparison between @ref DependencyGraph s
+        return graph0.original_graph == graph1.original_graph && graph0.fusion_groups == graph1.fusion_groups;
+    }
+
+    Id add_fusion_group(const std::vector<const ClKernel *> &fused_kernels)
+    {
+        auto fg = utils::memory::make_deep_unique<ClKernelFusionGroup, ClKernelFusionGroup>();
+        for(const auto k : fused_kernels)
+        {
+            fg->add_fused_kernel(k);
+        }
+        const auto      src_tensors = fg->get_src_tensors();
+        const auto      dst_tensors = fg->get_dst_tensors();
+        std::vector<Id> inputs{};
+        std::transform(std::begin(src_tensors), std::end(src_tensors), std::back_inserter(inputs), [this](auto kernel)
+        {
+            return fg_dependency.add_tensor(kernel->id);
+        });
+        std::vector<Id> outputs{};
+        std::transform(std::begin(dst_tensors), std::end(dst_tensors), std::back_inserter(outputs), [this](auto kernel)
+        {
+            return fg_dependency.add_tensor(kernel->id);
+        });
+        const auto id = fg_dependency.add_operator(inputs, outputs);
+        fg->set_id(id.second);
+        fusion_groups[id.second] = std::move(fg);
+        return id.second;
+    }
+
+    Status fuse(ClKernelFusionGroup &fg0, ClKernelFusionGroup &fg1)
+    {
+        /// PRE: Already checked by can_fuse, and thus all the INVs and ASSUMPTIONS still hold
+        ClKernelFusionGroup *fg_src{};
+        ClKernelFusionGroup *fg_dst{};
+        // Find fg_src (parent / root) and fg_dst (child / non-root)
+        if(is_in(fg1.id, fg_dependency.dst_ops(fg0.id)))
+        {
+            fg_src = &fg0;
+            fg_dst = &fg1;
+        }
+        else if(is_in(fg0.id, fg_dependency.dst_ops(fg1.id)))
+        {
+            fg_src = &fg1;
+            fg_dst = &fg0;
+        }
+        else
+        {
+            return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: Not directly connected fusion groups cannot be fused together" };
+        }
+
+        for(const auto &t : fg_dependency.src_tensors(fg_dst->id))
+        {
+            if(!is_in(t, fg_dependency.dst_tensors(fg_src->id)))
+            {
+                // Link any incoming tensors of fg_dst, that ARE NOT in between fg_src and fg_dst, to fg_src
+
+                // Before:
+                // fg_src
+                // |
+                // ..          t1
+                // |           |
+                // -> fg_dst <-
+                //
+                // After:
+                // fg_src <---t1
+                //
+                const auto st = link_src_tensors(fg_src->id, { t });
+                if(!bool(st))
+                {
+                    return st;
+                }
+            }
+            else
+            {
+                const auto dst_fgs = fg_dependency.dst_ops_from_tensor(t);
+                if(dst_fgs.size() == 1U && dst_fgs.at(0) == fg_dst->id)
+                {
+                    // Remove any incoming tensors of fg_dst, that ARE in between fg_src and fg_dst
+                    // AND that are not connected to any other outgoing fgs (Note that they cannot connect to any other incoming fgs as all tensors can have at most 1 incoming fg (ASSUMPTION 3))
+
+                    // Before:
+                    // fg_src
+                    // |
+                    // t0
+                    // |
+                    // -> fg_dst
+                    //
+                    // After:
+                    // fg_src
+                    //
+                    const auto st = remove_fg_tensor(t);
+                    if(!bool(st))
+                    {
+                        return st;
+                    }
+                }
+                else
+                {
+                    // If the tensors ARE in between fg_src and fg_dst
+                    // BUT have any other outgoing fgs than fg_dst, then we leave it as a dst tensor to the fused fg_src
+
+                    // Before:
+                    // fg_src
+                    // |
+                    // t0
+                    // |
+                    // |-----------
+                    // |          |
+                    // -> fg_dst  -> fg_other
+                    //
+                    // After:
+                    // fg_src
+                    // |
+                    // t0
+                    // |
+                    // -> fg_other
+                    //
+
+                    // Note that this may seem like a case we shouldn't fuse. But actually all it means is that t0 is an
+                    // intermediate tensor between the fused fg_src and fg_dst, but only that we also STORE it to memory
+                    // so that any unfused fg's (fg_other in this case) can read it.
+                    // So all this means that we not only can STORE the tensors at the "end" of a fusion group,
+                    // but also any other tensors that are not source tensors. And all tensors that are STORED (exported),
+                    // can be termed "dst tensors" to a fusion group
+                    void();
+                }
+            }
+        }
+
+        for(const auto &t : fg_dependency.dst_tensors(fg_dst->id))
+        {
+            // Link any outgoing tensors of fg_dst to fg_src
+
+            // Before:
+            // fg_src
+            // |
+            // ..
+            // |
+            // -> fg_dst
+            //    |
+            //    |--------
+            //    |       |
+            //    |-> t0  |-> t1
+            //
+            // After:
+            // fg_src
+            // |
+            // |--------
+            // |       |
+            // |-> t0  |-> t1
+            //
+            const auto st = link_dst_tensors(fg_src->id, { t });
+            if(!bool(st))
+            {
+                return st;
+            }
+        }
+
+        // Merge fg_dst's graph into fg_src's graph
+        for(const auto kernel : traverse(*fg_dst))
+        {
+            fg_src->add_fused_kernel(kernel);
+        }
+
+        const auto st = remove_fg(fg_dst->id);
+        return st;
+    }
+    Status can_fuse(const ClKernelFusionGroup &fg0, const ClKernelFusionGroup &fg1) const
+    {
+        /// ASSUMPTION0: All tensors have 0 or 1 incoming kernel
+        /// ASSUMPTION1: All kernels have exactly 1 dst tensor (Temporary, can be lifted once we start supporting multi-dst kernels)
+        ///              Note that this does not apply to fusion groups
+        /// ASSUMPTION2: Simple kernels' tile infos can be overriden (share with) that of the root kernel's
+        /// ASSUMPTION3: Extension of ASSUMPTION0: All tensors have 0 or 1 incoming fusion group
+        /// INV0: All Fusion groups have a single root
+        /// INV1: All Fusion groups have no cycles or loops within themselves <- guaranteed by the underlying ClKernelGraph having no cycles or loops; enforced by DependencyGraph
+        /// INV2: The ClKernelFusionGroup itself has no cycles or loops <- enforced by DependencyGraph
+        /// INV3: All non-roots are Simple kernels
+        /// INV4: All non roots' dst tensors have the same shape as that of the root kernel
+        /// INV5: All kernels within a fusion group have the same UnitWorkloadStage
+        const ClKernelFusionGroup *fg_src {};
+        const ClKernelFusionGroup *fg_dst{};
+
+        // Check 0: Ensure fg0 and fg1 are "directly connected": one of them is a direct parent of the other
+        // This guarantess INV0
+        // This also finds fg_src (parent / root) and fg_dst (child / non-root)
+        if(is_in(fg1.id, fg_dependency.dst_ops(fg0.id)))
+        {
+            fg_src = &fg0;
+            fg_dst = &fg1;
+        }
+        else if(is_in(fg0.id, fg_dependency.dst_ops(fg1.id)))
+        {
+            fg_src = &fg1;
+            fg_dst = &fg0;
+        }
+        else
+        {
+            return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: Not directly connected fusion groups cannot be fused together" };
+        }
+
+        // Find unconnected tensors between fg_src and fg_dst
+        std::vector<Id> unconnected_tensors{};
+        for(const auto &t : fg_dependency.dst_tensors(fg_src->id))
+        {
+            if(!is_in(t, fg_dependency.src_tensors(fg_dst->id)))
+            {
+                unconnected_tensors.push_back(t);
+            }
+        }
+
+        // Check 1: Any unconnected tensor cannot be an ancestor of fg_dst
+        // This guarantees INV2: That is, the fused graph does not have any cycles or loops between different fusion groups
+        for(const auto &t : unconnected_tensors)
+        {
+            if(fg_dependency.path_exists_from_tensor_to_op(t, fg_dst->id))
+            {
+                return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: the fusion would result in cycles or loops" };
+            }
+        }
+
+        // Check 2: All non-root fgs are simple. Ensure INV3
+        if(fg_dst->get_root_kernel()->complexity() != Complexity::Simple)
+        {
+            return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: only root kernel can be a complex kernel" };
+        }
+
+        // Check 3: All non roots' dst tensors have the same shape as that of the root kernel. Ensure INV4
+        const auto root_kernel_dst_tensors = fg_dependency.dst_tensors(fg_src->id);
+        ARM_COMPUTE_ERROR_ON(root_kernel_dst_tensors.size() != 1); // (ASSUMPTION 1: All kernels have exactly 1 dst tensor)
+        const auto root_kernel_dst_tensor_info = original_graph->get_tensor(root_kernel_dst_tensors[0])->desc;
+
+        for(const auto &t : fg_dependency.dst_tensors(fg_dst->id))
+        {
+            const auto t_info = original_graph->get_tensor(t)->desc;
+            if(detail::have_different_dimensions(root_kernel_dst_tensor_info->tensor_shape(), t_info->tensor_shape(), 0))
+            {
+                return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: all non roots' dst tensors should have the same shape as that of the root kernel" };
+            }
+        }
+
+        // Check 4: All kernels within a fg have the same UnitWorkloadStage. Ensure INV5
+        if(!(fg_src->get_root_kernel()->config().stage == fg_dst->get_root_kernel()->config().stage))
+        {
+            return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: all kernels within a fusion group should have the same UnitWorkloadStage" };
+        }
+
+        return Status{};
+    }
+
+    const ClKernelGraph *original_graph{};
+    DependencyGraph      fg_dependency{};
+    KernelFusionGroupMap fusion_groups{};
+    // Note: no need to store tensors pointers in the ClFusedKernelGraph, as they are stored in side the individual fusion groups.
+
+private:
+    Status link_src_tensors(Id fg, const std::vector<Id> &src_tensors)
+    {
+        for(auto t : src_tensors)
+        {
+            fg_dependency.link_input(fg, t);
+        }
+        return Status{};
+    }
+    Status link_dst_tensors(Id fg, const std::vector<Id> &dst_tensors)
+    {
+        for(auto t : dst_tensors)
+        {
+            fg_dependency.link_output(fg, t);
+        }
+        return Status{};
+    }
+    Status remove_fg(Id fg)
+    {
+        fg_dependency.remove_operator(fg);
+        fusion_groups.erase(fg);
+        return Status{};
+    }
+    Status remove_fg_tensor(Id tensor)
+    {
+        fg_dependency.remove_tensor(tensor);
+        return Status{};
+    }
+};
+
+std::vector<const ClKernelFusionGroup *> traverse(const ClFusedKernelGraph &graph);
+std::vector<ClKernelFusionGroup *> traverse(ClFusedKernelGraph &graph);
+
+std::pair<Status, ClFusedKernelGraph> init_fusion_graph(const ClKernelGraph &kernel_graph);
+
+Status fuse(ClFusedKernelGraph &fused_kernel_graph);
+
+Status generate_store(ClKernelBlueprint &bp, const ClFusedKernelGraph &fused_kernel_graph, const ClKernelFusionGroup &fg);
+
+Status generate(ClWorkload &workload, const ClWorkloadContext &ctx, const ClFusedKernelGraph &fused_kernel_graph);
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLFUSEDKERNELGRAPH_H
+\ No newline at end of file