author:    SiCong Li <sicong.li@arm.com>  2022-01-28 18:24:39 +0000
committer: SiCong Li <sicong.li@arm.com>  2022-05-06 15:01:45 +0000
commit:    b63b1196adea8b07dd8db77c2492a212650deba0 (patch)
tree:      b264035197873f56c69784bec68cad7041b5d423 /src/core/experimental/dynamic_fusion/WorkloadImpl
parent:    3bb72b69566f18ad5c9446d318d2fc2b5f6dba42 (diff)
download:  ComputeLibrary-b63b1196adea8b07dd8db77c2492a212650deba0.tar.gz
Integrate Dynamic Fusion patches
* Add public interfaces:
  * OperatorGraph: describes a workload that may contain fused kernels
  * IWorkload: generic interface for workloads built from an OperatorGraph
  * ClWorkload: OpenCL workload built from an OperatorGraph
  * ClCompositeOperator: runtime async operator to execute a ClWorkload
  * DependencyGraph (will likely be deprecated in later iterations)
* Add example
  * cl_fused_conv2d_elementwise_add.cpp to explain how to use the new interfaces
* Add internal translation layer
* Refactor ClKernelBuildingAPI
* Remove non-tile-based gemm native kernel component
* Minor interface changes
* Add integration tests
Resolves COMPMID-5161
Signed-off-by: SiCong Li <sicong.li@arm.com>
Change-Id: Ib987ed79289ab0bcbd3130d54f5793408d9f1240
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7510
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/core/experimental/dynamic_fusion/WorkloadImpl')
10 files changed, 2619 insertions, 0 deletions
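
For orientation, below is a minimal sketch of how the new public interfaces listed in the commit message are meant to fit together, in the spirit of the cl_fused_conv2d_elementwise_add.cpp example added by this patch. Only dynamic_fusion::build() (defined in ClWorkload.cpp further down) appears in this diff; the OperatorGraph builder helpers (add_tensor, add_op_conv2d, add_op_elementwise_add) and the ClCompositeOperator usage are assumptions inferred from the commit message, not definitive signatures.

```cpp
// Sketch only: describe a conv2d + elementwise-add workload as one OperatorGraph,
// then compile it into a fused ClWorkload. Builder helper names are assumptions.
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/experimental/ClWorkload.h"
#include "arm_compute/core/experimental/OperatorGraph.h"

using namespace arm_compute;
using namespace arm_compute::experimental::dynamic_fusion;

Status build_fused_conv2d_add(ClWorkload &workload,
                              ITensorInfo &input, ITensorInfo &weight, ITensorInfo &bias,
                              ITensorInfo &addend, ITensorInfo &conv_out, ITensorInfo &dst)
{
    // 1. Describe the whole (fusable) workload as an OperatorGraph.
    OperatorGraph op_graph;
    const auto t_input    = add_tensor(op_graph, input);    // assumed builder helpers
    const auto t_weight   = add_tensor(op_graph, weight);
    const auto t_bias     = add_tensor(op_graph, bias);
    const auto t_addend   = add_tensor(op_graph, addend);
    const auto t_conv_out = add_tensor(op_graph, conv_out); // intermediate tensor
    const auto t_dst      = add_tensor(op_graph, dst);

    add_op_conv2d(op_graph, Conv2dDescriptor{}, t_input, t_weight, t_bias, t_conv_out);
    add_op_elementwise_add(op_graph, AddDescriptor{}, t_conv_out, t_addend, t_dst);

    // 2. OperatorGraph -> ClKernelGraph -> ClFusedKernelGraph -> ClWorkload.
    //    build() is added in ClWorkload.cpp in this patch; ClWorkloadContext carries
    //    e.g. the GpuInfo used for code generation.
    ClWorkloadContext ctx{};
    const Status st = build(workload, op_graph, ctx);

    // 3. At runtime, a ClCompositeOperator would be configured with the resulting
    //    ClWorkload and executed asynchronously (per the commit message; its exact
    //    method names are not shown in this diff).
    return st;
}
```

Inside build(), the patch validates the OperatorGraph, translates it into a ClKernelGraph, initialises and fuses a ClFusedKernelGraph, and finally generates the unit workloads, as shown in ClWorkload.cpp below.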
diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.cpp b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.cpp new file mode 100644 index 0000000000..7e9f6b870a --- /dev/null +++ b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.cpp @@ -0,0 +1,233 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ +#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +namespace +{ +std::vector<std::pair<ClKernelFusionGroup *, ClKernelFusionGroup *>> get_combinations(const std::vector<ClKernelFusionGroup *> &sorted_fgs) +{ + ARM_COMPUTE_ERROR_ON(sorted_fgs.size() <= 1); + std::vector<std::pair<ClKernelFusionGroup *, ClKernelFusionGroup *>> combo; + for(size_t i = 0; i < sorted_fgs.size() - 1; ++i) + { + for(size_t j = i + 1; j < sorted_fgs.size(); ++j) + { + combo.push_back(std::make_pair(sorted_fgs.at(i), sorted_fgs.at(j))); + } + } + return combo; +} +} // namespace +std::vector<const ClKernel *> traverse(const ClKernelFusionGroup &group) +{ + std::vector<const ClKernel *> kernels; + const auto sorted = group.graph.topological_sort(); + for(const auto &pack : sorted.second) + { + kernels.push_back(group.fused_kernels.at(pack.op)); + } + return kernels; +} + +std::vector<const ClKernelFusionGroup *> traverse(const ClFusedKernelGraph &graph) +{ + std::vector<const ClKernelFusionGroup *> kernels; + const auto sorted = graph.fg_dependency.topological_sort(); + for(const auto &pack : sorted.second) + { + kernels.push_back(graph.fusion_groups.at(pack.op).get()); + } + return kernels; +} + +std::vector<ClKernelFusionGroup *> traverse(ClFusedKernelGraph &graph) +{ + std::vector<ClKernelFusionGroup *> kernels; + const auto sorted = graph.fg_dependency.topological_sort(); + for(const auto &pack : sorted.second) + { + kernels.push_back(graph.fusion_groups.at(pack.op).get()); + } + return kernels; +} + +std::pair<Status, ClFusedKernelGraph> init_fusion_graph(const ClKernelGraph &kernel_graph) +{ + ClFusedKernelGraph fused_kernel_graph{}; + fused_kernel_graph.original_graph = &kernel_graph; // Create a copy of the original kernel graph + 
fused_kernel_graph.fg_dependency = DependencyGraph(); + // Initialize all fusion groups + for(const auto &kernel : traverse(kernel_graph)) + { + fused_kernel_graph.add_fusion_group({ kernel }); + } + return { Status{}, fused_kernel_graph }; +} + +Status fuse(ClFusedKernelGraph &fused_kernel_graph) +{ + // A naive fusion algorithm that's guaranteed to find optimal pattern if there are no branches + // If there are branches, the algorithm cannot guanrantee optimality as it doesn't perform any searches + + bool fusion_found = false; + do + { + fusion_found = false; + const auto sorted_fgs = traverse(fused_kernel_graph); + if(sorted_fgs.size() <= 1) + { + // Only one or zero fusion group, thus no need to perform fusion + return Status{}; + } + auto fgs_combo = get_combinations(sorted_fgs); + for(auto fgs : fgs_combo) + { + auto fg0 = fgs.first; + auto fg1 = fgs.second; + const auto st = fused_kernel_graph.can_fuse(*fg0, *fg1); + if(bool(st)) + { + const auto st = fused_kernel_graph.fuse(*fg0, *fg1); + if(!bool(st)) + { + return st; + } + fusion_found = true; + break; + } + } + } + while(fusion_found); + return Status{}; +} +Status generate_store(ClKernelBlueprint &bp, const ClFusedKernelGraph &fused_kernel_graph, const ClKernelFusionGroup &fg) +{ + Status st{}; + for(const auto &dst_t_id : fused_kernel_graph.fg_dependency.dst_tensors(fg.id)) + { + const auto dst_t = fused_kernel_graph.original_graph->get_tensor(dst_t_id); + + /// NOTE: dst tensor must have already been added to the blueprint at this point + ArgumentID dst_id; + st = add_tensor(bp, dst_t->desc, dst_id, dst_t->id); + if(!bool(st)) + { + return st; + } + /// NOTE: the extra dst tensor is needed as the store kcomp requires 2 tensors. But this is irrelevant to the fused kernel graph + /// since both tensors share the exact same info and kernel arg descriptor + ArgumentID dst_dst_id; + st = add_tensor(bp, dst_t->desc, dst_dst_id); + if(!bool(st)) + { + return st; + } + /// NOTE: Update the merge point map to link dst_dst_id with dst_t->id instead. 
+ /// This is required because the get_arguments() returned by the blueprint returns the dst tensor added by the store component + st = update_merge_point(bp, dst_dst_id, dst_t->id); + if(!bool(st)) + { + return st; + } + st = add_kcomp_store(bp, fg.get_root_kernel()->config().store_type, dst_id, dst_dst_id); + if(!bool(st)) + { + return st; + } + } + return st; +} + +Status generate(ClWorkload &workload, const ClWorkloadContext &ctx, const ClFusedKernelGraph &fused_kernel_graph) +{ + workload.context = ctx; + for(const auto &fg : traverse(fused_kernel_graph)) + { + ClKernelBlueprint bp{}; + for(const auto &kernel : traverse(*fg)) + { + const auto st = kernel->generate(bp); + if(!bool(st)) + { + return st; + } + } + auto st = set_tile_info(bp, fg->get_root_kernel()->config().tile_desc); + if(!bool(st)) + { + return st; + } + st = generate_store(bp, fused_kernel_graph, *fg); + if(!bool(st)) + { + return st; + } + + ClKernelCode code{}; + st = build(code, ClCodeBuilderContext{ ctx.gpu_info }, bp); + if(!bool(st)) + { + return st; + } + const auto bp_graph = get_dependency_graph(bp); + + // Get tensor info + std::vector<Id> workload_src_tensors{}; + for(const auto &src_t_id : fused_kernel_graph.fg_dependency.src_tensors(fg->id)) + { + const auto src_t = fused_kernel_graph.original_graph->get_tensor(src_t_id); + // Get corresponding kernel arg descriptor + const auto arg_desc = code.arguments.at(bp_graph.get_merge_points().at(src_t->id)); + const auto kernel_t_id = workload.add_workload_tensor(src_t->desc, src_t->memory_type, src_t->memory_info, arg_desc, src_t->id); + workload_src_tensors.push_back(kernel_t_id); + } + std::vector<Id> workload_dst_tensors{}; + for(const auto &dst_t_id : fused_kernel_graph.fg_dependency.dst_tensors(fg->id)) + { + const auto dst_t = fused_kernel_graph.original_graph->get_tensor(dst_t_id); + // Get corresponding kernel arg descriptor + const auto arg_desc = code.arguments.at(bp_graph.get_merge_points().at(dst_t->id)); + const auto kernel_t_id = workload.add_workload_tensor(dst_t->desc, dst_t->memory_type, dst_t->memory_info, arg_desc, dst_t->id); + workload_dst_tensors.push_back(kernel_t_id); + } + + workload.add_unit_workload(fg->get_root_kernel()->config().stage, code, workload_src_tensors, workload_dst_tensors); + } + + return Status{}; +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute
\ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.h b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.h new file mode 100644 index 0000000000..4bd3cd9d8b --- /dev/null +++ b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.h @@ -0,0 +1,453 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ +#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLFUSEDKERNELGRAPH_H +#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLFUSEDKERNELGRAPH_H +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/experimental/DependencyGraph.h" +#include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h" +#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h" +#include "support/DeepCopy.h" + +#include <vector> + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +struct ClKernelFusionGroup; + +/** A const view of a subgraph of the @ref ClKernelGraph to be fused together + * + */ +struct ClKernelFusionGroup +{ +public: + using Id = DependencyGraph::Id; + + ClKernelFusionGroup() = default; + ClKernelFusionGroup(Id id) + : id{ id }, graph{}, fused_kernels{}, tensors{} + { + } + ~ClKernelFusionGroup() = default; + + void set_id(Id i) + { + id = i; + } + + Id add_fused_kernel(const ClKernel *kernel) + { + /// PRE: Acyclicity ensured by DependencyGraph + /// PRE: Connectedness ensured by DependencyGraph + /// PRE: Single-rootedness ensured by User + std::vector<Id> src_tensors; + for(const auto t : kernel->tensors().get_const_src_tensors()) + { + auto id = graph.add_tensor(t->id); + if(tensors.find(id) == tensors.end()) + { + tensors[id] = t; + } + src_tensors.push_back(id); + } + std::vector<Id> dst_tensors; + for(const auto t : kernel->tensors().get_const_dst_tensors()) + { + auto id = graph.add_tensor(t->id); + if(tensors.find(id) == tensors.end()) + { + tensors[id] = t; + } + dst_tensors.push_back(id); + } + auto id = graph.add_operator(src_tensors, dst_tensors); + fused_kernels[id.second] = kernel; + return id.second; + } + + const ClKernel *get_root_kernel() const + { + auto root_kernels = graph.get_root_ops(); + 
ARM_COMPUTE_ERROR_ON(root_kernels.size() != 1); + return fused_kernels.at(root_kernels.at(0)); + } + + std::vector<const ClKernelTensor *> get_src_tensors() const + { + std::vector<const ClKernelTensor *> src_tensors; + for(auto tensor_id : graph.src_tensors()) + { + src_tensors.push_back(tensors.at(tensor_id)); + } + return src_tensors; + } + + std::vector<const ClKernelTensor *> get_dst_tensors() const + { + std::vector<const ClKernelTensor *> dst_tensors; + for(auto tensor_id : graph.dst_tensors()) + { + dst_tensors.push_back(tensors.at(tensor_id)); + } + return dst_tensors; + } + + friend bool operator==(const ClKernelFusionGroup &fg0, const ClKernelFusionGroup &fg1) + { + return fg0.id == fg1.id && fg0.graph == fg1.graph && fg0.fused_kernels == fg1.fused_kernels && fg0.tensors == fg1.tensors; + } + + Id id{}; + DependencyGraph graph{}; // A subgraph of the original ClKernelGraph + std::map<Id, const ClKernel *> fused_kernels{}; + std::map<Id, const ClKernelTensor *> tensors{}; +}; + +std::vector<const ClKernel *> traverse(const ClKernelFusionGroup &group); + +struct ClFusedKernelGraph +{ +public: + using Id = DependencyGraph::Id; + + using KernelFusionGroupMap = std::map<Id, utils::memory::deep_unique_ptr<ClKernelFusionGroup>>; + + ClFusedKernelGraph() = default; + ~ClFusedKernelGraph() = default; + ClFusedKernelGraph(const ClFusedKernelGraph &graph) = default; + ClFusedKernelGraph &operator=(const ClFusedKernelGraph &graph) = default; + ClFusedKernelGraph(ClFusedKernelGraph &&graph) = default; + ClFusedKernelGraph &operator=(ClFusedKernelGraph &&graph) = default; + + friend bool operator==(const ClFusedKernelGraph &graph0, const ClFusedKernelGraph &graph1) + { + /// NOTE: fg_dependency may change based on the order of fusion, and thus is omitted in the comparison. 
+ /// The fusion groups can already guarantee the equivalence of fusion + /// In the future we may want to enforce a stronger equivalence by implementing topological comparison between @ref DependencyGraph s + return graph0.original_graph == graph1.original_graph && graph0.fusion_groups == graph1.fusion_groups; + } + + Id add_fusion_group(const std::vector<const ClKernel *> &fused_kernels) + { + auto fg = utils::memory::make_deep_unique<ClKernelFusionGroup, ClKernelFusionGroup>(); + for(const auto k : fused_kernels) + { + fg->add_fused_kernel(k); + } + const auto src_tensors = fg->get_src_tensors(); + const auto dst_tensors = fg->get_dst_tensors(); + std::vector<Id> inputs{}; + std::transform(std::begin(src_tensors), std::end(src_tensors), std::back_inserter(inputs), [this](auto kernel) + { + return fg_dependency.add_tensor(kernel->id); + }); + std::vector<Id> outputs{}; + std::transform(std::begin(dst_tensors), std::end(dst_tensors), std::back_inserter(outputs), [this](auto kernel) + { + return fg_dependency.add_tensor(kernel->id); + }); + const auto id = fg_dependency.add_operator(inputs, outputs); + fg->set_id(id.second); + fusion_groups[id.second] = std::move(fg); + return id.second; + } + + Status fuse(ClKernelFusionGroup &fg0, ClKernelFusionGroup &fg1) + { + /// PRE: Already checked by can_fuse, and thus all the INVs and ASSUMPTIONS still hold + ClKernelFusionGroup *fg_src{}; + ClKernelFusionGroup *fg_dst{}; + // Find fg_src (parent / root) and fg_dst (child / non-root) + if(is_in(fg1.id, fg_dependency.dst_ops(fg0.id))) + { + fg_src = &fg0; + fg_dst = &fg1; + } + else if(is_in(fg0.id, fg_dependency.dst_ops(fg1.id))) + { + fg_src = &fg1; + fg_dst = &fg0; + } + else + { + return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: Not directly connected fusion groups cannot be fused together" }; + } + + for(const auto &t : fg_dependency.src_tensors(fg_dst->id)) + { + if(!is_in(t, fg_dependency.dst_tensors(fg_src->id))) + { + // Link any incoming tensors of fg_dst, that ARE NOT in between fg_src and fg_dst, to fg_src + + // Before: + // fg_src + // | + // .. t1 + // | | + // -> fg_dst <- + // + // After: + // fg_src <---t1 + // + const auto st = link_src_tensors(fg_src->id, { t }); + if(!bool(st)) + { + return st; + } + } + else + { + const auto dst_fgs = fg_dependency.dst_ops_from_tensor(t); + if(dst_fgs.size() == 1U && dst_fgs.at(0) == fg_dst->id) + { + // Remove any incoming tensors of fg_dst, that ARE in between fg_src and fg_dst + // AND that are not connected to any other outgoing fgs (Note that they cannot connect to any other incoming fgs as all tensors can have at most 1 incoming fg (ASSUMPTION 3)) + + // Before: + // fg_src + // | + // t0 + // | + // -> fg_dst + // + // After: + // fg_src + // + const auto st = remove_fg_tensor(t); + if(!bool(st)) + { + return st; + } + } + else + { + // If the tensors ARE in between fg_src and fg_dst + // BUT have any other outgoing fgs than fg_dst, then we leave it as a dst tensor to the fused fg_src + + // Before: + // fg_src + // | + // t0 + // | + // |----------- + // | | + // -> fg_dst -> fg_other + // + // After: + // fg_src + // | + // t0 + // | + // -> fg_other + // + + // Note that this may seem like a case we shouldn't fuse. But actually all it means is that t0 is an + // intermediate tensor between the fused fg_src and fg_dst, but only that we also STORE it to memory + // so that any unfused fg's (fg_other in this case) can read it. 
+ // So all this means that we not only can STORE the tensors at the "end" of a fusion group, + // but also any other tensors that are not source tensors. And all tensors that are STORED (exported), + // can be termed "dst tensors" to a fusion group + void(); + } + } + } + + for(const auto &t : fg_dependency.dst_tensors(fg_dst->id)) + { + // Link any outgoing tensors of fg_dst to fg_src + + // Before: + // fg_src + // | + // .. + // | + // -> fg_dst + // | + // |-------- + // | | + // |-> t0 |-> t1 + // + // After: + // fg_src + // | + // |-------- + // | | + // |-> t0 |-> t1 + // + const auto st = link_dst_tensors(fg_src->id, { t }); + if(!bool(st)) + { + return st; + } + } + + // Merge fg_dst's graph into fg_src's graph + for(const auto kernel : traverse(*fg_dst)) + { + fg_src->add_fused_kernel(kernel); + } + + const auto st = remove_fg(fg_dst->id); + return st; + } + Status can_fuse(const ClKernelFusionGroup &fg0, const ClKernelFusionGroup &fg1) const + { + /// ASSUMPTION0: All tensors have 0 or 1 incoming kernel + /// ASSUMPTION1: All kernels have exactly 1 dst tensor (Temporary, can be lifted once we start supporting multi-dst kernels) + /// Note that this does not apply to fusion groups + /// ASSUMPTION2: Simple kernels' tile infos can be overriden (share with) that of the root kernel's + /// ASSUMPTION3: Extension of ASSUMPTION0: All tensors have 0 or 1 incoming fusion group + /// INV0: All Fusion groups have a single root + /// INV1: All Fusion groups have no cycles or loops within themselves <- guaranteed by the underlying ClKernelGraph having no cycles or loops; enforced by DependencyGraph + /// INV2: The ClKernelFusionGroup itself has no cycles or loops <- enforced by DependencyGraph + /// INV3: All non-roots are Simple kernels + /// INV4: All non roots' dst tensors have the same shape as that of the root kernel + /// INV5: All kernels within a fusion group have the same UnitWorkloadStage + const ClKernelFusionGroup *fg_src {}; + const ClKernelFusionGroup *fg_dst{}; + + // Check 0: Ensure fg0 and fg1 are "directly connected": one of them is a direct parent of the other + // This guarantess INV0 + // This also finds fg_src (parent / root) and fg_dst (child / non-root) + if(is_in(fg1.id, fg_dependency.dst_ops(fg0.id))) + { + fg_src = &fg0; + fg_dst = &fg1; + } + else if(is_in(fg0.id, fg_dependency.dst_ops(fg1.id))) + { + fg_src = &fg1; + fg_dst = &fg0; + } + else + { + return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: Not directly connected fusion groups cannot be fused together" }; + } + + // Find unconnected tensors between fg_src and fg_dst + std::vector<Id> unconnected_tensors{}; + for(const auto &t : fg_dependency.dst_tensors(fg_src->id)) + { + if(!is_in(t, fg_dependency.src_tensors(fg_dst->id))) + { + unconnected_tensors.push_back(t); + } + } + + // Check 1: Any unconnected tensor cannot be an ancestor of fg_dst + // This guarantees INV2: That is, the fused graph does not have any cycles or loops between different fusion groups + for(const auto &t : unconnected_tensors) + { + if(fg_dependency.path_exists_from_tensor_to_op(t, fg_dst->id)) + { + return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: the fusion would result in cycles or loops" }; + } + } + + // Check 2: All non-root fgs are simple. 
Ensure INV3 + if(fg_dst->get_root_kernel()->complexity() != Complexity::Simple) + { + return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: only root kernel can be a complex kernel" }; + } + + // Check 3: All non roots' dst tensors have the same shape as that of the root kernel. Ensure INV4 + const auto root_kernel_dst_tensors = fg_dependency.dst_tensors(fg_src->id); + ARM_COMPUTE_ERROR_ON(root_kernel_dst_tensors.size() != 1); // (ASSUMPTION 1: All kernels have exactly 1 dst tensor) + const auto root_kernel_dst_tensor_info = original_graph->get_tensor(root_kernel_dst_tensors[0])->desc; + + for(const auto &t : fg_dependency.dst_tensors(fg_dst->id)) + { + const auto t_info = original_graph->get_tensor(t)->desc; + if(detail::have_different_dimensions(root_kernel_dst_tensor_info->tensor_shape(), t_info->tensor_shape(), 0)) + { + return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: all non roots' dst tensors should have the same shape as that of the root kernel" }; + } + } + + // Check 4: All kernels within a fg have the same UnitWorkloadStage. Ensure INV5 + if(!(fg_src->get_root_kernel()->config().stage == fg_dst->get_root_kernel()->config().stage)) + { + return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: all kernels within a fusion group should have the same UnitWorkloadStage" }; + } + + return Status{}; + } + + const ClKernelGraph *original_graph{}; + DependencyGraph fg_dependency{}; + KernelFusionGroupMap fusion_groups{}; + // Note: no need to store tensors pointers in the ClFusedKernelGraph, as they are stored in side the individual fusion groups. + +private: + Status link_src_tensors(Id fg, const std::vector<Id> &src_tensors) + { + for(auto t : src_tensors) + { + fg_dependency.link_input(fg, t); + } + return Status{}; + } + Status link_dst_tensors(Id fg, const std::vector<Id> &dst_tensors) + { + for(auto t : dst_tensors) + { + fg_dependency.link_output(fg, t); + } + return Status{}; + } + Status remove_fg(Id fg) + { + fg_dependency.remove_operator(fg); + fusion_groups.erase(fg); + return Status{}; + } + Status remove_fg_tensor(Id tensor) + { + fg_dependency.remove_tensor(tensor); + return Status{}; + } +}; + +std::vector<const ClKernelFusionGroup *> traverse(const ClFusedKernelGraph &graph); +std::vector<ClKernelFusionGroup *> traverse(ClFusedKernelGraph &graph); + +std::pair<Status, ClFusedKernelGraph> init_fusion_graph(const ClKernelGraph &kernel_graph); + +Status fuse(ClFusedKernelGraph &fused_kernel_graph); + +Status generate_store(ClKernelBlueprint &bp, const ClFusedKernelGraph &fused_kernel_graph, const ClKernelFusionGroup &fg); + +Status generate(ClWorkload &workload, const ClWorkloadContext &ctx, const ClFusedKernelGraph &fused_kernel_graph); + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLFUSEDKERNELGRAPH_H
\ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelDescriptors.h b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelDescriptors.h new file mode 100644 index 0000000000..cdd2b2e552 --- /dev/null +++ b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelDescriptors.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ +#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLKERNELDESCRIPTORS_H +#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLKERNELDESCRIPTORS_H + +#include "arm_compute/core/experimental/OperatorGraph.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +struct ClDirectConv2dKernelDescriptor +{ + friend bool operator==(const ClDirectConv2dKernelDescriptor &desc0, const ClDirectConv2dKernelDescriptor &desc1) + { + return desc0.conv2d == desc1.conv2d; + } + Conv2dDescriptor conv2d{}; +}; + +struct ClEltwiseAddKernelDescriptor +{ + friend bool operator==(const ClEltwiseAddKernelDescriptor &desc0, const ClEltwiseAddKernelDescriptor &desc1) + { + return desc0.add == desc1.add; + } + AddDescriptor add{}; +}; +struct ClActivationKernelDescriptor +{ + friend bool operator==(const ClActivationKernelDescriptor &, const ClActivationKernelDescriptor &) + { + return true; + } +}; + +enum class ClippingStrategy +{ + TOP_LEFT, + TOP_RIGHT, + BOTTOM_LEFT, + BOTTOM_RIGHT, +}; +/** Component: Store */ +struct TileDescriptor +{ + Size2D tile_dims{}; + Size2D boundaries{}; + ClippingStrategy clipping{ ClippingStrategy::TOP_LEFT }; + + TileDescriptor() + { + } + + TileDescriptor(Size2D dims, const Size2D &bound, const ClippingStrategy &clip) + : tile_dims(dims), boundaries(bound), clipping(clip) + { + } + + bool empty() const + { + return (tile_dims.area() == 0) || (boundaries.area() == 0); + } + friend bool operator==(const TileDescriptor &tile0, const TileDescriptor &tile1) + { + return tile0.tile_dims == tile1.tile_dims && tile0.boundaries == tile1.boundaries && tile0.clipping == tile1.clipping; + } +}; +enum class StoreType +{ + VStore, + VStorePartial, + StoreRow, + ConvertStoreRow, + StoreBlock, + ConvertStoreBlock, + StoreRowPartial, + StoreBlockPartial, + StoreBlockBoundaryAware, + 
StoreVectorSelect, + TStoreIndirectWidthSelect +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLKERNELDESCRIPTORS_H
\ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.cpp b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.cpp new file mode 100644 index 0000000000..8aaf0946bb --- /dev/null +++ b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.cpp @@ -0,0 +1,219 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ +#include "arm_compute/core/utils/misc/ShapeCalculator.h" + +#include "src/core/CL/CLValidate.h" +#include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h" +#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h" + +#include "support/Cast.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +Status ClDirectConv2dKernel::generate(ClKernelBlueprint &bp) const +{ + const auto input = _tensors.get_const_tensor(TensorType::ACL_SRC_0); + const auto weight = _tensors.get_const_tensor(TensorType::ACL_SRC_1); + const auto bias = _tensors.get_const_tensor(TensorType::ACL_SRC_2); + const auto dst = _tensors.get_const_tensor(TensorType::ACL_DST_0); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weight, dst); + ArgumentID input_id; + add_tensor(bp, input->desc, input_id, input->id); + ArgumentID weight_id; + add_tensor(bp, weight->desc, weight_id, weight->id); + ArgumentID bias_id = g_arg_placeholder; + if(bias != nullptr) + { + add_tensor(bp, bias->desc, bias_id, bias->id); + } + ArgumentID dst_id; + add_tensor(bp, dst->desc, dst_id, dst->id); + + add_kcomp_direct_conv2d(bp, desc, input_id, weight_id, bias_id, dst_id); + return Status{}; +} +Status ClDirectConv2dKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ClDirectConv2dKernelDescriptor &conv2d_desc) +{ + // 1. 
Check validity + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); + // Matching data type + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + if(biases != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases); + } + + // Matching data layout + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst); + if(biases != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, biases); + } + + // All tensor infos are initialized + ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON(weights->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0); + if(biases != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON(biases->tensor_shape().total_size() == 0); + } + // Device requirements are met + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); + // weights shape is correct + const DataLayout data_layout = src->data_layout(); + const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != src->dimension(channel_idx), "Weights feature map dimension should match the respective src's one"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4, "Weights can be at most 4 dimensional"); + + // dst shape is correct + PadStrideInfo legacy_pad_stride(conv2d_desc.conv2d.stride.x(), conv2d_desc.conv2d.stride.y(), conv2d_desc.conv2d.pad.left, conv2d_desc.conv2d.pad.right, conv2d_desc.conv2d.pad.top, + conv2d_desc.conv2d.pad.bottom, DimensionRoundingType{}); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), + misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, legacy_pad_stride)); + + // biases shape is correct + if(biases != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->dimension(0) != weights->dimension(3), + "Biases size and number of dst feature maps should match"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->num_dimensions() > 1, + "Biases should be one dimensional"); + } + + // 2. Check support level + // Data type + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); + // Data layout + ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC); + + return Status{}; +} + +bool ClDirectConv2dKernel::operator==(const ClKernel &other) const +{ + const auto converted = *utils::cast::polymorphic_downcast<const ClDirectConv2dKernel *>(&other); + return config() == other.config() && tensors() == other.tensors() && desc == converted.desc; +} + +Status ClAddKernel::generate(ClKernelBlueprint &bp) const +{ + const auto lhs = _tensors.get_const_tensor(TensorType::ACL_SRC_0); + const auto rhs = _tensors.get_const_tensor(TensorType::ACL_SRC_1); + const auto dst = _tensors.get_const_tensor(TensorType::ACL_DST_0); + ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); + ArgumentID lhs_id; + add_tensor(bp, lhs->desc, lhs_id, lhs->id); + ArgumentID rhs_id; + add_tensor(bp, rhs->desc, rhs_id, rhs->id); + ArgumentID dst_id; + add_tensor(bp, dst->desc, dst_id, dst->id); + + add_kcomp_eltwise_add(bp, desc, lhs_id, rhs_id, dst_id); + return Status{}; +} + +Status ClAddKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst) +{ + // 1. 
Check validity + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst); + + // Matching data type + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst); + + // Matching data layout + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(lhs, rhs); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(lhs, dst); + + // All tensor infos are initialized + ARM_COMPUTE_RETURN_ERROR_ON(lhs->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON(rhs->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0); + + // Device requirements are met + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(lhs); + + const bool in_place = (lhs == dst) || (rhs == dst); + const bool src0_in_place = in_place && (lhs == dst); + + // dst shape is correct + const TensorShape out_shape = TensorShape::broadcast_shape(lhs->tensor_shape(), rhs->tensor_shape()); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst"); + if(in_place) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, src0_in_place ? lhs->tensor_shape() : rhs->tensor_shape(), 0), + "Wrong shape for dst, cannot do in_place calculation"); + } + + // 2. Check support level + + // Data type + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16); + + // Data layout + ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(lhs, DataLayout::NHWC); + + return Status{}; +} + +bool ClAddKernel::operator==(const ClKernel &other) const +{ + const auto converted = *utils::cast::polymorphic_downcast<const ClAddKernel *>(&other); + return config() == other.config() && tensors() == other.tensors() && desc == converted.desc; +} + +std::vector<const ClKernel *> traverse(const ClKernelGraph &graph) +{ + std::vector<const ClKernel *> kernels; + const auto sorted = graph.graph.topological_sort(); + for(const auto &pack : sorted.second) + { + kernels.push_back(graph.kernels.at(pack.op).get()); + } + return kernels; +} +std::vector<ClKernel *> traverse(ClKernelGraph &graph) +{ + std::vector<ClKernel *> kernels; + const auto sorted = graph.graph.topological_sort(); + for(const auto &pack : sorted.second) + { + kernels.push_back(graph.kernels.at(pack.op).get()); + } + return kernels; +} +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute
\ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h new file mode 100644 index 0000000000..1e14afb266 --- /dev/null +++ b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h @@ -0,0 +1,240 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ +#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLKERNELGRAPH_H +#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLKERNELGRAPH_H + +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/experimental/ClWorkload.h" +#include "arm_compute/core/experimental/DependencyGraph.h" +#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelDescriptors.h" +#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ITensorDescPack.h" +#include "support/DeepCopy.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +struct ClKernelGraph; +class ClKernelBlueprint; + +enum class Complexity +{ + Simple, + Complex +}; + +/** Configurations for ClKernel + * + */ +struct ClKernelConfig +{ + UnitWorkloadStage stage{}; + TileDescriptor tile_desc{}; + StoreType store_type{}; + friend bool operator==(const ClKernelConfig &config0, const ClKernelConfig &config1) + { + return config0.stage == config1.stage && config0.tile_desc == config1.tile_desc && config0.store_type == config1.store_type; + } +}; + +struct ClKernelTensor +{ +public: + using Id = DependencyGraph::Id; + ClKernelTensor() = default; + ClKernelTensor(Id id, ITensorInfo *desc, MemoryType memory_type, const AuxMemoryInfo &memory_info) + : id{ id }, desc{ desc }, memory_type{ memory_type }, memory_info{ memory_info } + { + } + bool operator==(const ClKernelTensor &other) const + { + return desc == other.desc; + } + + Id id{}; + ITensorInfo *desc{}; + MemoryType memory_type{}; + AuxMemoryInfo memory_info{}; +}; + +struct ClKernel +{ +public: + using Id = DependencyGraph::Id; + ClKernel() = default; + virtual ~ClKernel() = default; + ClKernel(const ClKernel &kernel) = default; + ClKernel &operator=(const ClKernel &kernel) = default; + ClKernel(ClKernel &&kernel) = default; + ClKernel 
&operator=(ClKernel &&kernel) = default; + ClKernel(const ClKernelGraph *graph, Id id, const ClKernelConfig &config, const ITensorDescPack<ClKernelTensor> &tensors) + : _graph{ graph }, _id{ id }, _config{ config }, _tensors{ tensors } + { + } + virtual bool operator==(const ClKernel &other) const = 0; + virtual Complexity complexity() const = 0; + virtual Status generate(ClKernelBlueprint &bp) const = 0; + Id id() const + { + return _id; + } + ITensorDescPack<ClKernelTensor> tensors() const + { + return _tensors; + } + ClKernelConfig config() const + { + return _config; + } + +protected: + const ClKernelGraph *_graph {}; + Id _id{}; + ClKernelConfig _config{}; + ITensorDescPack<ClKernelTensor> _tensors{}; +}; + +struct ClDirectConv2dKernel : public ClKernel +{ +public: + Complexity complexity() const override + { + return Complexity::Complex; + } + ClDirectConv2dKernel() = default; + ~ClDirectConv2dKernel() override = default; + ClDirectConv2dKernel(const ClKernelGraph *graph, Id id, const ClKernelConfig config, const ClDirectConv2dKernelDescriptor &desc, const ITensorDescPack<ClKernelTensor> tensors) + : ClKernel{ graph, id, config, tensors }, desc{ desc } + { + } + static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ClDirectConv2dKernelDescriptor &conv2d_desc); + bool operator==(const ClKernel &other) const override; + Status generate(ClKernelBlueprint &bp) const override; + + ClDirectConv2dKernelDescriptor desc{}; +}; + +struct ClAddKernel : public ClKernel +{ +public: + Complexity complexity() const override + { + return Complexity::Simple; + } + ClAddKernel() = default; + ~ClAddKernel() override = default; + ClAddKernel(const ClKernelGraph *graph, Id id, const ClKernelConfig &config, const ClEltwiseAddKernelDescriptor &desc, const ITensorDescPack<ClKernelTensor> tensors) + : ClKernel{ graph, id, config, tensors }, desc{ desc } + { + } + static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst); + bool operator==(const ClKernel &other) const override; + Status generate(ClKernelBlueprint &bp) const override; + + ClEltwiseAddKernelDescriptor desc{}; +}; + +struct ClKernelGraph +{ +public: + using Id = DependencyGraph::Id; + using KernelMap = std::map<Id, utils::memory::deep_unique_ptr<ClKernel>>; + using KernelTensorMap = std::map<Id, utils::memory::deep_unique_ptr<ClKernelTensor>>; + + ClKernelGraph() = default; + ~ClKernelGraph() = default; + + friend bool operator==(const ClKernelGraph &graph0, const ClKernelGraph &graph1) + { + return graph0.graph == graph1.graph && graph0.kernels == graph1.kernels && graph0.tensors == graph1.tensors; + } + + Status add_kernel_tensor(ITensorInfo *desc, MemoryType memory_type, const AuxMemoryInfo &memory_info, Id &tensor_id, Id merge_point = DependencyGraph::empty_id()) + { + tensor_id = graph.add_tensor(merge_point); + if(tensors.find(tensor_id) == tensors.end()) + { + tensors[tensor_id] = utils::memory::make_deep_unique<ClKernelTensor, ClKernelTensor>(tensor_id, desc, memory_type, memory_info); + } + return Status{}; + } + + template <typename ContentT, typename KernelDescT> + Status add_kernel(const ClKernelConfig &config, const KernelDescT &desc, const ITensorDescPack<ClKernelTensor> &tensors, Id &kernel_id) + { + const auto src_tensors = tensors.get_const_src_tensors(); + const auto dst_tensors = tensors.get_const_dst_tensors(); + std::vector<Id> src_tensor_ids{}; + std::vector<Id> dst_tensor_ids{}; + for(const auto &t : 
src_tensors) + { + src_tensor_ids.push_back(t->id); + } + for(const auto &t : dst_tensors) + { + dst_tensor_ids.push_back(t->id); + } + kernel_id = graph.add_operator(src_tensor_ids, dst_tensor_ids).second; + auto k = utils::memory::make_deep_unique<ClKernel, ContentT>(this, kernel_id, config, desc, tensors); + kernels[kernel_id] = std::move(k); + return Status{}; + } + + ClKernel *get_kernel(Id id) + { + return kernels.at(id).get(); + } + const ClKernel *get_kernel(Id id) const + { + return kernels.at(id).get(); + } + + ClKernelTensor *get_tensor(Id id) + { + return tensors.at(id).get(); + } + const ClKernelTensor *get_tensor(Id id) const + { + return tensors.at(id).get(); + } + + DependencyGraph graph{}; + KernelMap kernels{}; + KernelTensorMap tensors{}; +}; +using Id = DependencyGraph::Id; + +std::vector<const ClKernel *> traverse(const ClKernelGraph &graph); +std::vector<ClKernel *> traverse(ClKernelGraph &graph); + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLKERNELGRAPH_H
\ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClWorkload.cpp b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClWorkload.cpp new file mode 100644 index 0000000000..e97cf88b79 --- /dev/null +++ b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClWorkload.cpp @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ +#include "arm_compute/core/experimental/ClWorkload.h" +#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.h" +#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h" +#include "src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +Status build(ClWorkload &workload, const OperatorGraph &op_graph, const ClWorkloadContext &ctx) +{ + workload.context = ctx; + ClKernelGraph kernel_graph; + workload.status = validate(op_graph); + ARM_COMPUTE_RETURN_ON_ERROR(workload.status); + workload.status = translate(kernel_graph, *op_graph.impl()); + ARM_COMPUTE_RETURN_ON_ERROR(workload.status); + ClFusedKernelGraph fused_k_graph; + std::tie(workload.status, fused_k_graph) = init_fusion_graph(kernel_graph); + ARM_COMPUTE_RETURN_ON_ERROR(workload.status); + workload.status = fuse(fused_k_graph); + ARM_COMPUTE_RETURN_ON_ERROR(workload.status); + workload.status = generate(workload, ctx, fused_k_graph); + ARM_COMPUTE_RETURN_ON_ERROR(workload.status); + + // Get operator tensor id to workload tensor id map + const auto op_tensor_to_kernel_tensor = fused_k_graph.original_graph->graph.get_merge_points(); + const auto kernel_tensor_to_workload_tensor = workload.graph.get_merge_points(); + for(const auto op_t : op_graph.impl()->graph.src_tensors()) + { + const auto kernel_t = op_tensor_to_kernel_tensor.at(op_t); + const auto workload_t = kernel_tensor_to_workload_tensor.at(kernel_t); + workload.op_tensor_id_lut[workload_t] = op_t; + } + for(const auto op_t : op_graph.impl()->graph.dst_tensors()) + { + const auto kernel_t = op_tensor_to_kernel_tensor.at(op_t); + const auto workload_t = kernel_tensor_to_workload_tensor.at(kernel_t); + workload.op_tensor_id_lut[workload_t] = op_t; + } + return workload.status; +} 
+} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute
\ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/DependencyGraph.cpp b/src/core/experimental/dynamic_fusion/WorkloadImpl/DependencyGraph.cpp new file mode 100644 index 0000000000..2e8292bbfb --- /dev/null +++ b/src/core/experimental/dynamic_fusion/WorkloadImpl/DependencyGraph.cpp @@ -0,0 +1,431 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ +#include "arm_compute/core/experimental/DependencyGraph.h" + +#include <algorithm> +#include <deque> +#include <set> + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +DependencyGraph::DependencyGraph(const AdjList &adj_src_tensors, const AdjList &adj_dst_tensors, const AdjList &adj_src_ops, const AdjList &adj_dst_ops, std::map<Id, Id> merge_points) + : _adj_src_tensors{ adj_src_tensors }, _adj_dst_tensors{ adj_dst_tensors }, _adj_src_ops{ adj_src_ops }, _adj_dst_ops{ adj_dst_ops }, _merge_to_internal{ merge_points }, _operator_id{}, _tensor_id{} +{ +} +DependencyGraph::DependencyGraph(const std::vector<Id> &imported_tensors) + : _adj_src_tensors{}, _adj_dst_tensors{}, _adj_src_ops{}, _adj_dst_ops{}, _merge_to_internal{}, _operator_id{}, _tensor_id{} +{ + for(auto t : imported_tensors) + { + _adj_src_ops[t] = {}; + _adj_dst_ops[t] = {}; + } +} + +Status DependencyGraph::update_merge_point(Id t_id, Id merge_point) +{ + if(_merge_to_internal.find(merge_point) == _merge_to_internal.end()) + { + return Status{ ErrorCode::RUNTIME_ERROR, "Merge point does not exist" }; + } + _merge_to_internal[merge_point] = t_id; + return Status{}; +} + +DependencyGraph::Id DependencyGraph::add_tensor(Id merge_tensor) +{ + Id new_tensor{ empty_id() }; + if(merge_tensor != empty_id()) + { + if(_merge_to_internal.find(merge_tensor) != _merge_to_internal.end()) + { + new_tensor = _merge_to_internal[merge_tensor]; + } + else + { + new_tensor = insert_new_tensor(); + _merge_to_internal[merge_tensor] = new_tensor; + } + } + else + { + new_tensor = insert_new_tensor(); + } + return new_tensor; +} + +void DependencyGraph::remove_tensor(Id tensor) +{ + for(auto src_op : _adj_src_ops.at(tensor)) + { + auto &dst_tensors = _adj_dst_tensors.at(src_op); + dst_tensors.erase( + std::remove(std::begin(dst_tensors), 
std::end(dst_tensors), tensor), + std::end(dst_tensors)); + } + for(auto dst_op : _adj_dst_ops.at(tensor)) + { + auto &src_tensors = _adj_src_tensors.at(dst_op); + src_tensors.erase( + std::remove(std::begin(src_tensors), std::end(src_tensors), tensor), + std::end(src_tensors)); + } + _adj_src_ops.erase(tensor); + _adj_dst_ops.erase(tensor); +} + +std::pair<Status, DependencyGraph::Id> DependencyGraph::add_operator(const std::vector<Id> &inputs, const std::vector<Id> &outputs) +{ + Id new_op = insert_new_op(); + for(Id tensor : inputs) + { + link_input(new_op, tensor); + } + for(Id tensor : outputs) + { + link_output(new_op, tensor); + } + + // Use topological sort in order to detect possible loops / cycles. + // NOTE: This is unscalable. We'll need to have a better way of detecting loops or relax this invariant during operation, and add a validate method instead + return std::pair<Status, DependencyGraph::Id>(topological_sort().first, new_op); +} + +void DependencyGraph::remove_operator(Id op) +{ + for(auto src_tensor : _adj_src_tensors.at(op)) + { + auto &dst_ops = _adj_dst_ops.at(src_tensor); + dst_ops.erase( + std::remove(std::begin(dst_ops), std::end(dst_ops), op), + std::end(dst_ops)); + } + for(auto dst_tensor : _adj_dst_tensors.at(op)) + { + auto &src_ops = _adj_src_ops.at(dst_tensor); + src_ops.erase( + std::remove(std::begin(src_ops), std::end(src_ops), op), + std::end(src_ops)); + } + _adj_src_tensors.erase(op); + _adj_dst_tensors.erase(op); +} + +std::map<DependencyGraph::Id, DependencyGraph::Id> DependencyGraph::get_merge_points() const +{ + return _merge_to_internal; +} + +std::vector<DependencyGraph::Id> DependencyGraph::get_root_ops() const +{ + std::vector<Id> ops{}; + const auto op_list = all_ops(); + + for(auto op : op_list) + { + if(src_ops(op).empty()) + { + ops.emplace_back(op); + } + } + return ops; +} + +std::vector<DependencyGraph::Id> DependencyGraph::get_dst_ops() const +{ + std::vector<Id> ops{}; + const auto op_list = all_ops(); + + for(auto op : op_list) + { + if(dst_ops(op).empty()) + { + ops.emplace_back(op); + } + } + return ops; +} + +std::vector<DependencyGraph::Id> DependencyGraph::src_tensors(Id op) const +{ + ARM_COMPUTE_ERROR_ON(!operator_exists(op)); + return _adj_src_tensors.at(op); +} + +std::vector<DependencyGraph::Id> DependencyGraph::dst_tensors(Id op) const +{ + ARM_COMPUTE_ERROR_ON(!operator_exists(op)); + return _adj_dst_tensors.at(op); +} + +std::vector<DependencyGraph::Id> DependencyGraph::src_tensors() const +{ + std::vector<Id> tensors; + for(auto tensor_src_ops : _adj_src_ops) + { + if(tensor_src_ops.second.empty()) + tensors.push_back(tensor_src_ops.first); + } + return tensors; +} + +std::vector<DependencyGraph::Id> DependencyGraph::dst_tensors() const +{ + std::vector<Id> tensors; + for(auto tensor_dst_ops : _adj_dst_ops) + { + if(tensor_dst_ops.second.empty()) + tensors.push_back(tensor_dst_ops.first); + } + return tensors; +} + +std::vector<DependencyGraph::Id> DependencyGraph::src_ops_from_tensor(Id tensor) const +{ + return _adj_src_ops.at(tensor); +} +std::vector<DependencyGraph::Id> DependencyGraph::dst_ops_from_tensor(Id tensor) const +{ + return _adj_dst_ops.at(tensor); +} + +std::vector<DependencyGraph::Id> DependencyGraph::all_ops() const +{ + std::vector<Id> ops{}; + std::transform(std::begin(_adj_src_tensors), std::end(_adj_src_tensors), std::back_inserter(ops), [](const auto & it) + { + return it.first; + }); + return ops; +} + +bool DependencyGraph::path_exists_from_tensor_to_op(Id src_tensor, Id dst_op) const +{ + 
for(auto child_op : dst_ops_from_tensor(src_tensor)) + { + if(path_exists_from_op_to_op(child_op, dst_op)) + { + return true; + } + } + return false; +} + +bool DependencyGraph::path_exists_from_op_to_op(Id src_op, Id dst_op) const +{ + if(src_op == dst_op) + { + return true; + } + if(is_in(src_op, get_dst_ops())) + { + return false; + } + for(auto child_tensor : dst_tensors(src_op)) + { + if(path_exists_from_tensor_to_op(child_tensor, dst_op)) + { + return true; + } + } + return false; +} + +std::vector<DependencyGraph::Id> DependencyGraph::all_tensors() const +{ + std::vector<Id> tensors{}; + std::transform(std::begin(_adj_src_ops), std::end(_adj_src_ops), std::back_inserter(tensors), [](const auto & it) + { + return it.first; + }); + return tensors; +} + +unsigned int DependencyGraph::number_of_ops() const +{ + return _adj_src_tensors.size(); +} + +unsigned int DependencyGraph::number_of_tensors() const +{ + return _adj_src_ops.size(); +} + +DependencyGraph::Id DependencyGraph::insert_new_tensor() +{ + Id new_tensor = _tensor_id.alloc(); + _adj_src_ops[new_tensor] = {}; + _adj_dst_ops[new_tensor] = {}; + return new_tensor; +} +DependencyGraph::Id DependencyGraph::insert_new_op() +{ + Id new_op = _operator_id.alloc(); + _adj_src_tensors[new_op] = {}; + _adj_dst_tensors[new_op] = {}; + return new_op; +} +void DependencyGraph::link_input(Id op, Id in_tensor) +{ + ARM_COMPUTE_ERROR_ON(!operator_exists(op)); + ARM_COMPUTE_ERROR_ON(!tensor_exists(in_tensor)); + ARM_COMPUTE_ERROR_ON(are_connected(op, in_tensor)); + _adj_src_tensors[op].push_back(in_tensor); + _adj_dst_ops[in_tensor].push_back(op); +} +void DependencyGraph::link_output(Id op, Id out_tensor) +{ + ARM_COMPUTE_ERROR_ON(!operator_exists(op)); + ARM_COMPUTE_ERROR_ON(!tensor_exists(out_tensor)); + ARM_COMPUTE_ERROR_ON(are_connected(op, out_tensor)); + _adj_dst_tensors[op].push_back(out_tensor); + _adj_src_ops[out_tensor].push_back(op); +} +bool DependencyGraph::tensor_exists(Id tensor) const +{ + return _adj_src_ops.find(tensor) != _adj_src_ops.end() && _adj_dst_ops.find(tensor) != _adj_dst_ops.end(); +} +bool DependencyGraph::operator_exists(Id op) const +{ + return _adj_src_tensors.find(op) != _adj_src_tensors.end() && _adj_dst_tensors.find(op) != _adj_dst_tensors.end(); +} + +bool DependencyGraph::is_src_tensor(Id tensor) const +{ + if(!tensor_exists(tensor)) + { + return false; + } + return _adj_src_ops.at(tensor).empty(); +} + +bool DependencyGraph::is_dst_tensor(Id tensor) const +{ + if(!tensor_exists(tensor)) + { + return false; + } + return _adj_dst_ops.at(tensor).empty(); +} +bool DependencyGraph::is_src_tensor_of(Id op, Id tensor) const +{ + if(!operator_exists(op) || !tensor_exists(tensor)) + { + return false; + } + const auto op_inputs = src_tensors(op); + return std::find(op_inputs.begin(), op_inputs.end(), tensor) != op_inputs.end(); +} +bool DependencyGraph::is_dst_tensor_of(Id op, Id tensor) const +{ + if(!operator_exists(op) || !tensor_exists(tensor)) + { + return false; + } + const auto op_outputs = dst_tensors(op); + return std::find(op_outputs.begin(), op_outputs.end(), tensor) != op_outputs.end(); +} +bool DependencyGraph::are_connected(Id op, Id tensor) const +{ + return is_src_tensor_of(op, tensor) || is_dst_tensor_of(op, tensor); +} +std::vector<DependencyGraph::Id> DependencyGraph::src_ops(Id op) const +{ + ARM_COMPUTE_ERROR_ON(!operator_exists(op)); + std::vector<Id> ops{}; + for(Id src_tensor : src_tensors(op)) + { + ops.insert(ops.end(), std::begin(_adj_src_ops.at(src_tensor)), 
std::end(_adj_src_ops.at(src_tensor))); + } + return ops; +} + +std::vector<DependencyGraph::Id> DependencyGraph::dst_ops(Id op) const +{ + ARM_COMPUTE_ERROR_ON(!operator_exists(op)); + std::vector<Id> ops{}; + for(Id dst_tensor : _adj_dst_tensors.at(op)) + { + ops.insert(ops.end(), std::begin(_adj_dst_ops.at(dst_tensor)), std::end(_adj_dst_ops.at(dst_tensor))); + } + return ops; +} + +std::pair<Status, std::vector<DependencyGraph::OpPack>> DependencyGraph::topological_sort() const +{ + // Incident degree (number of source operators to an op) + std::map<Id, unsigned int> in_degree{}; + std::set<Id> visited_ops{}; + std::deque<Id> zero_in_degree_ops{}; + std::vector<OpPack> sorted_op_packs{}; + for(auto op : all_ops()) + { + const auto degree = src_ops(op).size(); + in_degree[op] = degree; + if(degree == 0) + { + zero_in_degree_ops.push_back(op); + visited_ops.insert(op); + } + } + + while(!zero_in_degree_ops.empty()) + { + const Id op = zero_in_degree_ops.front(); + zero_in_degree_ops.pop_front(); + sorted_op_packs.push_back(OpPack{ op, src_tensors(op), dst_tensors(op) }); + + for(const auto next_op : dst_ops(op)) + { + if(in_degree[next_op] > 0) + { + in_degree[next_op]--; + } + if(in_degree[next_op] == 0 && visited_ops.find(next_op) == visited_ops.end()) + { + zero_in_degree_ops.push_back(next_op); + visited_ops.insert(op); + } + } + } + + // If there are remaining ops with in_degree > 0, then it's indication that there are cycles in the graph + Status st{}; + if(sorted_op_packs.size() != number_of_ops()) + { + st = Status{ ErrorCode::RUNTIME_ERROR, "Cycles or loops are not allowed in a DependencyGraph" }; + } + return std::make_pair(st, sorted_op_packs); +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute
\ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/ITensorDescPack.h b/src/core/experimental/dynamic_fusion/WorkloadImpl/ITensorDescPack.h new file mode 100644 index 0000000000..bfa2eacfed --- /dev/null +++ b/src/core/experimental/dynamic_fusion/WorkloadImpl/ITensorDescPack.h @@ -0,0 +1,242 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ +#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_ITENSORDESCPACK_H +#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_ITENSORDESCPACK_H + +#include <cstddef> +#include <unordered_map> +#include <vector> + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +template <typename TDesc> +class ITensorDescPack +{ +public: + struct PackElement + { + PackElement() = default; + ~PackElement() = default; + PackElement(const PackElement &) = default; + PackElement &operator=(const PackElement &) = default; + PackElement(PackElement &&) = default; + PackElement &operator=(PackElement &&) = default; + PackElement(int id, TDesc *tensor) + : id(id), tensor(tensor), ctensor(nullptr) + { + } + PackElement(int id, const TDesc *ctensor) + : id(id), tensor(nullptr), ctensor(ctensor) + { + } + + int id{ -1 }; + TDesc *tensor{ nullptr }; + const TDesc *ctensor{ nullptr }; + + friend bool operator==(const PackElement &elem0, const PackElement &elem1) + { + const bool same_ctensor = (elem0.tensor == nullptr && elem1.tensor == nullptr && elem0.ctensor != nullptr && elem1.ctensor != nullptr && *elem0.ctensor == *elem1.ctensor); + const bool same_tensor = (elem0.ctensor == nullptr && elem1.ctensor == nullptr && elem0.tensor != nullptr && elem1.tensor != nullptr && *elem0.tensor == *elem1.tensor); + + return elem0.id == elem1.id && (same_ctensor || same_tensor); + } + }; + +public: + /** Default Constructor */ + ITensorDescPack() = default; + ~ITensorDescPack() = default; + ITensorDescPack<TDesc>(const ITensorDescPack<TDesc> &other) = default; + ITensorDescPack<TDesc> &operator=(const ITensorDescPack<TDesc> &other) = default; + ITensorDescPack<TDesc>(ITensorDescPack<TDesc> &&other) = default; + ITensorDescPack<TDesc> &operator=(ITensorDescPack<TDesc> &&other) = default; + /** Initializer list Constructor */ + 
ITensorDescPack(std::initializer_list<PackElement> l) + : _pack{} + { + for(auto &e : l) + { + _pack[e.id] = e; + } + } + /** Add tensor to the pack + * + * @param[in] id ID/type of the tensor to add + * @param[in] tensor Tensor to add + */ + void add_tensor(int id, TDesc *tensor) + { + _pack[id] = PackElement(id, tensor); + } + + /** Add const tensor to the pack + * + * @param[in] id ID/type of the tensor to add + * @param[in] tensor Tensor to add + */ + void add_const_tensor(int id, const TDesc *tensor) + { + _pack[id] = PackElement(id, tensor); + } + /** Get tensor of a given id from the pac + * + * @param[in] id ID of tensor to extract + * + * @return The pointer to the tensor if exist and is non-const else nullptr + */ + TDesc *get_tensor(int id) + { + auto it = _pack.find(id); + return it != _pack.end() ? it->second.tensor : nullptr; + } + /** Get constant tensor of a given id + * + * @param[in] id ID of tensor to extract + * + * @return The pointer to the tensor if exist and is const else nullptr + */ + const TDesc *get_const_tensor(int id) const + { + auto it = _pack.find(id); + if(it != _pack.end()) + { + return it->second.ctensor != nullptr ? it->second.ctensor : it->second.tensor; + } + return nullptr; + } + /** Remove the tensor stored with the given id + * + * @param[in] id ID of tensor to remove + */ + void remove_tensor(int id) + { + _pack.erase(id); + } + /** Pack size accessor + * + * @return Number of tensors registered to the pack + */ + size_t size() const + { + return _pack.size(); + } + /** Checks if pack is empty + * + * @return True if empty else false + */ + bool empty() const + { + return _pack.empty(); + } + + /** Get the ACL_SRC_* tensors + * + * @return std::vector<TDesc *> + */ + std::vector<TDesc *> get_src_tensors() + { + std::vector<TDesc *> src_tensors{}; + for(int id = static_cast<int>(TensorType::ACL_SRC); id <= static_cast<int>(TensorType::ACL_SRC_END); ++id) + { + auto tensor = get_tensor(id); + if(tensor != nullptr) + { + src_tensors.push_back(tensor); + } + } + return src_tensors; + } + /** Get the const ACL_SRC_* tensors + * + * @return std::vector<const TDesc *> + */ + std::vector<const TDesc *> get_const_src_tensors() const + { + std::vector<const TDesc *> src_tensors{}; + for(int id = static_cast<int>(TensorType::ACL_SRC); id <= static_cast<int>(TensorType::ACL_SRC_END); ++id) + { + auto tensor = get_const_tensor(id); + if(tensor != nullptr) + { + src_tensors.push_back(tensor); + } + } + return src_tensors; + } + /** Get the ACL_DST_* tensors + * + * @return std::vector<TDesc *> + */ + std::vector<TDesc *> get_dst_tensors() + { + std::vector<TDesc *> dst_tensors{}; + for(int id = static_cast<int>(TensorType::ACL_DST); id <= static_cast<int>(TensorType::ACL_DST_END); ++id) + { + auto tensor = get_tensor(id); + if(tensor != nullptr) + { + dst_tensors.push_back(tensor); + } + } + return dst_tensors; + } + /** Get the const ACL_DST_* tensors + * + * @return std::vector<const TDesc *> + */ + std::vector<const TDesc *> get_const_dst_tensors() const + { + std::vector<const TDesc *> dst_tensors{}; + for(int id = static_cast<int>(TensorType::ACL_DST); id <= static_cast<int>(TensorType::ACL_DST_END); ++id) + { + auto tensor = get_const_tensor(id); + if(tensor != nullptr) + { + dst_tensors.push_back(tensor); + } + } + return dst_tensors; + } + + friend bool operator==(const ITensorDescPack<TDesc> &pack0, const ITensorDescPack<TDesc> &pack1) + { + return pack0._pack == pack1._pack; + } + +private: + std::unordered_map<int, PackElement> _pack{}; /**< 
Container with the packed tensors */ +}; + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_ITENSORDESCPACK_H
\ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.cpp b/src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.cpp
new file mode 100644
index 0000000000..4b91c0f156
--- /dev/null
+++ b/src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.cpp
@@ -0,0 +1,387 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h"
+#include "src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+Status add_kernel_tensor(ClKernelGraph &k_graph, const OperatorGraph::Implementation &op_graph, const OpTensorContent &op_tensor, MemoryType memory_type, AuxMemoryInfo memory_info,
+                         DependencyGraph::Id &id)
+{
+    ARM_COMPUTE_UNUSED(op_graph);
+    return k_graph.add_kernel_tensor(op_tensor.desc, memory_type, memory_info, id, op_tensor.id);
+}
+
+Status add_kernel_tensor(ClKernelGraph &k_graph, const OperatorGraph::Implementation &op_graph, const OpTensorContent &op_tensor, DependencyGraph::Id &id)
+{
+    // For a tensor t
+    // 1. If t is a src tensor of the entire op graph, then it's Core.
+    //    (Optimisation opportunity: if we guarantee that all translate methods are called in topological order, we can always assign t to Core.
+    //    Because even if the op is non-root (which would mean t should be an Aux tensor), the src tensors would already have been determined by the ancestor ops (topological order), and thus would not be overridden by it)
+    // 2. If t is a dst tensor of the entire op graph, then it's Core.
+    // 3. Aux tensors with Persistent and Prepare lifetimes are manually specified
+    // 4. All other tensors not captured by the above are assigned Aux, with a lifetime of Temporary.
+ // kernel_graph.add_kernel_tensor(input->desc, ); + bool is_src_tensor_of_graph = is_in(op_tensor.id, op_graph.graph.src_tensors()); + bool is_dst_tensor_of_graph = is_in(op_tensor.id, op_graph.graph.dst_tensors()); + MemoryType memory_type; + AuxMemoryInfo memory_info; + if(is_src_tensor_of_graph || is_dst_tensor_of_graph) + { + memory_type = MemoryType::Core; + } + else + { + memory_type = MemoryType::Auxiliary; + memory_info.lifetime = AuxMemoryLifetime::Temporary; + memory_info.size = op_tensor.desc->total_size(); + } + return add_kernel_tensor(k_graph, op_graph, op_tensor, memory_type, memory_info, id); +} + +/** Get the suitable kernel size for using direct convolution method with NHWC data layout. + * + * @note Duplicate of the function with the same name in src/gpu/cl/operators/ClConv2d.cpp + * + * @note Direct convolution should be executed when the kernel has the spatial dimensions greater than or equal to the value returned by this function + * + * @param[in] gpu_target GPU target + * + * @return the suitable kernel size for using direct convolution method with NHWC data layout + */ +size_t get_direct_conv_kernel_threshold_nhwc(arm_compute::GPUTarget gpu_target) +{ + switch(gpu_target) + { + case arm_compute::GPUTarget::G76: + case arm_compute::GPUTarget::G77: + case arm_compute::GPUTarget::G78: + return 5; + case arm_compute::GPUTarget::G71: + case arm_compute::GPUTarget::G72: + case arm_compute::GPUTarget::MIDGARD: + case arm_compute::GPUTarget::BIFROST: + return 7; + default: + return 5; + } +} +} // namespace + +bool operator==(const OpTensor &t0, const OpTensor &t1) +{ + return std::make_tuple(t0.id()) == std::make_tuple(t1.id()); +} +bool operator==(const Padding2D &pad0, const Padding2D &pad1) +{ + return std::make_tuple(pad0.top, pad0.right, pad0.bottom, pad0.left) == std::make_tuple(pad1.top, pad1.right, pad1.bottom, pad1.left); +} +bool operator==(const Conv2dDescriptor &conv2d0, const Conv2dDescriptor &conv2d1) +{ + return std::make_tuple(conv2d0.pad, conv2d0.stride, conv2d0.dilation) == std::make_tuple(conv2d1.pad, conv2d1.stride, conv2d1.dilation); +} + +bool operator==(const AddDescriptor &, const AddDescriptor &) +{ + return std::make_tuple() == std::make_tuple(); // Currently two Add ops are always the same +} + +bool Conv2dContent::operator==(const OperatorContent &other) const +{ + const auto converted = *utils::cast::polymorphic_downcast<const Conv2dContent *>(&other); + return desc == converted.desc; +} + +bool AddContent::operator==(const OperatorContent &other) const +{ + const auto converted = *utils::cast::polymorphic_downcast<const AddContent *>(&other); + return desc == converted.desc; +} + +ConvolutionMethod Conv2dContent::select_conv_method(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const Conv2dDescriptor &conv2d_desc, const GPUTarget gpu_target) +{ + // Modified from ClConv2d::get_convolution_method + + ARM_COMPUTE_ERROR_ON_NULLPTR(src); + ARM_COMPUTE_ERROR_ON_NULLPTR(dst); + ARM_COMPUTE_ERROR_ON_NULLPTR(weights); + + const PadStrideInfo legacy_pad_stride(conv2d_desc.stride.x(), conv2d_desc.stride.y(), conv2d_desc.pad.left, conv2d_desc.pad.right, conv2d_desc.pad.top, conv2d_desc.pad.bottom, DimensionRoundingType{}); + const Size2D dilation = conv2d_desc.dilation; + + const size_t idx_w = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH); + const size_t idx_h = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT); + const size_t idx_c = 
get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL); + + /* Input spatial dims, kernel size, IFM/OFM, conv info*/ + using ConvolutionConfiguration = std::tuple<Size2D, Size2D, Size2D, PadStrideInfo, DataLayout>; + using ConfigurationMethod = std::pair<ConvolutionConfiguration, ConvolutionMethod>; + + const std::vector<ConfigurationMethod> known_configs = + { + // Alexnet + ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U), PadStrideInfo(1U, 1U, 2U, 2U), DataLayout::NCHW), ConvolutionMethod::DIRECT), + // VGG16 / VGG19 + ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U), PadStrideInfo(1U, 1U, 1U, 1U), DataLayout::NCHW), ConvolutionMethod::DIRECT), + // Mobilenet 224 + ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW), ConvolutionMethod::GEMM), + // Mobilenet 160 + ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW), ConvolutionMethod::GEMM), + // Mobilenet 224 + ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC), ConvolutionMethod::GEMM), + // Mobilenet 160 + ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC), ConvolutionMethod::GEMM), + }; + + const auto find_config = [&](ConfigurationMethod c) + { + const ConvolutionConfiguration config = c.first; + const PadStrideInfo info = std::get<3>(config); + const DataLayout data_layout = std::get<4>(config); + + return std::get<0>(config) == Size2D(src->dimension(idx_w), src->dimension(idx_h)) && std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h)) + && std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && info.pad_top() == legacy_pad_stride.pad_top() && info.pad_right() == legacy_pad_stride.pad_right() + && info.pad_bottom() == legacy_pad_stride.pad_bottom() && info.pad_left() == legacy_pad_stride.pad_left() && info.stride() == legacy_pad_stride.stride() && (data_layout == src->data_layout()); + }; + + std::vector<ConfigurationMethod>::const_iterator found; + if((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end()) + { + return (*found).second; + } + + if(dilation != Size2D(1U, 1U)) + { + return ConvolutionMethod::GEMM; + } + else + { + if(src->data_layout() == DataLayout::NCHW) + { + ARM_COMPUTE_ERROR("NCHW not supported"); + } + else + { + const bool is_direct_valid = bool(ClDirectConv2dKernel::validate(src, weights, nullptr, dst, ClDirectConv2dKernelDescriptor{ conv2d_desc })); + const size_t kernel_sz_direct_conv_thr = get_direct_conv_kernel_threshold_nhwc(gpu_target); + + // SRGAN case + if((src->dimension(idx_h) > 720U) && (dst->dimension(idx_h) > 720U) && (weights->dimension(idx_h) == 9) && (conv2d_desc.pad.top < 3) + && is_direct_valid) + { + return ConvolutionMethod::DIRECT; + } + + // Floating-point case: GeMM/Direct + if(is_data_type_float(src->data_type())) + { + // Get dst shape + TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, 
legacy_pad_stride); + const bool is_large_kernel_sz = (weights->dimension(idx_w) >= kernel_sz_direct_conv_thr) && (weights->dimension(idx_h) >= kernel_sz_direct_conv_thr); + const bool is_ifm_ge_16 = src->dimension(idx_c) >= 16; + const bool is_ofm_lte_8 = weights->dimension(3U) <= 8; + const bool workload_gte_8192 = (output_shape[0] * output_shape[1] * output_shape[2]) / 16 >= 8192; + const bool is_ifm_gt_ofm = src->dimension(idx_c) > weights->dimension(3U); + + // Direct convolution case + if(is_direct_valid) + { + if((gpu_target == arm_compute::GPUTarget::G71 || gpu_target == arm_compute::GPUTarget::G72 || gpu_target == arm_compute::GPUTarget::MIDGARD)) + { + if(is_large_kernel_sz && is_ifm_ge_16 && is_ifm_gt_ofm) + { + return ConvolutionMethod::DIRECT; + } + } + else + { + if((is_large_kernel_sz && workload_gte_8192 && is_ifm_ge_16) || (is_ofm_lte_8 && is_ifm_ge_16)) + { + return ConvolutionMethod::DIRECT; + } + } + } + + // Default case + return ConvolutionMethod::GEMM; + } + + // Generic case for quantized. Only GeMM + return ConvolutionMethod::GEMM; + } + } + return ConvolutionMethod::DIRECT; +} + +Status Conv2dContent::translate(ClKernelGraph &kernel_graph) const +{ + const auto input = _tensors.get_const_tensor(TensorType::ACL_SRC_0); + const auto weight = _tensors.get_const_tensor(TensorType::ACL_SRC_1); + const auto dst = _tensors.get_const_tensor(TensorType::ACL_DST_0); + const auto method = forced_method_enabled ? forced_method : Conv2dContent::select_conv_method(input->desc, weight->desc, dst->desc, desc, CLScheduler::get().target()); + switch(method) + { + case ConvolutionMethod::DIRECT: + { + return translate_direct_conv2d(kernel_graph); + } + default: + { + ARM_COMPUTE_RETURN_ERROR_MSG("Not implemented"); + } + } + return Status{}; +} +Status Conv2dContent::translate_direct_conv2d(ClKernelGraph &kernel_graph) const +{ + const auto input = _tensors.get_const_tensor(TensorType::ACL_SRC_0); + const auto weight = _tensors.get_const_tensor(TensorType::ACL_SRC_1); + const auto bias = _tensors.get_const_tensor(TensorType::ACL_SRC_2); + const auto dst = _tensors.get_const_tensor(TensorType::ACL_DST_0); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weight, dst); + + ITensorDescPack<ClKernelTensor> tensors; + + DependencyGraph::Id input_id; + auto st = add_kernel_tensor(kernel_graph, *_graph, *input, input_id); + ARM_COMPUTE_RETURN_ON_ERROR(st); + tensors.add_const_tensor(ACL_SRC_0, kernel_graph.get_tensor(input_id)); + + DependencyGraph::Id weight_id; + st = add_kernel_tensor(kernel_graph, *_graph, *weight, weight_id); + ARM_COMPUTE_RETURN_ON_ERROR(st); + tensors.add_const_tensor(ACL_SRC_1, kernel_graph.get_tensor(weight_id)); + + if(bias != nullptr) + { + DependencyGraph::Id bias_id; + st = add_kernel_tensor(kernel_graph, *_graph, *bias, bias_id); + ARM_COMPUTE_RETURN_ON_ERROR(st); + tensors.add_const_tensor(ACL_SRC_2, kernel_graph.get_tensor(bias_id)); + } + + DependencyGraph::Id dst_id; + st = add_kernel_tensor(kernel_graph, *_graph, *dst, dst_id); + ARM_COMPUTE_RETURN_ON_ERROR(st); + tensors.add_const_tensor(ACL_DST_0, kernel_graph.get_tensor(dst_id)); + + DependencyGraph::Id direct_conv2d_id; + const auto kernel_desc = ClDirectConv2dKernelDescriptor{ desc }; + + st = ClDirectConv2dKernel::validate(input->desc, weight->desc, bias == nullptr ? 
nullptr : bias->desc, dst->desc, kernel_desc); + ARM_COMPUTE_RETURN_ON_ERROR(st); + + ClKernelConfig config{ UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }, TileDescriptor{}, StoreType::TStoreIndirectWidthSelect }; + st = kernel_graph.add_kernel<ClDirectConv2dKernel>(config, kernel_desc, tensors, direct_conv2d_id); + ARM_COMPUTE_RETURN_ON_ERROR(st); + ARM_COMPUTE_UNUSED(direct_conv2d_id); + + return Status{}; +} + +Status AddContent::translate(ClKernelGraph &kernel_graph) const +{ + const auto lhs = _tensors.get_const_tensor(TensorType::ACL_SRC_0); + const auto rhs = _tensors.get_const_tensor(TensorType::ACL_SRC_1); + const auto dst = _tensors.get_const_tensor(TensorType::ACL_DST_0); + ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); + + ITensorDescPack<ClKernelTensor> tensors; + + DependencyGraph::Id lhs_id; + auto st = add_kernel_tensor(kernel_graph, *_graph, *lhs, lhs_id); + ARM_COMPUTE_RETURN_ON_ERROR(st); + tensors.add_const_tensor(ACL_SRC_0, kernel_graph.get_tensor(lhs_id)); + + DependencyGraph::Id rhs_id; + st = add_kernel_tensor(kernel_graph, *_graph, *rhs, rhs_id); + ARM_COMPUTE_RETURN_ON_ERROR(st); + tensors.add_const_tensor(ACL_SRC_1, kernel_graph.get_tensor(rhs_id)); + + DependencyGraph::Id dst_id; + st = add_kernel_tensor(kernel_graph, *_graph, *dst, dst_id); + ARM_COMPUTE_RETURN_ON_ERROR(st); + tensors.add_const_tensor(ACL_DST_0, kernel_graph.get_tensor(dst_id)); + + DependencyGraph::Id add_id; + ClKernelConfig config{ UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }, TileDescriptor{}, StoreType::TStoreIndirectWidthSelect }; + + st = ClAddKernel::validate(lhs->desc, rhs->desc, dst->desc); + ARM_COMPUTE_RETURN_ON_ERROR(st); + + st = kernel_graph.add_kernel<ClAddKernel>(config, ClEltwiseAddKernelDescriptor{ desc }, tensors, add_id); + ARM_COMPUTE_RETURN_ON_ERROR(st); + ARM_COMPUTE_UNUSED(add_id); + + return Status{}; +} + +std::vector<const OperatorContent *> traverse(const OperatorGraph::Implementation &graph) +{ + std::vector<const OperatorContent *> ops; + const auto sorted = graph.graph.topological_sort(); + for(const auto &pack : sorted.second) + { + ops.push_back(graph.operators.at(pack.op).get()); + } + return ops; +} + +std::vector<OperatorContent *> traverse(OperatorGraph::Implementation &graph) +{ + std::vector<OperatorContent *> ops; + const auto sorted = graph.graph.topological_sort(); + for(const auto &pack : sorted.second) + { + ops.push_back(graph.operators.at(pack.op).get()); + } + return ops; +} + +Status translate(ClKernelGraph &kernel_graph, const OperatorGraph::Implementation &op_graph) +{ + for(const auto &op : traverse(op_graph)) + { + const auto st = op->translate(kernel_graph); + ARM_COMPUTE_RETURN_ON_ERROR(st); + } + return Status{}; +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute
\ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.h b/src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.h new file mode 100644 index 0000000000..c33e189797 --- /dev/null +++ b/src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.h @@ -0,0 +1,229 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ +#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_OPERATORGRAPHIMPL +#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_OPERATORGRAPHIMPL + +#include "arm_compute/core/experimental/ClWorkload.h" +#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ITensorDescPack.h" + +#include "support/Cast.h" +#include "support/DeepCopy.h" + +#include <map> +#include <tuple> +#include <type_traits> + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +enum class OperatorComplexity +{ + Complex = 0, + Simple +}; + +struct ClKernelGraph; +struct OpTensorContent +{ +public: + using Id = DependencyGraph::Id; + OpTensorContent() = default; + OpTensorContent(Id id) + : id{ id }, desc{} + { + } + OpTensorContent(Id id, ITensorInfo *desc) + : id{ id }, desc{ desc } + { + } + ~OpTensorContent() = default; + OpTensorContent(const OpTensorContent &) = default; + OpTensorContent &operator=(const OpTensorContent &) = default; + OpTensorContent(OpTensorContent &&) = default; + OpTensorContent &operator=(OpTensorContent &&) = default; + bool operator==(const OpTensorContent &other) const + { + return desc == other.desc; + } + + const ITensorInfo *get_tensor_info() const + { + return desc; + } + ITensorInfo *get_tensor_info() + { + return desc; + } + + Id id{}; + ITensorInfo *desc{}; +}; + +struct OperatorContent +{ +public: + using Id = DependencyGraph::Id; + OperatorContent() = default; + OperatorContent(const OperatorGraph::Implementation *graph, Id id, const ITensorDescPack<OpTensorContent> &tensors) + : _graph{ graph }, _id{ id }, _tensors{ tensors } + { + } + OperatorContent(const OperatorContent &op) = default; + OperatorContent &operator=(const OperatorContent &op) = default; + OperatorContent(OperatorContent &&op) = default; + OperatorContent &operator=(OperatorContent &&op) = default; + virtual 
~OperatorContent() = default; + virtual OperatorComplexity complexity() const = 0; + virtual bool operator==(const OperatorContent &other) const = 0; + virtual Status translate(ClKernelGraph &kernel_graph) const = 0; + +protected: + const OperatorGraph::Implementation *_graph {}; + Id _id{}; + ITensorDescPack<OpTensorContent> _tensors{}; +}; + +struct Conv2dContent : public OperatorContent +{ +public: + Conv2dContent() = default; + Conv2dContent(const OperatorGraph::Implementation *graph, Id id, const Conv2dDescriptor &desc, const ITensorDescPack<OpTensorContent> &tensors) + : OperatorContent(graph, id, tensors), desc(desc), forced_method(), forced_method_enabled(false) + { + } + // Temporary. Do not need to pass ConvolutionMethod + Conv2dContent(const OperatorGraph::Implementation *graph, Id id, const Conv2dDescriptor &desc, const ITensorDescPack<OpTensorContent> &tensors, ConvolutionMethod method) + : OperatorContent(graph, id, tensors), desc(desc), forced_method(method), forced_method_enabled(true) + { + } + ~Conv2dContent() = default; + Conv2dContent(const Conv2dContent &) = default; + Conv2dContent &operator=(const Conv2dContent &) = default; + Conv2dContent(Conv2dContent &&) = default; + Conv2dContent &operator=(Conv2dContent &&) = default; + bool operator==(const OperatorContent &other) const override; + OperatorComplexity complexity() const override + { + return OperatorComplexity::Complex; + } + void set_method(ConvolutionMethod method) + { + forced_method_enabled = true; + forced_method = method; + } + + Status translate(ClKernelGraph &kernel_graph) const override; + /** Replicate heuristics of @ref ClConv2d::get_convolution_method(), except that non-supported data types and data layouts are removed from the heuristics + * + * @param src + * @param weights + * @param dst + * @param conv2d_desc + * @param gpu_target + * @return ConvolutionMethod + */ + static ConvolutionMethod select_conv_method(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const Conv2dDescriptor &conv2d_desc, const GPUTarget gpu_target); + + Conv2dDescriptor desc{}; + ConvolutionMethod forced_method{ ConvolutionMethod::GEMM_CONV2D }; + bool forced_method_enabled{ false }; + +private: + Status translate_direct_conv2d(ClKernelGraph &kernel_graph) const; +}; + +class AddContent : public OperatorContent +{ +public: + AddContent() = default; + AddContent(const OperatorGraph::Implementation *graph, Id id, const AddDescriptor &desc, const ITensorDescPack<OpTensorContent> &tensors) + : OperatorContent(graph, id, tensors), desc(desc) + { + } + ~AddContent() = default; + AddContent(const AddContent &) = default; + AddContent &operator=(const AddContent &) = default; + AddContent(AddContent &&) = default; + AddContent &operator=(AddContent &&) = default; + bool operator==(const OperatorContent &other) const override; + OperatorComplexity complexity() const override + { + return OperatorComplexity::Simple; + } + Status translate(ClKernelGraph &kernel_graph) const override; + +private: + AddDescriptor desc{}; +}; + +struct OperatorGraph::Implementation +{ +public: + template <typename ContentT, typename... Args> + void add_node(Operator::Id id, Args &&... args) + { + operators[id] = utils::memory::make_deep_unique<OperatorContent, ContentT>(this, id, std::forward<Args>(args)...); + } + + template <typename... Args> + void add_tensor(OpTensor::Id id, Args &&... 
args) + { + tensors[id] = utils::memory::make_deep_unique<OpTensorContent, OpTensorContent>(id, std::forward<Args>(args)...); + } + + using Dependency = DependencyGraph; + using OperatorMap = std::map<Operator::Id, utils::memory::deep_unique_ptr<OperatorContent>>; + using OpTensorMap = std::map<OpTensor::Id, utils::memory::deep_unique_ptr<OpTensorContent>>; + + Implementation() = default; + ~Implementation() = default; + + friend bool operator==(const OperatorGraph::Implementation &graph0, const OperatorGraph::Implementation &graph1) + { + return graph0.graph == graph1.graph && graph0.operators == graph1.operators && graph0.tensors == graph1.tensors; + } + + Dependency graph{}; + OperatorMap operators{}; + OpTensorMap tensors{}; + Status status{}; +}; + +std::vector<const OperatorContent *> traverse(const OperatorGraph::Implementation &graph); + +std::vector<OperatorContent *> traverse(OperatorGraph::Implementation &graph); + +Status translate(ClKernelGraph &kernel_graph, const OperatorGraph::Implementation &op_graph); + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute + +#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_OPERATORGRAPHIMPL
\ No newline at end of file
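
Note on DependencyGraph::topological_sort() above: it is Kahn's algorithm, and a cycle is reported whenever the number of sorted operator packs falls short of number_of_ops(). The following is a minimal standalone sketch of that same check using only standard containers; the Graph alias, the free function and the example values are illustrative only and are not part of the Compute Library API.

#include <deque>
#include <iostream>
#include <map>
#include <vector>

// Adjacency list: op id -> ids of the ops that consume its outputs.
using Graph = std::map<int, std::vector<int>>;

// Returns ops in topological order. If the result is shorter than the number
// of ops, the graph contains a cycle (the same condition checked in the diff
// via sorted_op_packs.size() != number_of_ops()).
std::vector<int> topological_sort(const Graph &g)
{
    // Count incoming edges per op.
    std::map<int, unsigned int> in_degree;
    for(const auto &node : g)
    {
        in_degree[node.first]; // ensure every op has an entry, default 0
        for(int consumer : node.second)
        {
            ++in_degree[consumer];
        }
    }

    // Seed the queue with ops that have no producers (the "root" ops).
    std::deque<int> zero_in_degree;
    for(const auto &d : in_degree)
    {
        if(d.second == 0)
        {
            zero_in_degree.push_back(d.first);
        }
    }

    std::vector<int> sorted;
    while(!zero_in_degree.empty())
    {
        const int op = zero_in_degree.front();
        zero_in_degree.pop_front();
        sorted.push_back(op);

        const auto it = g.find(op);
        if(it == g.end())
        {
            continue;
        }
        // Releasing op's outputs may unblock its consumers.
        for(int consumer : it->second)
        {
            if(--in_degree[consumer] == 0)
            {
                zero_in_degree.push_back(consumer);
            }
        }
    }
    return sorted;
}

int main()
{
    // 0 -> 1 -> 2 : acyclic, all three ops get sorted.
    const Graph dag{ { 0, { 1 } }, { 1, { 2 } }, { 2, {} } };
    // 0 -> 1 -> 2 -> 0 : cyclic, no op ever reaches in-degree zero.
    const Graph cyclic{ { 0, { 1 } }, { 1, { 2 } }, { 2, { 0 } } };

    std::cout << "dag: sorted " << topological_sort(dag).size() << " of " << dag.size() << " ops\n";
    std::cout << "cyclic: sorted " << topological_sort(cyclic).size() << " of " << cyclic.size() << " ops\n";
    return 0;
}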