From 04f4620cf999846a44089c81720aa920edec6993 Mon Sep 17 00:00:00 2001 From: Viet-Hoa Do Date: Wed, 14 Dec 2022 14:49:56 +0000 Subject: Add multiple output support for dynamic fusion * The dependency graph now can schedule any acyclic graph into a sequential list of operators. This is needed as the output operators now form branches in the graph. * Fix the definition of input, output and intermediate tensors in GpuKernelComponentGroup to support non-linear but sequential list of operators. * Add constraint on GpuOperatorGroup to enforce strictly linear fusion style, but allow output operator as the only form of branch. Resolves: COMPMID-5771 Signed-off-by: Viet-Hoa Do Change-Id: I68de3a31a2456145081f0a397e4e61dd66327682 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8823 Reviewed-by: Gunes Bayir Tested-by: Arm Jenkins Comments-Addressed: Arm Jenkins --- .../sketch/gpu/GpuKernelComponentGraph.cpp | 37 +-- .../sketch/gpu/GpuKernelComponentGroup.cpp | 146 ++++----- .../sketch/gpu/GpuKernelComponentGroup.h | 27 +- .../sketch/gpu/GpuKernelComponentStream.cpp | 2 + src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp | 10 +- src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h | 10 +- .../sketch/gpu/operators/GpuOutput.cpp | 4 +- .../template_writer/cl/ClTemplateActivation.cpp | 2 +- .../gpu/template_writer/cl/ClTemplateCast.cpp | 2 +- .../cl/ClTemplateDepthwiseConv2d.cpp | 2 +- .../template_writer/cl/ClTemplateDirectConv2d.cpp | 2 +- .../cl/ClTemplateElementwiseBinary.cpp | 2 +- .../gpu/template_writer/cl/ClTemplateStore.cpp | 4 +- .../gpu/template_writer/cl/ClTemplateWriter.cpp | 4 +- src/dynamic_fusion/sketch/utils/DependencyGraph.h | 340 +++++++++------------ .../validation/dynamic_fusion/gpu/Integration.cpp | 101 ++++++ 16 files changed, 346 insertions(+), 349 deletions(-) diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp index 1f90aab477..669913ce30 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp +++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp @@ -93,40 +93,19 @@ GpuKernelComponentStream GpuKernelComponentGraph::fuse() const { // Obtain memory descriptor map const auto mem_map = assign_memory_descriptors(_tensors, _dependency_graph); - /// @note Fusion constraints (for kernel components) are exactly the same as the invariants of @ref GpuKernelComponentGroup - /// Fusion can be framed as a mathematical optimization problem: - /// Given fusion constraints, find the "best" fusion patterns possible - /// "Best" is ill-defined at the moment. 
For now we define "best" fusion pattern as one - /// which results in the least number of fused kernels ( @ref GpuKernelComponentGroup ) at the end - - /// As the first iteration, we offer a sub-optimal algorithm here which ensures all - /// constraints are met, but provides no guarantee that the fusion pattern is optimal GpuKernelComponentStream stream{ _services, mem_map }; - // Break down into linear groups of components (constraint 1), preserving topological order - const auto linear_graphs = _dependency_graph.topological_partition(); + const auto op_seq = _dependency_graph.build_operators_sequence(); - // Further divide up the linear groups based on rest of the fusion constraints (rely on component group's invariants) - for(const auto &graph : linear_graphs) + stream.new_component_group(); + for(auto op : op_seq) { - for(unsigned int i = 0; i < graph.size(); ++i) - { - const auto comp = _components.at(graph[i].op).get(); - // Each new linear graph signals a new component group in the stream - if(i == 0) - { - stream.new_component_group(); - } - // If it violates the component group's invariant / fusion constraint, breaks up the stream by inserting a new group - bool success = stream.add_component(comp); - if(!success) - { - stream.new_component_group(); - success = stream.add_component(comp); - ARM_COMPUTE_ERROR_ON(!success); - } - } + const auto component = _components.at(op.op).get(); + const auto success = stream.add_component(component); + ARM_COMPUTE_ERROR_ON(!success); + ARM_COMPUTE_UNUSED(success); } + return stream; } } // namespace dynamic_fusion diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp index 3af4c1429d..0d2574957f 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp +++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp @@ -27,6 +27,8 @@ #include "arm_compute/core/Validate.h" #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" +#include + namespace arm_compute { namespace experimental @@ -35,6 +37,9 @@ namespace dynamic_fusion { bool GpuKernelComponentGroup::add_component(ComponentPtr component) { + ARM_COMPUTE_ERROR_ON_MSG( + _finalized, "The component group has been finalized and cannot be altered."); + // note: Constraint 1 is guaranteed as a precondition // Constraint 2 if(component->type() != GpuComponentType::Output && _components.size() >= max_fused_components) @@ -51,11 +56,6 @@ bool GpuKernelComponentGroup::add_component(ComponentPtr component) { return false; } - // Constraint 3.3: Disallow multiple output components - if(!_components.empty() && get_last_component()->type() == GpuComponentType::Output && component->type() == GpuComponentType::Output) - { - return false; - } // Constraint 4 if(component->type() != GpuComponentType::Unfusable && component->tensors().get_const_dst_tensors().size() != 1U) { @@ -124,55 +124,68 @@ bool GpuKernelComponentGroup::add_component(ComponentPtr component) return true; } -std::vector GpuKernelComponentGroup::get_src_tensors() const +void GpuKernelComponentGroup::finalize() { - if(_components.empty()) + if(_finalized) { - return {}; + return; } - auto src_tensors = _components[0]->tensors().get_const_src_tensors(); - auto prev_dst_tensor = _components[0]->tensors().get_const_dst_tensors()[0]; // PRE: Only one dst tensor per component - for(unsigned int i = 1; i < _components.size(); ++i) + + _finalized = true; + + std::set input_tensors; + std::set output_tensors; + + for(auto component : 
_components) { - auto cur_src_tensors = _components[i]->tensors().get_const_src_tensors(); - for(const auto src_tensor : cur_src_tensors) + const auto tensors = component->tensors(); + const auto src_tensors = tensors.get_const_src_tensors(); + const auto dst_tensors = tensors.get_const_dst_tensors(); + + // Detect input, output and intermediate tensors. + for(auto tensor : src_tensors) { - if(src_tensor->id() == prev_dst_tensor->id()) + const auto output_tensors_it = output_tensors.find(tensor); + + if(output_tensors_it != output_tensors.end()) { - continue; // Skip "intermediate" tensors. I.e. tensors that are used to link between two components + // This tensor is the output of another operator. + // It must be marked as intermediate tensor. + output_tensors.erase(output_tensors_it); + _interm_tensors.insert(tensor); + } + else if(_interm_tensors.find(tensor) == _interm_tensors.end()) + { + input_tensors.insert(tensor); } - src_tensors.push_back(src_tensor); } - prev_dst_tensor = _components[i]->tensors().get_const_dst_tensors()[0]; // PRE: Only one dst tensor per component + + for(auto tensor : dst_tensors) + { + ARM_COMPUTE_ERROR_ON(input_tensors.find(tensor) != input_tensors.end()); + ARM_COMPUTE_ERROR_ON(output_tensors.find(tensor) != output_tensors.end()); + ARM_COMPUTE_ERROR_ON(_interm_tensors.find(tensor) != _interm_tensors.end()); + output_tensors.insert(tensor); + } } - return src_tensors; + std::set_union( + input_tensors.begin(), input_tensors.end(), + output_tensors.begin(), output_tensors.end(), + std::back_inserter(_argument_tensors)); + _any_output_tensor = *output_tensors.begin(); } -std::vector GpuKernelComponentGroup::get_dst_tensors() const +const ITensorInfo *GpuKernelComponentGroup::get_any_dst_tensor() const { - if(_components.empty()) - { - return {}; - } - const auto dst_tensor_ptrs = _components[_components.size() - 1]->tensors().get_const_dst_tensors(); - std::vector dst_tensors; - for(auto tensor_ptr : dst_tensor_ptrs) - { - dst_tensors.push_back(tensor_ptr); - } - return dst_tensors; + ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized."); + return _any_output_tensor; } std::vector GpuKernelComponentGroup::get_argument_tensors() const { - std::vector arguments; - const auto src_tensors = get_src_tensors(); - const auto dst_tensors = get_dst_tensors(); - arguments.reserve(src_tensors.size() + dst_tensors.size()); - arguments.insert(arguments.end(), src_tensors.begin(), src_tensors.end()); - arguments.insert(arguments.end(), dst_tensors.begin(), dst_tensors.end()); - return arguments; + ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized."); + return _argument_tensors; } GpuKernelComponentGroup::ComponentPtr GpuKernelComponentGroup::get_root_component() const @@ -184,41 +197,10 @@ GpuKernelComponentGroup::ComponentPtr GpuKernelComponentGroup::get_root_componen return _components[0]; } -GpuKernelComponentGroup::ComponentPtr GpuKernelComponentGroup::get_last_component() const -{ - if(empty()) - { - return nullptr; - } - return _components[_components.size() - 1]; -} - -GpuKernelComponentGroup::ComponentPtr GpuKernelComponentGroup::get_previous_component(ComponentId id) const -{ - if(empty()) - { - return nullptr; - } - // Get the index of the requested component - size_t ind = 0; - for(const auto c : _components) - { - if(c->id() == id) - { - break; - } - ind++; - } - if(ind == 0 || ind >= _components.size()) - { - return nullptr; - } - return _components[ind - 1]; -} - bool 
GpuKernelComponentGroup::is_intermediate_tensor(const ITensorInfo *tensor) const { - return is_tensor_in(tensor, get_interm_tensors()); + ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized."); + return _interm_tensors.find(tensor) != _interm_tensors.end(); } size_t GpuKernelComponentGroup::size() const @@ -262,30 +244,6 @@ typename std::vector::const_iterator GpuK return _components.cend(); } -std::vector GpuKernelComponentGroup::get_interm_tensors() const -{ - std::vector interm_tensors{}; - for(unsigned int i = 0; i + 1 < _components.size(); ++i) - { - auto interm_tensor = _components[i]->tensors().get_const_dst_tensors()[0]; - interm_tensors.push_back(interm_tensor); // PRE: Only one dst tensor per component - } - - return interm_tensors; -} - -bool GpuKernelComponentGroup::is_tensor_in(const ITensorInfo *tensor, const std::vector tensors) -{ - for(auto t : tensors) - { - if(tensor->id() == t->id()) - { - return true; - } - } - return false; -} - } // namespace dynamic_fusion } // namespace experimental } // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h index 4c9d940594..386aefdc05 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h +++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h @@ -29,6 +29,7 @@ #include #include #include +#include namespace arm_compute { @@ -88,25 +89,16 @@ public: * @return false If the operation fails */ bool add_component(ComponentPtr component); - /** Get source tensors of this group */ - std::vector get_src_tensors() const; - /** Get destination tensors of this group */ - std::vector get_dst_tensors() const; + /** Optimize and pre-compute information about the component group */ + void finalize(); + /** Get one of the destination tensors of this group */ + const ITensorInfo *get_any_dst_tensor() const; /** Get tensor argument of this group * A tensor is an argument if it is a source or destination tensor to the group */ std::vector get_argument_tensors() const; /** Get the root (first) component of this group */ ComponentPtr get_root_component() const; - /** Get the last component of this group */ - ComponentPtr get_last_component() const; - /** Get the previous component to the component with id @p id - * - * @param[in] id Component id of the component whose previous component is of concern - * - * @return ComponentPtr Pointer to the previous component of the one identified by @p id - */ - ComponentPtr get_previous_component(ComponentId id) const; /** Check if a @ref ITensorInfo is an "intermediate" tensor of the group * * An intermediate tensor is any tensor that is not an argument. 
@@ -131,11 +123,12 @@ public: typename std::vector::const_iterator cend() const; private: - std::vector get_interm_tensors() const; - - static bool is_tensor_in(const ITensorInfo *tensor, const std::vector tensors); - std::vector _components{}; + + bool _finalized{ false }; + std::vector _argument_tensors{}; + std::set _interm_tensors{}; + const ITensorInfo *_any_output_tensor{ nullptr }; }; } // namespace dynamic_fusion } // namespace experimental diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp index aac84b6c59..8f4eadc477 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp +++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp @@ -44,6 +44,8 @@ GpuWorkloadSourceCode GpuKernelComponentStream::write_workload_code() // Traverse through component groups and assemble workload together for(auto && group : _component_groups) { + group.finalize(); + // Write kernel code GpuLogicalKernel logical_kernel(_services, group); const GpuKernelSourceCode kernel_code = logical_kernel.write_kernel_code(); diff --git a/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp index e8ef835405..7bb14c8698 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp +++ b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp @@ -68,12 +68,12 @@ ArgumentPack Operator::tensors() const return _tensors; } -bool GpuOperatorGroup::try_add_operator(const Operator &op) const +bool GpuOperatorGroup::try_add_operator(const Operator &op, bool is_output) const { const auto src_tensor_ids = get_tensor_ids(op.tensors().get_const_src_tensors()); const auto dst_tensor_ids = get_tensor_ids(op.tensors().get_const_dst_tensors()); // Constraint 1 - if(!_graph.try_add_operator_as_linear(op.id(), src_tensor_ids, dst_tensor_ids)) + if(!_graph.try_add_operator_as_linear(op.id(), src_tensor_ids, dst_tensor_ids, is_output)) { return false; } @@ -143,12 +143,12 @@ bool GpuOperatorGroup::try_add_operator(const Operator &op) const } return true; } -void GpuOperatorGroup::add_operator(const Operator &op) +void GpuOperatorGroup::add_operator(const Operator &op, bool is_output) { - ARM_COMPUTE_ERROR_ON(!try_add_operator(op)); + ARM_COMPUTE_ERROR_ON(!try_add_operator(op, is_output)); const auto src_tensor_ids = get_tensor_ids(op.tensors().get_const_src_tensors()); const auto dst_tensor_ids = get_tensor_ids(op.tensors().get_const_dst_tensors()); - _graph.add_operator_as_linear(op.id(), src_tensor_ids, dst_tensor_ids); + _graph.add_operator_as_linear(op.id(), src_tensor_ids, dst_tensor_ids, is_output); _operators[op.id()] = op; } Operator GpuOperatorGroup::new_operator(const GpuOperatorType &operator_type, const ArgumentPack &tensors) const diff --git a/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h index 35abe6c543..308a9d796a 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h +++ b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h @@ -77,17 +77,19 @@ public: static constexpr size_t max_fused_operators = 32; /** Try adding (without actually adding) an operator to the group * - * @param[in] op Operator to be added + * @param[in] op Operator to be added + * @param[in] is_output Whether this operator is the output operator. 
* * @return true If @p op can be added while maintaining the invariants * @return false Otherwise */ - bool try_add_operator(const Operator &op) const; + bool try_add_operator(const Operator &op, bool is_output = false) const; /** Add an operator to the group * - * @param[in] op Operator to be added + * @param[in] op Operator to be added + * @param[in] is_output Whether this operator is the output operator. */ - void add_operator(const Operator &op); + void add_operator(const Operator &op, bool is_output = false); /** Create a new operator * * @param[in] operator_type @ref GpuOperatorType of the new operator diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp index 017536df6c..60c2281433 100644 --- a/src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp @@ -81,7 +81,7 @@ Status GpuOutput::validate_op(const GpuWorkloadSketch &sketch, const auto group = sketch.implementation().operator_group(); const auto op = group.new_operator(operator_type, tensors); - const auto success = group.try_add_operator(op); + const auto success = group.try_add_operator(op, true); ARM_COMPUTE_RETURN_ERROR_ON_MSG(!success, "This operator cannot be fused into the workload."); ARM_COMPUTE_UNUSED(success); @@ -133,7 +133,7 @@ void GpuOutput::create_op(GpuWorkloadSketch &sketch, tensors.add_const_tensor(ACL_DST_0, dst); const Operator op = sketch.implementation().operator_group().new_operator(operator_type, tensors); - sketch.implementation().operator_group().add_operator(op); + sketch.implementation().operator_group().add_operator(op, true); } } // namespace dynamic_fusion diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp index c3128ea552..8adf056912 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp @@ -125,7 +125,7 @@ TagLUT ClTemplateActivation::get_tag_lut(const GpuKernelVariableTable &vtable, c lut["src"] = vtable.get_variable(_src); lut["dst"] = vtable.get_variable(_dst); - const auto dst_argument = vtable.get_variable(comp_group.get_dst_tensors()[0]); + const auto dst_argument = vtable.get_variable(comp_group.get_any_dst_tensor()); lut["arg_dst"] = dst_argument.uniq_name; // Local build options diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp index 1ac49406a8..6ab3a68bb0 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp @@ -137,7 +137,7 @@ TagLUT ClTemplateCast::get_tag_lut(const GpuKernelVariableTable &vtable, const C lut["src"] = vtable.get_variable(_src); lut["dst"] = vtable.get_variable(_dst); - const auto dst_argument = vtable.get_variable(comp_group.get_dst_tensors()[0]); + const auto dst_argument = vtable.get_variable(comp_group.get_any_dst_tensor()); lut["arg_dst"] = dst_argument.uniq_name; // Local build options diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp index 389bd5c65f..6fa77aafe3 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp +++ 
b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp @@ -251,7 +251,7 @@ TagLUT ClTemplateDepthwiseConv2d::get_tag_lut(const GpuKernelVariableTable &vtab } lut["dst"] = vtable.get_variable(_dst); - const auto dst_argument = vtable.get_variable(comp_group.get_dst_tensors()[0]); + const auto dst_argument = vtable.get_variable(comp_group.get_any_dst_tensor()); lut["arg_dst"] = dst_argument.uniq_name; // Local build options diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp index aa324ffb54..221addb7b5 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp @@ -268,7 +268,7 @@ TagLUT ClTemplateDirectConv2d::get_tag_lut(const GpuKernelVariableTable &vtable, } lut["dst"] = vtable.get_variable(_dst); - const auto dst_argument = vtable.get_variable(comp_group.get_dst_tensors()[0]); + const auto dst_argument = vtable.get_variable(comp_group.get_any_dst_tensor()); lut["arg_dst"] = dst_argument.uniq_name; // Local build options diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp index 6c1e0fb1de..39cec6e31c 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp @@ -194,7 +194,7 @@ TagLUT ClTemplateElementwiseBinary::get_tag_lut(const GpuKernelVariableTable &vt lut["lhs"] = vtable.get_variable(_lhs); lut["rhs"] = vtable.get_variable(_rhs); lut["dst"] = vtable.get_variable(_dst); - lut["out"] = vtable.get_variable(comp_group.get_dst_tensors().front()); + lut["out"] = vtable.get_variable(comp_group.get_any_dst_tensor()); } else { diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp index e4b662b3a8..ef4f2f22a1 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp @@ -84,9 +84,9 @@ TagLUT ClTemplateStore::get_tag_lut(const GpuKernelVariableTable &vtable, const // Local build options lut["meta_kernel_id"] = id(); lut["DST_TENSOR_TYPE"] = "BUFFER"; - const auto dst_info = comp_group.get_dst_tensors()[0]; - lut["DST_DATA_TYPE"] = dst_info->data_type(); + lut["DST_DATA_TYPE"] = _dst->data_type(); + ARM_COMPUTE_UNUSED(comp_group); return lut; } diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp index 0afd0e7581..eed481f109 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp @@ -203,9 +203,7 @@ std::string ClTemplateWriter::write_code() } std::string ClTemplateWriter::write_global_section() const { - const auto dst_tensors = _components.get_dst_tensors(); - ARM_COMPUTE_ERROR_ON_MSG(dst_tensors.size() != 1, "Only one destination tensor per kernel is allowed"); - const auto dst_info = dst_tensors[0]; + const auto dst_info = _components.get_any_dst_tensor(); const auto dst_w = dst_info->dimension(0); const auto tile_w = std::max(1, get_window().x().step()); const auto tile_h = std::max(1, 
get_window().y().step()); diff --git a/src/dynamic_fusion/sketch/utils/DependencyGraph.h b/src/dynamic_fusion/sketch/utils/DependencyGraph.h index 633c5e4263..c891e76d8b 100644 --- a/src/dynamic_fusion/sketch/utils/DependencyGraph.h +++ b/src/dynamic_fusion/sketch/utils/DependencyGraph.h @@ -25,9 +25,7 @@ #define SRC_DYNAMIC_FUSION_SKETCH_UTILS_DEPENDENCYGRAPH #include "arm_compute/core/Error.h" -#include #include -#include #include #include #include @@ -84,40 +82,84 @@ public: friend std::ostream &operator<<(std::ostream &os, const DependencyGraph &); /** Try adding an operator (without actually adding it), while keeping the graph as a "linear sequence" / list - * @note The list is expected to only grow from head to tail + * + * Rule: If the new operator is not the first operator, at least one input tensor must be + * the output tensor of the last non-output operator. All other input tensors must be + * the global input of the graph (i.e. not the output of any operator). + * + * Rule: The output tensor of the new operator must not be the input tensor of any previously + * added operator. * * PRECONDITION: The current graph is already linear * * @return true If the operator can be added while keeping the graph as a linear sequence * @return false Otherwise */ - bool try_add_operator_as_linear(OperatorId op, const std::vector &inputs, const std::vector &outputs) const + bool try_add_operator_as_linear(OperatorId op, const std::vector &inputs, const std::vector &outputs, bool is_output = false) const { - ARM_COMPUTE_UNUSED(op, outputs); + ARM_COMPUTE_UNUSED(op, is_output); if(all_ops().empty()) { return true; } - std::vector common_tensors{}; - auto existing_tensors = all_tensors(); - std::sort(existing_tensors.begin(), existing_tensors.end()); // To use std::set_intersection, both input sets must be sorted - std::vector sorted_inputs{ inputs }; - std::sort(sorted_inputs.begin(), sorted_inputs.end()); - std::set_intersection(existing_tensors.begin(), existing_tensors.end(), - sorted_inputs.begin(), sorted_inputs.end(), std::back_inserter(common_tensors)); - if(common_tensors.size() != 1U) + + // If the new operator is not the first operator, at least one input tensor must be + // the output tensor of the last non-output operator. All other input tensors must be + // the global input of the graph (i.e. not the output of any operator). + if(_last_op_available) { - return false; + auto use_input_from_last_op = false; + + for(auto src_tensor : inputs) + { + const auto src_ops = _adj_src_ops.find(src_tensor); + + if(src_ops != _adj_src_ops.end()) + { + ARM_COMPUTE_ERROR_ON(src_ops->second.size() > 1); + + if(!src_ops->second.empty()) + { + const auto src_op = src_ops->second[0]; + + if(src_op == _last_op) + { + if(use_input_from_last_op) + { + // To be safe, we also forbid using the output tensor + // of the last operator twice. + return false; + } + + use_input_from_last_op = true; + } + else + { + // The input tensor of this operator must not be the output tensor + // of any other operator except the last non-output operator. + return false; + } + } + } + } + + if(!use_input_from_last_op) + { + // At least one input tensor must be the output tensor of the last non-output operator. 
+ return false; + } } - const auto linked_tensor = common_tensors[0]; - const auto tail_ops = get_dst_ops(); - ARM_COMPUTE_ERROR_ON(tail_ops.size() != 1U); // PRECONDITION - const auto tail = tail_ops[0]; - if(!is_in(linked_tensor, dst_tensors(tail))) + // The output tensor of the new operator must not be the input tensor of any previously + // added operator. + for(auto dst_tensor : outputs) { - return false; + if(_adj_dst_ops.find(dst_tensor) != _adj_dst_ops.end()) + { + return false; + } } + return true; } /** Add an operator, while keeping the graph as a "linear sequence" @@ -126,10 +168,9 @@ public: * INVARIANT: The list can only grow from head to tail * INVARIANT: POSTCONDITION: The graph is linear */ - void add_operator_as_linear(OperatorId op, const std::vector &inputs, const std::vector &outputs) + void add_operator_as_linear(OperatorId op, const std::vector &inputs, const std::vector &outputs, bool is_output = false) { - ARM_COMPUTE_ERROR_ON(!try_add_operator_as_linear(op, inputs, outputs)); - auto success = add_operator(op, inputs, outputs); + const auto success = add_operator(op, inputs, outputs, is_output); ARM_COMPUTE_UNUSED(success); ARM_COMPUTE_ERROR_ON(!success); } @@ -137,11 +178,12 @@ public: * Return invalid if it violates the DAG invariant * Invalid operation will not change the graph * - * @param[in] op Operator to add - * @param[in] inputs Input tensors to the operator - * @param[in] outputs Output tensors to the operator + * @param[in] op Operator to add + * @param[in] inputs Input tensors to the operator + * @param[in] outputs Output tensors to the operator + * @param[in] is_output Whether this is an output operator */ - bool add_operator(OperatorId op, const std::vector &inputs, const std::vector &outputs) + bool add_operator(OperatorId op, const std::vector &inputs, const std::vector &outputs, bool is_output = false) { if(operator_exists(op)) { @@ -169,182 +211,41 @@ public: } } - return true; - } - - /** Sort the graph in a topological order - * - * @return std::vector - */ - std::vector topological_sort() const - { - // Incident degree (number of source operators to an op) - std::map in_degree{}; - std::set visited_ops{}; - std::deque zero_in_degree_ops{}; - std::vector sorted_op_packs{}; - for(auto op : all_ops()) - { - const auto degree = src_ops(op).size(); - in_degree[op] = degree; - if(degree == 0) - { - zero_in_degree_ops.push_back(op); - visited_ops.insert(op); - } - } - - while(!zero_in_degree_ops.empty()) + if(!is_output) { - const OperatorId op = zero_in_degree_ops.front(); - zero_in_degree_ops.pop_front(); - sorted_op_packs.push_back(OpPack{ op, src_tensors(op), dst_tensors(op) }); - - for(const auto next_op : dst_ops(op)) - { - if(in_degree[next_op] > 0) - { - in_degree[next_op]--; - } - if(in_degree[next_op] == 0 && visited_ops.find(next_op) == visited_ops.end()) - { - zero_in_degree_ops.push_back(next_op); - visited_ops.insert(op); - } - } + _last_op_available = true; + _last_op = op; } - return sorted_op_packs; + return true; } - void find_independent_paths_util(OperatorId op, std::vector> &paths, std::vector cur_path, - const std::map &in_degree) const - { - // We have found an unresolved dependency - if(in_degree.at(op) > 1) - { - paths.push_back(cur_path); - return; - } - const auto child_ops = dst_ops(op); - - cur_path.push_back(op); - // Hit the leaf op - if(child_ops.empty()) - { - paths.push_back(cur_path); - return; - } - for(const auto child_op : child_ops) - { - find_independent_paths_util(child_op, paths, cur_path, in_degree); - } - } 
- /** Find all independent linear paths from op, which doesn't depend on any other op + /** Build a sequence of operators from the acyclic graph of operators. * - * @return std::vector + * The graph will be visited in depth-first strategy. The operator can only be added to + * the sequence when all operators that supply the input tensors have been added. Otherwise, + * the operator will be ignored and later visited again. In other words, the dependency between + * operators will be preserved in the sequence. */ - std::vector> find_independent_paths(OperatorId op, - const std::map &in_degree) const + std::vector build_operators_sequence() const { - std::vector> paths; - std::vector cur_path; - find_independent_paths_util(op, paths, cur_path, in_degree); - return paths; - } - /** Find a longest linear path from op, which doesn't depend on any other op - * - * @return std::vector - */ - std::vector find_longest_independent_path(OperatorId op, - const std::map &in_degree) const - { - const auto &paths = find_independent_paths(op, in_degree); - ARM_COMPUTE_ERROR_ON(paths.empty()); - size_t max_len = 0; - const std::vector *max_path = nullptr; + std::vector ops_seq; + std::set done_ops; + std::set done_tensors; - for(const auto &path : paths) - { - if(path.size() >= max_len) - { - max_path = &path; - max_len = path.size(); - } - } - return *max_path; - } - std::vector propose_next_path(std::set &candidate_ops, - const std::map &in_degree) const - { - if(candidate_ops.empty()) - { - return {}; - } - size_t max_len = 0; - std::vector max_path; - OperatorId chosen_op{}; - for(auto op : candidate_ops) - { - const auto path = find_longest_independent_path(op, in_degree); - if(path.size() >= max_len) - { - chosen_op = op; - max_path = path; - max_len = path.size(); - } - } - candidate_ops.erase(chosen_op); - return max_path; - } - /** Partition the graph into a list of linear sub-"graphs", while preserving the topological order, and trying to minimize - * the number of partitions - */ - std::vector> topological_partition() const - { - // Initialize zero incident degree and zero in degree ops - std::map in_degree{}; - std::set candidate_ops{}; - for(auto op : all_ops()) - { - const auto degree = src_ops(op).size(); - in_degree[op] = degree; - if(degree == 0) - { - candidate_ops.insert(op); - } - } + const auto input_tensors = global_src_tensors(); - std::vector> sorted_partitions{}; - while(!candidate_ops.empty()) + for(auto tensor : input_tensors) { - // generate_longest_path_from_zero_indegree_ops(in_degree, visited_ops, candidate_ops) - const auto path = propose_next_path(candidate_ops, in_degree); + done_tensors.insert(tensor); - // Append to sorted_partitions - std::vector path_op_pack{}; - for(auto op : path) + for(auto op : _adj_dst_ops.at(tensor)) { - path_op_pack.push_back(OpPack{ op, src_tensors(op), dst_tensors(op) }); - } - sorted_partitions.push_back(path_op_pack); - // Remove whole path (Update in_degree, visited_ops, candidate_ops) - for(auto op : path) - { - for(const auto next_op_child : dst_ops(op)) - { - if(in_degree[next_op_child] > 0) - { - in_degree[next_op_child]--; - } - if(in_degree[next_op_child] == 0 && !is_in(next_op_child, path)) // We do not want to put the proposed path back into candidates - { - candidate_ops.insert(next_op_child); - } - } + build_operators_sequence_from_op(op, ops_seq, done_ops, done_tensors); } } - return sorted_partitions; + + return ops_seq; } /** Strict equality comparison (all internal ids and order of insertion matter). 
@@ -673,11 +574,74 @@ private: return false; } + void build_operators_sequence_from_op( + Id op, + std::vector &ops_seq, + std::set &done_ops, + std::set &done_tensors) const + { + while(true) + { + // If the operator has been added to the sequence, ignore it. + if(done_ops.find(op) != done_ops.end()) + { + return; + } + + // If not all the input tensors of the operator are available, this operator cannot be + // added to the sequence for now. It will be visited again after the source operator + // is added to the sequence. + const auto src_tensors = _adj_src_tensors.at(op); + + for(auto src : src_tensors) + { + if(done_tensors.find(src) == done_tensors.end()) + { + return; + } + } + + // This operator is ready to be added to the sequence. + const auto dst_tensors = _adj_dst_tensors.at(op); + + done_ops.insert(op); + + OpPack pack{ op, src_tensors, dst_tensors }; + ops_seq.push_back(pack); + + done_tensors.insert(dst_tensors.begin(), dst_tensors.end()); + + // Visit all the sink operators. + // Call this function recursively unless there is only one sink. + if(dst_tensors.size() == 1 && _adj_dst_ops.at(dst_tensors[0]).size() == 1) + { + op = _adj_dst_ops.at(dst_tensors[0])[0]; + } + else + { + for(auto dst_tensor : dst_tensors) + { + const auto dst_ops = _adj_dst_ops.at(dst_tensor); + + for(auto dst_op : dst_ops) + { + build_operators_sequence_from_op(dst_op, ops_seq, done_ops, done_tensors); + } + } + + return; + } + } + } + private: AdjList _adj_src_tensors{}; AdjList _adj_dst_tensors{}; AdjList _adj_src_ops{}; AdjList _adj_dst_ops{}; + + bool _last_op_available{ false }; + OperatorId _last_op{ 0 }; }; } // namespace dynamic_fusion diff --git a/tests/validation/dynamic_fusion/gpu/Integration.cpp b/tests/validation/dynamic_fusion/gpu/Integration.cpp index 58d2215e64..a5716ce1e1 100644 --- a/tests/validation/dynamic_fusion/gpu/Integration.cpp +++ b/tests/validation/dynamic_fusion/gpu/Integration.cpp @@ -28,6 +28,7 @@ #include "arm_compute/dynamic_fusion/sketch/OperatorAttributes.h" #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h" #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h" +#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h" #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h" #include "tests/CL/CLAccessor.h" @@ -36,6 +37,7 @@ #include "tests/validation/dynamic_fusion/Utils.h" #include "tests/validation/reference/ConvolutionLayer.h" #include "tests/validation/reference/Permute.h" +#include "tests/validation/reference/ElementwiseOperations.h" using namespace arm_compute::experimental::dynamic_fusion; using namespace arm_compute::test::validation::utils; @@ -137,6 +139,105 @@ TEST_CASE(Conv2d, framework::DatasetMode::ALL) RelativeTolerance tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */ validate(CLAccessor(t_dst), ref_t_dst_nchw, tolerance_f32); } +TEST_CASE(Add_Output_Add_Output, framework::DatasetMode::ALL) +{ + /* Computation: + * out_0 = in_0 + in_1 + * out_1 = out_0 + in_2 + */ + CLScheduler::get().default_reinit(); + + const auto data_type = DataType::F32; + const auto t_input_shape = TensorShape(8, 2, 1); + + // Create a new workload sketch + auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context(); + auto gpu_ctx = GpuWorkloadContext{ &cl_compile_ctx }; + GpuWorkloadSketch sketch{ &gpu_ctx }; + + auto in_0_info = sketch.create_tensor_info(t_input_shape, 1, data_type); + auto in_1_info = 
sketch.create_tensor_info(t_input_shape, 1, data_type); + auto in_2_info = sketch.create_tensor_info(t_input_shape, 1, data_type); + + auto out_0_info = sketch.create_tensor_info(); + auto out_1_info = sketch.create_tensor_info(); + + auto ans_0_info = sketch.create_tensor_info(); + auto ans_1_info = sketch.create_tensor_info(); + + GpuAdd::create_op(sketch, &in_0_info, &in_1_info, &ans_0_info); + GpuOutput::create_op(sketch, &ans_0_info, &out_0_info); + GpuAdd::create_op(sketch, &ans_0_info, &in_2_info, &ans_1_info); + GpuOutput::create_op(sketch, &ans_1_info, &out_1_info); + + // Configure runtime + ClWorkloadRuntime runtime; + runtime.configure(sketch); + + // (Important) Allocate auxiliary tensor memory if there are any + // Instead of using ACL allocated memory, the user can choose to import memory into the tensors + for(auto &data : runtime.get_auxiliary_tensors()) + { + CLTensor *tensor = data.first; + AuxMemoryInfo aux_mem_req = data.second; + tensor->allocator()->init(*data.first->info(), aux_mem_req.alignment); + tensor->allocator()->allocate(); // Use ACL allocated memory + // auto buf = cl::Buffer(); + // tensor->allocator()->import_memory(buf); // Or, import external memory + } + + // Construct user tensors + CLTensor t_in_0{}; + CLTensor t_in_1{}; + CLTensor t_in_2{}; + + CLTensor t_out_0{}; + CLTensor t_out_1{}; + + // Initialize user tensors + t_in_0.allocator()->init(in_0_info); + t_in_1.allocator()->init(in_1_info); + t_in_2.allocator()->init(in_2_info); + + t_out_0.allocator()->init(out_0_info); + t_out_1.allocator()->init(out_1_info); + + // Allocate and fill user tensors + // Instead of using ACL allocator, the user can choose to import memory into the tensors + t_in_0.allocator()->allocate(); + t_in_1.allocator()->allocate(); + t_in_2.allocator()->allocate(); + + t_out_0.allocator()->allocate(); + t_out_1.allocator()->allocate(); + + fill(CLAccessor(t_in_0), 0, library.get()); + fill(CLAccessor(t_in_1), 1, library.get()); + fill(CLAccessor(t_in_2), 2, library.get()); + + // Run runtime + runtime.run({ &t_in_0, &t_in_1, &t_in_2, &t_out_0, &t_out_1 }); + + // Create reference + SimpleTensor ref_t_in_0{ t_input_shape, data_type, 1, QuantizationInfo() }; + SimpleTensor ref_t_in_1{ t_input_shape, data_type, 1, QuantizationInfo() }; + SimpleTensor ref_t_in_2{ t_input_shape, data_type, 1, QuantizationInfo() }; + + SimpleTensor ref_t_out_0{ t_input_shape, data_type, 1, QuantizationInfo() }; + SimpleTensor ref_t_out_1{ t_input_shape, data_type, 1, QuantizationInfo() }; + + // Fill reference + fill(ref_t_in_0, 0, library.get()); + fill(ref_t_in_1, 1, library.get()); + fill(ref_t_in_2, 2, library.get()); + + reference::arithmetic_operation(ArithmeticOperation::ADD, ref_t_in_0, ref_t_in_1, ref_t_out_0, ConvertPolicy::WRAP); + reference::arithmetic_operation(ArithmeticOperation::ADD, ref_t_out_0, ref_t_in_2, ref_t_out_1, ConvertPolicy::WRAP); + + RelativeTolerance tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */ + validate(CLAccessor(t_out_0), ref_t_out_0, tolerance_f32); + validate(CLAccessor(t_out_1), ref_t_out_1, tolerance_f32); +} TEST_SUITE(Invalid_Fusion_Should_Fail) TEST_CASE(Multiple_Complex_Ops_0, framework::DatasetMode::ALL) { -- cgit v1.2.1
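
The scheduling rule documented on DependencyGraph::build_operators_sequence() in the patch can be illustrated outside the library. The C++ sketch below is a simplified stand-in, not the library's implementation: SimpleOp, visit() and the integer tensor/operator ids are all hypothetical, and the traversal is kept deliberately naive. It reproduces the idea that an operator is only appended to the sequence once every tensor it reads has been produced, and is otherwise skipped and revisited later through another producer; the example graph mirrors the new Add_Output_Add_Output test case.

#include <iostream>
#include <map>
#include <set>
#include <vector>

struct SimpleOp
{
    int              id;
    std::vector<int> srcs; // tensor ids read by this operator
    std::vector<int> dsts; // tensor ids written by this operator
};

// An operator is appended only once every tensor it reads is available; otherwise it
// returns early and is revisited later through another producer.
void visit(const SimpleOp                                      &op,
           const std::map<int, std::vector<const SimpleOp *>> &consumers, // tensor id -> ops reading it
           std::set<int>                                       &done_ops,
           std::set<int>                                       &done_tensors,
           std::vector<int>                                    &sequence)
{
    if(done_ops.count(op.id) != 0)
    {
        return;
    }
    for(int src : op.srcs)
    {
        if(done_tensors.count(src) == 0)
        {
            return; // not all inputs are ready yet
        }
    }
    done_ops.insert(op.id);
    sequence.push_back(op.id);
    done_tensors.insert(op.dsts.begin(), op.dsts.end());
    for(int dst : op.dsts)
    {
        const auto it = consumers.find(dst);
        if(it == consumers.end())
        {
            continue;
        }
        for(const SimpleOp *next : it->second)
        {
            visit(*next, consumers, done_ops, done_tensors, sequence);
        }
    }
}

int main()
{
    // Same shape as the new integration test: out_0 = in_0 + in_1; out_1 = out_0 + in_2
    const SimpleOp add_0{ 0, { 0, 1 }, { 2 } };
    const SimpleOp out_0{ 1, { 2 }, { 3 } };
    const SimpleOp add_1{ 2, { 2, 4 }, { 5 } };
    const SimpleOp out_1{ 3, { 5 }, { 6 } };

    const std::map<int, std::vector<const SimpleOp *>> consumers{
        { 0, { &add_0 } }, { 1, { &add_0 } }, { 2, { &out_0, &add_1 } }, { 4, { &add_1 } }, { 5, { &out_1 } }
    };

    std::set<int>    done_ops{};
    std::set<int>    done_tensors{};
    std::vector<int> sequence{};

    for(int input : { 0, 1, 4 }) // global input tensors of the graph
    {
        done_tensors.insert(input);
        for(const SimpleOp *op : consumers.at(input))
        {
            visit(*op, consumers, done_ops, done_tensors, sequence);
        }
    }

    for(int id : sequence)
    {
        std::cout << id << ' '; // prints: 0 1 2 3
    }
    std::cout << std::endl;
    return 0;
}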
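
The input/output/intermediate classification performed by the new GpuKernelComponentGroup::finalize() follows one rule: a tensor written by one component and later read by another becomes intermediate, everything else read is a group input and everything else written is a group output, with the argument tensors being the union of inputs and outputs. Below is a minimal sketch of that rule using hypothetical Comp and TensorSets types and plain integer ids in place of ITensorInfo; the example again mirrors the Add_Output_Add_Output workload.

#include <cassert>
#include <set>
#include <vector>

struct Comp
{
    std::vector<int> srcs; // tensor ids read by the component
    std::vector<int> dsts; // tensor ids written by the component
};

struct TensorSets
{
    std::set<int> inputs;        // read by the group but produced outside of it
    std::set<int> outputs;       // produced by the group and visible outside of it
    std::set<int> intermediates; // produced and consumed inside the group
};

TensorSets classify(const std::vector<Comp> &components)
{
    TensorSets sets{};
    for(const Comp &component : components)
    {
        for(int src : component.srcs)
        {
            if(sets.outputs.erase(src) != 0)
            {
                // Produced by an earlier component and now consumed: becomes intermediate.
                sets.intermediates.insert(src);
            }
            else if(sets.intermediates.count(src) == 0)
            {
                sets.inputs.insert(src);
            }
        }
        sets.outputs.insert(component.dsts.begin(), component.dsts.end());
    }
    return sets;
}

int main()
{
    // add(t0, t1) -> t2; output(t2) -> t3; add(t2, t4) -> t5; output(t5) -> t6
    const TensorSets sets = classify({ { { 0, 1 }, { 2 } },
                                       { { 2 }, { 3 } },
                                       { { 2, 4 }, { 5 } },
                                       { { 5 }, { 6 } } });

    assert((sets.inputs == std::set<int>{ 0, 1, 4 }));
    assert((sets.intermediates == std::set<int>{ 2, 5 }));
    assert((sets.outputs == std::set<int>{ 3, 6 }));
    return 0;
}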
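
The two rules spelled out in DependencyGraph::try_add_operator_as_linear() can also be expressed as a standalone predicate. The following is a rough sketch under simplified assumptions: can_add_as_linear, producer_of and consumers_of are hypothetical names, the maps only hold tensors that actually have a producer or a consumer, and the check is shown only for the case where a last non-output operator already exists. It is not the library's method, which works on its own adjacency lists.

#include <algorithm>
#include <map>
#include <vector>

// producer_of maps a tensor id to the operator id that writes it (no entry = global input of the graph).
// consumers_of maps a tensor id to the operators already in the graph that read it.
bool can_add_as_linear(const std::vector<int>                &new_op_srcs,
                       const std::vector<int>                &new_op_dsts,
                       const std::map<int, int>              &producer_of,
                       const std::map<int, std::vector<int>> &consumers_of,
                       int                                    last_non_output_op)
{
    bool uses_last_op_output = false;
    for(int src : new_op_srcs)
    {
        const auto producer = producer_of.find(src);
        if(producer == producer_of.end())
        {
            continue; // global input: always allowed
        }
        if(producer->second != last_non_output_op || uses_last_op_output)
        {
            return false; // may link only (and at most once) to the last non-output operator
        }
        uses_last_op_output = true;
    }
    if(!uses_last_op_output)
    {
        return false; // at least one input must come from the last non-output operator
    }
    // The new operator's outputs must not feed an operator that is already in the graph.
    return std::all_of(new_op_dsts.begin(), new_op_dsts.end(), [&consumers_of](int dst)
    {
        return consumers_of.find(dst) == consumers_of.end();
    });
}

int main()
{
    // Operator 0 (t0 -> t1) is already in the graph; try to add operator 1 (t1, t2 -> t3).
    const std::map<int, int>              producer_of{ { 1, 0 } };
    const std::map<int, std::vector<int>> consumers_of{ { 0, { 0 } } };
    return can_add_as_linear({ 1, 2 }, { 3 }, producer_of, consumers_of, 0) ? 0 : 1;
}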