diff options
Diffstat (limited to 'src/dynamic_fusion/sketch')
15 files changed, 245 insertions, 349 deletions
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp index 1f90aab477..669913ce30 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp +++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp @@ -93,40 +93,19 @@ GpuKernelComponentStream GpuKernelComponentGraph::fuse() const { // Obtain memory descriptor map const auto mem_map = assign_memory_descriptors(_tensors, _dependency_graph); - /// @note Fusion constraints (for kernel components) are exactly the same as the invariants of @ref GpuKernelComponentGroup - /// Fusion can be framed as a mathematical optimization problem: - /// Given fusion constraints, find the "best" fusion patterns possible - /// "Best" is ill-defined at the moment. For now we define "best" fusion pattern as one - /// which results in the least number of fused kernels ( @ref GpuKernelComponentGroup ) at the end - - /// As the first iteration, we offer a sub-optimal algorithm here which ensures all - /// constraints are met, but provides no guarantee that the fusion pattern is optimal GpuKernelComponentStream stream{ _services, mem_map }; - // Break down into linear groups of components (constraint 1), preserving topological order - const auto linear_graphs = _dependency_graph.topological_partition(); + const auto op_seq = _dependency_graph.build_operators_sequence(); - // Further divide up the linear groups based on rest of the fusion constraints (rely on component group's invariants) - for(const auto &graph : linear_graphs) + stream.new_component_group(); + for(auto op : op_seq) { - for(unsigned int i = 0; i < graph.size(); ++i) - { - const auto comp = _components.at(graph[i].op).get(); - // Each new linear graph signals a new component group in the stream - if(i == 0) - { - stream.new_component_group(); - } - // If it violates the component group's invariant / fusion constraint, breaks up the stream by inserting a new group - bool success = stream.add_component(comp); - if(!success) - { - stream.new_component_group(); - success = stream.add_component(comp); - ARM_COMPUTE_ERROR_ON(!success); - } - } + const auto component = _components.at(op.op).get(); + const auto success = stream.add_component(component); + ARM_COMPUTE_ERROR_ON(!success); + ARM_COMPUTE_UNUSED(success); } + return stream; } } // namespace dynamic_fusion diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp index 3af4c1429d..0d2574957f 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp +++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp @@ -27,6 +27,8 @@ #include "arm_compute/core/Validate.h" #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" +#include <algorithm> + namespace arm_compute { namespace experimental @@ -35,6 +37,9 @@ namespace dynamic_fusion { bool GpuKernelComponentGroup::add_component(ComponentPtr component) { + ARM_COMPUTE_ERROR_ON_MSG( + _finalized, "The component group has been finalized and cannot be altered."); + // note: Constraint 1 is guaranteed as a precondition // Constraint 2 if(component->type() != GpuComponentType::Output && _components.size() >= max_fused_components) @@ -51,11 +56,6 @@ bool GpuKernelComponentGroup::add_component(ComponentPtr component) { return false; } - // Constraint 3.3: Disallow multiple output components - if(!_components.empty() && get_last_component()->type() == GpuComponentType::Output && component->type() == GpuComponentType::Output) - { - return false; - } // Constraint 4 if(component->type() != GpuComponentType::Unfusable && component->tensors().get_const_dst_tensors().size() != 1U) { @@ -124,55 +124,68 @@ bool GpuKernelComponentGroup::add_component(ComponentPtr component) return true; } -std::vector<const ITensorInfo *> GpuKernelComponentGroup::get_src_tensors() const +void GpuKernelComponentGroup::finalize() { - if(_components.empty()) + if(_finalized) { - return {}; + return; } - auto src_tensors = _components[0]->tensors().get_const_src_tensors(); - auto prev_dst_tensor = _components[0]->tensors().get_const_dst_tensors()[0]; // PRE: Only one dst tensor per component - for(unsigned int i = 1; i < _components.size(); ++i) + + _finalized = true; + + std::set<const ITensorInfo *> input_tensors; + std::set<const ITensorInfo *> output_tensors; + + for(auto component : _components) { - auto cur_src_tensors = _components[i]->tensors().get_const_src_tensors(); - for(const auto src_tensor : cur_src_tensors) + const auto tensors = component->tensors(); + const auto src_tensors = tensors.get_const_src_tensors(); + const auto dst_tensors = tensors.get_const_dst_tensors(); + + // Detect input, output and intermediate tensors. + for(auto tensor : src_tensors) { - if(src_tensor->id() == prev_dst_tensor->id()) + const auto output_tensors_it = output_tensors.find(tensor); + + if(output_tensors_it != output_tensors.end()) { - continue; // Skip "intermediate" tensors. I.e. tensors that are used to link between two components + // This tensor is the output of another operator. + // It must be marked as intermediate tensor. + output_tensors.erase(output_tensors_it); + _interm_tensors.insert(tensor); + } + else if(_interm_tensors.find(tensor) == _interm_tensors.end()) + { + input_tensors.insert(tensor); } - src_tensors.push_back(src_tensor); } - prev_dst_tensor = _components[i]->tensors().get_const_dst_tensors()[0]; // PRE: Only one dst tensor per component + + for(auto tensor : dst_tensors) + { + ARM_COMPUTE_ERROR_ON(input_tensors.find(tensor) != input_tensors.end()); + ARM_COMPUTE_ERROR_ON(output_tensors.find(tensor) != output_tensors.end()); + ARM_COMPUTE_ERROR_ON(_interm_tensors.find(tensor) != _interm_tensors.end()); + output_tensors.insert(tensor); + } } - return src_tensors; + std::set_union( + input_tensors.begin(), input_tensors.end(), + output_tensors.begin(), output_tensors.end(), + std::back_inserter(_argument_tensors)); + _any_output_tensor = *output_tensors.begin(); } -std::vector<const ITensorInfo *> GpuKernelComponentGroup::get_dst_tensors() const +const ITensorInfo *GpuKernelComponentGroup::get_any_dst_tensor() const { - if(_components.empty()) - { - return {}; - } - const auto dst_tensor_ptrs = _components[_components.size() - 1]->tensors().get_const_dst_tensors(); - std::vector<const ITensorInfo *> dst_tensors; - for(auto tensor_ptr : dst_tensor_ptrs) - { - dst_tensors.push_back(tensor_ptr); - } - return dst_tensors; + ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized."); + return _any_output_tensor; } std::vector<const ITensorInfo *> GpuKernelComponentGroup::get_argument_tensors() const { - std::vector<const ITensorInfo *> arguments; - const auto src_tensors = get_src_tensors(); - const auto dst_tensors = get_dst_tensors(); - arguments.reserve(src_tensors.size() + dst_tensors.size()); - arguments.insert(arguments.end(), src_tensors.begin(), src_tensors.end()); - arguments.insert(arguments.end(), dst_tensors.begin(), dst_tensors.end()); - return arguments; + ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized."); + return _argument_tensors; } GpuKernelComponentGroup::ComponentPtr GpuKernelComponentGroup::get_root_component() const @@ -184,41 +197,10 @@ GpuKernelComponentGroup::ComponentPtr GpuKernelComponentGroup::get_root_componen return _components[0]; } -GpuKernelComponentGroup::ComponentPtr GpuKernelComponentGroup::get_last_component() const -{ - if(empty()) - { - return nullptr; - } - return _components[_components.size() - 1]; -} - -GpuKernelComponentGroup::ComponentPtr GpuKernelComponentGroup::get_previous_component(ComponentId id) const -{ - if(empty()) - { - return nullptr; - } - // Get the index of the requested component - size_t ind = 0; - for(const auto c : _components) - { - if(c->id() == id) - { - break; - } - ind++; - } - if(ind == 0 || ind >= _components.size()) - { - return nullptr; - } - return _components[ind - 1]; -} - bool GpuKernelComponentGroup::is_intermediate_tensor(const ITensorInfo *tensor) const { - return is_tensor_in(tensor, get_interm_tensors()); + ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized."); + return _interm_tensors.find(tensor) != _interm_tensors.end(); } size_t GpuKernelComponentGroup::size() const @@ -262,30 +244,6 @@ typename std::vector<GpuKernelComponentGroup::ComponentPtr>::const_iterator GpuK return _components.cend(); } -std::vector<const ITensorInfo *> GpuKernelComponentGroup::get_interm_tensors() const -{ - std::vector<const ITensorInfo *> interm_tensors{}; - for(unsigned int i = 0; i + 1 < _components.size(); ++i) - { - auto interm_tensor = _components[i]->tensors().get_const_dst_tensors()[0]; - interm_tensors.push_back(interm_tensor); // PRE: Only one dst tensor per component - } - - return interm_tensors; -} - -bool GpuKernelComponentGroup::is_tensor_in(const ITensorInfo *tensor, const std::vector<const ITensorInfo *> tensors) -{ - for(auto t : tensors) - { - if(tensor->id() == t->id()) - { - return true; - } - } - return false; -} - } // namespace dynamic_fusion } // namespace experimental } // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h index 4c9d940594..386aefdc05 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h +++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h @@ -29,6 +29,7 @@ #include <cstdint> #include <cstdlib> #include <vector> +#include <set> namespace arm_compute { @@ -88,25 +89,16 @@ public: * @return false If the operation fails */ bool add_component(ComponentPtr component); - /** Get source tensors of this group */ - std::vector<const ITensorInfo *> get_src_tensors() const; - /** Get destination tensors of this group */ - std::vector<const ITensorInfo *> get_dst_tensors() const; + /** Optimize and pre-compute information about the component group */ + void finalize(); + /** Get one of the destination tensors of this group */ + const ITensorInfo *get_any_dst_tensor() const; /** Get tensor argument of this group * A tensor is an argument if it is a source or destination tensor to the group */ std::vector<const ITensorInfo *> get_argument_tensors() const; /** Get the root (first) component of this group */ ComponentPtr get_root_component() const; - /** Get the last component of this group */ - ComponentPtr get_last_component() const; - /** Get the previous component to the component with id @p id - * - * @param[in] id Component id of the component whose previous component is of concern - * - * @return ComponentPtr Pointer to the previous component of the one identified by @p id - */ - ComponentPtr get_previous_component(ComponentId id) const; /** Check if a @ref ITensorInfo is an "intermediate" tensor of the group * * An intermediate tensor is any tensor that is not an argument. @@ -131,11 +123,12 @@ public: typename std::vector<ComponentPtr>::const_iterator cend() const; private: - std::vector<const ITensorInfo *> get_interm_tensors() const; - - static bool is_tensor_in(const ITensorInfo *tensor, const std::vector<const ITensorInfo *> tensors); - std::vector<ComponentPtr> _components{}; + + bool _finalized{ false }; + std::vector<const ITensorInfo *> _argument_tensors{}; + std::set<const ITensorInfo *> _interm_tensors{}; + const ITensorInfo *_any_output_tensor{ nullptr }; }; } // namespace dynamic_fusion } // namespace experimental diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp index aac84b6c59..8f4eadc477 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp +++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp @@ -44,6 +44,8 @@ GpuWorkloadSourceCode GpuKernelComponentStream::write_workload_code() // Traverse through component groups and assemble workload together for(auto && group : _component_groups) { + group.finalize(); + // Write kernel code GpuLogicalKernel logical_kernel(_services, group); const GpuKernelSourceCode kernel_code = logical_kernel.write_kernel_code(); diff --git a/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp index e8ef835405..7bb14c8698 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp +++ b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp @@ -68,12 +68,12 @@ ArgumentPack<ITensorInfo> Operator::tensors() const return _tensors; } -bool GpuOperatorGroup::try_add_operator(const Operator &op) const +bool GpuOperatorGroup::try_add_operator(const Operator &op, bool is_output) const { const auto src_tensor_ids = get_tensor_ids(op.tensors().get_const_src_tensors()); const auto dst_tensor_ids = get_tensor_ids(op.tensors().get_const_dst_tensors()); // Constraint 1 - if(!_graph.try_add_operator_as_linear(op.id(), src_tensor_ids, dst_tensor_ids)) + if(!_graph.try_add_operator_as_linear(op.id(), src_tensor_ids, dst_tensor_ids, is_output)) { return false; } @@ -143,12 +143,12 @@ bool GpuOperatorGroup::try_add_operator(const Operator &op) const } return true; } -void GpuOperatorGroup::add_operator(const Operator &op) +void GpuOperatorGroup::add_operator(const Operator &op, bool is_output) { - ARM_COMPUTE_ERROR_ON(!try_add_operator(op)); + ARM_COMPUTE_ERROR_ON(!try_add_operator(op, is_output)); const auto src_tensor_ids = get_tensor_ids(op.tensors().get_const_src_tensors()); const auto dst_tensor_ids = get_tensor_ids(op.tensors().get_const_dst_tensors()); - _graph.add_operator_as_linear(op.id(), src_tensor_ids, dst_tensor_ids); + _graph.add_operator_as_linear(op.id(), src_tensor_ids, dst_tensor_ids, is_output); _operators[op.id()] = op; } Operator GpuOperatorGroup::new_operator(const GpuOperatorType &operator_type, const ArgumentPack<ITensorInfo> &tensors) const diff --git a/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h index 35abe6c543..308a9d796a 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h +++ b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h @@ -77,17 +77,19 @@ public: static constexpr size_t max_fused_operators = 32; /** Try adding (without actually adding) an operator to the group * - * @param[in] op Operator to be added + * @param[in] op Operator to be added + * @param[in] is_output Whether this operator is the output operator. * * @return true If @p op can be added while maintaining the invariants * @return false Otherwise */ - bool try_add_operator(const Operator &op) const; + bool try_add_operator(const Operator &op, bool is_output = false) const; /** Add an operator to the group * - * @param[in] op Operator to be added + * @param[in] op Operator to be added + * @param[in] is_output Whether this operator is the output operator. */ - void add_operator(const Operator &op); + void add_operator(const Operator &op, bool is_output = false); /** Create a new operator * * @param[in] operator_type @ref GpuOperatorType of the new operator diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp index 017536df6c..60c2281433 100644 --- a/src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp @@ -81,7 +81,7 @@ Status GpuOutput::validate_op(const GpuWorkloadSketch &sketch, const auto group = sketch.implementation().operator_group(); const auto op = group.new_operator(operator_type, tensors); - const auto success = group.try_add_operator(op); + const auto success = group.try_add_operator(op, true); ARM_COMPUTE_RETURN_ERROR_ON_MSG(!success, "This operator cannot be fused into the workload."); ARM_COMPUTE_UNUSED(success); @@ -133,7 +133,7 @@ void GpuOutput::create_op(GpuWorkloadSketch &sketch, tensors.add_const_tensor(ACL_DST_0, dst); const Operator op = sketch.implementation().operator_group().new_operator(operator_type, tensors); - sketch.implementation().operator_group().add_operator(op); + sketch.implementation().operator_group().add_operator(op, true); } } // namespace dynamic_fusion diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp index c3128ea552..8adf056912 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp @@ -125,7 +125,7 @@ TagLUT ClTemplateActivation::get_tag_lut(const GpuKernelVariableTable &vtable, c lut["src"] = vtable.get_variable(_src); lut["dst"] = vtable.get_variable(_dst); - const auto dst_argument = vtable.get_variable(comp_group.get_dst_tensors()[0]); + const auto dst_argument = vtable.get_variable(comp_group.get_any_dst_tensor()); lut["arg_dst"] = dst_argument.uniq_name; // Local build options diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp index 1ac49406a8..6ab3a68bb0 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp @@ -137,7 +137,7 @@ TagLUT ClTemplateCast::get_tag_lut(const GpuKernelVariableTable &vtable, const C lut["src"] = vtable.get_variable(_src); lut["dst"] = vtable.get_variable(_dst); - const auto dst_argument = vtable.get_variable(comp_group.get_dst_tensors()[0]); + const auto dst_argument = vtable.get_variable(comp_group.get_any_dst_tensor()); lut["arg_dst"] = dst_argument.uniq_name; // Local build options diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp index 389bd5c65f..6fa77aafe3 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp @@ -251,7 +251,7 @@ TagLUT ClTemplateDepthwiseConv2d::get_tag_lut(const GpuKernelVariableTable &vtab } lut["dst"] = vtable.get_variable(_dst); - const auto dst_argument = vtable.get_variable(comp_group.get_dst_tensors()[0]); + const auto dst_argument = vtable.get_variable(comp_group.get_any_dst_tensor()); lut["arg_dst"] = dst_argument.uniq_name; // Local build options diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp index aa324ffb54..221addb7b5 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp @@ -268,7 +268,7 @@ TagLUT ClTemplateDirectConv2d::get_tag_lut(const GpuKernelVariableTable &vtable, } lut["dst"] = vtable.get_variable(_dst); - const auto dst_argument = vtable.get_variable(comp_group.get_dst_tensors()[0]); + const auto dst_argument = vtable.get_variable(comp_group.get_any_dst_tensor()); lut["arg_dst"] = dst_argument.uniq_name; // Local build options diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp index 6c1e0fb1de..39cec6e31c 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp @@ -194,7 +194,7 @@ TagLUT ClTemplateElementwiseBinary::get_tag_lut(const GpuKernelVariableTable &vt lut["lhs"] = vtable.get_variable(_lhs); lut["rhs"] = vtable.get_variable(_rhs); lut["dst"] = vtable.get_variable(_dst); - lut["out"] = vtable.get_variable(comp_group.get_dst_tensors().front()); + lut["out"] = vtable.get_variable(comp_group.get_any_dst_tensor()); } else { diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp index e4b662b3a8..ef4f2f22a1 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp @@ -84,9 +84,9 @@ TagLUT ClTemplateStore::get_tag_lut(const GpuKernelVariableTable &vtable, const // Local build options lut["meta_kernel_id"] = id(); lut["DST_TENSOR_TYPE"] = "BUFFER"; - const auto dst_info = comp_group.get_dst_tensors()[0]; - lut["DST_DATA_TYPE"] = dst_info->data_type(); + lut["DST_DATA_TYPE"] = _dst->data_type(); + ARM_COMPUTE_UNUSED(comp_group); return lut; } diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp index 0afd0e7581..eed481f109 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp @@ -203,9 +203,7 @@ std::string ClTemplateWriter::write_code() } std::string ClTemplateWriter::write_global_section() const { - const auto dst_tensors = _components.get_dst_tensors(); - ARM_COMPUTE_ERROR_ON_MSG(dst_tensors.size() != 1, "Only one destination tensor per kernel is allowed"); - const auto dst_info = dst_tensors[0]; + const auto dst_info = _components.get_any_dst_tensor(); const auto dst_w = dst_info->dimension(0); const auto tile_w = std::max(1, get_window().x().step()); const auto tile_h = std::max(1, get_window().y().step()); diff --git a/src/dynamic_fusion/sketch/utils/DependencyGraph.h b/src/dynamic_fusion/sketch/utils/DependencyGraph.h index 633c5e4263..c891e76d8b 100644 --- a/src/dynamic_fusion/sketch/utils/DependencyGraph.h +++ b/src/dynamic_fusion/sketch/utils/DependencyGraph.h @@ -25,9 +25,7 @@ #define SRC_DYNAMIC_FUSION_SKETCH_UTILS_DEPENDENCYGRAPH #include "arm_compute/core/Error.h" -#include <algorithm> #include <cstdint> -#include <deque> #include <map> #include <set> #include <tuple> @@ -84,40 +82,84 @@ public: friend std::ostream &operator<<(std::ostream &os, const DependencyGraph &); /** Try adding an operator (without actually adding it), while keeping the graph as a "linear sequence" / list - * @note The list is expected to only grow from head to tail + * + * Rule: If the new operator is not the first operator, at least one input tensor must be + * the output tensor of the last non-output operator. All other input tensors must be + * the global input of the graph (i.e. not the output of any operator). + * + * Rule: The output tensor of the new operator must not be the input tensor of any previously + * added operator. * * PRECONDITION: The current graph is already linear * * @return true If the operator can be added while keeping the graph as a linear sequence * @return false Otherwise */ - bool try_add_operator_as_linear(OperatorId op, const std::vector<TensorId> &inputs, const std::vector<TensorId> &outputs) const + bool try_add_operator_as_linear(OperatorId op, const std::vector<TensorId> &inputs, const std::vector<TensorId> &outputs, bool is_output = false) const { - ARM_COMPUTE_UNUSED(op, outputs); + ARM_COMPUTE_UNUSED(op, is_output); if(all_ops().empty()) { return true; } - std::vector<TensorId> common_tensors{}; - auto existing_tensors = all_tensors(); - std::sort(existing_tensors.begin(), existing_tensors.end()); // To use std::set_intersection, both input sets must be sorted - std::vector<TensorId> sorted_inputs{ inputs }; - std::sort(sorted_inputs.begin(), sorted_inputs.end()); - std::set_intersection(existing_tensors.begin(), existing_tensors.end(), - sorted_inputs.begin(), sorted_inputs.end(), std::back_inserter(common_tensors)); - if(common_tensors.size() != 1U) + + // If the new operator is not the first operator, at least one input tensor must be + // the output tensor of the last non-output operator. All other input tensors must be + // the global input of the graph (i.e. not the output of any operator). + if(_last_op_available) { - return false; + auto use_input_from_last_op = false; + + for(auto src_tensor : inputs) + { + const auto src_ops = _adj_src_ops.find(src_tensor); + + if(src_ops != _adj_src_ops.end()) + { + ARM_COMPUTE_ERROR_ON(src_ops->second.size() > 1); + + if(!src_ops->second.empty()) + { + const auto src_op = src_ops->second[0]; + + if(src_op == _last_op) + { + if(use_input_from_last_op) + { + // To be safe, we also forbid using the output tensor + // of the last operator twice. + return false; + } + + use_input_from_last_op = true; + } + else + { + // The input tensor of this operator must not be the output tensor + // of any other operator except the last non-output operator. + return false; + } + } + } + } + + if(!use_input_from_last_op) + { + // At least one input tensor must be the output tensor of the last non-output operator. + return false; + } } - const auto linked_tensor = common_tensors[0]; - const auto tail_ops = get_dst_ops(); - ARM_COMPUTE_ERROR_ON(tail_ops.size() != 1U); // PRECONDITION - const auto tail = tail_ops[0]; - if(!is_in(linked_tensor, dst_tensors(tail))) + // The output tensor of the new operator must not be the input tensor of any previously + // added operator. + for(auto dst_tensor : outputs) { - return false; + if(_adj_dst_ops.find(dst_tensor) != _adj_dst_ops.end()) + { + return false; + } } + return true; } /** Add an operator, while keeping the graph as a "linear sequence" @@ -126,10 +168,9 @@ public: * INVARIANT: The list can only grow from head to tail * INVARIANT: POSTCONDITION: The graph is linear */ - void add_operator_as_linear(OperatorId op, const std::vector<TensorId> &inputs, const std::vector<TensorId> &outputs) + void add_operator_as_linear(OperatorId op, const std::vector<TensorId> &inputs, const std::vector<TensorId> &outputs, bool is_output = false) { - ARM_COMPUTE_ERROR_ON(!try_add_operator_as_linear(op, inputs, outputs)); - auto success = add_operator(op, inputs, outputs); + const auto success = add_operator(op, inputs, outputs, is_output); ARM_COMPUTE_UNUSED(success); ARM_COMPUTE_ERROR_ON(!success); } @@ -137,11 +178,12 @@ public: * Return invalid if it violates the DAG invariant * Invalid operation will not change the graph * - * @param[in] op Operator to add - * @param[in] inputs Input tensors to the operator - * @param[in] outputs Output tensors to the operator + * @param[in] op Operator to add + * @param[in] inputs Input tensors to the operator + * @param[in] outputs Output tensors to the operator + * @param[in] is_output Whether this is an output operator */ - bool add_operator(OperatorId op, const std::vector<TensorId> &inputs, const std::vector<TensorId> &outputs) + bool add_operator(OperatorId op, const std::vector<TensorId> &inputs, const std::vector<TensorId> &outputs, bool is_output = false) { if(operator_exists(op)) { @@ -169,182 +211,41 @@ public: } } - return true; - } - - /** Sort the graph in a topological order - * - * @return std::vector<OpPack> - */ - std::vector<OpPack> topological_sort() const - { - // Incident degree (number of source operators to an op) - std::map<OperatorId, unsigned int> in_degree{}; - std::set<OperatorId> visited_ops{}; - std::deque<OperatorId> zero_in_degree_ops{}; - std::vector<OpPack> sorted_op_packs{}; - for(auto op : all_ops()) - { - const auto degree = src_ops(op).size(); - in_degree[op] = degree; - if(degree == 0) - { - zero_in_degree_ops.push_back(op); - visited_ops.insert(op); - } - } - - while(!zero_in_degree_ops.empty()) + if(!is_output) { - const OperatorId op = zero_in_degree_ops.front(); - zero_in_degree_ops.pop_front(); - sorted_op_packs.push_back(OpPack{ op, src_tensors(op), dst_tensors(op) }); - - for(const auto next_op : dst_ops(op)) - { - if(in_degree[next_op] > 0) - { - in_degree[next_op]--; - } - if(in_degree[next_op] == 0 && visited_ops.find(next_op) == visited_ops.end()) - { - zero_in_degree_ops.push_back(next_op); - visited_ops.insert(op); - } - } + _last_op_available = true; + _last_op = op; } - return sorted_op_packs; + return true; } - void find_independent_paths_util(OperatorId op, std::vector<std::vector<OperatorId>> &paths, std::vector<OperatorId> cur_path, - const std::map<OperatorId, unsigned int> &in_degree) const - { - // We have found an unresolved dependency - if(in_degree.at(op) > 1) - { - paths.push_back(cur_path); - return; - } - const auto child_ops = dst_ops(op); - - cur_path.push_back(op); - // Hit the leaf op - if(child_ops.empty()) - { - paths.push_back(cur_path); - return; - } - for(const auto child_op : child_ops) - { - find_independent_paths_util(child_op, paths, cur_path, in_degree); - } - } - /** Find all independent linear paths from op, which doesn't depend on any other op + /** Build a sequence of operators from the acyclic graph of operators. * - * @return std::vector<OpPack> + * The graph will be visited in depth-first strategy. The operator can only be added to + * the sequence when all operators that supply the input tensors have been added. Otherwise, + * the operator will be ignored and later visited again. In other words, the dependency between + * operators will be preserved in the sequence. */ - std::vector<std::vector<OperatorId>> find_independent_paths(OperatorId op, - const std::map<OperatorId, unsigned int> &in_degree) const + std::vector<OpPack> build_operators_sequence() const { - std::vector<std::vector<OperatorId>> paths; - std::vector<OperatorId> cur_path; - find_independent_paths_util(op, paths, cur_path, in_degree); - return paths; - } - /** Find a longest linear path from op, which doesn't depend on any other op - * - * @return std::vector<OpPack> - */ - std::vector<OperatorId> find_longest_independent_path(OperatorId op, - const std::map<OperatorId, unsigned int> &in_degree) const - { - const auto &paths = find_independent_paths(op, in_degree); - ARM_COMPUTE_ERROR_ON(paths.empty()); - size_t max_len = 0; - const std::vector<OperatorId> *max_path = nullptr; + std::vector<OpPack> ops_seq; + std::set<Id> done_ops; + std::set<Id> done_tensors; - for(const auto &path : paths) - { - if(path.size() >= max_len) - { - max_path = &path; - max_len = path.size(); - } - } - return *max_path; - } - std::vector<OperatorId> propose_next_path(std::set<OperatorId> &candidate_ops, - const std::map<OperatorId, unsigned int> &in_degree) const - { - if(candidate_ops.empty()) - { - return {}; - } - size_t max_len = 0; - std::vector<OperatorId> max_path; - OperatorId chosen_op{}; - for(auto op : candidate_ops) - { - const auto path = find_longest_independent_path(op, in_degree); - if(path.size() >= max_len) - { - chosen_op = op; - max_path = path; - max_len = path.size(); - } - } - candidate_ops.erase(chosen_op); - return max_path; - } - /** Partition the graph into a list of linear sub-"graphs", while preserving the topological order, and trying to minimize - * the number of partitions - */ - std::vector<std::vector<OpPack>> topological_partition() const - { - // Initialize zero incident degree and zero in degree ops - std::map<OperatorId, unsigned int> in_degree{}; - std::set<OperatorId> candidate_ops{}; - for(auto op : all_ops()) - { - const auto degree = src_ops(op).size(); - in_degree[op] = degree; - if(degree == 0) - { - candidate_ops.insert(op); - } - } + const auto input_tensors = global_src_tensors(); - std::vector<std::vector<OpPack>> sorted_partitions{}; - while(!candidate_ops.empty()) + for(auto tensor : input_tensors) { - // generate_longest_path_from_zero_indegree_ops(in_degree, visited_ops, candidate_ops) - const auto path = propose_next_path(candidate_ops, in_degree); + done_tensors.insert(tensor); - // Append to sorted_partitions - std::vector<OpPack> path_op_pack{}; - for(auto op : path) + for(auto op : _adj_dst_ops.at(tensor)) { - path_op_pack.push_back(OpPack{ op, src_tensors(op), dst_tensors(op) }); - } - sorted_partitions.push_back(path_op_pack); - // Remove whole path (Update in_degree, visited_ops, candidate_ops) - for(auto op : path) - { - for(const auto next_op_child : dst_ops(op)) - { - if(in_degree[next_op_child] > 0) - { - in_degree[next_op_child]--; - } - if(in_degree[next_op_child] == 0 && !is_in(next_op_child, path)) // We do not want to put the proposed path back into candidates - { - candidate_ops.insert(next_op_child); - } - } + build_operators_sequence_from_op(op, ops_seq, done_ops, done_tensors); } } - return sorted_partitions; + + return ops_seq; } /** Strict equality comparison (all internal ids and order of insertion matter). @@ -673,11 +574,74 @@ private: return false; } + void build_operators_sequence_from_op( + Id op, + std::vector<OpPack> &ops_seq, + std::set<Id> &done_ops, + std::set<Id> &done_tensors) const + { + while(true) + { + // If the operator has been added to the sequence, ignore it. + if(done_ops.find(op) != done_ops.end()) + { + return; + } + + // If not all the input tensors of the operator are available, this operator cannot be + // added to the sequence for now. It will be visited again after the source operator + // is added to the sequence. + const auto src_tensors = _adj_src_tensors.at(op); + + for(auto src : src_tensors) + { + if(done_tensors.find(src) == done_tensors.end()) + { + return; + } + } + + // This operator is ready to be added to the sequence. + const auto dst_tensors = _adj_dst_tensors.at(op); + + done_ops.insert(op); + + OpPack pack{ op, src_tensors, dst_tensors }; + ops_seq.push_back(pack); + + done_tensors.insert(dst_tensors.begin(), dst_tensors.end()); + + // Visit all the sink operators. + // Call this function recursively unless there is only one sink. + if(dst_tensors.size() == 1 && _adj_dst_ops.at(dst_tensors[0]).size() == 1) + { + op = _adj_dst_ops.at(dst_tensors[0])[0]; + } + else + { + for(auto dst_tensor : dst_tensors) + { + const auto dst_ops = _adj_dst_ops.at(dst_tensor); + + for(auto dst_op : dst_ops) + { + build_operators_sequence_from_op(dst_op, ops_seq, done_ops, done_tensors); + } + } + + return; + } + } + } + private: AdjList _adj_src_tensors{}; AdjList _adj_dst_tensors{}; AdjList _adj_src_ops{}; AdjList _adj_dst_ops{}; + + bool _last_op_available{ false }; + OperatorId _last_op{ 0 }; }; } // namespace dynamic_fusion |