diff options
Diffstat (limited to 'src/dynamic_fusion/sketch/gpu')
14 files changed, 93 insertions, 161 deletions
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp index 1f90aab477..669913ce30 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp +++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp @@ -93,40 +93,19 @@ GpuKernelComponentStream GpuKernelComponentGraph::fuse() const { // Obtain memory descriptor map const auto mem_map = assign_memory_descriptors(_tensors, _dependency_graph); - /// @note Fusion constraints (for kernel components) are exactly the same as the invariants of @ref GpuKernelComponentGroup - /// Fusion can be framed as a mathematical optimization problem: - /// Given fusion constraints, find the "best" fusion patterns possible - /// "Best" is ill-defined at the moment. For now we define "best" fusion pattern as one - /// which results in the least number of fused kernels ( @ref GpuKernelComponentGroup ) at the end - - /// As the first iteration, we offer a sub-optimal algorithm here which ensures all - /// constraints are met, but provides no guarantee that the fusion pattern is optimal GpuKernelComponentStream stream{ _services, mem_map }; - // Break down into linear groups of components (constraint 1), preserving topological order - const auto linear_graphs = _dependency_graph.topological_partition(); + const auto op_seq = _dependency_graph.build_operators_sequence(); - // Further divide up the linear groups based on rest of the fusion constraints (rely on component group's invariants) - for(const auto &graph : linear_graphs) + stream.new_component_group(); + for(auto op : op_seq) { - for(unsigned int i = 0; i < graph.size(); ++i) - { - const auto comp = _components.at(graph[i].op).get(); - // Each new linear graph signals a new component group in the stream - if(i == 0) - { - stream.new_component_group(); - } - // If it violates the component group's invariant / fusion constraint, breaks up the stream by inserting a new group - bool success = stream.add_component(comp); - if(!success) - { - stream.new_component_group(); - success = stream.add_component(comp); - ARM_COMPUTE_ERROR_ON(!success); - } - } + const auto component = _components.at(op.op).get(); + const auto success = stream.add_component(component); + ARM_COMPUTE_ERROR_ON(!success); + ARM_COMPUTE_UNUSED(success); } + return stream; } } // namespace dynamic_fusion diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp index 3af4c1429d..0d2574957f 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp +++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp @@ -27,6 +27,8 @@ #include "arm_compute/core/Validate.h" #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" +#include <algorithm> + namespace arm_compute { namespace experimental @@ -35,6 +37,9 @@ namespace dynamic_fusion { bool GpuKernelComponentGroup::add_component(ComponentPtr component) { + ARM_COMPUTE_ERROR_ON_MSG( + _finalized, "The component group has been finalized and cannot be altered."); + // note: Constraint 1 is guaranteed as a precondition // Constraint 2 if(component->type() != GpuComponentType::Output && _components.size() >= max_fused_components) @@ -51,11 +56,6 @@ bool GpuKernelComponentGroup::add_component(ComponentPtr component) { return false; } - // Constraint 3.3: Disallow multiple output components - if(!_components.empty() && get_last_component()->type() == GpuComponentType::Output && component->type() == GpuComponentType::Output) - { - return false; - } // Constraint 4 if(component->type() != GpuComponentType::Unfusable && component->tensors().get_const_dst_tensors().size() != 1U) { @@ -124,55 +124,68 @@ bool GpuKernelComponentGroup::add_component(ComponentPtr component) return true; } -std::vector<const ITensorInfo *> GpuKernelComponentGroup::get_src_tensors() const +void GpuKernelComponentGroup::finalize() { - if(_components.empty()) + if(_finalized) { - return {}; + return; } - auto src_tensors = _components[0]->tensors().get_const_src_tensors(); - auto prev_dst_tensor = _components[0]->tensors().get_const_dst_tensors()[0]; // PRE: Only one dst tensor per component - for(unsigned int i = 1; i < _components.size(); ++i) + + _finalized = true; + + std::set<const ITensorInfo *> input_tensors; + std::set<const ITensorInfo *> output_tensors; + + for(auto component : _components) { - auto cur_src_tensors = _components[i]->tensors().get_const_src_tensors(); - for(const auto src_tensor : cur_src_tensors) + const auto tensors = component->tensors(); + const auto src_tensors = tensors.get_const_src_tensors(); + const auto dst_tensors = tensors.get_const_dst_tensors(); + + // Detect input, output and intermediate tensors. + for(auto tensor : src_tensors) { - if(src_tensor->id() == prev_dst_tensor->id()) + const auto output_tensors_it = output_tensors.find(tensor); + + if(output_tensors_it != output_tensors.end()) { - continue; // Skip "intermediate" tensors. I.e. tensors that are used to link between two components + // This tensor is the output of another operator. + // It must be marked as intermediate tensor. + output_tensors.erase(output_tensors_it); + _interm_tensors.insert(tensor); + } + else if(_interm_tensors.find(tensor) == _interm_tensors.end()) + { + input_tensors.insert(tensor); } - src_tensors.push_back(src_tensor); } - prev_dst_tensor = _components[i]->tensors().get_const_dst_tensors()[0]; // PRE: Only one dst tensor per component + + for(auto tensor : dst_tensors) + { + ARM_COMPUTE_ERROR_ON(input_tensors.find(tensor) != input_tensors.end()); + ARM_COMPUTE_ERROR_ON(output_tensors.find(tensor) != output_tensors.end()); + ARM_COMPUTE_ERROR_ON(_interm_tensors.find(tensor) != _interm_tensors.end()); + output_tensors.insert(tensor); + } } - return src_tensors; + std::set_union( + input_tensors.begin(), input_tensors.end(), + output_tensors.begin(), output_tensors.end(), + std::back_inserter(_argument_tensors)); + _any_output_tensor = *output_tensors.begin(); } -std::vector<const ITensorInfo *> GpuKernelComponentGroup::get_dst_tensors() const +const ITensorInfo *GpuKernelComponentGroup::get_any_dst_tensor() const { - if(_components.empty()) - { - return {}; - } - const auto dst_tensor_ptrs = _components[_components.size() - 1]->tensors().get_const_dst_tensors(); - std::vector<const ITensorInfo *> dst_tensors; - for(auto tensor_ptr : dst_tensor_ptrs) - { - dst_tensors.push_back(tensor_ptr); - } - return dst_tensors; + ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized."); + return _any_output_tensor; } std::vector<const ITensorInfo *> GpuKernelComponentGroup::get_argument_tensors() const { - std::vector<const ITensorInfo *> arguments; - const auto src_tensors = get_src_tensors(); - const auto dst_tensors = get_dst_tensors(); - arguments.reserve(src_tensors.size() + dst_tensors.size()); - arguments.insert(arguments.end(), src_tensors.begin(), src_tensors.end()); - arguments.insert(arguments.end(), dst_tensors.begin(), dst_tensors.end()); - return arguments; + ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized."); + return _argument_tensors; } GpuKernelComponentGroup::ComponentPtr GpuKernelComponentGroup::get_root_component() const @@ -184,41 +197,10 @@ GpuKernelComponentGroup::ComponentPtr GpuKernelComponentGroup::get_root_componen return _components[0]; } -GpuKernelComponentGroup::ComponentPtr GpuKernelComponentGroup::get_last_component() const -{ - if(empty()) - { - return nullptr; - } - return _components[_components.size() - 1]; -} - -GpuKernelComponentGroup::ComponentPtr GpuKernelComponentGroup::get_previous_component(ComponentId id) const -{ - if(empty()) - { - return nullptr; - } - // Get the index of the requested component - size_t ind = 0; - for(const auto c : _components) - { - if(c->id() == id) - { - break; - } - ind++; - } - if(ind == 0 || ind >= _components.size()) - { - return nullptr; - } - return _components[ind - 1]; -} - bool GpuKernelComponentGroup::is_intermediate_tensor(const ITensorInfo *tensor) const { - return is_tensor_in(tensor, get_interm_tensors()); + ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized."); + return _interm_tensors.find(tensor) != _interm_tensors.end(); } size_t GpuKernelComponentGroup::size() const @@ -262,30 +244,6 @@ typename std::vector<GpuKernelComponentGroup::ComponentPtr>::const_iterator GpuK return _components.cend(); } -std::vector<const ITensorInfo *> GpuKernelComponentGroup::get_interm_tensors() const -{ - std::vector<const ITensorInfo *> interm_tensors{}; - for(unsigned int i = 0; i + 1 < _components.size(); ++i) - { - auto interm_tensor = _components[i]->tensors().get_const_dst_tensors()[0]; - interm_tensors.push_back(interm_tensor); // PRE: Only one dst tensor per component - } - - return interm_tensors; -} - -bool GpuKernelComponentGroup::is_tensor_in(const ITensorInfo *tensor, const std::vector<const ITensorInfo *> tensors) -{ - for(auto t : tensors) - { - if(tensor->id() == t->id()) - { - return true; - } - } - return false; -} - } // namespace dynamic_fusion } // namespace experimental } // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h index 4c9d940594..386aefdc05 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h +++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h @@ -29,6 +29,7 @@ #include <cstdint> #include <cstdlib> #include <vector> +#include <set> namespace arm_compute { @@ -88,25 +89,16 @@ public: * @return false If the operation fails */ bool add_component(ComponentPtr component); - /** Get source tensors of this group */ - std::vector<const ITensorInfo *> get_src_tensors() const; - /** Get destination tensors of this group */ - std::vector<const ITensorInfo *> get_dst_tensors() const; + /** Optimize and pre-compute information about the component group */ + void finalize(); + /** Get one of the destination tensors of this group */ + const ITensorInfo *get_any_dst_tensor() const; /** Get tensor argument of this group * A tensor is an argument if it is a source or destination tensor to the group */ std::vector<const ITensorInfo *> get_argument_tensors() const; /** Get the root (first) component of this group */ ComponentPtr get_root_component() const; - /** Get the last component of this group */ - ComponentPtr get_last_component() const; - /** Get the previous component to the component with id @p id - * - * @param[in] id Component id of the component whose previous component is of concern - * - * @return ComponentPtr Pointer to the previous component of the one identified by @p id - */ - ComponentPtr get_previous_component(ComponentId id) const; /** Check if a @ref ITensorInfo is an "intermediate" tensor of the group * * An intermediate tensor is any tensor that is not an argument. @@ -131,11 +123,12 @@ public: typename std::vector<ComponentPtr>::const_iterator cend() const; private: - std::vector<const ITensorInfo *> get_interm_tensors() const; - - static bool is_tensor_in(const ITensorInfo *tensor, const std::vector<const ITensorInfo *> tensors); - std::vector<ComponentPtr> _components{}; + + bool _finalized{ false }; + std::vector<const ITensorInfo *> _argument_tensors{}; + std::set<const ITensorInfo *> _interm_tensors{}; + const ITensorInfo *_any_output_tensor{ nullptr }; }; } // namespace dynamic_fusion } // namespace experimental diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp index aac84b6c59..8f4eadc477 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp +++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp @@ -44,6 +44,8 @@ GpuWorkloadSourceCode GpuKernelComponentStream::write_workload_code() // Traverse through component groups and assemble workload together for(auto && group : _component_groups) { + group.finalize(); + // Write kernel code GpuLogicalKernel logical_kernel(_services, group); const GpuKernelSourceCode kernel_code = logical_kernel.write_kernel_code(); diff --git a/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp index e8ef835405..7bb14c8698 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp +++ b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp @@ -68,12 +68,12 @@ ArgumentPack<ITensorInfo> Operator::tensors() const return _tensors; } -bool GpuOperatorGroup::try_add_operator(const Operator &op) const +bool GpuOperatorGroup::try_add_operator(const Operator &op, bool is_output) const { const auto src_tensor_ids = get_tensor_ids(op.tensors().get_const_src_tensors()); const auto dst_tensor_ids = get_tensor_ids(op.tensors().get_const_dst_tensors()); // Constraint 1 - if(!_graph.try_add_operator_as_linear(op.id(), src_tensor_ids, dst_tensor_ids)) + if(!_graph.try_add_operator_as_linear(op.id(), src_tensor_ids, dst_tensor_ids, is_output)) { return false; } @@ -143,12 +143,12 @@ bool GpuOperatorGroup::try_add_operator(const Operator &op) const } return true; } -void GpuOperatorGroup::add_operator(const Operator &op) +void GpuOperatorGroup::add_operator(const Operator &op, bool is_output) { - ARM_COMPUTE_ERROR_ON(!try_add_operator(op)); + ARM_COMPUTE_ERROR_ON(!try_add_operator(op, is_output)); const auto src_tensor_ids = get_tensor_ids(op.tensors().get_const_src_tensors()); const auto dst_tensor_ids = get_tensor_ids(op.tensors().get_const_dst_tensors()); - _graph.add_operator_as_linear(op.id(), src_tensor_ids, dst_tensor_ids); + _graph.add_operator_as_linear(op.id(), src_tensor_ids, dst_tensor_ids, is_output); _operators[op.id()] = op; } Operator GpuOperatorGroup::new_operator(const GpuOperatorType &operator_type, const ArgumentPack<ITensorInfo> &tensors) const diff --git a/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h index 35abe6c543..308a9d796a 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h +++ b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h @@ -77,17 +77,19 @@ public: static constexpr size_t max_fused_operators = 32; /** Try adding (without actually adding) an operator to the group * - * @param[in] op Operator to be added + * @param[in] op Operator to be added + * @param[in] is_output Whether this operator is the output operator. * * @return true If @p op can be added while maintaining the invariants * @return false Otherwise */ - bool try_add_operator(const Operator &op) const; + bool try_add_operator(const Operator &op, bool is_output = false) const; /** Add an operator to the group * - * @param[in] op Operator to be added + * @param[in] op Operator to be added + * @param[in] is_output Whether this operator is the output operator. */ - void add_operator(const Operator &op); + void add_operator(const Operator &op, bool is_output = false); /** Create a new operator * * @param[in] operator_type @ref GpuOperatorType of the new operator diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp index 017536df6c..60c2281433 100644 --- a/src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp @@ -81,7 +81,7 @@ Status GpuOutput::validate_op(const GpuWorkloadSketch &sketch, const auto group = sketch.implementation().operator_group(); const auto op = group.new_operator(operator_type, tensors); - const auto success = group.try_add_operator(op); + const auto success = group.try_add_operator(op, true); ARM_COMPUTE_RETURN_ERROR_ON_MSG(!success, "This operator cannot be fused into the workload."); ARM_COMPUTE_UNUSED(success); @@ -133,7 +133,7 @@ void GpuOutput::create_op(GpuWorkloadSketch &sketch, tensors.add_const_tensor(ACL_DST_0, dst); const Operator op = sketch.implementation().operator_group().new_operator(operator_type, tensors); - sketch.implementation().operator_group().add_operator(op); + sketch.implementation().operator_group().add_operator(op, true); } } // namespace dynamic_fusion diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp index c3128ea552..8adf056912 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp @@ -125,7 +125,7 @@ TagLUT ClTemplateActivation::get_tag_lut(const GpuKernelVariableTable &vtable, c lut["src"] = vtable.get_variable(_src); lut["dst"] = vtable.get_variable(_dst); - const auto dst_argument = vtable.get_variable(comp_group.get_dst_tensors()[0]); + const auto dst_argument = vtable.get_variable(comp_group.get_any_dst_tensor()); lut["arg_dst"] = dst_argument.uniq_name; // Local build options diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp index 1ac49406a8..6ab3a68bb0 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp @@ -137,7 +137,7 @@ TagLUT ClTemplateCast::get_tag_lut(const GpuKernelVariableTable &vtable, const C lut["src"] = vtable.get_variable(_src); lut["dst"] = vtable.get_variable(_dst); - const auto dst_argument = vtable.get_variable(comp_group.get_dst_tensors()[0]); + const auto dst_argument = vtable.get_variable(comp_group.get_any_dst_tensor()); lut["arg_dst"] = dst_argument.uniq_name; // Local build options diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp index 389bd5c65f..6fa77aafe3 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp @@ -251,7 +251,7 @@ TagLUT ClTemplateDepthwiseConv2d::get_tag_lut(const GpuKernelVariableTable &vtab } lut["dst"] = vtable.get_variable(_dst); - const auto dst_argument = vtable.get_variable(comp_group.get_dst_tensors()[0]); + const auto dst_argument = vtable.get_variable(comp_group.get_any_dst_tensor()); lut["arg_dst"] = dst_argument.uniq_name; // Local build options diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp index aa324ffb54..221addb7b5 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp @@ -268,7 +268,7 @@ TagLUT ClTemplateDirectConv2d::get_tag_lut(const GpuKernelVariableTable &vtable, } lut["dst"] = vtable.get_variable(_dst); - const auto dst_argument = vtable.get_variable(comp_group.get_dst_tensors()[0]); + const auto dst_argument = vtable.get_variable(comp_group.get_any_dst_tensor()); lut["arg_dst"] = dst_argument.uniq_name; // Local build options diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp index 6c1e0fb1de..39cec6e31c 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp @@ -194,7 +194,7 @@ TagLUT ClTemplateElementwiseBinary::get_tag_lut(const GpuKernelVariableTable &vt lut["lhs"] = vtable.get_variable(_lhs); lut["rhs"] = vtable.get_variable(_rhs); lut["dst"] = vtable.get_variable(_dst); - lut["out"] = vtable.get_variable(comp_group.get_dst_tensors().front()); + lut["out"] = vtable.get_variable(comp_group.get_any_dst_tensor()); } else { diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp index e4b662b3a8..ef4f2f22a1 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp @@ -84,9 +84,9 @@ TagLUT ClTemplateStore::get_tag_lut(const GpuKernelVariableTable &vtable, const // Local build options lut["meta_kernel_id"] = id(); lut["DST_TENSOR_TYPE"] = "BUFFER"; - const auto dst_info = comp_group.get_dst_tensors()[0]; - lut["DST_DATA_TYPE"] = dst_info->data_type(); + lut["DST_DATA_TYPE"] = _dst->data_type(); + ARM_COMPUTE_UNUSED(comp_group); return lut; } diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp index 0afd0e7581..eed481f109 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp @@ -203,9 +203,7 @@ std::string ClTemplateWriter::write_code() } std::string ClTemplateWriter::write_global_section() const { - const auto dst_tensors = _components.get_dst_tensors(); - ARM_COMPUTE_ERROR_ON_MSG(dst_tensors.size() != 1, "Only one destination tensor per kernel is allowed"); - const auto dst_info = dst_tensors[0]; + const auto dst_info = _components.get_any_dst_tensor(); const auto dst_w = dst_info->dimension(0); const auto tile_w = std::max(1, get_window().x().step()); const auto tile_h = std::max(1, get_window().y().step()); |