author     Viet-Hoa Do <viet-hoa.do@arm.com>  2022-12-14 14:49:56 +0000
committer  Viet-Hoa Do <viet-hoa.do@arm.com>  2022-12-23 14:11:34 +0000
commit     04f4620cf999846a44089c81720aa920edec6993
tree       1c0080ac59d5b2aa500cd2b2ceffe0575e22a4b6
parent     81fdaddaf36cb4c7ff0d2c52a370dd977a13dc72
download   ComputeLibrary-04f4620cf999846a44089c81720aa920edec6993.tar.gz
Add multiple output support for dynamic fusion
* The dependency graph can now schedule any acyclic graph into a sequential
  list of operators. This is needed because the output operators now form
  branches in the graph.
* Fix the definition of input, output and intermediate tensors in
  GpuKernelComponentGroup to support a non-linear but sequential list of
  operators.
* Add a constraint on GpuOperatorGroup to enforce a strictly linear fusion
  style, while allowing output operators as the only form of branching.

Resolves: COMPMID-5771
Signed-off-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Change-Id: I68de3a31a2456145081f0a397e4e61dd66327682
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8823
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp                          37
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp                         146
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h                            27
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp                          2
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp                                  10
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h                                    10
-rw-r--r--  src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp                                4
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp           2
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp                 2
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp      2
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp         2
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp    2
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp                 4
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp               4
-rw-r--r--  src/dynamic_fusion/sketch/utils/DependencyGraph.h                                  340
-rw-r--r--  tests/validation/dynamic_fusion/gpu/Integration.cpp                                101
16 files changed, 346 insertions, 349 deletions
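
For illustration, the following minimal sketch (not part of the patch) drives the DependencyGraph API introduced below. It assumes DependencyGraph is default-constructible and uses plain integer ids for operators and tensors; only add_operator() and build_operators_sequence() from this patch are exercised. It shows the kind of branched, acyclic graph the scheduler can now linearize:

    using namespace arm_compute::experimental::dynamic_fusion;

    DependencyGraph graph{};
    // t0 -[op0]-> t1, with an output operator op1 branching off t1,
    // and op2 continuing the chain from t1 to t3.
    graph.add_operator(0, /* inputs */ { 0 }, /* outputs */ { 1 });
    graph.add_operator(1, /* inputs */ { 1 }, /* outputs */ { 2 }, /* is_output */ true);
    graph.add_operator(2, /* inputs */ { 1 }, /* outputs */ { 3 });

    // Dependency-preserving sequence over the branched graph: op0, op1, op2.
    const auto op_seq = graph.build_operators_sequence();

Each entry of op_seq is an OpPack carrying the operator id together with its source and destination tensor ids, which is exactly what the simplified GpuKernelComponentGraph::fuse() below iterates over.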
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp
index 1f90aab477..669913ce30 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp
@@ -93,40 +93,19 @@ GpuKernelComponentStream GpuKernelComponentGraph::fuse() const
{
// Obtain memory descriptor map
const auto mem_map = assign_memory_descriptors(_tensors, _dependency_graph);
- /// @note Fusion constraints (for kernel components) are exactly the same as the invariants of @ref GpuKernelComponentGroup
- /// Fusion can be framed as a mathematical optimization problem:
- /// Given fusion constraints, find the "best" fusion patterns possible
- /// "Best" is ill-defined at the moment. For now we define "best" fusion pattern as one
- /// which results in the least number of fused kernels ( @ref GpuKernelComponentGroup ) at the end
-
- /// As the first iteration, we offer a sub-optimal algorithm here which ensures all
- /// constraints are met, but provides no guarantee that the fusion pattern is optimal
GpuKernelComponentStream stream{ _services, mem_map };
- // Break down into linear groups of components (constraint 1), preserving topological order
- const auto linear_graphs = _dependency_graph.topological_partition();
+ const auto op_seq = _dependency_graph.build_operators_sequence();
- // Further divide up the linear groups based on rest of the fusion constraints (rely on component group's invariants)
- for(const auto &graph : linear_graphs)
+ stream.new_component_group();
+ for(auto op : op_seq)
{
- for(unsigned int i = 0; i < graph.size(); ++i)
- {
- const auto comp = _components.at(graph[i].op).get();
- // Each new linear graph signals a new component group in the stream
- if(i == 0)
- {
- stream.new_component_group();
- }
- // If it violates the component group's invariant / fusion constraint, breaks up the stream by inserting a new group
- bool success = stream.add_component(comp);
- if(!success)
- {
- stream.new_component_group();
- success = stream.add_component(comp);
- ARM_COMPUTE_ERROR_ON(!success);
- }
- }
+ const auto component = _components.at(op.op).get();
+ const auto success = stream.add_component(component);
+ ARM_COMPUTE_ERROR_ON(!success);
+ ARM_COMPUTE_UNUSED(success);
}
+
return stream;
}
} // namespace dynamic_fusion
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp
index 3af4c1429d..0d2574957f 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp
@@ -27,6 +27,8 @@
#include "arm_compute/core/Validate.h"
#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+#include <algorithm>
+
namespace arm_compute
{
namespace experimental
@@ -35,6 +37,9 @@ namespace dynamic_fusion
{
bool GpuKernelComponentGroup::add_component(ComponentPtr component)
{
+ ARM_COMPUTE_ERROR_ON_MSG(
+ _finalized, "The component group has been finalized and cannot be altered.");
+
// note: Constraint 1 is guaranteed as a precondition
// Constraint 2
if(component->type() != GpuComponentType::Output && _components.size() >= max_fused_components)
@@ -51,11 +56,6 @@ bool GpuKernelComponentGroup::add_component(ComponentPtr component)
{
return false;
}
- // Constraint 3.3: Disallow multiple output components
- if(!_components.empty() && get_last_component()->type() == GpuComponentType::Output && component->type() == GpuComponentType::Output)
- {
- return false;
- }
// Constraint 4
if(component->type() != GpuComponentType::Unfusable && component->tensors().get_const_dst_tensors().size() != 1U)
{
@@ -124,55 +124,68 @@ bool GpuKernelComponentGroup::add_component(ComponentPtr component)
return true;
}
-std::vector<const ITensorInfo *> GpuKernelComponentGroup::get_src_tensors() const
+void GpuKernelComponentGroup::finalize()
{
- if(_components.empty())
+ if(_finalized)
{
- return {};
+ return;
}
- auto src_tensors = _components[0]->tensors().get_const_src_tensors();
- auto prev_dst_tensor = _components[0]->tensors().get_const_dst_tensors()[0]; // PRE: Only one dst tensor per component
- for(unsigned int i = 1; i < _components.size(); ++i)
+
+ _finalized = true;
+
+ std::set<const ITensorInfo *> input_tensors;
+ std::set<const ITensorInfo *> output_tensors;
+
+ for(auto component : _components)
{
- auto cur_src_tensors = _components[i]->tensors().get_const_src_tensors();
- for(const auto src_tensor : cur_src_tensors)
+ const auto tensors = component->tensors();
+ const auto src_tensors = tensors.get_const_src_tensors();
+ const auto dst_tensors = tensors.get_const_dst_tensors();
+
+ // Detect input, output and intermediate tensors.
+ for(auto tensor : src_tensors)
{
- if(src_tensor->id() == prev_dst_tensor->id())
+ const auto output_tensors_it = output_tensors.find(tensor);
+
+ if(output_tensors_it != output_tensors.end())
{
- continue; // Skip "intermediate" tensors. I.e. tensors that are used to link between two components
+ // This tensor is the output of another operator.
+ // It must be marked as intermediate tensor.
+ output_tensors.erase(output_tensors_it);
+ _interm_tensors.insert(tensor);
+ }
+ else if(_interm_tensors.find(tensor) == _interm_tensors.end())
+ {
+ input_tensors.insert(tensor);
}
- src_tensors.push_back(src_tensor);
}
- prev_dst_tensor = _components[i]->tensors().get_const_dst_tensors()[0]; // PRE: Only one dst tensor per component
+
+ for(auto tensor : dst_tensors)
+ {
+ ARM_COMPUTE_ERROR_ON(input_tensors.find(tensor) != input_tensors.end());
+ ARM_COMPUTE_ERROR_ON(output_tensors.find(tensor) != output_tensors.end());
+ ARM_COMPUTE_ERROR_ON(_interm_tensors.find(tensor) != _interm_tensors.end());
+ output_tensors.insert(tensor);
+ }
}
- return src_tensors;
+ std::set_union(
+ input_tensors.begin(), input_tensors.end(),
+ output_tensors.begin(), output_tensors.end(),
+ std::back_inserter(_argument_tensors));
+ _any_output_tensor = *output_tensors.begin();
}
-std::vector<const ITensorInfo *> GpuKernelComponentGroup::get_dst_tensors() const
+const ITensorInfo *GpuKernelComponentGroup::get_any_dst_tensor() const
{
- if(_components.empty())
- {
- return {};
- }
- const auto dst_tensor_ptrs = _components[_components.size() - 1]->tensors().get_const_dst_tensors();
- std::vector<const ITensorInfo *> dst_tensors;
- for(auto tensor_ptr : dst_tensor_ptrs)
- {
- dst_tensors.push_back(tensor_ptr);
- }
- return dst_tensors;
+ ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized.");
+ return _any_output_tensor;
}
std::vector<const ITensorInfo *> GpuKernelComponentGroup::get_argument_tensors() const
{
- std::vector<const ITensorInfo *> arguments;
- const auto src_tensors = get_src_tensors();
- const auto dst_tensors = get_dst_tensors();
- arguments.reserve(src_tensors.size() + dst_tensors.size());
- arguments.insert(arguments.end(), src_tensors.begin(), src_tensors.end());
- arguments.insert(arguments.end(), dst_tensors.begin(), dst_tensors.end());
- return arguments;
+ ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized.");
+ return _argument_tensors;
}
GpuKernelComponentGroup::ComponentPtr GpuKernelComponentGroup::get_root_component() const
@@ -184,41 +197,10 @@ GpuKernelComponentGroup::ComponentPtr GpuKernelComponentGroup::get_root_componen
return _components[0];
}
-GpuKernelComponentGroup::ComponentPtr GpuKernelComponentGroup::get_last_component() const
-{
- if(empty())
- {
- return nullptr;
- }
- return _components[_components.size() - 1];
-}
-
-GpuKernelComponentGroup::ComponentPtr GpuKernelComponentGroup::get_previous_component(ComponentId id) const
-{
- if(empty())
- {
- return nullptr;
- }
- // Get the index of the requested component
- size_t ind = 0;
- for(const auto c : _components)
- {
- if(c->id() == id)
- {
- break;
- }
- ind++;
- }
- if(ind == 0 || ind >= _components.size())
- {
- return nullptr;
- }
- return _components[ind - 1];
-}
-
bool GpuKernelComponentGroup::is_intermediate_tensor(const ITensorInfo *tensor) const
{
- return is_tensor_in(tensor, get_interm_tensors());
+ ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized.");
+ return _interm_tensors.find(tensor) != _interm_tensors.end();
}
size_t GpuKernelComponentGroup::size() const
@@ -262,30 +244,6 @@ typename std::vector<GpuKernelComponentGroup::ComponentPtr>::const_iterator GpuK
return _components.cend();
}
-std::vector<const ITensorInfo *> GpuKernelComponentGroup::get_interm_tensors() const
-{
- std::vector<const ITensorInfo *> interm_tensors{};
- for(unsigned int i = 0; i + 1 < _components.size(); ++i)
- {
- auto interm_tensor = _components[i]->tensors().get_const_dst_tensors()[0];
- interm_tensors.push_back(interm_tensor); // PRE: Only one dst tensor per component
- }
-
- return interm_tensors;
-}
-
-bool GpuKernelComponentGroup::is_tensor_in(const ITensorInfo *tensor, const std::vector<const ITensorInfo *> tensors)
-{
- for(auto t : tensors)
- {
- if(tensor->id() == t->id())
- {
- return true;
- }
- }
- return false;
-}
-
} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
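
The tensor classification performed by finalize() above can be demonstrated in isolation. The following standalone sketch (not part of the patch) mirrors that loop, with plain integer ids standing in for ITensorInfo pointers: a tensor produced by one component and consumed by a later one is reclassified from output to intermediate, and the argument tensors are the union of the remaining inputs and outputs.

    #include <set>
    #include <vector>

    struct Component
    {
        std::vector<int> srcs;
        std::vector<int> dsts;
    };

    int main()
    {
        // c0: (t0, t1) -> t2, c1: t2 -> t3. Tensor t2 links the two components.
        const std::vector<Component> comps{ { { 0, 1 }, { 2 } }, { { 2 }, { 3 } } };

        std::set<int> inputs, outputs, interm;
        for(const auto &c : comps)
        {
            for(int t : c.srcs)
            {
                if(outputs.erase(t) > 0)
                {
                    interm.insert(t); // produced earlier within the group -> intermediate
                }
                else if(interm.count(t) == 0)
                {
                    inputs.insert(t); // produced outside the group -> group input
                }
            }
            for(int t : c.dsts)
            {
                outputs.insert(t); // tentatively a group output until consumed later
            }
        }
        // Result: inputs = {0, 1}, outputs = {3}, interm = {2}.
        // The argument tensors are the union of inputs and outputs.
        return 0;
    }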
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h
index 4c9d940594..386aefdc05 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h
@@ -29,6 +29,7 @@
#include <cstdint>
#include <cstdlib>
#include <vector>
+#include <set>
namespace arm_compute
{
@@ -88,25 +89,16 @@ public:
* @return false If the operation fails
*/
bool add_component(ComponentPtr component);
- /** Get source tensors of this group */
- std::vector<const ITensorInfo *> get_src_tensors() const;
- /** Get destination tensors of this group */
- std::vector<const ITensorInfo *> get_dst_tensors() const;
+ /** Optimize and pre-compute information about the component group */
+ void finalize();
+ /** Get one of the destination tensors of this group */
+ const ITensorInfo *get_any_dst_tensor() const;
/** Get tensor argument of this group
* A tensor is an argument if it is a source or destination tensor to the group
*/
std::vector<const ITensorInfo *> get_argument_tensors() const;
/** Get the root (first) component of this group */
ComponentPtr get_root_component() const;
- /** Get the last component of this group */
- ComponentPtr get_last_component() const;
- /** Get the previous component to the component with id @p id
- *
- * @param[in] id Component id of the component whose previous component is of concern
- *
- * @return ComponentPtr Pointer to the previous component of the one identified by @p id
- */
- ComponentPtr get_previous_component(ComponentId id) const;
/** Check if a @ref ITensorInfo is an "intermediate" tensor of the group
*
* An intermediate tensor is any tensor that is not an argument.
@@ -131,11 +123,12 @@ public:
typename std::vector<ComponentPtr>::const_iterator cend() const;
private:
- std::vector<const ITensorInfo *> get_interm_tensors() const;
-
- static bool is_tensor_in(const ITensorInfo *tensor, const std::vector<const ITensorInfo *> tensors);
-
std::vector<ComponentPtr> _components{};
+
+ bool _finalized{ false };
+ std::vector<const ITensorInfo *> _argument_tensors{};
+ std::set<const ITensorInfo *> _interm_tensors{};
+ const ITensorInfo *_any_output_tensor{ nullptr };
};
} // namespace dynamic_fusion
} // namespace experimental
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp
index aac84b6c59..8f4eadc477 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp
@@ -44,6 +44,8 @@ GpuWorkloadSourceCode GpuKernelComponentStream::write_workload_code()
// Traverse through component groups and assemble workload together
for(auto && group : _component_groups)
{
+ group.finalize();
+
// Write kernel code
GpuLogicalKernel logical_kernel(_services, group);
const GpuKernelSourceCode kernel_code = logical_kernel.write_kernel_code();
diff --git a/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp
index e8ef835405..7bb14c8698 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp
+++ b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp
@@ -68,12 +68,12 @@ ArgumentPack<ITensorInfo> Operator::tensors() const
return _tensors;
}
-bool GpuOperatorGroup::try_add_operator(const Operator &op) const
+bool GpuOperatorGroup::try_add_operator(const Operator &op, bool is_output) const
{
const auto src_tensor_ids = get_tensor_ids(op.tensors().get_const_src_tensors());
const auto dst_tensor_ids = get_tensor_ids(op.tensors().get_const_dst_tensors());
// Constraint 1
- if(!_graph.try_add_operator_as_linear(op.id(), src_tensor_ids, dst_tensor_ids))
+ if(!_graph.try_add_operator_as_linear(op.id(), src_tensor_ids, dst_tensor_ids, is_output))
{
return false;
}
@@ -143,12 +143,12 @@ bool GpuOperatorGroup::try_add_operator(const Operator &op) const
}
return true;
}
-void GpuOperatorGroup::add_operator(const Operator &op)
+void GpuOperatorGroup::add_operator(const Operator &op, bool is_output)
{
- ARM_COMPUTE_ERROR_ON(!try_add_operator(op));
+ ARM_COMPUTE_ERROR_ON(!try_add_operator(op, is_output));
const auto src_tensor_ids = get_tensor_ids(op.tensors().get_const_src_tensors());
const auto dst_tensor_ids = get_tensor_ids(op.tensors().get_const_dst_tensors());
- _graph.add_operator_as_linear(op.id(), src_tensor_ids, dst_tensor_ids);
+ _graph.add_operator_as_linear(op.id(), src_tensor_ids, dst_tensor_ids, is_output);
_operators[op.id()] = op;
}
Operator GpuOperatorGroup::new_operator(const GpuOperatorType &operator_type, const ArgumentPack<ITensorInfo> &tensors) const
diff --git a/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h
index 35abe6c543..308a9d796a 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h
+++ b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h
@@ -77,17 +77,19 @@ public:
static constexpr size_t max_fused_operators = 32;
/** Try adding (without actually adding) an operator to the group
*
- * @param[in] op Operator to be added
+ * @param[in] op Operator to be added
+ * @param[in] is_output Whether this operator is an output operator.
*
* @return true If @p op can be added while maintaining the invariants
* @return false Otherwise
*/
- bool try_add_operator(const Operator &op) const;
+ bool try_add_operator(const Operator &op, bool is_output = false) const;
/** Add an operator to the group
*
- * @param[in] op Operator to be added
+ * @param[in] op Operator to be added
+ * @param[in] is_output Whether this operator is an output operator.
*/
- void add_operator(const Operator &op);
+ void add_operator(const Operator &op, bool is_output = false);
/** Create a new operator
*
* @param[in] operator_type @ref GpuOperatorType of the new operator
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp
index 017536df6c..60c2281433 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp
@@ -81,7 +81,7 @@ Status GpuOutput::validate_op(const GpuWorkloadSketch &sketch,
const auto group = sketch.implementation().operator_group();
const auto op = group.new_operator(operator_type, tensors);
- const auto success = group.try_add_operator(op);
+ const auto success = group.try_add_operator(op, true);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(!success, "This operator cannot be fused into the workload.");
ARM_COMPUTE_UNUSED(success);
@@ -133,7 +133,7 @@ void GpuOutput::create_op(GpuWorkloadSketch &sketch,
tensors.add_const_tensor(ACL_DST_0, dst);
const Operator op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
- sketch.implementation().operator_group().add_operator(op);
+ sketch.implementation().operator_group().add_operator(op, true);
}
} // namespace dynamic_fusion
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp
index c3128ea552..8adf056912 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp
@@ -125,7 +125,7 @@ TagLUT ClTemplateActivation::get_tag_lut(const GpuKernelVariableTable &vtable, c
lut["src"] = vtable.get_variable(_src);
lut["dst"] = vtable.get_variable(_dst);
- const auto dst_argument = vtable.get_variable(comp_group.get_dst_tensors()[0]);
+ const auto dst_argument = vtable.get_variable(comp_group.get_any_dst_tensor());
lut["arg_dst"] = dst_argument.uniq_name;
// Local build options
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp
index 1ac49406a8..6ab3a68bb0 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp
@@ -137,7 +137,7 @@ TagLUT ClTemplateCast::get_tag_lut(const GpuKernelVariableTable &vtable, const C
lut["src"] = vtable.get_variable(_src);
lut["dst"] = vtable.get_variable(_dst);
- const auto dst_argument = vtable.get_variable(comp_group.get_dst_tensors()[0]);
+ const auto dst_argument = vtable.get_variable(comp_group.get_any_dst_tensor());
lut["arg_dst"] = dst_argument.uniq_name;
// Local build options
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp
index 389bd5c65f..6fa77aafe3 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp
@@ -251,7 +251,7 @@ TagLUT ClTemplateDepthwiseConv2d::get_tag_lut(const GpuKernelVariableTable &vtab
}
lut["dst"] = vtable.get_variable(_dst);
- const auto dst_argument = vtable.get_variable(comp_group.get_dst_tensors()[0]);
+ const auto dst_argument = vtable.get_variable(comp_group.get_any_dst_tensor());
lut["arg_dst"] = dst_argument.uniq_name;
// Local build options
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp
index aa324ffb54..221addb7b5 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp
@@ -268,7 +268,7 @@ TagLUT ClTemplateDirectConv2d::get_tag_lut(const GpuKernelVariableTable &vtable,
}
lut["dst"] = vtable.get_variable(_dst);
- const auto dst_argument = vtable.get_variable(comp_group.get_dst_tensors()[0]);
+ const auto dst_argument = vtable.get_variable(comp_group.get_any_dst_tensor());
lut["arg_dst"] = dst_argument.uniq_name;
// Local build options
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp
index 6c1e0fb1de..39cec6e31c 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp
@@ -194,7 +194,7 @@ TagLUT ClTemplateElementwiseBinary::get_tag_lut(const GpuKernelVariableTable &vt
lut["lhs"] = vtable.get_variable(_lhs);
lut["rhs"] = vtable.get_variable(_rhs);
lut["dst"] = vtable.get_variable(_dst);
- lut["out"] = vtable.get_variable(comp_group.get_dst_tensors().front());
+ lut["out"] = vtable.get_variable(comp_group.get_any_dst_tensor());
}
else
{
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp
index e4b662b3a8..ef4f2f22a1 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp
@@ -84,9 +84,9 @@ TagLUT ClTemplateStore::get_tag_lut(const GpuKernelVariableTable &vtable, const
// Local build options
lut["meta_kernel_id"] = id();
lut["DST_TENSOR_TYPE"] = "BUFFER";
- const auto dst_info = comp_group.get_dst_tensors()[0];
- lut["DST_DATA_TYPE"] = dst_info->data_type();
+ lut["DST_DATA_TYPE"] = _dst->data_type();
+ ARM_COMPUTE_UNUSED(comp_group);
return lut;
}
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp
index 0afd0e7581..eed481f109 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp
@@ -203,9 +203,7 @@ std::string ClTemplateWriter::write_code()
}
std::string ClTemplateWriter::write_global_section() const
{
- const auto dst_tensors = _components.get_dst_tensors();
- ARM_COMPUTE_ERROR_ON_MSG(dst_tensors.size() != 1, "Only one destination tensor per kernel is allowed");
- const auto dst_info = dst_tensors[0];
+ const auto dst_info = _components.get_any_dst_tensor();
const auto dst_w = dst_info->dimension(0);
const auto tile_w = std::max(1, get_window().x().step());
const auto tile_h = std::max(1, get_window().y().step());
diff --git a/src/dynamic_fusion/sketch/utils/DependencyGraph.h b/src/dynamic_fusion/sketch/utils/DependencyGraph.h
index 633c5e4263..c891e76d8b 100644
--- a/src/dynamic_fusion/sketch/utils/DependencyGraph.h
+++ b/src/dynamic_fusion/sketch/utils/DependencyGraph.h
@@ -25,9 +25,7 @@
#define SRC_DYNAMIC_FUSION_SKETCH_UTILS_DEPENDENCYGRAPH
#include "arm_compute/core/Error.h"
-#include <algorithm>
#include <cstdint>
-#include <deque>
#include <map>
#include <set>
#include <tuple>
@@ -84,40 +82,84 @@ public:
friend std::ostream &operator<<(std::ostream &os, const DependencyGraph &);
/** Try adding an operator (without actually adding it), while keeping the graph as a "linear sequence" / list
- * @note The list is expected to only grow from head to tail
+ *
+ * Rule: If the new operator is not the first operator, at least one of its input tensors
+ * must be the output tensor of the last non-output operator. All other input tensors must
+ * be global inputs of the graph (i.e. not the output of any operator).
+ *
+ * Rule: The output tensors of the new operator must not be input tensors of any previously
+ * added operator.
*
* PRECONDITION: The current graph is already linear
*
* @return true If the operator can be added while keeping the graph as a linear sequence
* @return false Otherwise
*/
- bool try_add_operator_as_linear(OperatorId op, const std::vector<TensorId> &inputs, const std::vector<TensorId> &outputs) const
+ bool try_add_operator_as_linear(OperatorId op, const std::vector<TensorId> &inputs, const std::vector<TensorId> &outputs, bool is_output = false) const
{
- ARM_COMPUTE_UNUSED(op, outputs);
+ ARM_COMPUTE_UNUSED(op, is_output);
if(all_ops().empty())
{
return true;
}
- std::vector<TensorId> common_tensors{};
- auto existing_tensors = all_tensors();
- std::sort(existing_tensors.begin(), existing_tensors.end()); // To use std::set_intersection, both input sets must be sorted
- std::vector<TensorId> sorted_inputs{ inputs };
- std::sort(sorted_inputs.begin(), sorted_inputs.end());
- std::set_intersection(existing_tensors.begin(), existing_tensors.end(),
- sorted_inputs.begin(), sorted_inputs.end(), std::back_inserter(common_tensors));
- if(common_tensors.size() != 1U)
+
+ // If the new operator is not the first operator, at least one of its input tensors must
+ // be the output tensor of the last non-output operator. All other input tensors must be
+ // global inputs of the graph (i.e. not the output of any operator).
+ if(_last_op_available)
{
- return false;
+ auto use_input_from_last_op = false;
+
+ for(auto src_tensor : inputs)
+ {
+ const auto src_ops = _adj_src_ops.find(src_tensor);
+
+ if(src_ops != _adj_src_ops.end())
+ {
+ ARM_COMPUTE_ERROR_ON(src_ops->second.size() > 1);
+
+ if(!src_ops->second.empty())
+ {
+ const auto src_op = src_ops->second[0];
+
+ if(src_op == _last_op)
+ {
+ if(use_input_from_last_op)
+ {
+ // To be safe, we also forbid using the output tensor
+ // of the last operator twice.
+ return false;
+ }
+
+ use_input_from_last_op = true;
+ }
+ else
+ {
+ // The input tensor of this operator must not be the output tensor
+ // of any other operator except the last non-output operator.
+ return false;
+ }
+ }
+ }
+ }
+
+ if(!use_input_from_last_op)
+ {
+ // At least one input tensor must be the output tensor of the last non-output operator.
+ return false;
+ }
}
- const auto linked_tensor = common_tensors[0];
- const auto tail_ops = get_dst_ops();
- ARM_COMPUTE_ERROR_ON(tail_ops.size() != 1U); // PRECONDITION
- const auto tail = tail_ops[0];
- if(!is_in(linked_tensor, dst_tensors(tail)))
+ // The output tensor of the new operator must not be the input tensor of any previously
+ // added operator.
+ for(auto dst_tensor : outputs)
{
- return false;
+ if(_adj_dst_ops.find(dst_tensor) != _adj_dst_ops.end())
+ {
+ return false;
+ }
}
+
return true;
}
/** Add an operator, while keeping the graph as a "linear sequence"
@@ -126,10 +168,9 @@ public:
* INVARIANT: The list can only grow from head to tail
* INVARIANT: POSTCONDITION: The graph is linear
*/
- void add_operator_as_linear(OperatorId op, const std::vector<TensorId> &inputs, const std::vector<TensorId> &outputs)
+ void add_operator_as_linear(OperatorId op, const std::vector<TensorId> &inputs, const std::vector<TensorId> &outputs, bool is_output = false)
{
- ARM_COMPUTE_ERROR_ON(!try_add_operator_as_linear(op, inputs, outputs));
- auto success = add_operator(op, inputs, outputs);
+ const auto success = add_operator(op, inputs, outputs, is_output);
ARM_COMPUTE_UNUSED(success);
ARM_COMPUTE_ERROR_ON(!success);
}
@@ -137,11 +178,12 @@ public:
* Return invalid if it violates the DAG invariant
* Invalid operation will not change the graph
*
- * @param[in] op Operator to add
- * @param[in] inputs Input tensors to the operator
- * @param[in] outputs Output tensors to the operator
+ * @param[in] op Operator to add
+ * @param[in] inputs Input tensors to the operator
+ * @param[in] outputs Output tensors to the operator
+ * @param[in] is_output Whether this is an output operator
*/
- bool add_operator(OperatorId op, const std::vector<TensorId> &inputs, const std::vector<TensorId> &outputs)
+ bool add_operator(OperatorId op, const std::vector<TensorId> &inputs, const std::vector<TensorId> &outputs, bool is_output = false)
{
if(operator_exists(op))
{
@@ -169,182 +211,41 @@ public:
}
}
- return true;
- }
-
- /** Sort the graph in a topological order
- *
- * @return std::vector<OpPack>
- */
- std::vector<OpPack> topological_sort() const
- {
- // Incident degree (number of source operators to an op)
- std::map<OperatorId, unsigned int> in_degree{};
- std::set<OperatorId> visited_ops{};
- std::deque<OperatorId> zero_in_degree_ops{};
- std::vector<OpPack> sorted_op_packs{};
- for(auto op : all_ops())
- {
- const auto degree = src_ops(op).size();
- in_degree[op] = degree;
- if(degree == 0)
- {
- zero_in_degree_ops.push_back(op);
- visited_ops.insert(op);
- }
- }
-
- while(!zero_in_degree_ops.empty())
+ if(!is_output)
{
- const OperatorId op = zero_in_degree_ops.front();
- zero_in_degree_ops.pop_front();
- sorted_op_packs.push_back(OpPack{ op, src_tensors(op), dst_tensors(op) });
-
- for(const auto next_op : dst_ops(op))
- {
- if(in_degree[next_op] > 0)
- {
- in_degree[next_op]--;
- }
- if(in_degree[next_op] == 0 && visited_ops.find(next_op) == visited_ops.end())
- {
- zero_in_degree_ops.push_back(next_op);
- visited_ops.insert(op);
- }
- }
+ _last_op_available = true;
+ _last_op = op;
}
- return sorted_op_packs;
+ return true;
}
- void find_independent_paths_util(OperatorId op, std::vector<std::vector<OperatorId>> &paths, std::vector<OperatorId> cur_path,
- const std::map<OperatorId, unsigned int> &in_degree) const
- {
- // We have found an unresolved dependency
- if(in_degree.at(op) > 1)
- {
- paths.push_back(cur_path);
- return;
- }
- const auto child_ops = dst_ops(op);
-
- cur_path.push_back(op);
- // Hit the leaf op
- if(child_ops.empty())
- {
- paths.push_back(cur_path);
- return;
- }
- for(const auto child_op : child_ops)
- {
- find_independent_paths_util(child_op, paths, cur_path, in_degree);
- }
- }
- /** Find all independent linear paths from op, which doesn't depend on any other op
+ /** Build a sequence of operators from the acyclic graph of operators.
*
- * @return std::vector<OpPack>
+ * The graph is visited in a depth-first manner. An operator can only be added to
+ * the sequence once all of the operators that supply its input tensors have been added;
+ * otherwise it is skipped and visited again later. In other words, the dependencies
+ * between operators are preserved in the sequence.
*/
- std::vector<std::vector<OperatorId>> find_independent_paths(OperatorId op,
- const std::map<OperatorId, unsigned int> &in_degree) const
+ std::vector<OpPack> build_operators_sequence() const
{
- std::vector<std::vector<OperatorId>> paths;
- std::vector<OperatorId> cur_path;
- find_independent_paths_util(op, paths, cur_path, in_degree);
- return paths;
- }
- /** Find a longest linear path from op, which doesn't depend on any other op
- *
- * @return std::vector<OpPack>
- */
- std::vector<OperatorId> find_longest_independent_path(OperatorId op,
- const std::map<OperatorId, unsigned int> &in_degree) const
- {
- const auto &paths = find_independent_paths(op, in_degree);
- ARM_COMPUTE_ERROR_ON(paths.empty());
- size_t max_len = 0;
- const std::vector<OperatorId> *max_path = nullptr;
+ std::vector<OpPack> ops_seq;
+ std::set<Id> done_ops;
+ std::set<Id> done_tensors;
- for(const auto &path : paths)
- {
- if(path.size() >= max_len)
- {
- max_path = &path;
- max_len = path.size();
- }
- }
- return *max_path;
- }
- std::vector<OperatorId> propose_next_path(std::set<OperatorId> &candidate_ops,
- const std::map<OperatorId, unsigned int> &in_degree) const
- {
- if(candidate_ops.empty())
- {
- return {};
- }
- size_t max_len = 0;
- std::vector<OperatorId> max_path;
- OperatorId chosen_op{};
- for(auto op : candidate_ops)
- {
- const auto path = find_longest_independent_path(op, in_degree);
- if(path.size() >= max_len)
- {
- chosen_op = op;
- max_path = path;
- max_len = path.size();
- }
- }
- candidate_ops.erase(chosen_op);
- return max_path;
- }
- /** Partition the graph into a list of linear sub-"graphs", while preserving the topological order, and trying to minimize
- * the number of partitions
- */
- std::vector<std::vector<OpPack>> topological_partition() const
- {
- // Initialize zero incident degree and zero in degree ops
- std::map<OperatorId, unsigned int> in_degree{};
- std::set<OperatorId> candidate_ops{};
- for(auto op : all_ops())
- {
- const auto degree = src_ops(op).size();
- in_degree[op] = degree;
- if(degree == 0)
- {
- candidate_ops.insert(op);
- }
- }
+ const auto input_tensors = global_src_tensors();
- std::vector<std::vector<OpPack>> sorted_partitions{};
- while(!candidate_ops.empty())
+ for(auto tensor : input_tensors)
{
- // generate_longest_path_from_zero_indegree_ops(in_degree, visited_ops, candidate_ops)
- const auto path = propose_next_path(candidate_ops, in_degree);
+ done_tensors.insert(tensor);
- // Append to sorted_partitions
- std::vector<OpPack> path_op_pack{};
- for(auto op : path)
+ for(auto op : _adj_dst_ops.at(tensor))
{
- path_op_pack.push_back(OpPack{ op, src_tensors(op), dst_tensors(op) });
- }
- sorted_partitions.push_back(path_op_pack);
- // Remove whole path (Update in_degree, visited_ops, candidate_ops)
- for(auto op : path)
- {
- for(const auto next_op_child : dst_ops(op))
- {
- if(in_degree[next_op_child] > 0)
- {
- in_degree[next_op_child]--;
- }
- if(in_degree[next_op_child] == 0 && !is_in(next_op_child, path)) // We do not want to put the proposed path back into candidates
- {
- candidate_ops.insert(next_op_child);
- }
- }
+ build_operators_sequence_from_op(op, ops_seq, done_ops, done_tensors);
}
}
- return sorted_partitions;
+
+ return ops_seq;
}
/** Strict equality comparison (all internal ids and order of insertion matter).
@@ -673,11 +574,74 @@ private:
return false;
}
+ void build_operators_sequence_from_op(
+ Id op,
+ std::vector<OpPack> &ops_seq,
+ std::set<Id> &done_ops,
+ std::set<Id> &done_tensors) const
+ {
+ while(true)
+ {
+ // If the operator has been added to the sequence, ignore it.
+ if(done_ops.find(op) != done_ops.end())
+ {
+ return;
+ }
+
+ // If not all of the operator's input tensors are available yet, this operator cannot
+ // be added to the sequence for now. It will be visited again once the operators
+ // producing its remaining inputs have been added to the sequence.
+ const auto src_tensors = _adj_src_tensors.at(op);
+
+ for(auto src : src_tensors)
+ {
+ if(done_tensors.find(src) == done_tensors.end())
+ {
+ return;
+ }
+ }
+
+ // This operator is ready to be added to the sequence.
+ const auto dst_tensors = _adj_dst_tensors.at(op);
+
+ done_ops.insert(op);
+
+ OpPack pack{ op, src_tensors, dst_tensors };
+ ops_seq.push_back(pack);
+
+ done_tensors.insert(dst_tensors.begin(), dst_tensors.end());
+
+ // Visit all the sink operators.
+ // When there is exactly one sink, iterate instead of recursing, to bound the recursion depth.
+ if(dst_tensors.size() == 1 && _adj_dst_ops.at(dst_tensors[0]).size() == 1)
+ {
+ op = _adj_dst_ops.at(dst_tensors[0])[0];
+ }
+ else
+ {
+ for(auto dst_tensor : dst_tensors)
+ {
+ const auto dst_ops = _adj_dst_ops.at(dst_tensor);
+
+ for(auto dst_op : dst_ops)
+ {
+ build_operators_sequence_from_op(dst_op, ops_seq, done_ops, done_tensors);
+ }
+ }
+
+ return;
+ }
+ }
+ }
+
private:
AdjList _adj_src_tensors{};
AdjList _adj_dst_tensors{};
AdjList _adj_src_ops{};
AdjList _adj_dst_ops{};
+
+ bool _last_op_available{ false };
+ OperatorId _last_op{ 0 };
};
} // namespace dynamic_fusion
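
To make the new linear-fusion constraint concrete, here is a short sketch (not part of the patch; same integer-id convention as the earlier sketch) of the two rules documented on try_add_operator_as_linear(). Because op1 is flagged as an output operator, op0 remains the last non-output operator, so any subsequent operator must consume its result t1:

    DependencyGraph g{};
    g.add_operator_as_linear(0, { 0 }, { 1 });                        // t0 -[op0]-> t1
    g.add_operator_as_linear(1, { 1 }, { 2 }, /* is_output */ true);  // output branch on t1

    const bool ok  = g.try_add_operator_as_linear(2, { 1 }, { 3 });   // consumes t1: accepted
    const bool bad = g.try_add_operator_as_linear(3, { 0 }, { 4 });   // bypasses t1: rejected

This is the mechanism GpuOutput::create_op() relies on to branch off the chain (is_output = true) while GpuOperatorGroup keeps every other operator strictly linear.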
diff --git a/tests/validation/dynamic_fusion/gpu/Integration.cpp b/tests/validation/dynamic_fusion/gpu/Integration.cpp
index 58d2215e64..a5716ce1e1 100644
--- a/tests/validation/dynamic_fusion/gpu/Integration.cpp
+++ b/tests/validation/dynamic_fusion/gpu/Integration.cpp
@@ -28,6 +28,7 @@
#include "arm_compute/dynamic_fusion/sketch/OperatorAttributes.h"
#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h"
#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h"
#include "tests/CL/CLAccessor.h"
@@ -36,6 +37,7 @@
#include "tests/validation/dynamic_fusion/Utils.h"
#include "tests/validation/reference/ConvolutionLayer.h"
#include "tests/validation/reference/Permute.h"
+#include "tests/validation/reference/ElementwiseOperations.h"
using namespace arm_compute::experimental::dynamic_fusion;
using namespace arm_compute::test::validation::utils;
@@ -137,6 +139,105 @@ TEST_CASE(Conv2d, framework::DatasetMode::ALL)
RelativeTolerance<float> tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */
validate(CLAccessor(t_dst), ref_t_dst_nchw, tolerance_f32);
}
+TEST_CASE(Add_Output_Add_Output, framework::DatasetMode::ALL)
+{
+ /* Computation:
+ * out_0 = in_0 + in_1
+ * out_1 = out_0 + in_2
+ */
+ CLScheduler::get().default_reinit();
+
+ const auto data_type = DataType::F32;
+ const auto t_input_shape = TensorShape(8, 2, 1);
+
+ // Create a new workload sketch
+ auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+ auto gpu_ctx = GpuWorkloadContext{ &cl_compile_ctx };
+ GpuWorkloadSketch sketch{ &gpu_ctx };
+
+ auto in_0_info = sketch.create_tensor_info(t_input_shape, 1, data_type);
+ auto in_1_info = sketch.create_tensor_info(t_input_shape, 1, data_type);
+ auto in_2_info = sketch.create_tensor_info(t_input_shape, 1, data_type);
+
+ auto out_0_info = sketch.create_tensor_info();
+ auto out_1_info = sketch.create_tensor_info();
+
+ auto ans_0_info = sketch.create_tensor_info();
+ auto ans_1_info = sketch.create_tensor_info();
+
+ GpuAdd::create_op(sketch, &in_0_info, &in_1_info, &ans_0_info);
+ GpuOutput::create_op(sketch, &ans_0_info, &out_0_info);
+ GpuAdd::create_op(sketch, &ans_0_info, &in_2_info, &ans_1_info);
+ GpuOutput::create_op(sketch, &ans_1_info, &out_1_info);
+
+ // Configure runtime
+ ClWorkloadRuntime runtime;
+ runtime.configure(sketch);
+
+ // (Important) Allocate auxiliary tensor memory if there are any
+ // Instead of using ACL allocated memory, the user can choose to import memory into the tensors
+ for(auto &data : runtime.get_auxiliary_tensors())
+ {
+ CLTensor *tensor = data.first;
+ AuxMemoryInfo aux_mem_req = data.second;
+ tensor->allocator()->init(*data.first->info(), aux_mem_req.alignment);
+ tensor->allocator()->allocate(); // Use ACL allocated memory
+ // auto buf = cl::Buffer();
+ // tensor->allocator()->import_memory(buf); // Or, import external memory
+ }
+
+ // Construct user tensors
+ CLTensor t_in_0{};
+ CLTensor t_in_1{};
+ CLTensor t_in_2{};
+
+ CLTensor t_out_0{};
+ CLTensor t_out_1{};
+
+ // Initialize user tensors
+ t_in_0.allocator()->init(in_0_info);
+ t_in_1.allocator()->init(in_1_info);
+ t_in_2.allocator()->init(in_2_info);
+
+ t_out_0.allocator()->init(out_0_info);
+ t_out_1.allocator()->init(out_1_info);
+
+ // Allocate and fill user tensors
+ // Instead of using ACL allocator, the user can choose to import memory into the tensors
+ t_in_0.allocator()->allocate();
+ t_in_1.allocator()->allocate();
+ t_in_2.allocator()->allocate();
+
+ t_out_0.allocator()->allocate();
+ t_out_1.allocator()->allocate();
+
+ fill<float>(CLAccessor(t_in_0), 0, library.get());
+ fill<float>(CLAccessor(t_in_1), 1, library.get());
+ fill<float>(CLAccessor(t_in_2), 2, library.get());
+
+ // Run runtime
+ runtime.run({ &t_in_0, &t_in_1, &t_in_2, &t_out_0, &t_out_1 });
+
+ // Create reference
+ SimpleTensor<float> ref_t_in_0{ t_input_shape, data_type, 1, QuantizationInfo() };
+ SimpleTensor<float> ref_t_in_1{ t_input_shape, data_type, 1, QuantizationInfo() };
+ SimpleTensor<float> ref_t_in_2{ t_input_shape, data_type, 1, QuantizationInfo() };
+
+ SimpleTensor<float> ref_t_out_0{ t_input_shape, data_type, 1, QuantizationInfo() };
+ SimpleTensor<float> ref_t_out_1{ t_input_shape, data_type, 1, QuantizationInfo() };
+
+ // Fill reference
+ fill<float>(ref_t_in_0, 0, library.get());
+ fill<float>(ref_t_in_1, 1, library.get());
+ fill<float>(ref_t_in_2, 2, library.get());
+
+ reference::arithmetic_operation(ArithmeticOperation::ADD, ref_t_in_0, ref_t_in_1, ref_t_out_0, ConvertPolicy::WRAP);
+ reference::arithmetic_operation(ArithmeticOperation::ADD, ref_t_out_0, ref_t_in_2, ref_t_out_1, ConvertPolicy::WRAP);
+
+ RelativeTolerance<float> tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */
+ validate(CLAccessor(t_out_0), ref_t_out_0, tolerance_f32);
+ validate(CLAccessor(t_out_1), ref_t_out_1, tolerance_f32);
+}
TEST_SUITE(Invalid_Fusion_Should_Fail)
TEST_CASE(Multiple_Complex_Ops_0, framework::DatasetMode::ALL)
{