Diffstat (limited to 'src/dynamic_fusion/sketch/gpu')
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp                        |  37
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp                        | 146
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h                          |  27
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp                       |   2
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp                               |  10
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h                                  |  10
-rw-r--r--  src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp                            |   4
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp        |   2
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp              |   2
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp   |   2
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp      |   2
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp |   2
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp             |   4
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp            |   4
14 files changed, 93 insertions(+), 161 deletions(-)
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp
index 1f90aab477..669913ce30 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp
@@ -93,40 +93,19 @@ GpuKernelComponentStream GpuKernelComponentGraph::fuse() const
{
// Obtain memory descriptor map
const auto mem_map = assign_memory_descriptors(_tensors, _dependency_graph);
- /// @note Fusion constraints (for kernel components) are exactly the same as the invariants of @ref GpuKernelComponentGroup
- /// Fusion can be framed as a mathematical optimization problem:
- /// Given fusion constraints, find the "best" fusion patterns possible
- /// "Best" is ill-defined at the moment. For now we define "best" fusion pattern as one
- /// which results in the least number of fused kernels ( @ref GpuKernelComponentGroup ) at the end
-
- /// As the first iteration, we offer a sub-optimal algorithm here which ensures all
- /// constraints are met, but provides no guarantee that the fusion pattern is optimal
GpuKernelComponentStream stream{ _services, mem_map };
- // Break down into linear groups of components (constraint 1), preserving topological order
- const auto linear_graphs = _dependency_graph.topological_partition();
+ const auto op_seq = _dependency_graph.build_operators_sequence();
- // Further divide up the linear groups based on rest of the fusion constraints (rely on component group's invariants)
- for(const auto &graph : linear_graphs)
+ stream.new_component_group();
+ for(auto op : op_seq)
{
- for(unsigned int i = 0; i < graph.size(); ++i)
- {
- const auto comp = _components.at(graph[i].op).get();
- // Each new linear graph signals a new component group in the stream
- if(i == 0)
- {
- stream.new_component_group();
- }
- // If it violates the component group's invariant / fusion constraint, breaks up the stream by inserting a new group
- bool success = stream.add_component(comp);
- if(!success)
- {
- stream.new_component_group();
- success = stream.add_component(comp);
- ARM_COMPUTE_ERROR_ON(!success);
- }
- }
+ const auto component = _components.at(op.op).get();
+ const auto success = stream.add_component(component);
+ ARM_COMPUTE_ERROR_ON(!success);
+ ARM_COMPUTE_UNUSED(success);
}
+
return stream;
}
} // namespace dynamic_fusion
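
Editor's note on the rewritten fuse(): the old algorithm split the dependency graph into linear partitions and fell back to opening a new component group whenever add_component() rejected a component; the new one relies on build_operators_sequence() to emit operators in an order where every component fuses into the single group, so a rejection becomes a hard error rather than a split point (that ordering guarantee is an assumption read off this diff, not stated in it). The assert idiom in the new loop is worth spelling out as a sketch:

    const auto success = stream.add_component(component);
    ARM_COMPUTE_ERROR_ON(!success); // compiled in only when asserts are enabled
    ARM_COMPUTE_UNUSED(success);    // keeps release builds free of unused-variable warnings
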
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp
index 3af4c1429d..0d2574957f 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp
@@ -27,6 +27,8 @@
#include "arm_compute/core/Validate.h"
#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+#include <algorithm>
+
namespace arm_compute
{
namespace experimental
@@ -35,6 +37,9 @@ namespace dynamic_fusion
{
bool GpuKernelComponentGroup::add_component(ComponentPtr component)
{
+ ARM_COMPUTE_ERROR_ON_MSG(
+ _finalized, "The component group has been finalized and cannot be altered.");
+
// note: Constraint 1 is guaranteed as a precondition
// Constraint 2
if(component->type() != GpuComponentType::Output && _components.size() >= max_fused_components)
@@ -51,11 +56,6 @@ bool GpuKernelComponentGroup::add_component(ComponentPtr component)
{
return false;
}
- // Constraint 3.3: Disallow multiple output components
- if(!_components.empty() && get_last_component()->type() == GpuComponentType::Output && component->type() == GpuComponentType::Output)
- {
- return false;
- }
// Constraint 4
if(component->type() != GpuComponentType::Unfusable && component->tensors().get_const_dst_tensors().size() != 1U)
{
@@ -124,55 +124,68 @@ bool GpuKernelComponentGroup::add_component(ComponentPtr component)
return true;
}
-std::vector<const ITensorInfo *> GpuKernelComponentGroup::get_src_tensors() const
+void GpuKernelComponentGroup::finalize()
{
- if(_components.empty())
+ if(_finalized)
{
- return {};
+ return;
}
- auto src_tensors = _components[0]->tensors().get_const_src_tensors();
- auto prev_dst_tensor = _components[0]->tensors().get_const_dst_tensors()[0]; // PRE: Only one dst tensor per component
- for(unsigned int i = 1; i < _components.size(); ++i)
+
+ _finalized = true;
+
+ std::set<const ITensorInfo *> input_tensors;
+ std::set<const ITensorInfo *> output_tensors;
+
+ for(auto component : _components)
{
- auto cur_src_tensors = _components[i]->tensors().get_const_src_tensors();
- for(const auto src_tensor : cur_src_tensors)
+ const auto tensors = component->tensors();
+ const auto src_tensors = tensors.get_const_src_tensors();
+ const auto dst_tensors = tensors.get_const_dst_tensors();
+
+ // Detect input, output and intermediate tensors.
+ for(auto tensor : src_tensors)
{
- if(src_tensor->id() == prev_dst_tensor->id())
+ const auto output_tensors_it = output_tensors.find(tensor);
+
+ if(output_tensors_it != output_tensors.end())
{
- continue; // Skip "intermediate" tensors. I.e. tensors that are used to link between two components
+ // This tensor is the output of another operator.
+ // It must be marked as intermediate tensor.
+ output_tensors.erase(output_tensors_it);
+ _interm_tensors.insert(tensor);
+ }
+ else if(_interm_tensors.find(tensor) == _interm_tensors.end())
+ {
+ input_tensors.insert(tensor);
}
- src_tensors.push_back(src_tensor);
}
- prev_dst_tensor = _components[i]->tensors().get_const_dst_tensors()[0]; // PRE: Only one dst tensor per component
+
+ for(auto tensor : dst_tensors)
+ {
+ ARM_COMPUTE_ERROR_ON(input_tensors.find(tensor) != input_tensors.end());
+ ARM_COMPUTE_ERROR_ON(output_tensors.find(tensor) != output_tensors.end());
+ ARM_COMPUTE_ERROR_ON(_interm_tensors.find(tensor) != _interm_tensors.end());
+ output_tensors.insert(tensor);
+ }
}
- return src_tensors;
+ std::set_union(
+ input_tensors.begin(), input_tensors.end(),
+ output_tensors.begin(), output_tensors.end(),
+ std::back_inserter(_argument_tensors));
+ _any_output_tensor = *output_tensors.begin();
}
-std::vector<const ITensorInfo *> GpuKernelComponentGroup::get_dst_tensors() const
+const ITensorInfo *GpuKernelComponentGroup::get_any_dst_tensor() const
{
- if(_components.empty())
- {
- return {};
- }
- const auto dst_tensor_ptrs = _components[_components.size() - 1]->tensors().get_const_dst_tensors();
- std::vector<const ITensorInfo *> dst_tensors;
- for(auto tensor_ptr : dst_tensor_ptrs)
- {
- dst_tensors.push_back(tensor_ptr);
- }
- return dst_tensors;
+ ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized.");
+ return _any_output_tensor;
}
std::vector<const ITensorInfo *> GpuKernelComponentGroup::get_argument_tensors() const
{
- std::vector<const ITensorInfo *> arguments;
- const auto src_tensors = get_src_tensors();
- const auto dst_tensors = get_dst_tensors();
- arguments.reserve(src_tensors.size() + dst_tensors.size());
- arguments.insert(arguments.end(), src_tensors.begin(), src_tensors.end());
- arguments.insert(arguments.end(), dst_tensors.begin(), dst_tensors.end());
- return arguments;
+ ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized.");
+ return _argument_tensors;
}
GpuKernelComponentGroup::ComponentPtr GpuKernelComponentGroup::get_root_component() const
@@ -184,41 +197,10 @@ GpuKernelComponentGroup::ComponentPtr GpuKernelComponentGroup::get_root_componen
return _components[0];
}
-GpuKernelComponentGroup::ComponentPtr GpuKernelComponentGroup::get_last_component() const
-{
- if(empty())
- {
- return nullptr;
- }
- return _components[_components.size() - 1];
-}
-
-GpuKernelComponentGroup::ComponentPtr GpuKernelComponentGroup::get_previous_component(ComponentId id) const
-{
- if(empty())
- {
- return nullptr;
- }
- // Get the index of the requested component
- size_t ind = 0;
- for(const auto c : _components)
- {
- if(c->id() == id)
- {
- break;
- }
- ind++;
- }
- if(ind == 0 || ind >= _components.size())
- {
- return nullptr;
- }
- return _components[ind - 1];
-}
-
bool GpuKernelComponentGroup::is_intermediate_tensor(const ITensorInfo *tensor) const
{
- return is_tensor_in(tensor, get_interm_tensors());
+ ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized.");
+ return _interm_tensors.find(tensor) != _interm_tensors.end();
}
size_t GpuKernelComponentGroup::size() const
@@ -262,30 +244,6 @@ typename std::vector<GpuKernelComponentGroup::ComponentPtr>::const_iterator GpuK
return _components.cend();
}
-std::vector<const ITensorInfo *> GpuKernelComponentGroup::get_interm_tensors() const
-{
- std::vector<const ITensorInfo *> interm_tensors{};
- for(unsigned int i = 0; i + 1 < _components.size(); ++i)
- {
- auto interm_tensor = _components[i]->tensors().get_const_dst_tensors()[0];
- interm_tensors.push_back(interm_tensor); // PRE: Only one dst tensor per component
- }
-
- return interm_tensors;
-}
-
-bool GpuKernelComponentGroup::is_tensor_in(const ITensorInfo *tensor, const std::vector<const ITensorInfo *> tensors)
-{
- for(auto t : tensors)
- {
- if(tensor->id() == t->id())
- {
- return true;
- }
- }
- return false;
-}
-
} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
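
Editor's note: the new finalize() replaces three per-call traversals (get_src_tensors(), get_dst_tensors(), get_interm_tensors()) with one pass that classifies every tensor: a source tensor that an earlier component in the group produced is demoted from output to intermediate; a source never produced inside the group is an input; and every destination stays an output until a later component consumes it. Note also that membership is now tested by ITensorInfo pointer identity, where the removed is_tensor_in() helper compared tensor id()s. A self-contained sketch of the same single-pass classification, with stand-in types in place of the library's:

    #include <set>
    #include <vector>

    struct Tensor; // stand-in for ITensorInfo

    struct Component // stand-in for a kernel component's tensor lists
    {
        std::vector<const Tensor *> srcs;
        std::vector<const Tensor *> dsts;
    };

    // Classify the tensors of a fused group in one pass over its components,
    // which are assumed to be in topological order.
    void classify(const std::vector<Component> &components,
                  std::set<const Tensor *>     &inputs,
                  std::set<const Tensor *>     &outputs,
                  std::set<const Tensor *>     &intermediates)
    {
        for(const auto &comp : components)
        {
            for(const auto *t : comp.srcs)
            {
                if(outputs.erase(t) > 0)
                {
                    intermediates.insert(t); // produced earlier in the group, consumed here
                }
                else if(intermediates.count(t) == 0)
                {
                    inputs.insert(t); // produced outside the group
                }
            }
            for(const auto *t : comp.dsts)
            {
                outputs.insert(t); // an output until a later component reads it
            }
        }
    }

Because std::set iterates in ascending key order, the final std::set_union over inputs and outputs (which finalize() uses to build _argument_tensors) meets the algorithm's sorted-range precondition without an extra sort.
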
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h
index 4c9d940594..386aefdc05 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h
@@ -29,6 +29,7 @@
#include <cstdint>
#include <cstdlib>
#include <vector>
+#include <set>
namespace arm_compute
{
@@ -88,25 +89,16 @@ public:
* @return false If the operation fails
*/
bool add_component(ComponentPtr component);
- /** Get source tensors of this group */
- std::vector<const ITensorInfo *> get_src_tensors() const;
- /** Get destination tensors of this group */
- std::vector<const ITensorInfo *> get_dst_tensors() const;
+ /** Optimize and pre-compute information about the component group */
+ void finalize();
+ /** Get one of the destination tensors of this group */
+ const ITensorInfo *get_any_dst_tensor() const;
/** Get tensor argument of this group
* A tensor is an argument if it is a source or destination tensor to the group
*/
std::vector<const ITensorInfo *> get_argument_tensors() const;
/** Get the root (first) component of this group */
ComponentPtr get_root_component() const;
- /** Get the last component of this group */
- ComponentPtr get_last_component() const;
- /** Get the previous component to the component with id @p id
- *
- * @param[in] id Component id of the component whose previous component is of concern
- *
- * @return ComponentPtr Pointer to the previous component of the one identified by @p id
- */
- ComponentPtr get_previous_component(ComponentId id) const;
/** Check if a @ref ITensorInfo is an "intermediate" tensor of the group
*
* An intermediate tensor is any tensor that is not an argument.
@@ -131,11 +123,12 @@ public:
typename std::vector<ComponentPtr>::const_iterator cend() const;
private:
- std::vector<const ITensorInfo *> get_interm_tensors() const;
-
- static bool is_tensor_in(const ITensorInfo *tensor, const std::vector<const ITensorInfo *> tensors);
-
std::vector<ComponentPtr> _components{};
+
+ bool _finalized{ false };
+ std::vector<const ITensorInfo *> _argument_tensors{};
+ std::set<const ITensorInfo *> _interm_tensors{};
+ const ITensorInfo *_any_output_tensor{ nullptr };
};
} // namespace dynamic_fusion
} // namespace experimental
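
Editor's note: the header changes make the group a two-phase object: components are added while it is mutable, finalize() then freezes it and precomputes _argument_tensors, _interm_tensors and _any_output_tensor, and only after that are the query methods valid. A hedged usage sketch (the component pointers are hypothetical):

    GpuKernelComponentGroup group{};
    group.add_component(conv2d_component);     // hypothetical ComponentPtr values
    group.add_component(activation_component);
    group.finalize();                          // idempotent; required before the queries below
    const auto  arguments = group.get_argument_tensors();
    const auto *any_dst   = group.get_any_dst_tensor();

Adding a component after finalize(), or querying before it, now trips the corresponding ARM_COMPUTE_ERROR_ON_MSG guard; the next file in this patch shows where finalize() is wired in, at the top of GpuKernelComponentStream::write_workload_code().
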
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp
index aac84b6c59..8f4eadc477 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp
@@ -44,6 +44,8 @@ GpuWorkloadSourceCode GpuKernelComponentStream::write_workload_code()
// Traverse through component groups and assemble workload together
for(auto && group : _component_groups)
{
+ group.finalize();
+
// Write kernel code
GpuLogicalKernel logical_kernel(_services, group);
const GpuKernelSourceCode kernel_code = logical_kernel.write_kernel_code();
diff --git a/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp
index e8ef835405..7bb14c8698 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp
+++ b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp
@@ -68,12 +68,12 @@ ArgumentPack<ITensorInfo> Operator::tensors() const
return _tensors;
}
-bool GpuOperatorGroup::try_add_operator(const Operator &op) const
+bool GpuOperatorGroup::try_add_operator(const Operator &op, bool is_output) const
{
const auto src_tensor_ids = get_tensor_ids(op.tensors().get_const_src_tensors());
const auto dst_tensor_ids = get_tensor_ids(op.tensors().get_const_dst_tensors());
// Constraint 1
- if(!_graph.try_add_operator_as_linear(op.id(), src_tensor_ids, dst_tensor_ids))
+ if(!_graph.try_add_operator_as_linear(op.id(), src_tensor_ids, dst_tensor_ids, is_output))
{
return false;
}
@@ -143,12 +143,12 @@ bool GpuOperatorGroup::try_add_operator(const Operator &op) const
}
return true;
}
-void GpuOperatorGroup::add_operator(const Operator &op)
+void GpuOperatorGroup::add_operator(const Operator &op, bool is_output)
{
- ARM_COMPUTE_ERROR_ON(!try_add_operator(op));
+ ARM_COMPUTE_ERROR_ON(!try_add_operator(op, is_output));
const auto src_tensor_ids = get_tensor_ids(op.tensors().get_const_src_tensors());
const auto dst_tensor_ids = get_tensor_ids(op.tensors().get_const_dst_tensors());
- _graph.add_operator_as_linear(op.id(), src_tensor_ids, dst_tensor_ids);
+ _graph.add_operator_as_linear(op.id(), src_tensor_ids, dst_tensor_ids, is_output);
_operators[op.id()] = op;
}
Operator GpuOperatorGroup::new_operator(const GpuOperatorType &operator_type, const ArgumentPack<ITensorInfo> &tensors) const
diff --git a/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h
index 35abe6c543..308a9d796a 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h
+++ b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h
@@ -77,17 +77,19 @@ public:
static constexpr size_t max_fused_operators = 32;
/** Try adding (without actually adding) an operator to the group
*
- * @param[in] op Operator to be added
+ * @param[in] op Operator to be added
+ * @param[in] is_output Whether this operator is the output operator.
*
* @return true If @p op can be added while maintaining the invariants
* @return false Otherwise
*/
- bool try_add_operator(const Operator &op) const;
+ bool try_add_operator(const Operator &op, bool is_output = false) const;
/** Add an operator to the group
*
- * @param[in] op Operator to be added
+ * @param[in] op Operator to be added
+ * @param[in] is_output Whether this operator is the output operator.
*/
- void add_operator(const Operator &op);
+ void add_operator(const Operator &op, bool is_output = false);
/** Create a new operator
*
* @param[in] operator_type @ref GpuOperatorType of the new operator
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp
index 017536df6c..60c2281433 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp
@@ -81,7 +81,7 @@ Status GpuOutput::validate_op(const GpuWorkloadSketch &sketch,
const auto group = sketch.implementation().operator_group();
const auto op = group.new_operator(operator_type, tensors);
- const auto success = group.try_add_operator(op);
+ const auto success = group.try_add_operator(op, true);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(!success, "This operator cannot be fused into the workload.");
ARM_COMPUTE_UNUSED(success);
@@ -133,7 +133,7 @@ void GpuOutput::create_op(GpuWorkloadSketch &sketch,
tensors.add_const_tensor(ACL_DST_0, dst);
const Operator op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
- sketch.implementation().operator_group().add_operator(op);
+ sketch.implementation().operator_group().add_operator(op, true);
}
} // namespace dynamic_fusion
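
Editor's note: GpuOutput is the only caller that sets the new flag. validate_op() dry-runs the insertion with try_add_operator(op, true) on a by-value copy of the operator group, and create_op() performs the real insertion with add_operator(op, true), which asserts success internally. The pairing, condensed (a sketch; surrounding argument checks omitted):

    // GpuOutput::validate_op(): 'group' is a copy, so nothing is mutated.
    const auto op      = group.new_operator(operator_type, tensors);
    const auto success = group.try_add_operator(op, /* is_output = */ true);
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!success, "This operator cannot be fused into the workload.");

    // GpuOutput::create_op(): the real insertion.
    sketch.implementation().operator_group().add_operator(op, /* is_output = */ true);
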
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp
index c3128ea552..8adf056912 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp
@@ -125,7 +125,7 @@ TagLUT ClTemplateActivation::get_tag_lut(const GpuKernelVariableTable &vtable, c
lut["src"] = vtable.get_variable(_src);
lut["dst"] = vtable.get_variable(_dst);
- const auto dst_argument = vtable.get_variable(comp_group.get_dst_tensors()[0]);
+ const auto dst_argument = vtable.get_variable(comp_group.get_any_dst_tensor());
lut["arg_dst"] = dst_argument.uniq_name;
// Local build options
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp
index 1ac49406a8..6ab3a68bb0 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp
@@ -137,7 +137,7 @@ TagLUT ClTemplateCast::get_tag_lut(const GpuKernelVariableTable &vtable, const C
lut["src"] = vtable.get_variable(_src);
lut["dst"] = vtable.get_variable(_dst);
- const auto dst_argument = vtable.get_variable(comp_group.get_dst_tensors()[0]);
+ const auto dst_argument = vtable.get_variable(comp_group.get_any_dst_tensor());
lut["arg_dst"] = dst_argument.uniq_name;
// Local build options
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp
index 389bd5c65f..6fa77aafe3 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp
@@ -251,7 +251,7 @@ TagLUT ClTemplateDepthwiseConv2d::get_tag_lut(const GpuKernelVariableTable &vtab
}
lut["dst"] = vtable.get_variable(_dst);
- const auto dst_argument = vtable.get_variable(comp_group.get_dst_tensors()[0]);
+ const auto dst_argument = vtable.get_variable(comp_group.get_any_dst_tensor());
lut["arg_dst"] = dst_argument.uniq_name;
// Local build options
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp
index aa324ffb54..221addb7b5 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp
@@ -268,7 +268,7 @@ TagLUT ClTemplateDirectConv2d::get_tag_lut(const GpuKernelVariableTable &vtable,
}
lut["dst"] = vtable.get_variable(_dst);
- const auto dst_argument = vtable.get_variable(comp_group.get_dst_tensors()[0]);
+ const auto dst_argument = vtable.get_variable(comp_group.get_any_dst_tensor());
lut["arg_dst"] = dst_argument.uniq_name;
// Local build options
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp
index 6c1e0fb1de..39cec6e31c 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp
@@ -194,7 +194,7 @@ TagLUT ClTemplateElementwiseBinary::get_tag_lut(const GpuKernelVariableTable &vt
lut["lhs"] = vtable.get_variable(_lhs);
lut["rhs"] = vtable.get_variable(_rhs);
lut["dst"] = vtable.get_variable(_dst);
- lut["out"] = vtable.get_variable(comp_group.get_dst_tensors().front());
+ lut["out"] = vtable.get_variable(comp_group.get_any_dst_tensor());
}
else
{
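
Editor's note: each template-writer hunk above is the same mechanical swap. The removed get_dst_tensors() rebuilt a std::vector on every call only for the caller to take element [0] (or .front()), while get_any_dst_tensor() hands back the _any_output_tensor pointer cached by finalize(). Before and after, at a representative call site:

    // Before: materializes a vector of destination tensors per call.
    const auto dst_argument = vtable.get_variable(comp_group.get_dst_tensors()[0]);

    // After: returns the pointer precomputed in finalize().
    const auto dst_argument = vtable.get_variable(comp_group.get_any_dst_tensor());
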
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp
index e4b662b3a8..ef4f2f22a1 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp
@@ -84,9 +84,9 @@ TagLUT ClTemplateStore::get_tag_lut(const GpuKernelVariableTable &vtable, const
// Local build options
lut["meta_kernel_id"] = id();
lut["DST_TENSOR_TYPE"] = "BUFFER";
- const auto dst_info = comp_group.get_dst_tensors()[0];
- lut["DST_DATA_TYPE"] = dst_info->data_type();
+ lut["DST_DATA_TYPE"] = _dst->data_type();
+ ARM_COMPUTE_UNUSED(comp_group);
return lut;
}
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp
index 0afd0e7581..eed481f109 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp
@@ -203,9 +203,7 @@ std::string ClTemplateWriter::write_code()
}
std::string ClTemplateWriter::write_global_section() const
{
- const auto dst_tensors = _components.get_dst_tensors();
- ARM_COMPUTE_ERROR_ON_MSG(dst_tensors.size() != 1, "Only one destination tensor per kernel is allowed");
- const auto dst_info = dst_tensors[0];
+ const auto dst_info = _components.get_any_dst_tensor();
const auto dst_w = dst_info->dimension(0);
const auto tile_w = std::max(1, get_window().x().step());
const auto tile_h = std::max(1, get_window().y().step());