 src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp                             | 128
 src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h                               |  21
 src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h                         |   2
 src/dynamic_fusion/sketch/gpu/operators/GpuCast.cpp                                   |   2
 src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp              |  47
 src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h                |  18
 src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp             |  12
 src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp                   |  37
 src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp        |   9
 src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp           |   9
 src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp      | 208
 src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.cpp |   8
 src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp                 |   5
 src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp                  |   4
 src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp                 |  20
 tests/validation/dynamic_fusion/gpu/Integration.cpp                                   | 117
 16 files changed, 477 insertions(+), 170 deletions(-)
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp
index 0d2574957f..81c3f0c800 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp
@@ -133,8 +133,9 @@ void GpuKernelComponentGroup::finalize()
_finalized = true;
- std::set<const ITensorInfo *> input_tensors;
std::set<const ITensorInfo *> output_tensors;
+ std::map<const ITensorInfo *, std::vector<const ITensorInfo *>> possible_tile_map;
+ std::map<const ITensorInfo *, int32_t> tile_usages;
for(auto component : _components)
{
@@ -156,26 +157,139 @@ void GpuKernelComponentGroup::finalize()
}
else if(_interm_tensors.find(tensor) == _interm_tensors.end())
{
- input_tensors.insert(tensor);
+ _input_tensors.insert(tensor);
+
+ tile_usages[tensor] = 0;
+ possible_tile_map.emplace(tensor, std::vector<const ITensorInfo *>());
}
}
for(auto tensor : dst_tensors)
{
- ARM_COMPUTE_ERROR_ON(input_tensors.find(tensor) != input_tensors.end());
+ ARM_COMPUTE_ERROR_ON(_input_tensors.find(tensor) != _input_tensors.end());
ARM_COMPUTE_ERROR_ON(output_tensors.find(tensor) != output_tensors.end());
ARM_COMPUTE_ERROR_ON(_interm_tensors.find(tensor) != _interm_tensors.end());
output_tensors.insert(tensor);
+
+ tile_usages[tensor] = 0;
+ possible_tile_map.emplace(tensor, std::vector<const ITensorInfo *>());
+ }
+
+ // Check if the output can overwrite the input tile.
+ const auto component_type = component->type();
+ if(component_type == GpuComponentType::Simple || component_type == GpuComponentType::Output)
+ {
+ ARM_COMPUTE_ERROR_ON(dst_tensors.size() != 1);
+
+ const auto dst_tensor = dst_tensors[0];
+ const auto &dst_shape = dst_tensor->tensor_shape();
+ const auto &dst_type = dst_tensor->data_type();
+
+ tile_usages[dst_tensor] = 0;
+
+ for(auto src_tensor : src_tensors)
+ {
+ const auto &src_shape = src_tensor->tensor_shape();
+ const auto &src_type = src_tensor->data_type();
+
+ if(src_shape == dst_shape && src_type == dst_type)
+ {
+ const auto tile_usages_it = tile_usages.find(src_tensor);
+ ARM_COMPUTE_ERROR_ON(tile_usages_it == tile_usages.end());
+
+ if(component_type == GpuComponentType::Simple || tile_usages_it->second > 0)
+ {
+ // Increase the number of tile usages, unless this component is an output
+ // and the tile has not yet been shared with any other component.
+ // (Reason: an output component doesn't change the content of the tile.)
+ ++tile_usages_it->second;
+ }
+
+ possible_tile_map[dst_tensor].push_back(src_tensor);
+ }
+ }
+ }
+ else
+ {
+ // Outputs of complex and unfusable components need a dedicated tile.
+ for(auto tensor : dst_tensors)
+ {
+ tile_usages[tensor] = 0;
+ }
+ }
+ }
+
+ // Find the smallest list of tiles that the intermediate tensors need to write to.
+ for(auto tensor : _input_tensors)
+ {
+ _tile_map[tensor] = tensor;
+ }
+
+ for(auto component : _components)
+ {
+ const auto dst_tensors = component->tensors().get_const_dst_tensors();
+
+ for(auto tensor : dst_tensors)
+ {
+ const auto target_tiles = possible_tile_map.at(tensor);
+ _tile_map[tensor] = tensor;
+
+ for(auto target : target_tiles)
+ {
+ const auto num_usage = tile_usages[target];
+
+ if(num_usage <= 1)
+ {
+ // The target tile is consumed by only this operator, so we can reuse it
+ // for the destination tensor data.
+ _tile_map[tensor] = _tile_map.at(target);
+ break;
+ }
+ }
+ }
+ }
+
+ for(auto tensor : output_tensors)
+ {
+ _tile_map[tensor] = tensor;
+ }
+
+ // All intermediate tensors whose tile cannot be shared with any previous tensor
+ // will need to be declared as tile variables.
+ for(auto tensor_tile : _tile_map)
+ {
+ if(tensor_tile.first == tensor_tile.second &&
+ _interm_tensors.find(tensor_tile.first) != _interm_tensors.end())
+ {
+ _tiles.push_back(tensor_tile.first);
}
}
std::set_union(
- input_tensors.begin(), input_tensors.end(),
+ _input_tensors.begin(), _input_tensors.end(),
output_tensors.begin(), output_tensors.end(),
std::back_inserter(_argument_tensors));
_any_output_tensor = *output_tensors.begin();
}
+std::vector<const ITensorInfo *> GpuKernelComponentGroup::get_tiles() const
+{
+ ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized.");
+ return _tiles;
+}
+
+const ITensorInfo *GpuKernelComponentGroup::get_tile_for_tensor(const ITensorInfo *tensor) const
+{
+ ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized.");
+
+ if(_tile_map.find(tensor) != _tile_map.end())
+ {
+ return _tile_map.at(tensor);
+ }
+
+ return tensor;
+}
+
const ITensorInfo *GpuKernelComponentGroup::get_any_dst_tensor() const
{
ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized.");
@@ -203,6 +317,12 @@ bool GpuKernelComponentGroup::is_intermediate_tensor(const ITensorInfo *tensor)
return _interm_tensors.find(tensor) != _interm_tensors.end();
}
+bool GpuKernelComponentGroup::is_input_tensor(const ITensorInfo *tensor) const
+{
+ ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized.");
+ return _input_tensors.find(tensor) != _input_tensors.end();
+}
+
size_t GpuKernelComponentGroup::size() const
{
return _components.size();
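
The finalize() changes above implement a two-pass tile-sharing scheme: the first pass counts how many fusable consumers read each tensor's tile, and the second maps every destination tensor onto the first candidate tile with at most one consumer, so that Simple and Output components can overwrite their input tiles. Below is a minimal standalone sketch of the idea, assuming plain integer ids in place of const ITensorInfo * and a reduced Component type; all names are illustrative stand-ins, not the library API.

    // Standalone sketch of the tile-sharing pass (assumption: int ids stand in
    // for const ITensorInfo *, shape/type equality checks are elided).
    #include <cassert>
    #include <map>
    #include <vector>

    enum class Type { Simple, Output, Complex };

    struct Component
    {
        Type             type;
        std::vector<int> srcs;
        std::vector<int> dsts; // Simple/Output components have exactly one dst
    };

    int main()
    {
        // in(0) -> complex -> t(10) -> cast(simple) -> t(11) -> output -> out(20)
        const std::vector<Component> components{
            { Type::Complex, { 0 }, { 10 } },
            { Type::Simple, { 10 }, { 11 } },
            { Type::Output, { 11 }, { 20 } },
        };

        // Pass 1: count tile sharers and collect candidate tiles per destination.
        std::map<int, int>              tile_usages;
        std::map<int, std::vector<int>> possible_tile_map;
        for(const auto &c : components)
        {
            for(const int d : c.dsts)
            {
                tile_usages[d] = 0;
                possible_tile_map.emplace(d, std::vector<int>());
            }
            if(c.type == Type::Complex)
            {
                continue; // complex/unfusable outputs always get a dedicated tile
            }
            const int d = c.dsts[0];
            for(const int s : c.srcs)
            {
                if(c.type == Type::Simple || tile_usages[s] > 0)
                {
                    ++tile_usages[s]; // output components don't modify the tile
                }
                possible_tile_map[d].push_back(s);
            }
        }

        // Pass 2: reuse the first candidate tile consumed by at most one component.
        std::map<int, int> tile_map{ { 0, 0 } }; // inputs map to themselves
        for(const auto &c : components)
        {
            for(const int d : c.dsts)
            {
                tile_map[d] = d;
                for(const int target : possible_tile_map.at(d))
                {
                    if(tile_usages[target] <= 1)
                    {
                        tile_map[d] = tile_map.at(target);
                        break;
                    }
                }
            }
        }
        tile_map[20] = 20; // finalize() forces graph outputs back onto their own tiles

        assert(tile_map.at(11) == 10); // the cast result overwrites the add's tile
        return 0;
    }

Graph outputs are forced back onto their own tiles at the end because they must be written through dedicated kernel arguments.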
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h
index 386aefdc05..c939aec369 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h
@@ -30,6 +30,7 @@
#include <cstdlib>
#include <vector>
#include <set>
+#include <map>
namespace arm_compute
{
@@ -109,6 +110,22 @@ public:
* @return false Otherwise
*/
bool is_intermediate_tensor(const ITensorInfo *tensor) const;
+ /** Check if an @ref ITensorInfo is an input tensor of the group.
+ *
+ * @param[in] tensor @ref ITensorInfo to be looked up.
+ *
+ * @return true if @p tensor is an input tensor of the group, otherwise false.
+ */
+ bool is_input_tensor(const ITensorInfo *tensor) const;
+ /** Get the list of temporary tiles that need to be declared */
+ std::vector<const ITensorInfo *> get_tiles() const;
+ /** Get the shared tile that can be used to store temporary data of the specified tensor.
+ *
+ * @param[in] tensor @ref ITensorInfo to be looked up.
+ *
+ * @return @ref ITensorInfo that is used to store temporary data of @p tensor.
+ */
+ const ITensorInfo *get_tile_for_tensor(const ITensorInfo *tensor) const;
/** Get the number of components within the group */
size_t size() const;
/** Check if the component group is empty */
@@ -126,9 +143,13 @@ private:
std::vector<ComponentPtr> _components{};
bool _finalized{ false };
+
std::vector<const ITensorInfo *> _argument_tensors{};
+ std::set<const ITensorInfo *> _input_tensors{};
std::set<const ITensorInfo *> _interm_tensors{};
const ITensorInfo *_any_output_tensor{ nullptr };
+ std::vector<const ITensorInfo *> _tiles{};
+ std::map<const ITensorInfo *, const ITensorInfo *> _tile_map{};
};
} // namespace dynamic_fusion
} // namespace experimental
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h
index d0f75b1062..84d6f07f16 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h
@@ -120,7 +120,7 @@ public:
/** Get component type */
GpuComponentType type() const override
{
- return GpuComponentType::Complex;
+ return GpuComponentType::Simple;
}
private:
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuCast.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuCast.cpp
index 9e5e735c60..3a5b64ad9c 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/GpuCast.cpp
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuCast.cpp
@@ -38,7 +38,7 @@ namespace dynamic_fusion
{
namespace
{
-constexpr GpuOperatorType operator_type = GpuOperatorType::Complex;
+constexpr GpuOperatorType operator_type = GpuOperatorType::Simple;
}
Status GpuCast::is_supported_op(const GpuWorkloadContext &context,
const ITensorInfo *src,
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp
index 13c0b141a5..2eafe62bfa 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp
@@ -24,6 +24,7 @@
#include "GpuKernelVariableTable.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/ITensorInfo.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
namespace arm_compute
{
@@ -31,44 +32,48 @@ namespace experimental
{
namespace dynamic_fusion
{
-void GpuKernelVariableTable::declare_variable(const ITensorInfo *tensor, GpuKernelArgumentInfo argument_info, bool is_interm, const std::string &alias)
+void GpuKernelVariableTable::declare_variable(const GpuKernelComponentGroup &comp_group, const ITensorInfo *tensor, GpuKernelArgumentInfo argument_info, const std::string &alias)
{
ARM_COMPUTE_ERROR_ON_MSG(!tensor->has_valid_id(), "Tensor info with valid id expected");
+
// Do not re-declare if the variable associated with the tensor has already been declared
- if(get_variable(tensor).has_valid_id())
+ auto it = _vars.find(tensor->id());
+
+ if(it != _vars.end())
{
- ARM_COMPUTE_ERROR_ON(!(get_variable(tensor).kernel_argument_info == argument_info));
+ ARM_COMPUTE_ERROR_ON(!(it->second.kernel_argument_info == argument_info));
return;
}
- // Declare variable associated with the tensor
- std::stringstream ss;
- ss << alias << "_t" << tensor->id();
- const auto uniq_name = ss.str();
- TensorVariable var{ tensor->id(), uniq_name, argument_info };
- if(is_interm)
+ const auto target = comp_group.get_tile_for_tensor(tensor);
+
+ if(target != tensor)
{
- _interm_var = var;
- _interm_tensors.insert(tensor->id());
+ // If the tensor uses a shared tile, don't declare another variable.
+ it = _vars.find(target->id());
+
+ ARM_COMPUTE_ERROR_ON_MSG(
+ it == _vars.end(),
+ "The variable used for this tensor must have been declared.");
+
+ _vars[tensor->id()] = it->second;
}
else
{
+ // Declare variable associated with the tensor
+ std::stringstream ss;
+ ss << alias << "_t" << tensor->id();
+ const auto uniq_name = ss.str();
+ TensorVariable var{ tensor->id(), uniq_name, argument_info };
+
_vars.emplace(tensor->id(), var);
}
}
GpuKernelVariableTable::TensorVariable GpuKernelVariableTable::get_variable(const ITensorInfo *tensor) const
{
- const TensorVariable empty_var{};
- if(_vars.find(tensor->id()) != _vars.end())
- {
- return _vars.at(tensor->id());
- }
- if(_interm_tensors.find(tensor->id()) != _interm_tensors.end())
- {
- return _interm_var;
- }
- return empty_var;
+ const auto var = _vars.at(tensor->id());
+ return var;
}
GpuKernelVariableTable::VariableList GpuKernelVariableTable::get_variable_list(const std::vector<const ITensorInfo *> &tensors) const
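
The declare_variable() rewrite means a tensor that shares a tile simply reuses the variable already declared for that tile, replacing the old single-accumulator scheme. A reduced sketch of that lookup, assuming a bare int id and a trivial TensorVariable; these are illustrative stand-ins, not the library types.

    // Sketch of variable aliasing via the component group's tile map.
    #include <cassert>
    #include <map>
    #include <string>

    struct TensorVariable
    {
        int         id{ -1 };
        std::string uniq_name{};
    };

    std::map<int, TensorVariable> vars;                   // tensor id -> variable
    std::map<int, int>            tile_map{ { 11, 10 } }; // from the component group

    int tile_for(int tensor)
    {
        const auto it = tile_map.find(tensor);
        return it != tile_map.end() ? it->second : tensor;
    }

    void declare_variable(int tensor, const std::string &alias)
    {
        if(vars.count(tensor) != 0)
        {
            return; // do not re-declare
        }
        const int target = tile_for(tensor);
        if(target != tensor)
        {
            vars[tensor] = vars.at(target); // reuse the shared tile's variable
        }
        else
        {
            vars[tensor] = TensorVariable{ tensor, alias + "_t" + std::to_string(tensor) };
        }
    }

    int main()
    {
        declare_variable(10, "dst");
        declare_variable(11, "dst"); // aliases tensor 10's variable
        assert(vars.at(11).uniq_name == "dst_t10");
        return 0;
    }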
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h b/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h
index 4eee3963c2..82b7513c0d 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h
@@ -39,9 +39,10 @@ namespace experimental
{
namespace dynamic_fusion
{
-/** A table of all the variables used in the kernel
- * Since fusion is restricted to a linear sequence of components in a kernel, only a single "intermediate variable" (the accumulator) is allowed.
- * Each kernel has exactly one variable table
+class GpuKernelComponentGroup;
+
+/** A table of all the variables used in the kernel.
+ * Each kernel has exactly one variable table.
*/
class GpuKernelVariableTable
{
@@ -69,15 +70,12 @@ public:
public:
/** Declare a @ref TensorVariable for a corresponding tensor info.
*
- * @note: Later re-declaration of the intermediate variable will overwrite the previous association to the @ref ITensorInfo
- * Therefore, the order of declaration is important. It's assumed that the components declaring the variable is already in correct order
- *
+ * @param[in] comp_group Component group the tensor belongs to
* @param[in] tensor Tensor info with which the new variable is associated
* @param[in] argument_info Kernel argument information
- * @param[in] is_interm If the new variable is an intermediate variable
* @param[in] alias Alias for the variable. Will be used as part of the variable name
*/
- void declare_variable(const ITensorInfo *tensor, GpuKernelArgumentInfo argument_info, bool is_interm = false, const std::string &alias = "unnamed");
+ void declare_variable(const GpuKernelComponentGroup &comp_group, const ITensorInfo *tensor, GpuKernelArgumentInfo argument_info, const std::string &alias = "unnamed");
/** Get the @ref TensorVariable associated with @p tensor
*
* @param[in] tensor Tensor info to be queried
@@ -95,9 +93,7 @@ public:
VariableList get_variable_list(const std::vector<const ITensorInfo *> &tensors) const;
private:
- std::map<ITensorInfo::Id, TensorVariable> _vars{}; /**< Non-intermediate (function parameter) variables*/
- TensorVariable _interm_var{}; /**< Intermediate variable */
- std::set<ITensorInfo::Id> _interm_tensors{}; /**< Tensors associated with the single intermediate variable */
+ std::map<ITensorInfo::Id, TensorVariable> _vars{};
};
/** A tag value will substitute a tag in a string template during its instantiation */
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp
index 8adf056912..53e74b4187 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp
@@ -67,14 +67,14 @@ std::string ClTemplateActivation::get_component_code(const ComponentGroup &comp_
// IN(src) {{src}}
// OUT(dst, accum) {{dst}}
-TILE({{DATA_TYPE}}, M0, N0, {{dst}});
+TILE({{DATA_TYPE}}, M0, N0, {{src}});
TILE(uint, M0, 1, g_dst_indirect_y);
{
{{src}}_offset_first_element_in_bytes += g_ind_2 * {{src}}_stride_z;
- T_LOAD({{DATA_TYPE}}, M0, N0, {{TENSOR_TYPE}}, {{src}}, g_ind_0, g_ind_1, 1, {{src}}_stride_y, {{dst}});
+ T_LOAD({{DATA_TYPE}}, M0, N0, {{TENSOR_TYPE}}, {{src}}, g_ind_0, g_ind_1, 1, {{src}}_stride_y, {{src}});
- T_ACTIVATION({{DATA_TYPE}}, M0, N0, {{ACT}}, {{A_VAL}}, {{B_VAL}}, {{dst}}, {{dst}});
+ T_ACTIVATION({{DATA_TYPE}}, M0, N0, {{ACT}}, {{A_VAL}}, {{B_VAL}}, {{src}}, {{dst}});
}
LOOP_UNROLLING(int, i, 0, 1, M0,
@@ -91,7 +91,7 @@ LOOP_UNROLLING(int, i, 0, 1, M0,
// IN/OUT(src, accum) {{src}}
{
- T_ACTIVATION({{DATA_TYPE}}, M0, N0, {{ACT}}, {{A_VAL}}, {{B_VAL}}, {{src}}, {{src}});
+ T_ACTIVATION({{DATA_TYPE}}, M0, N0, {{ACT}}, {{A_VAL}}, {{B_VAL}}, {{src}}, {{dst}});
}
)_";
}
@@ -104,15 +104,15 @@ LOOP_UNROLLING(int, i, 0, 1, M0,
void ClTemplateActivation::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
{
vtable.declare_variable(
+ comp_group,
_src,
GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- comp_group.is_intermediate_tensor(_src),
"src");
vtable.declare_variable(
+ comp_group,
_dst,
GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- comp_group.is_intermediate_tensor(_dst),
"dst");
}
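
After the variable-table change, {{src}} and {{dst}} in the template above can resolve to the same tile variable, in which case T_ACTIVATION updates the tile in place; this is safe because the operation is elementwise. A trivial aliasing sketch, with plain C++ references standing in for tiles:

    // Sketch: {{src}} and {{dst}} may name the same tile after variable aliasing.
    #include <cassert>

    float relu(float x) { return x > 0.0f ? x : 0.0f; }

    int main()
    {
        float  tile = -2.0f;
        float &src  = tile;
        float &dst  = tile;  // shared tile: dst aliases src
        dst = relu(src);     // in-place update is safe for elementwise ops
        assert(tile == 0.0f);
        return 0;
    }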
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp
index 6ab3a68bb0..dcb43f9783 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp
@@ -54,20 +54,26 @@ std::string ClTemplateCast::get_component_code(const ComponentGroup &comp_group)
ARM_COMPUTE_UNUSED(comp_group);
const std::string kernel_name = get_name();
+ const auto is_root = (comp_group.get_root_component()->id() == this->id());
std::string code = R"_(
-//------------------ START KERNEL {{meta_kernel_id}} ---------------------
+//------------------ START KERNEL {{meta_kernel_id}} CAST ---------------------
+)_";
+
+ if(is_root)
+ {
+ code += R"_(
// IN_0(src) {{src}}
// OUT(dst, accum) {{dst}}
-TILE({{DATA_TYPE_OUT}}, M0, N0, {{dst}});
TILE(uint, M0, 1, g_dst_indirect_y);
{
{{src}}_offset_first_element_in_bytes += get_global_id(2) * {{src}}_stride_z;
- TILE({{DATA_TYPE_IN}}, M0, N0, in_data);
- T_LOAD({{DATA_TYPE_IN}}, M0, N0, BUFFER, {{src}}, g_ind_0, g_ind_1, 1, {{src}}_stride_y, in_data);
+ TILE({{DATA_TYPE_IN}}, M0, N0, {{tmp}});
+ T_LOAD({{DATA_TYPE_IN}}, M0, N0, BUFFER, {{src}}, g_ind_0, g_ind_1, 1, {{src}}_stride_y, {{tmp}});
)_";
+ }
code += R"_(
LOOP_UNROLLING(int, m0, 0, 1, M0,
@@ -77,20 +83,20 @@ TILE(uint, M0, 1, g_dst_indirect_y);
if(kernel_name == "cast_down" && is_data_type_quantized(_src->data_type()))
{
code += R"_(
- in_data[m0].v ^= (VEC_DATA_TYPE({{DATA_TYPE_IN}}, N0))0x80;
+ {{tmp}}[m0].v ^= (VEC_DATA_TYPE({{DATA_TYPE_IN}}, N0))0x80;
)_";
}
if(kernel_name == "cast_down" && (is_data_type_float(_src->data_type()) || _attributes.convert_policy() == ConvertPolicy::SATURATE))
{
code += R"_(
- {{dst}}[m0].v = CONVERT_SAT(in_data[m0].v, VEC_DATA_TYPE({{DATA_TYPE_OUT}}, N0));
+ {{dst}}[m0].v = CONVERT_SAT({{tmp}}[m0].v, VEC_DATA_TYPE({{DATA_TYPE_OUT}}, N0));
)_";
}
else
{
code += R"_(
- {{dst}}[m0].v = CONVERT(in_data[m0].v, VEC_DATA_TYPE({{DATA_TYPE_OUT}}, N0));
+ {{dst}}[m0].v = CONVERT({{tmp}}[m0].v, VEC_DATA_TYPE({{DATA_TYPE_OUT}}, N0));
)_";
}
@@ -98,7 +104,9 @@ TILE(uint, M0, 1, g_dst_indirect_y);
})
)_";
- code += R"_(
+ if(is_root)
+ {
+ code += R"_(
LOOP_UNROLLING(int, i, 0, 1, M0,
{
g_dst_indirect_y[i].v = (uint)min((int)(g_ind_1 + i), (int)({{arg_dst}}_w) - 1);
@@ -106,7 +114,11 @@ TILE(uint, M0, 1, g_dst_indirect_y);
g_dst_indirect_y[i].v += (int)(g_ind_2 / {{arg_dst}}_h) * (int)({{arg_dst}}_w * {{arg_dst}}_h);
})
}
-//------------------ END KERNEL {{meta_kernel_id}} ---------------------
+)_";
+ }
+
+ code += R"_(
+//------------------ END KERNEL {{meta_kernel_id}} CAST ---------------------
)_";
return code;
@@ -115,27 +127,28 @@ TILE(uint, M0, 1, g_dst_indirect_y);
void ClTemplateCast::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
{
vtable.declare_variable(
+ comp_group,
_src,
GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- comp_group.is_intermediate_tensor(_src),
"src");
vtable.declare_variable(
+ comp_group,
_dst,
GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- comp_group.is_intermediate_tensor(_dst),
"dst");
}
TagLUT ClTemplateCast::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
{
- ARM_COMPUTE_UNUSED(comp_group);
+ const auto is_root = (comp_group.get_root_component()->id() == this->id());
TagLUT lut{};
// Arguments and global shared variables
lut["src"] = vtable.get_variable(_src);
lut["dst"] = vtable.get_variable(_dst);
+ lut["tmp"] = (is_root) ? lut["src"].value + "_in_data" : lut["src"];
const auto dst_argument = vtable.get_variable(comp_group.get_any_dst_tensor());
lut["arg_dst"] = dst_argument.uniq_name;
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp
index 6fa77aafe3..ab7cc9f05a 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp
@@ -81,7 +81,6 @@ std::string ClTemplateDepthwiseConv2d::get_component_code(const ComponentGroup &
code += R"_(
// OUT(dst, accum) {{dst}}
-TILE({{ACC_DATA_TYPE}}, M0, N0, {{dst}});
TILE(uint, M0, 1, g_dst_indirect_y);
{
@@ -206,9 +205,9 @@ void ClTemplateDepthwiseConv2d::declare_variables(GpuKernelVariableTable &vtable
GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer;
vtable.declare_variable(
+ comp_group,
_src,
GpuKernelArgumentInfo(input_type),
- comp_group.is_intermediate_tensor(_src),
"src");
const GpuKernelArgumentInfo::Type weight_type = _settings.export_weights_to_cl_image() ?
@@ -216,23 +215,23 @@ void ClTemplateDepthwiseConv2d::declare_variables(GpuKernelVariableTable &vtable
GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer;
vtable.declare_variable(
+ comp_group,
_weight,
GpuKernelArgumentInfo(weight_type),
- comp_group.is_intermediate_tensor(_weight),
"weight");
if(_bias != nullptr && _bias->has_valid_id()) // optional bias
{
vtable.declare_variable(
+ comp_group,
_bias,
GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Vector),
- comp_group.is_intermediate_tensor(_bias),
"bias");
}
vtable.declare_variable(
+ comp_group,
_dst,
GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- comp_group.is_intermediate_tensor(_dst),
"dst");
}
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp
index 26399c50a9..c6e14f90c5 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp
@@ -86,7 +86,6 @@ std::string ClTemplateDirectConv2d::get_component_code(const ComponentGroup &com
code += R"_(
// OUT(dst, accum) {{dst}}
-TILE({{ACC_DATA_TYPE}}, M0, N0, {{dst}});
TILE(uint, M0, 1, g_dst_indirect_y);
{
@@ -227,30 +226,30 @@ code += R"_(
void ClTemplateDirectConv2d::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
{
vtable.declare_variable(
+ comp_group,
_src,
GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- comp_group.is_intermediate_tensor(_src),
"src");
const GpuKernelArgumentInfo::Type weight_type = _settings.export_to_cl_image() ? GpuKernelArgumentInfo::Type::Tensor_4D_t_Image : GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer;
vtable.declare_variable(
+ comp_group,
_weight,
GpuKernelArgumentInfo(weight_type),
- comp_group.is_intermediate_tensor(_weight),
"weight");
if(_bias && _bias->has_valid_id()) // optional bias
{
vtable.declare_variable(
+ comp_group,
_bias,
GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Vector),
- comp_group.is_intermediate_tensor(_bias),
"bias");
}
vtable.declare_variable(
+ comp_group,
_dst,
GpuKernelArgumentInfo(common_tensor_type),
- comp_group.is_intermediate_tensor(_dst),
"dst");
}
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp
index 39cec6e31c..df8deee44f 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp
@@ -65,94 +65,94 @@ std::string ClTemplateElementwiseBinary::get_component_code(const ComponentGroup
std::string code;
const bool is_broadcast = _lhs->tensor_shape() != _rhs->tensor_shape();
const bool is_root = (comp_group.get_root_component()->id() == this->id());
+ const bool is_lhs_input = comp_group.is_input_tensor(_lhs);
+ const bool is_rhs_input = comp_group.is_input_tensor(_rhs);
- if(is_root)
- {
- code =
+ code =
R"_(
//------------------ START KERNEL {{meta_kernel_id}} ELTWISE_OP ---------------------
-)_"
- // IN_0(LHS) {{lhs}}
- // IN_1(RHS) {{rhs}}
- // OUT(dst, accum) {{dst}}
- // dst = lhs + rhs (mix-precision, broadcast, boundary aware)
-R"_(
- TILE({{DATA_TYPE}}, M0, N0, {{dst}});
- TILE(uint, M0, 1, g_dst_indirect_y);
+)_";
+
+ if(is_root)
{
- TILE({{DATA_TYPE}}, M0, N0, lhs_tile);
- TILE({{DATA_TYPE}}, M0, N0, rhs_tile);
-)_"
- // Assuming un-collapsed window
+ code +=
R"_(
- {{lhs}}_offset_first_element_in_bytes += g_ind_2 * {{lhs}}_stride_z;
- {{rhs}}_offset_first_element_in_bytes += g_ind_2 * {{rhs}}_stride_z;
-
- T_LOAD({{DATA_TYPE}}, M0, N0, BUFFER, {{lhs}}, g_ind_0, g_ind_1, 1, {{lhs}}_stride_y, lhs_tile);
- T_LOAD({{DATA_TYPE}}, {{rhs_m0}}, {{rhs_n0}}, BUFFER, {{rhs}}, {{rhs_start_ind_0}}, {{rhs_start_ind_1}}, 1, {{rhs}}_stride_y, rhs_tile);
+ TILE(uint, M0, 1, g_dst_indirect_y);
)_";
- if(is_broadcast)
- {
- code +=
+ }
+
+ if(is_lhs_input)
+ {
+ code +=
R"_(
- T_ELTWISE_BROADCAST_{{ELTWISE_OP}}_X({{DATA_TYPE}}, M0, N0, lhs_tile, rhs_tile, {{dst}});
+ TILE({{DATA_TYPE}}, M0, N0, {{lhs}});
)_";
- }
- else
- {
- code +=
+ }
+
+ if(is_rhs_input)
+ {
+ code +=
R"_(
- T_ELTWISE_{{ELTWISE_OP}}({{DATA_TYPE}}, M0, N0, lhs_tile, rhs_tile, {{dst}});
+ TILE({{DATA_TYPE}}, M0, N0, {{rhs}});
)_";
- }
+ }
+
code +=
- // Calculate the destination indirect Y
R"_(
- LOOP_UNROLLING(int, i, 0, 1, M0,
{
- g_dst_indirect_y[i].v = (uint)min(g_ind_1 + i, (int)({{out}}_w * {{out}}_h) - 1);
- g_dst_indirect_y[i].v += g_ind_2 * (int)({{out}}_w * {{out}}_h);
- })
- }
- //------------------ END KERNEL {{meta_kernel_id}} ELTWISE_OP ---------------------
)_";
- }
- else // non-root
+ if(is_lhs_input)
{
- code =
+ code +=
R"_(
- //------------------ START KERNEL {{meta_kernel_id}} ELTWISE_OP ---------------------
-)_"
- // IN_0/Out(Accumulator) {{acc}}
- // IN_1(Operand) {{operand}}
- // acc = operand + acc (mix-precision, broadcast, boundary aware)
+ {{lhs}}_offset_first_element_in_bytes += g_ind_2 * {{lhs}}_stride_z;
+ T_LOAD({{DATA_TYPE}}, {{lhs_m0}}, {{lhs_n0}}, BUFFER, {{lhs}}, {{lhs_start_ind_0}}, {{lhs_start_ind_1}}, 1, {{lhs}}_stride_y, {{lhs}});
+)_";
+ }
+
+ if(is_rhs_input)
+ {
+ code +=
R"_(
+ {{rhs}}_offset_first_element_in_bytes += g_ind_2 * {{rhs}}_stride_z;
+ T_LOAD({{DATA_TYPE}}, {{rhs_m0}}, {{rhs_n0}}, BUFFER, {{rhs}}, {{rhs_start_ind_0}}, {{rhs_start_ind_1}}, 1, {{rhs}}_stride_y, {{rhs}});
+)_";
+ }
+
+ if(is_broadcast)
+ {
+ code +=
+ R"_(
+ T_ELTWISE_BROADCAST_{{ELTWISE_OP}}_X({{DATA_TYPE}}, M0, N0, {{lhs}}, {{rhs}}, {{dst}});
+)_";
+ }
+ else
{
- TILE(DATA_TYPE, M0, N0, operand_tile);
- T_LOAD({{DATA_TYPE}}, {{rhs_m0}}, {{rhs_n0}}, BUFFER, {{operand}}, {{rhs_start_ind_0}}, {{rhs_start_ind_1}}, 1, {{operand}}_stride_y, operand_tile);
+ code +=
+ R"_(
+ T_ELTWISE_{{ELTWISE_OP}}({{DATA_TYPE}}, M0, N0, {{lhs}}, {{rhs}}, {{dst}});
)_";
+ }
- if(is_broadcast)
- {
- code +=
+ if(is_root)
+ {
+ // Calculate the destination indirect Y
+ code +=
R"_(
- T_ELTWISE_BROADCAST_{{ELTWISE_OP}}_X({{DATA_TYPE}}, M0, N0, {{acc}}, operand_tile, {{acc}});
-)_";
- }
- else
+ LOOP_UNROLLING(int, i, 0, 1, M0,
{
- code +=
-R"_(
- T_ELTWISE_{{ELTWISE_OP}}({{DATA_TYPE}}, M0, N0, {{acc}}, operand_tile, {{acc}});
+ g_dst_indirect_y[i].v = (uint)min(g_ind_1 + i, (int)({{arg_dst}}_w * {{arg_dst}}_h) - 1);
+ g_dst_indirect_y[i].v += g_ind_2 * (int)({{arg_dst}}_w * {{arg_dst}}_h);
+ })
)_";
- }
+ }
+
code +=
R"_(
}
//------------------ END KERNEL {{meta_kernel_id}} ELTWISE_OP ---------------------
)_";
- }
return code;
}
@@ -160,86 +160,105 @@ R"_(
void ClTemplateElementwiseBinary::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
{
vtable.declare_variable(
+ comp_group,
_lhs,
GpuKernelArgumentInfo(common_tensor_type),
- comp_group.is_intermediate_tensor(_lhs),
"lhs");
vtable.declare_variable(
+ comp_group,
_rhs,
GpuKernelArgumentInfo(common_tensor_type),
- comp_group.is_intermediate_tensor(_rhs),
"rhs");
vtable.declare_variable(
+ comp_group,
_dst,
GpuKernelArgumentInfo(common_tensor_type),
- comp_group.is_intermediate_tensor(_dst),
"dst");
}
TagLUT ClTemplateElementwiseBinary::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
{
TagLUT lut{};
- const ITensorInfo *accumulator = _lhs;
- const ITensorInfo *operand = _rhs;
// Local build options
lut["meta_kernel_id"] = id();
lut["DATA_TYPE"] = get_cl_type_from_data_type(_lhs->data_type());
// Arguments and global shared variables
- const bool is_root = (comp_group.get_root_component()->id() == this->id());
- if(is_root)
+
+ lut["lhs"] = vtable.get_variable(_lhs);
+ lut["rhs"] = vtable.get_variable(_rhs);
+ lut["dst"] = vtable.get_variable(_dst);
+ lut["arg_dst"] = vtable.get_variable(comp_group.get_any_dst_tensor());
+
+ switch(_attributes.operation())
{
- lut["lhs"] = vtable.get_variable(_lhs);
- lut["rhs"] = vtable.get_variable(_rhs);
- lut["dst"] = vtable.get_variable(_dst);
- lut["out"] = vtable.get_variable(comp_group.get_any_dst_tensor());
+ case Attributes::ElementwiseOp::ADD:
+ lut["ELTWISE_OP"] = "ADD";
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Arithmetic Operation not supported");
}
- else
+
+ ARM_COMPUTE_ERROR_ON(
+ comp_group.is_intermediate_tensor(_lhs) &&
+ detail::have_different_dimensions(_lhs->tensor_shape(), _dst->tensor_shape(), 0));
+ ARM_COMPUTE_ERROR_ON(
+ comp_group.is_intermediate_tensor(_rhs) &&
+ detail::have_different_dimensions(_rhs->tensor_shape(), _dst->tensor_shape(), 0));
+
+ // Set broadcast parameters
+ // PRE: All tensors are broadcast-compatible
+ if(_lhs->tensor_shape() != _dst->tensor_shape())
{
- // Determine which tensor is the accumulator
- if(comp_group.is_intermediate_tensor(_lhs))
+ const auto is_broadcast_x = _lhs->dimension(0) == 1U && _dst->dimension(0) != 1U;
+ const auto is_broadcast_y = _lhs->dimension(1) == 1U && _dst->dimension(1) != 1U;
+ const auto is_broadcast_z = _lhs->dimension(2) == 1U && _dst->dimension(2) != 1U;
+
+ // Note that n0 maps to input tensor dimension 0, m0 maps to input dimensions 1 and 2 because of our collapse strategy
+ if(is_broadcast_x && is_broadcast_y && is_broadcast_z) // Broadcast in X, Y, Z: collapsed lhs win [M0xN0] = [1x1]
{
- accumulator = _lhs;
- operand = _rhs;
+ lut["lhs_m0"] = "1";
+ lut["lhs_n0"] = "1";
+ lut["lhs_start_ind_1"] = "0";
+ lut["lhs_start_ind_0"] = "0";
}
- else if(comp_group.is_intermediate_tensor(_rhs))
+ else if(is_broadcast_y && is_broadcast_z) // Broadcast in Y and Z: collapsed lhs win [M0xN0] = [1xN]
{
- accumulator = _rhs;
- operand = _lhs;
+ lut["lhs_m0"] = "1";
+ lut["lhs_n0"] = "N0";
+ lut["lhs_start_ind_1"] = "0";
+ lut["lhs_start_ind_0"] = "g_ind_0";
}
else
{
- ARM_COMPUTE_ERROR("Invalid elementwise component linking");
+ ARM_COMPUTE_ERROR("Only support lhs broadcasting in all X, Y, Z dimensions, or just in Y and Z dimensions");
}
- lut["acc"] = vtable.get_variable(accumulator);
- lut["operand"] = vtable.get_variable(operand);
}
- switch(_attributes.operation())
+ else
{
- case Attributes::ElementwiseOp::ADD:
- lut["ELTWISE_OP"] = "ADD";
- break;
- default:
- ARM_COMPUTE_ERROR("Arithmetic Operation not supported");
+ lut["lhs_m0"] = "M0";
+ lut["lhs_n0"] = "N0";
+ lut["lhs_start_ind_1"] = "g_ind_1";
+ lut["lhs_start_ind_0"] = "g_ind_0";
}
- ARM_COMPUTE_ERROR_ON_MSG(detail::have_different_dimensions(accumulator->tensor_shape(), _dst->tensor_shape(), 0), "Only the operand can be broadcast to match the accumulator's shape");
- const bool is_broadcast = (operand->tensor_shape() != _dst->tensor_shape());
- // Set broadcast parameters
- // PRE: All tensors are broadcast-compatible
- if(is_broadcast)
+ if(_rhs->tensor_shape() != _dst->tensor_shape())
{
+ const auto is_broadcast_x = _rhs->dimension(0) == 1U && _dst->dimension(0) != 1U;
+ const auto is_broadcast_y = _rhs->dimension(1) == 1U && _dst->dimension(1) != 1U;
+ const auto is_broadcast_z = _rhs->dimension(2) == 1U && _dst->dimension(2) != 1U;
+
// Note that n0 maps to input tensor dimension 0, m0 maps to input dimensions 1 and 2 because of our collapse strategy
- if(operand->dimension(0) == 1U && operand->dimension(1) == 1U && operand->dimension(2) == 1U) // Broadcast in X, Y, Z: collapsed rhs win [M0xN0] = [1x1]
+ if(is_broadcast_x && is_broadcast_y && is_broadcast_z) // Broadcast in X, Y, Z: collapsed rhs win [M0xN0] = [1x1]
{
lut["rhs_m0"] = "1";
lut["rhs_n0"] = "1";
lut["rhs_start_ind_1"] = "0";
lut["rhs_start_ind_0"] = "0";
}
- else if(operand->dimension(1) == 1U && operand->dimension(2) == 1U) // Broadcast in Y and Z: collapsed rhs win [M0xN0] = [1xN]
+ else if(is_broadcast_y && is_broadcast_z) // Broadcast in Y and Z: collapsed rhs win [M0xN0] = [1xN]
{
lut["rhs_m0"] = "1";
lut["rhs_n0"] = "N0";
@@ -258,6 +277,7 @@ TagLUT ClTemplateElementwiseBinary::get_tag_lut(const GpuKernelVariableTable &vt
lut["rhs_start_ind_1"] = "g_ind_1";
lut["rhs_start_ind_0"] = "g_ind_0";
}
+
return lut;
}
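
The refactored get_tag_lut() computes the same window tags for both operands instead of the old accumulator/operand split, supporting broadcast either in X, Y and Z or in Y and Z only. A condensed sketch of that per-operand selection, assuming a bare three-element shape and a plain string map; names are illustrative only.

    // Sketch of the per-operand broadcast tags.
    #include <array>
    #include <cassert>
    #include <map>
    #include <string>

    using Shape = std::array<unsigned, 3>; // {x, y, z}

    // Fill the "<p>_m0"/"<p>_n0" and start-index tags for one operand.
    void set_operand_tags(std::map<std::string, std::string> &lut,
                          const std::string &p, const Shape &op, const Shape &dst)
    {
        if(op == dst)
        {
            lut[p + "_m0"]          = "M0";
            lut[p + "_n0"]          = "N0";
            lut[p + "_start_ind_1"] = "g_ind_1";
            lut[p + "_start_ind_0"] = "g_ind_0";
            return;
        }
        const bool bx = op[0] == 1U && dst[0] != 1U;
        const bool by = op[1] == 1U && dst[1] != 1U;
        const bool bz = op[2] == 1U && dst[2] != 1U;
        if(bx && by && bz) // scalar operand: collapsed window is [1x1]
        {
            lut[p + "_m0"]          = "1";
            lut[p + "_n0"]          = "1";
            lut[p + "_start_ind_1"] = "0";
            lut[p + "_start_ind_0"] = "0";
        }
        else if(by && bz) // row vector: collapsed window is [1xN0]
        {
            lut[p + "_m0"]          = "1";
            lut[p + "_n0"]          = "N0";
            lut[p + "_start_ind_1"] = "0";
            lut[p + "_start_ind_0"] = "g_ind_0";
        }
        else
        {
            assert(false && "unsupported broadcast pattern");
        }
    }

    int main()
    {
        std::map<std::string, std::string> lut;
        set_operand_tags(lut, "lhs", { 33, 3, 2 }, { 33, 3, 2 });
        set_operand_tags(lut, "rhs", { 33, 1, 1 }, { 33, 3, 2 });
        assert(lut["rhs_m0"] == "1" && lut["rhs_n0"] == "N0");
        return 0;
    }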
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.cpp
index 05bdd27f11..8f1ed95351 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.cpp
@@ -190,21 +190,21 @@ std::string ClTemplateLogits1DMaxShiftExpSum::get_component_code(const Component
void ClTemplateLogits1DMaxShiftExpSum::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
{
vtable.declare_variable(
+ comp_group,
_src,
GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- comp_group.is_intermediate_tensor(_src),
"src");
vtable.declare_variable(
+ comp_group,
_sum,
GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- comp_group.is_intermediate_tensor(_sum),
"sum");
vtable.declare_variable(
+ comp_group,
_dst,
GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- comp_group.is_intermediate_tensor(_dst),
"dst");
}
@@ -274,4 +274,4 @@ Window ClTemplateLogits1DMaxShiftExpSum::get_window() const
} // namespace dynamic_fusion
} // namespace experimental
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp
index a2c04d94e5..bcb6492b43 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp
@@ -54,7 +54,6 @@ std::string ClTemplateResize::get_component_code(const IGpuTemplateComponentWrit
std::string code = R"_(
//------------------ START KERNEL {{meta_kernel_id}} ---------------------
-TILE({{DST_DATA_TYPE}}, 1, N0, {{dst}});
TILE(uint, 1, 1, g_dst_indirect_y);
{
const int yo = g_ind_2 % {{arg_dst}}_h;
@@ -180,15 +179,15 @@ TILE(uint, 1, 1, g_dst_indirect_y);
void ClTemplateResize::declare_variables(GpuKernelVariableTable &vtable, const IGpuTemplateComponentWriter::ComponentGroup &comp_group) const
{
vtable.declare_variable(
+ comp_group,
_src,
GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- comp_group.is_intermediate_tensor(_src),
"src");
vtable.declare_variable(
+ comp_group,
_dst,
GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- comp_group.is_intermediate_tensor(_dst),
"dst");
}
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp
index ef4f2f22a1..217214ced3 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp
@@ -62,14 +62,14 @@ std::string ClTemplateStore::get_component_code(const ComponentGroup &comp_group
void ClTemplateStore::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
{
vtable.declare_variable(
+ comp_group,
_src,
GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- comp_group.is_intermediate_tensor(_src),
"src");
vtable.declare_variable(
+ comp_group,
_dst,
GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- comp_group.is_intermediate_tensor(_dst),
"dst");
}
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp
index eed481f109..2ab6316947 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp
@@ -191,6 +191,26 @@ std::string ClTemplateWriter::write_code()
code += write_global_section();
code += " //------------------ END KERNEL_BUILDER_COORDINATE ---------------------\n";
+ {
+ const auto tiles = _components.get_tiles();
+ std::stringstream tiles_ss;
+
+ tiles_ss << " //------------------ START TILE DECLARATION ---------------------\n";
+
+ for(auto tile : tiles)
+ {
+ const auto var = _vtable.get_variable(tile);
+ const auto data_type = get_cl_type_from_data_type(tile->data_type());
+ const auto var_name = var.uniq_name;
+
+ tiles_ss << " TILE(" << data_type << ", M0, N0, " << var_name << ");\n";
+ }
+
+ tiles_ss << " //------------------ END TILE DECLARATION ---------------------\n";
+
+ code += tiles_ss.str();
+ }
+
for(const auto &component_code : component_codes)
{
code += component_code;
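
write_code() now emits one central TILE(...) declaration per surviving intermediate tile, which is why the per-component templates above dropped their local TILE({{dst}}) declarations. A reduced sketch of the emission loop, assuming tiles boiled down to {cl type, unique name} pairs instead of ITensorInfo plus variable-table lookups:

    // Sketch of the central tile-declaration block.
    #include <iostream>
    #include <sstream>
    #include <string>
    #include <utility>
    #include <vector>

    int main()
    {
        const std::vector<std::pair<std::string, std::string>> tiles{
            { "half", "dst_t10" }, // {cl type, unique variable name}
        };

        std::stringstream ss;
        ss << "    //------------------ START TILE DECLARATION ---------------------\n";
        for(const auto &t : tiles)
        {
            ss << "    TILE(" << t.first << ", M0, N0, " << t.second << ");\n";
        }
        ss << "    //------------------ END TILE DECLARATION ---------------------\n";
        std::cout << ss.str();
        return 0;
    }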
diff --git a/tests/validation/dynamic_fusion/gpu/Integration.cpp b/tests/validation/dynamic_fusion/gpu/Integration.cpp
index a5716ce1e1..0a689fa4b6 100644
--- a/tests/validation/dynamic_fusion/gpu/Integration.cpp
+++ b/tests/validation/dynamic_fusion/gpu/Integration.cpp
@@ -26,9 +26,11 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h"
#include "arm_compute/dynamic_fusion/sketch/OperatorAttributes.h"
+#include "arm_compute/dynamic_fusion/sketch/attributes/CastAttributes.h"
#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h"
#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuCast.h"
#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h"
#include "tests/CL/CLAccessor.h"
@@ -38,6 +40,7 @@
#include "tests/validation/reference/ConvolutionLayer.h"
#include "tests/validation/reference/Permute.h"
#include "tests/validation/reference/ElementwiseOperations.h"
+#include "tests/validation/reference/DepthConvertLayer.h"
using namespace arm_compute::experimental::dynamic_fusion;
using namespace arm_compute::test::validation::utils;
@@ -148,7 +151,7 @@ TEST_CASE(Add_Output_Add_Output, framework::DatasetMode::ALL)
CLScheduler::get().default_reinit();
const auto data_type = DataType::F32;
- const auto t_input_shape = TensorShape(8, 2, 1);
+ const auto t_input_shape = TensorShape(33, 3, 2);
// Create a new workload sketch
auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
@@ -238,6 +241,118 @@ TEST_CASE(Add_Output_Add_Output, framework::DatasetMode::ALL)
validate(CLAccessor(t_out_0), ref_t_out_0, tolerance_f32);
validate(CLAccessor(t_out_1), ref_t_out_1, tolerance_f32);
}
+TEST_CASE(Add_Output_Add_Cast_Cast_Output, framework::DatasetMode::ALL)
+{
+ /* Computation:
+ * out_0 = in_0 + in_1
+ * out_1 = float(int32_t(out_0 + in_2))
+ */
+ CLScheduler::get().default_reinit();
+
+ const auto data_type = DataType::F32;
+ const auto t_input_shape = TensorShape(3, 8, 5);
+
+ // Create a new workload sketch
+ auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+ auto gpu_ctx = GpuWorkloadContext{ &cl_compile_ctx };
+ GpuWorkloadSketch sketch{ &gpu_ctx };
+
+ auto in_0_info = sketch.create_tensor_info(t_input_shape, 1, data_type);
+ auto in_1_info = sketch.create_tensor_info(t_input_shape, 1, data_type);
+ auto in_2_info = sketch.create_tensor_info(t_input_shape, 1, data_type);
+
+ auto out_0_info = sketch.create_tensor_info();
+ auto out_1_info = sketch.create_tensor_info();
+
+ auto ans_0_info = sketch.create_tensor_info();
+ auto ans_1_info = sketch.create_tensor_info();
+ auto ans_2_info = sketch.create_tensor_info();
+ auto ans_3_info = sketch.create_tensor_info();
+
+ CastAttributes cast_0_attr;
+ cast_0_attr.data_type(DataType::S32).convert_policy(ConvertPolicy::SATURATE);
+
+ CastAttributes cast_1_attr;
+ cast_1_attr.data_type(DataType::F32).convert_policy(ConvertPolicy::SATURATE);
+
+ GpuAdd::create_op(sketch, &in_0_info, &in_1_info, &ans_0_info);
+ GpuOutput::create_op(sketch, &ans_0_info, &out_0_info);
+ GpuAdd::create_op(sketch, &ans_0_info, &in_2_info, &ans_1_info);
+ GpuCast::create_op(sketch, &ans_1_info, &ans_2_info, cast_0_attr);
+ GpuCast::create_op(sketch, &ans_2_info, &ans_3_info, cast_1_attr);
+ GpuOutput::create_op(sketch, &ans_3_info, &out_1_info);
+
+ // Configure runtime
+ ClWorkloadRuntime runtime;
+ runtime.configure(sketch);
+
+ // (Important) Allocate auxiliary tensor memory if there are any
+ // Instead of using ACL allocated memory, the user can choose to import memory into the tensors
+ for(auto &data : runtime.get_auxiliary_tensors())
+ {
+ CLTensor *tensor = data.first;
+ AuxMemoryInfo aux_mem_req = data.second;
+ tensor->allocator()->init(*data.first->info(), aux_mem_req.alignment);
+ tensor->allocator()->allocate(); // Use ACL allocated memory
+ // auto buf = cl::Buffer();
+ // tensor->allocator()->import_memory(buf); // Or, import external memory
+ }
+
+ // Construct user tensors
+ CLTensor t_in_0{};
+ CLTensor t_in_1{};
+ CLTensor t_in_2{};
+
+ CLTensor t_out_0{};
+ CLTensor t_out_1{};
+
+ // Initialize user tensors
+ t_in_0.allocator()->init(in_0_info);
+ t_in_1.allocator()->init(in_1_info);
+ t_in_2.allocator()->init(in_2_info);
+
+ t_out_0.allocator()->init(out_0_info);
+ t_out_1.allocator()->init(out_1_info);
+
+ // Allocate and fill user tensors
+ // Instead of using ACL allocator, the user can choose to import memory into the tensors
+ t_in_0.allocator()->allocate();
+ t_in_1.allocator()->allocate();
+ t_in_2.allocator()->allocate();
+
+ t_out_0.allocator()->allocate();
+ t_out_1.allocator()->allocate();
+
+ fill<float>(CLAccessor(t_in_0), 0, library.get());
+ fill<float>(CLAccessor(t_in_1), 1, library.get());
+ fill<float>(CLAccessor(t_in_2), 2, library.get());
+
+ // Run runtime
+ runtime.run({ &t_in_0, &t_in_1, &t_in_2, &t_out_0, &t_out_1 });
+
+ // Create reference
+ SimpleTensor<float> ref_t_in_0{ t_input_shape, data_type, 1, QuantizationInfo() };
+ SimpleTensor<float> ref_t_in_1{ t_input_shape, data_type, 1, QuantizationInfo() };
+ SimpleTensor<float> ref_t_in_2{ t_input_shape, data_type, 1, QuantizationInfo() };
+
+ SimpleTensor<float> ref_t_out_0{ t_input_shape, data_type, 1, QuantizationInfo() };
+ SimpleTensor<float> ref_t_ans_1{ t_input_shape, data_type, 1, QuantizationInfo() };
+
+ // Fill reference
+ fill<float>(ref_t_in_0, 0, library.get());
+ fill<float>(ref_t_in_1, 1, library.get());
+ fill<float>(ref_t_in_2, 2, library.get());
+
+ reference::arithmetic_operation(ArithmeticOperation::ADD, ref_t_in_0, ref_t_in_1, ref_t_out_0, ConvertPolicy::WRAP);
+ reference::arithmetic_operation(ArithmeticOperation::ADD, ref_t_out_0, ref_t_in_2, ref_t_ans_1, ConvertPolicy::WRAP);
+ const auto ref_t_ans_2 = reference::depth_convert<float, int32_t>(ref_t_ans_1, DataType::S32, ConvertPolicy::SATURATE, 0);
+ const auto ref_t_out_1 = reference::depth_convert<int32_t, float>(ref_t_ans_2, DataType::F32, ConvertPolicy::SATURATE, 0);
+
+ RelativeTolerance<float> tolerance_add_f32(0.001f);
+ AbsoluteTolerance<float> tolerance_cast_f32(1.0f);
+ validate(CLAccessor(t_out_0), ref_t_out_0, tolerance_add_f32);
+ validate(CLAccessor(t_out_1), ref_t_out_1, tolerance_cast_f32);
+}
TEST_SUITE(Invalid_Fusion_Should_Fail)
TEST_CASE(Multiple_Complex_Ops_0, framework::DatasetMode::ALL)
{