path: root/src/dynamic_fusion/sketch/gpu
author      SiCong Li <sicong.li@arm.com>  2022-08-29 18:25:51 +0100
committer   SiCong Li <sicong.li@arm.com>  2022-11-01 10:38:21 +0000
commit      f44bbc5c697de841dce97c0f2fa39bae391a8174 (patch)
tree        56468ef833726318e545043f4abcd16ad3775094 /src/dynamic_fusion/sketch/gpu
parent      3394f3e3df7fd2d924c41822a8564493fc06473a (diff)
download    ComputeLibrary-f44bbc5c697de841dce97c0f2fa39bae391a8174.tar.gz
Rewrite dynamic fusion
The new version introduces the following major changes:

* Change public interface to simplify and standardize the user experience
  - Use the term "Workload" uniformly
  - Simplify operator interface to be a set of static methods: validate_op(), create_op()
* Separate the kernel writing into its own component (template_writer). This is to allow the
  co-development of GpuKernelWriter, and to allow easy replacement once GpuKernelWriter is mature.
* Optimize the core fusion algorithm used by the component graph. The details can be found in
  GpuKernelComponentGraph::fuse()
* Use Gpu instead of Cl prefixes for most of the Workload interfaces (except for runtime and
  kernel components, which have to be language specific). This allows the potential extension
  to other Gpu languages in the future.
* Refactor the runtime memory interface so that auxiliary tensor handling is separate from the
  user tensor passing. This is because the former is less stable and may require extension in
  the future.
* Hide the source code object from the user as it is not required at the moment
* Deprecate the old prototype entirely by disabling it in the SCons build

Resolves COMPMID-5510, COMPMID-5512, COMPMID-5513

Change-Id: If69d2362856f2de4503546b7b6cf48a525cf3079
Signed-off-by: SiCong Li <sicong.li@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8406
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Reviewed-by: Jakub Sujak <jakub.sujak@arm.com>
Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
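The reworked operator interface can be pictured with a short usage sketch. The exact signatures of validate_op() and create_op() are not shown in this excerpt, so the parameter lists below (sketch, tensor infos, attributes) are assumptions for illustration only:

    // Hypothetical usage of the new workload-centric interface (parameter lists are assumed)
    using namespace arm_compute::experimental::dynamic_fusion;

    GpuWorkloadSketch sketch{ &context }; // context: a GpuWorkloadContext set up beforehand

    // Operators are now sets of static methods: validate first, then record into the sketch
    const arm_compute::Status status = GpuConv2d::validate_op(sketch, &src_info, &wei_info, &bia_info, &dst_info, conv2d_attributes);
    if(status.error_code() == arm_compute::ErrorCode::OK)
    {
        GpuConv2d::create_op(sketch, &src_info, &wei_info, &bia_info, &dst_info, conv2d_attributes);
    }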
Diffstat (limited to 'src/dynamic_fusion/sketch/gpu')
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuComponentServices.h  54
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuKernelArgument.cpp  37
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h  128
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp  125
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h  104
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp  291
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h  143
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp  69
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h  88
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h  126
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp  87
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.h  77
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp  172
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h  111
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuOperatorProperties.h  54
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp  55
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp  76
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h  111
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h  252
-rw-r--r--  src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h  66
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h  64
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h  119
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/Types.h  52
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp  152
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h  147
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp  57
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h  102
-rw-r--r--  src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp  255
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp  109
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h  135
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h  137
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp  400
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h  113
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp  113
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h  85
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp  297
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.h  92
37 files changed, 4655 insertions, 0 deletions
diff --git a/src/dynamic_fusion/sketch/gpu/GpuComponentServices.h b/src/dynamic_fusion/sketch/gpu/GpuComponentServices.h
new file mode 100644
index 0000000000..93881508bb
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuComponentServices.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTSERVICES
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTSERVICES
+
+#include "src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Services that are used throughout the creation phase of workload code
+ */
+class GpuComponentServices
+{
+public:
+ /** Default constructor */
+ GpuComponentServices() = default;
+ /** Get reference to component factory */
+ GpuKernelComponentFactory &component_factory()
+ {
+ return _comp_factory;
+ }
+
+private:
+ GpuKernelComponentFactory _comp_factory{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTSERVICES */
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.cpp
new file mode 100644
index 0000000000..9cecfc2ffd
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+bool operator==(const GpuKernelArgumentInfo &info0, const GpuKernelArgumentInfo &info1)
+{
+ return info0.type == info1.type;
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h b/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h
new file mode 100644
index 0000000000..eb36e91d48
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELARGUMENT
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELARGUMENT
+
+#include "arm_compute/core/TensorInfo.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Contain information required to set up a kernel argument at run time
+ */
+struct GpuKernelArgumentInfo
+{
+ /** Enumerate all the tensor arguments variants used by all kernel implementations. */
+ enum class Type : int
+ {
+ Scalar,
+
+ Vector,
+
+ Image,
+ Image_Reinterpret_As_3D,
+ Image_Export_To_ClImage2D,
+
+ Image_3D, // 3D Tensor represented as a 2D Image + stride_z
+ Image_3D_Export_To_ClImage2D,
+
+ Tensor_3D,
+ Tensor_4D,
+ Tensor_4D_t_Buffer,
+ Tensor_4D_t_Image
+ };
+ /** Default constructor */
+ GpuKernelArgumentInfo() = default;
+ /** Constructor */
+ GpuKernelArgumentInfo(Type type)
+ : type{ type }
+ {
+ }
+ Type type{ Type::Tensor_4D_t_Buffer };
+};
+
+bool operator==(const GpuKernelArgumentInfo &info0, const GpuKernelArgumentInfo &info1);
+
+/** Kernel argument information linked with its corresponding @ref ITensorInfo
+ */
+class GpuKernelArgument
+{
+public:
+ /** Constructor
+ *
+ * @param[in] tensor_info Associated @ref ITensorInfo
+ * @param[in] kernel_arg_info Associated @ref GpuKernelArgumentInfo
+ */
+ GpuKernelArgument(const ITensorInfo &tensor_info,
+ const GpuKernelArgumentInfo &kernel_arg_info)
+ : _tensor_info{ tensor_info },
+ _kernel_arg_info{ kernel_arg_info }
+ {
+ }
+ /** Get workload tensor id */
+ ITensorInfo::Id id() const
+ {
+ return _tensor_info.id();
+ }
+ /** Get associated @ref ITensorInfo */
+ ITensorInfo *tensor_info()
+ {
+ return &_tensor_info;
+ }
+ /** Get associated @ref ITensorInfo */
+ const ITensorInfo *tensor_info() const
+ {
+ return &_tensor_info;
+ }
+ /** Get associated @ref GpuKernelArgumentInfo */
+ GpuKernelArgumentInfo *kernel_argument_info()
+ {
+ return &_kernel_arg_info;
+ }
+ /** Get associated @ref GpuKernelArgumentInfo */
+ const GpuKernelArgumentInfo *kernel_argument_info() const
+ {
+ return &_kernel_arg_info;
+ }
+ /** Check if the associated workload tensor has valid id
+ *
+ * @return true if has valid id
+ * @return false otherwise
+ */
+ bool has_valid_id() const
+ {
+ return _tensor_info.has_valid_id();
+ }
+
+private:
+ TensorInfo _tensor_info{};
+ GpuKernelArgumentInfo _kernel_arg_info{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELARGUMENT */
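A GpuKernelArgument simply couples a copy of the tensor's TensorInfo with the argument variant the kernel expects. A minimal sketch, in which the tensor shape and data type are placeholders:

    #include "arm_compute/core/TensorInfo.h"

    // Placeholder tensor metadata for illustration
    const arm_compute::TensorInfo src_info(arm_compute::TensorShape(32U, 16U, 8U), 1, arm_compute::DataType::F32);
    GpuKernelArgument arg{ src_info, GpuKernelArgumentInfo{ GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer } };
    // arg.id() returns the workload tensor id copied from src_info;
    // arg.kernel_argument_info()->type selects how the argument is bound at run time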
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp
new file mode 100644
index 0000000000..6e6422c957
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "GpuKernelComponentGraph.h"
+
+#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+/** Automatically create memory descriptors for all tensors in the graph
+ *
+ * @param[in] tensors @ref ITensorInfo map
+ * @param[in] graph @ref DependencyGraph of which the @p tensors are a part
+ *
+ * @return MemoryDescriptorMap An assignment map of @ref MemoryDescriptors for each ITensorInfo in the graph
+ */
+MemoryDescriptorMap assign_memory_descriptors(const std::map<ITensorInfo::Id, const ITensorInfo *> tensors, const DependencyGraph &graph)
+{
+ MemoryDescriptorMap mem_map{};
+ for(auto t_id : graph.all_tensors())
+ {
+ const auto &tensor = tensors.at(t_id);
+ // Only the global src and dst tensors of the entire component graph are "User" tensors, i.e. user-specified memories
+ if(is_in(t_id, graph.global_src_tensors()) || is_in(t_id, graph.global_dst_tensors()))
+ {
+ mem_map[t_id] = MemoryDescriptor{ MemoryType::User };
+ }
+ else
+ {
+ AuxMemoryInfo aux_mem_info{ tensor->total_size() };
+ mem_map[t_id] = MemoryDescriptor{ MemoryType::Auxiliary, aux_mem_info };
+ }
+ }
+ return mem_map;
+}
+
+} // namespace
+
+std::vector<DependencyGraph::TensorId> GpuKernelComponentGraph::get_tensor_ids(const std::vector<const ITensorInfo *> tensors)
+{
+ std::vector<DependencyGraph::TensorId> tensor_ids{};
+ std::transform(
+ std::begin(tensors), std::end(tensors),
+ std::back_inserter(tensor_ids),
+ [](const auto & t)
+ {
+ return t->id();
+ });
+ return tensor_ids;
+}
+
+GpuKernelComponentGraph::GpuKernelComponentGraph(GpuComponentServices *services)
+ : _services{ services }, _components{}, _tensors{}, _dependency_graph{}
+{
+}
+
+GpuKernelComponentStream GpuKernelComponentGraph::fuse() const
+{
+ // Obtain memory descriptor map
+ const auto mem_map = assign_memory_descriptors(_tensors, _dependency_graph);
+ /// @note Fusion constraints (for kernel components) are exactly the same as the invariants of @ref GpuKernelComponentGroup
+ /// Fusion can be framed as a mathematical optimization problem:
+ /// Given fusion constraints, find the "best" fusion patterns possible
+ /// "Best" is ill-defined at the moment. For now we define "best" fusion pattern as one
+ /// which results in the least number of fused kernels ( @ref GpuKernelComponentGroup ) at the end
+
+ /// As the first iteration, we offer a sub-optimal algorithm here which ensures all
+ /// constraints are met, but provides no guarantee that the fusion pattern is optimal
+
+ GpuKernelComponentStream stream{ _services, mem_map };
+ // Break down into linear groups of components (constraint 1), preserving topological order
+ const auto linear_graphs = _dependency_graph.topological_partition();
+
+ // Further divide up the linear groups based on rest of the fusion constraints (rely on component group's invariants)
+ for(const auto &graph : linear_graphs)
+ {
+ for(unsigned int i = 0; i < graph.size(); ++i)
+ {
+ const auto comp = _components.at(graph[i].op).get();
+ // Each new linear graph signals a new component group in the stream
+ if(i == 0)
+ {
+ stream.new_component_group();
+ }
+ // If the component violates the group's invariant / fusion constraint, break up the stream by inserting a new group
+ bool success = stream.add_component(comp);
+ if(!success)
+ {
+ stream.new_component_group();
+ success = stream.add_component(comp);
+ ARM_COMPUTE_ERROR_ON(!success);
+ }
+ }
+ }
+ return stream;
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
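Abstracted away from the component types, the greedy serialization in fuse() reduces to the following self-contained sketch; Group and Item are placeholders for GpuKernelComponentGroup and component pointers, and Group is assumed to expose bool try_add(Item) enforcing the fusion constraints:

    #include <vector>

    // Illustrative distillation of the grouping loop in GpuKernelComponentGraph::fuse()
    template <typename Group, typename Item>
    std::vector<Group> greedy_group(const std::vector<std::vector<Item>> &linear_sequences)
    {
        std::vector<Group> groups{};
        for(const auto &seq : linear_sequences)
        {
            groups.emplace_back(); // Each linear sequence starts a fresh group (constraint 1)
            for(const auto &item : seq)
            {
                if(!groups.back().try_add(item)) // Rejected: the group invariants would break
                {
                    groups.emplace_back();       // Start a new group and retry
                    groups.back().try_add(item); // Expected to succeed on an empty group (the real code asserts this)
                }
            }
        }
        return groups;
    }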
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h
new file mode 100644
index 0000000000..fbcb2c10ea
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGRAPH
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGRAPH
+
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuComponentServices.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h"
+#include "src/dynamic_fusion/sketch/utils/DependencyGraph.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class IGpuKernelComponent;
+
+/** A multi-input (tensors), multi-output (tensors) acyclic directed graph of gpu kernel components
+ * Its main purposes are:
+ * - Perform "graph-level" optimizations like fusion of kernel components (not the fusion of operators)
+ * - Automatically assign memory descriptors ( @ref MemoryDescriptor ) to all tensors based on the graph topology
+ */
+class GpuKernelComponentGraph
+{
+public:
+ /** Constructor
+ *
+ * @param[in] services @ref GpuComponentServices to be used by the graph
+ */
+ GpuKernelComponentGraph(GpuComponentServices *services);
+ /** Prevent instances of this class from being copy constructed */
+ GpuKernelComponentGraph(const GpuKernelComponentGraph &graph) = delete;
+ /** Prevent instances of this class from being copied */
+ GpuKernelComponentGraph &operator=(const GpuKernelComponentGraph &graph) = delete;
+ /** Allow instances of this class to be move constructed */
+ GpuKernelComponentGraph(GpuKernelComponentGraph &&graph) = default;
+ /** Allow instances of this class to be moved */
+ GpuKernelComponentGraph &operator=(GpuKernelComponentGraph &&graph) = default;
+ /** Create a new component and add it to the component graph
+ * Component id is automatically allocated
+ *
+ * @tparam T Component type
+ * @tparam Args Component argument types
+ *
+ * @param[in] args Component arguments except for component id, which is auto-allocated
+ */
+ template <typename T, typename... Args>
+ void add_new_component(Args &&... args)
+ {
+ auto comp = _services->component_factory().create<T>(std::forward<Args>(args)...);
+ ArgumentPack<ITensorInfo> tensors = comp->tensors();
+ const auto src_tensor_ids = get_tensor_ids(tensors.get_const_src_tensors());
+ const auto dst_tensor_ids = get_tensor_ids(tensors.get_const_dst_tensors());
+ bool success = _dependency_graph.add_operator(comp->id(), src_tensor_ids, dst_tensor_ids);
+ ARM_COMPUTE_ERROR_ON(!success);
+ _components[comp->id()] = std::move(comp);
+ for(auto t : tensors.get_const_src_tensors())
+ {
+ _tensors[t->id()] = t;
+ }
+ for(auto t : tensors.get_const_dst_tensors())
+ {
+ _tensors[t->id()] = t;
+ }
+ }
+ /** Perform component fusion and serialize the graph into a stream of component groups
+ */
+ GpuKernelComponentStream fuse() const;
+
+private:
+ static std::vector<DependencyGraph::TensorId> get_tensor_ids(const std::vector<const ITensorInfo *> tensors);
+ GpuComponentServices *_services;
+ std::map<ComponentId, std::unique_ptr<IGpuKernelComponent>> _components;
+ std::map<ITensorInfo::Id, const ITensorInfo *> _tensors;
+ DependencyGraph _dependency_graph{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGRAPH */
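Putting add_new_component() and fuse() together, a hedged sketch of how a graph might be populated; the component's trailing constructor arguments (properties, attributes, settings) and the tensor infos are assumptions for illustration:

    // Hypothetical graph construction (the component's argument list is assumed)
    GpuComponentServices services{};
    GpuKernelComponentGraph graph{ &services };

    ArgumentPack<ITensorInfo> tensors{};
    tensors.add_const_tensor(ACL_SRC_0, &src_info);
    tensors.add_const_tensor(ACL_SRC_1, &wei_info);
    tensors.add_const_tensor(ACL_DST_0, &dst_info);

    graph.add_new_component<ClComponentDirectConv2d>(properties, tensors, attributes, settings);

    // Fusion serializes the graph into a stream of component groups
    GpuKernelComponentStream stream = graph.fuse();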
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp
new file mode 100644
index 0000000000..3af4c1429d
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp
@@ -0,0 +1,291 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "GpuKernelComponentGroup.h"
+
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+bool GpuKernelComponentGroup::add_component(ComponentPtr component)
+{
+ // note: Constraint 1 is guaranteed as a precondition
+ // Constraint 2
+ if(component->type() != GpuComponentType::Output && _components.size() >= max_fused_components)
+ {
+ return false;
+ }
+ // Constraint 3.1: Pattern: (Unfusable + Output)
+ if(!_components.empty() && get_root_component()->type() == GpuComponentType::Unfusable && component->type() != GpuComponentType::Output)
+ {
+ return false;
+ }
+ // Constraint 3.2
+ if(!_components.empty() && (component->type() != GpuComponentType::Simple && component->type() != GpuComponentType::Output))
+ {
+ return false;
+ }
+ // Constraint 3.3: Disallow multiple output components
+ if(!_components.empty() && get_last_component()->type() == GpuComponentType::Output && component->type() == GpuComponentType::Output)
+ {
+ return false;
+ }
+ // Constraint 4
+ if(component->type() != GpuComponentType::Unfusable && component->tensors().get_const_dst_tensors().size() != 1U)
+ {
+ return false;
+ }
+ // Constraint 5
+ if(!_components.empty() && !(get_root_component()->properties() == component->properties()))
+ {
+ return false;
+ }
+ // Constraint 7
+ if(!_components.empty())
+ {
+ const auto root_dst_tensors = get_root_component()->tensors().get_const_dst_tensors();
+ ARM_COMPUTE_ERROR_ON(root_dst_tensors.empty());
+ const auto first_dst_tensor = root_dst_tensors[0];
+ const auto dst_tensors = component->tensors().get_const_dst_tensors();
+ for(const auto &t : root_dst_tensors)
+ {
+ if(detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0))
+ {
+ return false;
+ }
+ }
+ for(const auto &t : dst_tensors)
+ {
+ if(detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0))
+ {
+ return false;
+ }
+ }
+ }
+ // Constraint 8
+ if(!_components.empty())
+ {
+ const auto root_dst_tensors = get_root_component()->tensors().get_const_dst_tensors();
+ ARM_COMPUTE_ERROR_ON(root_dst_tensors.empty());
+ const auto first_dst_tensor_layout = root_dst_tensors[0]->data_layout();
+ const auto dst_tensors = component->tensors().get_const_dst_tensors();
+ for(const auto &t : root_dst_tensors)
+ {
+ if(t->data_layout() != first_dst_tensor_layout)
+ {
+ return false;
+ }
+ }
+ for(const auto &t : dst_tensors)
+ {
+ if(t->data_layout() != first_dst_tensor_layout)
+ {
+ return false;
+ }
+ }
+ }
+ // Constraint 9
+ if(component->tensors().get_const_dst_tensors().size() >= max_dst_tensors)
+ {
+ return false;
+ }
+ // Constraint 9 corollary
+ if(component->type() == GpuComponentType::Output && _components.size() >= max_fused_components + max_dst_tensors)
+ {
+ return false;
+ }
+ _components.push_back(component);
+ return true;
+}
+
+std::vector<const ITensorInfo *> GpuKernelComponentGroup::get_src_tensors() const
+{
+ if(_components.empty())
+ {
+ return {};
+ }
+ auto src_tensors = _components[0]->tensors().get_const_src_tensors();
+ auto prev_dst_tensor = _components[0]->tensors().get_const_dst_tensors()[0]; // PRE: Only one dst tensor per component
+ for(unsigned int i = 1; i < _components.size(); ++i)
+ {
+ auto cur_src_tensors = _components[i]->tensors().get_const_src_tensors();
+ for(const auto src_tensor : cur_src_tensors)
+ {
+ if(src_tensor->id() == prev_dst_tensor->id())
+ {
+ continue; // Skip "intermediate" tensors. I.e. tensors that are used to link between two components
+ }
+ src_tensors.push_back(src_tensor);
+ }
+ prev_dst_tensor = _components[i]->tensors().get_const_dst_tensors()[0]; // PRE: Only one dst tensor per component
+ }
+
+ return src_tensors;
+}
+
+std::vector<const ITensorInfo *> GpuKernelComponentGroup::get_dst_tensors() const
+{
+ if(_components.empty())
+ {
+ return {};
+ }
+ const auto dst_tensor_ptrs = _components[_components.size() - 1]->tensors().get_const_dst_tensors();
+ std::vector<const ITensorInfo *> dst_tensors;
+ for(auto tensor_ptr : dst_tensor_ptrs)
+ {
+ dst_tensors.push_back(tensor_ptr);
+ }
+ return dst_tensors;
+}
+
+std::vector<const ITensorInfo *> GpuKernelComponentGroup::get_argument_tensors() const
+{
+ std::vector<const ITensorInfo *> arguments;
+ const auto src_tensors = get_src_tensors();
+ const auto dst_tensors = get_dst_tensors();
+ arguments.reserve(src_tensors.size() + dst_tensors.size());
+ arguments.insert(arguments.end(), src_tensors.begin(), src_tensors.end());
+ arguments.insert(arguments.end(), dst_tensors.begin(), dst_tensors.end());
+ return arguments;
+}
+
+GpuKernelComponentGroup::ComponentPtr GpuKernelComponentGroup::get_root_component() const
+{
+ if(empty())
+ {
+ return nullptr;
+ }
+ return _components[0];
+}
+
+GpuKernelComponentGroup::ComponentPtr GpuKernelComponentGroup::get_last_component() const
+{
+ if(empty())
+ {
+ return nullptr;
+ }
+ return _components[_components.size() - 1];
+}
+
+GpuKernelComponentGroup::ComponentPtr GpuKernelComponentGroup::get_previous_component(ComponentId id) const
+{
+ if(empty())
+ {
+ return nullptr;
+ }
+ // Get the index of the requested component
+ size_t ind = 0;
+ for(const auto c : _components)
+ {
+ if(c->id() == id)
+ {
+ break;
+ }
+ ind++;
+ }
+ if(ind == 0 || ind >= _components.size())
+ {
+ return nullptr;
+ }
+ return _components[ind - 1];
+}
+
+bool GpuKernelComponentGroup::is_intermediate_tensor(const ITensorInfo *tensor) const
+{
+ return is_tensor_in(tensor, get_interm_tensors());
+}
+
+size_t GpuKernelComponentGroup::size() const
+{
+ return _components.size();
+}
+bool GpuKernelComponentGroup::empty() const
+{
+ return _components.empty();
+}
+GpuKernelComponentGroup::ComponentPtr &GpuKernelComponentGroup::operator[](size_t index)
+{
+ return _components[index];
+}
+const GpuKernelComponentGroup::ComponentPtr &GpuKernelComponentGroup::operator[](size_t index) const
+{
+ return _components[index];
+}
+typename std::vector<GpuKernelComponentGroup::ComponentPtr>::iterator GpuKernelComponentGroup::begin()
+{
+ return _components.begin();
+}
+typename std::vector<GpuKernelComponentGroup::ComponentPtr>::iterator GpuKernelComponentGroup::end()
+{
+ return _components.end();
+}
+typename std::vector<GpuKernelComponentGroup::ComponentPtr>::const_iterator GpuKernelComponentGroup::begin() const
+{
+ return _components.cbegin();
+}
+typename std::vector<GpuKernelComponentGroup::ComponentPtr>::const_iterator GpuKernelComponentGroup::end() const
+{
+ return _components.cend();
+}
+typename std::vector<GpuKernelComponentGroup::ComponentPtr>::const_iterator GpuKernelComponentGroup::cbegin() const
+{
+ return _components.cbegin();
+}
+typename std::vector<GpuKernelComponentGroup::ComponentPtr>::const_iterator GpuKernelComponentGroup::cend() const
+{
+ return _components.cend();
+}
+
+std::vector<const ITensorInfo *> GpuKernelComponentGroup::get_interm_tensors() const
+{
+ std::vector<const ITensorInfo *> interm_tensors{};
+ for(unsigned int i = 0; i + 1 < _components.size(); ++i)
+ {
+ auto interm_tensor = _components[i]->tensors().get_const_dst_tensors()[0];
+ interm_tensors.push_back(interm_tensor); // PRE: Only one dst tensor per component
+ }
+
+ return interm_tensors;
+}
+
+bool GpuKernelComponentGroup::is_tensor_in(const ITensorInfo *tensor, const std::vector<const ITensorInfo *> tensors)
+{
+ for(auto t : tensors)
+ {
+ if(tensor->id() == t->id())
+ {
+ return true;
+ }
+ }
+ return false;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h
new file mode 100644
index 0000000000..4c9d940594
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGROUP
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGROUP
+
+#include "components/Types.h"
+
+#include <cstdint>
+#include <cstdlib>
+#include <vector>
+
+namespace arm_compute
+{
+/** Forward declaration */
+class ITensorInfo;
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class IGpuKernelComponent;
+/** A group of gpu kernel components to be fused together
+ * PRECONDITIONS:
+ * 1. Fusion is limited to a linear sequence of kernel components
+ * INVARIANTS:
+ * @note These preconditions and invariants are exactly the same as the fusion constraints for kernel components
+ * 2. The max number of components that can be fused is @ref GpuKernelComponentGroup::max_fused_components (
+ * excluding any output or input (if any) components.
+ * The max number of output components is bounded by the maximum number of dst tensors allowed for a component / component group
+ * )
+ * 3. The fusion is subject to the pattern: (Complex + Simple * | Simple + Simple * | Un-fusable) + Output?
+ * 4. All components except unfusable ones have exactly 1 dst tensor
+ * 5. All fused components share the same @ref IGpuKernelComponent::Properties ( @ref UnitWorkloadStage etc. )
+ * 6. All fused components share the same tunable parameters like tile size
+ * 7. All fused components share the same dst tensor shape
+ * 8. All fused components' tensors share the same @ref DataLayout
+ * 9. The maximum number of dst tensors allowed for a component (including unfusable ones) / component group is @ref GpuKernelComponentGroup::max_dst_tensors
+ * This has an impact on the total number of components supported, which = max_fused_components + max_dst_tensors
+ */
+class GpuKernelComponentGroup
+{
+public:
+ using ComponentPtr = IGpuKernelComponent *;
+ /** Maximum number of components that can be fused into the same component group
+ */
+ static constexpr size_t max_fused_components = 64;
+ /** Maximum number of dst tensors allowed for a component / component group
+ */
+ static constexpr size_t max_dst_tensors = 8;
+
+public:
+ /** Default constructor */
+ GpuKernelComponentGroup() = default;
+ /** Allow instances of this class to be copy constructed */
+ GpuKernelComponentGroup(const GpuKernelComponentGroup &) = default;
+ /** Allow instances of this class to be copied */
+ GpuKernelComponentGroup &operator=(const GpuKernelComponentGroup &) = default;
+ /** Allow instances of this class to be move constructed */
+ GpuKernelComponentGroup(GpuKernelComponentGroup &&) = default;
+ /** Allow instances of this class to be moved */
+ GpuKernelComponentGroup &operator=(GpuKernelComponentGroup &&) = default;
+ /** Add a component pointer into the group
+ * If the operation fails, then no change is made to the group
+ *
+ * @param[in] component Pointer to the component to be added
+ *
+ * @return true If the operation is successful
+ * @return false If the operation fails
+ */
+ bool add_component(ComponentPtr component);
+ /** Get source tensors of this group */
+ std::vector<const ITensorInfo *> get_src_tensors() const;
+ /** Get destination tensors of this group */
+ std::vector<const ITensorInfo *> get_dst_tensors() const;
+ /** Get the tensor arguments of this group
+ * A tensor is an argument if it is a source or destination tensor of the group
+ */
+ std::vector<const ITensorInfo *> get_argument_tensors() const;
+ /** Get the root (first) component of this group */
+ ComponentPtr get_root_component() const;
+ /** Get the last component of this group */
+ ComponentPtr get_last_component() const;
+ /** Get the previous component to the component with id @p id
+ *
+ * @param[in] id Component id of the component whose previous component is of concern
+ *
+ * @return ComponentPtr Pointer to the previous component of the one identified by @p id
+ */
+ ComponentPtr get_previous_component(ComponentId id) const;
+ /** Check if a @ref ITensorInfo is an "intermediate" tensor of the group
+ *
+ * An intermediate tensor is any tensor that is not an argument.
+ *
+ * @param[in] tensor @ref ITensorInfo to be looked up
+ *
+ * @return true If @p tensor is an intermediate tensor
+ * @return false Otherwise
+ */
+ bool is_intermediate_tensor(const ITensorInfo *tensor) const;
+ /** Get the number of components within the group */
+ size_t size() const;
+ /** Check if the component group is empty */
+ bool empty() const;
+ ComponentPtr &operator[](size_t index);
+ const ComponentPtr &operator[](size_t index) const;
+ typename std::vector<ComponentPtr>::iterator begin();
+ typename std::vector<ComponentPtr>::iterator end();
+ typename std::vector<ComponentPtr>::const_iterator begin() const;
+ typename std::vector<ComponentPtr>::const_iterator end() const;
+ typename std::vector<ComponentPtr>::const_iterator cbegin() const;
+ typename std::vector<ComponentPtr>::const_iterator cend() const;
+
+private:
+ std::vector<const ITensorInfo *> get_interm_tensors() const;
+
+ static bool is_tensor_in(const ITensorInfo *tensor, const std::vector<const ITensorInfo *> tensors);
+
+ std::vector<ComponentPtr> _components{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGROUP */
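A hedged illustration of invariant 3 (the fusion pattern) in action; the component instances below are assumed to exist with the stated types and matching properties:

    // Hypothetical components: complex_comp (Complex), simple_comp (Simple),
    // output_comp and output_comp2 (Output)
    GpuKernelComponentGroup group{};
    bool ok = group.add_component(complex_comp); // Complex root: accepted
    ok      = group.add_component(simple_comp);  // Simple after the root: accepted
    ok      = group.add_component(output_comp);  // A trailing Output: accepted
    ok      = group.add_component(output_comp2); // Second Output: rejected, group unchanged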
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp
new file mode 100644
index 0000000000..aac84b6c59
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "GpuKernelComponentStream.h"
+
+#include "src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h"
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+GpuKernelComponentStream::GpuKernelComponentStream(GpuComponentServices *services, const MemoryDescriptorMap &mem_map)
+ : _services{ services }, _component_groups{}, _mem_map{ mem_map }
+{
+}
+
+GpuWorkloadSourceCode GpuKernelComponentStream::write_workload_code()
+{
+ GpuWorkloadSourceCode source_code;
+ // Traverse through component groups and assemble workload together
+ for(auto && group : _component_groups)
+ {
+ // Write kernel code
+ GpuLogicalKernel logical_kernel(_services, group);
+ const GpuKernelSourceCode kernel_code = logical_kernel.write_kernel_code();
+ // The whole unit workload stage is determined by the root component
+ const auto unit_workload_stage = group.get_root_component()->properties().stage();
+ source_code.add_unit_workload(kernel_code, unit_workload_stage, _mem_map);
+ }
+ return source_code;
+}
+
+void GpuKernelComponentStream::new_component_group()
+{
+ _component_groups.emplace_back();
+}
+
+bool GpuKernelComponentStream::add_component(IGpuKernelComponent *component)
+{
+ ARM_COMPUTE_ERROR_ON(_component_groups.empty());
+ return _component_groups.back().add_component(component);
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h
new file mode 100644
index 0000000000..cbaa7c297b
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTSTREAM
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTSTREAM
+
+#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class GpuComponentServices;
+class IGpuKernelComponent;
+
+/** A linear sequence of component groups serialized from the @ref GpuKernelComponentGraph
+ * Each component group in the stream denotes a complete kernel that may consist of multiple components
+ *
+ * The main purposes of this class are:
+ * - Facilitate component fusion algorithm by allowing insertions of new component groups into the stream
+ * - Invoke kernel writer and assemble the final @ref GpuWorkloadSourceCode
+ */
+class GpuKernelComponentStream
+{
+public:
+ /** Constructor
+ *
+ * @param[in] services @ref GpuComponentServices to be used throughout the stream
+ * @param[in] mem_map @ref MemoryDescriptor map used to assemble the @ref GpuWorkloadSourceCode
+ */
+ GpuKernelComponentStream(GpuComponentServices *services, const MemoryDescriptorMap &mem_map);
+ /** Allow instances of this class to be copy constructed */
+ GpuKernelComponentStream(const GpuKernelComponentStream &stream) = default;
+ /** Allow instances of this class to be copied */
+ GpuKernelComponentStream &operator=(const GpuKernelComponentStream &stream) = default;
+ /** Allow instances of this class to be move constructed */
+ GpuKernelComponentStream(GpuKernelComponentStream &&stream) = default;
+ /** Allow instances of this class to be moved */
+ GpuKernelComponentStream &operator=(GpuKernelComponentStream &&stream) = default;
+ /** Generate and assemble @ref GpuWorkloadSourceCode from the stream */
+ GpuWorkloadSourceCode write_workload_code();
+ /** Insert a new component group in the stream.
+ * Subsequent components are added to this group until the end of the stream or until new_component_group is called again
+ */
+ void new_component_group();
+ /** Add a component to the previously created component group
+ * Throw an error if no component group is present in the stream
+ *
+ * @param[in] component Component to be inserted
+ *
+ * @return true If the operation is successful
+ * @return false Otherwise
+ */
+ bool add_component(IGpuKernelComponent *component);
+
+private:
+ GpuComponentServices *_services;
+ std::vector<GpuKernelComponentGroup> _component_groups{};
+ MemoryDescriptorMap _mem_map{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTSTREAM */
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h b/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h
new file mode 100644
index 0000000000..7479328d7b
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELSOURCECODE
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELSOURCECODE
+
+#include "arm_compute/core/CL/CLCompileContext.h"
+#include "arm_compute/core/Window.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+
+#include <map>
+#include <string>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** The argument list of a @ref GpuKernelSourceCode */
+using GpuKernelArgumentList = std::map<ITensorInfo::Id, GpuKernelArgument>;
+
+/** Container of kernel code to be compiled and run in a @ref GpuUnitWorkload
+ */
+class GpuKernelSourceCode
+{
+public:
+ /** Set kernel name */
+ GpuKernelSourceCode &name(const std::string &n)
+ {
+ _name = n;
+ return *this;
+ }
+ /** Set kernel code */
+ GpuKernelSourceCode &code(const std::string &c)
+ {
+ _code = c;
+ return *this;
+ }
+ /** Set kernel config id string */
+ GpuKernelSourceCode &config_id(const std::string &c_id)
+ {
+ _config_id = c_id;
+ return *this;
+ }
+ /** Set kernel build options */
+ GpuKernelSourceCode &build_options(const CLBuildOptions &b_options)
+ {
+ _build_options = b_options;
+ return *this;
+ }
+ /** Set kernel execution window */
+ GpuKernelSourceCode &window(const Window &window)
+ {
+ _window = window;
+ return *this;
+ }
+ /** Set kernel argument list */
+ GpuKernelSourceCode &arguments(const GpuKernelArgumentList &arguments)
+ {
+ _arguments = arguments;
+ return *this;
+ }
+ /** Get kernel name */
+ std::string name() const
+ {
+ return _name;
+ }
+ /** Get kernel code */
+ std::string code() const
+ {
+ return _code;
+ }
+ /** Get kernel config id string */
+ std::string config_id() const
+ {
+ return _config_id;
+ }
+ /** Get kernel build options */
+ const CLBuildOptions &build_options() const
+ {
+ return _build_options;
+ }
+ /** Get kernel execution window */
+ const Window &window() const
+ {
+ return _window;
+ }
+ /** Get kernel argument list */
+ const GpuKernelArgumentList &arguments() const
+ {
+ return _arguments;
+ }
+
+private:
+ std::string _name{};
+ std::string _code{};
+ std::string _config_id{};
+ CLBuildOptions _build_options{};
+ Window _window{};
+ GpuKernelArgumentList _arguments{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELSOURCECODE */
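Since every setter returns *this, a kernel source object can be assembled fluently. A minimal sketch in which all values are placeholders produced elsewhere (e.g. by a kernel writer):

    GpuKernelSourceCode code{};
    code.name("fused_kernel")      // Placeholder kernel name
        .code(kernel_source)       // Assumed: kernel source string produced by a writer
        .config_id(config_id)
        .build_options(build_opts)
        .window(execution_window)
        .arguments(kernel_args);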
diff --git a/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp b/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp
new file mode 100644
index 0000000000..7746f8bbf3
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "GpuLogicalKernel.h"
+
+#include "arm_compute/core/experimental/Types.h"
+
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuComponentServices.h"
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+GpuLogicalKernel::GpuLogicalKernel(GpuComponentServices *services, const GpuKernelComponentGroup &components)
+ : _services{ services }, _comp_group{ components }, _store_components{}
+{
+ add_load_store();
+}
+
+GpuKernelSourceCode GpuLogicalKernel::write_kernel_code()
+{
+ GpuKernelSourceCode code;
+ ClTemplateWriter writer{ _comp_group };
+
+ code.name(writer.get_name());
+ code.code(writer.get_code());
+ code.arguments(writer.get_tensors());
+ code.build_options(writer.get_build_options());
+ code.config_id(writer.get_config_id());
+ code.window(writer.get_window());
+
+ return code;
+}
+
+void GpuLogicalKernel::add_load_store()
+{
+ const auto dst_tensors = _comp_group.get_dst_tensors();
+ // Each dst tensor from the component group requires exactly one store component
+ for(const auto &dst_tensor : dst_tensors)
+ {
+ ArgumentPack<ITensorInfo> tensors;
+ // Pass the same destination tensor to both the source and the destination of the store component
+ // In other words, the addition of a store component does not create a new dst tensor
+ // This way we avoid the dst tensor of the component group differing from that of the logical kernel
+ // This may seem to violate the acyclic-ness of the component graph, but it is fine because at the point of
+ // constructing the logical kernel we no longer need a graph representation of the components
+ // (the graph has been serialized)
+ tensors.add_const_tensor(ACL_SRC_0, dst_tensor);
+ tensors.add_const_tensor(ACL_DST_0, dst_tensor);
+
+ auto store = _services->component_factory().create<ClComponentStore>(
+ _comp_group.get_root_component()->properties(), // Store components share the same properties as the root component
+ tensors);
+ _store_components.push_back(std::move(store));
+ auto success = _comp_group.add_component(_store_components.back().get());
+ ARM_COMPUTE_ERROR_ON(!success); // Load / store insertion is guaranteed to succeed
+ }
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.h b/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.h
new file mode 100644
index 0000000000..4ce4443f60
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPULOGICALKERNEL
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPULOGICALKERNEL
+
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h"
+
+#include <memory>
+#include <vector>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+class GpuComponentServices;
+class IGpuKernelComponent;
+
+/** A wrapper-processor of a @ref GpuKernelComponentGroup
+ * It adds the load (if any) and store components to the component group
+ * The @ref GpuLogicalKernel represents a complete kernel, and can proceed to invoke any kernel writer to generate the full kernel code
+ */
+class GpuLogicalKernel
+{
+public:
+ /** Constructor
+ *
+ * @param[in] services @ref GpuComponentServices to be used
+ * @param[in] components Component group from which this logical kernel is initialized
+ */
+ explicit GpuLogicalKernel(GpuComponentServices *services, const GpuKernelComponentGroup &components);
+ /** Allow instances of this class to be copy constructed */
+ GpuLogicalKernel(const GpuLogicalKernel &) = default;
+ /** Allow instances of this class to be copied */
+ GpuLogicalKernel &operator=(const GpuLogicalKernel &) = default;
+ /** Allow instances of this class to be move constructed */
+ GpuLogicalKernel(GpuLogicalKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ GpuLogicalKernel &operator=(GpuLogicalKernel &&) = default;
+ /** Generate a @ref GpuKernelSourceCode */
+ GpuKernelSourceCode write_kernel_code();
+
+private:
+ void add_load_store();
+
+ GpuComponentServices *_services;
+ GpuKernelComponentGroup _comp_group{};
+ std::vector<std::unique_ptr<IGpuKernelComponent>> _store_components{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPULOGICALKERNEL */
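Restating how the stream ( GpuKernelComponentStream::write_workload_code above) drives this class, as a minimal sketch:

    // Wrap a serialized component group; store components are appended on construction
    GpuLogicalKernel logical_kernel(&services, component_group);
    // Invoke the (currently template-based) kernel writer to produce the kernel source
    const GpuKernelSourceCode code = logical_kernel.write_kernel_code();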
diff --git a/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp
new file mode 100644
index 0000000000..e8ef835405
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h"
+
+#include "arm_compute/core/Validate.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+std::vector<DependencyGraph::TensorId> get_tensor_ids(const std::vector<const ITensorInfo *> &tensors)
+{
+ std::vector<DependencyGraph::TensorId> tensor_ids{};
+ std::transform(
+ std::begin(tensors), std::end(tensors),
+ std::back_inserter(tensor_ids),
+ [](const auto & t)
+ {
+ return t->id();
+ });
+ return tensor_ids;
+}
+
+} // namespace
+
+Operator::Operator(OperatorId id, GpuOperatorType operator_type, const ArgumentPack<ITensorInfo> &tensors)
+ : _id{ id }, _operator_type{ operator_type }, _tensors{ tensors }
+{
+}
+
+OperatorId Operator::id() const
+{
+ return _id;
+}
+
+GpuOperatorType Operator::operator_type() const
+{
+ return _operator_type;
+}
+
+ArgumentPack<ITensorInfo> Operator::tensors() const
+{
+ return _tensors;
+}
+
+bool GpuOperatorGroup::try_add_operator(const Operator &op) const
+{
+ const auto src_tensor_ids = get_tensor_ids(op.tensors().get_const_src_tensors());
+ const auto dst_tensor_ids = get_tensor_ids(op.tensors().get_const_dst_tensors());
+ // Constraint 1
+ if(!_graph.try_add_operator_as_linear(op.id(), src_tensor_ids, dst_tensor_ids))
+ {
+ return false;
+ }
+ // Constraint 2
+ if(_operators.size() >= max_fused_operators)
+ {
+ return false;
+ }
+ // Constraint 3.1: Pattern: (Unfusable)
+ if(_operators.size() > 0 && get_root_operator()->operator_type() == GpuOperatorType::Unfusable)
+ {
+ return false;
+ }
+ // Constraint 3.2
+ if(_operators.size() > 0 && (op.operator_type() != GpuOperatorType::Simple))
+ {
+ return false;
+ }
+ // Constraint 4
+ if(op.operator_type() != GpuOperatorType::Unfusable && op.tensors().get_const_dst_tensors().size() != 1U)
+ {
+ return false;
+ }
+ // Constraint 5
+ if(_operators.size() > 0)
+ {
+ const auto root_dst_tensors = get_root_operator()->tensors().get_const_dst_tensors();
+ ARM_COMPUTE_ERROR_ON(root_dst_tensors.empty());
+ const auto first_dst_tensor = root_dst_tensors[0];
+ const auto dst_tensors = op.tensors().get_const_dst_tensors();
+ for(const auto &t : root_dst_tensors)
+ {
+ if(detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0))
+ {
+ return false;
+ }
+ }
+ for(const auto &t : dst_tensors)
+ {
+ if(detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0))
+ {
+ return false;
+ }
+ }
+ }
+ // Constraint 6
+ if(_operators.size() > 0)
+ {
+ const auto root_dst_tensors = get_root_operator()->tensors().get_const_dst_tensors();
+ ARM_COMPUTE_ERROR_ON(root_dst_tensors.empty());
+ const auto first_dst_tensor_layout = root_dst_tensors[0]->data_layout();
+ const auto dst_tensors = op.tensors().get_const_dst_tensors();
+ for(const auto &t : root_dst_tensors)
+ {
+ if(t->data_layout() != first_dst_tensor_layout)
+ {
+ return false;
+ }
+ }
+ for(const auto &t : dst_tensors)
+ {
+ if(t->data_layout() != first_dst_tensor_layout)
+ {
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+void GpuOperatorGroup::add_operator(const Operator &op)
+{
+ ARM_COMPUTE_ERROR_ON(!try_add_operator(op));
+ const auto src_tensor_ids = get_tensor_ids(op.tensors().get_const_src_tensors());
+ const auto dst_tensor_ids = get_tensor_ids(op.tensors().get_const_dst_tensors());
+ _graph.add_operator_as_linear(op.id(), src_tensor_ids, dst_tensor_ids);
+ _operators[op.id()] = op;
+}
+
+Operator GpuOperatorGroup::new_operator(const GpuOperatorType &operator_type, const ArgumentPack<ITensorInfo> &tensors) const
+{
+ auto new_id = static_cast<OperatorId>(_operators.size());
+ return Operator{ new_id, operator_type, tensors };
+}
+
+const Operator *GpuOperatorGroup::get_root_operator() const
+{
+ const auto roots = _graph.get_root_ops();
+ ARM_COMPUTE_ERROR_ON(roots.size() > 1);
+ if(roots.empty())
+ {
+ return nullptr;
+ }
+ return &_operators.at(roots[0]);
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h
new file mode 100644
index 0000000000..35abe6c543
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUOPERATORGROUP
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUOPERATORGROUP
+
+#include "arm_compute/core/ITensorInfo.h"
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuOperatorProperties.h"
+#include "src/dynamic_fusion/sketch/utils/DependencyGraph.h"
+#include <map>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+using OperatorId = DependencyGraph::OperatorId;
+
+/** An operator whose sole purpose is to validate fusion
+ */
+class Operator
+{
+public:
+ /** Default constructor */
+ Operator() = default;
+ /** Get Operator Id */
+ OperatorId id() const;
+ /** Get operator type */
+ GpuOperatorType operator_type() const;
+ /** Get tensor arguments */
+ ArgumentPack<ITensorInfo> tensors() const;
+ friend class GpuOperatorGroup;
+
+private:
+ Operator(OperatorId id, GpuOperatorType operator_type, const ArgumentPack<ITensorInfo> &tensors);
+ OperatorId _id{};
+ GpuOperatorType _operator_type{};
+ ArgumentPack<ITensorInfo> _tensors{};
+};
+
+/** A linear sequence of operators to be fused in a workload
+ * For the time being, this class is only used for validating operator fusion
+ * INVARIANTS:
+ * @note These invariants are exactly the same as the operator fusion constraints
+ * 1. Fusion is limited to a linear sequence of operators
+ * 2. The max number of operators that can be fused is @ref GpuOperatorGroup::max_fused_operators
+ * 3. The fusion is subject to the pattern: Complex + Simple* | Simple + Simple* | Unfusable
+ * 4. All operators except unfusable ones have exactly 1 dst tensor
+ * 5. All fused operators share the same dst tensor shape
+ * 6. All fused operators' dst tensors share the same @ref DataLayout
+ */
+class GpuOperatorGroup
+{
+public:
+ static constexpr size_t max_fused_operators = 32;
+ /** Try adding (without actually adding) an operator to the group
+ *
+ * @param[in] op Operator to be added
+ *
+ * @return true If @p op can be added while maintaining the invariants
+ * @return false Otherwise
+ */
+ bool try_add_operator(const Operator &op) const;
+ /** Add an operator to the group
+ *
+ * @param[in] op Operator to be added
+ */
+ void add_operator(const Operator &op);
+ /** Create a new operator
+ *
+ * @param[in] operator_type @ref GpuOperatorType of the new operator
+ * @param[in] tensors Tensor arguments to the new operator
+ *
+ * @return Operator
+ */
+ Operator new_operator(const GpuOperatorType &operator_type, const ArgumentPack<ITensorInfo> &tensors) const;
+ /** Get the "root operator" of the group, which is the first operator in a linear sequence
+ * @return const Operator* Pointer to the root operator
+ */
+ const Operator *get_root_operator() const;
+
+private:
+ DependencyGraph _graph{};
+ std::map<OperatorId, Operator> _operators{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUOPERATORGROUP */
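
A sketch of the intended validation flow for GpuOperatorGroup, assuming `pack` is an ArgumentPack<ITensorInfo> already populated with the operator's src/dst tensor infos (population not shown):

    GpuOperatorGroup group{};
    const Operator conv = group.new_operator(GpuOperatorType::Complex, pack);
    if(group.try_add_operator(conv)) // checks invariants 1-6 without mutating the group
    {
        group.add_operator(conv); // commits the operator and updates the dependency graph
    }
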
diff --git a/src/dynamic_fusion/sketch/gpu/GpuOperatorProperties.h b/src/dynamic_fusion/sketch/gpu/GpuOperatorProperties.h
new file mode 100644
index 0000000000..c77697c343
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuOperatorProperties.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUOPERATORPROPERTIES
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUOPERATORPROPERTIES
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Contains properties common to all operator types */
+
+/** Operator type in the context of fusion
+ */
+enum class GpuOperatorType
+{
+ /** Simple operators are operators that:
+     * 1. Have a 1-to-1 mapping between input elements and output elements, e.g. elementwise operations
+ * 2. Have exactly 1 output
+ */
+ Simple,
+ /** Complex operators are operators that are not simple but are still fusable with simple ones
+ */
+ Complex,
+ /** Unfusable operators are operators that cannot be fused with any other types of operators
+ */
+ Unfusable
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUOPERATORPROPERTIES */
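
To illustrate how GpuOperatorGroup's fusion pattern consumes these types, a hypothetical helper deciding whether an operator may be appended after the root of a group:

    // Per the pattern (Complex | Simple) + Simple*, only Simple operators
    // may follow the root; Unfusable operators always stand alone
    bool may_follow_root(GpuOperatorType type)
    {
        return type == GpuOperatorType::Simple;
    }
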
diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp b/src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp
new file mode 100644
index 0000000000..623bf351f8
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h"
+#include "arm_compute/core/CL/CLCompileContext.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+GpuWorkloadContext::GpuWorkloadContext(CLCompileContext *cl_compile_ctx)
+ : _gpu_language{ GpuLanguage::OpenCL }, _cl_compile_ctx{ cl_compile_ctx }
+{
+}
+
+GpuTarget GpuWorkloadContext::gpu_target() const
+{
+ return _cl_compile_ctx->get_gpu_target();
+}
+
+GpuLanguage GpuWorkloadContext::gpu_language() const
+{
+ return _gpu_language;
+}
+
+const CLCompileContext *GpuWorkloadContext::cl_compile_context() const
+{
+ return _cl_compile_ctx;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp
new file mode 100644
index 0000000000..ce7cf1e908
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+GpuWorkloadSketch::GpuWorkloadSketch(Context *context)
+ : _impl{ std::make_unique<Implementation>(context) }
+{
+}
+GpuWorkloadSketch::~GpuWorkloadSketch()
+{
+}
+
+const GpuWorkloadSketch::Context *GpuWorkloadSketch::gpu_context() const
+{
+ return _impl->context();
+}
+
+TensorInfo GpuWorkloadSketch::create_tensor_info(const ITensorInfo &tensor_info)
+{
+ TensorInfo tensor{ tensor_info };
+ tensor.set_id(allocate_new_tensor_id());
+ return tensor;
+}
+
+TensorInfo GpuWorkloadSketch::create_tensor_info()
+{
+ TensorInfo tensor{};
+ tensor.set_id(allocate_new_tensor_id());
+ return tensor;
+}
+
+ITensorInfo::Id GpuWorkloadSketch::allocate_new_tensor_id()
+{
+ return _impl->allocate_new_tensor_id();
+}
+
+GpuWorkloadSketch::Implementation &GpuWorkloadSketch::implementation()
+{
+ return *_impl;
+}
+const GpuWorkloadSketch::Implementation &GpuWorkloadSketch::implementation() const
+{
+ return *_impl;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
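
A minimal sketch of the public flow, assuming a valid CLCompileContext `cl_ctx` and that GpuWorkloadSketch::Context aliases GpuWorkloadContext (as the constructor above suggests):

    GpuWorkloadContext ctx{ &cl_ctx };
    GpuWorkloadSketch sketch{ &ctx };
    // Register a tensor with the sketch; each call allocates a fresh tensor id
    TensorInfo info(TensorShape(32U, 32U, 64U), 1, DataType::F32);
    info.set_data_layout(DataLayout::NHWC);
    const TensorInfo src_info = sketch.create_tensor_info(info);
    // An empty info whose shape/type is to be inferred by the operator writing to it
    TensorInfo dst_info = sketch.create_tensor_info();
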
diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h
new file mode 100644
index 0000000000..3997395c98
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSKETCHIMPL
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSKETCHIMPL
+
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuComponentServices.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Internal implementation of @ref GpuWorkloadSketch */
+class GpuWorkloadSketch::Implementation
+{
+public:
+ /** Constructor
+ *
+ * @param[in] context global workload creation context
+ */
+ explicit Implementation(
+ Context *context)
+ : _context{ context },
+ _comp_services{},
+ _component_graph{ &_comp_services },
+ _operator_group{}
+ {
+ }
+ /** Prevent instances of this class from being copy constructed */
+ Implementation(const Implementation &impl) = delete;
+ /** Prevent instances of this class from being copied */
+ Implementation &operator=(const Implementation &impl) = delete;
+ /** Allow instances of this class to be move constructed */
+ Implementation(Implementation &&impl) = default;
+ /** Allow instances of this class to be moved */
+ Implementation &operator=(Implementation &&impl) = default;
+ /** Get workload context */
+ const Context *context() const
+ {
+ return _context;
+ }
+ /** Get component graph */
+ const GpuKernelComponentGraph &component_graph() const
+ {
+ return _component_graph;
+ }
+ /** Get component graph */
+ GpuKernelComponentGraph &component_graph()
+ {
+ return _component_graph;
+ }
+ /** Get operator group */
+ const GpuOperatorGroup &operator_group() const
+ {
+ return _operator_group;
+ }
+ /** Get operator group */
+ GpuOperatorGroup &operator_group()
+ {
+ return _operator_group;
+ }
+ ITensorInfo::Id allocate_new_tensor_id()
+ {
+ return ++_next_id;
+ }
+ /** Generate @ref GpuWorkloadSourceCode from the workload sketch
+     * @note The sketch must be valid. Any error encountered while building the code will be thrown.
+ *
+ * @return GpuWorkloadSourceCode The generated workload code
+ */
+ GpuWorkloadSourceCode generate_source_code() const
+ {
+ return component_graph().fuse().write_workload_code();
+ }
+
+private:
+ Context *_context;
+ GpuComponentServices _comp_services;
+ GpuKernelComponentGraph _component_graph;
+ GpuOperatorGroup _operator_group;
+ ITensorInfo::Id _next_id{ ITensorInfo::invalid_tensor_id };
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSKETCHIMPL */
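
The code-generation entry point chains the fusion pass and the workload writer. A sketch of the call, assuming `sketch` is a fully built GpuWorkloadSketch:

    // Internally equivalent to component_graph().fuse().write_workload_code()
    const GpuWorkloadSourceCode code = sketch.implementation().generate_source_code();
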
diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h
new file mode 100644
index 0000000000..2375f5c6c6
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h
@@ -0,0 +1,252 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSOURCECODE
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSOURCECODE
+
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Uniquely identifies a @ref GpuUnitWorkload within a @ref GpuWorkloadSourceCode */
+using UnitWorkloadId = int32_t;
+
+/** Describes all the info related to a workload argument in order to:
+ * - be used by the runtime to configure the gpu kernel arguments
+ * - be used by memory managers to allocate the required memory
+ */
+class GpuWorkloadArgument
+{
+public:
+ /** Default constructor */
+ GpuWorkloadArgument() = default;
+ /** Constructor
+ *
+ * @param[in] tensor_info @ref ITensorInfo of the workload argument
+ * @param[in] mem_desc @ref MemoryDescriptor of the workload argument
+ * @param[in] kernel_arg_info @ref GpuKernelArgumentInfo of the workload argument
+ */
+ GpuWorkloadArgument(const ITensorInfo &tensor_info,
+ const MemoryDescriptor &mem_desc,
+ const GpuKernelArgumentInfo &kernel_arg_info)
+ : _tensor_info{ tensor_info },
+ _mem_desc{ mem_desc },
+ _kernel_arg_info{ kernel_arg_info }
+ {
+ }
+ /** Get tensor id within workload */
+ ITensorInfo::Id id() const
+ {
+ return _tensor_info.id();
+ }
+ /** Get @ref ITensorInfo of the argument */
+ ITensorInfo *tensor_info()
+ {
+ return &_tensor_info;
+ }
+ /** Get @ref ITensorInfo of the argument */
+ const ITensorInfo *tensor_info() const
+ {
+ return &_tensor_info;
+ }
+ /** Get @ref MemoryDescriptor of the argument */
+ MemoryDescriptor *memory_descriptor()
+ {
+ return &_mem_desc;
+ }
+ /** Get @ref MemoryDescriptor of the argument */
+ const MemoryDescriptor *memory_descriptor() const
+ {
+ return &_mem_desc;
+ }
+ /** Get @ref GpuKernelArgumentInfo of the argument */
+ GpuKernelArgumentInfo *kernel_argument_info()
+ {
+ return &_kernel_arg_info;
+ }
+ /** Get @ref GpuKernelArgumentInfo of the argument */
+ const GpuKernelArgumentInfo *kernel_argument_info() const
+ {
+ return &_kernel_arg_info;
+ }
+    /** Check if the workload argument has a valid id
+     *
+     * @return true If it has a valid id
+     * @return false Otherwise
+     */
+ bool has_valid_id() const
+ {
+ return _tensor_info.has_valid_id();
+ }
+
+private:
+ TensorInfo _tensor_info{};
+ MemoryDescriptor _mem_desc{};
+ GpuKernelArgumentInfo _kernel_arg_info{};
+};
+
+/** Describes when a unit workload is run.
+ */
+struct UnitWorkloadStage
+{
+ enum class Stage
+ {
+ Prepare, /**< Only run once at the beginning. */
+ Run, /**< Run every time after the first time. */
+ };
+ Stage stage{ Stage::Run };
+};
+
+inline bool operator==(const UnitWorkloadStage &stage0, const UnitWorkloadStage &stage1)
+{
+ return stage0.stage == stage1.stage;
+}
+
+/** The atomic unit in a Gpu workload. It contains exactly one kernel to run.
+ */
+class GpuUnitWorkload
+{
+public:
+ /** Default constructor */
+ GpuUnitWorkload() = default;
+ /** Constructor
+ *
+ * @param[in] id Id that uniquely identifies this unit workload in a workload
+ * @param[in] kernel_code @ref GpuKernelSourceCode contained within
+ * @param[in] stage Stage of the unit workload
+ */
+ GpuUnitWorkload(UnitWorkloadId id, const GpuKernelSourceCode &kernel_code, const UnitWorkloadStage &stage)
+ : _id{ id }, _kernel_code{ kernel_code }, _stage{ stage }
+ {
+ }
+ /** Get the id of the unit workload */
+ UnitWorkloadId id() const
+ {
+ return _id;
+ }
+ /** Get reference to the underlying @ref GpuKernelSourceCode */
+ const GpuKernelSourceCode &code() const
+ {
+ return _kernel_code;
+ }
+ /** Get the stage of the unit workload */
+ UnitWorkloadStage stage() const
+ {
+ return _stage;
+ }
+
+private:
+ UnitWorkloadId _id{};
+ GpuKernelSourceCode _kernel_code{};
+ UnitWorkloadStage _stage{};
+};
+
+/** Hold the generated kernel source code and other information required to compile and run the workload.
+ */
+class GpuWorkloadSourceCode
+{
+public:
+ /** Default constructor */
+ GpuWorkloadSourceCode() = default;
+ /** Add a unit workload to the workload code
+ *
+ * @param[in] kernel_code @ref GpuKernelSourceCode to be contained within the unit workload
+ * @param[in] stage Stage of the unit workload
+ * @param[in] mem_map @ref MemoryDescriptor map for all tensors within the unit workload
+ *
+ * @return UnitWorkloadId Allocated unit workload id
+ */
+ UnitWorkloadId add_unit_workload(const GpuKernelSourceCode &kernel_code, const UnitWorkloadStage &stage, const MemoryDescriptorMap &mem_map)
+ {
+        // Use the current number of unit workloads as the Id
+ const auto uwk_id = static_cast<UnitWorkloadId>(_unit_workloads.size());
+ const auto unit_work = GpuUnitWorkload(uwk_id, kernel_code, stage);
+ _unit_workloads.push_back(unit_work);
+ // Assemble kernel argument with memory descriptor to form workload argument
+ for(const auto &id_arg : kernel_code.arguments())
+ {
+ const auto arg_id = id_arg.first;
+ const auto arg = id_arg.second;
+ _workload_arguments[arg_id] = GpuWorkloadArgument{ *arg.tensor_info(), mem_map.at(arg_id), *arg.kernel_argument_info() };
+            // std::map::operator[] default-constructs the set on first access
+            _tensor_uwork_map[arg_id].insert(uwk_id);
+ }
+ return uwk_id;
+ }
+ /** Get a unit workload from its id */
+ const GpuUnitWorkload &query_unit_workload(UnitWorkloadId id) const
+ {
+ ARM_COMPUTE_ERROR_ON(id < 0);
+ return _unit_workloads.at(id);
+ }
+ /** Get all unit workloads sorted in topological order */
+ std::vector<UnitWorkloadId> unit_workloads() const
+ {
+ std::vector<UnitWorkloadId> ids{};
+
+ for(const auto &uwk : _unit_workloads)
+ {
+ ids.push_back(uwk.id());
+ }
+ return ids;
+ }
+ /** Get a @ref GpuWorkloadArgument from its associated tensor id */
+ const GpuWorkloadArgument *query_tensor(ITensorInfo::Id t_id) const
+ {
+ return &_workload_arguments.at(t_id);
+ }
+ /** Get all tensors in the entire workload */
+ std::vector<ITensorInfo::Id> tensors() const
+ {
+ std::vector<ITensorInfo::Id> ids{};
+ for(const auto &id_tensor : _workload_arguments)
+ {
+ ids.push_back(id_tensor.first);
+ }
+ return ids;
+ }
+ /** Get all unit workloads connected to the tensor with @p t_id */
+ std::vector<UnitWorkloadId> get_unit_workloads_from_tensor(ITensorInfo::Id t_id) const
+ {
+ const auto unit_work_set = _tensor_uwork_map.at(t_id);
+ return std::vector<UnitWorkloadId>(unit_work_set.begin(), unit_work_set.end());
+ }
+
+private:
+ std::vector<GpuUnitWorkload> _unit_workloads{};
+ std::map<ITensorInfo::Id, GpuWorkloadArgument> _workload_arguments{};
+ std::map<ITensorInfo::Id, std::set<UnitWorkloadId>> _tensor_uwork_map{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSOURCECODE */
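
A sketch of how a runtime might consume GpuWorkloadSourceCode, assuming `code` is a populated instance:

    for(UnitWorkloadId uwk_id : code.unit_workloads())
    {
        const GpuUnitWorkload &uwk = code.query_unit_workload(uwk_id);
        if(uwk.stage().stage == UnitWorkloadStage::Stage::Prepare)
        {
            // Compile and enqueue uwk.code() once, before the first run
        }
    }
    for(ITensorInfo::Id t_id : code.tensors())
    {
        const GpuWorkloadArgument *arg = code.query_tensor(t_id);
        // arg->memory_descriptor() drives allocation;
        // arg->kernel_argument_info() drives kernel argument setup
    }
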
diff --git a/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h b/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h
new file mode 100644
index 0000000000..ae67790b4b
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_IGPUKERNELWRITER
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_IGPUKERNELWRITER
+
+#include "arm_compute/core/CL/CLCompileContext.h"
+#include "arm_compute/core/Window.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+
+#include <map>
+#include <string>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** An interface that can write a gpu kernel
+ */
+class IGpuKernelWriter
+{
+public:
+ /** Destructor */
+ virtual ~IGpuKernelWriter()
+ {
+ }
+ /** Generate kernel name */
+ virtual std::string get_name() = 0;
+ /** Generate kernel code */
+ virtual std::string get_code() = 0;
+ /** Generate build options */
+ virtual CLBuildOptions get_build_options() = 0;
+ /** Generate config id string of the entire kernel. This is used for tuning */
+ virtual std::string get_config_id() = 0;
+ /** Generate execution window */
+ virtual Window get_window() const = 0;
+    /** Get the kernel argument list of the kernel */
+ virtual std::map<ITensorInfo::Id, GpuKernelArgument> get_tensors() = 0;
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_IGPUKERNELWRITER */
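
A hypothetical no-op implementation, only to show the shape of the contract (all names and bodies are illustrative):

    class ClNoopKernelWriter : public IGpuKernelWriter
    {
    public:
        std::string get_name() override
        {
            return "noop";
        }
        std::string get_code() override
        {
            return "__kernel void noop() {}";
        }
        CLBuildOptions get_build_options() override
        {
            return CLBuildOptions{};
        }
        std::string get_config_id() override
        {
            return "noop_config";
        }
        Window get_window() const override
        {
            return Window{};
        }
        std::map<ITensorInfo::Id, GpuKernelArgument> get_tensors() override
        {
            return {};
        }
    };
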
diff --git a/src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h b/src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h
new file mode 100644
index 0000000000..f7f0029618
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_GPUKERNELCOMPONENTFACTORY
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_GPUKERNELCOMPONENTFACTORY
+
+#include "Types.h"
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+#include <memory>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Factory class that creates new instances of @ref IGpuKernelComponent by assigning new component ids
+ */
+class GpuKernelComponentFactory
+{
+public:
+ /** Create a new kernel component
+ *
+ * @tparam T Any polymorphic type descending from @ref IGpuKernelComponent
+ * @tparam Args Argument types to construct the kernel component
+ *
+ * @param[in] args Arguments to construct the kernel component
+ *
+ * @return std::unique_ptr<IGpuKernelComponent>
+ */
+ template <typename T, typename... Args>
+ std::unique_ptr<IGpuKernelComponent> create(Args &&... args)
+ {
+ return std::make_unique<T>(_count++, std::forward<Args>(args)...);
+ }
+
+private:
+ ComponentId _count{ 0 };
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_GPUKERNELCOMPONENTFACTORY */
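
A usage sketch of the factory, assuming `tensors` is a populated ArgumentPack<ITensorInfo> and using the ClComponentStore defined later in this patch:

    GpuKernelComponentFactory factory{};
    // Ids are assigned monotonically: this component gets id 0, the next gets id 1, ...
    std::unique_ptr<IGpuKernelComponent> store =
        factory.create<ClComponentStore>(KernelProperties{}, tensors);
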
diff --git a/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h b/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h
new file mode 100644
index 0000000000..8bb19155a2
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_IGPUKERNELCOMPONENT
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_IGPUKERNELCOMPONENT
+
+#include "Types.h"
+
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Properties common to all kernel component types */
+class KernelProperties
+{
+public:
+ KernelProperties &stage(const UnitWorkloadStage &stage)
+ {
+ _stage = stage;
+ return *this;
+ }
+ UnitWorkloadStage stage() const
+ {
+ return _stage;
+ }
+
+private:
+ UnitWorkloadStage _stage{};
+};
+
+inline bool operator==(const KernelProperties &config0, const KernelProperties &config1)
+{
+ return config0.stage() == config1.stage();
+}
+
+/** Forward declaration */
+class IGpuTemplateComponentWriter;
+
+/** An abstract interface of a kernel component. It enables manipulation by the component graph for purposes such as fusion
+ */
+class IGpuKernelComponent
+{
+public:
+ using Properties = KernelProperties;
+
+public:
+ /** Constructor
+ *
+ * @param[in] id Component id
+ * @param[in] properties Kernel component properties
+     * @param[in] tensors    Tensor arguments to the component
+ */
+ IGpuKernelComponent(
+ ComponentId id,
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors)
+ : _id{ id },
+ _properties{ properties },
+ _tensors{ tensors }
+ {
+ }
+ /** Destructor */
+ virtual ~IGpuKernelComponent()
+ {
+ }
+ /** Get component id */
+ ComponentId id() const
+ {
+ return _id;
+ }
+ /** Get tensor arguments */
+ ArgumentPack<ITensorInfo> tensors() const
+ {
+ return _tensors;
+ }
+ /** Get properties */
+ Properties properties() const
+ {
+ return _properties;
+ }
+ /** Get template writer for the component */
+ virtual const IGpuTemplateComponentWriter *template_writer() const = 0;
+ /** Get component type */
+ virtual GpuComponentType type() const = 0;
+
+private:
+ ComponentId _id{ -1 };
+ Properties _properties{};
+ ArgumentPack<ITensorInfo> _tensors{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_IGPUKERNELCOMPONENT */
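
The fluent setter on KernelProperties allows properties to be built inline; for example, marking a component to run only in the preparation stage:

    const KernelProperties prepare_only =
        KernelProperties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Prepare });
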
diff --git a/src/dynamic_fusion/sketch/gpu/components/Types.h b/src/dynamic_fusion/sketch/gpu/components/Types.h
new file mode 100644
index 0000000000..54b3a69057
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/Types.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_TYPES
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_TYPES
+
+#include <cstdint>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Uniquely identifies a kernel component within a workload
+ */
+using ComponentId = int32_t;
+
+/** Component type in the context of fusion
+ * Its main purpose is to inform the optimizer how to perform fusion.
+ */
+enum class GpuComponentType
+{
+ Complex,
+ Simple,
+ Unfusable,
+ Output
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_TYPES */
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp
new file mode 100644
index 0000000000..e94cfd1581
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ClComponentDirectConv2d.h"
+
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/dynamic_fusion/sketch/OperatorAttributes.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+using Settings = ClComponentDirectConv2dSettings;
+
+Settings &Settings::export_to_cl_image(bool cl_image)
+{
+ _export_to_cl_image = cl_image;
+ return *this;
+}
+
+bool Settings::export_to_cl_image() const
+{
+ return _export_to_cl_image;
+}
+
+Settings &Settings::fast_relaxed_math(bool fast_relaxed_math)
+{
+ _fast_relaxed_math = fast_relaxed_math;
+ return *this;
+}
+
+bool Settings::fast_relaxed_math() const
+{
+ return _fast_relaxed_math;
+}
+
+Status ClComponentDirectConv2d::validate(
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings)
+{
+ ARM_COMPUTE_UNUSED(properties, settings);
+ const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ const auto wei = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ const auto bia = tensors.get_const_tensor(TensorType::ACL_SRC_2);
+ const auto dst = tensors.get_const_tensor(TensorType::ACL_DST_0);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, wei, dst);
+
+ // 1. Check validity
+ // Matching data type
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, wei);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ if(bia != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bia);
+ }
+
+ // Matching data layout
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, wei);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
+ if(bia != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, bia);
+ }
+
+ // All tensor infos are initialized
+ ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(wei->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
+ if(bia != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(bia->tensor_shape().total_size() == 0);
+ }
+ // Device requirements are met
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+ // wei shape is correct
+ const DataLayout data_layout = src->data_layout();
+ const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(wei->dimension(channel_idx) != src->dimension(channel_idx), "Weights feature map dimension should match the respective src's one");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(wei->num_dimensions() > 4, "Weights can be at most 4 dimensional");
+
+ // dst shape is correct
+ PadStrideInfo legacy_pad_stride(attributes.stride().x(), attributes.stride().y(), attributes.pad().left, attributes.pad().right, attributes.pad().top,
+ attributes.pad().bottom, DimensionRoundingType{});
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(),
+ misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, legacy_pad_stride));
+
+ // bia shape is correct
+ if(bia != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(bia->dimension(0) != wei->dimension(3),
+ "Biases size and number of dst feature maps should match");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(bia->num_dimensions() > 1,
+ "Biases should be one dimensional");
+ }
+
+ // 2. Check support level
+ // Data type
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+ // Data layout
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC);
+
+ return Status{};
+}
+
+ClComponentDirectConv2d::ClComponentDirectConv2d(
+ ComponentId id,
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings)
+ : IGpuKernelComponent{ id, properties, tensors },
+ _component_writer{ std::make_unique<ClTemplateDirectConv2d>(id, tensors, attributes, settings) }
+{
+}
+ClComponentDirectConv2d::~ClComponentDirectConv2d()
+{
+}
+const IGpuTemplateComponentWriter *ClComponentDirectConv2d::template_writer() const
+{
+ return _component_writer.get();
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h
new file mode 100644
index 0000000000..fec22b84a5
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDIRECTCONV2D
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDIRECTCONV2D
+
+#include "arm_compute/core/Error.h"
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+#include <memory>
+
+namespace arm_compute
+{
+/** Forward declaration */
+class ITensorInfo;
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declarations */
+template <typename T>
+class ArgumentPack;
+class Conv2dAttributes;
+
+/** Component-specific settings
+ */
+class ClComponentDirectConv2dSettings
+{
+public:
+ /** Set export_to_cl_image flag */
+ ClComponentDirectConv2dSettings &export_to_cl_image(bool cl_image);
+ /** Get export_to_cl_image flag */
+ bool export_to_cl_image() const;
+
+ /** Set fast_relaxed_math flag */
+ ClComponentDirectConv2dSettings &fast_relaxed_math(bool fast_relaxed_math);
+ /** Get fast_relaxed_math flag */
+ bool fast_relaxed_math() const;
+
+private:
+ bool _export_to_cl_image{ false };
+ bool _fast_relaxed_math{ true };
+};
+
+/** Forward declaration */
+class ClTemplateDirectConv2d;
+
+class ClComponentDirectConv2d final : public IGpuKernelComponent
+{
+public:
+ /** Attributes are a set of backend-agnostic parameters that define what a component does */
+ using Attributes = Conv2dAttributes;
+ /** Settings are a set of backend-specific parameters that influence the implementation of a component */
+ using Settings = ClComponentDirectConv2dSettings;
+
+public:
+ /** Validate the component
+ *
+ * @param[in] properties Component properties
+     * @param[in]  tensors    Tensor arguments to the component
+ * @param[in] attributes Component attributes
+ * @param[in] settings Component settings
+ *
+ * @return Status Validation results
+ *
+ * Tensor argument names:
+ * - ACL_SRC_0: Input
+ * - ACL_SRC_1: Weight
+ * - ACL_SRC_2: Bias (Optional)
+ * - ACL_DST_0: Output
+ *
+ * Tensor argument constness:
+ * - ACL_SRC_0: Const
+ * - ACL_SRC_1: Const
+ * - ACL_SRC_2: Const
+ * - ACL_DST_0: Const
+ *
+ * Valid data layouts:
+ * - NHWC
+ *
+ * Valid data type configurations:
+ * |ACL_SRC_0 |ACL_SRC_1 |ACL_SRC_2 |ACL_DST_0 |
+ * |:--------------|:--------------|:--------------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ */
+ static Status validate(
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings);
+
+ /** Constructor
+ *
+ * Similar to @ref ClComponentDirectConv2d::validate()
+ */
+ ClComponentDirectConv2d(
+ ComponentId id,
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings);
+
+ /** Destructor */
+ ~ClComponentDirectConv2d() override;
+ /** Prevent instances of this class from being copy constructed */
+ ClComponentDirectConv2d(const ClComponentDirectConv2d &component) = delete;
+ /** Prevent instances of this class from being copied */
+ ClComponentDirectConv2d &operator=(const ClComponentDirectConv2d &component) = delete;
+ /** Allow instances of this class to be move constructed */
+ ClComponentDirectConv2d(ClComponentDirectConv2d &&component) = default;
+ /** Allow instances of this class to be moved */
+ ClComponentDirectConv2d &operator=(ClComponentDirectConv2d &&component) = default;
+ /** Get template writer for the component */
+ const IGpuTemplateComponentWriter *template_writer() const override;
+ /** Get component type */
+ GpuComponentType type() const override
+ {
+ return GpuComponentType::Complex;
+ }
+
+private:
+ std::unique_ptr<ClTemplateDirectConv2d> _component_writer;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDIRECTCONV2D */
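
A validation sketch, assuming `pack` already carries ACL_SRC_0/ACL_SRC_1/ACL_SRC_2/ACL_DST_0 and `conv_attr` is a configured Conv2dAttributes (both assumed):

    ClComponentDirectConv2dSettings settings{};
    settings.export_to_cl_image(false).fast_relaxed_math(true);
    const Status status = ClComponentDirectConv2d::validate(KernelProperties{}, pack, conv_attr, settings);
    if(status.error_code() != ErrorCode::OK)
    {
        // Reject fusion and surface status.error_description()
    }
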
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp
new file mode 100644
index 0000000000..f49f397ec1
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ClComponentStore.h"
+
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+Status ClComponentStore::validate(
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors)
+{
+ ARM_COMPUTE_UNUSED(properties, tensors);
+ return Status{};
+}
+ClComponentStore::ClComponentStore(ComponentId id, const Properties &properties, const ArgumentPack<ITensorInfo> &tensors)
+ : IGpuKernelComponent{ id, properties, tensors }, _component_writer{ std::make_unique<ClTemplateStore>(id, tensors) }
+{
+}
+ClComponentStore::~ClComponentStore()
+{
+}
+const IGpuTemplateComponentWriter *ClComponentStore::template_writer() const
+{
+ return _component_writer.get();
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h
new file mode 100644
index 0000000000..bf8c9f031e
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTSTORE
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTSTORE
+
+#include "arm_compute/core/Error.h"
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h"
+#include <memory>
+
+namespace arm_compute
+{
+/** Forward declaration */
+class ITensorInfo;
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+template <typename T>
+class ArgumentPack;
+
+class ClComponentStore final : public IGpuKernelComponent
+{
+public:
+ /** Validate the component
+ *
+ * @param[in] properties Component properties
+     * @param[in] tensors    Tensor arguments to the component
+ *
+ * @return Status Validation results
+ *
+ * Tensor argument names:
+ * - ACL_SRC_0: Input
+ * - ACL_DST_0: Output
+ *
+ * Tensor argument constness:
+ * - ACL_SRC_0: Const
+ * - ACL_DST_0: Const
+ *
+ * Valid data layouts:
+ * - NHWC
+ *
+ * Valid data type configurations:
+ * |ACL_SRC_0 |ACL_DST_0 |
+ * |:--------------|:--------------|
+ * |All |All |
+ */
+ static Status validate(
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors);
+ /** Constructor
+ *
+ * Similar to @ref ClComponentStore::validate()
+ */
+ ClComponentStore(ComponentId id, const Properties &properties, const ArgumentPack<ITensorInfo> &tensors);
+ /** Destructor */
+ ~ClComponentStore() override;
+ /** Prevent instances of this class from being copy constructed */
+ ClComponentStore(const ClComponentStore &component) = delete;
+ /** Prevent instances of this class from being copied */
+ ClComponentStore &operator=(const ClComponentStore &component) = delete;
+ /** Allow instances of this class to be move constructed */
+ ClComponentStore(ClComponentStore &&component) = default;
+ /** Allow instances of this class to be moved */
+ ClComponentStore &operator=(ClComponentStore &&component) = default;
+ /** Get template writer for the component */
+ const IGpuTemplateComponentWriter *template_writer() const override;
+ /** Get component type */
+ GpuComponentType type() const override
+ {
+ return GpuComponentType::Output;
+ }
+
+private:
+ std::unique_ptr<ClTemplateStore> _component_writer;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTSTORE */
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp
new file mode 100644
index 0000000000..98c1cc3939
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp
@@ -0,0 +1,255 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h"
+
+#include "arm_compute/core/CL/CLCompileContext.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"
+#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+bool export_to_cl_image_support(const ITensorInfo *tensor, GPUTarget gpu_target, const cl::Device &device, DataLayout data_layout)
+{
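+ // A cl_image pixel packs 4 elements, so the channel dimension (dimension 0 in NHWC) must be a multiple of 4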
+ if(tensor->tensor_shape()[0] % 4 || (data_layout != DataLayout::NHWC))
+ {
+ return false;
+ }
+
+ // If not floating point
+ if(!is_data_type_float(tensor->data_type()))
+ {
+ return false;
+ }
+
+ if(gpu_target == GPUTarget::G71 || get_arch_from_target(gpu_target) == GPUTarget::MIDGARD)
+ {
+ return false;
+ }
+
+ // Check if the cl_khr_image2d_from_buffer extension is supported on the target platform
+ if(!image2d_from_buffer_supported(device))
+ {
+ return false;
+ }
+
+ // Check cl image pitch alignment
+ if(get_cl_image_pitch_alignment(device) == 0)
+ {
+ return false;
+ }
+
+ const size_t image_w = tensor->tensor_shape()[0] / 4;
+ const size_t image_h = tensor->tensor_shape()[1] * tensor->tensor_shape()[2] * tensor->tensor_shape()[3];
+ const size_t max_image_w = device.getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>();
+ const size_t max_image_h = device.getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>();
+
+ if(image_w > max_image_w || image_h > max_image_h)
+ {
+ return false;
+ }
+
+ return true;
+}
+
+constexpr GpuOperatorType operator_type = GpuOperatorType::Complex;
+} // namespace
+
+Status GpuConv2d::validate_op(const GpuWorkloadSketch &sketch,
+ const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const ITensorInfo *bia,
+ const ITensorInfo *dst,
+ const Conv2dAttributes &attributes)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, wei, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ !src->has_valid_id() || !wei->has_valid_id() || !dst->has_valid_id());
+ if(bia != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(!bia->has_valid_id());
+ }
+
+ // Perform fusion test
+ // Pack tensor infos
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC_0, src);
+ tensors.add_const_tensor(ACL_SRC_1, wei);
+ tensors.add_const_tensor(ACL_SRC_2, bia);
+ tensors.add_const_tensor(ACL_DST_0, dst);
+ const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!sketch.implementation().operator_group().try_add_operator(op),
+ "Operator fusion test failed. This operator cannot be fused into the workload");
+
+ // Auto initialize dst tensor info
+ TensorInfo dst_info_to_validate = *dst;
+ const auto data_layout = src->data_layout();
+
+ {
+ auto shape = misc::shape_calculator::compute_deep_convolution_shape(src->tensor_shape(), data_layout, wei->tensor_shape(),
+ PadStrideInfo(attributes.stride().x(), attributes.stride().y(), attributes.pad().left,
+ attributes.pad().right,
+ attributes.pad().top, attributes.pad().bottom, DimensionRoundingType::FLOOR)); // use the default DimensionRoundingType
+
+ auto_init_if_empty(dst_info_to_validate, src->clone()->set_tensor_shape(shape));
+ }
+
+ // Check support level
+ // Data type
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+ // Data layout
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC);
+
+ const auto sketch_ctx = sketch.implementation().context();
+
+ const auto gpu_target = sketch_ctx->gpu_target();
+
+ if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+ {
+ const auto cl_compile_ctx = sketch_ctx->cl_compile_context();
+ ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr);
+ // Validate Direct Conv2d Component
+ {
+ const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
+ auto settings = ClComponentDirectConv2d::Settings();
+
+ settings.export_to_cl_image(
+ export_to_cl_image_support(src, gpu_target, cl_compile_ctx->get_device(), data_layout));
+
+ settings.fast_relaxed_math(
+ (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST)
+ && (dst_info_to_validate.data_type() == DataType::F32 || dst_info_to_validate.data_type() == DataType::F16));
+
+ ArgumentPack<ITensorInfo> arguments;
+ arguments.add_const_tensor(ACL_SRC_0, src);
+ arguments.add_const_tensor(ACL_SRC_1, wei);
+ arguments.add_const_tensor(ACL_SRC_2, bia);
+ arguments.add_const_tensor(ACL_DST_0, &dst_info_to_validate);
+ ARM_COMPUTE_RETURN_ON_ERROR(ClComponentDirectConv2d::validate(properties, arguments, attributes, settings));
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_MSG("Unimplemented Gpu language");
+ }
+ return Status{};
+}
+
+void GpuConv2d::create_op(GpuWorkloadSketch &sketch,
+ ITensorInfo *src,
+ ITensorInfo *wei,
+ ITensorInfo *bia,
+ ITensorInfo *dst,
+ const Conv2dAttributes &attributes)
+{
+ // Assert validation
+ ARM_COMPUTE_ERROR_THROW_ON(GpuConv2d::validate_op(sketch, src, wei, bia, dst, attributes));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, wei, dst);
+ const auto data_layout = src->data_layout();
+
+ // Auto initialize dst tensor
+ {
+ auto shape = misc::shape_calculator::compute_deep_convolution_shape(src->tensor_shape(), data_layout, wei->tensor_shape(),
+ PadStrideInfo(attributes.stride().x(), attributes.stride().y(), attributes.pad().left,
+ attributes.pad().right,
+ attributes.pad().top, attributes.pad().bottom, DimensionRoundingType::FLOOR)); // use the default DimensionRoundingType
+
+ auto_init_if_empty(*dst, src->clone()->set_tensor_shape(shape));
+ }
+
+ // Translate into components and add to component graph
+ auto &comp_graph = sketch.implementation().component_graph();
+
+ const auto sketch_ctx = sketch.implementation().context();
+
+ const auto gpu_target = sketch_ctx->gpu_target();
+
+ if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+ {
+ const auto cl_compile_ctx = sketch_ctx->cl_compile_context();
+ ARM_COMPUTE_ERROR_ON(cl_compile_ctx == nullptr);
+
+ // Add Direct Conv2d Component
+ {
+ auto properties = IGpuKernelComponent::Properties();
+ properties.stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
+
+ auto settings = ClComponentDirectConv2d::Settings();
+
+ settings.export_to_cl_image(
+ export_to_cl_image_support(src, gpu_target, cl_compile_ctx->get_device(), data_layout));
+
+ settings.fast_relaxed_math(
+ (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST)
+ && (dst->data_type() == DataType::F32 || dst->data_type() == DataType::F16));
+
+ if(settings.export_to_cl_image())
+ {
+ arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(wei);
+ }
+
+ ArgumentPack<ITensorInfo> arguments;
+ arguments.add_const_tensor(ACL_SRC_0, src);
+ arguments.add_const_tensor(ACL_SRC_1, wei);
+ arguments.add_const_tensor(ACL_SRC_2, bia);
+ arguments.add_const_tensor(ACL_DST_0, dst);
+ comp_graph.add_new_component<ClComponentDirectConv2d>(properties, arguments, attributes, settings);
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unimplemented Gpu language");
+ }
+
+ // Set up fusion test by adding to the Operator Group
+ // Note this has to be performed after all the components have been successfully added to the component graph
+
+ // Pack tensor infos
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC_0, src);
+ tensors.add_tensor(ACL_SRC_1, wei);
+ tensors.add_const_tensor(ACL_SRC_2, bia);
+ tensors.add_tensor(ACL_DST_0, dst);
+
+ const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+ sketch.implementation().operator_group().add_operator(op);
+}
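+
+// Usage sketch (illustrative only, not part of this patch). It assumes `sketch` was created
+// from a valid GpuWorkloadContext and that src, wei, bia and dst are tensor infos with valid ids:
+//
+//   Conv2dAttributes attributes{}; // assume default-constructed attributes
+//   const Status st = GpuConv2d::validate_op(sketch, &src, &wei, &bia, &dst, attributes);
+//   if(st.error_code() == ErrorCode::OK)
+//   {
+//       GpuConv2d::create_op(sketch, &src, &wei, &bia, &dst, attributes);
+//   }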
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp
new file mode 100644
index 0000000000..13c0b141a5
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "GpuKernelVariableTable.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/ITensorInfo.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+void GpuKernelVariableTable::declare_variable(const ITensorInfo *tensor, GpuKernelArgumentInfo argument_info, bool is_interm, const std::string &alias)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(!tensor->has_valid_id(), "Tensor info with valid id expected");
+ // Do not re-declare if the variable associated with the tensor has already been declared
+ if(get_variable(tensor).has_valid_id())
+ {
+ ARM_COMPUTE_ERROR_ON(!(get_variable(tensor).kernel_argument_info == argument_info));
+ return;
+ }
+ // Declare variable associated with the tensor
+ std::stringstream ss;
+ ss << alias << "_t" << tensor->id();
+ const auto uniq_name = ss.str();
+ TensorVariable var{ tensor->id(), uniq_name, argument_info };
+
+ if(is_interm)
+ {
+ _interm_var = var;
+ _interm_tensors.insert(tensor->id());
+ }
+ else
+ {
+ _vars.emplace(tensor->id(), var);
+ }
+}
+
+GpuKernelVariableTable::TensorVariable GpuKernelVariableTable::get_variable(const ITensorInfo *tensor) const
+{
+ const TensorVariable empty_var{};
+ if(_vars.find(tensor->id()) != _vars.end())
+ {
+ return _vars.at(tensor->id());
+ }
+ if(_interm_tensors.find(tensor->id()) != _interm_tensors.end())
+ {
+ return _interm_var;
+ }
+ return empty_var;
+}
+
+GpuKernelVariableTable::VariableList GpuKernelVariableTable::get_variable_list(const std::vector<const ITensorInfo *> &tensors) const
+{
+ VariableList vars{};
+ for(const auto &tensor : tensors)
+ {
+ if(!tensor->has_valid_id())
+ {
+ continue;
+ }
+ vars.push_back(get_variable(tensor));
+ }
+ return vars;
+}
+
+TagVal::TagVal(const GpuKernelVariableTable::TensorVariable &var)
+ : value{ var.uniq_name }
+{
+}
+
+TagVal::TagVal(const std::string &val)
+ : value{ val }
+{
+}
+
+TagVal::TagVal(const char *val)
+ : value{ std::string(val) }
+{
+}
+
+TagVal::TagVal(const DataType &data_type)
+ : value{ get_cl_type_from_data_type(data_type) }
+{
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h b/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h
new file mode 100644
index 0000000000..4eee3963c2
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_GPUKERNELVARIABLETABLE
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_GPUKERNELVARIABLETABLE
+
+#include "arm_compute/core/ITensorInfo.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "support/Requires.h"
+#include "support/StringSupport.h"
+
+#include <map>
+#include <set>
+#include <string>
+#include <type_traits>
+#include <unordered_map>
+#include <vector>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** A table of all the variables used in the kernel
+ * Since fusion is restricted to a linear sequence of components in a kernel, only a single "intermediate variable" (the accumulator) is allowed.
+ * Each kernel has exactly one variable table
+ */
+class GpuKernelVariableTable
+{
+public:
+ /** A tensor variable whose main purposes are:
+ * - Hold the newly assigned @ref GpuKernelArgumentInfo for the associated tensor info
+ * - Hold the generated variable name for the associated tensor info
+ */
+ struct TensorVariable
+ {
+ public:
+ TensorVariable() = default;
+ TensorVariable(const TensorVariable &) = default;
+ TensorVariable &operator=(const TensorVariable &) = default;
+ ITensorInfo::Id id{ ITensorInfo::invalid_tensor_id };
+ std::string uniq_name{ "empty" }; // Unique name, also the final variable name used in the built code
+ GpuKernelArgumentInfo kernel_argument_info{};
+ bool has_valid_id() const
+ {
+ return id != ITensorInfo::invalid_tensor_id;
+ }
+ };
+ using VariableList = std::vector<TensorVariable>;
+
+public:
+ /** Declare a @ref TensorVariable for a corresponding tensor info.
+ *
+ * @note: Later re-declaration of the intermediate variable will overwrite the previous association to the @ref ITensorInfo
+ * Therefore, the order of declaration is important. It is assumed that the components declaring the variables are already in the correct order
+ *
+ * @param[in] tensor Tensor info with which the new variable is associated
+ * @param[in] argument_info Kernel argument information
+ * @param[in] is_interm If the new variable is an intermediate variable
+ * @param[in] alias Alias for the variable. Will be used as part of the variable name
+ */
+ void declare_variable(const ITensorInfo *tensor, GpuKernelArgumentInfo argument_info, bool is_interm = false, const std::string &alias = "unnamed");
+ /** Get the @ref TensorVariable associated with @p tensor
+ *
+ * @param[in] tensor Tensor info to be queried
+ *
+ * @return TensorVariable
+ */
+ TensorVariable get_variable(const ITensorInfo *tensor) const;
+ /** Get the @ref TensorVariable list associated with @p tensors
+ * @note Empty tensors are skipped
+ *
+ * @param[in] tensors List of tensor infos to be queried
+ *
+ * @return VariableList
+ */
+ VariableList get_variable_list(const std::vector<const ITensorInfo *> &tensors) const;
+
+private:
+ std::map<ITensorInfo::Id, TensorVariable> _vars{}; /**< Non-intermediate (function parameter) variables*/
+ TensorVariable _interm_var{}; /**< Intermediate variable */
+ std::set<ITensorInfo::Id> _interm_tensors{}; /**< Tensors associated with the single intermediate variable */
+};
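+
+/* Usage sketch (illustrative only): one variable per user-facing tensor, plus at most one shared
+ * intermediate (accumulator) variable. Assumes src and acc are ITensorInfo pointers with valid ids:
+ *
+ *   GpuKernelVariableTable vtable{};
+ *   vtable.declare_variable(src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), false, "src");
+ *   vtable.declare_variable(acc, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), true, "acc");
+ *   const auto var = vtable.get_variable(src); // var.uniq_name is e.g. "src_t0" for tensor id 0
+ */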
+
+/** A tag value will substitute a tag in a string template during its instantiation */
+struct TagVal
+{
+ /** Default constructor */
+ TagVal() = default;
+ /** Construct a @ref TagVal from a @ref GpuKernelVariableTable::TensorVariable */
+ TagVal(const GpuKernelVariableTable::TensorVariable &var);
+ /** Construct a @ref TagVal from an integral type */
+ template <typename T, ARM_COMPUTE_REQUIRES_TA(std::is_integral<T>::value)>
+ TagVal(T val)
+ : value{ support::cpp11::to_string(val) }
+ {
+ }
+ /** Construct a @ref TagVal from a string */
+ TagVal(const std::string &val);
+ /** Construct a @ref TagVal from a c-style string */
+ TagVal(const char *val);
+ /** Construct a @ref TagVal from a @ref DataType */
+ TagVal(const DataType &data_type);
+ /** The value of the TagVal as a converted string */
+ std::string value{};
+};
+
+/** A tag used in a string template is a placeholder string to be substituted by real values during template instantiation */
+using Tag = std::string;
+
+/** Tag lookup table. It is used to instantiate a string template */
+using TagLUT = std::unordered_map<Tag, TagVal>;
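+
+/* Example (illustrative only): instantiating the template fragment
+ *   "TILE({{DATA_TYPE}}, M0, N0, {{dst}});"
+ * with
+ *   TagLUT lut{};
+ *   lut["DATA_TYPE"] = DataType::F32; // converted to "float" via get_cl_type_from_data_type
+ *   lut["dst"]       = "dst_t0";
+ * yields "TILE(float, M0, N0, dst_t0);" (see @ref ClTemplateWriter::replace_tags)
+ */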
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_GPUKERNELVARIABLETABLE */
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h b/src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h
new file mode 100644
index 0000000000..c85ddf5a2c
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_IGPUTEMPLATECOMPONENTWRITER
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_IGPUTEMPLATECOMPONENTWRITER
+
+#include "arm_compute/core/CL/CLCompileContext.h"
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/Window.h"
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/components/Types.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+class GpuKernelComponentGroup;
+class GpuKernelVariableTable;
+
+/** An interface used by @ref ClTemplateWriter to write source code for a kernel component
+ */
+class IGpuTemplateComponentWriter
+{
+public:
+ using ComponentGroup = GpuKernelComponentGroup;
+
+public:
+ /** Constructor
+ *
+ * @param[in] id Component id
+ * @param[in] tensors Tensor arguments to the components
+ */
+ IGpuTemplateComponentWriter(ComponentId id, const ArgumentPack<ITensorInfo> tensors)
+ : _id{ id }, _tensors{ tensors }
+ {
+ }
+ /** Destructor */
+ virtual ~IGpuTemplateComponentWriter()
+ {
+ }
+ /** Generate kernel component name */
+ virtual std::string get_name() const = 0;
+ /** Generate kernel component code template
+ *
+ * @param[in] comp_group Component group of which the component is a part
+ *
+ * @return std::string Component code
+ */
+ virtual std::string get_component_code(const ComponentGroup &comp_group) const = 0;
+ /** Declare all variables used by the component in the @p vtable
+ *
+ * @param[out] vtable Variable table
+ * @param[in] comp_group Component group of which the component is a part
+ */
+ virtual void declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const = 0;
+ /** Generate the tag look-up table used to instantiate the component code.
+ *
+ * @param[in] vtable Variable table
+ * @param[in] comp_group Component group of which the component is a part
+ *
+ * @return TagLUT Tag lookup table
+ */
+ virtual TagLUT get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const = 0;
+ /** Generate additional macros used in the component */
+ virtual std::string get_additional_macros() const
+ {
+ return "";
+ }
+ /** Generate the build options used in the component
+ *
+ * @param[in] comp_group Component group of which the component is a part
+ *
+ * @return CLBuildOptions Build options
+ */
+ virtual CLBuildOptions get_build_options(const ComponentGroup &comp_group) const
+ {
+ ARM_COMPUTE_UNUSED(comp_group);
+ return CLBuildOptions{};
+ }
+ /** Generate the component config id string used for tuning */
+ virtual std::string get_config_id() const
+ {
+ return "";
+ }
+ /** Generate the header list used in the component */
+ virtual std::set<std::string> get_headers_list() const
+ {
+ return std::set<std::string> {};
+ }
+ /** Generate the execution window for the component */
+ virtual Window get_window() const
+ {
+ return Window{};
+ }
+ /** Get tensor arguments */
+ ArgumentPack<ITensorInfo> tensors() const
+ {
+ return _tensors;
+ }
+ /** Get component id */
+ ComponentId id() const
+ {
+ return _id;
+ }
+
+private:
+ ComponentId _id{ -1 };
+ ArgumentPack<ITensorInfo> _tensors{};
+};
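+
+/* A writer is driven by @ref ClTemplateWriter in two passes (see ClTemplateWriter::write_code):
+ * pass 1 calls declare_variables() on every component to fill the shared variable table;
+ * pass 2 calls get_component_code() and get_tag_lut(), whose results are combined through
+ * ClTemplateWriter::replace_tags() into the final kernel source.
+ */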
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_IGPUTEMPLATECOMPONENTWRITER */
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp
new file mode 100644
index 0000000000..870de64eb8
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp
@@ -0,0 +1,400 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ClTemplateDirectConv2d.h"
+
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"
+
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+ClTemplateDirectConv2d::ClTemplateDirectConv2d(ComponentId id,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings)
+ : IGpuTemplateComponentWriter{ id, tensors },
+ _src{},
+ _weight{},
+ _bias{},
+ _dst{},
+ _attributes{ attributes },
+ _settings{ settings }
+{
+ _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
+ _weight = this->tensors().get_const_tensor(TensorType::ACL_SRC_1);
+ if(this->tensors().get_const_tensor(TensorType::ACL_SRC_2))
+ {
+ _bias = this->tensors().get_const_tensor(TensorType::ACL_SRC_2);
+ }
+ _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _weight, _dst);
+}
+
+std::string ClTemplateDirectConv2d::get_name() const
+{
+ return "direct_conv2d";
+}
+
+std::string ClTemplateDirectConv2d::get_component_code(const ComponentGroup &comp_group) const
+{
+ ARM_COMPUTE_UNUSED(comp_group);
+
+ const auto channel_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::CHANNEL);
+ const auto k0 = adjust_vec_size(is_data_type_quantized(_src->data_type()) ? 16u : 8u, _src->dimension(channel_idx));
+ const bool leftover_loop = (_src->dimension(channel_idx) % k0) != 0;
+
+ std::string code = R"_(
+//------------------ START KERNEL {{meta_kernel_id}} ---------------------
+// IN_0(src) {{src}}
+// IN_1(wei) {{weight}}
+)_";
+ if(_bias && _bias->has_valid_id())
+ {
+ code += R"_(
+// IN_1(bia) {{bias}}
+)_";
+ }
+ code += R"_(
+// OUT(dst, accum) {{dst}}
+
+// Initialize the accumulators
+TILE({{ACC_DATA_TYPE}}, M0, N0, {{dst}});
+{
+ // All the tensor dimensions are passed at compile time.
+ // In case of dynamic tensor support, the following dimensions should be passed as function argument.
+#define _IWEI_WIDTH {{WEI_WIDTH}}
+#define _IWEI_HEIGHT {{WEI_HEIGHT}}
+#define _ISRC_WIDTH {{src}}_w
+#define _ISRC_HEIGHT {{src}}_h
+#define _ISRC_CHANNELS {{src}}_c
+#define _IDST_WIDTH {{arg_dst}}_w
+#define _IDST_HEIGHT {{arg_dst}}_h
+#define _IDST_CHANNELS {{arg_dst}}_c
+#define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT)
+
+ // .v = access the whole vector (OpenCL vector)
+ // .s[x] = access the vector element at position x (scalar access)
+ TILE(int, M0, 1, xi);
+ TILE(int, M0, 1, yi);
+
+ // Convert the linear index to coordinate
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ xi[i].v = ((g_ind_1 + i) % _IDST_WIDTH) * {{STRIDE_X}};
+ yi[i].v = ((g_ind_1 + i) / _IDST_WIDTH) * {{STRIDE_Y}};
+ xi[i].v -= {{PAD_LEFT}};
+ yi[i].v -= {{PAD_TOP}};
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ {{dst}}[i].v = 0;
+ })
+
+ for(int i = 0; i < (_IWEI_WIDTH * _IWEI_HEIGHT); ++i)
+ {
+ int ck = 0;
+ int xk = i % _IWEI_WIDTH;
+ int yk = i / _IWEI_WIDTH;
+
+ int k = 0;
+ for(; k <= (_ISRC_CHANNELS - K0); k += K0)
+ {
+ TILE({{SRC_DATA_TYPE}}, M0, K0, a);
+ TILE({{WEI_DATA_TYPE}}, N0, K0, b);
+
+ // Initialize tiles
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = {{ZERO_VALUE}};
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ b[i].v = {{ZERO_VALUE}};
+ })
+
+ // Load tile from the src tensor
+ T_LOAD_NHWC_INDIRECT({{SRC_DATA_TYPE}}, M0, K0, {{SRC_TENSOR_TYPE}}, {{src}}, g_ind_2, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, {{src}}_stride_y, xi, yi, a);
+
+ // Load tile from the weights tensor
+ T_LOAD({{WEI_DATA_TYPE}}, N0, K0, {{WEI_TENSOR_TYPE}}, {{weight}}, ck, g_ind_0 * _IY_MULTIPLIER + i, _IY_MULTIPLIER, {{weight}}_stride_y, b);
+
+ // Compute the matrix multiplication between two tiles
+ T_MMUL({{SRC_DATA_TYPE}}, {{WEI_DATA_TYPE}}, {{ACC_DATA_TYPE}}, M0, N0, K0, NT, T, a, b, {{dst}});
+
+ ck += K0;
+ }
+
+ // The channel count (_ISRC_CHANNELS) is a compile-time constant here;
+ // it should become a kernel argument in case of dynamic tensor support
+)_";
+
+ if(leftover_loop)
+ {
+ code += R"_(
+ // Left-over accumulations
+ for(; k < _ISRC_CHANNELS; ++k)
+ {
+ TILE({{SRC_DATA_TYPE}}, M0, 1, a);
+ TILE({{WEI_DATA_TYPE}}, N0, 1, b);
+
+ // Initialize tiles
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = {{ZERO_VALUE}};
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ b[i].v = {{ZERO_VALUE}};
+ })
+
+ // Load tile from the src tensor
+ T_LOAD_NHWC_INDIRECT({{SRC_DATA_TYPE}}, M0, 1, {{SRC_TENSOR_TYPE}}, {{src}}, g_ind_2, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, {{src}}_stride_y, xi, yi, a);
+
+ // Load tile from the weights tensor
+ // The T_LOAD for the left-over elements can only use BUFFER because we load one element per iteration
+ T_LOAD({{WEI_DATA_TYPE}}, N0, 1, BUFFER, {{weight}}, ck, g_ind_0 * _IY_MULTIPLIER + i, _IY_MULTIPLIER, {{weight}}_stride_y, b);
+
+ // Compute the matrix multiplication between two tiles
+ T_MMUL({{SRC_DATA_TYPE}}, {{WEI_DATA_TYPE}}, {{ACC_DATA_TYPE}}, M0, N0, 1, NT, T, a, b, {{dst}});
+
+ ++ck;
+ }
+ )_";
+ }
+
+ code += R"_(
+#undef _IWEI_WIDTH
+#undef _IWEI_HEIGHT
+#undef _ISRC_WIDTH
+#undef _ISRC_HEIGHT
+#undef _ISRC_CHANNELS
+#undef _IDST_WIDTH
+#undef _IDST_HEIGHT
+#undef _IDST_CHANNELS
+#undef _IY_MULTIPLIER
+
+ }
+)_";
+
+ if(_bias && _bias->has_valid_id())
+ {
+ code += R"_(
+ TILE({{BIA_DATA_TYPE}}, 1, N0, bias0);
+
+ T_LOAD({{BIA_DATA_TYPE}}, 1, N0, BUFFER, {{bias}}, g_ind_0, 0, 1, 0, bias0);
+
+ // c = c + bias[broadcasted]
+ T_ELTWISE_BROADCAST_ADD_X({{ACC_DATA_TYPE}}, M0, N0, {{dst}}, bias0, {{dst}});
+ )_";
+ }
+
+ code += R"_(
+}
+//------------------ END KERNEL {{meta_kernel_id}} ---------------------
+)_";
+ return code;
+}
+
+void ClTemplateDirectConv2d::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
+{
+ vtable.declare_variable(
+ _src,
+ GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+ comp_group.is_intermediate_tensor(_src),
+ "src");
+
+ const GpuKernelArgumentInfo::Type weight_type = _settings.export_to_cl_image() ? GpuKernelArgumentInfo::Type::Tensor_4D_t_Image : GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer;
+ vtable.declare_variable(
+ _weight,
+ GpuKernelArgumentInfo(weight_type),
+ comp_group.is_intermediate_tensor(_weight),
+ "weight");
+
+ if(_bias && _bias->has_valid_id()) // optional bias
+ {
+ vtable.declare_variable(
+ _bias,
+ GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Vector),
+ comp_group.is_intermediate_tensor(_bias),
+ "bias");
+ }
+ vtable.declare_variable(
+ _dst,
+ GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+ comp_group.is_intermediate_tensor(_dst),
+ "dst");
+}
+
+TagLUT ClTemplateDirectConv2d::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
+{
+ TagLUT lut{};
+ // Arguments and global shared variables
+ lut["src"] = vtable.get_variable(_src);
+ lut["weight"] = vtable.get_variable(_weight);
+
+ if(_bias && _bias->has_valid_id()) // optional bias
+ {
+ lut["bias"] = vtable.get_variable(_bias);
+ lut["BIA_DATA_TYPE"] = get_cl_type_from_data_type(_bias->data_type());
+ }
+ lut["dst"] = vtable.get_variable(_dst);
+
+ const auto dst_argument = vtable.get_variable(comp_group.get_dst_tensors()[0]);
+ lut["arg_dst"] = dst_argument.uniq_name;
+
+ // Local build options
+ lut["meta_kernel_id"] = id();
+ lut["ACC_DATA_TYPE"] = _src->data_type();
+ lut["SRC_DATA_TYPE"] = _src->data_type();
+ lut["WEI_DATA_TYPE"] = _weight->data_type();
+
+ lut["SRC_TENSOR_TYPE"] = "BUFFER";
+ switch(vtable.get_variable(_weight).kernel_argument_info.type)
+ {
+ case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D:
+ case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D:
+ case GpuKernelArgumentInfo::Type::Tensor_4D_t_Image:
+ {
+ lut["WEI_TENSOR_TYPE"] = "IMAGE";
+ break;
+ }
+ default:
+ {
+ lut["WEI_TENSOR_TYPE"] = "BUFFER";
+ break;
+ }
+ }
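+ // For NHWC weights the dimensions are [IFM, kernel_w, kernel_h, OFM], hence width index 1 and height index 2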
+ const auto width_idx = 1;
+ const auto height_idx = 2;
+ lut["WEI_WIDTH"] = _weight->dimension(width_idx);
+ lut["WEI_HEIGHT"] = _weight->dimension(height_idx);
+
+ lut["STRIDE_X"] = _attributes.stride().x();
+ lut["STRIDE_Y"] = _attributes.stride().y();
+
+ lut["PAD_LEFT"] = _attributes.pad().left;
+ lut["PAD_TOP"] = _attributes.pad().top;
+
+ lut["ZERO_VALUE"] = 0;
+
+ return lut;
+}
+
+CLBuildOptions ClTemplateDirectConv2d::get_build_options(const ComponentGroup &comp_group) const
+{
+ const unsigned int channel_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::CHANNEL);
+ const DataType data_type = _src->data_type();
+
+ /// NOTE: For now tile sizes (n0, m0) are set by the root component's execution window. This may change in the future
+ const auto root_window = comp_group.get_root_component()->template_writer()->get_window();
+ const unsigned int n0 = root_window.x().step();
+ const unsigned int m0 = root_window.y().step();
+ const unsigned int k0 = adjust_vec_size(is_data_type_quantized(data_type) ? 16u : 8u, _src->dimension(channel_idx));
+ const unsigned int partial_store_n0 = _dst->dimension(0) % n0;
+
+ CLBuildOptions build_opts{};
+ if(_settings.fast_relaxed_math())
+ {
+ build_opts.add_option("-cl-fast-relaxed-math");
+ }
+ else
+ {
+ // -cl-fast-relaxed-math also sets -cl-finite-math-only and -cl-unsafe-math-optimizations
+ // to disable -cl-finite-math-only, we only include -cl-unsafe-math-optimizations
+ build_opts.add_option("-cl-unsafe-math-optimizations");
+ }
+ build_opts.add_option("-DIS_TILED");
+ build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
+ build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
+ build_opts.add_option("-DK0=" + support::cpp11::to_string(k0));
+ build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));
+
+ return build_opts;
+}
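+
+// For example (illustrative only): an F32 kernel whose root window steps are 4 (x) and 2 (y),
+// with at least 8 input channels and 17 output channels, would be built with
+//   -cl-unsafe-math-optimizations -DIS_TILED -DN0=4 -DM0=2 -DK0=8 -DPARTIAL_N0=1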
+
+std::string ClTemplateDirectConv2d::get_config_id() const
+{
+ const DataType data_type = _src->data_type();
+ const DataLayout data_layout = _src->data_layout();
+
+ const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+ const unsigned int kernel_size = _weight->dimension(width_idx);
+
+ std::string config_id{};
+ config_id += lower_string(string_from_data_type(data_type));
+ config_id += "_";
+ config_id += support::cpp11::to_string(kernel_size);
+ config_id += "_";
+ config_id += support::cpp11::to_string(_attributes.stride().x());
+ config_id += "_";
+ config_id += support::cpp11::to_string(_attributes.stride().y());
+ config_id += "_";
+ config_id += support::cpp11::to_string(_dst->dimension(width_idx));
+ config_id += "_";
+ config_id += support::cpp11::to_string(_dst->dimension(height_idx));
+ config_id += "_";
+ config_id += lower_string(string_from_data_layout(data_layout));
+ return config_id;
+}
+
+std::set<std::string> ClTemplateDirectConv2d::get_headers_list() const
+{
+ return std::set<std::string>{ "helpers.h", "tile_helpers.h" };
+}
+
+Window ClTemplateDirectConv2d::get_window() const
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
+
+ const auto output_shape = _dst->tensor_shape();
+
+ const unsigned int vec_size = std::min(static_cast<unsigned int>(output_shape[0]), 4u);
+ const unsigned int num_rows = (_dst->tensor_shape()[0] > 16) ? ((_src->data_type() == DataType::F32) ? 2U : 4U) : 1U;
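+ // Heuristic (as written above): process multiple rows per work-item only when the output has
+ // more than 16 channels; F16 uses more rows (4) than F32 (2)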
+
+ // Create and configure kernel window
+ Window win = calculate_max_window(output_shape, Steps(vec_size, num_rows));
+
+ const size_t dim_y_collapsed = ceil_to_multiple(output_shape[1] * output_shape[2], num_rows);
+ win.set(Window::DimY, Window::Dimension(0, dim_y_collapsed, num_rows));
+ win.set(Window::DimZ, Window::Dimension(0, output_shape.total_size_upper(3), 1));
+
+ return win;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h
new file mode 100644
index 0000000000..48027a9b8d
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEDIRECTCONV2D
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEDIRECTCONV2D
+
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/dynamic_fusion/sketch/OperatorAttributes.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class ClTemplateDirectConv2d final : public IGpuTemplateComponentWriter
+{
+public:
+ using Attributes = ClComponentDirectConv2d::Attributes;
+ using Settings = ClComponentDirectConv2d::Settings;
+ /** Constructor
+ *
+ * For the argument descriptions, see @ref ClComponentDirectConv2d::validate()
+ *
+ * @param[in] id Component id
+ * @param[in] tensors Tensor arguments to the components
+ * @param[in] attributes Component attributes
+ * @param[in] settings Component settings
+ */
+ ClTemplateDirectConv2d(ComponentId id,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings);
+ /** Prevent instances of this class from being copy constructed */
+ ClTemplateDirectConv2d(const ClTemplateDirectConv2d &direct_conv2d) = delete;
+ /** Prevent instances of this class from being copied */
+ ClTemplateDirectConv2d &operator=(const ClTemplateDirectConv2d &direct_conv2d) = delete;
+ /** Allow instances of this class to be move constructed */
+ ClTemplateDirectConv2d(ClTemplateDirectConv2d &&direct_conv2d) = default;
+ /** Allow instances of this class to be moved */
+ ClTemplateDirectConv2d &operator=(ClTemplateDirectConv2d &&direct_conv2d) = default;
+ /** Generate kernel component name */
+ std::string get_name() const override;
+ /** Generate kernel component code template
+ *
+ * @param[in] comp_group Component group of which the component is a part
+ *
+ * @return std::string Component code
+ */
+ std::string get_component_code(const ComponentGroup &comp_group) const override;
+ /** Declare all variables used by the component in the @p vtable
+ *
+ * @param[out] vtable Variable table
+ * @param[in] comp_group Component group of which the component is a part
+ */
+ void declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
+ /** Generate the tag look-up table used to instantiate the component code.
+ *
+ * @param[in] vtable Variable table
+ * @param[in] comp_group Component group of which the component is a part
+ *
+ * @return TagLUT Tag lookup table
+ */
+ TagLUT get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
+ /** Generate the build options used in the component
+ *
+ * @param[in] comp_group Component group of which the component is a part
+ *
+ * @return CLBuildOptions Build options
+ */
+ CLBuildOptions get_build_options(const ComponentGroup &comp_group) const override;
+ /** Generate the component config id string used for tuning */
+ std::string get_config_id() const override;
+ /** Generate the header list used in the component */
+ std::set<std::string> get_headers_list() const override;
+ /** Generate the execution window for the component */
+ Window get_window() const override;
+
+private:
+ const ITensorInfo *_src;
+ const ITensorInfo *_weight;
+ const ITensorInfo *_bias;
+ const ITensorInfo *_dst;
+ Attributes _attributes;
+ Settings _settings;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEDIRECTCONV2D */
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp
new file mode 100644
index 0000000000..6c4b8f52f2
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ClTemplateStore.h"
+
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+ClTemplateStore::ClTemplateStore(ComponentId id, const ArgumentPack<ITensorInfo> &tensors)
+ : IGpuTemplateComponentWriter{ id, tensors }, _src{}, _dst{}
+{
+ _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
+ _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
+}
+
+std::string ClTemplateStore::get_name() const
+{
+ return "store";
+}
+
+std::string ClTemplateStore::get_component_code(const ComponentGroup &comp_group) const
+{
+ ARM_COMPUTE_UNUSED(comp_group);
+ return R"_(
+//------------------ START KERNEL {{meta_kernel_id}} STORE ---------------------
+{
+// This also follows NHWC layout
+// g_ind_0 maps to global_id(0) maps to Channel
+// g_ind_1 maps to global_id(1) maps to Width and Height (Collapsed Window)
+// g_ind_2 maps to global_id(2) maps to N / Batch
+#define _IDST_WIDTH {{dst}}_w
+#define _IDST_HEIGHT {{dst}}_h
+ TILE(uint, M0, 1, dst_indirect_y);
+
+ // Calculate the destination indirect Y
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ dst_indirect_y[i].v = (uint)min(g_ind_1 + i, (int)(_IDST_WIDTH * _IDST_HEIGHT) - 1);
+ dst_indirect_y[i].v += g_ind_2 * (int)(_IDST_WIDTH * _IDST_HEIGHT);
+ })
+
+ bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
+
+ T_STORE_INDIRECT_WIDTH_SELECT({{DST_DATA_TYPE}}, M0, N0, PARTIAL_N0, {{DST_TENSOR_TYPE}}, {{dst}}, g_ind_0, {{dst}}_stride_y, x_cond, {{src}}, dst_indirect_y);
+
+#undef _IDST_WIDTH
+#undef _IDST_HEIGHT
+ //------------------ END KERNEL {{meta_kernel_id}} STORE ---------------------
+}
+
+)_";
+}
+
+void ClTemplateStore::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
+{
+ vtable.declare_variable(
+ _src,
+ GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+ comp_group.is_intermediate_tensor(_src),
+ "src");
+ vtable.declare_variable(
+ _dst,
+ GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+ comp_group.is_intermediate_tensor(_dst),
+ "dst");
+}
+
+TagLUT ClTemplateStore::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
+{
+ TagLUT lut{};
+
+ // Arguments and global shared variables
+ lut["src"] = vtable.get_variable(_src);
+ lut["dst"] = vtable.get_variable(_dst);
+
+ // Local build options
+ lut["meta_kernel_id"] = id();
+ lut["DST_TENSOR_TYPE"] = "BUFFER";
+ const auto dst_info = comp_group.get_dst_tensors()[0];
+ lut["DST_DATA_TYPE"] = dst_info->data_type();
+
+ return lut;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h
new file mode 100644
index 0000000000..3f97a82204
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATESTORE
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATESTORE
+
+#include "arm_compute/core/experimental/Types.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class ClTemplateStore final : public IGpuTemplateComponentWriter
+{
+public:
+ /** Constructor
+ *
+ * @param[in] id Component id
+ * @param[in] tensors Tensor arguments to the components
+ */
+ ClTemplateStore(ComponentId id, const ArgumentPack<ITensorInfo> &tensors);
+ /** Prevent instances of this class from being copy constructed */
+ ClTemplateStore(const ClTemplateStore &store) = delete;
+ /** Prevent instances of this class from being copied */
+ ClTemplateStore &operator=(const ClTemplateStore &store) = delete;
+ /** Allow instances of this class to be move constructed */
+ ClTemplateStore(ClTemplateStore &&store) = default;
+ /** Allow instances of this class to be moved */
+ ClTemplateStore &operator=(ClTemplateStore &&store) = default;
+ /** Generate kernel component name */
+ std::string get_name() const override;
+ /** Generate kernel component code template
+ *
+ * @param[in] comp_group Component group of which the component is a part
+ *
+ * @return std::string Component code
+ */
+ std::string get_component_code(const ComponentGroup &comp_group) const override;
+ /** Declare all variables used by the component in the @p vtable
+ *
+ * @param[out] vtable Variable table
+ * @param[in] comp_group Component group of which the component is a part
+ */
+ void declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
+ /** Generate the tag look-up table used to instantiate the component code.
+ *
+ * @param[in] vtable Variable table
+ * @param[in] comp_group Component group of which the component is a part
+ *
+ * @return TagLUT Tag lookup table
+ */
+ TagLUT get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
+
+private:
+ const ITensorInfo *_src;
+ const ITensorInfo *_dst;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATESTORE */
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp
new file mode 100644
index 0000000000..cb643a741d
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp
@@ -0,0 +1,297 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ClTemplateWriter.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/// @note Some tags may be unused, as they may apply only to the additional macros or only to the component code
+std::string ClTemplateWriter::replace_tags(const std::string &code_template, const TagLUT &tags)
+{
+ std::string replaced_code = "";
+ bool scanning_pattern = false;
+ std::string pattern_found = "";
+ // Scan the whole template; the i + 1 look-aheads below are bounds-checked so the final character is not dropped
+ for(size_t i = 0; i < code_template.size(); ++i)
+ {
+ if(!scanning_pattern)
+ {
+ if(i + 1 < code_template.size() && code_template[i] == '{' && code_template[i + 1] == '{')
+ {
+ i += 1;
+ scanning_pattern = true;
+ pattern_found = "";
+ }
+ else
+ {
+ replaced_code += code_template[i];
+ }
+ }
+ else
+ {
+ if(i + 1 < code_template.size() && code_template[i] == '}' && code_template[i + 1] == '}')
+ {
+ i += 1;
+ scanning_pattern = false;
+ std::string err = "Pattern " + pattern_found + " not found in tags";
+ ARM_COMPUTE_ERROR_ON_MSG(tags.find(pattern_found) == tags.end(), err.c_str());
+ replaced_code += tags.find(pattern_found)->second.value;
+ }
+ else
+ {
+ pattern_found += code_template[i];
+ }
+ }
+ }
+
+ return replaced_code;
+}
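+// Example (illustrative only): replace_tags("int x = {{N}};", TagLUT{ { "N", 4 } }) returns "int x = 4;"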
+ClTemplateWriter::~ClTemplateWriter()
+{
+}
+ClTemplateWriter::ClTemplateWriter(const GpuKernelComponentGroup &components)
+ : _components{ components }
+{
+}
+std::string ClTemplateWriter::get_name()
+{
+ return write_kernel_name();
+}
+std::string ClTemplateWriter::get_code()
+{
+ return write_code();
+}
+std::string ClTemplateWriter::get_config_id()
+{
+ std::string config_id = get_name();
+ for(const auto &comp : _components)
+ {
+ config_id += "--" + comp->template_writer()->get_config_id() + "--";
+ }
+
+ return config_id;
+}
+
+CLBuildOptions ClTemplateWriter::get_build_options()
+{
+ CLBuildOptions build_opts{};
+
+ for(const auto &comp : _components)
+ {
+ build_opts.add_options(comp->template_writer()->get_build_options(_components).options());
+ }
+
+ return build_opts;
+}
+
+Window ClTemplateWriter::get_window() const
+{
+ const auto root_comp = _components.get_root_component();
+ ARM_COMPUTE_ERROR_ON_MSG(root_comp == nullptr, "No root component found");
+ return root_comp->template_writer()->get_window();
+}
+
+std::map<ITensorInfo::Id, GpuKernelArgument> ClTemplateWriter::get_tensors()
+{
+ // Assemble GpuKernelArguments
+ std::map<ITensorInfo::Id, GpuKernelArgument> tensors;
+ for(const auto t : _components.get_argument_tensors())
+ {
+ tensors.emplace(
+ t->id(),
+ GpuKernelArgument{ *t, _vtable.get_variable(t).kernel_argument_info });
+ }
+ return tensors;
+}
+
+std::string ClTemplateWriter::write_code()
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_components.empty(), "No components found");
+
+ // These data structures will hold the data from all the components in the component group
+ std::set<std::string> headers_list{};
+ std::set<std::string> additional_macros{};
+ std::vector<std::string> component_codes{}; // vector because order matters
+
+ // Pass 1: Declare all kernel variables
+ for(auto &component : _components)
+ {
+ component->template_writer()->declare_variables(_vtable, _components);
+ }
+ // Pass 2: Generate component codes
+ for(auto &component : _components)
+ {
+ const auto component_writer = component->template_writer();
+ auto curr_headers_list = component_writer->get_headers_list();
+ auto curr_additional_macros = component_writer->get_additional_macros();
+ auto curr_component_code = component_writer->get_component_code(_components);
+ const auto var_lut = component_writer->get_tag_lut(_vtable, _components); // Ideally this can be merged into get_component_code once we have a finer-grained code generation technique
+ component_codes.push_back(replace_tags(curr_component_code, var_lut));
+
+ headers_list.insert(curr_headers_list.begin(), curr_headers_list.end());
+ if(!curr_additional_macros.empty()) // Some components might not have any
+ {
+ additional_macros.insert(replace_tags(curr_additional_macros, var_lut));
+ }
+ }
+
+ // Pass 3: Assemble the data gathered by traversing the graph into the string "code"
+ std::string code = "";
+
+ for(auto &header : headers_list)
+ {
+#if defined(EMBEDDED_KERNELS)
+ code += CLKernelLibrary::get().get_program(header).first;
+#else // defined(EMBEDDED_KERNELS)
+ code += "#include \"" + header + "\"\n";
+#endif // defined(EMBEDDED_KERNELS)
+ }
+
+ for(auto &macros : additional_macros)
+ {
+ code += macros;
+ }
+
+ code += write_kernel_signature(_vtable.get_variable_list(_components.get_argument_tensors()));
+
+ code += "\n{\n\n";
+
+ code += " //------------------ START KERNEL_BUILDER_COORDINATE ---------------------\n\n";
+ code += write_global_section();
+ code += " //------------------ END KERNEL_BUILDER_COORDINATE ---------------------\n";
+
+ for(const auto &component_code : component_codes)
+ {
+ code += component_code;
+ }
+
+ code += "}\n";
+
+ return code;
+}
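+// Rough shape of the assembled source, for illustration only:
+//   <header includes, or embedded header programs>
+//   <additional macros>
+//   __kernel void <fused_kernel_name>(<argument declarations>)
+//   {
+//     <global section: g_ind_0/1/2 and g_cond_x/y>
+//     <component code 0>
+//     <component code 1>
+//     ...
+//   }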
+std::string ClTemplateWriter::write_global_section() const
+{
+ const auto dst_tensors = _components.get_dst_tensors();
+ ARM_COMPUTE_ERROR_ON_MSG(dst_tensors.size() != 1, "Only one destination tensor per kernel is allowed");
+ const auto dst_info = dst_tensors[0];
+ const auto dst_w = dst_info->dimension(0);
+ const auto win = get_window();
+ const auto tile_w = std::max(1, win.x().step());
+ const auto tile_h = std::max(1, win.y().step());
+ const auto leftover_w = dst_w % tile_w;
+
+ std::string code = "";
+ code += std::string(" int g_ind_0 = GET_SPATIAL_IDX(0, ") + std::to_string(tile_w) + ", " + std::to_string(leftover_w) + ");\n";
+ code += std::string(" int g_ind_1 = GET_SPATIAL_IDX(1, ") + std::to_string(tile_h) + ", " + "0);\n";
+ code += std::string(" int g_ind_2 = GET_SPATIAL_IDX(2, 1, 0);\n\n");
+
+ code += " const bool g_cond_x = (g_ind_0 == 0);\n";
+ code += " const bool g_cond_y = (g_ind_1 == 0);\n";
+
+ return code;
+}
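+// Illustration: with tile_w == 4, tile_h == 1 and dst_w == 17 (hence leftover_w == 1),
+// the emitted global section reads:
+//   int g_ind_0 = GET_SPATIAL_IDX(0, 4, 1);
+//   int g_ind_1 = GET_SPATIAL_IDX(1, 1, 0);
+//   int g_ind_2 = GET_SPATIAL_IDX(2, 1, 0);
+//
+//   const bool g_cond_x = (g_ind_0 == 0);
+//   const bool g_cond_y = (g_ind_1 == 0);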
+std::string ClTemplateWriter::write_argument_declaration(const GpuKernelVariableTable::TensorVariable &var) const
+{
+ std::string code;
+ switch(var.kernel_argument_info.type)
+ {
+ case GpuKernelArgumentInfo::Type::Vector:
+ {
+ code += "\n VECTOR_DECLARATION(" + var.uniq_name + ")";
+ break;
+ }
+ case GpuKernelArgumentInfo::Type::Image:
+ {
+ code += "\n IMAGE_DECLARATION(" + var.uniq_name + ")";
+ break;
+ }
+ case GpuKernelArgumentInfo::Type::Image_3D:
+ {
+ code += "\n IMAGE_DECLARATION(" + var.uniq_name + "),";
+ code += "\n unsigned int " + var.uniq_name + "_stride_z";
+ break;
+ }
+ case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D:
+ {
+ code += "\n __read_only image2d_t " + var.uniq_name + "_img,";
+ code += "\n unsigned int " + var.uniq_name + "_stride_z";
+ break;
+ }
+ case GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer:
+ {
+ code += "\n TENSOR4D_T(" + var.uniq_name + ", BUFFER)";
+ break;
+ }
+ case GpuKernelArgumentInfo::Type::Tensor_4D_t_Image:
+ {
+ code += "\n TENSOR4D_T(" + var.uniq_name + ", IMAGE)";
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Unsupported declaration generation for GpuKernelArgumentInfo::Type");
+ }
+ }
+ return code;
+}
+std::string ClTemplateWriter::write_kernel_signature(const GpuKernelVariableTable::VariableList &argument_list) const
+{
+ std::string code = "\n__kernel void " + write_kernel_name() + "(";
+
+ for(size_t i = 0; i < argument_list.size(); ++i)
+ {
+ code += write_argument_declaration(argument_list[i]);
+ if(i + 1 < argument_list.size())
+ {
+ code += ",";
+ }
+ }
+
+ code += ')';
+
+ return code;
+}
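+// Illustration (hypothetical tensor names): for two Image arguments "g_src" and
+// "g_dst", the emitted signature is
+//   __kernel void <fused_kernel_name>(
+//    IMAGE_DECLARATION(g_src),
+//    IMAGE_DECLARATION(g_dst))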
+std::string ClTemplateWriter::write_kernel_name() const
+{
+ if(_components.empty())
+ {
+ return "empty_kernel";
+ }
+ std::string name = _components[0]->template_writer()->get_name();
+ for(size_t i = 1; i < _components.size(); ++i)
+ {
+ name += "___";
+ name += _components[i]->template_writer()->get_name();
+ }
+
+ return name;
+}
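+// Illustration (hypothetical component names): fusing two components reporting the
+// names "direct_conv2d" and "elementwise_add" yields the kernel name
+//   "direct_conv2d___elementwise_add"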
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.h
new file mode 100644
index 0000000000..83f617b6c6
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEWRITER
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEWRITER
+
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
+
+#include <map>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Use a template-string-based method to write kernel code.
+ * It stitches the component code templates together based on the valid fusion configuration,
+ * and then instantiates the actual kernel code from the stitched template and the generated tag lookup table.
+ */
+class ClTemplateWriter : public IGpuKernelWriter
+{
+public:
+ /** Instantiates a kernel code string from the kernel code template
+ * @note Some tags in the lookup table may be unused, as a tag can be referenced only by the additional macros or only by the component code
+ *
+ * @param[in] code_template Kernel code template
+ * @param[in] tags Tag lookup table
+ *
+ * @return std::string Instantiated kernel string
+ */
+ static std::string replace_tags(const std::string &code_template, const TagLUT &tags);
+ /** Default constructor */
+ ClTemplateWriter() = default;
+ /** Constructor
+ *
+ * @param[in] components Kernel component group from which the kernel will be generated
+ */
+ ClTemplateWriter(const GpuKernelComponentGroup &components);
+ /** Destructor */
+ ~ClTemplateWriter() override;
+ /** Generate kernel name */
+ std::string get_name() override;
+ /** Generate kernel code */
+ std::string get_code() override;
+ /** Generate build options */
+ CLBuildOptions get_build_options() override;
+ /** Generate config id string of the entire kernel. This is used for tuning */
+ std::string get_config_id() override;
+ /** Generate execution window */
+ Window get_window() const override;
+ /** Get the kernel argument list of the kernel */
+ std::map<ITensorInfo::Id, GpuKernelArgument> get_tensors() override;
+
+private:
+ std::string write_kernel_name() const;
+ std::string write_code();
+ std::string write_global_section() const;
+ std::string write_argument_declaration(const GpuKernelVariableTable::TensorVariable &var) const;
+ std::string write_kernel_signature(const GpuKernelVariableTable::VariableList &argument_list) const;
+
+private:
+ GpuKernelComponentGroup _components{};
+ GpuKernelVariableTable _vtable{};
+};
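+// Usage sketch, for illustration only (assumes an already-populated component group):
+//   GpuKernelComponentGroup components = ...;
+//   ClTemplateWriter writer{ components };
+//   const std::string name = writer.get_name();
+//   const std::string code = writer.get_code();
+//   const CLBuildOptions build_opts = writer.get_build_options();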
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEWRITER */