author    SiCong Li <sicong.li@arm.com>  2022-08-29 18:25:51 +0100
committer SiCong Li <sicong.li@arm.com>  2022-11-01 10:38:21 +0000
commit    f44bbc5c697de841dce97c0f2fa39bae391a8174 (patch)
tree      56468ef833726318e545043f4abcd16ad3775094
parent    3394f3e3df7fd2d924c41822a8564493fc06473a (diff)
Rewrite dynamic fusion
The new version introduces the following major changes:

* Change public interface to simplify and standardize the user experience
  - Use the term "Workload" uniformly
  - Simplify operator interface to be a set of static methods: validate_op(), create_op()
* Separate the kernel writing into its own component (template_writer). This is to allow the
  co-development of GpuKernelWriter, and to allow easy replacement once GpuKernelWriter is mature.
* Optimize the core fusion algorithm used by the component graph. The details can be found in
  GpuKernelComponentGraph::fuse()
* Use Gpu instead of Cl prefixes for most of the Workload interfaces (except for runtime and
  kernel components, which have to be language specific). This allows the potential extension
  to other Gpu languages in the future.
* Refactor the runtime memory interface so that auxiliary tensor handling is separate from the
  user tensor passing. This is because the former is less stable and may require extension in
  the future.
* Hide the source code object from the user as it is not required at the moment
* Deprecate the old prototype entirely by disabling it in the SCons build

Resolves COMPMID-5510, COMPMID-5512, COMPMID-5513

Change-Id: If69d2362856f2de4503546b7b6cf48a525cf3079
Signed-off-by: SiCong Li <sicong.li@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8406
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Reviewed-by: Jakub Sujak <jakub.sujak@arm.com>
Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
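
Illustration (not part of the patch): a minimal usage sketch of the new workload-creation interface, assembled from the public headers added below (GpuWorkloadContext.h, GpuWorkloadSketch.h, GpuConv2d.h). Tensor shapes and variable names are illustrative assumptions; the runtime side is sketched after ClWorkloadRuntime.h further down.

    #include "arm_compute/core/CL/CLKernelLibrary.h"
    #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h"
    #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
    #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h"

    using namespace arm_compute;
    using namespace arm_compute::experimental::dynamic_fusion;

    // One context is shared by all operators within a sketch (and may be shared between sketches)
    auto              &cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
    GpuWorkloadContext context{ &cl_compile_ctx };
    GpuWorkloadSketch  sketch{ &context };

    // Tensor infos are created through the sketch so that each receives a workload-unique id
    TensorInfo src = sketch.create_tensor_info(TensorShape(32U, 17U, 17U), 1, DataType::F32, DataLayout::NHWC);
    TensorInfo wei = sketch.create_tensor_info(TensorShape(32U, 3U, 3U, 64U), 1, DataType::F32, DataLayout::NHWC);
    TensorInfo dst = sketch.create_tensor_info(); // Destination info is inferred by the operator

    // validate_op() mirrors create_op(); if validation fails, the sketch is left unchanged
    const Conv2dAttributes attributes{};
    if(bool(GpuConv2d::validate_op(sketch, &src, &wei, nullptr, &dst, attributes)))
    {
        GpuConv2d::create_op(sketch, &src, &wei, nullptr, &dst, attributes);
    }
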
-rw-r--r--  Android.bp  18
-rw-r--r--  arm_compute/core/CL/CLCompileContext.h  6
-rw-r--r--  arm_compute/core/ITensorInfo.h  22
-rw-r--r--  arm_compute/core/SubTensorInfo.h  13
-rw-r--r--  arm_compute/core/TensorInfo.h  15
-rw-r--r--  arm_compute/core/Types.h  16
-rw-r--r--  arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h  80
-rw-r--r--  arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h  76
-rw-r--r--  arm_compute/dynamic_fusion/sketch/OperatorAttributes.h  65
-rw-r--r--  arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h  87
-rw-r--r--  arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h  100
-rw-r--r--  arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h  84
-rw-r--r--  examples/SConscript  9
-rw-r--r--  filelist.json  31
-rw-r--r--  src/core/CL/CLCompileContext.cpp  6
-rw-r--r--  src/core/CL/ICLKernel.h  1
-rw-r--r--  src/core/TensorInfo.cpp  24
-rw-r--r--  src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp  200
-rw-r--r--  src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h  76
-rw-r--r--  src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp  351
-rw-r--r--  src/dynamic_fusion/sketch/ArgumentPack.h  242
-rw-r--r--  src/dynamic_fusion/sketch/OperatorAttributes.cpp  63
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuComponentServices.h  54
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuKernelArgument.cpp  37
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h  128
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp  125
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h  104
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp  291
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h  143
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp  69
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h  88
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h  126
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp  87
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.h  77
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp  172
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h  111
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuOperatorProperties.h  54
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp  55
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp  76
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h  111
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h  252
-rw-r--r--  src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h  66
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h  64
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h  119
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/Types.h  52
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp  152
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h  147
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp  57
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h  102
-rw-r--r--  src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp  255
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp  109
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h  135
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h  137
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp  400
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h  113
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp  113
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h  85
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp  297
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.h  92
-rw-r--r--  src/dynamic_fusion/sketch/utils/DependencyGraph.h  658
-rw-r--r--  tests/SConscript  15
-rw-r--r--  tests/validation/dynamic_fusion/gpu/Integration.cpp  201
62 files changed, 7068 insertions, 46 deletions
diff --git a/Android.bp b/Android.bp
index 8c6d700062..d02d436fa0 100644
--- a/Android.bp
+++ b/Android.bp
@@ -592,6 +592,24 @@ cc_library_static {
"src/cpu/operators/CpuTranspose.cpp",
"src/cpu/operators/CpuWinogradConv2d.cpp",
"src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp",
+ "src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp",
+ "src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp",
+ "src/dynamic_fusion/sketch/OperatorAttributes.cpp",
+ "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.cpp",
+ "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp",
+ "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp",
+ "src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp",
+ "src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp",
+ "src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp",
+ "src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp",
+ "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp",
+ "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp",
+ "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp",
+ "src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp",
+ "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp",
+ "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp",
+ "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp",
+ "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp",
"src/gpu/cl/ClContext.cpp",
"src/gpu/cl/ClKernelLibrary.cpp",
"src/gpu/cl/ClQueue.cpp",
diff --git a/arm_compute/core/CL/CLCompileContext.h b/arm_compute/core/CL/CLCompileContext.h
index e8f2ff35da..60e0f95f83 100644
--- a/arm_compute/core/CL/CLCompileContext.h
+++ b/arm_compute/core/CL/CLCompileContext.h
@@ -310,6 +310,12 @@ public:
*/
int32_t get_ddk_version() const;
+ /** Return the Gpu target of the associated device
+ *
+ * @return GPUTarget
+ */
+ GPUTarget get_gpu_target() const;
+
private:
/** Load program and its dependencies.
*
diff --git a/arm_compute/core/ITensorInfo.h b/arm_compute/core/ITensorInfo.h
index 6839d697e3..ca2837e450 100644
--- a/arm_compute/core/ITensorInfo.h
+++ b/arm_compute/core/ITensorInfo.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -41,6 +41,11 @@ class ITensorInfo : public misc::ICloneable<ITensorInfo>
{
public:
using TensorDimsState = std::vector<int>;
+ /** An id that uniquely identifies an ITensorInfo within some domain (e.g. a workload)
+ */
+ using Id = int32_t;
+ /** An invalid tensor id within a domain */
+ static constexpr Id invalid_tensor_id = -1;
/** Get the value representing dynamic dimension state
*
* @return Value representing dynamic dimension state
@@ -280,7 +285,20 @@ public:
* @return A DataLayout containing the layout data information.
*/
virtual DataLayout data_layout() const = 0;
-
+ /** Get the workload tensor id of the tensor.
+ *
+ * @return Workload tensor id of the tensor
+ */
+ virtual Id id() const = 0;
+ /** Set the tensor id
+ */
+ virtual ITensorInfo &set_id(ITensorInfo::Id id) = 0;
+ /** Check if the tensor id is valid
+ */
+ bool has_valid_id() const
+ {
+ return id() != invalid_tensor_id;
+ }
/** If infos are broadcast compatible tensor info's, return the broadcasted shape and the intersection of
* the broadcasted valid regions of the tensors.
*
diff --git a/arm_compute/core/SubTensorInfo.h b/arm_compute/core/SubTensorInfo.h
index 54836d0528..374ea5b8c6 100644
--- a/arm_compute/core/SubTensorInfo.h
+++ b/arm_compute/core/SubTensorInfo.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -237,6 +237,17 @@ public:
ARM_COMPUTE_ERROR_ON(_parent == nullptr);
return _parent->data_layout();
}
+ ITensorInfo::Id id() const override
+ {
+ ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+ return _parent->id();
+ }
+ ITensorInfo &set_id(ITensorInfo::Id id) override
+ {
+ ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+ _parent->set_id(id);
+ return *this;
+ }
private:
ITensorInfo *_parent;
diff --git a/arm_compute/core/TensorInfo.h b/arm_compute/core/TensorInfo.h
index 40f9ed9806..7eb8c52d07 100644
--- a/arm_compute/core/TensorInfo.h
+++ b/arm_compute/core/TensorInfo.h
@@ -50,7 +50,7 @@ public:
/** Allow instances of this class to be copy constructed */
TensorInfo(const ITensorInfo &info);
/** Allow instances of this class to be copy constructed */
- TensorInfo(const TensorInfo &) = default;
+ TensorInfo(const TensorInfo &);
/** Allow instances of this class to be copied */
TensorInfo &operator=(const TensorInfo &) = default;
/** Allow instances of this class to be move constructed */
@@ -297,6 +297,15 @@ public:
_are_values_constant = are_values_constant;
return *this;
}
+ ITensorInfo::Id id() const override
+ {
+ return _id;
+ }
+ ITensorInfo &set_id(ITensorInfo::Id id) override
+ {
+ _id = id;
+ return *this;
+ }
inline friend bool operator==(const TensorInfo &lhs, const TensorInfo &rhs);
private:
@@ -320,6 +329,7 @@ private:
QuantizationInfo _quantization_info;
DataLayout _data_layout;
bool _are_values_constant;
+ ITensorInfo::Id _id;
};
/** Check whether two tensor info are equal.
@@ -334,7 +344,8 @@ inline bool operator==(const TensorInfo &lhs, const TensorInfo &rhs)
return (lhs._total_size == rhs._total_size) && (lhs._offset_first_element_in_bytes == rhs._offset_first_element_in_bytes) && (lhs._strides_in_bytes == rhs._strides_in_bytes)
&& (lhs._num_channels == rhs._num_channels) && (lhs._tensor_shape == rhs._tensor_shape) && (lhs._dims_state == rhs._dims_state) && (lhs._data_type == rhs._data_type) && (lhs._format == rhs._format)
&& (lhs._is_resizable == rhs._is_resizable) && (lhs._valid_region == rhs._valid_region) && (lhs._padding == rhs._padding) && (lhs._quantization_info == rhs._quantization_info)
- && (lhs._data_layout == rhs._data_layout) && (lhs._are_values_constant == rhs._are_values_constant);
+ && (lhs._data_layout == rhs._data_layout) && (lhs._are_values_constant == rhs._are_values_constant)
+ && (lhs._id == rhs._id);
}
} // namespace arm_compute
#endif /*ARM_COMPUTE_TENSORINFO_H */
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
index fc6d46c53b..b0a6475527 100644
--- a/arm_compute/core/Types.h
+++ b/arm_compute/core/Types.h
@@ -782,6 +782,20 @@ private:
DimensionRoundingType _round_type;
};
+/** Padding information for 2D operations like Conv2d */
+struct Padding2D
+{
+ Padding2D() = default;
+ Padding2D(size_t left, size_t right, size_t top, size_t bottom)
+ : left(left), right(right), top(top), bottom(bottom)
+ {
+ }
+ size_t left = { 0 }; /**< Padding across the width dimension on the left, in elements. */
+ size_t right = { 0 }; /**< Padding across the width dimension on the right, in elements. */
+ size_t top = { 0 }; /**< Padding across the height dimension on the top, in elements. */
+ size_t bottom = { 0 }; /**< Padding across the height dimension on the bottom, in elements. */
+};
+
/** Padding information for 3D operations like Conv3d */
struct Padding3D
{
@@ -1642,7 +1656,7 @@ public:
LINEAR, /**< Linear ( \f$ f(x)= ax + b \f$ ) */
IDENTITY, /**< Identity ( \f$ f(x)= x \f$ ) */
HARD_SWISH, /**< Hard-swish ( \f$ f(x) = (x \text{ReLU6}(x+3))/6 = x \min(\max(0,x+3),6)/6 \f$ ) */
- SWISH, /**< Swish ( \f$ f(x) = \frac{x}{1 + e^{-ax}} = x \text{logistic}(ax) \f$ ) */
+ SWISH, /**< Swish ( \f$ f(x) = \frac{x}{1 + e^{-ax}} = x \text{logistic}(ax) \f$ ) */
GELU /**< GELU ( \f$ f(x) = x * 1/2 * 1 + erf(x / \sqrt{2}) \f$ ) */
};
diff --git a/arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h b/arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h
new file mode 100644
index 0000000000..326880f721
--- /dev/null
+++ b/arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_DYNAMIC_FUSION_RUNTIME_GPU_CL_CLWORKLOADRUNTIME
+#define ARM_COMPUTE_DYNAMIC_FUSION_RUNTIME_GPU_CL_CLWORKLOADRUNTIME
+
+#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h"
+
+#include <map>
+#include <memory>
+
+namespace arm_compute
+{
+/** Forward declaration */
+class CLTensor;
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+class GpuWorkloadSketch;
+
+/** OpenCL runtime to run a workload
+ */
+class ClWorkloadRuntime
+{
+public:
+ ClWorkloadRuntime();
+ ~ClWorkloadRuntime();
+ /** Configure @ref ClWorkloadRuntime
+ * @note A runtime cannot be re-configured
+ *
+ * @param[in] sketch @ref GpuWorkloadSketch with which to configure
+ */
+ Status configure(const GpuWorkloadSketch &sketch);
+ /** Run the workload
+ * @note If the runtime is not configured, this method will not perform any action
+ *
+ * @param[in,out] tensors Tensors required by the run workloads
+ *
+ * @return Status If the run is successful
+ */
+ Status run(const std::vector<CLTensor *> &tensors);
+ /** Get auxiliary tensors of the workload and their memory requirement
+ */
+ std::vector<std::pair<CLTensor *, AuxMemoryInfo>> get_auxiliary_tensors();
+
+private:
+ /** Enqueue the prepare workloads
+ * @note If the runtime is not configured, this method will not perform any action
+ */
+ void prepare();
+ struct Implementation;
+ std::unique_ptr<Implementation> _impl;
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_DYNAMIC_FUSION_RUNTIME_GPU_CL_CLWORKLOADRUNTIME */
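
Illustration (not part of the patch), continuing the creation sketch shown after the commit message: the runtime side of the new interface. It assumes `sketch` was built as in that example, that configure() fills in the infos of the auxiliary tensors it returns, and that t_src, t_wei and t_dst are user CLTensors allocated against the sketch's tensor infos (all illustrative names).

    ClWorkloadRuntime runtime;
    runtime.configure(sketch);

    // Allocate whatever auxiliary memory the fused workload requests
    for(auto &data : runtime.get_auxiliary_tensors())
    {
        CLTensor           *tensor  = data.first;
        const AuxMemoryInfo aux_req = data.second;
        tensor->allocator()->init(*tensor->info(), aux_req.alignment);
        tensor->allocator()->allocate();
    }

    // Run with the user tensors (their expected ordering here is an assumption)
    runtime.run({ &t_src, &t_wei, &t_dst });
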
diff --git a/arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h b/arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h
new file mode 100644
index 0000000000..deedf62262
--- /dev/null
+++ b/arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_MEMORYDESCRIPTOR
+#define ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_MEMORYDESCRIPTOR
+
+#include "arm_compute/core/ITensorInfo.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Type of memory used by a workload tensor */
+enum class MemoryType
+{
+ User = 0, /**< Memory coming directly from users, e.g. for argument tensors */
+ Auxiliary = 1, /**< Additional memory required by the workload tensor, e.g. for temporary tensors */
+};
+
+/** Memory information for tensors with @ref MemoryType::Auxiliary.
+ * This informs how much additional memory is required for auxiliary tensors
+ */
+struct AuxMemoryInfo
+{
+ AuxMemoryInfo() = default;
+
+ AuxMemoryInfo(size_t size, size_t alignment = 0) noexcept
+ : size(size),
+ alignment(alignment)
+ {
+ }
+
+ friend bool operator==(const AuxMemoryInfo &info0, const AuxMemoryInfo &info1)
+ {
+ return info0.size == info1.size && info0.alignment == info1.alignment;
+ }
+ size_t size{ 0 }; /**< Total memory size in bytes */
+ size_t alignment{ 0 }; /**< Memory alignment in bytes */
+};
+
+/** Descriptor of a workload tensor memory */
+struct MemoryDescriptor
+{
+ MemoryType memory_type{}; /**< Memory Type*/
+ AuxMemoryInfo aux_memory_info{}; /**< Auxiliary Tensor Memory Information */
+};
+
+/** A map from @ref ITensorInfo to their corresponding @ref MemoryDescriptor */
+using MemoryDescriptorMap = std::map<ITensorInfo::Id, MemoryDescriptor>;
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_MEMORYDESCRIPTOR */
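
Illustration (not part of the patch): how a MemoryDescriptorMap entry might describe a user tensor versus an auxiliary tensor, using the types declared above. The tensor ids (0 and 1) are hypothetical placeholders.

    MemoryDescriptor user_desc{};
    user_desc.memory_type = MemoryType::User; // argument tensor, memory supplied by the user

    MemoryDescriptor aux_desc{};
    aux_desc.memory_type     = MemoryType::Auxiliary;
    aux_desc.aux_memory_info = AuxMemoryInfo{ 1024, 64 }; // temporary tensor: 1024 bytes, 64-byte alignment

    MemoryDescriptorMap mem_map{};
    mem_map[0] = user_desc;
    mem_map[1] = aux_desc;
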
diff --git a/arm_compute/dynamic_fusion/sketch/OperatorAttributes.h b/arm_compute/dynamic_fusion/sketch/OperatorAttributes.h
new file mode 100644
index 0000000000..22c6772926
--- /dev/null
+++ b/arm_compute/dynamic_fusion/sketch/OperatorAttributes.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_OPERATORATTRIBUTES
+#define ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_OPERATORATTRIBUTES
+
+#include "arm_compute/core/Size2D.h"
+#include "arm_compute/core/Types.h"
+#include <cstdint>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Attributes are backend-agnostic parameters (in addition to the input/output tensors) of an operator.
+ */
+
+/** Conv2d attributes */
+class Conv2dAttributes
+{
+public:
+ /** Set padding */
+ Conv2dAttributes &pad(const Padding2D &pad);
+ /** Get padding */
+ Padding2D pad() const;
+ /** Set stride */
+ Conv2dAttributes &stride(const Size2D &stride);
+ /** Get stride */
+ Size2D stride() const;
+ /** Set dilation */
+ Conv2dAttributes &dilation(const Size2D &dilation);
+ /** Get dilation */
+ Size2D dilation() const;
+
+private:
+ Padding2D _pad{}; /**< Padding */
+ Size2D _stride{ 1U, 1U }; /**< Stride */
+ Size2D _dilation{ 1U, 1U }; /**< Dilation */
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_OPERATORATTRIBUTES */
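
Illustration (not part of the patch): the setters above return a reference, so attributes can be built up in a chain. The values are illustrative (e.g. "same" padding for a 3x3 kernel).

    Conv2dAttributes attributes{};
    attributes.pad(Padding2D(1, 1, 1, 1)) // left, right, top, bottom
              .stride(Size2D(1U, 1U))
              .dilation(Size2D(1U, 1U));
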
diff --git a/arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h b/arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h
new file mode 100644
index 0000000000..1ee3c7e3ec
--- /dev/null
+++ b/arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADCONTEXT
+#define ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADCONTEXT
+
+#include "arm_compute/core/GPUTarget.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+/** Forward declaration */
+class CLCompileContext;
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Gpu Information such as the Gpu target (for example, G76) */
+using GpuTarget = ::arm_compute::GPUTarget;
+
+/** Gpu Language */
+enum class GpuLanguage
+{
+ OpenCL,
+ Unknown
+};
+/** Provide context necessary for the creation and configuration of a workload
+ * e.g. gpu targets and capabilities, cl::Device for querying OpenCL extensions. Both can affect how a kernel is generated
+ *
+ * This context is shared between different operators within a sketch, and has to stay valid for the entire workload creation session.
+ * This context may also be shared between different sketches.
+ *
+ * This class only contains information for workload creation, but not for runtime (e.g. cl::Queue for enqueueing the kernels)
+ */
+class GpuWorkloadContext
+{
+public:
+ /** Constructor */
+ GpuWorkloadContext(CLCompileContext *cl_compile_context);
+ /** Allow instances of this class to be copy constructed */
+ GpuWorkloadContext(const GpuWorkloadContext &config) = default;
+ /** Allow instances of this class to be copied */
+ GpuWorkloadContext &operator=(const GpuWorkloadContext &config) = default;
+ /** Allow instances of this class to be move constructed */
+ GpuWorkloadContext(GpuWorkloadContext &&config) = default;
+ /** Allow instances of this class to be moved */
+ GpuWorkloadContext &operator=(GpuWorkloadContext &&config) = default;
+ /** Get @ref GpuLanguage of the context */
+ GpuLanguage gpu_language() const;
+ /** Get @ref GpuTarget of the context */
+ GpuTarget gpu_target() const;
+ /** Get @ref CLCompileContext
+ * If the gpu language is not OpenCL, then return nullptr
+ */
+ const CLCompileContext *cl_compile_context() const;
+
+private:
+ GpuLanguage _gpu_language{ GpuLanguage::Unknown };
+ CLCompileContext *_cl_compile_ctx{ nullptr };
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif /* ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADCONTEXT */
diff --git a/arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h b/arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h
new file mode 100644
index 0000000000..afbe2b8d0b
--- /dev/null
+++ b/arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSKETCH
+#define ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSKETCH
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** A descriptor of a workload of operators
+ *
+ * A "workload" is a basic unit of computation to schedule and perform. It contains one or more operators that can be "fused" together.
+ * Note that a workload may still contain multiple kernels.
+ */
+class GpuWorkloadSketch
+{
+public:
+ /** Global context used for the creation of a workload */
+ using Context = GpuWorkloadContext;
+ /** Internal opaque implementation */
+ class Implementation;
+
+public:
+ /** Constructor
+ *
+ * @param[in] context Gpu context for the creation of a workload
+ */
+ explicit GpuWorkloadSketch(GpuWorkloadContext *context);
+ /** Destructor */
+ ~GpuWorkloadSketch();
+ /** Get the implementation */
+ Implementation &implementation();
+ /** Get the implementation */
+ const Implementation &implementation() const;
+ /** Get the gpu workload context of this sketch */
+ const GpuWorkloadContext *gpu_context() const;
+ /** Create a @ref TensorInfo associated with the workload sketch.
+ *
+ * @return TensorInfo Newly created tensor info
+ */
+ template <typename... Args>
+ TensorInfo create_tensor_info(Args &&... args)
+ {
+ auto tensor_info = TensorInfo(std::forward<Args>(args)...);
+ tensor_info.set_id(allocate_new_tensor_id());
+ return tensor_info;
+ }
+ /** Create a @ref TensorInfo associated with the workload sketch by copying from an existing tensor info
+ * @note The newly copied tensor will have a different identity within the workload than the one copied from
+ * To copy the identity of @p tensor_info as well, use @ref TensorInfo 's copy constructors instead
+ *
+ * @param[in] tensor_info @ref ITensorInfo to copy from
+ *
+ * @return TensorInfo Newly created tensor info
+ */
+ TensorInfo create_tensor_info(const ITensorInfo &tensor_info);
+ /** Create a default @ref TensorInfo associated with the workload sketch
+ * It is usually used by a destination tensor whose @ref ITensorInfo is to be inferred automatically
+ *
+ * @return TensorInfo Newly created tensor info
+ */
+ TensorInfo create_tensor_info();
+
+private:
+ ITensorInfo::Id allocate_new_tensor_id();
+ std::unique_ptr<Implementation> _impl; /**< Internal opaque implementation*/
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSKETCH */
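
Illustration (not part of the patch) of the id semantics documented above: create_tensor_info() always assigns a fresh workload id, including when copying from an existing info. Names are illustrative, and `sketch` is assumed to exist.

    TensorInfo a = sketch.create_tensor_info(TensorShape(8U), 1, DataType::F32);
    TensorInfo b = sketch.create_tensor_info(a); // same shape and type as a, but b.id() != a.id()
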
diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h
new file mode 100644
index 0000000000..fe9108d356
--- /dev/null
+++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUCONV2D
+#define ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUCONV2D
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/dynamic_fusion/sketch/OperatorAttributes.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+class GpuWorkloadSketch;
+
+/** Operator interface. */
+class GpuConv2d final
+{
+public:
+ /** Attributes are a set of backend-agnostic parameters that define what an operator does */
+ using Attributes = Conv2dAttributes;
+ /** Create an operator and fuse it into the workload sketch.
+ * @note If @ref validate_op() fails, the creation also fails and may throw an error.
+ * @note If @ref validate_op() fails, @p sketch remains unchanged and valid.
+ *
+ * Valid data type configurations:
+ * |src |wei |bia |dst |
+ * |:--------------|:--------------|:--------------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ *
+ * Valid data layouts:
+ * - NHWC
+ *
+ * @param[in,out] sketch Workload sketch into which the operator will be fused
+ * @param[in] src Source tensor
+ * @param[in] wei Weight tensor
+ * @param[in] bia (Optional) Bias tensor
+ * @param[out] dst Destination tensor
+ * @param[in] attributes Operator attributes
+ */
+ static void create_op(GpuWorkloadSketch &sketch,
+ ITensorInfo *src,
+ ITensorInfo *wei,
+ ITensorInfo *bia,
+ ITensorInfo *dst,
+ const Attributes &attributes);
+ /** Validate the operator and check if it can be fused into the workload sketch.
+ * Similar to @ref GpuConv2d::create_op()
+ */
+ static Status validate_op(const GpuWorkloadSketch &sketch,
+ const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const ITensorInfo *bia,
+ const ITensorInfo *dst,
+ const Attributes &attributes);
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUCONV2D */
diff --git a/examples/SConscript b/examples/SConscript
index 7d9324ca9a..fd6b591891 100644
--- a/examples/SConscript
+++ b/examples/SConscript
@@ -99,15 +99,6 @@ if env['opencl']:
prog = install_bin(prog)
alias = examples_env.Alias(example, prog)
Default(alias)
- if env['experimental_dynamic_fusion']:
- examples_env.Append(CPPDEFINES = ['ENABLE_EXPERIMENTAL_DYNAMIC_FUSION'])
- for file in Glob("./dynamic_fusion/*.cpp"):
- example = os.path.basename(os.path.splitext(str(file))[0])
- prog = examples_env.Program(example, ["./dynamic_fusion/{}.cpp".format(example), utils], LIBS = examples_libs + arm_compute_libs)
- Depends(prog, arm_compute_dependency)
- prog = install_bin(prog)
- alias = examples_env.Alias(example, prog)
- Default(alias)
if env['gemm_tuner'] and env['opencl']:
gemm_tuner_common_options = examples_env.Object("./gemm_tuner/CommonGemmExampleOptions.cpp")
diff --git a/filelist.json b/filelist.json
index 431979ec41..4603932638 100644
--- a/filelist.json
+++ b/filelist.json
@@ -2115,20 +2115,25 @@
},
"experimental": {
"dynamic_fusion": [
- "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp",
- "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp",
- "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseKernelComponent.cpp",
- "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClFloorKernelComponent.cpp",
- "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp",
- "src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.cpp",
+ "src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp",
+ "src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp",
- "src/core/experimental/dynamic_fusion/OperatorGraph.cpp",
- "src/core/experimental/dynamic_fusion/WorkloadImpl/ClWorkload.cpp",
- "src/gpu/cl/operators/experimental/dynamic_fusion/ClCompositeOperator.cpp",
- "src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.cpp",
- "src/core/experimental/dynamic_fusion/WorkloadImpl/DependencyGraph.cpp",
- "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.cpp",
- "src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.cpp"
+ "src/dynamic_fusion/sketch/OperatorAttributes.cpp",
+ "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.cpp",
+ "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp",
+ "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp",
+ "src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp",
+ "src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp",
+ "src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp",
+ "src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp",
+ "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp",
+ "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp",
+ "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp",
+ "src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp",
+ "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp",
+ "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp",
+ "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp",
+ "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp"
]
}
}
diff --git a/src/core/CL/CLCompileContext.cpp b/src/core/CL/CLCompileContext.cpp
index fce8798b48..ea03d59fc2 100644
--- a/src/core/CL/CLCompileContext.cpp
+++ b/src/core/CL/CLCompileContext.cpp
@@ -232,7 +232,7 @@ void CLCompileContext::set_context(cl::Context context)
std::string CLCompileContext::generate_build_options(const StringSet &build_options_set, const std::string &kernel_path) const
{
std::string concat_str;
- bool ext_supported = false;
+ bool ext_supported = false;
std::string ext_buildopts;
#if defined(ARM_COMPUTE_DEBUG_ENABLED)
@@ -399,4 +399,8 @@ int32_t CLCompileContext::get_ddk_version() const
return -1;
}
+GPUTarget CLCompileContext::get_gpu_target() const
+{
+ return _device.target();
+}
} // namespace arm_compute
diff --git a/src/core/CL/ICLKernel.h b/src/core/CL/ICLKernel.h
index d52b105507..224b68af70 100644
--- a/src/core/CL/ICLKernel.h
+++ b/src/core/CL/ICLKernel.h
@@ -44,7 +44,6 @@ namespace experimental
{
namespace dynamic_fusion
{
-struct TensorBinding;
struct ClExecutionDescriptor;
} // namespace dynamic_fusion
} // namespace experimental
diff --git a/src/core/TensorInfo.cpp b/src/core/TensorInfo.cpp
index e441ddb3a2..12f79444c6 100644
--- a/src/core/TensorInfo.cpp
+++ b/src/core/TensorInfo.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -35,7 +35,7 @@ namespace arm_compute
{
TensorInfo::TensorInfo()
: _total_size(0), _offset_first_element_in_bytes(0), _strides_in_bytes(), _num_channels(0), _tensor_shape(), _dims_state(), _data_type(DataType::UNKNOWN), _format(Format::UNKNOWN), _is_resizable{ true },
- _valid_region{ Coordinates(), _tensor_shape }, _padding{ 0 }, _quantization_info(), _data_layout(DataLayout::NCHW), _are_values_constant(true)
+ _valid_region{ Coordinates(), _tensor_shape }, _padding{ 0 }, _quantization_info(), _data_layout(DataLayout::NCHW), _are_values_constant(true), _id(invalid_tensor_id)
{
}
@@ -56,8 +56,28 @@ TensorInfo::TensorInfo(const ITensorInfo &info)
_quantization_info = info.quantization_info();
_data_layout = info.data_layout();
_are_values_constant = info.are_values_constant();
+ _id = invalid_tensor_id; // Tensor Id has to be explicitly set, instead of being copied
}
+TensorInfo::TensorInfo(const TensorInfo &info)
+ : TensorInfo()
+{
+ _total_size = info.total_size();
+ _offset_first_element_in_bytes = info.offset_first_element_in_bytes();
+ _strides_in_bytes = info.strides_in_bytes();
+ _num_channels = info.num_channels();
+ _tensor_shape = info.tensor_shape();
+ _dims_state = info.tensor_dims_state();
+ _data_type = info.data_type();
+ _format = info.format();
+ _is_resizable = info.is_resizable();
+ _valid_region = info.valid_region();
+ _padding = info.padding();
+ _quantization_info = info.quantization_info();
+ _data_layout = info.data_layout();
+ _are_values_constant = info.are_values_constant();
+ _id = invalid_tensor_id; // Tensor Id has to be explicitly set, instead of being copied
+}
TensorInfo::TensorInfo(Format format)
: TensorInfo(TensorShape(), format)
{
diff --git a/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp
new file mode 100644
index 0000000000..93fbdfed63
--- /dev/null
+++ b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ClKernelRuntime.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "src/core/CL/CLUtils.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h"
+#include "src/gpu/cl/ClKernelLibrary.h"
+
+#include "support/Cast.h"
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+using namespace arm_compute::opencl;
+
+void ClKernelRuntime::configure(const ClCompileContext &compile_ctx, const GpuKernelSourceCode &code)
+{
+ // Create kernel from kernel source string
+ opencl::ClKernelLibrary &klib = opencl::ClKernelLibrary::get();
+ _kernel = static_cast<cl::Kernel>(compile_ctx.create_kernel(code.name(),
+ "" /* Program name: Used to as part of a unique string for built kernel cache. Not needed */,
+ code.code(),
+ klib.kernel_path() /* Kernel path: Used in cases of embedded kernels */,
+ code.build_options().options(),
+ false /* Is source binary */));
+
+ // Configure execution window
+ IClKernel::configure_internal(code.window());
+
+ // Set config id for lws tuning
+ _config_id = code.config_id();
+
+ // Set kernel arguments
+ _arguments = code.arguments();
+}
+
+inline void ClKernelRuntime::add_tensor_argument(unsigned int &idx, const GpuKernelArgumentInfo &arg, const ICLTensor *tensor, const Window &arg_slice, std::vector<cl::Image2D> &cl_images)
+{
+ switch(arg.type)
+ {
+ case GpuKernelArgumentInfo::Type::Scalar:
+ {
+ ARM_COMPUTE_ERROR("Unsupported yet");
+ break;
+ }
+
+ case GpuKernelArgumentInfo::Type::Vector:
+ {
+ add_1D_tensor_argument(idx, tensor, arg_slice);
+ break;
+ }
+
+ case GpuKernelArgumentInfo::Type::Image:
+ {
+ add_2D_tensor_argument(idx, tensor, arg_slice);
+ break;
+ }
+ case GpuKernelArgumentInfo::Type::Image_Reinterpret_As_3D:
+ {
+ add_2D_tensor_argument(idx, tensor, arg_slice);
+ const unsigned int total_cross_plane_pad = tensor->info()->padding().top + tensor->info()->padding().bottom;
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(total_cross_plane_pad));
+ break;
+ }
+ case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D:
+ {
+ const TensorShape shape2d(tensor->info()->dimension(0) / 4, tensor->info()->dimension(1) * tensor->info()->dimension(2) * tensor->info()->dimension(3));
+ const size_t image_row_pitch = tensor->info()->strides_in_bytes()[1];
+ cl::Image2D tensor_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), shape2d, tensor->info()->data_type(), image_row_pitch);
+ cl_images.push_back(tensor_image2d);
+ _kernel.setArg(idx++, tensor_image2d);
+ break;
+ }
+
+ case GpuKernelArgumentInfo::Type::Image_3D:
+ {
+ add_2D_tensor_argument(idx, tensor, arg_slice);
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(tensor->info()->strides_in_bytes()[2]));
+ break;
+ }
+ case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D:
+ {
+ const TensorShape shape2d(tensor->info()->dimension(0) / 4, tensor->info()->dimension(1) * tensor->info()->dimension(2) * tensor->info()->dimension(3));
+ const size_t image_row_pitch = tensor->info()->strides_in_bytes()[1];
+ cl::Image2D tensor_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), shape2d, tensor->info()->data_type(), image_row_pitch);
+ cl_images.push_back(tensor_image2d);
+ _kernel.setArg(idx++, tensor_image2d);
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(tensor->info()->strides_in_bytes()[2]));
+ break;
+ }
+
+ case GpuKernelArgumentInfo::Type::Tensor_3D:
+ {
+ add_3D_tensor_argument(idx, tensor, arg_slice);
+ break;
+ }
+
+ case GpuKernelArgumentInfo::Type::Tensor_4D:
+ {
+ add_4D_tensor_argument(idx, tensor, arg_slice);
+ break;
+ }
+ case GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer:
+ {
+ add_4d_tensor_nhwc_argument(idx, tensor);
+ break;
+ }
+ case GpuKernelArgumentInfo::Type::Tensor_4D_t_Image:
+ {
+ const size_t image_w = tensor->info()->dimension(0) / 4;
+ const size_t image_h = tensor->info()->tensor_shape().total_size_upper(1);
+ const size_t image_stride_y = tensor->info()->strides_in_bytes()[1];
+
+ cl::Image2D tensor_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(),
+ TensorShape(image_w, image_h), tensor->info()->data_type(), image_stride_y);
+ cl_images.push_back(tensor_image2d);
+
+ _kernel.setArg(idx++, tensor_image2d);
+ add_4d_tensor_nhwc_argument(idx, tensor);
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Unsupported");
+ }
+ }
+}
+
+void ClKernelRuntime::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_3D();
+ // Don't slice matrix along the z dimension if matrix has just 2 dimensions and matrix A more than 2
+ // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+ Window slice_fixed_z = slice;
+ slice_fixed_z.set(Window::DimX, Window::Dimension(0, 1, 1));
+ slice_fixed_z.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ /// NOTE: Parameters extracted from old kernels. So far they seem to be constant
+ /// but we may need to make them into another configuration passed from GpuWorkloadSourceCode if needed in the future
+ constexpr bool slide_along_dimz = true;
+ constexpr bool skip_sliding_window = false;
+ constexpr bool use_dummy_work_items = false;
+
+ unsigned int idx = 0;
+ do
+ {
+ // Set kernel arguments
+ Window arg_slice = slice;
+ // CLImages created from tensor arguments. Need to be retained until enqueue
+ std::vector<cl::Image2D> cl_images;
+ for(auto id_arg : _arguments)
+ {
+ const auto arg = id_arg.second;
+ auto tensor = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(id_arg.first));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(tensor->info());
+ if(!slide_along_dimz)
+ {
+ // The stride_z for matrix must be zero if we do not slice
+ ARM_COMPUTE_ERROR_ON(tensor->info()->strides_in_bytes()[3] != 0);
+ arg_slice = slice_fixed_z;
+ }
+ add_tensor_argument(idx, *arg.kernel_argument_info(), tensor, arg_slice, cl_images);
+ }
+
+ // Dispatch kernel
+ enqueue(queue, *this, slice, lws_hint(), use_dummy_work_items);
+ }
+ while(skip_sliding_window && window.slide_window_slice_3D(slice));
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h
new file mode 100644
index 0000000000..acc2380031
--- /dev/null
+++ b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_RUNTIME_GPU_CL_CLKERNELRUNTIME
+#define SRC_DYNAMIC_FUSION_RUNTIME_GPU_CL_CLKERNELRUNTIME
+
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+struct GpuKernelSourceCode;
+
+/** OpenCL runtime to run a single kernel */
+class ClKernelRuntime final : public opencl::IClKernel
+{
+public:
+ /** Configure the kernel runtime
+ *
+ * @param[in] compile_ctx OpenCL compile context
+ * @param[in] code Kernel source code
+ */
+ void configure(const opencl::ClCompileContext &compile_ctx, const GpuKernelSourceCode &code);
+ /** Run the kernel
+ *
+ * @param[in,out] tensors @ref ITensorPack object containing run-time tensor memories
+ * @param[in] window Execution window
+ * @param[in] queue OpenCL command queue
+ */
+ virtual void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ /** Set a kernel tensor argument
+ *
+ * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
+ * @param[in] arg Kernel argument descriptor accompanying @p tensor
+ * @param[in] tensor Tensor to set as an argument of the object's kernel
+ * @param[in] arg_slice Window the kernel will be run on
+ * @param[out] cl_images Extra cl images created from the tensor (will need to be retained until the kernel is enqueued)
+ */
+ inline void add_tensor_argument(unsigned int &idx, const GpuKernelArgumentInfo &arg, const ICLTensor *tensor, const Window &arg_slice, std::vector<cl::Image2D> &cl_images);
+
+private:
+ GpuKernelArgumentList _arguments{}; /** All kernel arguments required by the runtime */
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_RUNTIME_GPU_CL_CLKERNELRUNTIME */
diff --git a/src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp b/src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp
new file mode 100644
index 0000000000..549c6d4abb
--- /dev/null
+++ b/src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp
@@ -0,0 +1,351 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h"
+
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h"
+#include "support/Cast.h"
+
+#include <algorithm>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+/** Holder of any auxiliary @ref CLTensor required by a @ref GpuWorkloadSourceCode.
+ *
+ * @note The tensors are not allocated by default, and require the user to explicitly allocate them using the associated @ref TensorInfo and @ref AuxMemoryInfo
+ *
+ * @note This data holder must remain valid until the @ref ClWorkloadRuntime that uses it is out of scope
+ */
+class ClAuxTensors
+{
+public:
+ /** A view of a single piece of auxiliary data and its associated @ref TensorInfo and @ref AuxMemoryInfo
+ */
+ struct DataView
+ {
+ DataView() = default;
+ DataView(CLTensor *tensor, const TensorInfo &tensor_info, const AuxMemoryInfo &memory_info)
+ : tensor{ tensor }, tensor_info{ tensor_info }, memory_info{ memory_info }
+ {
+ }
+ ~DataView() = default;
+ DataView(const DataView &other) = default;
+ DataView &operator=(const DataView &other) = default;
+ DataView(DataView &&other) = default;
+ DataView &operator=(DataView &&other) = default;
+ CLTensor *tensor{}; /**< Pointer to the auxiliary tensor */
+ TensorInfo tensor_info{}; /**< Associated tensor info */
+ AuxMemoryInfo memory_info{}; /**< Memory requirement */
+ };
+
+ /** Get views of all auxiliary tensors. This is mainly used for allocating the auxiliary tensors. */
+ std::vector<DataView> get_tensors()
+ {
+ return _tensors;
+ }
+ std::vector<DataView> get_tensors() const
+ {
+ return _tensors;
+ }
+
+ friend Status create_aux_tensors(ClAuxTensors *aux_tensors, const GpuWorkloadSourceCode &code);
+
+private:
+ /** Add auxiliary tensor.
+ *
+ * @param[in] tensor_info @ref ITensorInfo of the auxiliary tensor
+ * @param[in] memory_info Memory requirements of the auxiliary tensor
+ *
+ * @return CLTensor* Corresponding tensor memory if successfully added, otherwise nullptr
+ */
+ CLTensor *add_aux_tensor(const ITensorInfo &tensor_info, const AuxMemoryInfo &aux_memory_info)
+ {
+ const auto t_id = tensor_info.id();
+ auto find_tensor_pair = _owned_tensors.find(t_id);
+        if(find_tensor_pair != _owned_tensors.end())
+ {
+ return find_tensor_pair->second.get();
+ }
+ else
+ {
+ auto tensor = std::make_unique<CLTensor>();
+ auto inserted_pair = _owned_tensors.emplace(t_id, std::move(tensor)).first;
+ auto new_tensor = inserted_pair->second.get();
+ _tensors.emplace_back(new_tensor, tensor_info, aux_memory_info);
+ return new_tensor;
+ }
+ }
+
+ std::map<ITensorInfo::Id, std::unique_ptr<CLTensor>> _owned_tensors{};
+ std::vector<DataView> _tensors{};
+};
+/** Construct auxiliary tensors required by @ref GpuWorkloadSourceCode
+ *
+ * @note This is the only recommended method for the user to create @ref ClAuxTensors
+ *
+ * @param[out] aux_tensors Auxiliary tensors required by the workload code
+ * @param[in] code @ref GpuWorkloadSourceCode which all tensors bind to
+ *
+ * @return Status
+ */
+Status create_aux_tensors(ClAuxTensors *aux_tensors, const GpuWorkloadSourceCode &code)
+{
+ for(auto t_id : code.tensors())
+ {
+ // Get tensor object
+ const auto workload_arg = code.query_tensor(t_id);
+ ICLTensor *tensor_object = nullptr;
+ if(workload_arg->memory_descriptor()->memory_type == MemoryType::Auxiliary)
+ {
+ // Create aux tensor CLTensor object
+ const TensorInfo tensor_info = *workload_arg->tensor_info();
+ ARM_COMPUTE_ERROR_ON(tensor_info.id() != t_id);
+ const auto aux_memory_info = workload_arg->memory_descriptor()->aux_memory_info;
+ tensor_object = aux_tensors->add_aux_tensor(tensor_info, aux_memory_info);
+ }
+ if(tensor_object == nullptr)
+ {
+ return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Failed to construct an auxiliary tensor");
+ }
+ }
+ return Status{};
+}
+
+/** A fast tensor lookup table for runtime tensor objects retrieval
+ */
+class ClTensorLUT
+{
+public:
+ /** Find a tensor pack associated with the @ref UnitWorkloadId @p uwk_id
+ *
+ * @param[in] uwk_id @ref UnitWorkloadId associated with the tensor pack
+ *
+ * @return ITensorPack*
+ */
+ ITensorPack *find_tensor_pack(UnitWorkloadId uwk_id)
+ {
+ auto tensor_pack = _tensor_packs.find(uwk_id);
+ if(tensor_pack != _tensor_packs.end())
+ {
+ return &(tensor_pack->second);
+ }
+ return nullptr;
+ }
+    /** Get the tensor pack associated with @p uwk_id. Throws an exception if it cannot be found.
+ *
+ * @param[in] uwk_id @ref UnitWorkloadId associated with the tensor pack
+ *
+ * @return ITensorPack*
+ */
+ ITensorPack &get_tensor_pack(UnitWorkloadId uwk_id)
+ {
+ return _tensor_packs.at(uwk_id);
+ }
+
+ friend Status create_tensor_lut(ClTensorLUT *tensor_lut, const GpuWorkloadSourceCode &code, const std::vector<CLTensor *> &user_tensors, const ClAuxTensors &aux_tensors);
+
+private:
+ /** Add a tensor pack and associate it with @ref UnitWorkloadId @p uwk_id
+ *
+ * @param[in] uwk_id @ref UnitWorkloadId associated with the tensor pack
+ * @param[in] tensor_pack Tensor pack to be added
+ */
+ void add_tensor_pack(UnitWorkloadId uwk_id, const ITensorPack &tensor_pack)
+ {
+ _tensor_packs[uwk_id] = tensor_pack;
+ }
+ std::map<UnitWorkloadId, ITensorPack> _tensor_packs{};
+};
+
+/** Create a fast tensor lookup table for runtime tensor retrieval
+ *
+ * @param[out] tensor_lut @ref ClTensorLUT used by the runtime to feed tensor memories to underlying kernels
+ * @param[in] code @ref GpuWorkloadSourceCode which all tensors bind to
+ * @param[in] user_tensors User tensors
+ * @param[in] aux_tensors Auxiliary tensors required by the workload code
+ *
+ * @return Status
+ */
+Status create_tensor_lut(ClTensorLUT *tensor_lut, const GpuWorkloadSourceCode &code, const std::vector<CLTensor *> &user_tensors, const ClAuxTensors &aux_tensors)
+{
+ // Combine user tensors and aux tensors
+ std::map<ITensorInfo::Id, CLTensor *> tensor_map;
+ for(auto tensor : user_tensors)
+ {
+ const auto t_id = tensor->info()->id();
+ if(tensor_map.find(t_id) != tensor_map.end())
+ {
+ return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Clashing tensor ids");
+ }
+ tensor_map[t_id] = tensor;
+ }
+ for(const auto &data : aux_tensors.get_tensors())
+ {
+ const auto t_id = data.tensor_info.id();
+ const auto tensor = data.tensor;
+ if(tensor_map.find(t_id) != tensor_map.end())
+ {
+ return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Clashing tensor ids");
+ }
+ tensor_map[t_id] = tensor;
+ }
+
+ // Add tensor objects into corresponding tensor packs
+ for(auto id_tensor : tensor_map)
+ {
+ const auto t_id = id_tensor.first;
+ const auto tensor_object = id_tensor.second;
+ if(tensor_object == nullptr)
+ {
+ return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Trying to add a nullptr into the tensor packs");
+ }
+ if(tensor_object->allocator()->info().total_size() == 0U)
+ {
+ return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "No allocated memory found in tensor");
+ }
+
+ for(auto uwk_id : code.get_unit_workloads_from_tensor(t_id))
+ {
+ ITensorPack *tensor_pack = tensor_lut->find_tensor_pack(uwk_id);
+ if(tensor_pack == nullptr)
+ {
+ tensor_lut->add_tensor_pack(uwk_id, ITensorPack{ { t_id, tensor_object } });
+ }
+ else
+ {
+ tensor_pack->add_tensor(t_id, tensor_object);
+ }
+ }
+ }
+ return Status{};
+}
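+
+// For example (illustrative names only): a workload with two unit workloads uwk0 (Prepare stage)
+// and uwk1 (Run stage), where an auxiliary tensor t_aux is written by uwk0 and read by uwk1, and
+// user tensors t_src / t_dst are only used by uwk1, would yield a LUT along the lines of:
+//
+//   uwk0 -> ITensorPack{ { t_aux } }
+//   uwk1 -> ITensorPack{ { t_src }, { t_aux }, { t_dst } }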
+
+} // namespace
+
+struct ClWorkloadRuntime::Implementation
+{
+ std::map<UnitWorkloadId, std::unique_ptr<ClKernelRuntime>> _kernels{};
+ std::map<UnitWorkloadId, std::unique_ptr<ClKernelRuntime>> _kernels_prep{};
+ bool _is_configured{ false };
+ bool _is_prepared{ false };
+ ClTensorLUT _tensor_lut{};
+ ClAuxTensors _aux_tensors{};
+ GpuWorkloadSourceCode _source_code{};
+};
+
+ClWorkloadRuntime::ClWorkloadRuntime()
+ : _impl{ std::make_unique<Implementation>() }
+{
+}
+
+ClWorkloadRuntime::~ClWorkloadRuntime() = default;
+
+Status ClWorkloadRuntime::configure(const GpuWorkloadSketch &sketch)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(_impl->_is_configured, "ClWorkloadRuntime cannot be re-configured");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(sketch.gpu_context()->gpu_language() != GpuLanguage::OpenCL, "ClWorkloadRuntime cannot be configured with non-OpenCL workload sketch");
+ // Generate source code
+ _impl->_source_code = sketch.implementation().generate_source_code();
+ // Configure unit workload from source code
+ for(auto uwk_id : _impl->_source_code.unit_workloads())
+ {
+ const auto work = _impl->_source_code.query_unit_workload(uwk_id);
+ const auto stage = work.stage().stage;
+ auto k = std::make_unique<ClKernelRuntime>();
+ k->configure(*sketch.gpu_context()->cl_compile_context(), work.code());
+
+ switch(stage)
+ {
+ case UnitWorkloadStage::Stage::Run:
+ _impl->_kernels.emplace(work.id(), std::move(k));
+ break;
+ case UnitWorkloadStage::Stage::Prepare:
+ _impl->_kernels_prep.emplace(work.id(), std::move(k));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Invalid unit workload stage");
+ }
+ }
+ // Create auxiliary tensor objects
+    const auto st = create_aux_tensors(&_impl->_aux_tensors, _impl->_source_code);
+    ARM_COMPUTE_RETURN_ON_ERROR(st);
+ _impl->_is_configured = true;
+ return Status{};
+}
+
+void ClWorkloadRuntime::prepare()
+{
+ if(!_impl->_is_prepared)
+ {
+ for(auto &id_kernel_pair : _impl->_kernels_prep)
+ {
+ const bool flush_queue = false;
+ const auto uwk_id = id_kernel_pair.first;
+ auto kernel = id_kernel_pair.second.get();
+ CLScheduler::get().enqueue_op(*kernel, _impl->_tensor_lut.get_tensor_pack(uwk_id), flush_queue);
+ }
+
+ _impl->_is_prepared = true;
+ }
+}
+
+Status ClWorkloadRuntime::run(const std::vector<CLTensor *> &tensors)
+{
+ // Need to create the tensor lut in every run, unless the user can guarantee the binding remains fixed,
+ // in which case the lut can be cached during prepare
+ const auto st = create_tensor_lut(&_impl->_tensor_lut, _impl->_source_code, tensors, _impl->_aux_tensors);
+ ARM_COMPUTE_RETURN_ON_ERROR(st);
+ prepare();
+ for(auto &id_kernel_pair : _impl->_kernels)
+ {
+        // Do not flush the command queue here; flushing/synchronization is left to the caller
+ const bool flush_queue = false;
+ const auto uwk_id = id_kernel_pair.first;
+ auto kernel = id_kernel_pair.second.get();
+ CLScheduler::get().enqueue_op(*kernel, _impl->_tensor_lut.get_tensor_pack(uwk_id), flush_queue);
+ }
+ return Status{};
+}
+
+std::vector<std::pair<CLTensor *, AuxMemoryInfo>> ClWorkloadRuntime::get_auxiliary_tensors()
+{
+ std::vector<std::pair<CLTensor *, AuxMemoryInfo>> aux_tensors;
+ for(const auto &data : _impl->_aux_tensors.get_tensors())
+ {
+ aux_tensors.emplace_back(data.tensor, data.memory_info);
+ }
+ return aux_tensors;
+}
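+
+// End-to-end usage sketch of ClWorkloadRuntime (illustrative only; operator creation is elided and
+// the exact sketch/operator signatures are those declared in GpuWorkloadSketch.h and the operator
+// headers such as GpuConv2d.h):
+//
+//   GpuWorkloadContext context{ &cl_compile_ctx };
+//   GpuWorkloadSketch  sketch{ &context };
+//   // ... build the workload by calling the operators' static create_op() methods on the sketch ...
+//
+//   ClWorkloadRuntime runtime{};
+//   runtime.configure(sketch);
+//   for(auto &t : runtime.get_auxiliary_tensors())
+//   {
+//       // t.first is the CLTensor*, t.second its AuxMemoryInfo: allocate each before running
+//   }
+//   runtime.run({ &src_tensor, &dst_tensor }); // user tensors whose ITensorInfo ids match the sketch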
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/ArgumentPack.h b/src/dynamic_fusion/sketch/ArgumentPack.h
new file mode 100644
index 0000000000..f118d7d851
--- /dev/null
+++ b/src/dynamic_fusion/sketch/ArgumentPack.h
@@ -0,0 +1,242 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_ARGUMENTPACK
+#define SRC_DYNAMIC_FUSION_SKETCH_ARGUMENTPACK
+
+#include "arm_compute/core/experimental/Types.h"
+#include <unordered_map>
+#include <vector>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** This is a generic class that packs the arguments of an operator. For now, it is only used for tensor-related types.
+ * Examples of "tensor-related types": @ref ITensorInfo, @ref ITensor, @ref ICLTensor
+ *
+ * The argument id is the position of the argument within the pack, and is represented by @ref TensorType
+ *
+ * @tparam T Tensor-related type
+ */
+template <typename T>
+class ArgumentPack
+{
+public:
+ /** @ref TensorType encodes the position of a tensor argument within the pack */
+ using Id = TensorType;
+ /** A single argument element within the pack
+ * It contains either a const pointer or a non-const pointer to the tensor-related type T, but never both at the same time
+ */
+ struct PackElement
+ {
+ PackElement() = default;
+ PackElement(const PackElement &elem) = default;
+ PackElement &operator=(const PackElement &elem) = default;
+ PackElement(PackElement &&elem) = default;
+ PackElement &operator=(PackElement &&elem) = default;
+ PackElement(Id id, T *tensor)
+ : id(id), tensor(tensor), ctensor(nullptr)
+ {
+ }
+ PackElement(Id id, const T *ctensor)
+ : id(id), tensor(nullptr), ctensor(ctensor)
+ {
+ }
+
+ Id id{ ACL_UNKNOWN }; /**< Argument id within the pack */
+ T *tensor{ nullptr }; /**< Non-const pointer to tensor-related object */
+        const T *ctensor{ nullptr }; /**< Const pointer to tensor-related object */
+ };
+
+public:
+ /** Default constructor */
+ ArgumentPack() = default;
+ /** Destructor */
+ ~ArgumentPack() = default;
+ /** Allow instances of this class to be copy constructed */
+ ArgumentPack<T>(const ArgumentPack<T> &other) = default;
+ /** Allow instances of this class to be copied */
+ ArgumentPack<T> &operator=(const ArgumentPack<T> &other) = default;
+ /** Allow instances of this class to be move constructed */
+ ArgumentPack<T>(ArgumentPack<T> &&other) = default;
+ /** Allow instances of this class to be moved */
+ ArgumentPack<T> &operator=(ArgumentPack<T> &&other) = default;
+    /** Initializer list constructor */
+ ArgumentPack(const std::initializer_list<PackElement> &l)
+ : _pack{}
+ {
+ for(const auto &e : l)
+ {
+ _pack[e.id] = e;
+ }
+ }
+ /** Add tensor to the pack
+ *
+ * @param[in] id ID of the tensor to add
+ * @param[in] tensor Tensor to add
+ */
+ void add_tensor(Id id, T *tensor)
+ {
+ _pack[id] = PackElement(id, tensor);
+ }
+ /** Add const tensor to the pack
+ *
+ * @param[in] id ID of the tensor to add
+ * @param[in] tensor Tensor to add
+ */
+ void add_const_tensor(Id id, const T *tensor)
+ {
+ _pack[id] = PackElement(id, tensor);
+ }
+ /** Get tensor of a given id from the pack
+ *
+ * @param[in] id ID of tensor to extract
+ *
+     * @return Pointer to the tensor if it exists and is non-const, otherwise nullptr
+ */
+ T *get_tensor(Id id)
+ {
+ auto it = _pack.find(id);
+ return it != _pack.end() ? it->second.tensor : nullptr;
+ }
+ /** Get constant tensor of a given id
+ *
+ * @param[in] id ID of tensor to extract
+ *
+     * @return Pointer to the tensor (const or non-const) if it exists, otherwise nullptr
+ */
+ const T *get_const_tensor(Id id) const
+ {
+ auto it = _pack.find(id);
+ if(it != _pack.end())
+ {
+ return it->second.ctensor != nullptr ? it->second.ctensor : it->second.tensor;
+ }
+ return nullptr;
+ }
+ /** Remove the tensor stored with the given id
+ *
+ * @param[in] id ID of tensor to remove
+ */
+ void remove_tensor(Id id)
+ {
+ _pack.erase(id);
+ }
+ /** Pack size accessor
+ *
+ * @return Number of tensors registered to the pack
+ */
+ size_t size() const
+ {
+ return _pack.size();
+ }
+ /** Checks if pack is empty
+ *
+ * @return True if empty else false
+ */
+ bool empty() const
+ {
+ return _pack.empty();
+ }
+ /** Get the ACL_SRC_* tensors
+ *
+ * @return std::vector<T *>
+ */
+ std::vector<T *> get_src_tensors()
+ {
+ std::vector<T *> src_tensors{};
+ for(int id = static_cast<int>(TensorType::ACL_SRC); id <= static_cast<int>(TensorType::ACL_SRC_END); ++id)
+ {
+ auto tensor = get_tensor(static_cast<TensorType>(id));
+ if(tensor != nullptr)
+ {
+ src_tensors.push_back(tensor);
+ }
+ }
+ return src_tensors;
+ }
+ /** Get the const ACL_SRC_* tensors
+ *
+ * @return std::vector<const T *>
+ */
+ std::vector<const T *> get_const_src_tensors() const
+ {
+ std::vector<const T *> src_tensors{};
+ for(int id = static_cast<int>(TensorType::ACL_SRC); id <= static_cast<int>(TensorType::ACL_SRC_END); ++id)
+ {
+ auto tensor = get_const_tensor(static_cast<TensorType>(id));
+ if(tensor != nullptr)
+ {
+ src_tensors.push_back(tensor);
+ }
+ }
+ return src_tensors;
+ }
+ /** Get the ACL_DST_* tensors
+ *
+ * @return std::vector<T *>
+ */
+ std::vector<T *> get_dst_tensors()
+ {
+ std::vector<T *> dst_tensors{};
+ for(int id = static_cast<int>(TensorType::ACL_DST); id <= static_cast<int>(TensorType::ACL_DST_END); ++id)
+ {
+ auto tensor = get_tensor(static_cast<TensorType>(id));
+ if(tensor != nullptr)
+ {
+ dst_tensors.push_back(tensor);
+ }
+ }
+ return dst_tensors;
+ }
+ /** Get the const ACL_DST_* tensors
+ *
+ * @return std::vector<const T *>
+ */
+ std::vector<const T *> get_const_dst_tensors() const
+ {
+ std::vector<const T *> dst_tensors{};
+ for(int id = static_cast<int>(TensorType::ACL_DST); id <= static_cast<int>(TensorType::ACL_DST_END); ++id)
+ {
+ auto tensor = get_const_tensor(static_cast<TensorType>(id));
+ if(tensor != nullptr)
+ {
+ dst_tensors.push_back(tensor);
+ }
+ }
+ return dst_tensors;
+ }
+
+private:
+ std::unordered_map<int, PackElement> _pack{}; /**< Container with the packed tensors */
+};
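+
+// Usage sketch (illustrative; lhs_info/rhs_info/dst_info are placeholder ITensorInfo pointers).
+// TensorType values such as ACL_SRC_0 / ACL_SRC_1 / ACL_DST_0 are the argument ids within the pack:
+//
+//   ArgumentPack<ITensorInfo> pack{};
+//   pack.add_const_tensor(ACL_SRC_0, lhs_info);
+//   pack.add_const_tensor(ACL_SRC_1, rhs_info);
+//   pack.add_tensor(ACL_DST_0, dst_info);
+//   const auto srcs = pack.get_const_src_tensors(); // { lhs_info, rhs_info }
+//   const auto dsts = pack.get_dst_tensors();       // { dst_info }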
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_ARGUMENTPACK */
diff --git a/src/dynamic_fusion/sketch/OperatorAttributes.cpp b/src/dynamic_fusion/sketch/OperatorAttributes.cpp
new file mode 100644
index 0000000000..51ec444587
--- /dev/null
+++ b/src/dynamic_fusion/sketch/OperatorAttributes.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/dynamic_fusion/sketch/OperatorAttributes.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+Conv2dAttributes &Conv2dAttributes::pad(const Padding2D &pad)
+{
+ _pad = pad;
+ return *this;
+}
+Padding2D Conv2dAttributes::pad() const
+{
+ return _pad;
+}
+Conv2dAttributes &Conv2dAttributes::stride(const Size2D &stride)
+{
+ _stride = stride;
+ return *this;
+}
+Size2D Conv2dAttributes::stride() const
+{
+ return _stride;
+}
+Conv2dAttributes &Conv2dAttributes::dilation(const Size2D &dilation)
+{
+ _dilation = dilation;
+ return *this;
+}
+Size2D Conv2dAttributes::dilation() const
+{
+ return _dilation;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/GpuComponentServices.h b/src/dynamic_fusion/sketch/gpu/GpuComponentServices.h
new file mode 100644
index 0000000000..93881508bb
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuComponentServices.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTSERVICES
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTSERVICES
+
+#include "src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Services that are used throughout the creation phase of workload code
+ */
+class GpuComponentServices
+{
+public:
+ /** Default constructor */
+ GpuComponentServices() = default;
+ /** Get reference to component factory */
+ GpuKernelComponentFactory &component_factory()
+ {
+ return _comp_factory;
+ }
+
+private:
+ GpuKernelComponentFactory _comp_factory{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTSERVICES */
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.cpp
new file mode 100644
index 0000000000..9cecfc2ffd
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+bool operator==(const GpuKernelArgumentInfo &info0, const GpuKernelArgumentInfo &info1)
+{
+ return info0.type == info1.type;
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h b/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h
new file mode 100644
index 0000000000..eb36e91d48
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELARGUMENT
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELARGUMENT
+
+#include "arm_compute/core/TensorInfo.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Contains the information required to set up a kernel argument at run time
+ */
+struct GpuKernelArgumentInfo
+{
+    /** Enumerates all the tensor argument variants used by all kernel implementations. */
+ enum class Type : int
+ {
+ Scalar,
+
+ Vector,
+
+ Image,
+ Image_Reinterpret_As_3D,
+ Image_Export_To_ClImage2D,
+
+ Image_3D, // 3D Tensor represented as a 2D Image + stride_z
+ Image_3D_Export_To_ClImage2D,
+
+ Tensor_3D,
+ Tensor_4D,
+ Tensor_4D_t_Buffer,
+ Tensor_4D_t_Image
+ };
+ /** Default constructor */
+ GpuKernelArgumentInfo() = default;
+ /** Constructor */
+ GpuKernelArgumentInfo(Type type)
+ : type{ type }
+ {
+ }
+ Type type{ Type::Tensor_4D_t_Buffer };
+};
+
+bool operator==(const GpuKernelArgumentInfo &info0, const GpuKernelArgumentInfo &info1);
+
+/** Kernel argument information linked with its corresponding @ref ITensorInfo
+ */
+class GpuKernelArgument
+{
+public:
+ /** Constructor
+ *
+ * @param[in] tensor_info Associated @ref ITensorInfo
+ * @param[in] kernel_arg_info Associated @ref GpuKernelArgumentInfo
+ */
+ GpuKernelArgument(const ITensorInfo &tensor_info,
+ const GpuKernelArgumentInfo &kernel_arg_info)
+ : _tensor_info{ tensor_info },
+ _kernel_arg_info{ kernel_arg_info }
+ {
+ }
+ /** Get workload tensor id */
+ ITensorInfo::Id id() const
+ {
+ return _tensor_info.id();
+ }
+ /** Get associated @ref ITensorInfo */
+ ITensorInfo *tensor_info()
+ {
+ return &_tensor_info;
+ }
+ /** Get associated @ref ITensorInfo */
+ const ITensorInfo *tensor_info() const
+ {
+ return &_tensor_info;
+ }
+ /** Get associated @ref GpuKernelArgumentInfo */
+ GpuKernelArgumentInfo *kernel_argument_info()
+ {
+ return &_kernel_arg_info;
+ }
+ /** Get associated @ref GpuKernelArgumentInfo */
+ const GpuKernelArgumentInfo *kernel_argument_info() const
+ {
+ return &_kernel_arg_info;
+ }
+    /** Check if the associated workload tensor has a valid id
+     *
+     * @return true if it has a valid id
+     * @return false otherwise
+ */
+ bool has_valid_id() const
+ {
+ return _tensor_info.has_valid_id();
+ }
+
+private:
+ TensorInfo _tensor_info{};
+ GpuKernelArgumentInfo _kernel_arg_info{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELARGUMENT */
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp
new file mode 100644
index 0000000000..6e6422c957
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "GpuKernelComponentGraph.h"
+
+#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+/** Automatically create memory descriptors for all tensors in the graph
+ *
+ * @param[in] tensors @ref ITensorInfo map
+ * @param[in] graph @ref DependencyGraph of which the @p tensors are a part
+ *
+ * @return MemoryDescriptorMap An assignment map of @ref MemoryDescriptors for each ITensorInfo in the graph
+ */
+MemoryDescriptorMap assign_memory_descriptors(const std::map<ITensorInfo::Id, const ITensorInfo *> tensors, const DependencyGraph &graph)
+{
+ MemoryDescriptorMap mem_map{};
+ for(auto t_id : graph.all_tensors())
+ {
+ const auto &tensor = tensors.at(t_id);
+ // Only global src and dst tensors to the entire component graph are "User" tensors, which are user-specified memories
+ if(is_in(t_id, graph.global_src_tensors()) || is_in(t_id, graph.global_dst_tensors()))
+ {
+ mem_map[t_id] = MemoryDescriptor{ MemoryType::User };
+ }
+ else
+ {
+ AuxMemoryInfo aux_mem_info{ tensor->total_size() };
+ mem_map[t_id] = MemoryDescriptor{ MemoryType::Auxiliary, aux_mem_info };
+ }
+ }
+ return mem_map;
+}
+
+} // namespace
+
+std::vector<DependencyGraph::TensorId> GpuKernelComponentGraph::get_tensor_ids(const std::vector<const ITensorInfo *> tensors)
+{
+ std::vector<DependencyGraph::TensorId> tensor_ids{};
+ std::transform(
+ std::begin(tensors), std::end(tensors),
+ std::back_inserter(tensor_ids),
+ [](const auto & t)
+ {
+ return t->id();
+ });
+ return tensor_ids;
+}
+
+GpuKernelComponentGraph::GpuKernelComponentGraph(GpuComponentServices *services)
+ : _services{ services }, _components{}, _tensors{}, _dependency_graph{}
+{
+}
+
+GpuKernelComponentStream GpuKernelComponentGraph::fuse() const
+{
+ // Obtain memory descriptor map
+ const auto mem_map = assign_memory_descriptors(_tensors, _dependency_graph);
+ /// @note Fusion constraints (for kernel components) are exactly the same as the invariants of @ref GpuKernelComponentGroup
+ /// Fusion can be framed as a mathematical optimization problem:
+ /// Given fusion constraints, find the "best" fusion patterns possible
+ /// "Best" is ill-defined at the moment. For now we define "best" fusion pattern as one
+ /// which results in the least number of fused kernels ( @ref GpuKernelComponentGroup ) at the end
+
+ /// As the first iteration, we offer a sub-optimal algorithm here which ensures all
+ /// constraints are met, but provides no guarantee that the fusion pattern is optimal
+
+ GpuKernelComponentStream stream{ _services, mem_map };
+ // Break down into linear groups of components (constraint 1), preserving topological order
+ const auto linear_graphs = _dependency_graph.topological_partition();
+
+ // Further divide up the linear groups based on rest of the fusion constraints (rely on component group's invariants)
+ for(const auto &graph : linear_graphs)
+ {
+ for(unsigned int i = 0; i < graph.size(); ++i)
+ {
+ const auto comp = _components.at(graph[i].op).get();
+ // Each new linear graph signals a new component group in the stream
+ if(i == 0)
+ {
+ stream.new_component_group();
+ }
+            // If it violates the component group's invariants / fusion constraints, break up the stream by inserting a new group
+ bool success = stream.add_component(comp);
+ if(!success)
+ {
+ stream.new_component_group();
+ success = stream.add_component(comp);
+ ARM_COMPUTE_ERROR_ON(!success);
+ }
+ }
+ }
+ return stream;
+}
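+
+// Illustrative example of the grouping above (component names are hypothetical): a linear graph
+// [ Complex conv component -> Simple elementwise component -> Output store component ] satisfies the
+// group invariants and is fused into a single component group (one kernel), whereas a second Complex
+// component in the sequence would be rejected by add_component() and start a new group, i.e. a
+// separate kernel.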
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h
new file mode 100644
index 0000000000..fbcb2c10ea
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGRAPH
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGRAPH
+
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuComponentServices.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h"
+#include "src/dynamic_fusion/sketch/utils/DependencyGraph.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class IGpuKernelComponent;
+
+/** A multi-input (tensors), multi-output (tensors) acyclic directed graph of gpu kernel components
+ * Its main purposes are:
+ * - Perform "graph-level" optimizations like fusion of kernel components (not the fusion of operators)
+ * - Automatically assign memory descriptors ( @ref MemoryDescriptor ) to all tensors based on the graph topology
+ */
+class GpuKernelComponentGraph
+{
+public:
+ /** Constructor
+ *
+ * @param[in] services @ref GpuComponentServices to be used by the graph
+ */
+ GpuKernelComponentGraph(GpuComponentServices *services);
+ /** Prevent instances of this class from being copy constructed */
+ GpuKernelComponentGraph(const GpuKernelComponentGraph &graph) = delete;
+ /** Prevent instances of this class from being copied */
+ GpuKernelComponentGraph &operator=(const GpuKernelComponentGraph &graph) = delete;
+ /** Allow instances of this class to be move constructed */
+ GpuKernelComponentGraph(GpuKernelComponentGraph &&graph) = default;
+ /** Allow instances of this class to be moved */
+ GpuKernelComponentGraph &operator=(GpuKernelComponentGraph &&graph) = default;
+ /** Create a new component and add it to the component graph
+ * Component id is automatically allocated
+ *
+ * @tparam T Component type
+ * @tparam Args Component argument types
+ *
+ * @param[in] args Component arguments except for component id, which is auto-allocated
+ */
+ template <typename T, typename... Args>
+ void add_new_component(Args &&... args)
+ {
+ auto comp = _services->component_factory().create<T>(std::forward<Args>(args)...);
+ ArgumentPack<ITensorInfo> tensors = comp->tensors();
+ const auto src_tensor_ids = get_tensor_ids(tensors.get_const_src_tensors());
+ const auto dst_tensor_ids = get_tensor_ids(tensors.get_const_dst_tensors());
+ bool success = _dependency_graph.add_operator(comp->id(), src_tensor_ids, dst_tensor_ids);
+ ARM_COMPUTE_ERROR_ON(!success);
+ _components[comp->id()] = std::move(comp);
+ for(auto t : tensors.get_const_src_tensors())
+ {
+ _tensors[t->id()] = t;
+ }
+ for(auto t : tensors.get_const_dst_tensors())
+ {
+ _tensors[t->id()] = t;
+ }
+ }
+ /** Perform component fusion and serialize the graph into a stream of component groups
+ */
+ GpuKernelComponentStream fuse() const;
+
+private:
+ static std::vector<DependencyGraph::TensorId> get_tensor_ids(const std::vector<const ITensorInfo *> tensors);
+ GpuComponentServices *_services;
+ std::map<ComponentId, std::unique_ptr<IGpuKernelComponent>> _components;
+ std::map<ITensorInfo::Id, const ITensorInfo *> _tensors;
+ DependencyGraph _dependency_graph{};
+};
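+
+// Usage sketch (illustrative; the component type and its constructor arguments are placeholders):
+//
+//   GpuComponentServices    services{};
+//   GpuKernelComponentGraph graph{ &services };
+//   graph.add_new_component<SomeKernelComponent>(properties, tensors /* ArgumentPack<ITensorInfo> */, attributes);
+//   GpuKernelComponentStream stream = graph.fuse();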
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGRAPH */
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp
new file mode 100644
index 0000000000..3af4c1429d
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp
@@ -0,0 +1,291 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "GpuKernelComponentGroup.h"
+
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+bool GpuKernelComponentGroup::add_component(ComponentPtr component)
+{
+ // note: Constraint 1 is guaranteed as a precondition
+ // Constraint 2
+ if(component->type() != GpuComponentType::Output && _components.size() >= max_fused_components)
+ {
+ return false;
+ }
+ // Constraint 3.1: Pattern: (Unfusable + Output)
+ if(!_components.empty() && get_root_component()->type() == GpuComponentType::Unfusable && component->type() != GpuComponentType::Output)
+ {
+ return false;
+ }
+ // Constraint 3.2
+ if(!_components.empty() && (component->type() != GpuComponentType::Simple && component->type() != GpuComponentType::Output))
+ {
+ return false;
+ }
+ // Constraint 3.3: Disallow multiple output components
+ if(!_components.empty() && get_last_component()->type() == GpuComponentType::Output && component->type() == GpuComponentType::Output)
+ {
+ return false;
+ }
+ // Constraint 4
+ if(component->type() != GpuComponentType::Unfusable && component->tensors().get_const_dst_tensors().size() != 1U)
+ {
+ return false;
+ }
+ // Constraint 5
+ if(!_components.empty() && !(get_root_component()->properties() == component->properties()))
+ {
+ return false;
+ }
+ // Constraint 7
+ if(!_components.empty())
+ {
+ const auto root_dst_tensors = get_root_component()->tensors().get_const_dst_tensors();
+ ARM_COMPUTE_ERROR_ON(root_dst_tensors.empty());
+ const auto first_dst_tensor = root_dst_tensors[0];
+ const auto dst_tensors = component->tensors().get_const_dst_tensors();
+ for(const auto &t : root_dst_tensors)
+ {
+ if(detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0))
+ {
+ return false;
+ }
+ }
+ for(const auto &t : dst_tensors)
+ {
+ if(detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0))
+ {
+ return false;
+ }
+ }
+ }
+ // Constraint 8
+ if(!_components.empty())
+ {
+ const auto root_dst_tensors = get_root_component()->tensors().get_const_dst_tensors();
+ ARM_COMPUTE_ERROR_ON(root_dst_tensors.empty());
+ const auto first_dst_tensor_layout = root_dst_tensors[0]->data_layout();
+ const auto dst_tensors = component->tensors().get_const_dst_tensors();
+ for(const auto &t : root_dst_tensors)
+ {
+ if(t->data_layout() != first_dst_tensor_layout)
+ {
+ return false;
+ }
+ }
+ for(const auto &t : dst_tensors)
+ {
+ if(t->data_layout() != first_dst_tensor_layout)
+ {
+ return false;
+ }
+ }
+ }
+ // Constraint 9
+ if(component->tensors().get_const_dst_tensors().size() >= max_dst_tensors)
+ {
+ return false;
+ }
+ // Constraint 9 corollary
+ if(component->type() == GpuComponentType::Output && _components.size() >= max_fused_components + max_dst_tensors)
+ {
+ return false;
+ }
+ _components.push_back(component);
+ return true;
+}
+
+std::vector<const ITensorInfo *> GpuKernelComponentGroup::get_src_tensors() const
+{
+ if(_components.empty())
+ {
+ return {};
+ }
+ auto src_tensors = _components[0]->tensors().get_const_src_tensors();
+ auto prev_dst_tensor = _components[0]->tensors().get_const_dst_tensors()[0]; // PRE: Only one dst tensor per component
+ for(unsigned int i = 1; i < _components.size(); ++i)
+ {
+ auto cur_src_tensors = _components[i]->tensors().get_const_src_tensors();
+ for(const auto src_tensor : cur_src_tensors)
+ {
+ if(src_tensor->id() == prev_dst_tensor->id())
+ {
+ continue; // Skip "intermediate" tensors. I.e. tensors that are used to link between two components
+ }
+ src_tensors.push_back(src_tensor);
+ }
+ prev_dst_tensor = _components[i]->tensors().get_const_dst_tensors()[0]; // PRE: Only one dst tensor per component
+ }
+
+ return src_tensors;
+}
+
+std::vector<const ITensorInfo *> GpuKernelComponentGroup::get_dst_tensors() const
+{
+ if(_components.empty())
+ {
+ return {};
+ }
+ const auto dst_tensor_ptrs = _components[_components.size() - 1]->tensors().get_const_dst_tensors();
+ std::vector<const ITensorInfo *> dst_tensors;
+ for(auto tensor_ptr : dst_tensor_ptrs)
+ {
+ dst_tensors.push_back(tensor_ptr);
+ }
+ return dst_tensors;
+}
+
+std::vector<const ITensorInfo *> GpuKernelComponentGroup::get_argument_tensors() const
+{
+ std::vector<const ITensorInfo *> arguments;
+ const auto src_tensors = get_src_tensors();
+ const auto dst_tensors = get_dst_tensors();
+ arguments.reserve(src_tensors.size() + dst_tensors.size());
+ arguments.insert(arguments.end(), src_tensors.begin(), src_tensors.end());
+ arguments.insert(arguments.end(), dst_tensors.begin(), dst_tensors.end());
+ return arguments;
+}
+
+GpuKernelComponentGroup::ComponentPtr GpuKernelComponentGroup::get_root_component() const
+{
+ if(empty())
+ {
+ return nullptr;
+ }
+ return _components[0];
+}
+
+GpuKernelComponentGroup::ComponentPtr GpuKernelComponentGroup::get_last_component() const
+{
+ if(empty())
+ {
+ return nullptr;
+ }
+ return _components[_components.size() - 1];
+}
+
+GpuKernelComponentGroup::ComponentPtr GpuKernelComponentGroup::get_previous_component(ComponentId id) const
+{
+ if(empty())
+ {
+ return nullptr;
+ }
+ // Get the index of the requested component
+ size_t ind = 0;
+ for(const auto c : _components)
+ {
+ if(c->id() == id)
+ {
+ break;
+ }
+ ind++;
+ }
+ if(ind == 0 || ind >= _components.size())
+ {
+ return nullptr;
+ }
+ return _components[ind - 1];
+}
+
+bool GpuKernelComponentGroup::is_intermediate_tensor(const ITensorInfo *tensor) const
+{
+ return is_tensor_in(tensor, get_interm_tensors());
+}
+
+size_t GpuKernelComponentGroup::size() const
+{
+ return _components.size();
+}
+bool GpuKernelComponentGroup::empty() const
+{
+ return _components.empty();
+}
+GpuKernelComponentGroup::ComponentPtr &GpuKernelComponentGroup::operator[](size_t index)
+{
+ return _components[index];
+}
+const GpuKernelComponentGroup::ComponentPtr &GpuKernelComponentGroup::operator[](size_t index) const
+{
+ return _components[index];
+}
+typename std::vector<GpuKernelComponentGroup::ComponentPtr>::iterator GpuKernelComponentGroup::begin()
+{
+ return _components.begin();
+}
+typename std::vector<GpuKernelComponentGroup::ComponentPtr>::iterator GpuKernelComponentGroup::end()
+{
+ return _components.end();
+}
+typename std::vector<GpuKernelComponentGroup::ComponentPtr>::const_iterator GpuKernelComponentGroup::begin() const
+{
+ return _components.cbegin();
+}
+typename std::vector<GpuKernelComponentGroup::ComponentPtr>::const_iterator GpuKernelComponentGroup::end() const
+{
+ return _components.cend();
+}
+typename std::vector<GpuKernelComponentGroup::ComponentPtr>::const_iterator GpuKernelComponentGroup::cbegin() const
+{
+ return _components.cbegin();
+}
+typename std::vector<GpuKernelComponentGroup::ComponentPtr>::const_iterator GpuKernelComponentGroup::cend() const
+{
+ return _components.cend();
+}
+
+std::vector<const ITensorInfo *> GpuKernelComponentGroup::get_interm_tensors() const
+{
+ std::vector<const ITensorInfo *> interm_tensors{};
+ for(unsigned int i = 0; i + 1 < _components.size(); ++i)
+ {
+ auto interm_tensor = _components[i]->tensors().get_const_dst_tensors()[0];
+ interm_tensors.push_back(interm_tensor); // PRE: Only one dst tensor per component
+ }
+
+ return interm_tensors;
+}
+
+bool GpuKernelComponentGroup::is_tensor_in(const ITensorInfo *tensor, const std::vector<const ITensorInfo *> tensors)
+{
+ for(auto t : tensors)
+ {
+ if(tensor->id() == t->id())
+ {
+ return true;
+ }
+ }
+ return false;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h
new file mode 100644
index 0000000000..4c9d940594
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGROUP
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGROUP
+
+#include "components/Types.h"
+
+#include <cstdint>
+#include <cstdlib>
+#include <vector>
+
+namespace arm_compute
+{
+/** Forward declaration */
+class ITensorInfo;
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class IGpuKernelComponent;
+/** A group of gpu kernel components to be fused together
+ * PRECONDITIONS:
+ * 1. Fusion is limited to a linear sequence of kernel components
+ * INVARIANTS:
+ * @note These preconditions and invariants are exactly the same as fusion constraints for kernel components
+ * 2. Max number of components that can be fused is @ref GpuKernelComponentGroup::max_fused_components (
+ * excluding any output or input (if any) components.
+ *                  The max number of output components is bounded by the maximum number of dst tensors allowed for a component / component group
+ * )
+ * 3. The fusion is subject to the pattern: (Complex + Simple * | Simple + Simple * | Un-fusable) + Output?
+ *  4. All components but unfusable ones have exactly 1 dst tensor
+ * 5. All fused components share the same @ref IGpuKernelComponent::Properties ( @ref UnitWorkloadStage etc. )
+ * 6. All fused components share the same tunable parameters like tile size
+ * 7. All fused components share the same dst tensor shape
+ * 8. All fused components' tensors share the same @ref DataLayout
+ *  9. Maximum number of dst tensors allowed for a component (including unfusable) / component group is @ref GpuKernelComponentGroup::max_dst_tensors
+ * This has an impact on the total number of components supported, which = max_fused_components + max_dst_tensors
+ */
+class GpuKernelComponentGroup
+{
+public:
+ using ComponentPtr = IGpuKernelComponent *;
+ /** Maximum number of components that can be fused into the same component group
+ */
+ static constexpr size_t max_fused_components = 64;
+    /** Maximum number of dst tensors allowed for a component / component group
+     */
+ static constexpr size_t max_dst_tensors = 8;
+
+public:
+ /** Default constructor */
+ GpuKernelComponentGroup() = default;
+ /** Allow instances of this class to be copy constructed */
+ GpuKernelComponentGroup(const GpuKernelComponentGroup &) = default;
+ /** Allow instances of this class to be copied */
+ GpuKernelComponentGroup &operator=(const GpuKernelComponentGroup &) = default;
+ /** Allow instances of this class to be move constructed */
+ GpuKernelComponentGroup(GpuKernelComponentGroup &&) = default;
+ /** Allow instances of this class to be moved */
+ GpuKernelComponentGroup &operator=(GpuKernelComponentGroup &&) = default;
+ /** Add a component pointer into the group
+ * If the operation fails, then no change is made to the group
+ *
+ * @param[in] component Pointer to the component to be added
+ *
+ * @return true If the operation is successful
+ * @return false If the operation fails
+ */
+ bool add_component(ComponentPtr component);
+ /** Get source tensors of this group */
+ std::vector<const ITensorInfo *> get_src_tensors() const;
+ /** Get destination tensors of this group */
+ std::vector<const ITensorInfo *> get_dst_tensors() const;
+    /** Get the tensor arguments of this group
+     * A tensor is an argument if it is a source or destination tensor of the group
+     */
+ std::vector<const ITensorInfo *> get_argument_tensors() const;
+ /** Get the root (first) component of this group */
+ ComponentPtr get_root_component() const;
+ /** Get the last component of this group */
+ ComponentPtr get_last_component() const;
+ /** Get the previous component to the component with id @p id
+ *
+ * @param[in] id Component id of the component whose previous component is of concern
+ *
+ * @return ComponentPtr Pointer to the previous component of the one identified by @p id
+ */
+ ComponentPtr get_previous_component(ComponentId id) const;
+ /** Check if a @ref ITensorInfo is an "intermediate" tensor of the group
+ *
+ * An intermediate tensor is any tensor that is not an argument.
+ *
+ * @param[in] tensor @ref ITensorInfo to be looked up
+ *
+ * @return true If @p tensor is an intermediate tensor
+ * @return false Otherwise
+ */
+ bool is_intermediate_tensor(const ITensorInfo *tensor) const;
+ /** Get the number of components within the group */
+ size_t size() const;
+ /** Check if the component group is empty */
+ bool empty() const;
+ ComponentPtr &operator[](size_t index);
+ const ComponentPtr &operator[](size_t index) const;
+ typename std::vector<ComponentPtr>::iterator begin();
+ typename std::vector<ComponentPtr>::iterator end();
+ typename std::vector<ComponentPtr>::const_iterator begin() const;
+ typename std::vector<ComponentPtr>::const_iterator end() const;
+ typename std::vector<ComponentPtr>::const_iterator cbegin() const;
+ typename std::vector<ComponentPtr>::const_iterator cend() const;
+
+private:
+ std::vector<const ITensorInfo *> get_interm_tensors() const;
+
+ static bool is_tensor_in(const ITensorInfo *tensor, const std::vector<const ITensorInfo *> tensors);
+
+ std::vector<ComponentPtr> _components{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGROUP */
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp
new file mode 100644
index 0000000000..aac84b6c59
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "GpuKernelComponentStream.h"
+
+#include "src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h"
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+GpuKernelComponentStream::GpuKernelComponentStream(GpuComponentServices *services, const MemoryDescriptorMap &mem_map)
+ : _services{ services }, _component_groups{}, _mem_map{ mem_map }
+{
+}
+
+GpuWorkloadSourceCode GpuKernelComponentStream::write_workload_code()
+{
+ GpuWorkloadSourceCode source_code;
+ // Traverse through component groups and assemble workload together
+ for(auto && group : _component_groups)
+ {
+ // Write kernel code
+ GpuLogicalKernel logical_kernel(_services, group);
+ const GpuKernelSourceCode kernel_code = logical_kernel.write_kernel_code();
+ // The whole unit workload stage is determined by the root component
+ const auto unit_workload_stage = group.get_root_component()->properties().stage();
+ source_code.add_unit_workload(kernel_code, unit_workload_stage, _mem_map);
+ }
+ return source_code;
+}
+
+void GpuKernelComponentStream::new_component_group()
+{
+ _component_groups.emplace_back();
+}
+
+bool GpuKernelComponentStream::add_component(IGpuKernelComponent *component)
+{
+ ARM_COMPUTE_ERROR_ON(_component_groups.empty());
+ return _component_groups.back().add_component(component);
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h
new file mode 100644
index 0000000000..cbaa7c297b
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTSTREAM
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTSTREAM
+
+#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class GpuComponentServices;
+class IGpuKernelComponent;
+
+/** A linear sequence of component groups serialized from the @ref GpuKernelComponentGraph
+ * Each component group in the stream denotes a complete kernel that may consist of multiple components
+ *
+ * The main purposes of this class are:
+ * - Facilitate component fusion algorithm by allowing insertions of new component groups into the stream
+ * - Invoke kernel writer and assemble the final @ref GpuWorkloadSourceCode
+ */
+class GpuKernelComponentStream
+{
+public:
+ /** Constructor
+ *
+ * @param[in] services @ref GpuComponentServices to be used throughout the stream
+ * @param[in] mem_map @ref MemoryDescriptor map used to assemble the @ref GpuWorkloadSourceCode
+ */
+ GpuKernelComponentStream(GpuComponentServices *services, const MemoryDescriptorMap &mem_map);
+ /** Allow instances of this class to be copy constructed */
+ GpuKernelComponentStream(const GpuKernelComponentStream &stream) = default;
+ /** Allow instances of this class to be copied */
+ GpuKernelComponentStream &operator=(const GpuKernelComponentStream &stream) = default;
+ /** Allow instances of this class to be move constructed */
+ GpuKernelComponentStream(GpuKernelComponentStream &&stream) = default;
+ /** Allow instances of this class to be moved */
+ GpuKernelComponentStream &operator=(GpuKernelComponentStream &&stream) = default;
+ /** Generate and assemble @ref GpuWorkloadSourceCode from the stream */
+ GpuWorkloadSourceCode write_workload_code();
+ /** Insert a new component group in the stream.
+     * Subsequent components are added to this group until the end of the stream or until new_component_group() is called again
+ */
+ void new_component_group();
+ /** Add a component to the previously created component group
+     * Throws an error if no component group is present in the stream
+ *
+ * @param[in] component Component to be inserted
+ *
+ * @return true If the operation is successful
+ * @return false Otherwise
+ */
+ bool add_component(IGpuKernelComponent *component);
+
+private:
+ GpuComponentServices *_services;
+ std::vector<GpuKernelComponentGroup> _component_groups{};
+ MemoryDescriptorMap _mem_map{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTSTREAM */
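
For orientation, a minimal sketch (not part of the patch) of how a caller such as GpuKernelComponentGraph::fuse() might drive the stream once the serialized component order has been decided; the helper name and the grouping policy shown here are illustrative only:

// Illustrative helper, assuming the headers above are included
GpuWorkloadSourceCode serialize_components(GpuComponentServices                     *services,
                                           const MemoryDescriptorMap                &mem_map,
                                           const std::vector<IGpuKernelComponent *> &components)
{
    GpuKernelComponentStream stream{ services, mem_map };
    stream.new_component_group();               // Open the first kernel
    for(IGpuKernelComponent *comp : components)
    {
        if(!stream.add_component(comp))         // Component cannot join the current kernel
        {
            stream.new_component_group();       // Start a new kernel...
            stream.add_component(comp);         // ...and insert the component there
        }
    }
    return stream.write_workload_code();        // Invoke the kernel writers and assemble the workload code
}
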
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h b/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h
new file mode 100644
index 0000000000..7479328d7b
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELSOURCECODE
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELSOURCECODE
+
+#include "arm_compute/core/CL/CLCompileContext.h"
+#include "arm_compute/core/Window.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+
+#include <map>
+#include <string>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** The argument list of a @ref GpuKernelSourceCode */
+using GpuKernelArgumentList = std::map<ITensorInfo::Id, GpuKernelArgument>;
+
+/** Container of kernel code to be compiled and run in a @ref GpuUnitWorkload
+ */
+class GpuKernelSourceCode
+{
+public:
+ /** Set kernel name */
+ GpuKernelSourceCode &name(const std::string &n)
+ {
+ _name = n;
+ return *this;
+ }
+ /** Set kernel code */
+ GpuKernelSourceCode &code(const std::string &c)
+ {
+ _code = c;
+ return *this;
+ }
+ /** Set kernel config id string */
+ GpuKernelSourceCode &config_id(const std::string &c_id)
+ {
+ _config_id = c_id;
+ return *this;
+ }
+ /** Set kernel build options */
+ GpuKernelSourceCode &build_options(const CLBuildOptions &b_options)
+ {
+ _build_options = b_options;
+ return *this;
+ }
+ /** Set kernel execution window */
+ GpuKernelSourceCode &window(const Window &window)
+ {
+ _window = window;
+ return *this;
+ }
+ /** Set kernel argument list */
+ GpuKernelSourceCode &arguments(const GpuKernelArgumentList &arguments)
+ {
+ _arguments = arguments;
+ return *this;
+ }
+ /** Get kernel name */
+ std::string name() const
+ {
+ return _name;
+ }
+ /** Get kernel code */
+ std::string code() const
+ {
+ return _code;
+ }
+ /** Get kernel config id string */
+ std::string config_id() const
+ {
+ return _config_id;
+ }
+ /** Get kernel build options */
+ const CLBuildOptions &build_options() const
+ {
+ return _build_options;
+ }
+ /** Get kernel execution window */
+ const Window &window() const
+ {
+ return _window;
+ }
+ /** Get kernel argument list */
+ const GpuKernelArgumentList &arguments() const
+ {
+ return _arguments;
+ }
+
+private:
+ std::string _name{};
+ std::string _code{};
+ std::string _config_id{};
+ CLBuildOptions _build_options{};
+ Window _window{};
+ GpuKernelArgumentList _arguments{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELSOURCECODE */
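
Since every setter returns a reference to the object, the container can be populated either call by call (as GpuLogicalKernel does below) or in a single fluent chain. A small sketch with placeholder name and config id, not part of the patch:

// Illustrative helper; "example_fused_kernel" and "example_config_id" are placeholders
GpuKernelSourceCode make_kernel_code(const std::string &src, const CLBuildOptions &opts,
                                     const Window &win, const GpuKernelArgumentList &args)
{
    GpuKernelSourceCode code{};
    code.name("example_fused_kernel")
        .code(src)
        .config_id("example_config_id")
        .build_options(opts)
        .window(win)
        .arguments(args);
    return code;
}
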
diff --git a/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp b/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp
new file mode 100644
index 0000000000..7746f8bbf3
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "GpuLogicalKernel.h"
+
+#include "arm_compute/core/experimental/Types.h"
+
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuComponentServices.h"
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+GpuLogicalKernel::GpuLogicalKernel(GpuComponentServices *services, const GpuKernelComponentGroup &components)
+ : _services{ services }, _comp_group{ components }, _store_components{}
+{
+ add_load_store();
+}
+
+GpuKernelSourceCode GpuLogicalKernel::write_kernel_code()
+{
+ GpuKernelSourceCode code;
+ ClTemplateWriter writer{ _comp_group };
+
+ code.name(writer.get_name());
+ code.code(writer.get_code());
+ code.arguments(writer.get_tensors());
+ code.build_options(writer.get_build_options());
+ code.config_id(writer.get_config_id());
+ code.window(writer.get_window());
+
+ return code;
+}
+
+void GpuLogicalKernel::add_load_store()
+{
+ const auto dst_tensors = _comp_group.get_dst_tensors();
+ // Each dst tensor from the component group requires exactly one store component
+ for(const auto &dst_tensor : dst_tensors)
+ {
+ ArgumentPack<ITensorInfo> tensors;
+        // Pass the same destination tensor to both the source and the destination of the store component
+        // In other words, adding a store component does not create a new dst tensor,
+        // so the dst tensor of the component group never differs from that of the logical kernel
+        // This may seem to violate the acyclicity of the component graph, but it is fine because by the time
+        // the logical kernel is constructed we no longer need a graph representation of the components
+        // (the graph has already been serialized)
+ tensors.add_const_tensor(ACL_SRC_0, dst_tensor);
+ tensors.add_const_tensor(ACL_DST_0, dst_tensor);
+
+ auto store = _services->component_factory().create<ClComponentStore>(
+            _comp_group.get_root_component()->properties(), // The store component shares the same properties as the root component
+ tensors);
+ _store_components.push_back(std::move(store));
+ auto success = _comp_group.add_component(_store_components.back().get());
+        ARM_COMPUTE_ERROR_ON(!success); // Load/store insertion is guaranteed to succeed
+ }
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.h b/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.h
new file mode 100644
index 0000000000..4ce4443f60
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPULOGICALKERNEL
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPULOGICALKERNEL
+
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h"
+
+#include <memory>
+#include <vector>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+class GpuComponentServices;
+class IGpuKernelComponent;
+
+/** A wrapper-processor of a @ref GpuKernelComponentGroup
+ * It adds the load (if any) and store components to the component group.
+ * The resulting @ref GpuLogicalKernel represents a complete kernel and can invoke any kernel writer to generate the full kernel code
+ */
+class GpuLogicalKernel
+{
+public:
+ /** Constructor
+ *
+ * @param[in] services @ref GpuComponentServices to be used
+ * @param[in] components Component group from which this logical kernel is initialized
+ */
+ explicit GpuLogicalKernel(GpuComponentServices *services, const GpuKernelComponentGroup &components);
+ /** Allow instances of this class to be copy constructed */
+ GpuLogicalKernel(const GpuLogicalKernel &) = default;
+ /** Allow instances of this class to be copied */
+ GpuLogicalKernel &operator=(const GpuLogicalKernel &) = default;
+ /** Allow instances of this class to be move constructed */
+ GpuLogicalKernel(GpuLogicalKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ GpuLogicalKernel &operator=(GpuLogicalKernel &&) = default;
+ /** Generate a @ref GpuKernelSourceCode */
+ GpuKernelSourceCode write_kernel_code();
+
+private:
+ void add_load_store();
+
+ GpuComponentServices *_services;
+ GpuKernelComponentGroup _comp_group{};
+ std::vector<std::unique_ptr<IGpuKernelComponent>> _store_components{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPULOGICALKERNEL */
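
A minimal usage sketch (not part of the patch), assuming a GpuComponentServices instance and a fused component group are already available:

// Illustrative helper: turn one fused component group into kernel source code
GpuKernelSourceCode write_group_code(GpuComponentServices *services, const GpuKernelComponentGroup &group)
{
    GpuLogicalKernel logical_kernel{ services, group }; // Store components are appended on construction
    return logical_kernel.write_kernel_code();          // Delegates to the CL template writer
}
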
diff --git a/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp
new file mode 100644
index 0000000000..e8ef835405
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h"
+
+#include "arm_compute/core/Validate.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+std::vector<DependencyGraph::TensorId> get_tensor_ids(const std::vector<const ITensorInfo *> tensors)
+{
+ std::vector<DependencyGraph::TensorId> tensor_ids{};
+ std::transform(
+ std::begin(tensors), std::end(tensors),
+ std::back_inserter(tensor_ids),
+ [](const auto & t)
+ {
+ return t->id();
+ });
+ return tensor_ids;
+}
+
+} // namespace
+
+Operator::Operator(OperatorId id, GpuOperatorType operator_type, const ArgumentPack<ITensorInfo> &tensors)
+ : _id{ id }, _operator_type{ operator_type }, _tensors{ tensors }
+{
+}
+
+OperatorId Operator::id() const
+{
+ return _id;
+}
+
+GpuOperatorType Operator::operator_type() const
+{
+ return _operator_type;
+}
+
+ArgumentPack<ITensorInfo> Operator::tensors() const
+{
+ return _tensors;
+}
+
+bool GpuOperatorGroup::try_add_operator(const Operator &op) const
+{
+ const auto src_tensor_ids = get_tensor_ids(op.tensors().get_const_src_tensors());
+ const auto dst_tensor_ids = get_tensor_ids(op.tensors().get_const_dst_tensors());
+ // Constraint 1
+ if(!_graph.try_add_operator_as_linear(op.id(), src_tensor_ids, dst_tensor_ids))
+ {
+ return false;
+ }
+ // Constraint 2
+ if(_operators.size() >= max_fused_operators)
+ {
+ return false;
+ }
+ // Constraint 3.1: Pattern: (Unfusable)
+ if(_operators.size() > 0 && get_root_operator()->operator_type() == GpuOperatorType::Unfusable)
+ {
+ return false;
+ }
+ // Constraint 3.2
+ if(_operators.size() > 0 && (op.operator_type() != GpuOperatorType::Simple))
+ {
+ return false;
+ }
+ // Constraint 4
+ if(op.operator_type() != GpuOperatorType::Unfusable && op.tensors().get_const_dst_tensors().size() != 1U)
+ {
+ return false;
+ }
+ // Constraint 5
+ if(_operators.size() > 0)
+ {
+ const auto root_dst_tensors = get_root_operator()->tensors().get_const_dst_tensors();
+ ARM_COMPUTE_ERROR_ON(root_dst_tensors.empty());
+ const auto first_dst_tensor = root_dst_tensors[0];
+ const auto dst_tensors = op.tensors().get_const_dst_tensors();
+ for(const auto &t : root_dst_tensors)
+ {
+ if(detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0))
+ {
+ return false;
+ }
+ }
+ for(const auto &t : dst_tensors)
+ {
+ if(detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0))
+ {
+ return false;
+ }
+ }
+ }
+ // Constraint 6
+ if(_operators.size() > 0)
+ {
+ const auto root_dst_tensors = get_root_operator()->tensors().get_const_dst_tensors();
+ ARM_COMPUTE_ERROR_ON(root_dst_tensors.empty());
+ const auto first_dst_tensor_layout = root_dst_tensors[0]->data_layout();
+ const auto dst_tensors = op.tensors().get_const_dst_tensors();
+ for(const auto &t : root_dst_tensors)
+ {
+ if(t->data_layout() != first_dst_tensor_layout)
+ {
+ return false;
+ }
+ }
+ for(const auto &t : dst_tensors)
+ {
+ if(t->data_layout() != first_dst_tensor_layout)
+ {
+ return false;
+ }
+ }
+ }
+ return true;
+}
+void GpuOperatorGroup::add_operator(const Operator &op)
+{
+ ARM_COMPUTE_ERROR_ON(!try_add_operator(op));
+ const auto src_tensor_ids = get_tensor_ids(op.tensors().get_const_src_tensors());
+ const auto dst_tensor_ids = get_tensor_ids(op.tensors().get_const_dst_tensors());
+ _graph.add_operator_as_linear(op.id(), src_tensor_ids, dst_tensor_ids);
+ _operators[op.id()] = op;
+}
+Operator GpuOperatorGroup::new_operator(const GpuOperatorType &operator_type, const ArgumentPack<ITensorInfo> &tensors) const
+{
+ auto new_id = static_cast<OperatorId>(_operators.size());
+ return Operator{ new_id, operator_type, tensors };
+}
+const Operator *GpuOperatorGroup::get_root_operator() const
+{
+ const auto roots = _graph.get_root_ops();
+ ARM_COMPUTE_ERROR_ON(roots.size() > 1);
+ if(roots.empty())
+ {
+ return nullptr;
+ }
+ return &_operators.at(roots[0]);
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h
new file mode 100644
index 0000000000..35abe6c543
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUOPERATORGROUP
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUOPERATORGROUP
+
+#include "arm_compute/core/ITensorInfo.h"
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuOperatorProperties.h"
+#include "src/dynamic_fusion/sketch/utils/DependencyGraph.h"
+#include <map>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+using OperatorId = DependencyGraph::OperatorId;
+
+/** An operator for the sole purpose of validating fusion
+ */
+class Operator
+{
+public:
+ /** Default constructor */
+ Operator() = default;
+ /** Get Operator Id */
+ OperatorId id() const;
+ /** Get operator type */
+ GpuOperatorType operator_type() const;
+ /** Get tensor arguments */
+ ArgumentPack<ITensorInfo> tensors() const;
+ friend class GpuOperatorGroup;
+
+private:
+ Operator(OperatorId id, GpuOperatorType operator_type, const ArgumentPack<ITensorInfo> &tensors);
+ OperatorId _id{};
+ GpuOperatorType _operator_type{};
+ ArgumentPack<ITensorInfo> _tensors{};
+};
+
+/** A linear sequence of operators to be fused in a workload
+ * For the time being, this class is only used for validating operator fusion
+ * INVARIANTS:
+ * @note These invariants are exactly the same as operator fusion constraints
+ * 1. Fusion is limited to a linear sequence of operators
+ * 2. The maximum number of operators that can be fused is @ref GpuOperatorGroup::max_fused_operators
+ * 3. Fusion must follow the pattern: Complex + Simple * | Simple + Simple * | Unfusable
+ * 4. All operators except unfusable ones have exactly 1 dst tensor
+ * 5. All fused operators share the same dst tensor shape
+ * 6. All fused operators' tensors share the same @ref DataLayout
+ */
+class GpuOperatorGroup
+{
+public:
+ static constexpr size_t max_fused_operators = 32;
+    /** Check whether an operator can be added to the group, without actually adding it
+ *
+ * @param[in] op Operator to be added
+ *
+ * @return true If @p op can be added while maintaining the invariants
+ * @return false Otherwise
+ */
+ bool try_add_operator(const Operator &op) const;
+ /** Add an operator to the group
+ *
+ * @param[in] op Operator to be added
+ */
+ void add_operator(const Operator &op);
+ /** Create a new operator
+ *
+ * @param[in] operator_type @ref GpuOperatorType of the new operator
+ * @param[in] tensors Tensor arguments to the new operator
+ *
+ * @return Operator
+ */
+ Operator new_operator(const GpuOperatorType &operator_type, const ArgumentPack<ITensorInfo> &tensors) const;
+ /** Get the "root operator" of the group, which is the first operator in a linear sequence
+     * @return const Operator* Pointer to the root operator, or nullptr if the group is empty
+ */
+ const Operator *get_root_operator() const;
+
+private:
+ DependencyGraph _graph{};
+ std::map<OperatorId, Operator> _operators{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUOPERATORGROUP */
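
A sketch of the intended check-then-commit usage when an operator is recorded; in practice this logic is expected to live inside an operator's static validate_op()/create_op() methods, and the helper below is illustrative only:

// Illustrative helper: record an operator only if fusion remains legal
bool record_operator(GpuOperatorGroup &group, GpuOperatorType type, const ArgumentPack<ITensorInfo> &tensors)
{
    const Operator op = group.new_operator(type, tensors);
    if(!group.try_add_operator(op)) // Check invariants 1-6 without mutating the group
    {
        return false;               // The caller would start a new workload / group here
    }
    group.add_operator(op);         // Commit
    return true;
}
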
diff --git a/src/dynamic_fusion/sketch/gpu/GpuOperatorProperties.h b/src/dynamic_fusion/sketch/gpu/GpuOperatorProperties.h
new file mode 100644
index 0000000000..c77697c343
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuOperatorProperties.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUOPERATORPROPERTIES
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUOPERATORPROPERTIES
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Contain properties common to all operator types */
+
+/** Operator type in the context of fusion
+ */
+enum class GpuOperatorType
+{
+ /** Simple operators are operators that:
+     *  1. Have a 1-to-1 mapping between input and output elements (e.g. elementwise operations)
+ * 2. Have exactly 1 output
+ */
+ Simple,
+ /** Complex operators are operators that are not simple but are still fusable with simple ones
+ */
+ Complex,
+ /** Unfusable operators are operators that cannot be fused with any other types of operators
+ */
+ Unfusable
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUOPERATORPROPERTIES */
diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp b/src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp
new file mode 100644
index 0000000000..623bf351f8
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h"
+#include "arm_compute/core/CL/CLCompileContext.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+GpuWorkloadContext::GpuWorkloadContext(CLCompileContext *cl_compile_ctx)
+ : _gpu_language{ GpuLanguage::OpenCL }, _cl_compile_ctx{ cl_compile_ctx }
+{
+}
+
+GpuTarget GpuWorkloadContext::gpu_target() const
+{
+ return _cl_compile_ctx->get_gpu_target();
+}
+
+GpuLanguage GpuWorkloadContext::gpu_language() const
+{
+ return _gpu_language;
+}
+
+const CLCompileContext *GpuWorkloadContext::cl_compile_context() const
+{
+ return _cl_compile_ctx;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp
new file mode 100644
index 0000000000..ce7cf1e908
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+GpuWorkloadSketch::GpuWorkloadSketch(Context *context)
+ : _impl{ std::make_unique<Implementation>(context) }
+{
+}
+GpuWorkloadSketch::~GpuWorkloadSketch()
+{
+}
+
+const GpuWorkloadSketch::Context *GpuWorkloadSketch::gpu_context() const
+{
+ return _impl->context();
+}
+
+TensorInfo GpuWorkloadSketch::create_tensor_info(const ITensorInfo &tensor_info)
+{
+ TensorInfo tensor{ tensor_info };
+ tensor.set_id(allocate_new_tensor_id());
+ return tensor;
+}
+
+TensorInfo GpuWorkloadSketch::create_tensor_info()
+{
+ TensorInfo tensor{};
+ tensor.set_id(allocate_new_tensor_id());
+ return tensor;
+}
+
+ITensorInfo::Id GpuWorkloadSketch::allocate_new_tensor_id()
+{
+ return _impl->allocate_new_tensor_id();
+}
+
+GpuWorkloadSketch::Implementation &GpuWorkloadSketch::implementation()
+{
+ return *_impl;
+}
+const GpuWorkloadSketch::Implementation &GpuWorkloadSketch::implementation() const
+{
+ return *_impl;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
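
A sketch of the front-end flow (not part of the patch), assuming GpuWorkloadSketch::Context aliases GpuWorkloadContext and an OpenCL compile context is available; the commented operator call only indicates the static create_op() interface described in the commit message and its exact signature may differ:

// Illustrative helper, assuming cl_compile_ctx points to a valid CLCompileContext
void build_sketch(CLCompileContext *cl_compile_ctx)
{
    GpuWorkloadContext context{ cl_compile_ctx };
    GpuWorkloadSketch  sketch{ &context };

    // Tensor infos created through the sketch are assigned unique, valid ids
    TensorInfo src = sketch.create_tensor_info(TensorInfo(TensorShape(32U, 16U), 1, DataType::F32));
    TensorInfo dst = sketch.create_tensor_info();

    // Operators are then recorded on the sketch via their static interfaces, e.g.
    // GpuConv2d::create_op(sketch, &src, &wei, &bia, &dst, attributes); // indicative only
    ARM_COMPUTE_UNUSED(src, dst);
}
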
diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h
new file mode 100644
index 0000000000..3997395c98
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSKETCHIMPL
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSKETCHIMPL
+
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuComponentServices.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Internal implementation of @ref GpuWorkloadSketch */
+class GpuWorkloadSketch::Implementation
+{
+public:
+ /** Constructor
+ *
+ * @param[in] context global workload creation context
+ */
+ explicit Implementation(
+ Context *context)
+ : _context{ context },
+ _comp_services{},
+ _component_graph{ &_comp_services },
+ _operator_group{}
+ {
+ }
+ /** Prevent instances of this class from being copy constructed */
+ Implementation(const Implementation &impl) = delete;
+ /** Prevent instances of this class from being copied */
+ Implementation &operator=(const Implementation &impl) = delete;
+ /** Allow instances of this class to be move constructed */
+ Implementation(Implementation &&impl) = default;
+ /** Allow instances of this class to be moved */
+ Implementation &operator=(Implementation &&impl) = default;
+ /** Get workload context */
+ const Context *context() const
+ {
+ return _context;
+ }
+ /** Get component graph */
+ const GpuKernelComponentGraph &component_graph() const
+ {
+ return _component_graph;
+ }
+ /** Get component graph */
+ GpuKernelComponentGraph &component_graph()
+ {
+ return _component_graph;
+ }
+ /** Get operator group */
+ const GpuOperatorGroup &operator_group() const
+ {
+ return _operator_group;
+ }
+ /** Get operator group */
+ GpuOperatorGroup &operator_group()
+ {
+ return _operator_group;
+ }
+ ITensorInfo::Id allocate_new_tensor_id()
+ {
+ return ++_next_id;
+ }
+ /** Generate @ref GpuWorkloadSourceCode from the workload sketch
+     * @note The sketch must be valid. Any error encountered while building the code is thrown.
+ *
+ * @return GpuWorkloadSourceCode The generated workload code
+ */
+ GpuWorkloadSourceCode generate_source_code() const
+ {
+ return component_graph().fuse().write_workload_code();
+ }
+
+private:
+ Context *_context;
+ GpuComponentServices _comp_services;
+ GpuKernelComponentGraph _component_graph;
+ GpuOperatorGroup _operator_group;
+ ITensorInfo::Id _next_id{ ITensorInfo::invalid_tensor_id };
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSKETCHIMPL */
diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h
new file mode 100644
index 0000000000..2375f5c6c6
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h
@@ -0,0 +1,252 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSOURCECODE
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSOURCECODE
+
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Uniquely identifies a @ref GpuUnitWorkload within a @ref GpuWorkloadSourceCode */
+using UnitWorkloadId = int32_t;
+
+/** Describes all the information about a workload argument, so that it can:
+ * - be used by the runtime to configure the gpu kernel arguments
+ * - be used by the memory managers to allocate the required memory
+ */
+class GpuWorkloadArgument
+{
+public:
+ /** Default constructor */
+ GpuWorkloadArgument() = default;
+ /** Constructor
+ *
+ * @param[in] tensor_info @ref ITensorInfo of the workload argument
+ * @param[in] mem_desc @ref MemoryDescriptor of the workload argument
+ * @param[in] kernel_arg_info @ref GpuKernelArgumentInfo of the workload argument
+ */
+ GpuWorkloadArgument(const ITensorInfo &tensor_info,
+ const MemoryDescriptor &mem_desc,
+ const GpuKernelArgumentInfo &kernel_arg_info)
+ : _tensor_info{ tensor_info },
+ _mem_desc{ mem_desc },
+ _kernel_arg_info{ kernel_arg_info }
+ {
+ }
+ /** Get tensor id within workload */
+ ITensorInfo::Id id() const
+ {
+ return _tensor_info.id();
+ }
+ /** Get @ref ITensorInfo of the argument */
+ ITensorInfo *tensor_info()
+ {
+ return &_tensor_info;
+ }
+ /** Get @ref ITensorInfo of the argument */
+ const ITensorInfo *tensor_info() const
+ {
+ return &_tensor_info;
+ }
+ /** Get @ref MemoryDescriptor of the argument */
+ MemoryDescriptor *memory_descriptor()
+ {
+ return &_mem_desc;
+ }
+ /** Get @ref MemoryDescriptor of the argument */
+ const MemoryDescriptor *memory_descriptor() const
+ {
+ return &_mem_desc;
+ }
+ /** Get @ref GpuKernelArgumentInfo of the argument */
+ GpuKernelArgumentInfo *kernel_argument_info()
+ {
+ return &_kernel_arg_info;
+ }
+ /** Get @ref GpuKernelArgumentInfo of the argument */
+ const GpuKernelArgumentInfo *kernel_argument_info() const
+ {
+ return &_kernel_arg_info;
+ }
+ /** Check if the workload argument has valid id
+ *
+ * @return true If has valid id
+ * @return false Otherwise
+ */
+ bool has_valid_id() const
+ {
+ return _tensor_info.has_valid_id();
+ }
+
+private:
+ TensorInfo _tensor_info{};
+ MemoryDescriptor _mem_desc{};
+ GpuKernelArgumentInfo _kernel_arg_info{};
+};
+
+/** Describes when a unit workload is run.
+ */
+struct UnitWorkloadStage
+{
+ enum class Stage
+ {
+ Prepare, /**< Only run once at the beginning. */
+ Run, /**< Run every time after the first time. */
+ };
+ Stage stage{ Stage::Run };
+};
+
+inline bool operator==(const UnitWorkloadStage &stage0, const UnitWorkloadStage &stage1)
+{
+ return stage0.stage == stage1.stage;
+}
+
+/** The atomic unit in a Gpu workload. It contains exactly one kernel to run.
+ */
+class GpuUnitWorkload
+{
+public:
+ /** Default constructor */
+ GpuUnitWorkload() = default;
+ /** Constructor
+ *
+ * @param[in] id Id that uniquely identifies this unit workload in a workload
+ * @param[in] kernel_code @ref GpuKernelSourceCode contained within
+ * @param[in] stage Stage of the unit workload
+ */
+ GpuUnitWorkload(UnitWorkloadId id, const GpuKernelSourceCode &kernel_code, const UnitWorkloadStage &stage)
+ : _id{ id }, _kernel_code{ kernel_code }, _stage{ stage }
+ {
+ }
+ /** Get the id of the unit workload */
+ UnitWorkloadId id() const
+ {
+ return _id;
+ }
+ /** Get reference to the underlying @ref GpuKernelSourceCode */
+ const GpuKernelSourceCode &code() const
+ {
+ return _kernel_code;
+ }
+ /** Get the stage of the unit workload */
+ UnitWorkloadStage stage() const
+ {
+ return _stage;
+ }
+
+private:
+ UnitWorkloadId _id{};
+ GpuKernelSourceCode _kernel_code{};
+ UnitWorkloadStage _stage{};
+};
+
+/** Hold the generated kernel source code and other information required to compile and run the workload.
+ */
+class GpuWorkloadSourceCode
+{
+public:
+ /** Default constructor */
+ GpuWorkloadSourceCode() = default;
+ /** Add a unit workload to the workload code
+ *
+ * @param[in] kernel_code @ref GpuKernelSourceCode to be contained within the unit workload
+ * @param[in] stage Stage of the unit workload
+ * @param[in] mem_map @ref MemoryDescriptor map for all tensors within the unit workload
+ *
+ * @return UnitWorkloadId Allocated unit workload id
+ */
+ UnitWorkloadId add_unit_workload(const GpuKernelSourceCode &kernel_code, const UnitWorkloadStage &stage, const MemoryDescriptorMap &mem_map)
+ {
+        // Use the current number of unit workloads as the Id
+ const auto uwk_id = static_cast<UnitWorkloadId>(_unit_workloads.size());
+ const auto unit_work = GpuUnitWorkload(uwk_id, kernel_code, stage);
+ _unit_workloads.push_back(unit_work);
+ // Assemble kernel argument with memory descriptor to form workload argument
+ for(const auto &id_arg : kernel_code.arguments())
+ {
+ const auto arg_id = id_arg.first;
+ const auto arg = id_arg.second;
+ _workload_arguments[arg_id] = GpuWorkloadArgument{ *arg.tensor_info(), mem_map.at(arg_id), *arg.kernel_argument_info() };
+ if(_tensor_uwork_map.find(arg_id) == _tensor_uwork_map.end())
+ {
+ _tensor_uwork_map[arg_id] = std::set<UnitWorkloadId>();
+ }
+ _tensor_uwork_map[arg_id].insert(uwk_id);
+ }
+ return uwk_id;
+ }
+ /** Get a unit workload from its id */
+ const GpuUnitWorkload &query_unit_workload(UnitWorkloadId id) const
+ {
+ ARM_COMPUTE_ERROR_ON(id < 0);
+ return _unit_workloads.at(id);
+ }
+ /** Get all unit workloads sorted in topological order */
+ std::vector<UnitWorkloadId> unit_workloads() const
+ {
+ std::vector<UnitWorkloadId> ids{};
+
+ for(const auto &uwk : _unit_workloads)
+ {
+ ids.push_back(uwk.id());
+ }
+ return ids;
+ }
+ /** Get a @ref GpuWorkloadArgument from its associated tensor id */
+ const GpuWorkloadArgument *query_tensor(ITensorInfo::Id t_id) const
+ {
+ return &_workload_arguments.at(t_id);
+ }
+ /** Get all tensors in the entire workload */
+ std::vector<ITensorInfo::Id> tensors() const
+ {
+ std::vector<ITensorInfo::Id> ids{};
+ for(const auto &id_tensor : _workload_arguments)
+ {
+ ids.push_back(id_tensor.first);
+ }
+ return ids;
+ }
+ /** Get all unit workloads connected to the tensor with @p t_id */
+ std::vector<UnitWorkloadId> get_unit_workloads_from_tensor(ITensorInfo::Id t_id) const
+ {
+ const auto unit_work_set = _tensor_uwork_map.at(t_id);
+ return std::vector<UnitWorkloadId>(unit_work_set.begin(), unit_work_set.end());
+ }
+
+private:
+ std::vector<GpuUnitWorkload> _unit_workloads{};
+ std::map<ITensorInfo::Id, GpuWorkloadArgument> _workload_arguments{};
+ std::map<ITensorInfo::Id, std::set<UnitWorkloadId>> _tensor_uwork_map{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSOURCECODE */
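
A sketch of how a runtime consumes the generated source code; this mirrors what ClWorkloadRuntime is expected to do when compiling kernels and planning memory, but the helper below is illustrative and not part of the patch:

// Illustrative helper: walk the workload code produced from a sketch
void configure_runtime(const GpuWorkloadSourceCode &code)
{
    for(UnitWorkloadId uwk_id : code.unit_workloads())
    {
        const GpuKernelSourceCode &kernel = code.query_unit_workload(uwk_id).code();
        // Compile kernel.code() with kernel.build_options(), then enqueue it over kernel.window()
        ARM_COMPUTE_UNUSED(kernel);
    }
    for(ITensorInfo::Id t_id : code.tensors())
    {
        const GpuWorkloadArgument *arg = code.query_tensor(t_id);
        // arg->memory_descriptor() and arg->kernel_argument_info() drive memory planning and argument setup
        ARM_COMPUTE_UNUSED(arg);
    }
}
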
diff --git a/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h b/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h
new file mode 100644
index 0000000000..ae67790b4b
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_IGPUKERNELWRITER
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_IGPUKERNELWRITER
+
+#include "arm_compute/core/CL/CLCompileContext.h"
+#include "arm_compute/core/Window.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+
+#include <map>
+#include <string>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** An interface that can write a gpu kernel
+ */
+class IGpuKernelWriter
+{
+public:
+ /** Destructor */
+ virtual ~IGpuKernelWriter()
+ {
+ }
+ /** Generate kernel name */
+ virtual std::string get_name() = 0;
+ /** Generate kernel code */
+ virtual std::string get_code() = 0;
+ /** Generate build options */
+ virtual CLBuildOptions get_build_options() = 0;
+ /** Generate config id string of the entire kernel. This is used for tuning */
+ virtual std::string get_config_id() = 0;
+ /** Generate execution window */
+ virtual Window get_window() const = 0;
+    /** Get the kernel argument list of the kernel */
+ virtual std::map<ITensorInfo::Id, GpuKernelArgument> get_tensors() = 0;
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_IGPUKERNELWRITER */
diff --git a/src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h b/src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h
new file mode 100644
index 0000000000..f7f0029618
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_GPUKERNELCOMPONENTFACTORY
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_GPUKERNELCOMPONENTFACTORY
+
+#include "Types.h"
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+#include <memory>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Factory class that creates new instances of @ref IGpuKernelComponent by assigning new component ids
+ */
+class GpuKernelComponentFactory
+{
+public:
+ /** Create a new kernel component
+ *
+ * @tparam T Any polymorphic type descending from @ref IGpuKernelComponent
+ * @tparam Args Argument types to construct the kernel component
+ *
+ * @param[in] args Arguments to construct the kernel component
+ *
+ * @return std::unique_ptr<IGpuKernelComponent>
+ */
+ template <typename T, typename... Args>
+ std::unique_ptr<IGpuKernelComponent> create(Args &&... args)
+ {
+ return std::make_unique<T>(_count++, std::forward<Args>(args)...);
+ }
+
+private:
+ ComponentId _count{ 0 };
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_GPUKERNELCOMPONENTFACTORY */
diff --git a/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h b/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h
new file mode 100644
index 0000000000..8bb19155a2
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_IGPUKERNELCOMPONENT
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_IGPUKERNELCOMPONENT
+
+#include "Types.h"
+
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Properties common to all kernel component types */
+class KernelProperties
+{
+public:
+ KernelProperties &stage(const UnitWorkloadStage &stage)
+ {
+ _stage = stage;
+ return *this;
+ }
+ UnitWorkloadStage stage() const
+ {
+ return _stage;
+ }
+
+private:
+ UnitWorkloadStage _stage{};
+};
+
+inline bool operator==(const KernelProperties &config0, const KernelProperties &config1)
+{
+ return config0.stage() == config1.stage();
+}
+
+/** Forward declaration */
+class IGpuTemplateComponentWriter;
+
+/** An abstract interface for a kernel component. It allows the component graph to manipulate components for purposes such as fusion
+ */
+class IGpuKernelComponent
+{
+public:
+ using Properties = KernelProperties;
+
+public:
+ /** Constructor
+ *
+ * @param[in] id Component id
+ * @param[in] properties Kernel component properties
+     * @param[in] tensors    Tensor arguments to the component
+ */
+ IGpuKernelComponent(
+ ComponentId id,
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors)
+ : _id{ id },
+ _properties{ properties },
+ _tensors{ tensors }
+ {
+ }
+ /** Destructor */
+ virtual ~IGpuKernelComponent()
+ {
+ }
+ /** Get component id */
+ ComponentId id() const
+ {
+ return _id;
+ }
+ /** Get tensor arguments */
+ ArgumentPack<ITensorInfo> tensors() const
+ {
+ return _tensors;
+ }
+ /** Get properties */
+ Properties properties() const
+ {
+ return _properties;
+ }
+ /** Get template writer for the component */
+ virtual const IGpuTemplateComponentWriter *template_writer() const = 0;
+ /** Get component type */
+ virtual GpuComponentType type() const = 0;
+
+private:
+ ComponentId _id{ -1 };
+ Properties _properties{};
+ ArgumentPack<ITensorInfo> _tensors{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_IGPUKERNELCOMPONENT */
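
A purely hypothetical component is sketched below to show the required overrides; concrete components such as ClComponentDirectConv2d (further down in this patch) additionally own the template writer they return:

// Hypothetical example component, for illustration only
class ClComponentExample final : public IGpuKernelComponent
{
public:
    using IGpuKernelComponent::IGpuKernelComponent; // Reuse the (id, properties, tensors) constructor
    GpuComponentType type() const override
    {
        return GpuComponentType::Simple;
    }
    const IGpuTemplateComponentWriter *template_writer() const override
    {
        return nullptr; // A real component returns its concrete writer here
    }
};
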
diff --git a/src/dynamic_fusion/sketch/gpu/components/Types.h b/src/dynamic_fusion/sketch/gpu/components/Types.h
new file mode 100644
index 0000000000..54b3a69057
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/Types.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_TYPES
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_TYPES
+
+#include <cstdint>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Uniquely identifies a kernel component within a workload
+ */
+using ComponentId = int32_t;
+
+/** Component type in the context of fusion
+ * Its main purpose is to inform the optimizer how to perform fusion.
+ */
+enum class GpuComponentType
+{
+ Complex,
+ Simple,
+ Unfusable,
+ Output
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_TYPES */
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp
new file mode 100644
index 0000000000..e94cfd1581
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ClComponentDirectConv2d.h"
+
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/dynamic_fusion/sketch/OperatorAttributes.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+using Settings = ClComponentDirectConv2dSettings;
+
+Settings &Settings::export_to_cl_image(bool cl_image)
+{
+ _export_to_cl_image = cl_image;
+ return *this;
+}
+
+bool Settings::export_to_cl_image() const
+{
+ return _export_to_cl_image;
+}
+
+Settings &Settings::fast_relaxed_math(bool fast_relaxed_math)
+{
+ _fast_relaxed_math = fast_relaxed_math;
+ return *this;
+}
+
+bool Settings::fast_relaxed_math() const
+{
+ return _fast_relaxed_math;
+}
+
+Status ClComponentDirectConv2d::validate(
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings)
+{
+ ARM_COMPUTE_UNUSED(properties, settings);
+ const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ const auto wei = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ const auto bia = tensors.get_const_tensor(TensorType::ACL_SRC_2);
+ const auto dst = tensors.get_const_tensor(TensorType::ACL_DST_0);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, wei, dst);
+
+ // 1. Check validity
+ // Matching data type
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, wei);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ if(bia != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bia);
+ }
+
+ // Matching data layout
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, wei);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
+ if(bia != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, bia);
+ }
+
+ // All tensor infos are initialized
+ ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(wei->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
+ if(bia != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(bia->tensor_shape().total_size() == 0);
+ }
+ // Device requirements are met
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+ // wei shape is correct
+ const DataLayout data_layout = src->data_layout();
+ const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(wei->dimension(channel_idx) != src->dimension(channel_idx), "Weights feature map dimension should match the respective src's one");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(wei->num_dimensions() > 4, "Weights can be at most 4 dimensional");
+
+ // dst shape is correct
+ PadStrideInfo legacy_pad_stride(attributes.stride().x(), attributes.stride().y(), attributes.pad().left, attributes.pad().right, attributes.pad().top,
+ attributes.pad().bottom, DimensionRoundingType{});
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(),
+ misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, legacy_pad_stride));
+
+ // bia shape is correct
+ if(bia != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(bia->dimension(0) != wei->dimension(3),
+ "Biases size and number of dst feature maps should match");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(bia->num_dimensions() > 1,
+ "Biases should be one dimensional");
+ }
+
+ // 2. Check support level
+ // Data type
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+ // Data layout
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC);
+
+ return Status{};
+}
+
+ClComponentDirectConv2d::ClComponentDirectConv2d(
+ ComponentId id,
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings)
+ : IGpuKernelComponent{ id, properties, tensors },
+ _component_writer{ std::make_unique<ClTemplateDirectConv2d>(id, tensors, attributes, settings) }
+{
+}
+ClComponentDirectConv2d::~ClComponentDirectConv2d()
+{
+}
+const IGpuTemplateComponentWriter *ClComponentDirectConv2d::template_writer() const
+{
+ return _component_writer.get();
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h
new file mode 100644
index 0000000000..fec22b84a5
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDIRECTCONV2D
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDIRECTCONV2D
+
+#include "arm_compute/core/Error.h"
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+#include <memory>
+
+namespace arm_compute
+{
+/** Forward declaration */
+class ITensorInfo;
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+template <typename T>
+class ArgumentPack;
+class Conv2dAttributes;
+
+/** Component specific settings
+ */
+class ClComponentDirectConv2dSettings
+{
+public:
+ /** Set export_to_cl_image flag */
+ ClComponentDirectConv2dSettings &export_to_cl_image(bool cl_image);
+ /** Get export_to_cl_image flag */
+ bool export_to_cl_image() const;
+
+ /** Set fast_relaxed_math flag */
+ ClComponentDirectConv2dSettings &fast_relaxed_math(bool fast_relaxed_math);
+ /** Get fast_relaxed_math flag */
+ bool fast_relaxed_math() const;
+
+private:
+ bool _export_to_cl_image{ false };
+ bool _fast_relaxed_math{ true };
+};
+
+/** Forward declaration */
+class ClTemplateDirectConv2d;
+
+class ClComponentDirectConv2d final : public IGpuKernelComponent
+{
+public:
+ /** Attributes are a set of backend-agnostic parameters that define what a component does */
+ using Attributes = Conv2dAttributes;
+ /** Settings are a set of backend-specific parameters that influence the implementation of a component */
+ using Settings = ClComponentDirectConv2dSettings;
+
+public:
+ /** Validate the component
+ *
+ * @param[in] properties Component properties
+ * @param[in,out] tensors Tensor arguments to the component
+ * @param[in] attributes Component attributes
+ * @param[in] settings Component settings
+ *
+ * @return Status Validation results
+ *
+ * Tensor argument names:
+ * - ACL_SRC_0: Input
+ * - ACL_SRC_1: Weight
+ * - ACL_SRC_2: Bias (Optional)
+ * - ACL_DST_0: Output
+ *
+ * Tensor argument constness:
+ * - ACL_SRC_0: Const
+ * - ACL_SRC_1: Const
+ * - ACL_SRC_2: Const
+ * - ACL_DST_0: Const
+ *
+ * Valid data layouts:
+ * - NHWC
+ *
+ * Valid data type configurations:
+ * |ACL_SRC_0 |ACL_SRC_1 |ACL_SRC_2 |ACL_DST_0 |
+ * |:--------------|:--------------|:--------------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ */
+ static Status validate(
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings);
+
+ /** Constructor
+ *
+ * Similar to @ref ClComponentDirectConv2d::validate()
+ */
+ ClComponentDirectConv2d(
+ ComponentId id,
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings);
+
+ /** Destructor */
+ ~ClComponentDirectConv2d() override;
+ /** Prevent instances of this class from being copy constructed */
+ ClComponentDirectConv2d(const ClComponentDirectConv2d &component) = delete;
+ /** Prevent instances of this class from being copied */
+ ClComponentDirectConv2d &operator=(const ClComponentDirectConv2d &component) = delete;
+ /** Allow instances of this class to be move constructed */
+ ClComponentDirectConv2d(ClComponentDirectConv2d &&component) = default;
+ /** Allow instances of this class to be moved */
+ ClComponentDirectConv2d &operator=(ClComponentDirectConv2d &&component) = default;
+ /** Get template writer for the component */
+ const IGpuTemplateComponentWriter *template_writer() const override;
+ /** Get component type */
+ GpuComponentType type() const override
+ {
+ return GpuComponentType::Complex;
+ }
+
+private:
+ std::unique_ptr<ClTemplateDirectConv2d> _component_writer;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDIRECTCONV2D */
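A minimal usage sketch of the settings and validation interface declared above, assuming a caller already holds the component properties, the packed tensor infos and the conv2d attributes produced elsewhere in this patch; the helper name check_direct_conv2d and the use of IGpuKernelComponent::Properties as the properties type are illustrative assumptions, not part of the patch.

    // Sketch only: fluent Settings setters (each returns a reference to the settings
    // object) feeding into the static validate() entry point.
    using namespace arm_compute;
    using namespace arm_compute::experimental::dynamic_fusion;

    Status check_direct_conv2d(const IGpuKernelComponent::Properties &properties,
                               const ArgumentPack<ITensorInfo>       &tensors,
                               const Conv2dAttributes                &attributes)
    {
        ClComponentDirectConv2dSettings settings{};
        settings.export_to_cl_image(false) // keep weights in a buffer
                .fast_relaxed_math(true);  // allow relaxed-precision math
        return ClComponentDirectConv2d::validate(properties, tensors, attributes, settings);
    }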
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp
new file mode 100644
index 0000000000..f49f397ec1
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ClComponentStore.h"
+
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+Status ClComponentStore::validate(
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors)
+{
+ ARM_COMPUTE_UNUSED(properties, tensors);
+ return Status{};
+}
+ClComponentStore::ClComponentStore(ComponentId id, const Properties &properties, const ArgumentPack<ITensorInfo> &tensors)
+ : IGpuKernelComponent{ id, properties, tensors }, _component_writer{ std::make_unique<ClTemplateStore>(id, tensors) }
+{
+}
+ClComponentStore::~ClComponentStore()
+{
+}
+const IGpuTemplateComponentWriter *ClComponentStore::template_writer() const
+{
+ return _component_writer.get();
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h
new file mode 100644
index 0000000000..bf8c9f031e
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTSTORE
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTSTORE
+
+#include "arm_compute/core/Error.h"
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h"
+#include <memory>
+
+namespace arm_compute
+{
+/** Forward declaration */
+class ITensorInfo;
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+template <typename T>
+class ArgumentPack;
+
+class ClComponentStore final : public IGpuKernelComponent
+{
+public:
+ /** Validate the component
+ *
+ * @param[in] properties Component properties
+ * @param[in] tensors Tensor arguments to the components
+ *
+ * @return Status Validation results
+ *
+ * Tensor argument names:
+ * - ACL_SRC_0: Input
+ * - ACL_DST_0: Output
+ *
+ * Tensor argument constness:
+ * - ACL_SRC_0: Const
+ * - ACL_DST_0: Const
+ *
+ * Valid data layouts:
+ * - NHWC
+ *
+ * Valid data type configurations:
+ * |ACL_SRC_0 |ACL_DST_0 |
+ * |:--------------|:--------------|
+ * |All |All |
+ */
+ static Status validate(
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors);
+ /** Constructor
+ *
+ * Similar to @ref ClComponentStore::validate()
+ */
+ ClComponentStore(ComponentId id, const Properties &properties, const ArgumentPack<ITensorInfo> &tensors);
+ /** Destructor */
+ ~ClComponentStore() override;
+ /** Prevent instances of this class from being copy constructed */
+ ClComponentStore(const ClComponentStore &component) = delete;
+ /** Prevent instances of this class from being copied */
+ ClComponentStore &operator=(const ClComponentStore &component) = delete;
+ /** Allow instances of this class to be move constructed */
+ ClComponentStore(ClComponentStore &&component) = default;
+ /** Allow instances of this class to be moved */
+ ClComponentStore &operator=(ClComponentStore &&component) = default;
+ /** Get template writer for the component */
+ const IGpuTemplateComponentWriter *template_writer() const override;
+ /** Get component type */
+ GpuComponentType type() const override
+ {
+ return GpuComponentType::Output;
+ }
+
+private:
+ std::unique_ptr<ClTemplateStore> _component_writer;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTSTORE */
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp
new file mode 100644
index 0000000000..98c1cc3939
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp
@@ -0,0 +1,255 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h"
+
+#include "arm_compute/core/CL/CLCompileContext.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"
+#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+bool export_to_cl_image_support(const ITensorInfo *tensor, GPUTarget gpu_target, const cl::Device &device, DataLayout data_layout)
+{
+ if(tensor->tensor_shape()[0] % 4 || (data_layout != DataLayout::NHWC))
+ {
+ return false;
+ }
+
+ // If not floating point
+ if(!is_data_type_float(tensor->data_type()))
+ {
+ return false;
+ }
+
+ if(gpu_target == GPUTarget::G71 || get_arch_from_target(gpu_target) == GPUTarget::MIDGARD)
+ {
+ return false;
+ }
+
+ // Check if the cl_khr_image2d_from_buffer extension is supported on the target platform
+ if(!image2d_from_buffer_supported(device))
+ {
+ return false;
+ }
+
+ // Check cl image pitch alignment
+ if(get_cl_image_pitch_alignment(device) == 0)
+ {
+ return false;
+ }
+
+ const size_t image_w = tensor->tensor_shape()[0] / 4;
+ const size_t image_h = tensor->tensor_shape()[1] * tensor->tensor_shape()[2] * tensor->tensor_shape()[3];
+ const size_t max_image_w = device.getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>();
+ const size_t max_image_h = device.getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>();
+
+ if(image_w > max_image_w || image_h > max_image_h)
+ {
+ return false;
+ }
+
+ return true;
+}
+
+GpuOperatorType operator_type = GpuOperatorType::Complex;
+} // namespace
+
+Status GpuConv2d::validate_op(const GpuWorkloadSketch &sketch,
+ const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const ITensorInfo *bia,
+ const ITensorInfo *dst,
+ const Conv2dAttributes &attributes)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, wei, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ !src->has_valid_id() || !wei->has_valid_id() || !dst->has_valid_id());
+ if(bia != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(!bia->has_valid_id());
+ }
+
+ // Perform fusion test
+ // Pack tensor infos
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC_0, src);
+ tensors.add_const_tensor(ACL_SRC_1, wei);
+ tensors.add_const_tensor(ACL_SRC_2, bia);
+ tensors.add_const_tensor(ACL_DST_0, dst);
+ const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!sketch.implementation().operator_group().try_add_operator(op),
+ "Operator fusion test failed. This operator cannot be fused into the workload");
+
+ // Auto initialize dst tensor info
+ TensorInfo dst_info_to_validate = *dst;
+ const auto data_layout = src->data_layout();
+
+ {
+ auto shape = misc::shape_calculator::compute_deep_convolution_shape(src->tensor_shape(), data_layout, wei->tensor_shape(),
+ PadStrideInfo(attributes.stride().x(), attributes.stride().y(), attributes.pad().left,
+ attributes.pad().right,
+ attributes.pad().top, attributes.pad().bottom, DimensionRoundingType::FLOOR)); // use the default DimensionRoundingType
+
+ auto_init_if_empty(dst_info_to_validate, src->clone()->set_tensor_shape(shape));
+ }
+
+ // Check support level
+ // Data type
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+ // Data layout
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC);
+
+ const auto sketch_ctx = sketch.implementation().context();
+
+ const auto gpu_target = sketch_ctx->gpu_target();
+
+ if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+ {
+ const auto cl_compile_ctx = sketch_ctx->cl_compile_context();
+ ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr);
+ // Validate Direct Conv2d Component
+ {
+ const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
+ auto settings = ClComponentDirectConv2d::Settings();
+
+ settings.export_to_cl_image(
+ export_to_cl_image_support(src, gpu_target, cl_compile_ctx->get_device(), data_layout));
+
+ settings.fast_relaxed_math(
+ (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST)
+ && (dst_info_to_validate.data_type() == DataType::F32 || dst_info_to_validate.data_type() == DataType::F16));
+
+ ArgumentPack<ITensorInfo> arguments;
+ arguments.add_const_tensor(ACL_SRC_0, src);
+ arguments.add_const_tensor(ACL_SRC_1, wei);
+ arguments.add_const_tensor(ACL_SRC_2, bia);
+ arguments.add_const_tensor(ACL_DST_0, &dst_info_to_validate);
+ ARM_COMPUTE_RETURN_ON_ERROR(ClComponentDirectConv2d::validate(properties, arguments, attributes, settings));
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_MSG("Unimplemented Gpu language");
+ }
+ return Status{};
+}
+
+void GpuConv2d::create_op(GpuWorkloadSketch &sketch,
+ ITensorInfo *src,
+ ITensorInfo *wei,
+ ITensorInfo *bia,
+ ITensorInfo *dst,
+ const Conv2dAttributes &attributes)
+{
+ // Assert validation
+ ARM_COMPUTE_ERROR_THROW_ON(GpuConv2d::validate_op(sketch, src, wei, bia, dst, attributes));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, wei, dst);
+ const auto data_layout = src->data_layout();
+
+ // Auto initialize dst tensor
+ {
+ auto shape = misc::shape_calculator::compute_deep_convolution_shape(src->tensor_shape(), data_layout, wei->tensor_shape(),
+ PadStrideInfo(attributes.stride().x(), attributes.stride().y(), attributes.pad().left,
+ attributes.pad().right,
+ attributes.pad().top, attributes.pad().bottom, DimensionRoundingType::FLOOR)); // use the default DimensionRoundingType
+
+ auto_init_if_empty(*dst, src->clone()->set_tensor_shape(shape));
+ }
+
+ // Translate into components and add to component graph
+ auto &comp_graph = sketch.implementation().component_graph();
+
+ const auto sketch_ctx = sketch.implementation().context();
+
+ const auto gpu_target = sketch_ctx->gpu_target();
+
+ if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+ {
+ const auto cl_compile_ctx = sketch_ctx->cl_compile_context();
+ ARM_COMPUTE_ERROR_ON(cl_compile_ctx == nullptr);
+
+ // Add Direct Conv2d Component
+ {
+ auto properties = IGpuKernelComponent::Properties();
+ properties.stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
+
+ auto settings = ClComponentDirectConv2d::Settings();
+
+ settings.export_to_cl_image(
+ export_to_cl_image_support(src, gpu_target, cl_compile_ctx->get_device(), data_layout));
+
+ settings.fast_relaxed_math(
+ (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST)
+ && (dst->data_type() == DataType::F32 || dst->data_type() == DataType::F16));
+
+ if(settings.export_to_cl_image())
+ {
+ arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(wei);
+ }
+
+ ArgumentPack<ITensorInfo> arguments;
+ arguments.add_const_tensor(ACL_SRC_0, src);
+ arguments.add_const_tensor(ACL_SRC_1, wei);
+ arguments.add_const_tensor(ACL_SRC_2, bia);
+ arguments.add_const_tensor(ACL_DST_0, dst);
+ comp_graph.add_new_component<ClComponentDirectConv2d>(properties, arguments, attributes, settings);
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unimplemented Gpu language");
+ }
+
+ // Set up fusion test by adding to the Operator Group
+ // Note this has to be performed after all the components have been successfully added to the component graph
+
+ // Pack tensor infos
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC_0, src);
+ tensors.add_tensor(ACL_SRC_1, wei);
+ tensors.add_const_tensor(ACL_SRC_2, bia);
+ tensors.add_tensor(ACL_DST_0, dst);
+
+ const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+ sketch.implementation().operator_group().add_operator(op);
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
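A minimal sketch of how the validate_op()/create_op() pair above might be driven, assuming a GpuWorkloadSketch has already been constructed against a GpuWorkloadContext and that the tensor infos carry valid ids; the wrapper function name is illustrative only.

    // Sketch only: validate-then-create, mirroring the pattern used internally above.
    using namespace arm_compute;
    using namespace arm_compute::experimental::dynamic_fusion;

    Status try_add_conv2d(GpuWorkloadSketch &sketch,
                          ITensorInfo *src, ITensorInfo *wei, ITensorInfo *bia, ITensorInfo *dst,
                          const Conv2dAttributes &attributes)
    {
        // validate_op() runs the fusion test and support-level checks without mutating the sketch
        const Status status = GpuConv2d::validate_op(sketch, src, wei, bia, dst, attributes);
        if(status.error_code() == ErrorCode::OK)
        {
            // create_op() auto-initialises dst and adds the ClComponentDirectConv2d to the component graph
            GpuConv2d::create_op(sketch, src, wei, bia, dst, attributes);
        }
        return status;
    }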
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp
new file mode 100644
index 0000000000..13c0b141a5
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "GpuKernelVariableTable.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/ITensorInfo.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+void GpuKernelVariableTable::declare_variable(const ITensorInfo *tensor, GpuKernelArgumentInfo argument_info, bool is_interm, const std::string &alias)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(!tensor->has_valid_id(), "Tensor info with valid id expected");
+ // Do not re-declare if the variable associated with the tensor has already been declared
+ if(get_variable(tensor).has_valid_id())
+ {
+ ARM_COMPUTE_ERROR_ON(!(get_variable(tensor).kernel_argument_info == argument_info));
+ return;
+ }
+ // Declare variable associated with the tensor
+ std::stringstream ss;
+ ss << alias << "_t" << tensor->id();
+ const auto uniq_name = ss.str();
+ TensorVariable var{ tensor->id(), uniq_name, argument_info };
+
+ if(is_interm)
+ {
+ _interm_var = var;
+ _interm_tensors.insert(tensor->id());
+ }
+ else
+ {
+ _vars.emplace(tensor->id(), var);
+ }
+}
+
+GpuKernelVariableTable::TensorVariable GpuKernelVariableTable::get_variable(const ITensorInfo *tensor) const
+{
+ const TensorVariable empty_var{};
+ if(_vars.find(tensor->id()) != _vars.end())
+ {
+ return _vars.at(tensor->id());
+ }
+ if(_interm_tensors.find(tensor->id()) != _interm_tensors.end())
+ {
+ return _interm_var;
+ }
+ return empty_var;
+}
+
+GpuKernelVariableTable::VariableList GpuKernelVariableTable::get_variable_list(const std::vector<const ITensorInfo *> &tensors) const
+{
+ VariableList vars{};
+ for(const auto &tensor : tensors)
+ {
+ if(!tensor->has_valid_id())
+ {
+ continue;
+ }
+ vars.push_back(get_variable(tensor));
+ }
+ return vars;
+}
+
+TagVal::TagVal(const GpuKernelVariableTable::TensorVariable &var)
+ : value{ var.uniq_name }
+{
+}
+
+TagVal::TagVal(const std::string &val)
+ : value{ val }
+{
+}
+
+TagVal::TagVal(const char *val)
+ : value{ std::string(val) }
+{
+}
+
+TagVal::TagVal(const DataType &data_type)
+ : value{ get_cl_type_from_data_type(data_type) }
+{
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
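The naming scheme in declare_variable() above is simply the alias, "_t", and the tensor id. A small self-contained illustration of that convention (the convention only, not the class itself):

    // Illustration only: reproduces the "<alias>_t<id>" naming convention used above.
    #include <iostream>
    #include <sstream>
    #include <string>

    std::string make_variable_name(const std::string &alias, int tensor_id)
    {
        std::stringstream ss;
        ss << alias << "_t" << tensor_id;
        return ss.str();
    }

    int main()
    {
        std::cout << make_variable_name("src", 3) << "\n";     // prints "src_t3"
        std::cout << make_variable_name("unnamed", 7) << "\n"; // default alias -> "unnamed_t7"
        return 0;
    }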
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h b/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h
new file mode 100644
index 0000000000..4eee3963c2
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_GPUKERNELVARIABLETABLE
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_GPUKERNELVARIABLETABLE
+
+#include "arm_compute/core/ITensorInfo.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "support/Requires.h"
+#include "support/StringSupport.h"
+
+#include <set>
+#include <string>
+#include <type_traits>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** A table of all the variables used in the kernel
+ * Since fusion is restricted to a linear sequence of components in a kernel, only a single "intermediate variable" (the accumulator) is allowed.
+ * Each kernel has exactly one variable table
+ */
+class GpuKernelVariableTable
+{
+public:
+ /** A tensor variable whose main purposes are:
+ * - Hold the newly assigned @ref GpuKernelArgumentInfo for the associated tensor info
+ * - Hold the generated variable name for the associated tensor info
+ */
+ struct TensorVariable
+ {
+ public:
+ TensorVariable() = default;
+ TensorVariable(const TensorVariable &) = default;
+ TensorVariable &operator=(const TensorVariable &) = default;
+ ITensorInfo::Id id{ ITensorInfo::invalid_tensor_id };
+ std::string uniq_name{ "empty" }; // Unique name, also the final variable name used in the built code
+ GpuKernelArgumentInfo kernel_argument_info{};
+ bool has_valid_id() const
+ {
+ return id != ITensorInfo::invalid_tensor_id;
+ }
+ };
+ using VariableList = std::vector<TensorVariable>;
+
+public:
+ /** Declare a @ref TensorVariable for a corresponding tensor info.
+ *
+ * @note: Later re-declaration of the intermediate variable will overwrite the previous association to the @ref ITensorInfo
+ * Therefore, the order of declaration is important. It is assumed that the components declaring the variable are already in the correct order.
+ *
+ * @param[in] tensor Tensor info with which the new variable is associated
+ * @param[in] argument_info Kernel argument information
+ * @param[in] is_interm If the new variable is an intermediate variable
+ * @param[in] alias Alias for the variable. Will be used as part of the variable name
+ */
+ void declare_variable(const ITensorInfo *tensor, GpuKernelArgumentInfo argument_info, bool is_interm = false, const std::string &alias = "unnamed");
+ /** Get the @ref TensorVariable associated with @p tensor
+ *
+ * @param[in] tensor Tensor info to be queried
+ *
+ * @return TensorVariable
+ */
+ TensorVariable get_variable(const ITensorInfo *tensor) const;
+ /** Get the @ref TensorVariable list associated with @p tensors
+ * @note Empty tensors are skipped
+ *
+ * @param[in] tensors List of tensor infos to be queried
+ *
+ * @return VariableList
+ */
+ VariableList get_variable_list(const std::vector<const ITensorInfo *> &tensors) const;
+
+private:
+ std::map<ITensorInfo::Id, TensorVariable> _vars{}; /**< Non-intermediate (function parameter) variables*/
+ TensorVariable _interm_var{}; /**< Intermediate variable */
+ std::set<ITensorInfo::Id> _interm_tensors{}; /**< Tensors associated with the single intermediate variable */
+};
+
+/** A tag value will substitute a tag in a string template during its instantiation */
+struct TagVal
+{
+ /** Default constructor */
+ TagVal() = default;
+ /** Construct a @ref TagVal from a @ref GpuKernelVariableTable::TensorVariable */
+ TagVal(const GpuKernelVariableTable::TensorVariable &var);
+ /** Construct a @ref TagVal from an integral type */
+ template <typename T, ARM_COMPUTE_REQUIRES_TA(std::is_integral<T>::value)>
+ TagVal(T val)
+ : value{ support::cpp11::to_string(val) }
+ {
+ }
+ /** Construct a @ref TagVal from a string */
+ TagVal(const std::string &val);
+ /** Construct a @ref TagVal from a c-style string */
+ TagVal(const char *val);
+ /** Construct a @ref TagVal from a @ref DataType */
+ TagVal(const DataType &data_type);
+ /** The value of the TagVal, converted to a string */
+ std::string value{};
+};
+
+/** A tag used in a string template is a placeholder string to be substituted by real values during template instantiation */
+using Tag = std::string;
+
+/** Tag lookup table. It is used to instantiate a string template */
+using TagLUT = std::unordered_map<Tag, TagVal>;
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_GPUKERNELVARIABLETABLE */
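A TagLUT maps each "{{tag}}" placeholder in a code template to a TagVal; the substitution itself is performed by ClTemplateWriter, which is outside this hunk. Below is a simplified, self-contained stand-in for that substitution step, using plain strings in place of TagVal:

    // Simplified sketch of template instantiation from a tag lookup table.
    #include <iostream>
    #include <string>
    #include <unordered_map>

    std::string instantiate(std::string code, const std::unordered_map<std::string, std::string> &lut)
    {
        for(const auto &tag : lut)
        {
            const std::string placeholder = "{{" + tag.first + "}}";
            for(size_t pos = code.find(placeholder); pos != std::string::npos; pos = code.find(placeholder, pos))
            {
                code.replace(pos, placeholder.size(), tag.second);
                pos += tag.second.size();
            }
        }
        return code;
    }

    int main()
    {
        const std::string templ = "TILE({{ACC_DATA_TYPE}}, M0, N0, {{dst}});";
        std::cout << instantiate(templ, { { "ACC_DATA_TYPE", "float" }, { "dst", "dst_t0" } }) << "\n";
        // prints: TILE(float, M0, N0, dst_t0);
        return 0;
    }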
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h b/src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h
new file mode 100644
index 0000000000..c85ddf5a2c
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_IGPUTEMPLATECOMPONENTWRITER
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_IGPUTEMPLATECOMPONENTWRITER
+
+#include "arm_compute/core/CL/CLCompileContext.h"
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/Window.h"
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/components/Types.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+class GpuKernelComponentGroup;
+class GpuKernelVariableTable;
+
+/** An interface used by @ref ClTemplateWriter to write source code for a kernel component
+ */
+class IGpuTemplateComponentWriter
+{
+public:
+ using ComponentGroup = GpuKernelComponentGroup;
+
+public:
+ /** Constructor
+ *
+ * @param[in] id Component id
+ * @param[in] tensors Tensor arguments to the components
+ */
+ IGpuTemplateComponentWriter(ComponentId id, const ArgumentPack<ITensorInfo> tensors)
+ : _id{ id }, _tensors{ tensors }
+ {
+ }
+ /** Destructor */
+ virtual ~IGpuTemplateComponentWriter()
+ {
+ }
+ /** Generate kernel component name */
+ virtual std::string get_name() const = 0;
+ /** Generate kernel component code template
+ *
+ * @param[in] comp_group Component group of which the component is a part
+ *
+ * @return std::string Component code
+ */
+ virtual std::string get_component_code(const ComponentGroup &comp_group) const = 0;
+ /** Declare all variables used by the component in the @p vtable
+ *
+ * @param[out] vtable Variable table
+ * @param[in] comp_group Component group of which the component is a part
+ */
+ virtual void declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const = 0;
+ /** Generate the tag look-up table used to instantiate the component code.
+ *
+ * @param[in] vtable Variable table
+ * @param[in] comp_group Component group of which the component is a part
+ *
+ * @return TagLUT Tag lookup table
+ */
+ virtual TagLUT get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const = 0;
+ /** Generate additional macros used in the component */
+ virtual std::string get_additional_macros() const
+ {
+ return "";
+ }
+ /** Generate the build options used in the component
+ *
+ * @param[in] comp_group Component group of which the component is a part
+ *
+ * @return CLBuildOptions Build options
+ */
+ virtual CLBuildOptions get_build_options(const ComponentGroup &comp_group) const
+ {
+ ARM_COMPUTE_UNUSED(comp_group);
+ return CLBuildOptions{};
+ }
+ /** Generate the component config id string used for tuning */
+ virtual std::string get_config_id() const
+ {
+ return "";
+ }
+ /** Generate the header list used in the component */
+ virtual std::set<std::string> get_headers_list() const
+ {
+ return std::set<std::string> {};
+ }
+ /** Generate the execution window for the component */
+ virtual Window get_window() const
+ {
+ return Window{};
+ }
+ /** Get tensor arguments */
+ ArgumentPack<ITensorInfo> tensors() const
+ {
+ return _tensors;
+ }
+ /** Get component id */
+ ComponentId id() const
+ {
+ return _id;
+ }
+
+private:
+ ComponentId _id{ -1 };
+ ArgumentPack<ITensorInfo> _tensors{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_IGPUTEMPLATECOMPONENTWRITER */
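A minimal sketch of a concrete writer against the interface above, assuming the headers added in this patch are available; the pass-through component it describes is hypothetical and exists only to show which virtual methods a writer must override and how it typically uses the variable table.

    // Sketch only: a hypothetical pass-through writer, not part of this patch.
    class ClTemplateExamplePassthrough final : public IGpuTemplateComponentWriter
    {
    public:
        ClTemplateExamplePassthrough(ComponentId id, const ArgumentPack<ITensorInfo> &tensors)
            : IGpuTemplateComponentWriter{ id, tensors },
              _src{ tensors.get_const_tensor(TensorType::ACL_SRC_0) },
              _dst{ tensors.get_const_tensor(TensorType::ACL_DST_0) }
        {
        }
        std::string get_name() const override
        {
            return "example_passthrough";
        }
        std::string get_component_code(const ComponentGroup &comp_group) const override
        {
            ARM_COMPUTE_UNUSED(comp_group);
            return "// {{dst}} aliases {{src}} in this placeholder body\n";
        }
        void declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override
        {
            vtable.declare_variable(_src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
                                    comp_group.is_intermediate_tensor(_src), "src");
            vtable.declare_variable(_dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
                                    comp_group.is_intermediate_tensor(_dst), "dst");
        }
        TagLUT get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override
        {
            ARM_COMPUTE_UNUSED(comp_group);
            TagLUT lut{};
            lut["src"] = vtable.get_variable(_src);
            lut["dst"] = vtable.get_variable(_dst);
            return lut;
        }

    private:
        const ITensorInfo *_src;
        const ITensorInfo *_dst;
    };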
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp
new file mode 100644
index 0000000000..870de64eb8
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp
@@ -0,0 +1,400 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ClTemplateDirectConv2d.h"
+
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"
+
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+ClTemplateDirectConv2d::ClTemplateDirectConv2d(ComponentId id,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings)
+ : IGpuTemplateComponentWriter{ id, tensors },
+ _src{},
+ _weight{},
+ _bias{},
+ _dst{},
+ _attributes{ attributes },
+ _settings{ settings }
+{
+ _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
+ _weight = this->tensors().get_const_tensor(TensorType::ACL_SRC_1);
+ if(this->tensors().get_const_tensor(TensorType::ACL_SRC_2))
+ {
+ _bias = this->tensors().get_const_tensor(TensorType::ACL_SRC_2);
+ }
+ _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _weight, _dst);
+}
+
+std::string ClTemplateDirectConv2d::get_name() const
+{
+ return "direct_conv2d";
+}
+
+std::string ClTemplateDirectConv2d::get_component_code(const ComponentGroup &comp_group) const
+{
+ ARM_COMPUTE_UNUSED(comp_group);
+
+ const auto channel_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::CHANNEL);
+ const auto k0 = adjust_vec_size(is_data_type_quantized(_src->data_type()) ? 16u : 8u, _src->dimension(channel_idx));
+ const bool leftover_loop = (_src->dimension(channel_idx) % k0) != 0;
+
+ std::string code = R"_(
+//------------------ START KERNEL {{meta_kernel_id}} ---------------------
+// IN_0(src) {{src}}
+// IN_1(wei) {{weight}}
+)_";
+ if(_bias && _bias->has_valid_id())
+ {
+ code += R"_(
+// IN_1(bia) {{bias}}
+)_";
+ }
+ code += R"_(
+// OUT(dst, accum) {{dst}}
+
+// Initialize the accumulators
+TILE({{ACC_DATA_TYPE}}, M0, N0, {{dst}});
+{
+ // All the tensor dimensions are passed at compile time.
+ // In case of dynamic tensor support, the following dimensions should be passed as function argument.
+#define _IWEI_WIDTH {{WEI_WIDTH}}
+#define _IWEI_HEIGHT {{WEI_HEIGHT}}
+#define _ISRC_WIDTH {{src}}_w
+#define _ISRC_HEIGHT {{src}}_h
+#define _ISRC_CHANNELS {{src}}_c
+#define _IDST_WIDTH {{arg_dst}}_w
+#define _IDST_HEIGHT {{arg_dst}}_h
+#define _IDST_CHANNELS {{arg_dst}}_c
+#define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT)
+
+ // .v = access the whole vector (OpenCL vector)
+ // .s[x] = access the vector element at position x (scalar access)
+ TILE(int, M0, 1, xi);
+ TILE(int, M0, 1, yi);
+
+ // Convert the linear index to coordinate
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ xi[i].v = ((g_ind_1 + i) % _IDST_WIDTH) * {{STRIDE_X}};
+ yi[i].v = ((g_ind_1 + i) / _IDST_WIDTH) * {{STRIDE_Y}};
+ xi[i].v -= {{PAD_LEFT}};
+ yi[i].v -= {{PAD_TOP}};
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ {{dst}}[i].v = 0;
+ })
+
+ for(int i = 0; i < (_IWEI_WIDTH * _IWEI_HEIGHT); ++i)
+ {
+ int ck = 0;
+ int xk = i % _IWEI_WIDTH;
+ int yk = i / _IWEI_WIDTH;
+
+ int k = 0;
+ for(; k <= (_ISRC_CHANNELS - K0); k += K0)
+ {
+ TILE({{SRC_DATA_TYPE}}, M0, K0, a);
+ TILE({{WEI_DATA_TYPE}}, N0, K0, b);
+
+ // Initialize tiles
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = {{ZERO_VALUE}};
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ b[i].v = {{ZERO_VALUE}};
+ })
+
+ // Load tile from the src tensor
+ T_LOAD_NHWC_INDIRECT({{SRC_DATA_TYPE}}, M0, K0, {{SRC_TENSOR_TYPE}}, {{src}}, g_ind_2, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, {{src}}_stride_y, xi, yi, a);
+
+ // Load tile from the weights tensor
+ T_LOAD({{WEI_DATA_TYPE}}, N0, K0, {{WEI_TENSOR_TYPE}}, {{weight}}, ck, g_ind_0 * _IY_MULTIPLIER + i, _IY_MULTIPLIER, {{weight}}_stride_y, b);
+
+ // Compute the matrix multiplication between two tiles
+ T_MMUL({{SRC_DATA_TYPE}}, {{WEI_DATA_TYPE}}, {{ACC_DATA_TYPE}}, M0, N0, K0, NT, T, a, b, {{dst}});
+
+ ck += K0;
+ }
+
+        // We voluntarily rely on the compile-time _ISRC_CHANNELS here rather than a runtime channel count
+        // This compile-time leftover handling should be removed in case of dynamic tensor support
+)_";
+
+ if(leftover_loop)
+ {
+ code += R"_(
+ // Left-over accumulations
+ for(; k < _ISRC_CHANNELS; ++k)
+ {
+ TILE({{SRC_DATA_TYPE}}, M0, 1, a);
+ TILE({{WEI_DATA_TYPE}}, N0, 1, b);
+
+ // Initialize tiles
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = {{ZERO_VALUE}};
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ b[i].v = {{ZERO_VALUE}};
+ })
+
+ // Load tile from the src tensor
+ T_LOAD_NHWC_INDIRECT({{SRC_DATA_TYPE}}, M0, 1, {{SRC_TENSOR_TYPE}}, {{src}}, g_ind_2, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, {{src}}_stride_y, xi, yi, a);
+
+ // Load tile from the weights tensor
+ // The T_LOAD for the left-over elements can only use BUFFER because we load one element per iteration
+ T_LOAD({{WEI_DATA_TYPE}}, N0, 1, BUFFER, {{weight}}, ck, g_ind_0 * _IY_MULTIPLIER + i, _IY_MULTIPLIER, {{weight}}_stride_y, b);
+
+ // Compute the matrix multiplication between two tiles
+ T_MMUL({{SRC_DATA_TYPE}}, {{WEI_DATA_TYPE}}, {{ACC_DATA_TYPE}}, M0, N0, 1, NT, T, a, b, {{dst}});
+
+ ++ck;
+ }
+ )_";
+ }
+
+ code += R"_(
+#undef _IWEI_WIDTH
+#undef _IWEI_HEIGHT
+#undef _ISRC_WIDTH
+#undef _ISRC_HEIGHT
+#undef _ISRC_CHANNELS
+#undef _IDST_WIDTH
+#undef _IDST_HEIGHT
+#undef _IDST_CHANNELS
+#undef _IY_MULTIPLIER
+
+ }
+)_";
+
+ if(_bias && _bias->has_valid_id())
+ {
+ code += R"_(
+ TILE({{BIA_DATA_TYPE}}, 1, N0, bias0);
+
+ T_LOAD({{BIA_DATA_TYPE}}, 1, N0, BUFFER, {{bias}}, g_ind_0, 0, 1, 0, bias0);
+
+ // c = c + bias[broadcasted]
+ T_ELTWISE_BROADCAST_ADD_X({{ACC_DATA_TYPE}}, M0, N0, {{dst}}, bias0, {{dst}});
+ )_";
+ }
+
+ code += R"_(
+}
+//------------------ END KERNEL {{meta_kernel_id}} ---------------------
+)_";
+ return code;
+}
+
+void ClTemplateDirectConv2d::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
+{
+ vtable.declare_variable(
+ _src,
+ GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+ comp_group.is_intermediate_tensor(_src),
+ "src");
+
+ const GpuKernelArgumentInfo::Type weight_type = _settings.export_to_cl_image() ? GpuKernelArgumentInfo::Type::Tensor_4D_t_Image : GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer;
+ vtable.declare_variable(
+ _weight,
+ GpuKernelArgumentInfo(weight_type),
+ comp_group.is_intermediate_tensor(_weight),
+ "weight");
+
+ if(_bias && _bias->has_valid_id()) // optional bias
+ {
+ vtable.declare_variable(
+ _bias,
+ GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Vector),
+ comp_group.is_intermediate_tensor(_bias),
+ "bias");
+ }
+ vtable.declare_variable(
+ _dst,
+ GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+ comp_group.is_intermediate_tensor(_dst),
+ "dst");
+}
+
+TagLUT ClTemplateDirectConv2d::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
+{
+ TagLUT lut{};
+ // Arguments and global shared variables
+ lut["src"] = vtable.get_variable(_src);
+ lut["weight"] = vtable.get_variable(_weight);
+
+ if(_bias && _bias->has_valid_id()) // optional bias
+ {
+ lut["bias"] = vtable.get_variable(_bias);
+ lut["BIA_DATA_TYPE"] = get_cl_type_from_data_type(_bias->data_type());
+ }
+ lut["dst"] = vtable.get_variable(_dst);
+
+ const auto dst_argument = vtable.get_variable(comp_group.get_dst_tensors()[0]);
+ lut["arg_dst"] = dst_argument.uniq_name;
+
+ // Local build options
+ lut["meta_kernel_id"] = id();
+ lut["ACC_DATA_TYPE"] = _src->data_type();
+ lut["SRC_DATA_TYPE"] = _src->data_type();
+ lut["WEI_DATA_TYPE"] = _weight->data_type();
+
+ lut["SRC_TENSOR_TYPE"] = "BUFFER";
+ switch(vtable.get_variable(_weight).kernel_argument_info.type)
+ {
+ case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D:
+ case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D:
+ case GpuKernelArgumentInfo::Type::Tensor_4D_t_Image:
+ {
+ lut["WEI_TENSOR_TYPE"] = "IMAGE";
+ break;
+ }
+ default:
+ {
+ lut["WEI_TENSOR_TYPE"] = "BUFFER";
+ break;
+ }
+ }
+ const auto width_idx = 1;
+ const auto height_idx = 2;
+ lut["WEI_WIDTH"] = _weight->dimension(width_idx);
+ lut["WEI_HEIGHT"] = _weight->dimension(height_idx);
+
+ lut["STRIDE_X"] = _attributes.stride().x();
+ lut["STRIDE_Y"] = _attributes.stride().y();
+
+ lut["PAD_LEFT"] = _attributes.pad().left;
+ lut["PAD_TOP"] = _attributes.pad().top;
+
+ lut["ZERO_VALUE"] = 0;
+
+ return lut;
+}
+
+CLBuildOptions ClTemplateDirectConv2d::get_build_options(const ComponentGroup &comp_group) const
+{
+ const unsigned int channel_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::CHANNEL);
+ const DataType data_type = _src->data_type();
+
+ /// NOTE: For now, the tile sizes (n0, m0) are set by the execution window. This may change in the future
+ const auto root_window = comp_group.get_root_component()->template_writer()->get_window();
+ const unsigned int n0 = root_window.x().step();
+ const unsigned int m0 = root_window.y().step();
+ const unsigned int k0 = adjust_vec_size(is_data_type_quantized(data_type) ? 16u : 8u, _src->dimension(channel_idx));
+ const unsigned int partial_store_n0 = _dst->dimension(0) % n0;
+
+ CLBuildOptions build_opts{};
+ if(_settings.fast_relaxed_math())
+ {
+ build_opts.add_option("-cl-fast-relaxed-math");
+ }
+ else
+ {
+ // -cl-fast-relaxed-math also sets -cl-finite-math-only and -cl-unsafe-math-optimizations
+ // to disable -cl-finite-math-only, we only include -cl-unsafe-math-optimizations
+ build_opts.add_option("-cl-unsafe-math-optimizations");
+ }
+ build_opts.add_option("-DIS_TILED");
+ build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
+ build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
+ build_opts.add_option("-DK0=" + support::cpp11::to_string(k0));
+ build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));
+
+ return build_opts;
+}
+
+std::string ClTemplateDirectConv2d::get_config_id() const
+{
+ const DataType data_type = _src->data_type();
+ const DataLayout data_layout = _src->data_layout();
+
+ const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+ const unsigned int kernel_size = _weight->dimension(width_idx);
+
+ std::string config_id{};
+ config_id += lower_string(string_from_data_type(data_type));
+ config_id += "_";
+ config_id += support::cpp11::to_string(kernel_size);
+ config_id += "_";
+ config_id += support::cpp11::to_string(_attributes.stride().x());
+ config_id += "_";
+ config_id += support::cpp11::to_string(_attributes.stride().y());
+ config_id += "_";
+ config_id += support::cpp11::to_string(_dst->dimension(width_idx));
+ config_id += "_";
+ config_id += support::cpp11::to_string(_dst->dimension(height_idx));
+ config_id += "_";
+ config_id += lower_string(string_from_data_layout(data_layout));
+ return config_id;
+}
+
+std::set<std::string> ClTemplateDirectConv2d::get_headers_list() const
+{
+ return std::set<std::string>{ "helpers.h", "tile_helpers.h" };
+}
+
+Window ClTemplateDirectConv2d::get_window() const
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
+
+ const auto output_shape = _dst->tensor_shape();
+
+ const unsigned int vec_size = std::min(static_cast<unsigned int>(output_shape[0]), 4u);
+ const unsigned int num_rows = (_dst->tensor_shape()[0] > 16) ? ((_src->data_type() == DataType::F32) ? 2U : 4U) : 1U;
+
+ // Create and configure kernel window
+ Window win = calculate_max_window(output_shape, Steps(vec_size, num_rows));
+
+ const size_t dim_y_collapsed = ceil_to_multiple(output_shape[1] * output_shape[2], num_rows);
+ win.set(Window::DimY, Window::Dimension(0, dim_y_collapsed, num_rows));
+ win.set(Window::DimZ, Window::Dimension(0, output_shape.total_size_upper(3), 1));
+
+ return win;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
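The window and build-option logic above couples the tile sizes: n0 comes from the window's x step (the vector size), m0 from the y step (rows per work-item), and PARTIAL_N0 is whatever is left of the destination channel count after dividing by n0. A self-contained numeric illustration for an assumed F32 destination with 19 channels (values are examples, not taken from a real run):

    #include <algorithm>
    #include <iostream>

    int main()
    {
        const unsigned int dst_c  = 19;   // destination channels (dimension 0 in NHWC)
        const bool         is_f32 = true;

        const unsigned int n0         = std::min(dst_c, 4u);                    // vec_size
        const unsigned int m0         = (dst_c > 16) ? (is_f32 ? 2u : 4u) : 1u; // num_rows
        const unsigned int partial_n0 = dst_c % n0;                             // leftover channels

        std::cout << "N0=" << n0 << " M0=" << m0 << " PARTIAL_N0=" << partial_n0 << "\n";
        // prints: N0=4 M0=2 PARTIAL_N0=3
        return 0;
    }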
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h
new file mode 100644
index 0000000000..48027a9b8d
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEDIRECTCONV2D
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEDIRECTCONV2D
+
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/dynamic_fusion/sketch/OperatorAttributes.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class ClTemplateDirectConv2d final : public IGpuTemplateComponentWriter
+{
+public:
+ using Attributes = ClComponentDirectConv2d::Attributes;
+ using Settings = ClComponentDirectConv2d::Settings;
+ /** Constructor
+ *
+ * Similar to @ref ClComponentDirectConv2d::validate()
+ *
+ * @param[in] id Component id
+ * @param[in] tensors Tensor arguments to the components
+ * @param[in] attributes Component attributes
+ * @param[in] settings Component settings
+ */
+ ClTemplateDirectConv2d(ComponentId id,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings);
+ /** Prevent instances of this class from being copy constructed */
+ ClTemplateDirectConv2d(const ClTemplateDirectConv2d &direct_conv2d) = delete;
+ /** Prevent instances of this class from being copied */
+ ClTemplateDirectConv2d &operator=(const ClTemplateDirectConv2d &direct_conv2d) = delete;
+ /** Allow instances of this class to be move constructed */
+ ClTemplateDirectConv2d(ClTemplateDirectConv2d &&direct_conv2d) = default;
+ /** Allow instances of this class to be moved */
+ ClTemplateDirectConv2d &operator=(ClTemplateDirectConv2d &&direct_conv2d) = default;
+ /** Generate kernel component name */
+ std::string get_name() const override;
+ /** Generate kernel component code template
+ *
+ * @param[in] comp_group Component group of which the component is a part
+ *
+ * @return std::string Component code
+ */
+ std::string get_component_code(const ComponentGroup &comp_group) const override;
+ /** Declare all variables used by the component in the @p vtable
+ *
+ * @param[out] vtable Variable table
+ * @param[in] comp_group Component group of which the component is a part
+ */
+ void declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
+ /** Generate the tag look-up table used to instantiate the component code.
+ *
+ * @param[in] vtable Variable table
+ * @param[in] comp_group Component group of which the component is a part
+ *
+ * @return TagLUT Tag lookup table
+ */
+ TagLUT get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
+ /** Generate the build options used in the component
+ *
+ * @param[in] comp_group Component group of which the component is a part
+ *
+ * @return CLBuildOptions Build options
+ */
+ CLBuildOptions get_build_options(const ComponentGroup &comp_group) const override;
+ /** Generate the component config id string used for tuning */
+ std::string get_config_id() const override;
+ /** Generate the header list used in the component */
+ std::set<std::string> get_headers_list() const override;
+ /** Generate the execution window for the component */
+ Window get_window() const override;
+
+private:
+ const ITensorInfo *_src;
+ const ITensorInfo *_weight;
+ const ITensorInfo *_bias;
+ const ITensorInfo *_dst;
+ Attributes _attributes;
+ Settings _settings;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEDIRECTCONV2D */
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp
new file mode 100644
index 0000000000..6c4b8f52f2
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ClTemplateStore.h"
+
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+ClTemplateStore::ClTemplateStore(ComponentId id, const ArgumentPack<ITensorInfo> &tensors)
+ : IGpuTemplateComponentWriter{ id, tensors }, _src{}, _dst{}
+{
+ _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
+ _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
+}
+
+std::string ClTemplateStore::get_name() const
+{
+ return "store";
+}
+
+std::string ClTemplateStore::get_component_code(const ComponentGroup &comp_group) const
+{
+ ARM_COMPUTE_UNUSED(comp_group);
+ return R"_(
+//------------------ START KERNEL {{meta_kernel_id}} STORE ---------------------
+{
+// This also follows NHWC layout
+// g_ind_0 maps to global_id(0), which indexes the Channel dimension
+// g_ind_1 maps to global_id(1), which indexes the collapsed Width x Height window
+// g_ind_2 maps to global_id(2), which indexes the N / Batch dimension
+#define _IDST_WIDTH {{dst}}_w
+#define _IDST_HEIGHT {{dst}}_h
+ TILE(uint, M0, 1, dst_indirect_y);
+
+ // Calculate the destination indirect Y
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ dst_indirect_y[i].v = (uint)min(g_ind_1 + i, (int)(_IDST_WIDTH * _IDST_HEIGHT) - 1);
+ dst_indirect_y[i].v += g_ind_2 * (int)(_IDST_WIDTH * _IDST_HEIGHT);
+ })
+
+ bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
+
+ T_STORE_INDIRECT_WIDTH_SELECT({{DST_DATA_TYPE}}, M0, N0, PARTIAL_N0, {{DST_TENSOR_TYPE}}, {{dst}}, g_ind_0, {{dst}}_stride_y, x_cond, {{src}}, dst_indirect_y);
+
+#undef _IDST_WIDTH
+#undef _IDST_HEIGHT
+ //------------------ END KERNEL {{meta_kernel_id}} STORE ---------------------
+}
+
+)_";
+}
+
+void ClTemplateStore::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
+{
+ vtable.declare_variable(
+ _src,
+ GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+ comp_group.is_intermediate_tensor(_src),
+ "src");
+ vtable.declare_variable(
+ _dst,
+ GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+ comp_group.is_intermediate_tensor(_dst),
+ "dst");
+}
+
+TagLUT ClTemplateStore::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
+{
+ TagLUT lut{};
+
+ // Arguments and global shared variables
+ lut["src"] = vtable.get_variable(_src);
+ lut["dst"] = vtable.get_variable(_dst);
+
+ // Local build options
+ lut["meta_kernel_id"] = id();
+ lut["DST_TENSOR_TYPE"] = "BUFFER";
+ const auto dst_info = comp_group.get_dst_tensors()[0];
+ lut["DST_DATA_TYPE"] = dst_info->data_type();
+
+ return lut;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h
new file mode 100644
index 0000000000..3f97a82204
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATESTORE
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATESTORE
+
+#include "arm_compute/core/experimental/Types.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class ClTemplateStore final : public IGpuTemplateComponentWriter
+{
+public:
+ /** Constructor
+ *
+ * @param[in] id Component id
+ * @param[in] tensors Tensor arguments to the components
+ */
+ ClTemplateStore(ComponentId id, const ArgumentPack<ITensorInfo> &tensors);
+ /** Prevent instances of this class from being copy constructed */
+ ClTemplateStore(const ClTemplateStore &store) = delete;
+ /** Prevent instances of this class from being copied */
+ ClTemplateStore &operator=(const ClTemplateStore &store) = delete;
+ /** Allow instances of this class to be move constructed */
+ ClTemplateStore(ClTemplateStore &&store) = default;
+ /** Allow instances of this class to be moved */
+ ClTemplateStore &operator=(ClTemplateStore &&store) = default;
+ /** Generate kernel component name */
+ std::string get_name() const override;
+ /** Generate kernel component code template
+ *
+ * @param[in] comp_group Component group of which the component is a part
+ *
+ * @return std::string Component code
+ */
+ std::string get_component_code(const ComponentGroup &comp_group) const override;
+ /** Declare all variables used by the component in the @p vtable
+ *
+ * @param[out] vtable Variable table
+ * @param[in] comp_group Component group of which the component is a part
+ */
+ void declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
+ /** Generate the tag look-up table used to instantiate the component code.
+ *
+ * @param[in] vtable Variable table
+ * @param[in] comp_group Component group of which the component is a part
+ *
+ * @return TagLUT Tag lookup table
+ */
+ TagLUT get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
+
+private:
+ const ITensorInfo *_src;
+ const ITensorInfo *_dst;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATESTORE */
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp
new file mode 100644
index 0000000000..cb643a741d
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp
@@ -0,0 +1,297 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ClTemplateWriter.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/// @note Some tags may be unused because they are needed only by the additional macros or only by the component code
+std::string ClTemplateWriter::replace_tags(const std::string &code_template, const TagLUT &tags)
+{
+ std::string replaced_code = "";
+ bool scanning_pattern = false;
+ std::string pattern_found = "";
+ for(size_t i = 0; i < code_template.size() - 1; ++i)
+ {
+ if(!scanning_pattern)
+ {
+ if(code_template[i] == '{' && code_template[i + 1] == '{')
+ {
+ i += 1;
+ scanning_pattern = true;
+ pattern_found = "";
+ }
+ else
+ {
+ replaced_code += code_template[i];
+ }
+ }
+ else
+ {
+ if(code_template[i] == '}' && code_template[i + 1] == '}')
+ {
+ i += 1;
+ scanning_pattern = false;
+ std::string err = "Pattern " + pattern_found + " not found in tags";
+ ARM_COMPUTE_ERROR_ON_MSG(tags.find(pattern_found) == tags.end(), err.c_str());
+ replaced_code += tags.find(pattern_found)->second.value;
+ }
+ else
+ {
+ pattern_found += code_template[i];
+ }
+ }
+ }
+
+ return replaced_code;
+}
+ClTemplateWriter::~ClTemplateWriter()
+{
+}
+ClTemplateWriter::ClTemplateWriter(const GpuKernelComponentGroup &components)
+ : _components{ components }
+{
+}
+std::string ClTemplateWriter::get_name()
+{
+ return write_kernel_name();
+}
+std::string ClTemplateWriter::get_code()
+{
+ return write_code();
+}
+std::string ClTemplateWriter::get_config_id()
+{
+ std::string config_id = get_name();
+ for(const auto &comp : _components)
+ {
+ config_id += "--" + comp->template_writer()->get_config_id() + "--";
+ }
+
+ return config_id;
+}
+
+CLBuildOptions ClTemplateWriter::get_build_options()
+{
+ CLBuildOptions build_opts{};
+
+ for(const auto &comp : _components)
+ {
+ build_opts.add_options(comp->template_writer()->get_build_options(_components).options());
+ }
+
+ return build_opts;
+}
+
+Window ClTemplateWriter::get_window() const
+{
+ const auto root_comp = _components.get_root_component();
+ ARM_COMPUTE_ERROR_ON_MSG(root_comp == nullptr, "No root component found");
+ return root_comp->template_writer()->get_window();
+}
+
+std::map<ITensorInfo::Id, GpuKernelArgument> ClTemplateWriter::get_tensors()
+{
+ // Assemble GpuKernelArguments
+ std::map<ITensorInfo::Id, GpuKernelArgument> tensors;
+ for(const auto t : _components.get_argument_tensors())
+ {
+ tensors.emplace(
+ t->id(),
+ GpuKernelArgument{ *t, _vtable.get_variable(t).kernel_argument_info });
+ }
+ return tensors;
+}
+
+std::string ClTemplateWriter::write_code()
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_components.empty(), "No components found");
+
+ // These data structures will hold the data from all the components in the blueprint
+ std::set<std::string> headers_list{};
+ std::set<std::string> additional_macros{};
+ std::vector<std::string> component_codes{}; // vector because order matters
+
+ // Pass 1: Declare all kernel variables
+ for(auto &component : _components)
+ {
+ component->template_writer()->declare_variables(_vtable, _components);
+ }
+ // Pass 2: Generate component codes
+ for(auto &component : _components)
+ {
+ const auto component_writer = component->template_writer();
+ auto curr_headers_list = component_writer->get_headers_list();
+ auto curr_additional_macros = component_writer->get_additional_macros();
+ auto curr_component_code = component_writer->get_component_code(_components);
+ const auto var_lut = component_writer->get_tag_lut(_vtable, _components); // Ideally can be merged with get_component_code once we have finer-grained code generation technique
+ component_codes.push_back(replace_tags(curr_component_code, var_lut));
+
+ headers_list.insert(curr_headers_list.begin(), curr_headers_list.end());
+ if(!curr_additional_macros.empty()) // Some components might not have any
+ {
+ additional_macros.insert(replace_tags(curr_additional_macros, var_lut));
+ }
+ }
+
+ // Pass 3: Assemble the data gathered by traversing the graph into the string "code"
+ std::string code = "";
+
+ for(auto &header : headers_list)
+ {
+#if defined(EMBEDDED_KERNELS)
+ code += CLKernelLibrary::get().get_program(header).first;
+#else // defined(EMBEDDED_KERNELS)
+ code += "#include \"" + header + "\"\n";
+#endif // defined(EMBEDDED_KERNELS)
+ }
+
+ for(auto &macros : additional_macros)
+ {
+ code += macros;
+ }
+
+ code += write_kernel_signature(_vtable.get_variable_list(_components.get_argument_tensors()));
+
+ code += "\n{\n\n";
+
+ code += " //------------------ START KERNEL_BUILDER_COORDINATE ---------------------\n\n";
+ code += write_global_section();
+ code += " //------------------ END KERNEL_BUILDER_COORDINATE ---------------------\n";
+
+ for(const auto &component_code : component_codes)
+ {
+ code += component_code;
+ }
+
+ code += "}\n";
+
+ return code;
+}
+std::string ClTemplateWriter::write_global_section() const
+{
+ const auto dst_tensors = _components.get_dst_tensors();
+ ARM_COMPUTE_ERROR_ON_MSG(dst_tensors.size() != 1, "Only one destination tensor per kernel is allowed");
+ const auto dst_info = dst_tensors[0];
+ const auto dst_w = dst_info->dimension(0);
+ const auto tile_w = std::max(1, get_window().x().step());
+ const auto tile_h = std::max(1, get_window().y().step());
+ auto leftover_w = dst_w % tile_w;
+
+ std::string code = "";
+ code += std::string(" int g_ind_0 = GET_SPATIAL_IDX(0, ") + std::to_string(tile_w) + ", " + std::to_string(leftover_w) + ");\n";
+ code += std::string(" int g_ind_1 = GET_SPATIAL_IDX(1, ") + std::to_string(tile_h) + ", " + "0);\n";
+ code += std::string(" int g_ind_2 = GET_SPATIAL_IDX(2, 1, 0);\n\n");
+
+ code += " const bool g_cond_x = (g_ind_0 == 0);\n";
+ code += " const bool g_cond_y = (g_ind_1 == 0);\n";
+
+ return code;
+}
+std::string ClTemplateWriter::write_argument_declaration(const GpuKernelVariableTable::TensorVariable &var) const
+{
+ std::string code;
+ switch(var.kernel_argument_info.type)
+ {
+ case GpuKernelArgumentInfo::Type::Vector:
+ {
+ code += "\n VECTOR_DECLARATION(" + var.uniq_name + ")";
+ break;
+ }
+ case GpuKernelArgumentInfo::Type::Image:
+ {
+ code += "\n IMAGE_DECLARATION(" + var.uniq_name + ")";
+ break;
+ }
+ case GpuKernelArgumentInfo::Type::Image_3D:
+ {
+ code += "\n IMAGE_DECLARATION(" + var.uniq_name + "),";
+ code += "\n unsigned int " + var.uniq_name + "_stride_z";
+ break;
+ }
+ case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D:
+ {
+ code += "\n __read_only image2d_t " + var.uniq_name + "_img,";
+ code += "\n unsigned int " + var.uniq_name + "_stride_z";
+ break;
+ }
+ case GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer:
+ {
+ code += "\n TENSOR4D_T(" + var.uniq_name + ", BUFFER)";
+ break;
+ }
+ case GpuKernelArgumentInfo::Type::Tensor_4D_t_Image:
+ {
+ code += "\n TENSOR4D_T(" + var.uniq_name + ", IMAGE)";
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Unsupported declaration generation for GpuKernelArgumentInfo::Type");
+ }
+ }
+ return code;
+}
+std::string ClTemplateWriter::write_kernel_signature(const GpuKernelVariableTable::VariableList &argument_list) const
+{
+ std::string code = "\n__kernel void " + write_kernel_name() + "(";
+
+ for(int i = 0; i < static_cast<int>(argument_list.size()) - 1; ++i)
+ {
+ code += write_argument_declaration(argument_list[i]) + ",";
+ }
+ if(static_cast<int>(argument_list.size()) - 1 >= 0)
+ {
+ code += write_argument_declaration(argument_list[argument_list.size() - 1]);
+ }
+
+ code += ')';
+
+ return code;
+}
+std::string ClTemplateWriter::write_kernel_name() const
+{
+ if(_components.empty())
+ {
+ return "empty_kernel";
+ }
+ std::string name = _components[0]->template_writer()->get_name();
+ for(size_t i = 1; i < _components.size(); ++i)
+ {
+ name += "___";
+ name += _components[i]->template_writer()->get_name();
+ }
+
+ return name;
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.h
new file mode 100644
index 0000000000..83f617b6c6
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEWRITER
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEWRITER
+
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
+
+#include <map>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Use a templated-string-based method to write kernel code
+ * It stitches the component code templates together based on the valid fusion configuration.
+ * It then instantiates the actual kernel code from the template and the generated tag lookup table.
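+ *
+ * A usage sketch (assuming @p group is an already-fused GpuKernelComponentGroup):
+ * @code
+ * ClTemplateWriter writer{ group };
+ * const std::string    kernel_name = writer.get_name();          // Component names joined with "___"
+ * const std::string    kernel_code = writer.get_code();          // Full OpenCL kernel source
+ * const CLBuildOptions build_opts  = writer.get_build_options(); // Union of the components' build options
+ * @endcode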
+ */
+class ClTemplateWriter : public IGpuKernelWriter
+{
+public:
+ /** Instantiates a kernel code string from the kernel code template
+ * @note Some tags may be unused because they are needed only by the additional macros or only by the component code
+ *
+ * @param[in] code_template Kernel code template
+ * @param[in] tags Tag lookup table
+ *
+ * @return std::string Instantiated kernel string
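+ *
+ * An instantiation sketch (the template and tags below are illustrative only and not taken from a real component):
+ * @code
+ * TagLUT tags{};
+ * tags["dst"]           = "tensor0";
+ * tags["DST_DATA_TYPE"] = "float";
+ * const std::string code = ClTemplateWriter::replace_tags("STORE({{DST_DATA_TYPE}}, {{dst}});\n", tags);
+ * // code == "STORE(float, tensor0);" (the last character of the template is never scanned, so the trailing newline is dropped)
+ * @endcode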
+ */
+ static std::string replace_tags(const std::string &code_template, const TagLUT &tags);
+ /** Default constructor */
+ ClTemplateWriter() = default;
+ /** Constructor
+ *
+ * @param[in] components Kernel component group from which the kernel will be generated
+ */
+ ClTemplateWriter(const GpuKernelComponentGroup &components);
+ /** Destructor */
+ ~ClTemplateWriter() override;
+ /** Generate kernel name */
+ std::string get_name() override;
+ /** Generate kernel code */
+ std::string get_code() override;
+ /** Generate build options */
+ CLBuildOptions get_build_options() override;
+ /** Generate config id string of the entire kernel. This is used for tuning */
+ std::string get_config_id() override;
+ /** Generate execution window */
+ Window get_window() const override;
+ /** Get the kernel argument list of the kernel */
+ std::map<ITensorInfo::Id, GpuKernelArgument> get_tensors() override;
+
+private:
+ std::string write_kernel_name() const;
+ std::string write_code();
+ std::string write_global_section() const;
+ std::string write_argument_declaration(const GpuKernelVariableTable::TensorVariable &var) const;
+ std::string write_kernel_signature(const GpuKernelVariableTable::VariableList &argument_list) const;
+
+private:
+ GpuKernelComponentGroup _components{};
+ GpuKernelVariableTable _vtable{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEWRITER */
diff --git a/src/dynamic_fusion/sketch/utils/DependencyGraph.h b/src/dynamic_fusion/sketch/utils/DependencyGraph.h
new file mode 100644
index 0000000000..55eb4c5c77
--- /dev/null
+++ b/src/dynamic_fusion/sketch/utils/DependencyGraph.h
@@ -0,0 +1,658 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_UTILS_DEPENDENCYGRAPH
+#define SRC_DYNAMIC_FUSION_SKETCH_UTILS_DEPENDENCYGRAPH
+
+#include "arm_compute/core/Error.h"
+#include <algorithm>
+#include <cstdint>
+#include <deque>
+#include <map>
+#include <set>
+#include <tuple>
+#include <vector>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+template <typename T>
+bool is_in(const T &v, const std::vector<T> &vec)
+{
+ return std::find(std::begin(vec), std::end(vec), v) != std::end(vec);
+}
+} // namespace
+
+/** A directed acyclic graph with multiple input tensors and multiple output tensors
+ * Represented as a doubly-linked adjacency list that distinguishes between source and destination tensors/operators
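+ *
+ * A minimal usage sketch (operator and tensor ids are arbitrary):
+ * @code
+ * DependencyGraph graph{};
+ * graph.add_operator(0, { 10 }, { 11 });     // op 0: t10 -> t11
+ * graph.add_operator(1, { 11, 12 }, { 13 }); // op 1: (t11, t12) -> t13
+ * const auto srcs = graph.global_src_tensors(); // { 10, 12 }
+ * const auto dsts = graph.global_dst_tensors(); // { 13 }
+ * @endcode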
+ */
+class DependencyGraph
+{
+public:
+ using Id = int32_t;
+ using TensorId = Id;
+ using OperatorId = Id;
+ /** Adjacency list
+ *
+ */
+ using AdjList = std::map<Id, std::vector<Id>>;
+
+ /** A pack of an operator together with its input and output tensors, used when traversing the graph in topological order
+ *
+ */
+ struct OpPack
+ {
+ OperatorId op{};
+ std::vector<TensorId> inputs{};
+ std::vector<TensorId> outputs{};
+ friend bool operator==(const OpPack &opp0, const OpPack &opp1)
+ {
+ return std::make_tuple(
+ opp0.op, opp0.inputs, opp0.outputs)
+ == std::make_tuple(
+ opp1.op, opp1.inputs, opp1.outputs);
+ }
+ };
+
+public:
+ DependencyGraph() = default;
+ friend std::ostream &operator<<(std::ostream &os, const DependencyGraph &);
+
+ /** Check whether an operator can be added while keeping the graph a "linear sequence" (list), without actually adding it
+ * @note The list is expected to only grow from head to tail
+ *
+ * PRECONDITION: The current graph is already linear
+ *
+ * @return true If the operator can be added while keeping the graph as a linear sequence
+ * @return false Otherwise
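+ *
+ * A sketch of the check (ids are arbitrary; the graph below already holds op 0: t10 -> t11):
+ * @code
+ * graph.try_add_operator_as_linear(1, { 11, 12 }, { 13 }); // true:  t11 is the single shared tensor and is produced by the tail op
+ * graph.try_add_operator_as_linear(2, { 10 }, { 14 });     // false: t10 is not a destination tensor of the tail op
+ * @endcode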
+ */
+ bool try_add_operator_as_linear(OperatorId op, const std::vector<TensorId> &inputs, const std::vector<TensorId> &outputs) const
+ {
+ ARM_COMPUTE_UNUSED(op, outputs);
+ if(all_ops().empty())
+ {
+ return true;
+ }
+ std::vector<TensorId> common_tensors{};
+ auto existing_tensors = all_tensors();
+ std::sort(existing_tensors.begin(), existing_tensors.end()); // To use std::set_intersection, both input sets must be sorted
+ std::vector<TensorId> sorted_inputs{ inputs };
+ std::sort(sorted_inputs.begin(), sorted_inputs.end());
+ std::set_intersection(existing_tensors.begin(), existing_tensors.end(),
+ sorted_inputs.begin(), sorted_inputs.end(), std::back_inserter(common_tensors));
+ if(common_tensors.size() != 1U)
+ {
+ return false;
+ }
+ const auto linked_tensor = common_tensors[0];
+ const auto tail_ops = get_dst_ops();
+ ARM_COMPUTE_ERROR_ON(tail_ops.size() != 1U); // PRECONDITION
+ const auto tail = tail_ops[0];
+
+ if(!is_in(linked_tensor, dst_tensors(tail)))
+ {
+ return false;
+ }
+ return true;
+ }
+ /** Add an operator, while keeping the graph as a "linear sequence"
+ *
+ * PRECONDITION: The current graph is already linear
+ * INVARIANT: The list can only grow from head to tail
+ * POSTCONDITION: The graph remains linear
+ */
+ void add_operator_as_linear(OperatorId op, const std::vector<TensorId> &inputs, const std::vector<TensorId> &outputs)
+ {
+ ARM_COMPUTE_ERROR_ON(!try_add_operator_as_linear(op, inputs, outputs));
+ auto success = add_operator(op, inputs, outputs);
+ ARM_COMPUTE_ERROR_ON(!success);
+ }
+ /** Add a new operator
+ * Returns false if adding it would violate the DAG invariant
+ * A failed insertion leaves the graph unchanged
+ *
+ * @param[in] op Operator to add
+ * @param[in] inputs Input tensors to the operator
+ * @param[in] outputs Output tensors to the operator
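+ *
+ * Cycle-rejection sketch (assuming the graph already holds op 0: t10 -> t11):
+ * @code
+ * graph.add_operator(1, { 11 }, { 12 }); // true:  extends the DAG
+ * graph.add_operator(2, { 12 }, { 10 }); // false: would close the cycle t10 -> op 0 -> t11 -> op 1 -> t12 -> op 2 -> t10; the graph is left unchanged
+ * @endcode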
+ */
+ bool add_operator(OperatorId op, const std::vector<TensorId> &inputs, const std::vector<TensorId> &outputs)
+ {
+ if(operator_exists(op))
+ {
+ return false;
+ }
+ _adj_src_tensors[op] = {};
+ _adj_dst_tensors[op] = {};
+ for(auto in_tensor : inputs)
+ {
+ // Linking input tensor to operator node will never create a cycle / loop because we guarantee
+ // each op is newly created, so every <input, op> pair / edge is new
+ link_input(op, in_tensor);
+ }
+ for(auto out_tensor : outputs)
+ {
+ // If there exists a back path from op's output tensor to op already, then linking the two will create a loop / cycle
+ if(path_exists_from_tensor_to_op(out_tensor, op))
+ {
+ remove_operator(op);
+ return false;
+ }
+ else
+ {
+ link_output(op, out_tensor);
+ }
+ }
+
+ return true;
+ }
+
+ /** Sort the graph in a topological order
+ *
+ * @return std::vector<OpPack>
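+ *
+ * Traversal sketch (assuming the graph holds op 0: t10 -> t11 and op 1: t11 -> t12):
+ * @code
+ * for(const auto &pack : graph.topological_sort())
+ * {
+ *     // Visits op 0 before op 1; pack.op, pack.inputs and pack.outputs describe each operator
+ * }
+ * @endcode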
+ */
+ std::vector<OpPack> topological_sort() const
+ {
+ // In-degree (number of source operators of an op)
+ std::map<OperatorId, unsigned int> in_degree{};
+ std::set<OperatorId> visited_ops{};
+ std::deque<OperatorId> zero_in_degree_ops{};
+ std::vector<OpPack> sorted_op_packs{};
+ for(auto op : all_ops())
+ {
+ const auto degree = src_ops(op).size();
+ in_degree[op] = degree;
+ if(degree == 0)
+ {
+ zero_in_degree_ops.push_back(op);
+ visited_ops.insert(op);
+ }
+ }
+
+ while(!zero_in_degree_ops.empty())
+ {
+ const OperatorId op = zero_in_degree_ops.front();
+ zero_in_degree_ops.pop_front();
+ sorted_op_packs.push_back(OpPack{ op, src_tensors(op), dst_tensors(op) });
+
+ for(const auto next_op : dst_ops(op))
+ {
+ if(in_degree[next_op] > 0)
+ {
+ in_degree[next_op]--;
+ }
+ if(in_degree[next_op] == 0 && visited_ops.find(next_op) == visited_ops.end())
+ {
+ zero_in_degree_ops.push_back(next_op);
+ visited_ops.insert(next_op);
+ }
+ }
+ }
+
+ return sorted_op_packs;
+ }
+
+ void find_independent_paths_util(OperatorId op, std::vector<std::vector<OperatorId>> &paths, std::vector<OperatorId> cur_path,
+ const std::map<OperatorId, unsigned int> &in_degree) const
+ {
+ // We have found an unresolved dependency
+ if(in_degree.at(op) > 1)
+ {
+ paths.push_back(cur_path);
+ return;
+ }
+ const auto child_ops = dst_ops(op);
+
+ cur_path.push_back(op);
+ // Hit the leaf op
+ if(child_ops.empty())
+ {
+ paths.push_back(cur_path);
+ return;
+ }
+ for(const auto child_op : child_ops)
+ {
+ find_independent_paths_util(child_op, paths, cur_path, in_degree);
+ }
+ }
+ /** Find all independent linear paths starting from @p op, stopping at any operator that still has unresolved dependencies
+ *
+ * @return std::vector<std::vector<OperatorId>> All such paths
+ */
+ std::vector<std::vector<OperatorId>> find_independent_paths(OperatorId op,
+ const std::map<OperatorId, unsigned int> &in_degree) const
+ {
+ std::vector<std::vector<OperatorId>> paths;
+ std::vector<OperatorId> cur_path;
+ find_independent_paths_util(op, paths, cur_path, in_degree);
+ return paths;
+ }
+ /** Find the longest of the independent linear paths starting from @p op
+ *
+ * @return std::vector<OperatorId> The longest such path
+ */
+ std::vector<OperatorId> find_longest_independent_path(OperatorId op,
+ const std::map<OperatorId, unsigned int> &in_degree) const
+ {
+ const auto &paths = find_independent_paths(op, in_degree);
+ ARM_COMPUTE_ERROR_ON(paths.empty());
+ size_t max_len = 0;
+ const std::vector<OperatorId> *max_path = nullptr;
+
+ for(const auto &path : paths)
+ {
+ if(path.size() >= max_len)
+ {
+ max_path = &path;
+ max_len = path.size();
+ }
+ }
+ return *max_path;
+ }
+ std::vector<OperatorId> propose_next_path(std::set<OperatorId> &candidate_ops,
+ const std::map<OperatorId, unsigned int> &in_degree) const
+ {
+ if(candidate_ops.empty())
+ {
+ return {};
+ }
+ size_t max_len = 0;
+ std::vector<OperatorId> max_path;
+ OperatorId chosen_op{};
+ for(auto op : candidate_ops)
+ {
+ const auto path = find_longest_independent_path(op, in_degree);
+ if(path.size() >= max_len)
+ {
+ chosen_op = op;
+ max_path = path;
+ max_len = path.size();
+ }
+ }
+ candidate_ops.erase(chosen_op);
+ return max_path;
+ }
+ /** Partition the graph into a list of linear sub-graphs, while preserving the topological order and trying to minimize
+ * the number of partitions
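+ *
+ * Partition sketch (ids are arbitrary):
+ * @code
+ * DependencyGraph graph{};
+ * graph.add_operator(0, { 10 }, { 11 });     // op 0
+ * graph.add_operator(1, { 10 }, { 12 });     // op 1: a parallel branch from t10
+ * graph.add_operator(2, { 11, 12 }, { 13 }); // op 2 joins both branches
+ * const auto partitions = graph.topological_partition();
+ * // Produces two linear partitions, e.g. { { op 1 }, { op 0, op 2 } }:
+ * // one branch is emitted on its own and op 2 is fused into the remaining branch
+ * @endcode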
+ */
+ std::vector<std::vector<OpPack>> topological_partition() const
+ {
+ // Initialize the in-degree map and the set of candidate (zero in-degree) ops
+ std::map<OperatorId, unsigned int> in_degree{};
+ std::set<OperatorId> candidate_ops{};
+ for(auto op : all_ops())
+ {
+ const auto degree = src_ops(op).size();
+ in_degree[op] = degree;
+ if(degree == 0)
+ {
+ candidate_ops.insert(op);
+ }
+ }
+
+ std::vector<std::vector<OpPack>> sorted_partitions{};
+ while(!candidate_ops.empty())
+ {
+ // Propose the longest linear path starting from one of the candidate (zero in-degree) ops
+ const auto path = propose_next_path(candidate_ops, in_degree);
+
+ // Append to sorted_partitions
+ std::vector<OpPack> path_op_pack{};
+ for(auto op : path)
+ {
+ path_op_pack.push_back(OpPack{ op, src_tensors(op), dst_tensors(op) });
+ }
+ sorted_partitions.push_back(path_op_pack);
+ // Remove whole path (Update in_degree, visited_ops, candidate_ops)
+ for(auto op : path)
+ {
+ for(const auto next_op_child : dst_ops(op))
+ {
+ if(in_degree[next_op_child] > 0)
+ {
+ in_degree[next_op_child]--;
+ }
+ if(in_degree[next_op_child] == 0 && !is_in(next_op_child, path)) // We do not want to put the proposed path back into candidates
+ {
+ candidate_ops.insert(next_op_child);
+ }
+ }
+ }
+ }
+ return sorted_partitions;
+ }
+
+ /** Strict equality comparison (all internal ids and order of insertion matter).
+ * In the future this may be replaced with a topological comparison, allowing equivalent graphs with different internal ids to be equal
+ *
+ *
+ * @param[in] g0
+ * @param[in] g1
+ * @return true If the same
+ * @return false Otherwise
+ */
+ friend bool operator==(const DependencyGraph &g0, const DependencyGraph &g1)
+ {
+ // Do not compare id allocators
+ return std::make_tuple(
+ g0._adj_src_tensors, g0._adj_dst_tensors, g0._adj_src_ops, g0._adj_dst_ops)
+ == std::make_tuple(
+ g1._adj_src_tensors, g1._adj_dst_tensors, g1._adj_src_ops, g1._adj_dst_ops);
+ }
+ std::vector<OperatorId> src_ops_from_tensor(TensorId tensor) const
+ {
+ return _adj_src_ops.at(tensor);
+ }
+ std::vector<OperatorId> dst_ops_from_tensor(TensorId tensor) const
+ {
+ return _adj_dst_ops.at(tensor);
+ }
+ /** Get all tensors
+ *
+ * @return std::vector<TensorId>
+ */
+ std::vector<TensorId> all_tensors() const
+ {
+ std::vector<TensorId> tensors{};
+ std::transform(std::begin(_adj_src_ops), std::end(_adj_src_ops), std::back_inserter(tensors), [](const auto & it)
+ {
+ return it.first;
+ });
+ return tensors;
+ }
+ /** Get source tensors of the whole graph
+ *
+ * @return std::vector<TensorId>
+ */
+ std::vector<TensorId> global_src_tensors() const
+ {
+ std::vector<TensorId> tensors;
+ for(auto tensor_src_ops : _adj_src_ops)
+ {
+ if(tensor_src_ops.second.empty())
+ {
+ tensors.push_back(tensor_src_ops.first);
+ }
+ }
+ return tensors;
+ }
+ /** Get destination tensors of the whole graph
+ *
+ * @return std::vector<TensorId>
+ */
+ std::vector<TensorId> global_dst_tensors() const
+ {
+ std::vector<TensorId> tensors;
+ for(auto tensor_dst_ops : _adj_dst_ops)
+ {
+ if(tensor_dst_ops.second.empty())
+ {
+ tensors.push_back(tensor_dst_ops.first);
+ }
+ }
+ return tensors;
+ }
+ /** Get all root ops. Root ops can also be referred to as "src ops" of the whole graph
+ *
+ * @return std::vector<OperatorId>
+ */
+ std::vector<OperatorId> get_root_ops() const
+ {
+ std::vector<OperatorId> ops{};
+ const auto op_list = all_ops();
+
+ for(auto op : op_list)
+ {
+ if(src_ops(op).empty())
+ {
+ ops.emplace_back(op);
+ }
+ }
+ return ops;
+ }
+
+private:
+ void link_input(OperatorId op, TensorId in_tensor)
+ {
+ ARM_COMPUTE_ERROR_ON(!operator_exists(op));
+ if(!tensor_exists(in_tensor))
+ {
+ insert_new_tensor(in_tensor);
+ }
+ ARM_COMPUTE_ERROR_ON(are_connected(op, in_tensor)); // Prevent repetitive linking
+ _adj_src_tensors[op].push_back(in_tensor);
+ _adj_dst_ops[in_tensor].push_back(op);
+ }
+ void link_output(OperatorId op, TensorId out_tensor)
+ {
+ ARM_COMPUTE_ERROR_ON(!operator_exists(op));
+ if(!tensor_exists(out_tensor))
+ {
+ insert_new_tensor(out_tensor);
+ }
+ ARM_COMPUTE_ERROR_ON(are_connected(op, out_tensor)); // Prevent repetitive linking
+ _adj_dst_tensors[op].push_back(out_tensor);
+ _adj_src_ops[out_tensor].push_back(op);
+ }
+
+ std::vector<OperatorId> src_ops(OperatorId op) const
+ {
+ ARM_COMPUTE_ERROR_ON(!operator_exists(op));
+ std::vector<OperatorId> ops{};
+ for(TensorId src_tensor : src_tensors(op))
+ {
+ ops.insert(ops.end(), std::begin(_adj_src_ops.at(src_tensor)), std::end(_adj_src_ops.at(src_tensor)));
+ }
+ return ops;
+ }
+ std::vector<OperatorId> dst_ops(OperatorId op) const
+ {
+ ARM_COMPUTE_ERROR_ON(!operator_exists(op));
+ std::vector<OperatorId> ops{};
+ for(TensorId dst_tensor : _adj_dst_tensors.at(op))
+ {
+ ops.insert(ops.end(), std::begin(_adj_dst_ops.at(dst_tensor)), std::end(_adj_dst_ops.at(dst_tensor)));
+ }
+ return ops;
+ }
+
+ /** Get source tensors of an operator
+ *
+ * @param[in] op Operator id
+ * @return std::vector<TensorId>
+ */
+ std::vector<TensorId> src_tensors(OperatorId op) const
+ {
+ ARM_COMPUTE_ERROR_ON(!operator_exists(op));
+ return _adj_src_tensors.at(op);
+ }
+ /** Get destination tensors of an operator
+ *
+ * @param[in] op Operator id
+ * @return std::vector<TensorId>
+ */
+ std::vector<TensorId> dst_tensors(OperatorId op) const
+ {
+ ARM_COMPUTE_ERROR_ON(!operator_exists(op));
+ return _adj_dst_tensors.at(op);
+ }
+ /** Get all operators
+ *
+ * @return std::vector<OperatorId>
+ */
+ std::vector<OperatorId> all_ops() const
+ {
+ std::vector<OperatorId> ops{};
+ std::transform(std::begin(_adj_src_tensors), std::end(_adj_src_tensors), std::back_inserter(ops), [](const auto & it)
+ {
+ return it.first;
+ });
+ return ops;
+ }
+ /** Remove an operator from the graph.
+ *
+ * @param[in] op Operator to remove
+ */
+ void remove_operator(OperatorId op)
+ {
+ for(auto src_tensor : _adj_src_tensors.at(op))
+ {
+ auto &dst_ops = _adj_dst_ops.at(src_tensor);
+ dst_ops.erase(
+ std::remove(std::begin(dst_ops), std::end(dst_ops), op),
+ std::end(dst_ops));
+ }
+ for(auto dst_tensor : _adj_dst_tensors.at(op))
+ {
+ auto &src_ops = _adj_src_ops.at(dst_tensor);
+ src_ops.erase(
+ std::remove(std::begin(src_ops), std::end(src_ops), op),
+ std::end(src_ops));
+ }
+ // Remove any isolated tensors
+ // An isolated tensor is one where both its _adj_src_ops and _adj_dst_ops are empty
+ for(auto t : all_tensors())
+ {
+ if(_adj_src_ops.at(t).empty() && _adj_dst_ops.at(t).empty())
+ {
+ _adj_src_ops.erase(t);
+ _adj_dst_ops.erase(t);
+ }
+ }
+ _adj_src_tensors.erase(op);
+ _adj_dst_tensors.erase(op);
+ }
+ void insert_new_tensor(TensorId tensor)
+ {
+ _adj_src_ops[tensor] = {};
+ _adj_dst_ops[tensor] = {};
+ }
+ bool tensor_exists(TensorId tensor) const
+ {
+ return _adj_src_ops.find(tensor) != _adj_src_ops.end() && _adj_dst_ops.find(tensor) != _adj_dst_ops.end();
+ }
+ bool operator_exists(OperatorId op) const
+ {
+ return _adj_src_tensors.find(op) != _adj_src_tensors.end() && _adj_dst_tensors.find(op) != _adj_dst_tensors.end();
+ }
+ bool is_src_tensor_of(OperatorId op, TensorId tensor) const
+ {
+ if(!operator_exists(op) || !tensor_exists(tensor))
+ {
+ return false;
+ }
+ const auto op_inputs = src_tensors(op);
+ return std::find(op_inputs.begin(), op_inputs.end(), tensor) != op_inputs.end();
+ }
+ bool is_dst_tensor_of(OperatorId op, TensorId tensor) const
+ {
+ if(!operator_exists(op) || !tensor_exists(tensor))
+ {
+ return false;
+ }
+ const auto op_outputs = dst_tensors(op);
+ return std::find(op_outputs.begin(), op_outputs.end(), tensor) != op_outputs.end();
+ }
+ bool are_connected(OperatorId op, TensorId tensor) const
+ {
+ return is_src_tensor_of(op, tensor) || is_dst_tensor_of(op, tensor);
+ }
+ /** Check if @p op is a destination (leaf) operator of the whole graph
+ *
+ * @param[in] op Operator to check
+ * @return true If @p op is a leaf operator
+ * @return false Otherwise
+ */
+ bool is_dst_op(OperatorId op) const
+ {
+ return dst_ops(op).empty();
+ }
+ std::vector<OperatorId> get_dst_ops() const
+ {
+ std::vector<OperatorId> ops{};
+ const auto op_list = all_ops();
+
+ for(auto op : op_list)
+ {
+ if(is_dst_op(op))
+ {
+ ops.emplace_back(op);
+ }
+ }
+ return ops;
+ }
+ bool path_exists_from_tensor_to_op(TensorId src_tensor, OperatorId dst_op) const
+ {
+ if(!tensor_exists(src_tensor) || !operator_exists(dst_op))
+ {
+ return false;
+ }
+ for(auto child_op : dst_ops_from_tensor(src_tensor))
+ {
+ if(path_exists_from_op_to_op(child_op, dst_op))
+ {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ bool path_exists_from_op_to_op(OperatorId src_op, OperatorId dst_op) const
+ {
+ if(!operator_exists(src_op) || !operator_exists(dst_op))
+ {
+ return false;
+ }
+ if(src_op == dst_op)
+ {
+ return true;
+ }
+ if(is_in(src_op, get_dst_ops()))
+ {
+ return false;
+ }
+ for(auto child_tensor : dst_tensors(src_op))
+ {
+ if(path_exists_from_tensor_to_op(child_tensor, dst_op))
+ {
+ return true;
+ }
+ }
+ return false;
+ }
+
+private:
+ AdjList _adj_src_tensors{};
+ AdjList _adj_dst_tensors{};
+ AdjList _adj_src_ops{};
+ AdjList _adj_dst_ops{};
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_UTILS_DEPENDENCYGRAPH */
diff --git a/tests/SConscript b/tests/SConscript
index b848f27043..95ecd27afa 100644
--- a/tests/SConscript
+++ b/tests/SConscript
@@ -121,7 +121,7 @@ files_validation += Glob('validation/CPP/' + filter_pattern)
if env['opencl']:
if env['experimental_dynamic_fusion']:
test_env.Append(CPPDEFINES = ['ENABLE_EXPERIMENTAL_DYNAMIC_FUSION'])
- files_validation += Glob('validation/CL/UNIT/dynamic_fusion/' + filter_pattern)
+ files_validation += Glob('validation/dynamic_fusion/gpu/' + filter_pattern)
filter_pattern = test_env['test_filter']
@@ -291,19 +291,6 @@ if test_env['benchmark_examples']:
prog = test_env.Program(example, [ test_env.Object(source=file, target=example), graph_utils, graph_params]+ files_benchmark_examples, LIBS = test_env["LIBS"] + ["arm_compute_graph"], LINKFLAGS=test_env["LINKFLAGS"]+['-Wl,--allow-shlib-undefined'])
arm_compute_benchmark_examples += [ prog ]
- # Dynamic fusion examples
- if env['opencl']:
- if env['experimental_dynamic_fusion']:
- for file in Glob("%s/dynamic_fusion/*.cpp" % examples_folder):
- example = "benchmark_" + os.path.basename(os.path.splitext(str(file))[0])
- if env['os'] in ['android', 'macos', 'bare_metal'] or env['standalone']:
- prog = test_env.Program(example, [ test_env.Object(source=file, target=example), graph_utils, graph_params]+ files_benchmark_examples, LIBS = test_env["LIBS"], LINKFLAGS=test_env["LINKFLAGS"]+[load_whole_archive, arm_compute_lib, noload_whole_archive] + bm_link_flags + extra_link_flags)
- arm_compute_benchmark_examples += [ prog ]
- else:
- #-Wl,--allow-shlib-undefined: Ignore dependencies of dependencies
- prog = test_env.Program(example, [ test_env.Object(source=file, target=example), graph_utils, graph_params]+ files_benchmark_examples, LIBS = test_env["LIBS"] + ["arm_compute_graph"], LINKFLAGS=test_env["LINKFLAGS"]+['-Wl,--allow-shlib-undefined'])
- arm_compute_benchmark_examples += [ prog ]
-
arm_compute_benchmark_examples = install_bin(arm_compute_benchmark_examples)
Depends(arm_compute_benchmark_examples, arm_compute_test_framework)
Depends(arm_compute_benchmark_examples, arm_compute_lib)
diff --git a/tests/validation/dynamic_fusion/gpu/Integration.cpp b/tests/validation/dynamic_fusion/gpu/Integration.cpp
new file mode 100644
index 0000000000..87720a629d
--- /dev/null
+++ b/tests/validation/dynamic_fusion/gpu/Integration.cpp
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h"
+#include "arm_compute/dynamic_fusion/sketch/OperatorAttributes.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/gpu/cl/operators/ClAdd.h"
+#include "src/gpu/cl/operators/ClConv2d.h"
+
+#include "tests/CL/CLAccessor.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/validation/CL/UNIT/dynamic_fusion/Utils.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/reference/ConvolutionLayer.h"
+#include "tests/validation/reference/ElementwiseOperations.h"
+#include "tests/validation/reference/Permute.h"
+
+#ifdef ARM_COMPUTE_ASSERTS_ENABLED
+#include "tests/SimpleTensorPrinter.h"
+#endif /* ARM_COMPUTE_ASSERTS_ENABLED */
+
+using namespace arm_compute::experimental::dynamic_fusion;
+using namespace arm_compute::test::validation::utils;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+TEST_SUITE(CL)
+TEST_SUITE(INTEGRATION)
+TEST_SUITE(DYNAMIC_FUSION)
+TEST_CASE(Conv2d, framework::DatasetMode::ALL)
+{
+ /* Computation:
+ * out = conv2d1x1(direct_conv)(input, weights, bias)
+ */
+ CLScheduler::get().default_reinit();
+
+ const auto data_type = DataType::F32;
+ const auto data_layout = DataLayout::NHWC;
+ const auto t_input_shape = TensorShape(384, 12, 12);
+ const auto t_weight_shape = TensorShape(384, 1, 1, 16);
+ const auto t_dst_shape = TensorShape(16, 12, 12);
+
+ // Create a new workload sketch
+ auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+ auto gpu_ctx = GpuWorkloadContext{ &cl_compile_ctx };
+ GpuWorkloadSketch sketch{ &gpu_ctx };
+
+ // Fuse conv2d
+ Conv2dAttributes conv2d_attr{};
+ auto input_info = sketch.create_tensor_info(t_input_shape, 1, data_type, data_layout);
+ auto weight_info = sketch.create_tensor_info(TensorInfo(t_weight_shape, 1, data_type, data_layout));
+ auto dst_info = sketch.create_tensor_info();
+ GpuConv2d::create_op(sketch, &input_info, &weight_info, nullptr, &dst_info, conv2d_attr);
+
+ // Configure runtime
+ ClWorkloadRuntime runtime;
+ runtime.configure(sketch);
+
+ // (Important) Allocate auxiliary tensor memory if there are any
+ // Instead of using ACL allocated memory, the user can choose to import memory into the tensors
+ for(auto &data : runtime.get_auxiliary_tensors())
+ {
+ CLTensor *tensor = data.first;
+ AuxMemoryInfo aux_mem_req = data.second;
+ tensor->allocator()->init(*data.first->info(), aux_mem_req.alignment);
+ tensor->allocator()->allocate(); // Use ACL allocated memory
+ // auto buf = cl::Buffer();
+ // tensor->allocator()->import_memory(buf); // Or, import external memory
+ }
+
+ // Construct user tensors
+ CLTensor t_input{};
+ CLTensor t_weight{};
+ CLTensor t_dst{};
+
+ // Initialize user tensors
+ t_input.allocator()->init(input_info);
+ t_weight.allocator()->init(weight_info);
+ t_dst.allocator()->init(dst_info);
+
+ // Allocate and fill user tensors
+ // Instead of using ACL allocator, the user can choose to import memory into the tensors
+ t_input.allocator()->allocate();
+ t_weight.allocator()->allocate();
+ t_dst.allocator()->allocate();
+ fill<float>(CLAccessor(t_input), 0, library.get());
+ fill<float>(CLAccessor(t_weight), 1, library.get());
+
+ // Run runtime
+ runtime.run({ &t_input, &t_weight, &t_dst });
+
+ // Create reference
+ SimpleTensor<float> ref_t_input{ t_input_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC };
+ SimpleTensor<float> ref_t_weight{ t_weight_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC };
+ SimpleTensor<float> ref_t_bias_placeholder{ t_dst_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC };
+
+ // Fill reference
+ fill<float>(ref_t_input, 0, library.get());
+ fill<float>(ref_t_weight, 1, library.get());
+
+ auto ref_t_input_nchw = reference::permute(ref_t_input, PermutationVector(1U, 2U, 0U));
+ auto ref_t_weight_nchw = reference::permute(ref_t_weight, PermutationVector(1U, 2U, 0U));
+ auto ref_t_bias_placeholder_nchw = reference::permute(ref_t_bias_placeholder, PermutationVector(1U, 2U, 0U));
+ auto t_dst_shape_nchw = t_dst_shape;
+ permute(t_dst_shape_nchw, PermutationVector(1U, 2U, 0U));
+
+ PadStrideInfo legacy_pad_stride(conv2d_attr.stride().x(), conv2d_attr.stride().y(), conv2d_attr.pad().left, conv2d_attr.pad().right, conv2d_attr.pad().top, conv2d_attr.pad().bottom,
+ DimensionRoundingType{});
+ auto ref_t_dst_nchw = reference::convolution_layer(ref_t_input_nchw, ref_t_weight_nchw, ref_t_bias_placeholder_nchw, t_dst_shape_nchw, legacy_pad_stride, conv2d_attr.dilation());
+ const auto ref_t_dst = reference::permute(ref_t_dst_nchw, PermutationVector(2U, 0U, 1U));
+
+ RelativeTolerance<float> tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */
+ validate(CLAccessor(t_dst), ref_t_dst_nchw, tolerance_f32);
+}
+TEST_SUITE(Invalid_Fusion_Should_Fail)
+TEST_CASE(Multiple_Complex_Ops_0, framework::DatasetMode::ALL)
+{
+ /* Computation:
+ * out = conv2d(conv2d(l0_input, l0_weight), l1_weight)
+ */
+ CLScheduler::get().default_reinit();
+
+ const auto data_type = DataType::F32;
+ const auto data_layout = DataLayout::NHWC;
+ const auto t_input_shape = TensorShape(384, 12, 12);
+ const auto t_weight_shape = TensorShape(384, 1, 1, 16);
+ const auto t_dst_shape = TensorShape(16, 12, 12);
+ auto t_input_info = TensorInfo(t_input_shape, 1, data_type, data_layout);
+ auto t_weight_info = TensorInfo(t_weight_shape, 1, data_type, data_layout);
+ auto t_dst_info = TensorInfo();
+
+ Conv2dAttributes conv2d_attr{};
+
+ // Create a new workload sketch
+ auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+ auto gpu_ctx = GpuWorkloadContext{ &cl_compile_ctx };
+ GpuWorkloadSketch sketch{ &gpu_ctx };
+
+ // Create tensor infos
+ auto input_info = sketch.create_tensor_info(t_input_shape, 1, data_type, data_layout);
+ auto weight_info = sketch.create_tensor_info(TensorInfo(t_weight_shape, 1, data_type, data_layout));
+ auto dst_info = sketch.create_tensor_info();
+
+ // Fuse conv2d into the workload
+ {
+ // Validate operator
+ const auto success = GpuConv2d::validate_op(sketch, &input_info, &weight_info, nullptr, &dst_info, conv2d_attr);
+ ARM_COMPUTE_EXPECT(bool(success), framework::LogLevel::ERRORS);
+
+ GpuConv2d::create_op(sketch, &input_info, &weight_info, nullptr, &dst_info, conv2d_attr);
+ }
+
+ // Create tensor infos
+ auto weight_info_2 = sketch.create_tensor_info(t_weight_info);
+ auto dst_info_2 = sketch.create_tensor_info();
+
+ // Fuse conv2d into the workload
+ {
+ // Validate operator, should fail
+ const auto success = GpuConv2d::validate_op(sketch, &dst_info, &weight_info_2, nullptr, &dst_info_2, conv2d_attr);
+ ARM_COMPUTE_EXPECT(!bool(success), framework::LogLevel::ERRORS);
+ }
+}
+TEST_SUITE_END() // Invalid_Fusion_Should_Fail
+TEST_SUITE_END() // DYNAMIC_FUSION
+TEST_SUITE_END() // INTEGRATION
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute