-rw-r--r--Android.bp18
-rw-r--r--arm_compute/core/CL/CLCompileContext.h6
-rw-r--r--arm_compute/core/ITensorInfo.h22
-rw-r--r--arm_compute/core/SubTensorInfo.h13
-rw-r--r--arm_compute/core/TensorInfo.h15
-rw-r--r--arm_compute/core/Types.h16
-rw-r--r--arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h80
-rw-r--r--arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h76
-rw-r--r--arm_compute/dynamic_fusion/sketch/OperatorAttributes.h65
-rw-r--r--arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h87
-rw-r--r--arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h100
-rw-r--r--arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h84
-rw-r--r--examples/SConscript9
-rw-r--r--filelist.json31
-rw-r--r--src/core/CL/CLCompileContext.cpp6
-rw-r--r--src/core/CL/ICLKernel.h1
-rw-r--r--src/core/TensorInfo.cpp24
-rw-r--r--src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp200
-rw-r--r--src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h76
-rw-r--r--src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp351
-rw-r--r--src/dynamic_fusion/sketch/ArgumentPack.h242
-rw-r--r--src/dynamic_fusion/sketch/OperatorAttributes.cpp63
-rw-r--r--src/dynamic_fusion/sketch/gpu/GpuComponentServices.h54
-rw-r--r--src/dynamic_fusion/sketch/gpu/GpuKernelArgument.cpp37
-rw-r--r--src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h128
-rw-r--r--src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp125
-rw-r--r--src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h104
-rw-r--r--src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp291
-rw-r--r--src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h143
-rw-r--r--src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp69
-rw-r--r--src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h88
-rw-r--r--src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h126
-rw-r--r--src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp87
-rw-r--r--src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.h77
-rw-r--r--src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp172
-rw-r--r--src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h111
-rw-r--r--src/dynamic_fusion/sketch/gpu/GpuOperatorProperties.h54
-rw-r--r--src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp55
-rw-r--r--src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp76
-rw-r--r--src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h111
-rw-r--r--src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h252
-rw-r--r--src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h66
-rw-r--r--src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h64
-rw-r--r--src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h119
-rw-r--r--src/dynamic_fusion/sketch/gpu/components/Types.h52
-rw-r--r--src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp152
-rw-r--r--src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h147
-rw-r--r--src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp57
-rw-r--r--src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h102
-rw-r--r--src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp255
-rw-r--r--src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp109
-rw-r--r--src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h135
-rw-r--r--src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h137
-rw-r--r--src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp400
-rw-r--r--src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h113
-rw-r--r--src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp113
-rw-r--r--src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h85
-rw-r--r--src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp297
-rw-r--r--src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.h92
-rw-r--r--src/dynamic_fusion/sketch/utils/DependencyGraph.h658
-rw-r--r--tests/SConscript15
-rw-r--r--tests/validation/dynamic_fusion/gpu/Integration.cpp201
62 files changed, 7068 insertions, 46 deletions
diff --git a/Android.bp b/Android.bp
index 8c6d700062..d02d436fa0 100644
--- a/Android.bp
+++ b/Android.bp
@@ -592,6 +592,24 @@ cc_library_static {
"src/cpu/operators/CpuTranspose.cpp",
"src/cpu/operators/CpuWinogradConv2d.cpp",
"src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp",
+ "src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp",
+ "src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp",
+ "src/dynamic_fusion/sketch/OperatorAttributes.cpp",
+ "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.cpp",
+ "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp",
+ "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp",
+ "src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp",
+ "src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp",
+ "src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp",
+ "src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp",
+ "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp",
+ "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp",
+ "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp",
+ "src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp",
+ "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp",
+ "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp",
+ "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp",
+ "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp",
"src/gpu/cl/ClContext.cpp",
"src/gpu/cl/ClKernelLibrary.cpp",
"src/gpu/cl/ClQueue.cpp",
diff --git a/arm_compute/core/CL/CLCompileContext.h b/arm_compute/core/CL/CLCompileContext.h
index e8f2ff35da..60e0f95f83 100644
--- a/arm_compute/core/CL/CLCompileContext.h
+++ b/arm_compute/core/CL/CLCompileContext.h
@@ -310,6 +310,12 @@ public:
*/
int32_t get_ddk_version() const;
+ /** Return the Gpu target of the associated device
+ *
+ * @return GPUTarget
+ */
+ GPUTarget get_gpu_target() const;
+
private:
/** Load program and its dependencies.
*
diff --git a/arm_compute/core/ITensorInfo.h b/arm_compute/core/ITensorInfo.h
index 6839d697e3..ca2837e450 100644
--- a/arm_compute/core/ITensorInfo.h
+++ b/arm_compute/core/ITensorInfo.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -41,6 +41,11 @@ class ITensorInfo : public misc::ICloneable<ITensorInfo>
{
public:
using TensorDimsState = std::vector<int>;
+ /** An id that uniquely identifies an ITensorInfo within some domain (e.g. a workload)
+ */
+ using Id = int32_t;
+ /** An invalid tensor id within a domain */
+ static constexpr Id invalid_tensor_id = -1;
/** Get the value representing dynamic dimension state
*
* @return Value representing dynamic dimension state
@@ -280,7 +285,20 @@ public:
* @return A DataLayout containing the layout data information.
*/
virtual DataLayout data_layout() const = 0;
-
+ /** Get the workload tensor id of the tensor.
+ *
+ * @return Workload tensor id of the tensor
+ */
+ virtual Id id() const = 0;
+ /** Set the tensor id
+ */
+ virtual ITensorInfo &set_id(ITensorInfo::Id id) = 0;
+ /** Check if the tensor id is valid
+ */
+ bool has_valid_id() const
+ {
+ return id() != invalid_tensor_id;
+ }
/** If infos are broadcast compatible tensor info's, return the broadcasted shape and the intersection of
* the broadcasted valid regions of the tensors.
*
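For a concrete feel of the new Id field, a minimal sketch using the TensorInfo overrides added further down in this patch (the id value 7 is made up; ids are normally handed out by a workload sketch):

#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"

using namespace arm_compute;

void id_example()
{
    TensorInfo info(TensorShape(8U, 8U), 1, DataType::F32);
    const bool before = info.has_valid_id(); // false: a fresh TensorInfo holds invalid_tensor_id
    info.set_id(7);                          // hypothetical id assigned by a workload
    const bool after = info.has_valid_id();  // true
    ARM_COMPUTE_UNUSED(before, after);
}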
diff --git a/arm_compute/core/SubTensorInfo.h b/arm_compute/core/SubTensorInfo.h
index 54836d0528..374ea5b8c6 100644
--- a/arm_compute/core/SubTensorInfo.h
+++ b/arm_compute/core/SubTensorInfo.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -237,6 +237,17 @@ public:
ARM_COMPUTE_ERROR_ON(_parent == nullptr);
return _parent->data_layout();
}
+ ITensorInfo::Id id() const override
+ {
+ ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+ return _parent->id();
+ }
+ ITensorInfo &set_id(ITensorInfo::Id id) override
+ {
+ ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+ _parent->set_id(id);
+ return *this;
+ }
private:
ITensorInfo *_parent;
diff --git a/arm_compute/core/TensorInfo.h b/arm_compute/core/TensorInfo.h
index 40f9ed9806..7eb8c52d07 100644
--- a/arm_compute/core/TensorInfo.h
+++ b/arm_compute/core/TensorInfo.h
@@ -50,7 +50,7 @@ public:
/** Allow instances of this class to be copy constructed */
TensorInfo(const ITensorInfo &info);
/** Allow instances of this class to be copy constructed */
- TensorInfo(const TensorInfo &) = default;
+ TensorInfo(const TensorInfo &);
/** Allow instances of this class to be copied */
TensorInfo &operator=(const TensorInfo &) = default;
/** Allow instances of this class to be move constructed */
@@ -297,6 +297,15 @@ public:
_are_values_constant = are_values_constant;
return *this;
}
+ ITensorInfo::Id id() const override
+ {
+ return _id;
+ }
+ ITensorInfo &set_id(ITensorInfo::Id id) override
+ {
+ _id = id;
+ return *this;
+ }
inline friend bool operator==(const TensorInfo &lhs, const TensorInfo &rhs);
private:
@@ -320,6 +329,7 @@ private:
QuantizationInfo _quantization_info;
DataLayout _data_layout;
bool _are_values_constant;
+ ITensorInfo::Id _id;
};
/** Check whether two tensor info are equal.
@@ -334,7 +344,8 @@ inline bool operator==(const TensorInfo &lhs, const TensorInfo &rhs)
return (lhs._total_size == rhs._total_size) && (lhs._offset_first_element_in_bytes == rhs._offset_first_element_in_bytes) && (lhs._strides_in_bytes == rhs._strides_in_bytes)
&& (lhs._num_channels == rhs._num_channels) && (lhs._tensor_shape == rhs._tensor_shape) && (lhs._dims_state == rhs._dims_state) && (lhs._data_type == rhs._data_type) && (lhs._format == rhs._format)
&& (lhs._is_resizable == rhs._is_resizable) && (lhs._valid_region == rhs._valid_region) && (lhs._padding == rhs._padding) && (lhs._quantization_info == rhs._quantization_info)
- && (lhs._data_layout == rhs._data_layout) && (lhs._are_values_constant == rhs._are_values_constant);
+ && (lhs._data_layout == rhs._data_layout) && (lhs._are_values_constant == rhs._are_values_constant)
+ && (lhs._id == rhs._id);
}
} // namespace arm_compute
#endif /*ARM_COMPUTE_TENSORINFO_H */
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
index fc6d46c53b..b0a6475527 100644
--- a/arm_compute/core/Types.h
+++ b/arm_compute/core/Types.h
@@ -782,6 +782,20 @@ private:
DimensionRoundingType _round_type;
};
+/** Padding information for 2D operations like Conv2d */
+struct Padding2D
+{
+ Padding2D() = default;
+ Padding2D(size_t left, size_t right, size_t top, size_t bottom)
+ : left(left), right(right), top(top), bottom(bottom)
+ {
+ }
+ size_t left = { 0 }; /**< Padding across the width dimension on the left, in elements. */
+ size_t right = { 0 }; /**< Padding across the width dimension on the right, in elements. */
+ size_t top = { 0 }; /**< Padding across the height dimension on the top, in elements. */
+ size_t bottom = { 0 }; /**< Padding across the height dimension on the bottom, in elements. */
+};
+
/** Padding information for 3D operations like Conv3d */
struct Padding3D
{
@@ -1642,7 +1656,7 @@ public:
LINEAR, /**< Linear ( \f$ f(x)= ax + b \f$ ) */
IDENTITY, /**< Identity ( \f$ f(x)= x \f$ ) */
HARD_SWISH, /**< Hard-swish ( \f$ f(x) = (x \text{ReLU6}(x+3))/6 = x \min(\max(0,x+3),6)/6 \f$ ) */
- SWISH, /**< Swish ( \f$ f(x) = \frac{x}{1 + e^{-ax}} = x \text{logistic}(ax) \f$ ) */
+ SWISH, /**< Swish ( \f$ f(x) = \frac{x}{1 + e^{-ax}} = x \text{logistic}(ax) \f$ ) */
GELU /**< GELU ( \f$ f(x) = x * 1/2 * 1 + erf(x / \sqrt{2}) \f$ ) */
};
diff --git a/arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h b/arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h
new file mode 100644
index 0000000000..326880f721
--- /dev/null
+++ b/arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_DYNAMIC_FUSION_RUNTIME_GPU_CL_CLWORKLOADRUNTIME
+#define ARM_COMPUTE_DYNAMIC_FUSION_RUNTIME_GPU_CL_CLWORKLOADRUNTIME
+
+#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h"
+
+#include <map>
+#include <memory>
+
+namespace arm_compute
+{
+/** Forward declaration */
+class CLTensor;
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+class GpuWorkloadSketch;
+
+/** OpenCL runtime to run a workload
+ */
+class ClWorkloadRuntime
+{
+public:
+ ClWorkloadRuntime();
+ ~ClWorkloadRuntime();
+ /** Configure @ref ClWorkloadRuntime
+ * @note A runtime cannot be re-configured
+ *
+ * @param[in] sketch @ref GpuWorkloadSketch with which to configure
+ */
+ Status configure(const GpuWorkloadSketch &sketch);
+ /** Run the workload
+ * @note If the runtime is not configured, this method does not perform any action
+ *
+ * @param[in,out] tensors Tensors required to run the workload
+ *
+ * @return Status Whether the run was successful
+ */
+ Status run(const std::vector<CLTensor *> &tensors);
+ /** Get the auxiliary tensors of the workload and their memory requirements
+ */
+ std::vector<std::pair<CLTensor *, AuxMemoryInfo>> get_auxiliary_tensors();
+
+private:
+ /** Enqueue the prepare-stage workloads
+ * @note If the runtime is not configured, this method will not perform any action
+ */
+ void prepare();
+ struct Implementation;
+ std::unique_ptr<Implementation> _impl;
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_DYNAMIC_FUSION_RUNTIME_GPU_CL_CLWORKLOADRUNTIME */
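As a rough usage sketch of the runtime declared above (hypothetical tensor names, and assuming the auxiliary CLTensors come back with their TensorInfo already populated so a plain allocate() is enough):

#include "arm_compute/core/Error.h"
#include "arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h"
#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;
using namespace arm_compute::experimental::dynamic_fusion;

void run_workload(const GpuWorkloadSketch &sketch, CLTensor &t_src, CLTensor &t_wei, CLTensor &t_dst)
{
    ClWorkloadRuntime runtime;
    ARM_COMPUTE_ERROR_THROW_ON(runtime.configure(sketch));

    // Allocate whatever scratch memory the workload requested
    for(auto &aux : runtime.get_auxiliary_tensors())
    {
        aux.first->allocator()->allocate(); // aux.second carries the size/alignment requirement
    }

    runtime.run({ &t_src, &t_wei, &t_dst });
}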
diff --git a/arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h b/arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h
new file mode 100644
index 0000000000..deedf62262
--- /dev/null
+++ b/arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_MEMORYDESCRIPTOR
+#define ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_MEMORYDESCRIPTOR
+
+#include "arm_compute/core/ITensorInfo.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Type of memory used by a workload tensor */
+enum class MemoryType
+{
+ User = 0, /**< Memory coming directly from users, e.g. for argument tensors */
+ Auxiliary = 1, /**< Additional memory required by the workload tensor, e.g. for temporary tensors */
+};
+
+/** Memory information for tensors with @ref MemoryType::Auxiliary.
+ * This informs how much additional memory is required for auxiliary tensors
+ */
+struct AuxMemoryInfo
+{
+ AuxMemoryInfo() = default;
+
+ AuxMemoryInfo(size_t size, size_t alignment = 0) noexcept
+ : size(size),
+ alignment(alignment)
+ {
+ }
+
+ friend bool operator==(const AuxMemoryInfo &info0, const AuxMemoryInfo &info1)
+ {
+ return info0.size == info1.size && info0.alignment == info1.alignment;
+ }
+ size_t size{ 0 }; /**< Total memory size in bytes */
+ size_t alignment{ 0 }; /**< Memory alignment in bytes */
+};
+
+/** Descriptor of a workload tensor memory */
+struct MemoryDescriptor
+{
+ MemoryType memory_type{}; /**< Memory type */
+ AuxMemoryInfo aux_memory_info{}; /**< Auxiliary tensor memory information */
+};
+
+/** A map from @ref ITensorInfo to their corresponding @ref MemoryDescriptor */
+using MemoryDescriptorMap = std::map<ITensorInfo::Id, MemoryDescriptor>;
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_MEMORYDESCRIPTOR */
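A small sketch (tensor ids 0 and 1 are made up; the header above is assumed included and the dynamic_fusion namespace in scope) of how a MemoryDescriptorMap pairs tensor ids with their memory descriptors:

MemoryDescriptorMap mem_map{};
mem_map[0].memory_type     = MemoryType::User;      // argument tensor: memory supplied by the caller
mem_map[1].memory_type     = MemoryType::Auxiliary; // scratch tensor: memory owned by the runtime
mem_map[1].aux_memory_info = AuxMemoryInfo(1024);   // 1024 bytes, default alignment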
diff --git a/arm_compute/dynamic_fusion/sketch/OperatorAttributes.h b/arm_compute/dynamic_fusion/sketch/OperatorAttributes.h
new file mode 100644
index 0000000000..22c6772926
--- /dev/null
+++ b/arm_compute/dynamic_fusion/sketch/OperatorAttributes.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_OPERATORATTRIBUTES
+#define ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_OPERATORATTRIBUTES
+
+#include "arm_compute/core/Size2D.h"
+#include "arm_compute/core/Types.h"
+#include <cstdint>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Attributes are backend-agnostic parameters (in addition to the input/output tensors) of an operator.
+ */
+
+/** Conv2d attributes */
+class Conv2dAttributes
+{
+public:
+ /** Set padding */
+ Conv2dAttributes &pad(const Padding2D &pad);
+ /** Get padding */
+ Padding2D pad() const;
+ /** Set stride */
+ Conv2dAttributes &stride(const Size2D &stride);
+ /** Get stride */
+ Size2D stride() const;
+ /** Set dilation */
+ Conv2dAttributes &dilation(const Size2D &dilation);
+ /** Get dilation */
+ Size2D dilation() const;
+
+private:
+ Padding2D _pad{}; /**< Padding */
+ Size2D _stride{ 1U, 1U }; /**< Stride */
+ Size2D _dilation{ 1U, 1U }; /**< Dilation */
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_OPERATORATTRIBUTES */
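Each setter returns the attributes object, so configuration can be chained; a short fragment with made-up values (header above included, dynamic_fusion namespace in scope):

Conv2dAttributes attr = Conv2dAttributes()
                        .pad(Padding2D(1, 1, 1, 1)) // left, right, top, bottom
                        .stride(Size2D(2, 2))
                        .dilation(Size2D(1, 1));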
diff --git a/arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h b/arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h
new file mode 100644
index 0000000000..1ee3c7e3ec
--- /dev/null
+++ b/arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADCONTEXT
+#define ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADCONTEXT
+
+#include "arm_compute/core/GPUTarget.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+/** Forward declaration */
+class CLCompileContext;
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Gpu Information such as the Gpu target (for example, G76) */
+using GpuTarget = ::arm_compute::GPUTarget;
+
+/** Gpu Language */
+enum class GpuLanguage
+{
+ OpenCL,
+ Unknown
+};
+/** Provide context necessary for the creation and configuration of a workload
+ * e.g. gpu targets and capabilities, cl::Device for querying OpenCL extensions. Both can affect how a kernel is generated
+ *
+ * This context is shared between different operators within a sketch, and has to stay valid for the entire workload creation session.
+ * This context may also be shared between different sketches.
+ *
+ * This class only contains information for workload creation, but not for runtime (e.g. cl::Queue for enqueueing the kernels)
+ */
+class GpuWorkloadContext
+{
+public:
+ /** Constructor */
+ GpuWorkloadContext(CLCompileContext *cl_compile_context);
+ /** Allow instances of this class to be copy constructed */
+ GpuWorkloadContext(const GpuWorkloadContext &config) = default;
+ /** Allow instances of this class to be copied */
+ GpuWorkloadContext &operator=(const GpuWorkloadContext &config) = default;
+ /** Allow instances of this class to be move constructed */
+ GpuWorkloadContext(GpuWorkloadContext &&config) = default;
+ /** Allow instances of this class to be moved */
+ GpuWorkloadContext &operator=(GpuWorkloadContext &&config) = default;
+ /** Get @ref GpuLanguage of the context */
+ GpuLanguage gpu_language() const;
+ /** Get @ref GpuTarget of the context */
+ GpuTarget gpu_target() const;
+ /** Get @ref CLCompileContext
+ * If the gpu language is not OpenCL, then return nullptr
+ */
+ const CLCompileContext *cl_compile_context() const;
+
+private:
+ GpuLanguage _gpu_language{ GpuLanguage::Unknown };
+ CLCompileContext *_cl_compile_ctx{ nullptr };
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif /* ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADCONTEXT */
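Creating the context is then a matter of pointing it at an OpenCL compile context; a sketch reusing the library's global CLCompileContext (the helper function name is made up):

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h"
#include "arm_compute/runtime/CL/CLScheduler.h"

using namespace arm_compute;
using namespace arm_compute::experimental::dynamic_fusion;

GpuWorkloadContext make_cl_workload_context()
{
    CLScheduler::get().default_init(); // set up the default OpenCL context and queue
    CLCompileContext &cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
    return GpuWorkloadContext{ &cl_compile_ctx }; // the compile context must outlive every sketch using this
}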
diff --git a/arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h b/arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h
new file mode 100644
index 0000000000..afbe2b8d0b
--- /dev/null
+++ b/arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSKETCH
+#define ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSKETCH
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** A descriptor of a workload of operators
+ *
+ * A "workload" is a basic unit of computation to schedule and perform. It contains one or more operators that can be "fused" together.
+ * Note that a workload may still contain multiple kernels.
+ */
+class GpuWorkloadSketch
+{
+public:
+ /** Global context used for the creation of a workload */
+ using Context = GpuWorkloadContext;
+ /** Internal opaque implementation */
+ class Implementation;
+
+public:
+ /** Constructor
+ *
+ * @param[in] context Gpu context for the creation of a workload
+ */
+ explicit GpuWorkloadSketch(GpuWorkloadContext *context);
+ /** Destructor */
+ ~GpuWorkloadSketch();
+ /** Get the implementation */
+ Implementation &implementation();
+ /** Get the implementation */
+ const Implementation &implementation() const;
+ /** Get the gpu workload context of this sketch */
+ const GpuWorkloadContext *gpu_context() const;
+ /** Create a @ref TensorInfo associated with the workload sketch.
+ *
+ * @return TensorInfo Newly created tensor info
+ */
+ template <typename... Args>
+ TensorInfo create_tensor_info(Args &&... args)
+ {
+ auto tensor_info = TensorInfo(std::forward<Args>(args)...);
+ tensor_info.set_id(allocate_new_tensor_id());
+ return tensor_info;
+ }
+ /** Create a @ref TensorInfo associated with the workload sketch by copying from an existing tensor info
+ * @note The newly copied tensor will have a different identity within the workload than the one copied from
+ * To copy the identity of @p tensor_info as well, use @ref TensorInfo 's copy constructors instead
+ *
+ * @param[in] tensor_info @ref ITensorInfo to copy from
+ *
+ * @return TensorInfo Newly created tensor info
+ */
+ TensorInfo create_tensor_info(const ITensorInfo &tensor_info);
+ /** Create a default @ref TensorInfo associated with the workload sketch
+ * It is usually used by a destination tensor whose @ref ITensorInfo is to be inferred automatically
+ *
+ * @return TensorInfo Newly created tensor info
+ */
+ TensorInfo create_tensor_info();
+
+private:
+ ITensorInfo::Id allocate_new_tensor_id();
+ std::unique_ptr<Implementation> _impl; /**< Internal opaque implementation*/
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSKETCH */
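Putting the context and sketch together; a fragment with illustrative NHWC shapes (gpu_ctx is the context from the previous sketch):

GpuWorkloadSketch sketch{ &gpu_ctx };

// Tensor infos created through the sketch are tagged with a valid workload id
TensorInfo src_info = sketch.create_tensor_info(TensorShape(32U, 16U, 16U, 1U), 1, DataType::F32, DataLayout::NHWC);
TensorInfo dst_info = sketch.create_tensor_info(); // left empty: inferred by the operator that writes to it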
diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h
new file mode 100644
index 0000000000..fe9108d356
--- /dev/null
+++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUCONV2D
+#define ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUCONV2D
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/dynamic_fusion/sketch/OperatorAttributes.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+class GpuWorkloadSketch;
+
+/** Operator interface. */
+class GpuConv2d final
+{
+public:
+ /** Attributes are a set of backend-agnostic parameters that define what an operator does */
+ using Attributes = Conv2dAttributes;
+ /** Create an operator and fuse it into the workload sketch.
+ * @note If @ref validate_op() fails, the creation also fails and may throw an error.
+ * @note If @ref validate_op() fails, @p sketch remains unchanged and valid.
+ *
+ * Valid data type configurations:
+ * |src |wei |bia |dst |
+ * |:--------------|:--------------|:--------------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ *
+ * Valid data layouts:
+ * - NHWC
+ *
+ * @param[in,out] sketch Workload sketch into which the operator will be fused
+ * @param[in] src Source tensor
+ * @param[in] wei Weight tensor
+ * @param[in] bia (Optional) Bias tensor
+ * @param[out] dst Destination tensor
+ * @param[in] attributes Operator attributes
+ */
+ static void create_op(GpuWorkloadSketch &sketch,
+ ITensorInfo *src,
+ ITensorInfo *wei,
+ ITensorInfo *bia,
+ ITensorInfo *dst,
+ const Attributes &attributes);
+ /** Validate the operator and check if it can be fused into the workload sketch.
+ * Similar to @ref GpuConv2d::create_op()
+ */
+ static Status validate_op(const GpuWorkloadSketch &sketch,
+ const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const ITensorInfo *bia,
+ const ITensorInfo *dst,
+ const Attributes &attributes);
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUCONV2D */
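And a fragment fusing a convolution into the sketch from the previous example (shapes are illustrative; the bias is omitted, which the interface allows):

auto wei_info = sketch.create_tensor_info(TensorShape(32U, 3U, 3U, 8U), 1, DataType::F32, DataLayout::NHWC);
GpuConv2d::Attributes conv_attr{}; // defaults: pad 0, stride 1, dilation 1

const Status status = GpuConv2d::validate_op(sketch, &src_info, &wei_info, nullptr, &dst_info, conv_attr);
if(status.error_code() == ErrorCode::OK)
{
    GpuConv2d::create_op(sketch, &src_info, &wei_info, nullptr, &dst_info, conv_attr); // dst_info's shape is now inferred
}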
diff --git a/examples/SConscript b/examples/SConscript
index 7d9324ca9a..fd6b591891 100644
--- a/examples/SConscript
+++ b/examples/SConscript
@@ -99,15 +99,6 @@ if env['opencl']:
prog = install_bin(prog)
alias = examples_env.Alias(example, prog)
Default(alias)
- if env['experimental_dynamic_fusion']:
- examples_env.Append(CPPDEFINES = ['ENABLE_EXPERIMENTAL_DYNAMIC_FUSION'])
- for file in Glob("./dynamic_fusion/*.cpp"):
- example = os.path.basename(os.path.splitext(str(file))[0])
- prog = examples_env.Program(example, ["./dynamic_fusion/{}.cpp".format(example), utils], LIBS = examples_libs + arm_compute_libs)
- Depends(prog, arm_compute_dependency)
- prog = install_bin(prog)
- alias = examples_env.Alias(example, prog)
- Default(alias)
if env['gemm_tuner'] and env['opencl']:
gemm_tuner_common_options = examples_env.Object("./gemm_tuner/CommonGemmExampleOptions.cpp")
diff --git a/filelist.json b/filelist.json
index 431979ec41..4603932638 100644
--- a/filelist.json
+++ b/filelist.json
@@ -2115,20 +2115,25 @@
},
"experimental": {
"dynamic_fusion": [
- "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp",
- "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp",
- "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseKernelComponent.cpp",
- "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClFloorKernelComponent.cpp",
- "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp",
- "src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.cpp",
+ "src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp",
+ "src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp",
- "src/core/experimental/dynamic_fusion/OperatorGraph.cpp",
- "src/core/experimental/dynamic_fusion/WorkloadImpl/ClWorkload.cpp",
- "src/gpu/cl/operators/experimental/dynamic_fusion/ClCompositeOperator.cpp",
- "src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.cpp",
- "src/core/experimental/dynamic_fusion/WorkloadImpl/DependencyGraph.cpp",
- "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.cpp",
- "src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.cpp"
+ "src/dynamic_fusion/sketch/OperatorAttributes.cpp",
+ "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.cpp",
+ "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp",
+ "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp",
+ "src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp",
+ "src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp",
+ "src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp",
+ "src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp",
+ "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp",
+ "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp",
+ "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp",
+ "src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp",
+ "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp",
+ "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp",
+ "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp",
+ "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp"
]
}
}
diff --git a/src/core/CL/CLCompileContext.cpp b/src/core/CL/CLCompileContext.cpp
index fce8798b48..ea03d59fc2 100644
--- a/src/core/CL/CLCompileContext.cpp
+++ b/src/core/CL/CLCompileContext.cpp
@@ -232,7 +232,7 @@ void CLCompileContext::set_context(cl::Context context)
std::string CLCompileContext::generate_build_options(const StringSet &build_options_set, const std::string &kernel_path) const
{
std::string concat_str;
- bool ext_supported = false;
+ bool ext_supported = false;
std::string ext_buildopts;
#if defined(ARM_COMPUTE_DEBUG_ENABLED)
@@ -399,4 +399,8 @@ int32_t CLCompileContext::get_ddk_version() const
return -1;
}
+GPUTarget CLCompileContext::get_gpu_target() const
+{
+ return _device.target();
+}
} // namespace arm_compute
diff --git a/src/core/CL/ICLKernel.h b/src/core/CL/ICLKernel.h
index d52b105507..224b68af70 100644
--- a/src/core/CL/ICLKernel.h
+++ b/src/core/CL/ICLKernel.h
@@ -44,7 +44,6 @@ namespace experimental
{
namespace dynamic_fusion
{
-struct TensorBinding;
struct ClExecutionDescriptor;
} // namespace dynamic_fusion
} // namespace experimental
diff --git a/src/core/TensorInfo.cpp b/src/core/TensorInfo.cpp
index e441ddb3a2..12f79444c6 100644
--- a/src/core/TensorInfo.cpp
+++ b/src/core/TensorInfo.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -35,7 +35,7 @@ namespace arm_compute
{
TensorInfo::TensorInfo()
: _total_size(0), _offset_first_element_in_bytes(0), _strides_in_bytes(), _num_channels(0), _tensor_shape(), _dims_state(), _data_type(DataType::UNKNOWN), _format(Format::UNKNOWN), _is_resizable{ true },
- _valid_region{ Coordinates(), _tensor_shape }, _padding{ 0 }, _quantization_info(), _data_layout(DataLayout::NCHW), _are_values_constant(true)
+ _valid_region{ Coordinates(), _tensor_shape }, _padding{ 0 }, _quantization_info(), _data_layout(DataLayout::NCHW), _are_values_constant(true), _id(invalid_tensor_id)
{
}
@@ -56,8 +56,28 @@ TensorInfo::TensorInfo(const ITensorInfo &info)
_quantization_info = info.quantization_info();
_data_layout = info.data_layout();
_are_values_constant = info.are_values_constant();
+ _id = invalid_tensor_id; // Tensor Id has to be explicitly set, instead of being copied
}
+TensorInfo::TensorInfo(const TensorInfo &info)
+ : TensorInfo()
+{
+ _total_size = info.total_size();
+ _offset_first_element_in_bytes = info.offset_first_element_in_bytes();
+ _strides_in_bytes = info.strides_in_bytes();
+ _num_channels = info.num_channels();
+ _tensor_shape = info.tensor_shape();
+ _dims_state = info.tensor_dims_state();
+ _data_type = info.data_type();
+ _format = info.format();
+ _is_resizable = info.is_resizable();
+ _valid_region = info.valid_region();
+ _padding = info.padding();
+ _quantization_info = info.quantization_info();
+ _data_layout = info.data_layout();
+ _are_values_constant = info.are_values_constant();
+ _id = invalid_tensor_id; // Tensor Id has to be explicitly set, instead of being copied
+}
TensorInfo::TensorInfo(Format format)
: TensorInfo(TensorShape(), format)
{
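The net effect of the new copy constructor, sketched with a made-up id:

TensorInfo a(TensorShape(8U, 8U), 1, DataType::F32);
a.set_id(42);    // hypothetical workload-assigned id
TensorInfo b(a); // b matches a in shape, type, layout and so on,
                 // but b.id() is ITensorInfo::invalid_tensor_id: the id is deliberately not copied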
diff --git a/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp
new file mode 100644
index 0000000000..93fbdfed63
--- /dev/null
+++ b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ClKernelRuntime.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "src/core/CL/CLUtils.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h"
+#include "src/gpu/cl/ClKernelLibrary.h"
+
+#include "support/Cast.h"
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+using namespace arm_compute::opencl;
+
+void ClKernelRuntime::configure(const ClCompileContext &compile_ctx, const GpuKernelSourceCode &code)
+{
+ // Create kernel from kernel source string
+ opencl::ClKernelLibrary &klib = opencl::ClKernelLibrary::get();
+ _kernel = static_cast<cl::Kernel>(compile_ctx.create_kernel(code.name(),
+ "" /* Program name: Used to as part of a unique string for built kernel cache. Not needed */,
+ code.code(),
+ klib.kernel_path() /* Kernel path: Used in cases of embedded kernels */,
+ code.build_options().options(),
+ false /* Is source binary */));
+
+ // Configure execution window
+ IClKernel::configure_internal(code.window());
+
+ // Set config id for lws tuning
+ _config_id = code.config_id();
+
+ // Set kernel arguments
+ _arguments = code.arguments();
+}
+
+inline void ClKernelRuntime::add_tensor_argument(unsigned int &idx, const GpuKernelArgumentInfo &arg, const ICLTensor *tensor, const Window &arg_slice, std::vector<cl::Image2D> &cl_images)
+{
+ switch(arg.type)
+ {
+ case GpuKernelArgumentInfo::Type::Scalar:
+ {
+ ARM_COMPUTE_ERROR("Unsupported yet");
+ break;
+ }
+
+ case GpuKernelArgumentInfo::Type::Vector:
+ {
+ add_1D_tensor_argument(idx, tensor, arg_slice);
+ break;
+ }
+
+ case GpuKernelArgumentInfo::Type::Image:
+ {
+ add_2D_tensor_argument(idx, tensor, arg_slice);
+ break;
+ }
+ case GpuKernelArgumentInfo::Type::Image_Reinterpret_As_3D:
+ {
+ add_2D_tensor_argument(idx, tensor, arg_slice);
+ const unsigned int total_cross_plane_pad = tensor->info()->padding().top + tensor->info()->padding().bottom;
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(total_cross_plane_pad));
+ break;
+ }
+ case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D:
+ {
+ const TensorShape shape2d(tensor->info()->dimension(0) / 4, tensor->info()->dimension(1) * tensor->info()->dimension(2) * tensor->info()->dimension(3));
+ const size_t image_row_pitch = tensor->info()->strides_in_bytes()[1];
+ cl::Image2D tensor_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), shape2d, tensor->info()->data_type(), image_row_pitch);
+ cl_images.push_back(tensor_image2d);
+ _kernel.setArg(idx++, tensor_image2d);
+ break;
+ }
+
+ case GpuKernelArgumentInfo::Type::Image_3D:
+ {
+ add_2D_tensor_argument(idx, tensor, arg_slice);
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(tensor->info()->strides_in_bytes()[2]));
+ break;
+ }
+ case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D:
+ {
+ const TensorShape shape2d(tensor->info()->dimension(0) / 4, tensor->info()->dimension(1) * tensor->info()->dimension(2) * tensor->info()->dimension(3));
+ const size_t image_row_pitch = tensor->info()->strides_in_bytes()[1];
+ cl::Image2D tensor_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), shape2d, tensor->info()->data_type(), image_row_pitch);
+ cl_images.push_back(tensor_image2d);
+ _kernel.setArg(idx++, tensor_image2d);
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(tensor->info()->strides_in_bytes()[2]));
+ break;
+ }
+
+ case GpuKernelArgumentInfo::Type::Tensor_3D:
+ {
+ add_3D_tensor_argument(idx, tensor, arg_slice);
+ break;
+ }
+
+ case GpuKernelArgumentInfo::Type::Tensor_4D:
+ {
+ add_4D_tensor_argument(idx, tensor, arg_slice);
+ break;
+ }
+ case GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer:
+ {
+ add_4d_tensor_nhwc_argument(idx, tensor);
+ break;
+ }
+ case GpuKernelArgumentInfo::Type::Tensor_4D_t_Image:
+ {
+ const size_t image_w = tensor->info()->dimension(0) / 4;
+ const size_t image_h = tensor->info()->tensor_shape().total_size_upper(1);
+ const size_t image_stride_y = tensor->info()->strides_in_bytes()[1];
+
+ cl::Image2D tensor_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(),
+ TensorShape(image_w, image_h), tensor->info()->data_type(), image_stride_y);
+ cl_images.push_back(tensor_image2d);
+
+ _kernel.setArg(idx++, tensor_image2d);
+ add_4d_tensor_nhwc_argument(idx, tensor);
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Unsupported");
+ }
+ }
+}
+
+void ClKernelRuntime::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_3D();
+ // Don't slice the matrix along the z dimension if it has just 2 dimensions and matrix A has more than 2
+ // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+ Window slice_fixed_z = slice;
+ slice_fixed_z.set(Window::DimX, Window::Dimension(0, 1, 1));
+ slice_fixed_z.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ /// NOTE: Parameters extracted from old kernels. So far they seem to be constant
+ /// but we may need to turn them into a separate configuration passed from GpuWorkloadSourceCode in the future
+ constexpr bool slide_along_dimz = true;
+ constexpr bool skip_sliding_window = false;
+ constexpr bool use_dummy_work_items = false;
+
+ unsigned int idx = 0;
+ do
+ {
+ // Set kernel arguments
+ Window arg_slice = slice;
+ // CLImages created from tensor arguments. Need to be retained until enqueue
+ std::vector<cl::Image2D> cl_images;
+ for(auto id_arg : _arguments)
+ {
+ const auto arg = id_arg.second;
+ auto tensor = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(id_arg.first));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(tensor->info());
+ if(!slide_along_dimz)
+ {
+ // The stride_z for matrix must be zero if we do not slice
+ ARM_COMPUTE_ERROR_ON(tensor->info()->strides_in_bytes()[3] != 0);
+ arg_slice = slice_fixed_z;
+ }
+ add_tensor_argument(idx, *arg.kernel_argument_info(), tensor, arg_slice, cl_images);
+ }
+
+ // Dispatch kernel
+ enqueue(queue, *this, slice, lws_hint(), use_dummy_work_items);
+ }
+ while(skip_sliding_window && window.slide_window_slice_3D(slice));
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h
new file mode 100644
index 0000000000..acc2380031
--- /dev/null
+++ b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_RUNTIME_GPU_CL_CLKERNELRUNTIME
+#define SRC_DYNAMIC_FUSION_RUNTIME_GPU_CL_CLKERNELRUNTIME
+
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+struct GpuKernelSourceCode;
+
+/** OpenCL runtime to run a single kernel */
+class ClKernelRuntime final : public opencl::IClKernel
+{
+public:
+ /** Configure the kernel runtime
+ *
+ * @param[in] compile_ctx OpenCL compile context
+ * @param[in] code Kernel source code
+ */
+ void configure(const opencl::ClCompileContext &compile_ctx, const GpuKernelSourceCode &code);
+ /** Run the kernel
+ *
+ * @param[in,out] tensors @ref ITensorPack object containing run-time tensor memories
+ * @param[in] window Execution window
+ * @param[in] queue OpenCL command queue
+ */
+ virtual void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ /** Set a kernel tensor argument
+ *
+ * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
+ * @param[in] arg Kernel argument descriptor accompanying @p tensor
+ * @param[in] tensor Tensor to set as an argument of the object's kernel
+ * @param[in] arg_slice Window the kernel will be run on
+ * @param[out] cl_images Extra cl images created from the tensor (will need to be retained until the kernel is enqueued)
+ */
+ inline void add_tensor_argument(unsigned int &idx, const GpuKernelArgumentInfo &arg, const ICLTensor *tensor, const Window &arg_slice, std::vector<cl::Image2D> &cl_images);
+
+private:
+ GpuKernelArgumentList _arguments{}; /** All kernel arguments required by the runtime */
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_RUNTIME_GPU_CL_CLKERNELRUNTIME */
diff --git a/src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp b/src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp
new file mode 100644
index 0000000000..549c6d4abb
--- /dev/null
+++ b/src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp
@@ -0,0 +1,351 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h"
+
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h"
+#include "support/Cast.h"
+
+#include <algorithm>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+/** Holder of any auxiliary @ref CLTensor required by a @ref GpuWorkloadSourceCode.
+ *
+ * @note The tensors are not allocated by default, and require the user to explicitly allocate them using the associated @ref TensorInfo and @ref AuxMemoryInfo
+ *
+ * @note This data holder must remain valid until the @ref ClWorkloadRuntime that uses it is out of scope
+ */
+class ClAuxTensors
+{
+public:
+ /** A view of a single auxiliary tensor and its associated @ref TensorInfo and @ref AuxMemoryInfo
+ */
+ struct DataView
+ {
+ DataView() = default;
+ DataView(CLTensor *tensor, const TensorInfo &tensor_info, const AuxMemoryInfo &memory_info)
+ : tensor{ tensor }, tensor_info{ tensor_info }, memory_info{ memory_info }
+ {
+ }
+ ~DataView() = default;
+ DataView(const DataView &other) = default;
+ DataView &operator=(const DataView &other) = default;
+ DataView(DataView &&other) = default;
+ DataView &operator=(DataView &&other) = default;
+ CLTensor *tensor{}; /**< Pointer to the auxiliary tensor */
+ TensorInfo tensor_info{}; /**< Associated tensor info */
+ AuxMemoryInfo memory_info{}; /**< Memory requirement */
+ };
+
+ /** Get views of all auxiliary tensors. This is mainly used for allocating the auxiliary tensors. */
+ std::vector<DataView> get_tensors()
+ {
+ return _tensors;
+ }
+ std::vector<DataView> get_tensors() const
+ {
+ return _tensors;
+ }
+
+ friend Status create_aux_tensors(ClAuxTensors *aux_tensors, const GpuWorkloadSourceCode &code);
+
+private:
+ /** Add auxiliary tensor.
+ *
+ * @param[in] tensor_info @ref ITensorInfo of the auxiliary tensor
+ * @param[in] memory_info Memory requirements of the auxiliary tensor
+ *
+ * @return CLTensor* Corresponding tensor memory if successfully added, otherwise nullptr
+ */
+ CLTensor *add_aux_tensor(const ITensorInfo &tensor_info, const AuxMemoryInfo &aux_memory_info)
+ {
+ const auto t_id = tensor_info.id();
+ auto find_tensor_pair = _owned_tensors.find(t_id);
+ if(find_tensor_pair != _owned_tensors.end())
+ {
+ return find_tensor_pair->second.get();
+ }
+ else
+ {
+ auto tensor = std::make_unique<CLTensor>();
+ auto inserted_pair = _owned_tensors.emplace(t_id, std::move(tensor)).first;
+ auto new_tensor = inserted_pair->second.get();
+ _tensors.emplace_back(new_tensor, tensor_info, aux_memory_info);
+ return new_tensor;
+ }
+ }
+
+ std::map<ITensorInfo::Id, std::unique_ptr<CLTensor>> _owned_tensors{};
+ std::vector<DataView> _tensors{};
+};
+/** Construct auxiliary tensors required by @ref GpuWorkloadSourceCode
+ *
+ * @note This is the only recommended way for users to create a @ref ClAuxTensors
+ *
+ * @param[out] aux_tensors Auxiliary tensors required by the workload code
+ * @param[in] code @ref GpuWorkloadSourceCode which all tensors bind to
+ *
+ * @return Status
+ */
+Status create_aux_tensors(ClAuxTensors *aux_tensors, const GpuWorkloadSourceCode &code)
+{
+ for(auto t_id : code.tensors())
+ {
+ // Get tensor object
+ const auto workload_arg = code.query_tensor(t_id);
+ ICLTensor *tensor_object = nullptr;
+ if(workload_arg->memory_descriptor()->memory_type == MemoryType::Auxiliary)
+ {
+ // Create aux tensor CLTensor object
+ const TensorInfo tensor_info = *workload_arg->tensor_info();
+ ARM_COMPUTE_ERROR_ON(tensor_info.id() != t_id);
+ const auto aux_memory_info = workload_arg->memory_descriptor()->aux_memory_info;
+ tensor_object = aux_tensors->add_aux_tensor(tensor_info, aux_memory_info);
+ }
+ if(tensor_object == nullptr)
+ {
+ return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Failed to construct an auxiliary tensor");
+ }
+ }
+ return Status{};
+}
+
+/** A fast tensor lookup table for runtime tensor objects retrieval
+ */
+class ClTensorLUT
+{
+public:
+ /** Find a tensor pack associated with the @ref UnitWorkloadId @p uwk_id
+ *
+ * @param[in] uwk_id @ref UnitWorkloadId associated with the tensor pack
+ *
+ * @return ITensorPack*
+ */
+ ITensorPack *find_tensor_pack(UnitWorkloadId uwk_id)
+ {
+ auto tensor_pack = _tensor_packs.find(uwk_id);
+ if(tensor_pack != _tensor_packs.end())
+ {
+ return &(tensor_pack->second);
+ }
+ return nullptr;
+ }
+ /** Get a tensor pack associated with @p uwk_id. Throws an exception if it cannot be found.
+ *
+ * @param[in] uwk_id @ref UnitWorkloadId associated with the tensor pack
+ *
+ * @return ITensorPack&
+ */
+ ITensorPack &get_tensor_pack(UnitWorkloadId uwk_id)
+ {
+ return _tensor_packs.at(uwk_id);
+ }
+
+ friend Status create_tensor_lut(ClTensorLUT *tensor_lut, const GpuWorkloadSourceCode &code, const std::vector<CLTensor *> &user_tensors, const ClAuxTensors &aux_tensors);
+
+private:
+ /** Add a tensor pack and associate it with @ref UnitWorkloadId @p uwk_id
+ *
+ * @param[in] uwk_id @ref UnitWorkloadId associated with the tensor pack
+ * @param[in] tensor_pack Tensor pack to be added
+ */
+ void add_tensor_pack(UnitWorkloadId uwk_id, const ITensorPack &tensor_pack)
+ {
+ _tensor_packs[uwk_id] = tensor_pack;
+ }
+ std::map<UnitWorkloadId, ITensorPack> _tensor_packs{};
+};
+
+/** Create a fast tensor lookup table for runtime tensor retrieval
+ *
+ * @param[out] tensor_lut @ref ClTensorLUT used by the runtime to feed tensor memories to underlying kernels
+ * @param[in] code @ref GpuWorkloadSourceCode which all tensors bind to
+ * @param[in] user_tensors User tensors
+ * @param[in] aux_tensors Auxiliary tensors required by the workload code
+ *
+ * @return Status
+ */
+Status create_tensor_lut(ClTensorLUT *tensor_lut, const GpuWorkloadSourceCode &code, const std::vector<CLTensor *> &user_tensors, const ClAuxTensors &aux_tensors)
+{
+ // Combine user tensors and aux tensors
+ std::map<ITensorInfo::Id, CLTensor *> tensor_map;
+ for(auto tensor : user_tensors)
+ {
+ const auto t_id = tensor->info()->id();
+ if(tensor_map.find(t_id) != tensor_map.end())
+ {
+ return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Clashing tensor ids");
+ }
+ tensor_map[t_id] = tensor;
+ }
+ for(const auto &data : aux_tensors.get_tensors())
+ {
+ const auto t_id = data.tensor_info.id();
+ const auto tensor = data.tensor;
+ if(tensor_map.find(t_id) != tensor_map.end())
+ {
+ return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Clashing tensor ids");
+ }
+ tensor_map[t_id] = tensor;
+ }
+
+ // Add tensor objects into corresponding tensor packs
+ for(auto id_tensor : tensor_map)
+ {
+ const auto t_id = id_tensor.first;
+ const auto tensor_object = id_tensor.second;
+ if(tensor_object == nullptr)
+ {
+ return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Trying to add a nullptr into the tensor packs");
+ }
+ if(tensor_object->allocator()->info().total_size() == 0U)
+ {
+ return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "No allocated memory found in tensor");
+ }
+
+ for(auto uwk_id : code.get_unit_workloads_from_tensor(t_id))
+ {
+ ITensorPack *tensor_pack = tensor_lut->find_tensor_pack(uwk_id);
+ if(tensor_pack == nullptr)
+ {
+ tensor_lut->add_tensor_pack(uwk_id, ITensorPack{ { t_id, tensor_object } });
+ }
+ else
+ {
+ tensor_pack->add_tensor(t_id, tensor_object);
+ }
+ }
+ }
+ return Status{};
+}
+
+} // namespace
+
+struct ClWorkloadRuntime::Implementation
+{
+ std::map<UnitWorkloadId, std::unique_ptr<ClKernelRuntime>> _kernels{};
+ std::map<UnitWorkloadId, std::unique_ptr<ClKernelRuntime>> _kernels_prep{};
+ bool _is_configured{ false };
+ bool _is_prepared{ false };
+ ClTensorLUT _tensor_lut{};
+ ClAuxTensors _aux_tensors{};
+ GpuWorkloadSourceCode _source_code{};
+};
+
+ClWorkloadRuntime::ClWorkloadRuntime()
+ : _impl{ std::make_unique<Implementation>() }
+{
+}
+
+ClWorkloadRuntime::~ClWorkloadRuntime() = default;
+
+Status ClWorkloadRuntime::configure(const GpuWorkloadSketch &sketch)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(_impl->_is_configured, "ClWorkloadRuntime cannot be re-configured");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(sketch.gpu_context()->gpu_language() != GpuLanguage::OpenCL, "ClWorkloadRuntime cannot be configured with non-OpenCL workload sketch");
+ // Generate source code
+ _impl->_source_code = sketch.implementation().generate_source_code();
+ // Configure unit workload from source code
+ for(auto uwk_id : _impl->_source_code.unit_workloads())
+ {
+ const auto work = _impl->_source_code.query_unit_workload(uwk_id);
+ const auto stage = work.stage().stage;
+ auto k = std::make_unique<ClKernelRuntime>();
+ k->configure(*sketch.gpu_context()->cl_compile_context(), work.code());
+
+ switch(stage)
+ {
+ case UnitWorkloadStage::Stage::Run:
+ _impl->_kernels.emplace(work.id(), std::move(k));
+ break;
+ case UnitWorkloadStage::Stage::Prepare:
+ _impl->_kernels_prep.emplace(work.id(), std::move(k));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Invalid unit workload stage");
+ }
+ }
+ // Create auxiliary tensor objects
+ const auto aux_st = create_aux_tensors(&_impl->_aux_tensors, _impl->_source_code);
+ ARM_COMPUTE_RETURN_ON_ERROR(aux_st);
+ _impl->_is_configured = true;
+ return Status{};
+}
+
+void ClWorkloadRuntime::prepare()
+{
+ if(!_impl->_is_prepared)
+ {
+ for(auto &id_kernel_pair : _impl->_kernels_prep)
+ {
+ const bool flush_queue = false;
+ const auto uwk_id = id_kernel_pair.first;
+ auto kernel = id_kernel_pair.second.get();
+ CLScheduler::get().enqueue_op(*kernel, _impl->_tensor_lut.get_tensor_pack(uwk_id), flush_queue);
+ }
+
+ _impl->_is_prepared = true;
+ }
+}
+
+Status ClWorkloadRuntime::run(const std::vector<CLTensor *> &tensors)
+{
+ // Need to create the tensor lut in every run, unless the user can guarantee the binding remains fixed,
+ // in which case the lut can be cached during prepare
+ const auto st = create_tensor_lut(&_impl->_tensor_lut, _impl->_source_code, tensors, _impl->_aux_tensors);
+ ARM_COMPUTE_RETURN_ON_ERROR(st);
+ prepare();
+ for(auto &id_kernel_pair : _impl->_kernels)
+ {
+ // Note: the command queue is not flushed here, even for the last kernel
+ const bool flush_queue = false;
+ const auto uwk_id = id_kernel_pair.first;
+ auto kernel = id_kernel_pair.second.get();
+ CLScheduler::get().enqueue_op(*kernel, _impl->_tensor_lut.get_tensor_pack(uwk_id), flush_queue);
+ }
+ return Status{};
+}
+
+std::vector<std::pair<CLTensor *, AuxMemoryInfo>> ClWorkloadRuntime::get_auxiliary_tensors()
+{
+ std::vector<std::pair<CLTensor *, AuxMemoryInfo>> aux_tensors;
+ for(const auto &data : _impl->_aux_tensors.get_tensors())
+ {
+ aux_tensors.emplace_back(data.tensor, data.memory_info);
+ }
+ return aux_tensors;
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/ArgumentPack.h b/src/dynamic_fusion/sketch/ArgumentPack.h
new file mode 100644
index 0000000000..f118d7d851
--- /dev/null
+++ b/src/dynamic_fusion/sketch/ArgumentPack.h
@@ -0,0 +1,242 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_ARGUMENTPACK
+#define SRC_DYNAMIC_FUSION_SKETCH_ARGUMENTPACK
+
+#include "arm_compute/core/experimental/Types.h"
+#include <unordered_map>
+#include <vector>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** This is a generic class that packs the arguments of an operator. For now, it is only used for tensor-related types
+ * Examples of "tensor-related types": @ref ITensorInfo, @ref ITensor, @ref ICLTensor
+ *
+ * The argument id is the position of the argument within the pack, and is represented by @ref TensorType
+ *
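+ * A minimal usage sketch (non-authoritative; src_info and dst_info are hypothetical tensor infos), assuming T = @ref ITensorInfo:
+ * @code
+ * ArgumentPack<ITensorInfo> pack{};
+ * pack.add_const_tensor(ACL_SRC_0, &src_info); // read-only input at position ACL_SRC_0
+ * pack.add_tensor(ACL_DST_0, &dst_info);       // mutable output at position ACL_DST_0
+ * const ITensorInfo *src = pack.get_const_tensor(ACL_SRC_0);
+ * const auto dst_tensors = pack.get_dst_tensors(); // every registered ACL_DST_* tensor
+ * @endcode
+ *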
+ * @tparam T Tensor-related type
+ */
+template <typename T>
+class ArgumentPack
+{
+public:
+ /** @ref TensorType encodes the position of a tensor argument within the pack */
+ using Id = TensorType;
+ /** A single argument element within the pack
+ * It contains either a const pointer or a non-const pointer to the Tensor-related type T, but never both at the same time
+ */
+ struct PackElement
+ {
+ PackElement() = default;
+ PackElement(const PackElement &elem) = default;
+ PackElement &operator=(const PackElement &elem) = default;
+ PackElement(PackElement &&elem) = default;
+ PackElement &operator=(PackElement &&elem) = default;
+ PackElement(Id id, T *tensor)
+ : id(id), tensor(tensor), ctensor(nullptr)
+ {
+ }
+ PackElement(Id id, const T *ctensor)
+ : id(id), tensor(nullptr), ctensor(ctensor)
+ {
+ }
+
+ Id id{ ACL_UNKNOWN }; /**< Argument id within the pack */
+ T *tensor{ nullptr }; /**< Non-const pointer to tensor-related object */
+ const T *ctensor{ nullptr }; /**< Const pointer to tensor-related object */
+ };
+
+public:
+ /** Default constructor */
+ ArgumentPack() = default;
+ /** Destructor */
+ ~ArgumentPack() = default;
+ /** Allow instances of this class to be copy constructed */
+ ArgumentPack<T>(const ArgumentPack<T> &other) = default;
+ /** Allow instances of this class to be copied */
+ ArgumentPack<T> &operator=(const ArgumentPack<T> &other) = default;
+ /** Allow instances of this class to be move constructed */
+ ArgumentPack<T>(ArgumentPack<T> &&other) = default;
+ /** Allow instances of this class to be moved */
+ ArgumentPack<T> &operator=(ArgumentPack<T> &&other) = default;
+ /** Initializer list constructor */
+ ArgumentPack(const std::initializer_list<PackElement> &l)
+ : _pack{}
+ {
+ for(const auto &e : l)
+ {
+ _pack[e.id] = e;
+ }
+ }
+ /** Add tensor to the pack
+ *
+ * @param[in] id ID of the tensor to add
+ * @param[in] tensor Tensor to add
+ */
+ void add_tensor(Id id, T *tensor)
+ {
+ _pack[id] = PackElement(id, tensor);
+ }
+ /** Add const tensor to the pack
+ *
+ * @param[in] id ID of the tensor to add
+ * @param[in] tensor Tensor to add
+ */
+ void add_const_tensor(Id id, const T *tensor)
+ {
+ _pack[id] = PackElement(id, tensor);
+ }
+ /** Get tensor of a given id from the pack
+ *
+ * @param[in] id ID of tensor to extract
+ *
+ * @return Pointer to the tensor if it exists and is non-const, otherwise nullptr
+ */
+ T *get_tensor(Id id)
+ {
+ auto it = _pack.find(id);
+ return it != _pack.end() ? it->second.tensor : nullptr;
+ }
+ /** Get constant tensor of a given id
+ *
+ * @param[in] id ID of tensor to extract
+ *
+ * @return Pointer to the tensor (const or not) if it exists, otherwise nullptr
+ */
+ const T *get_const_tensor(Id id) const
+ {
+ auto it = _pack.find(id);
+ if(it != _pack.end())
+ {
+ return it->second.ctensor != nullptr ? it->second.ctensor : it->second.tensor;
+ }
+ return nullptr;
+ }
+ /** Remove the tensor stored with the given id
+ *
+ * @param[in] id ID of tensor to remove
+ */
+ void remove_tensor(Id id)
+ {
+ _pack.erase(id);
+ }
+ /** Pack size accessor
+ *
+ * @return Number of tensors registered to the pack
+ */
+ size_t size() const
+ {
+ return _pack.size();
+ }
+ /** Checks if pack is empty
+ *
+ * @return True if empty else false
+ */
+ bool empty() const
+ {
+ return _pack.empty();
+ }
+ /** Get the ACL_SRC_* tensors
+ *
+ * @return std::vector<T *>
+ */
+ std::vector<T *> get_src_tensors()
+ {
+ std::vector<T *> src_tensors{};
+ for(int id = static_cast<int>(TensorType::ACL_SRC); id <= static_cast<int>(TensorType::ACL_SRC_END); ++id)
+ {
+ auto tensor = get_tensor(static_cast<TensorType>(id));
+ if(tensor != nullptr)
+ {
+ src_tensors.push_back(tensor);
+ }
+ }
+ return src_tensors;
+ }
+ /** Get the const ACL_SRC_* tensors
+ *
+ * @return std::vector<const T *>
+ */
+ std::vector<const T *> get_const_src_tensors() const
+ {
+ std::vector<const T *> src_tensors{};
+ for(int id = static_cast<int>(TensorType::ACL_SRC); id <= static_cast<int>(TensorType::ACL_SRC_END); ++id)
+ {
+ auto tensor = get_const_tensor(static_cast<TensorType>(id));
+ if(tensor != nullptr)
+ {
+ src_tensors.push_back(tensor);
+ }
+ }
+ return src_tensors;
+ }
+ /** Get the ACL_DST_* tensors
+ *
+ * @return std::vector<T *>
+ */
+ std::vector<T *> get_dst_tensors()
+ {
+ std::vector<T *> dst_tensors{};
+ for(int id = static_cast<int>(TensorType::ACL_DST); id <= static_cast<int>(TensorType::ACL_DST_END); ++id)
+ {
+ auto tensor = get_tensor(static_cast<TensorType>(id));
+ if(tensor != nullptr)
+ {
+ dst_tensors.push_back(tensor);
+ }
+ }
+ return dst_tensors;
+ }
+ /** Get the const ACL_DST_* tensors
+ *
+ * @return std::vector<const T *>
+ */
+ std::vector<const T *> get_const_dst_tensors() const
+ {
+ std::vector<const T *> dst_tensors{};
+ for(int id = static_cast<int>(TensorType::ACL_DST); id <= static_cast<int>(TensorType::ACL_DST_END); ++id)
+ {
+ auto tensor = get_const_tensor(static_cast<TensorType>(id));
+ if(tensor != nullptr)
+ {
+ dst_tensors.push_back(tensor);
+ }
+ }
+ return dst_tensors;
+ }
+
+private:
+ std::unordered_map<int, PackElement> _pack{}; /**< Container with the packed tensors */
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_ARGUMENTPACK */
diff --git a/src/dynamic_fusion/sketch/OperatorAttributes.cpp b/src/dynamic_fusion/sketch/OperatorAttributes.cpp
new file mode 100644
index 0000000000..51ec444587
--- /dev/null
+++ b/src/dynamic_fusion/sketch/OperatorAttributes.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/dynamic_fusion/sketch/OperatorAttributes.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+Conv2dAttributes &Conv2dAttributes::pad(const Padding2D &pad)
+{
+ _pad = pad;
+ return *this;
+}
+Padding2D Conv2dAttributes::pad() const
+{
+ return _pad;
+}
+Conv2dAttributes &Conv2dAttributes::stride(const Size2D &stride)
+{
+ _stride = stride;
+ return *this;
+}
+Size2D Conv2dAttributes::stride() const
+{
+ return _stride;
+}
+Conv2dAttributes &Conv2dAttributes::dilation(const Size2D &dilation)
+{
+ _dilation = dilation;
+ return *this;
+}
+Size2D Conv2dAttributes::dilation() const
+{
+ return _dilation;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/GpuComponentServices.h b/src/dynamic_fusion/sketch/gpu/GpuComponentServices.h
new file mode 100644
index 0000000000..93881508bb
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuComponentServices.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTSERVICES
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTSERVICES
+
+#include "src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Services that are used throughout the creation phase of workload code
+ */
+class GpuComponentServices
+{
+public:
+ /** Default constructor */
+ GpuComponentServices() = default;
+ /** Get reference to component factory */
+ GpuKernelComponentFactory &component_factory()
+ {
+ return _comp_factory;
+ }
+
+private:
+ GpuKernelComponentFactory _comp_factory{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTSERVICES */
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.cpp
new file mode 100644
index 0000000000..9cecfc2ffd
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+bool operator==(const GpuKernelArgumentInfo &info0, const GpuKernelArgumentInfo &info1)
+{
+ return info0.type == info1.type;
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h b/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h
new file mode 100644
index 0000000000..eb36e91d48
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELARGUMENT
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELARGUMENT
+
+#include "arm_compute/core/TensorInfo.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Contains the information required to set up a kernel argument at run time
+ */
+struct GpuKernelArgumentInfo
+{
+ /** Enumerates all the tensor argument variants used by the kernel implementations. */
+ enum class Type : int
+ {
+ Scalar,
+
+ Vector,
+
+ Image,
+ Image_Reinterpret_As_3D,
+ Image_Export_To_ClImage2D,
+
+ Image_3D, // 3D Tensor represented as a 2D Image + stride_z
+ Image_3D_Export_To_ClImage2D,
+
+ Tensor_3D,
+ Tensor_4D,
+ Tensor_4D_t_Buffer,
+ Tensor_4D_t_Image
+ };
+ /** Default constructor */
+ GpuKernelArgumentInfo() = default;
+ /** Constructor */
+ GpuKernelArgumentInfo(Type type)
+ : type{ type }
+ {
+ }
+ Type type{ Type::Tensor_4D_t_Buffer };
+};
+
+bool operator==(const GpuKernelArgumentInfo &info0, const GpuKernelArgumentInfo &info1);
+
+/** Kernel argument information linked with its corresponding @ref ITensorInfo
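+ *
+ * A rough construction sketch (non-authoritative; @c conv_dst_info is a hypothetical @ref TensorInfo):
+ * @code
+ * GpuKernelArgument arg{ conv_dst_info, GpuKernelArgumentInfo{ GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer } };
+ * const bool valid = arg.has_valid_id(); // true only if the associated workload tensor has a valid id
+ * @endcode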
+ */
+class GpuKernelArgument
+{
+public:
+ /** Constructor
+ *
+ * @param[in] tensor_info Associated @ref ITensorInfo
+ * @param[in] kernel_arg_info Associated @ref GpuKernelArgumentInfo
+ */
+ GpuKernelArgument(const ITensorInfo &tensor_info,
+ const GpuKernelArgumentInfo &kernel_arg_info)
+ : _tensor_info{ tensor_info },
+ _kernel_arg_info{ kernel_arg_info }
+ {
+ }
+ /** Get workload tensor id */
+ ITensorInfo::Id id() const
+ {
+ return _tensor_info.id();
+ }
+ /** Get associated @ref ITensorInfo */
+ ITensorInfo *tensor_info()
+ {
+ return &_tensor_info;
+ }
+ /** Get associated @ref ITensorInfo */
+ const ITensorInfo *tensor_info() const
+ {
+ return &_tensor_info;
+ }
+ /** Get associated @ref GpuKernelArgumentInfo */
+ GpuKernelArgumentInfo *kernel_argument_info()
+ {
+ return &_kernel_arg_info;
+ }
+ /** Get associated @ref GpuKernelArgumentInfo */
+ const GpuKernelArgumentInfo *kernel_argument_info() const
+ {
+ return &_kernel_arg_info;
+ }
+ /** Check if the associated workload tensor has a valid id
+ *
+ * @return true if it has a valid id
+ * @return false otherwise
+ */
+ bool has_valid_id() const
+ {
+ return _tensor_info.has_valid_id();
+ }
+
+private:
+ TensorInfo _tensor_info{};
+ GpuKernelArgumentInfo _kernel_arg_info{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELARGUMENT */
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp
new file mode 100644
index 0000000000..6e6422c957
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "GpuKernelComponentGraph.h"
+
+#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+/** Automatically create memory descriptors for all tensors in the graph
+ *
+ * @param[in] tensors @ref ITensorInfo map
+ * @param[in] graph @ref DependencyGraph of which the @p tensors are a part
+ *
+ * @return MemoryDescriptorMap An assignment map of @ref MemoryDescriptors for each ITensorInfo in the graph
+ */
+MemoryDescriptorMap assign_memory_descriptors(const std::map<ITensorInfo::Id, const ITensorInfo *> tensors, const DependencyGraph &graph)
+{
+ MemoryDescriptorMap mem_map{};
+ for(auto t_id : graph.all_tensors())
+ {
+ const auto &tensor = tensors.at(t_id);
+ // Only global src and dst tensors to the entire component graph are "User" tensors, which are user-specified memories
+ if(is_in(t_id, graph.global_src_tensors()) || is_in(t_id, graph.global_dst_tensors()))
+ {
+ mem_map[t_id] = MemoryDescriptor{ MemoryType::User };
+ }
+ else
+ {
+ AuxMemoryInfo aux_mem_info{ tensor->total_size() };
+ mem_map[t_id] = MemoryDescriptor{ MemoryType::Auxiliary, aux_mem_info };
+ }
+ }
+ return mem_map;
+}
+
+} // namespace
+
+std::vector<DependencyGraph::TensorId> GpuKernelComponentGraph::get_tensor_ids(const std::vector<const ITensorInfo *> tensors)
+{
+ std::vector<DependencyGraph::TensorId> tensor_ids{};
+ std::transform(
+ std::begin(tensors), std::end(tensors),
+ std::back_inserter(tensor_ids),
+ [](const auto & t)
+ {
+ return t->id();
+ });
+ return tensor_ids;
+}
+
+GpuKernelComponentGraph::GpuKernelComponentGraph(GpuComponentServices *services)
+ : _services{ services }, _components{}, _tensors{}, _dependency_graph{}
+{
+}
+
+GpuKernelComponentStream GpuKernelComponentGraph::fuse() const
+{
+ // Obtain memory descriptor map
+ const auto mem_map = assign_memory_descriptors(_tensors, _dependency_graph);
+ /// @note Fusion constraints (for kernel components) are exactly the same as the invariants of @ref GpuKernelComponentGroup
+ /// Fusion can be framed as a mathematical optimization problem:
+ /// Given fusion constraints, find the "best" fusion patterns possible
+ /// "Best" is ill-defined at the moment. For now we define the "best" fusion pattern as the one
+ /// which results in the fewest fused kernels ( @ref GpuKernelComponentGroup ) at the end
+
+ /// As the first iteration, we offer a sub-optimal algorithm here which ensures all
+ /// constraints are met, but provides no guarantee that the fusion pattern is optimal
+
+ GpuKernelComponentStream stream{ _services, mem_map };
+ // Break down into linear groups of components (constraint 1), preserving topological order
+ const auto linear_graphs = _dependency_graph.topological_partition();
+
+ // Further divide up the linear groups based on the rest of the fusion constraints (relying on the component group's invariants)
+ for(const auto &graph : linear_graphs)
+ {
+ for(unsigned int i = 0; i < graph.size(); ++i)
+ {
+ const auto comp = _components.at(graph[i].op).get();
+ // Each new linear graph signals a new component group in the stream
+ if(i == 0)
+ {
+ stream.new_component_group();
+ }
+ // If adding the component violates the component group's invariants / fusion constraints, break up the stream by inserting a new group
+ bool success = stream.add_component(comp);
+ if(!success)
+ {
+ stream.new_component_group();
+ success = stream.add_component(comp);
+ ARM_COMPUTE_ERROR_ON(!success);
+ }
+ }
+ }
+ return stream;
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h
new file mode 100644
index 0000000000..fbcb2c10ea
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGRAPH
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGRAPH
+
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuComponentServices.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h"
+#include "src/dynamic_fusion/sketch/utils/DependencyGraph.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class IGpuKernelComponent;
+
+/** A multi-input (tensors), multi-output (tensors) acyclic directed graph of gpu kernel components
+ * Its main purposes are:
+ * - Perform "graph-level" optimizations like fusion of kernel components (not the fusion of operators)
+ * - Automatically assign memory descriptors ( @ref MemoryDescriptor ) to all tensors based on graph topology
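+ *
+ * A rough usage sketch (non-authoritative; @c MyComponent and its arguments are hypothetical):
+ * @code
+ * GpuKernelComponentGraph graph{ &services };
+ * graph.add_new_component<MyComponent>(properties, tensors); // arguments are forwarded to the component factory
+ * GpuKernelComponentStream stream = graph.fuse();            // fuse and serialize into component groups
+ * @endcode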
+ */
+class GpuKernelComponentGraph
+{
+public:
+ /** Constructor
+ *
+ * @param[in] services @ref GpuComponentServices to be used by the graph
+ */
+ GpuKernelComponentGraph(GpuComponentServices *services);
+ /** Prevent instances of this class from being copy constructed */
+ GpuKernelComponentGraph(const GpuKernelComponentGraph &graph) = delete;
+ /** Prevent instances of this class from being copied */
+ GpuKernelComponentGraph &operator=(const GpuKernelComponentGraph &graph) = delete;
+ /** Allow instances of this class to be move constructed */
+ GpuKernelComponentGraph(GpuKernelComponentGraph &&graph) = default;
+ /** Allow instances of this class to be moved */
+ GpuKernelComponentGraph &operator=(GpuKernelComponentGraph &&graph) = default;
+ /** Create a new component and add it to the component graph
+ * Component id is automatically allocated
+ *
+ * @tparam T Component type
+ * @tparam Args Component argument types
+ *
+ * @param[in] args Component arguments except for component id, which is auto-allocated
+ */
+ template <typename T, typename... Args>
+ void add_new_component(Args &&... args)
+ {
+ auto comp = _services->component_factory().create<T>(std::forward<Args>(args)...);
+ ArgumentPack<ITensorInfo> tensors = comp->tensors();
+ const auto src_tensor_ids = get_tensor_ids(tensors.get_const_src_tensors());
+ const auto dst_tensor_ids = get_tensor_ids(tensors.get_const_dst_tensors());
+ bool success = _dependency_graph.add_operator(comp->id(), src_tensor_ids, dst_tensor_ids);
+ ARM_COMPUTE_ERROR_ON(!success);
+ _components[comp->id()] = std::move(comp);
+ for(auto t : tensors.get_const_src_tensors())
+ {
+ _tensors[t->id()] = t;
+ }
+ for(auto t : tensors.get_const_dst_tensors())
+ {
+ _tensors[t->id()] = t;
+ }
+ }
+ /** Perform component fusion and serialize the graph into a stream of component groups
+ */
+ GpuKernelComponentStream fuse() const;
+
+private:
+ static std::vector<DependencyGraph::TensorId> get_tensor_ids(const std::vector<const ITensorInfo *> tensors);
+ GpuComponentServices *_services;
+ std::map<ComponentId, std::unique_ptr<IGpuKernelComponent>> _components;
+ std::map<ITensorInfo::Id, const ITensorInfo *> _tensors;
+ DependencyGraph _dependency_graph{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGRAPH */
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp
new file mode 100644
index 0000000000..3af4c1429d
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp
@@ -0,0 +1,291 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "GpuKernelComponentGroup.h"
+
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+bool GpuKernelComponentGroup::add_component(ComponentPtr component)
+{
+ // note: Constraint 1 is guaranteed as a precondition
+ // Constraint 2
+ if(component->type() != GpuComponentType::Output && _components.size() >= max_fused_components)
+ {
+ return false;
+ }
+ // Constraint 3.1: Pattern: (Unfusable + Output)
+ if(!_components.empty() && get_root_component()->type() == GpuComponentType::Unfusable && component->type() != GpuComponentType::Output)
+ {
+ return false;
+ }
+ // Constraint 3.2
+ if(!_components.empty() && (component->type() != GpuComponentType::Simple && component->type() != GpuComponentType::Output))
+ {
+ return false;
+ }
+ // Constraint 3.3: Disallow multiple output components
+ if(!_components.empty() && get_last_component()->type() == GpuComponentType::Output && component->type() == GpuComponentType::Output)
+ {
+ return false;
+ }
+ // Constraint 4
+ if(component->type() != GpuComponentType::Unfusable && component->tensors().get_const_dst_tensors().size() != 1U)
+ {
+ return false;
+ }
+ // Constraint 5
+ if(!_components.empty() && !(get_root_component()->properties() == component->properties()))
+ {
+ return false;
+ }
+ // Constraint 7
+ if(!_components.empty())
+ {
+ const auto root_dst_tensors = get_root_component()->tensors().get_const_dst_tensors();
+ ARM_COMPUTE_ERROR_ON(root_dst_tensors.empty());
+ const auto first_dst_tensor = root_dst_tensors[0];
+ const auto dst_tensors = component->tensors().get_const_dst_tensors();
+ for(const auto &t : root_dst_tensors)
+ {
+ if(detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0))
+ {
+ return false;
+ }
+ }
+ for(const auto &t : dst_tensors)
+ {
+ if(detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0))
+ {
+ return false;
+ }
+ }
+ }
+ // Constraint 8
+ if(!_components.empty())
+ {
+ const auto root_dst_tensors = get_root_component()->tensors().get_const_dst_tensors();
+ ARM_COMPUTE_ERROR_ON(root_dst_tensors.empty());
+ const auto first_dst_tensor_layout = root_dst_tensors[0]->data_layout();
+ const auto dst_tensors = component->tensors().get_const_dst_tensors();
+ for(const auto &t : root_dst_tensors)
+ {
+ if(t->data_layout() != first_dst_tensor_layout)
+ {
+ return false;
+ }
+ }
+ for(const auto &t : dst_tensors)
+ {
+ if(t->data_layout() != first_dst_tensor_layout)
+ {
+ return false;
+ }
+ }
+ }
+ // Constraint 9
+ if(component->tensors().get_const_dst_tensors().size() >= max_dst_tensors)
+ {
+ return false;
+ }
+ // Constraint 9 corollary
+ if(component->type() == GpuComponentType::Output && _components.size() >= max_fused_components + max_dst_tensors)
+ {
+ return false;
+ }
+ _components.push_back(component);
+ return true;
+}
+
+std::vector<const ITensorInfo *> GpuKernelComponentGroup::get_src_tensors() const
+{
+ if(_components.empty())
+ {
+ return {};
+ }
+ auto src_tensors = _components[0]->tensors().get_const_src_tensors();
+ auto prev_dst_tensor = _components[0]->tensors().get_const_dst_tensors()[0]; // PRE: Only one dst tensor per component
+ for(unsigned int i = 1; i < _components.size(); ++i)
+ {
+ auto cur_src_tensors = _components[i]->tensors().get_const_src_tensors();
+ for(const auto src_tensor : cur_src_tensors)
+ {
+ if(src_tensor->id() == prev_dst_tensor->id())
+ {
+ continue; // Skip "intermediate" tensors. I.e. tensors that are used to link between two components
+ }
+ src_tensors.push_back(src_tensor);
+ }
+ prev_dst_tensor = _components[i]->tensors().get_const_dst_tensors()[0]; // PRE: Only one dst tensor per component
+ }
+
+ return src_tensors;
+}
+
+std::vector<const ITensorInfo *> GpuKernelComponentGroup::get_dst_tensors() const
+{
+ if(_components.empty())
+ {
+ return {};
+ }
+ const auto dst_tensor_ptrs = _components[_components.size() - 1]->tensors().get_const_dst_tensors();
+ std::vector<const ITensorInfo *> dst_tensors;
+ for(auto tensor_ptr : dst_tensor_ptrs)
+ {
+ dst_tensors.push_back(tensor_ptr);
+ }
+ return dst_tensors;
+}
+
+std::vector<const ITensorInfo *> GpuKernelComponentGroup::get_argument_tensors() const
+{
+ std::vector<const ITensorInfo *> arguments;
+ const auto src_tensors = get_src_tensors();
+ const auto dst_tensors = get_dst_tensors();
+ arguments.reserve(src_tensors.size() + dst_tensors.size());
+ arguments.insert(arguments.end(), src_tensors.begin(), src_tensors.end());
+ arguments.insert(arguments.end(), dst_tensors.begin(), dst_tensors.end());
+ return arguments;
+}
+
+GpuKernelComponentGroup::ComponentPtr GpuKernelComponentGroup::get_root_component() const
+{
+ if(empty())
+ {
+ return nullptr;
+ }
+ return _components[0];
+}
+
+GpuKernelComponentGroup::ComponentPtr GpuKernelComponentGroup::get_last_component() const
+{
+ if(empty())
+ {
+ return nullptr;
+ }
+ return _components[_components.size() - 1];
+}
+
+GpuKernelComponentGroup::ComponentPtr GpuKernelComponentGroup::get_previous_component(ComponentId id) const
+{
+ if(empty())
+ {
+ return nullptr;
+ }
+ // Get the index of the requested component
+ size_t ind = 0;
+ for(const auto c : _components)
+ {
+ if(c->id() == id)
+ {
+ break;
+ }
+ ind++;
+ }
+ if(ind == 0 || ind >= _components.size())
+ {
+ return nullptr;
+ }
+ return _components[ind - 1];
+}
+
+bool GpuKernelComponentGroup::is_intermediate_tensor(const ITensorInfo *tensor) const
+{
+ return is_tensor_in(tensor, get_interm_tensors());
+}
+
+size_t GpuKernelComponentGroup::size() const
+{
+ return _components.size();
+}
+bool GpuKernelComponentGroup::empty() const
+{
+ return _components.empty();
+}
+GpuKernelComponentGroup::ComponentPtr &GpuKernelComponentGroup::operator[](size_t index)
+{
+ return _components[index];
+}
+const GpuKernelComponentGroup::ComponentPtr &GpuKernelComponentGroup::operator[](size_t index) const
+{
+ return _components[index];
+}
+typename std::vector<GpuKernelComponentGroup::ComponentPtr>::iterator GpuKernelComponentGroup::begin()
+{
+ return _components.begin();
+}
+typename std::vector<GpuKernelComponentGroup::ComponentPtr>::iterator GpuKernelComponentGroup::end()
+{
+ return _components.end();
+}
+typename std::vector<GpuKernelComponentGroup::ComponentPtr>::const_iterator GpuKernelComponentGroup::begin() const
+{
+ return _components.cbegin();
+}
+typename std::vector<GpuKernelComponentGroup::ComponentPtr>::const_iterator GpuKernelComponentGroup::end() const
+{
+ return _components.cend();
+}
+typename std::vector<GpuKernelComponentGroup::ComponentPtr>::const_iterator GpuKernelComponentGroup::cbegin() const
+{
+ return _components.cbegin();
+}
+typename std::vector<GpuKernelComponentGroup::ComponentPtr>::const_iterator GpuKernelComponentGroup::cend() const
+{
+ return _components.cend();
+}
+
+std::vector<const ITensorInfo *> GpuKernelComponentGroup::get_interm_tensors() const
+{
+ std::vector<const ITensorInfo *> interm_tensors{};
+ for(unsigned int i = 0; i + 1 < _components.size(); ++i)
+ {
+ auto interm_tensor = _components[i]->tensors().get_const_dst_tensors()[0];
+ interm_tensors.push_back(interm_tensor); // PRE: Only one dst tensor per component
+ }
+
+ return interm_tensors;
+}
+
+bool GpuKernelComponentGroup::is_tensor_in(const ITensorInfo *tensor, const std::vector<const ITensorInfo *> tensors)
+{
+ for(auto t : tensors)
+ {
+ if(tensor->id() == t->id())
+ {
+ return true;
+ }
+ }
+ return false;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h
new file mode 100644
index 0000000000..4c9d940594
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGROUP
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGROUP
+
+#include "components/Types.h"
+
+#include <cstdint>
+#include <cstdlib>
+#include <vector>
+
+namespace arm_compute
+{
+/** Forward declaration */
+class ITensorInfo;
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class IGpuKernelComponent;
+/** A group of gpu kernel components to be fused together
+ * PRECONDITIONS:
+ * 1. Fusion is limited to a linear sequence of kernel components
+ * INVARIANTS:
+ * @note These preconditions and invariants are exactly the same as fusion constraints for kernel components
+ * 2. Max number of components that can be fused is @ref GpuKernelComponentGroup::max_fused_components (
+ * excluding any output or input (if any) components.
+ * The max number of output components is bound by the maximum number of dst tensors allowed for a component / component group
+ * )
+ * 3. The fusion is subject to the pattern: (Complex + Simple * | Simple + Simple * | Un-fusable) + Output?
+ * 4. All components except unfusable ones have exactly 1 dst tensor
+ * 5. All fused components share the same @ref IGpuKernelComponent::Properties ( @ref UnitWorkloadStage etc. )
+ * 6. All fused components share the same tunable parameters like tile size
+ * 7. All fused components share the same dst tensor shape
+ * 8. All fused components' tensors share the same @ref DataLayout
+ * 9. Maximum number of dst tensors allowed for a component (including unfusable) / component group is @ref GpuKernelComponentGroup::max_dst_tensors
+ * This has an impact on the total number of components supported, which = max_fused_components + max_dst_tensors
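+ *
+ * A rough usage sketch (non-authoritative; @c component is a hypothetical @ref IGpuKernelComponent pointer):
+ * @code
+ * GpuKernelComponentGroup group{};
+ * if(!group.add_component(component)) // returns false if any of the above constraints would be violated
+ * {
+ *     // The component cannot be fused into this group; start a new group for it instead
+ * }
+ * @endcode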
+ */
+class GpuKernelComponentGroup
+{
+public:
+ using ComponentPtr = IGpuKernelComponent *;
+ /** Maximum number of components that can be fused into the same component group
+ */
+ static constexpr size_t max_fused_components = 64;
+ /** Maximum number of dst tensors allowed for a component / component group
+ */
+ static constexpr size_t max_dst_tensors = 8;
+
+public:
+ /** Default constructor */
+ GpuKernelComponentGroup() = default;
+ /** Allow instances of this class to be copy constructed */
+ GpuKernelComponentGroup(const GpuKernelComponentGroup &) = default;
+ /** Allow instances of this class to be copied */
+ GpuKernelComponentGroup &operator=(const GpuKernelComponentGroup &) = default;
+ /** Allow instances of this class to be move constructed */
+ GpuKernelComponentGroup(GpuKernelComponentGroup &&) = default;
+ /** Allow instances of this class to be moved */
+ GpuKernelComponentGroup &operator=(GpuKernelComponentGroup &&) = default;
+ /** Add a component pointer into the group
+ * If the operation fails, then no change is made to the group
+ *
+ * @param[in] component Pointer to the component to be added
+ *
+ * @return true If the operation is successful
+ * @return false If the operation fails
+ */
+ bool add_component(ComponentPtr component);
+ /** Get source tensors of this group */
+ std::vector<const ITensorInfo *> get_src_tensors() const;
+ /** Get destination tensors of this group */
+ std::vector<const ITensorInfo *> get_dst_tensors() const;
+ /** Get the tensor arguments of this group
+ * A tensor is an argument if it is a source or destination tensor to the group
+ */
+ std::vector<const ITensorInfo *> get_argument_tensors() const;
+ /** Get the root (first) component of this group */
+ ComponentPtr get_root_component() const;
+ /** Get the last component of this group */
+ ComponentPtr get_last_component() const;
+ /** Get the previous component to the component with id @p id
+ *
+ * @param[in] id Component id of the component whose previous component is of concern
+ *
+ * @return ComponentPtr Pointer to the previous component of the one identified by @p id
+ */
+ ComponentPtr get_previous_component(ComponentId id) const;
+ /** Check if a @ref ITensorInfo is an "intermediate" tensor of the group
+ *
+ * An intermediate tensor is any tensor that is not an argument.
+ *
+ * @param[in] tensor @ref ITensorInfo to be looked up
+ *
+ * @return true If @p tensor is an intermediate tensor
+ * @return false Otherwise
+ */
+ bool is_intermediate_tensor(const ITensorInfo *tensor) const;
+ /** Get the number of components within the group */
+ size_t size() const;
+ /** Check if the component group is empty */
+ bool empty() const;
+ ComponentPtr &operator[](size_t index);
+ const ComponentPtr &operator[](size_t index) const;
+ typename std::vector<ComponentPtr>::iterator begin();
+ typename std::vector<ComponentPtr>::iterator end();
+ typename std::vector<ComponentPtr>::const_iterator begin() const;
+ typename std::vector<ComponentPtr>::const_iterator end() const;
+ typename std::vector<ComponentPtr>::const_iterator cbegin() const;
+ typename std::vector<ComponentPtr>::const_iterator cend() const;
+
+private:
+ std::vector<const ITensorInfo *> get_interm_tensors() const;
+
+ static bool is_tensor_in(const ITensorInfo *tensor, const std::vector<const ITensorInfo *> tensors);
+
+ std::vector<ComponentPtr> _components{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGROUP */
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp
new file mode 100644
index 0000000000..aac84b6c59
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "GpuKernelComponentStream.h"
+
+#include "src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h"
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+GpuKernelComponentStream::GpuKernelComponentStream(GpuComponentServices *services, const MemoryDescriptorMap &mem_map)
+ : _services{ services }, _component_groups{}, _mem_map{ mem_map }
+{
+}
+
+GpuWorkloadSourceCode GpuKernelComponentStream::write_workload_code()
+{
+ GpuWorkloadSourceCode source_code;
+ // Traverse through component groups and assemble workload together
+ for(auto && group : _component_groups)
+ {
+ // Write kernel code
+ GpuLogicalKernel logical_kernel(_services, group);
+ const GpuKernelSourceCode kernel_code = logical_kernel.write_kernel_code();
+ // The whole unit workload stage is determined by the root component
+ const auto unit_workload_stage = group.get_root_component()->properties().stage();
+ source_code.add_unit_workload(kernel_code, unit_workload_stage, _mem_map);
+ }
+ return source_code;
+}
+
+void GpuKernelComponentStream::new_component_group()
+{
+ _component_groups.emplace_back();
+}
+
+bool GpuKernelComponentStream::add_component(IGpuKernelComponent *component)
+{
+ ARM_COMPUTE_ERROR_ON(_component_groups.empty());
+ return _component_groups.back().add_component(component);
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h
new file mode 100644
index 0000000000..cbaa7c297b
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTSTREAM
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTSTREAM
+
+#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class GpuComponentServices;
+class IGpuKernelComponent;
+
+/** A linear sequence of component groups serialized from the @ref GpuKernelComponentGraph
+ * Each component group in the stream denotes a complete kernel that may consist of multiple components
+ *
+ * The main purposes of this class are:
+ * - Facilitate the component fusion algorithm by allowing insertion of new component groups into the stream
+ * - Invoke kernel writer and assemble the final @ref GpuWorkloadSourceCode
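+ *
+ * A rough usage sketch (non-authoritative; @c services, @c mem_map and @c component are hypothetical):
+ * @code
+ * GpuKernelComponentStream stream{ &services, mem_map };
+ * stream.new_component_group();
+ * if(!stream.add_component(component)) // false if the component cannot be fused into the current group
+ * {
+ *     stream.new_component_group();    // place it in a fresh group instead
+ *     stream.add_component(component);
+ * }
+ * GpuWorkloadSourceCode code = stream.write_workload_code();
+ * @endcode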
+ */
+class GpuKernelComponentStream
+{
+public:
+ /** Constructor
+ *
+ * @param[in] services @ref GpuComponentServices to be used throughout the stream
+ * @param[in] mem_map @ref MemoryDescriptor map used to assemble the @ref GpuWorkloadSourceCode
+ */
+ GpuKernelComponentStream(GpuComponentServices *services, const MemoryDescriptorMap &mem_map);
+ /** Allow instances of this class to be copy constructed */
+ GpuKernelComponentStream(const GpuKernelComponentStream &stream) = default;
+ /** Allow instances of this class to be copied */
+ GpuKernelComponentStream &operator=(const GpuKernelComponentStream &stream) = default;
+ /** Allow instances of this class to be move constructed */
+ GpuKernelComponentStream(GpuKernelComponentStream &&stream) = default;
+ /** Allow instances of this class to be moved */
+ GpuKernelComponentStream &operator=(GpuKernelComponentStream &&stream) = default;
+ /** Generate and assemble @ref GpuWorkloadSourceCode from the stream */
+ GpuWorkloadSourceCode write_workload_code();
+ /** Insert a new component group in the stream.
+ * Subsequent components are added to this group until the end of the stream or until new_component_group is called again
+ */
+ void new_component_group();
+ /** Add a component to the previously created component group
+ * Throw an error if no component group is present in the stream
+ *
+ * @param[in] component Component to be inserted
+ *
+ * @return true If the operation is successful
+ * @return false Otherwise
+ */
+ bool add_component(IGpuKernelComponent *component);
+
+private:
+ GpuComponentServices *_services;
+ std::vector<GpuKernelComponentGroup> _component_groups{};
+ MemoryDescriptorMap _mem_map{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTSTREAM */
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h b/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h
new file mode 100644
index 0000000000..7479328d7b
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELSOURCECODE
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELSOURCECODE
+
+#include "arm_compute/core/CL/CLCompileContext.h"
+#include "arm_compute/core/Window.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+
+#include <map>
+#include <string>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** The argument list of a @ref GpuKernelSourceCode */
+using GpuKernelArgumentList = std::map<ITensorInfo::Id, GpuKernelArgument>;
+
+/** Container of kernel code to be compiled and run in a @ref GpuUnitWorkload
+ */
+class GpuKernelSourceCode
+{
+public:
+ /** Set kernel name */
+ GpuKernelSourceCode &name(const std::string &n)
+ {
+ _name = n;
+ return *this;
+ }
+ /** Set kernel code */
+ GpuKernelSourceCode &code(const std::string &c)
+ {
+ _code = c;
+ return *this;
+ }
+ /** Set kernel config id string */
+ GpuKernelSourceCode &config_id(const std::string &c_id)
+ {
+ _config_id = c_id;
+ return *this;
+ }
+ /** Set kernel build options */
+ GpuKernelSourceCode &build_options(const CLBuildOptions &b_options)
+ {
+ _build_options = b_options;
+ return *this;
+ }
+ /** Set kernel execution window */
+ GpuKernelSourceCode &window(const Window &window)
+ {
+ _window = window;
+ return *this;
+ }
+ /** Set kernel argument list */
+ GpuKernelSourceCode &arguments(const GpuKernelArgumentList &arguments)
+ {
+ _arguments = arguments;
+ return *this;
+ }
+ /** Get kernel name */
+ std::string name() const
+ {
+ return _name;
+ }
+ /** Get kernel code */
+ std::string code() const
+ {
+ return _code;
+ }
+ /** Get kernel config id string */
+ std::string config_id() const
+ {
+ return _config_id;
+ }
+ /** Get kernel build options */
+ const CLBuildOptions &build_options() const
+ {
+ return _build_options;
+ }
+ /** Get kernel execution window */
+ const Window &window() const
+ {
+ return _window;
+ }
+ /** Get kernel argument list */
+ const GpuKernelArgumentList &arguments() const
+ {
+ return _arguments;
+ }
+
+private:
+ std::string _name{};
+ std::string _code{};
+ std::string _config_id{};
+ CLBuildOptions _build_options{};
+ Window _window{};
+ GpuKernelArgumentList _arguments{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELSOURCECODE */
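Since every setter returns *this, a kernel writer can assemble the container fluently; a minimal sketch, with all right-hand-side names hypothetical:

GpuKernelSourceCode kernel_code{};
kernel_code.name("fused_kernel")            // kernel name reported to the runtime
    .code(generated_source)                 // full generated kernel source
    .config_id(generated_config_id)         // config id used for tuning
    .build_options(build_opts)
    .window(execution_window)
    .arguments(kernel_args);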
diff --git a/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp b/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp
new file mode 100644
index 0000000000..7746f8bbf3
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "GpuLogicalKernel.h"
+
+#include "arm_compute/core/experimental/Types.h"
+
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuComponentServices.h"
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+GpuLogicalKernel::GpuLogicalKernel(GpuComponentServices *services, const GpuKernelComponentGroup &components)
+ : _services{ services }, _comp_group{ components }, _store_components{}
+{
+ add_load_store();
+}
+
+GpuKernelSourceCode GpuLogicalKernel::write_kernel_code()
+{
+ GpuKernelSourceCode code;
+ ClTemplateWriter writer{ _comp_group };
+
+ code.name(writer.get_name());
+ code.code(writer.get_code());
+ code.arguments(writer.get_tensors());
+ code.build_options(writer.get_build_options());
+ code.config_id(writer.get_config_id());
+ code.window(writer.get_window());
+
+ return code;
+}
+
+void GpuLogicalKernel::add_load_store()
+{
+ const auto dst_tensors = _comp_group.get_dst_tensors();
+ // Each dst tensor from the component group requires exactly one store component
+ for(const auto &dst_tensor : dst_tensors)
+ {
+ ArgumentPack<ITensorInfo> tensors;
+ // Pass the same destination tensor to both the source and the destination of the store component
+ // In other words, adding a store component does not create a new dst tensor
+ // This way the dst tensor of the component group cannot differ from that of the logical kernel
+ // This may seem to violate the acyclicity of the component graph, but it is fine because by the time
+ // the logical kernel is constructed we no longer need a graph representation of the components
+ // (the graph has already been serialized)
+ tensors.add_const_tensor(ACL_SRC_0, dst_tensor);
+ tensors.add_const_tensor(ACL_DST_0, dst_tensor);
+
+ auto store = _services->component_factory().create<ClComponentStore>(
+ _comp_group.get_root_component()->properties(), // Store components share the same properties as the root component
+ tensors);
+ _store_components.push_back(std::move(store));
+ auto success = _comp_group.add_component(_store_components.back().get());
+ ARM_COMPUTE_ERROR_ON(!success); // Any load/store insertion is guaranteed to succeed
+ }
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.h b/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.h
new file mode 100644
index 0000000000..4ce4443f60
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPULOGICALKERNEL
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPULOGICALKERNEL
+
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h"
+
+#include <memory>
+#include <vector>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+class GpuComponentServices;
+class IGpuKernelComponent;
+
+/** A wrapper-processor of a @ref GpuKernelComponentGroup
+ * It adds the load (if any) and store components to the component group
+ * A @ref GpuLogicalKernel represents a complete kernel and can invoke any kernel writer to generate the full kernel code
+ */
+class GpuLogicalKernel
+{
+public:
+ /** Constructor
+ *
+ * @param[in] services @ref GpuComponentServices to be used
+ * @param[in] components Component group from which this logical kernel is initialized
+ */
+ explicit GpuLogicalKernel(GpuComponentServices *services, const GpuKernelComponentGroup &components);
+ /** Allow instances of this class to be copy constructed */
+ GpuLogicalKernel(const GpuLogicalKernel &) = default;
+ /** Allow instances of this class to be copied */
+ GpuLogicalKernel &operator=(const GpuLogicalKernel &) = default;
+ /** Allow instances of this class to be move constructed */
+ GpuLogicalKernel(GpuLogicalKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ GpuLogicalKernel &operator=(GpuLogicalKernel &&) = default;
+ /** Generate a @ref GpuKernelSourceCode */
+ GpuKernelSourceCode write_kernel_code();
+
+private:
+ void add_load_store();
+
+ GpuComponentServices *_services;
+ GpuKernelComponentGroup _comp_group{};
+ std::vector<std::unique_ptr<IGpuKernelComponent>> _store_components{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPULOGICALKERNEL */
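A minimal sketch of the intended flow, assuming `services` and `group` are supplied by the component graph: constructing the logical kernel appends the store components, after which write_kernel_code() delegates to the CL template writer.

GpuLogicalKernel logical_kernel{ &services, group };
GpuKernelSourceCode kernel_code = logical_kernel.write_kernel_code();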
diff --git a/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp
new file mode 100644
index 0000000000..e8ef835405
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h"
+
+#include "arm_compute/core/Validate.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+std::vector<DependencyGraph::TensorId> get_tensor_ids(const std::vector<const ITensorInfo *> tensors)
+{
+ std::vector<DependencyGraph::TensorId> tensor_ids{};
+ std::transform(
+ std::begin(tensors), std::end(tensors),
+ std::back_inserter(tensor_ids),
+ [](const auto & t)
+ {
+ return t->id();
+ });
+ return tensor_ids;
+}
+
+} // namespace
+
+Operator::Operator(OperatorId id, GpuOperatorType operator_type, const ArgumentPack<ITensorInfo> &tensors)
+ : _id{ id }, _operator_type{ operator_type }, _tensors{ tensors }
+{
+}
+
+OperatorId Operator::id() const
+{
+ return _id;
+}
+
+GpuOperatorType Operator::operator_type() const
+{
+ return _operator_type;
+}
+
+ArgumentPack<ITensorInfo> Operator::tensors() const
+{
+ return _tensors;
+}
+
+bool GpuOperatorGroup::try_add_operator(const Operator &op) const
+{
+ const auto src_tensor_ids = get_tensor_ids(op.tensors().get_const_src_tensors());
+ const auto dst_tensor_ids = get_tensor_ids(op.tensors().get_const_dst_tensors());
+ // Constraint 1
+ if(!_graph.try_add_operator_as_linear(op.id(), src_tensor_ids, dst_tensor_ids))
+ {
+ return false;
+ }
+ // Constraint 2
+ if(_operators.size() >= max_fused_operators)
+ {
+ return false;
+ }
+ // Constraint 3.1: Pattern: (Unfusable)
+ if(_operators.size() > 0 && get_root_operator()->operator_type() == GpuOperatorType::Unfusable)
+ {
+ return false;
+ }
+ // Constraint 3.2
+ if(_operators.size() > 0 && (op.operator_type() != GpuOperatorType::Simple))
+ {
+ return false;
+ }
+ // Constraint 4
+ if(op.operator_type() != GpuOperatorType::Unfusable && op.tensors().get_const_dst_tensors().size() != 1U)
+ {
+ return false;
+ }
+ // Constraint 5
+ if(_operators.size() > 0)
+ {
+ const auto root_dst_tensors = get_root_operator()->tensors().get_const_dst_tensors();
+ ARM_COMPUTE_ERROR_ON(root_dst_tensors.empty());
+ const auto first_dst_tensor = root_dst_tensors[0];
+ const auto dst_tensors = op.tensors().get_const_dst_tensors();
+ for(const auto &t : root_dst_tensors)
+ {
+ if(detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0))
+ {
+ return false;
+ }
+ }
+ for(const auto &t : dst_tensors)
+ {
+ if(detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0))
+ {
+ return false;
+ }
+ }
+ }
+ // Constraint 6
+ if(_operators.size() > 0)
+ {
+ const auto root_dst_tensors = get_root_operator()->tensors().get_const_dst_tensors();
+ ARM_COMPUTE_ERROR_ON(root_dst_tensors.empty());
+ const auto first_dst_tensor_layout = root_dst_tensors[0]->data_layout();
+ const auto dst_tensors = op.tensors().get_const_dst_tensors();
+ for(const auto &t : root_dst_tensors)
+ {
+ if(t->data_layout() != first_dst_tensor_layout)
+ {
+ return false;
+ }
+ }
+ for(const auto &t : dst_tensors)
+ {
+ if(t->data_layout() != first_dst_tensor_layout)
+ {
+ return false;
+ }
+ }
+ }
+ return true;
+}
+void GpuOperatorGroup::add_operator(const Operator &op)
+{
+ ARM_COMPUTE_ERROR_ON(!try_add_operator(op));
+ const auto src_tensor_ids = get_tensor_ids(op.tensors().get_const_src_tensors());
+ const auto dst_tensor_ids = get_tensor_ids(op.tensors().get_const_dst_tensors());
+ _graph.add_operator_as_linear(op.id(), src_tensor_ids, dst_tensor_ids);
+ _operators[op.id()] = op;
+}
+Operator GpuOperatorGroup::new_operator(const GpuOperatorType &operator_type, const ArgumentPack<ITensorInfo> &tensors) const
+{
+ auto new_id = static_cast<OperatorId>(_operators.size());
+ return Operator{ new_id, operator_type, tensors };
+}
+const Operator *GpuOperatorGroup::get_root_operator() const
+{
+ const auto roots = _graph.get_root_ops();
+ ARM_COMPUTE_ERROR_ON(roots.size() > 1);
+ if(roots.empty())
+ {
+ return nullptr;
+ }
+ return &_operators.at(roots[0]);
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h
new file mode 100644
index 0000000000..35abe6c543
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUOPERATORGROUP
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUOPERATORGROUP
+
+#include "arm_compute/core/ITensorInfo.h"
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuOperatorProperties.h"
+#include "src/dynamic_fusion/sketch/utils/DependencyGraph.h"
+#include <map>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+using OperatorId = DependencyGraph::OperatorId;
+
+/** An operator for the sole purpose of validating fusion
+ */
+class Operator
+{
+public:
+ /** Default constructor */
+ Operator() = default;
+ /** Get Operator Id */
+ OperatorId id() const;
+ /** Get operator type */
+ GpuOperatorType operator_type() const;
+ /** Get tensor arguments */
+ ArgumentPack<ITensorInfo> tensors() const;
+ friend class GpuOperatorGroup;
+
+private:
+ Operator(OperatorId id, GpuOperatorType operator_type, const ArgumentPack<ITensorInfo> &tensors);
+ OperatorId _id{};
+ GpuOperatorType _operator_type{};
+ ArgumentPack<ITensorInfo> _tensors{};
+};
+
+/** A linear sequence of operators to be fused in a workload
+ * For the time being, this class is only used for validating operator fusion
+ * INVARIANTS:
+ * @note These invariants are exactly the same as operator fusion constraints
+ * 1. Fusion is limited to a linear sequence of operators
+ * 2. Max number of operators that can be fused is @ref GpuOperatorGroup::max_fused_operators
+ * 3. The fusion is subject to the pattern: Complex + Simple * | Simple + Simple * | Un-fusable
+ * 4. All operators, except unfusable ones, have exactly 1 dst tensor
+ * 5. All fused operators share the same dst tensor shape
+ * 6. All fused operators' tensors share the same @ref DataLayout
+ */
+class GpuOperatorGroup
+{
+public:
+ static constexpr size_t max_fused_operators = 32;
+ /** Try adding (without actually adding) an operator to the group
+ *
+ * @param[in] op Operator to be added
+ *
+ * @return true If @p op can be added while maintaining the invariants
+ * @return false Otherwise
+ */
+ bool try_add_operator(const Operator &op) const;
+ /** Add an operator to the group
+ *
+ * @param[in] op Operator to be added
+ */
+ void add_operator(const Operator &op);
+ /** Create a new operator
+ *
+ * @param[in] operator_type @ref GpuOperatorType of the new operator
+ * @param[in] tensors Tensor arguments to the new operator
+ *
+ * @return Operator
+ */
+ Operator new_operator(const GpuOperatorType &operator_type, const ArgumentPack<ITensorInfo> &tensors) const;
+ /** Get the "root operator" of the group, which is the first operator in a linear sequence
+ * @return const Operator* Pointer to the root operator
+ */
+ const Operator *get_root_operator() const;
+
+private:
+ DependencyGraph _graph{};
+ std::map<OperatorId, Operator> _operators{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUOPERATORGROUP */
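A minimal sketch of the intended validation flow, with `conv2d_tensors` a hypothetical ArgumentPack built by an operator's validate routine; a candidate operator is committed to the group only if every fusion invariant holds:

GpuOperatorGroup group{};
const Operator op = group.new_operator(GpuOperatorType::Complex, conv2d_tensors);
if(group.try_add_operator(op))
{
    group.add_operator(op); // recorded in the dependency graph and the operator map
}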
diff --git a/src/dynamic_fusion/sketch/gpu/GpuOperatorProperties.h b/src/dynamic_fusion/sketch/gpu/GpuOperatorProperties.h
new file mode 100644
index 0000000000..c77697c343
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuOperatorProperties.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUOPERATORPROPERTIES
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUOPERATORPROPERTIES
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Contain properties common to all operator types */
+
+/** Operator type in the context of fusion
+ */
+enum class GpuOperatorType
+{
+ /** Simple operators are operators that:
+ * 1. Have a 1-to-1 mapping between input elements and output elements, like elementwise operators
+ * 2. Have exactly 1 output
+ */
+ Simple,
+ /** Complex operators are operators that are not simple but are still fusable with simple ones
+ */
+ Complex,
+ /** Unfusable operators are operators that cannot be fused with any other types of operators
+ */
+ Unfusable
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUOPERATORPROPERTIES */
diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp b/src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp
new file mode 100644
index 0000000000..623bf351f8
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h"
+#include "arm_compute/core/CL/CLCompileContext.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+GpuWorkloadContext::GpuWorkloadContext(CLCompileContext *cl_compile_ctx)
+ : _gpu_language{ GpuLanguage::OpenCL }, _cl_compile_ctx{ cl_compile_ctx }
+{
+}
+
+GpuTarget GpuWorkloadContext::gpu_target() const
+{
+ return _cl_compile_ctx->get_gpu_target();
+}
+
+GpuLanguage GpuWorkloadContext::gpu_language() const
+{
+ return _gpu_language;
+}
+
+const CLCompileContext *GpuWorkloadContext::cl_compile_context() const
+{
+ return _cl_compile_ctx;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
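A minimal setup sketch, assuming the compile context is obtained from CLKernelLibrary as in the library's existing CL examples:

CLCompileContext cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
GpuWorkloadContext context{ &cl_compile_ctx };
const GpuTarget   target   = context.gpu_target();   // queried from the compile context
const GpuLanguage language = context.gpu_language();  // OpenCL for this context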
diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp
new file mode 100644
index 0000000000..ce7cf1e908
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+GpuWorkloadSketch::GpuWorkloadSketch(Context *context)
+ : _impl{ std::make_unique<Implementation>(context) }
+{
+}
+GpuWorkloadSketch::~GpuWorkloadSketch()
+{
+}
+
+const GpuWorkloadSketch::Context *GpuWorkloadSketch::gpu_context() const
+{
+ return _impl->context();
+}
+
+TensorInfo GpuWorkloadSketch::create_tensor_info(const ITensorInfo &tensor_info)
+{
+ TensorInfo tensor{ tensor_info };
+ tensor.set_id(allocate_new_tensor_id());
+ return tensor;
+}
+
+TensorInfo GpuWorkloadSketch::create_tensor_info()
+{
+ TensorInfo tensor{};
+ tensor.set_id(allocate_new_tensor_id());
+ return tensor;
+}
+
+ITensorInfo::Id GpuWorkloadSketch::allocate_new_tensor_id()
+{
+ return _impl->allocate_new_tensor_id();
+}
+
+GpuWorkloadSketch::Implementation &GpuWorkloadSketch::implementation()
+{
+ return *_impl;
+}
+const GpuWorkloadSketch::Implementation &GpuWorkloadSketch::implementation() const
+{
+ return *_impl;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
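A minimal sketch of tensor registration, assuming `context` is a valid workload context: each call allocates a fresh tensor id within the sketch, so infos created from the same source ITensorInfo are still distinct sketch tensors.

GpuWorkloadSketch sketch{ &context };
TensorInfo t_info{ TensorShape(32U, 16U, 16U, 1U), 1, DataType::F32 };
t_info.set_data_layout(DataLayout::NHWC);
TensorInfo src_info = sketch.create_tensor_info(t_info);
TensorInfo dst_info = sketch.create_tensor_info(); // shape and type are inferred later by the operator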
diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h
new file mode 100644
index 0000000000..3997395c98
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSKETCHIMPL
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSKETCHIMPL
+
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuComponentServices.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Internal implementation of @ref GpuWorkloadSketch */
+class GpuWorkloadSketch::Implementation
+{
+public:
+ /** Constructor
+ *
+ * @param[in] context global workload creation context
+ */
+ explicit Implementation(
+ Context *context)
+ : _context{ context },
+ _comp_services{},
+ _component_graph{ &_comp_services },
+ _operator_group{}
+ {
+ }
+ /** Prevent instances of this class from being copy constructed */
+ Implementation(const Implementation &impl) = delete;
+ /** Prevent instances of this class from being copied */
+ Implementation &operator=(const Implementation &impl) = delete;
+ /** Allow instances of this class to be move constructed */
+ Implementation(Implementation &&impl) = default;
+ /** Allow instances of this class to be moved */
+ Implementation &operator=(Implementation &&impl) = default;
+ /** Get workload context */
+ const Context *context() const
+ {
+ return _context;
+ }
+ /** Get component graph */
+ const GpuKernelComponentGraph &component_graph() const
+ {
+ return _component_graph;
+ }
+ /** Get component graph */
+ GpuKernelComponentGraph &component_graph()
+ {
+ return _component_graph;
+ }
+ /** Get operator group */
+ const GpuOperatorGroup &operator_group() const
+ {
+ return _operator_group;
+ }
+ /** Get operator group */
+ GpuOperatorGroup &operator_group()
+ {
+ return _operator_group;
+ }
+ ITensorInfo::Id allocate_new_tensor_id()
+ {
+ return ++_next_id;
+ }
+ /** Generate @ref GpuWorkloadSourceCode from the workload sketch
+ * @note The sketch must be valid. Any error encountered while building the code is thrown as an error.
+ *
+ * @return GpuWorkloadSourceCode The generated workload code
+ */
+ GpuWorkloadSourceCode generate_source_code() const
+ {
+ return component_graph().fuse().write_workload_code();
+ }
+
+private:
+ Context *_context;
+ GpuComponentServices _comp_services;
+ GpuKernelComponentGraph _component_graph;
+ GpuOperatorGroup _operator_group;
+ ITensorInfo::Id _next_id{ ITensorInfo::invalid_tensor_id };
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSKETCHIMPL */
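A minimal sketch of how the runtime side is expected to obtain the workload code from a finished sketch; fusion and code generation both happen inside generate_source_code():

const GpuWorkloadSourceCode source_code = sketch.implementation().generate_source_code();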
diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h
new file mode 100644
index 0000000000..2375f5c6c6
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h
@@ -0,0 +1,252 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSOURCECODE
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSOURCECODE
+
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Uniquely identifies a @ref GpuUnitWorkload within a @ref GpuWorkloadSourceCode */
+using UnitWorkloadId = int32_t;
+
+/** Describes all the info related to a workload argument in order to:
+ * - be used by the runtime to configure the gpu kernel arguments
+ * - be used by memory managers to allocate the required memory
+ */
+class GpuWorkloadArgument
+{
+public:
+ /** Default constructor */
+ GpuWorkloadArgument() = default;
+ /** Constructor
+ *
+ * @param[in] tensor_info @ref ITensorInfo of the workload argument
+ * @param[in] mem_desc @ref MemoryDescriptor of the workload argument
+ * @param[in] kernel_arg_info @ref GpuKernelArgumentInfo of the workload argument
+ */
+ GpuWorkloadArgument(const ITensorInfo &tensor_info,
+ const MemoryDescriptor &mem_desc,
+ const GpuKernelArgumentInfo &kernel_arg_info)
+ : _tensor_info{ tensor_info },
+ _mem_desc{ mem_desc },
+ _kernel_arg_info{ kernel_arg_info }
+ {
+ }
+ /** Get tensor id within workload */
+ ITensorInfo::Id id() const
+ {
+ return _tensor_info.id();
+ }
+ /** Get @ref ITensorInfo of the argument */
+ ITensorInfo *tensor_info()
+ {
+ return &_tensor_info;
+ }
+ /** Get @ref ITensorInfo of the argument */
+ const ITensorInfo *tensor_info() const
+ {
+ return &_tensor_info;
+ }
+ /** Get @ref MemoryDescriptor of the argument */
+ MemoryDescriptor *memory_descriptor()
+ {
+ return &_mem_desc;
+ }
+ /** Get @ref MemoryDescriptor of the argument */
+ const MemoryDescriptor *memory_descriptor() const
+ {
+ return &_mem_desc;
+ }
+ /** Get @ref GpuKernelArgumentInfo of the argument */
+ GpuKernelArgumentInfo *kernel_argument_info()
+ {
+ return &_kernel_arg_info;
+ }
+ /** Get @ref GpuKernelArgumentInfo of the argument */
+ const GpuKernelArgumentInfo *kernel_argument_info() const
+ {
+ return &_kernel_arg_info;
+ }
+ /** Check if the workload argument has valid id
+ *
+ * @return true If has valid id
+ * @return false Otherwise
+ */
+ bool has_valid_id() const
+ {
+ return _tensor_info.has_valid_id();
+ }
+
+private:
+ TensorInfo _tensor_info{};
+ MemoryDescriptor _mem_desc{};
+ GpuKernelArgumentInfo _kernel_arg_info{};
+};
+
+/** Describes when a unit workload is run.
+ */
+struct UnitWorkloadStage
+{
+ enum class Stage
+ {
+ Prepare, /**< Only run once at the beginning. */
+ Run, /**< Run every time after the first time. */
+ };
+ Stage stage{ Stage::Run };
+};
+
+inline bool operator==(const UnitWorkloadStage &stage0, const UnitWorkloadStage &stage1)
+{
+ return stage0.stage == stage1.stage;
+}
+
+/** The atomic unit in a Gpu workload. It contains exactly one kernel to run.
+ */
+class GpuUnitWorkload
+{
+public:
+ /** Default constructor */
+ GpuUnitWorkload() = default;
+ /** Constructor
+ *
+ * @param[in] id Id that uniquely identifies this unit workload in a workload
+ * @param[in] kernel_code @ref GpuKernelSourceCode contained within
+ * @param[in] stage Stage of the unit workload
+ */
+ GpuUnitWorkload(UnitWorkloadId id, const GpuKernelSourceCode &kernel_code, const UnitWorkloadStage &stage)
+ : _id{ id }, _kernel_code{ kernel_code }, _stage{ stage }
+ {
+ }
+ /** Get the id of the unit workload */
+ UnitWorkloadId id() const
+ {
+ return _id;
+ }
+ /** Get reference to the underlying @ref GpuKernelSourceCode */
+ const GpuKernelSourceCode &code() const
+ {
+ return _kernel_code;
+ }
+ /** Get the stage of the unit workload */
+ UnitWorkloadStage stage() const
+ {
+ return _stage;
+ }
+
+private:
+ UnitWorkloadId _id{};
+ GpuKernelSourceCode _kernel_code{};
+ UnitWorkloadStage _stage{};
+};
+
+/** Hold the generated kernel source code and other information required to compile and run the workload.
+ */
+class GpuWorkloadSourceCode
+{
+public:
+ /** Default constructor */
+ GpuWorkloadSourceCode() = default;
+ /** Add a unit workload to the workload code
+ *
+ * @param[in] kernel_code @ref GpuKernelSourceCode to be contained within the unit workload
+ * @param[in] stage Stage of the unit workload
+ * @param[in] mem_map @ref MemoryDescriptor map for all tensors within the unit workload
+ *
+ * @return UnitWorkloadId Allocated unit workload id
+ */
+ UnitWorkloadId add_unit_workload(const GpuKernelSourceCode &kernel_code, const UnitWorkloadStage &stage, const MemoryDescriptorMap &mem_map)
+ {
+ // Use the current number of unit workloads as the Id
+ const auto uwk_id = static_cast<UnitWorkloadId>(_unit_workloads.size());
+ const auto unit_work = GpuUnitWorkload(uwk_id, kernel_code, stage);
+ _unit_workloads.push_back(unit_work);
+ // Assemble kernel argument with memory descriptor to form workload argument
+ for(const auto &id_arg : kernel_code.arguments())
+ {
+ const auto arg_id = id_arg.first;
+ const auto arg = id_arg.second;
+ _workload_arguments[arg_id] = GpuWorkloadArgument{ *arg.tensor_info(), mem_map.at(arg_id), *arg.kernel_argument_info() };
+ if(_tensor_uwork_map.find(arg_id) == _tensor_uwork_map.end())
+ {
+ _tensor_uwork_map[arg_id] = std::set<UnitWorkloadId>();
+ }
+ _tensor_uwork_map[arg_id].insert(uwk_id);
+ }
+ return uwk_id;
+ }
+ /** Get a unit workload from its id */
+ const GpuUnitWorkload &query_unit_workload(UnitWorkloadId id) const
+ {
+ ARM_COMPUTE_ERROR_ON(id < 0);
+ return _unit_workloads.at(id);
+ }
+ /** Get all unit workloads sorted in topological order */
+ std::vector<UnitWorkloadId> unit_workloads() const
+ {
+ std::vector<UnitWorkloadId> ids{};
+
+ for(const auto &uwk : _unit_workloads)
+ {
+ ids.push_back(uwk.id());
+ }
+ return ids;
+ }
+ /** Get a @ref GpuWorkloadArgument from its associated tensor id */
+ const GpuWorkloadArgument *query_tensor(ITensorInfo::Id t_id) const
+ {
+ return &_workload_arguments.at(t_id);
+ }
+ /** Get all tensors in the entire workload */
+ std::vector<ITensorInfo::Id> tensors() const
+ {
+ std::vector<ITensorInfo::Id> ids{};
+ for(const auto &id_tensor : _workload_arguments)
+ {
+ ids.push_back(id_tensor.first);
+ }
+ return ids;
+ }
+ /** Get all unit workloads connected to the tensor with @p t_id */
+ std::vector<UnitWorkloadId> get_unit_workloads_from_tensor(ITensorInfo::Id t_id) const
+ {
+ const auto unit_work_set = _tensor_uwork_map.at(t_id);
+ return std::vector<UnitWorkloadId>(unit_work_set.begin(), unit_work_set.end());
+ }
+
+private:
+ std::vector<GpuUnitWorkload> _unit_workloads{};
+ std::map<ITensorInfo::Id, GpuWorkloadArgument> _workload_arguments{};
+ std::map<ITensorInfo::Id, std::set<UnitWorkloadId>> _tensor_uwork_map{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSOURCECODE */
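An illustrative assembly-and-traversal sketch, with `kernel_code` and `mem_map` assumed to come from the kernel writer and the sketch respectively; each logical kernel contributes one unit workload, and a runtime later walks unit_workloads() in insertion order:

GpuWorkloadSourceCode workload_code{};
const UnitWorkloadId id = workload_code.add_unit_workload(kernel_code, UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }, mem_map);
for(UnitWorkloadId uwk_id : workload_code.unit_workloads())
{
    const GpuUnitWorkload &uwk = workload_code.query_unit_workload(uwk_id);
    // compile uwk.code() and bind each tensor returned by workload_code.query_tensor(...)
}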
diff --git a/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h b/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h
new file mode 100644
index 0000000000..ae67790b4b
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_IGPUKERNELWRITER
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_IGPUKERNELWRITER
+
+#include "arm_compute/core/CL/CLCompileContext.h"
+#include "arm_compute/core/Window.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+
+#include <map>
+#include <string>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** An interface that can write a gpu kernel
+ */
+class IGpuKernelWriter
+{
+public:
+ /** Destructor */
+ virtual ~IGpuKernelWriter()
+ {
+ }
+ /** Generate kernel name */
+ virtual std::string get_name() = 0;
+ /** Generate kernel code */
+ virtual std::string get_code() = 0;
+ /** Generate build options */
+ virtual CLBuildOptions get_build_options() = 0;
+ /** Generate config id string of the entire kernel. This is used for tuning */
+ virtual std::string get_config_id() = 0;
+ /** Generate execution window */
+ virtual Window get_window() const = 0;
+ /** Get the kernel argument list of the kernel */
+ virtual std::map<ITensorInfo::Id, GpuKernelArgument> get_tensors() = 0;
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_IGPUKERNELWRITER */
diff --git a/src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h b/src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h
new file mode 100644
index 0000000000..f7f0029618
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_GPUKERNELCOMPONENTFACTORY
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_GPUKERNELCOMPONENTFACTORY
+
+#include "Types.h"
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+#include <memory>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Factory class that creates new instances of @ref IGpuKernelComponent by assigning new component ids
+ */
+class GpuKernelComponentFactory
+{
+public:
+ /** Create a new kernel component
+ *
+ * @tparam T Any polymorphic type descending from @ref IGpuKernelComponent
+ * @tparam Args Argument types to construct the kernel component
+ *
+ * @param[in] args Arguments to construct the kernel component
+ *
+ * @return std::unique_ptr<IGpuKernelComponent>
+ */
+ template <typename T, typename... Args>
+ std::unique_ptr<IGpuKernelComponent> create(Args &&... args)
+ {
+ return std::make_unique<T>(_count++, std::forward<Args>(args)...);
+ }
+
+private:
+ ComponentId _count{ 0 };
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_GPUKERNELCOMPONENTFACTORY */
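A minimal sketch of the factory in use, assuming the constructor arguments come from the validated operator; the factory prepends a fresh ComponentId to whatever is passed in:

GpuKernelComponentFactory factory{};
auto conv2d = factory.create<ClComponentDirectConv2d>(properties, tensors, attributes, settings);
auto store  = factory.create<ClComponentStore>(properties, store_tensors);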
diff --git a/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h b/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h
new file mode 100644
index 0000000000..8bb19155a2
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_IGPUKERNELCOMPONENT
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_IGPUKERNELCOMPONENT
+
+#include "Types.h"
+
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Properties common to all kernel component types */
+class KernelProperties
+{
+public:
+ KernelProperties &stage(const UnitWorkloadStage &stage)
+ {
+ _stage = stage;
+ return *this;
+ }
+ UnitWorkloadStage stage() const
+ {
+ return _stage;
+ }
+
+private:
+ UnitWorkloadStage _stage{};
+};
+
+inline bool operator==(const KernelProperties &config0, const KernelProperties &config1)
+{
+ return config0.stage() == config1.stage();
+}
+
+/** Forward declaration */
+class IGpuTemplateComponentWriter;
+
+/** An abstract interface of a component. It enables manipulation by the component graph for purposes like fusion
+ */
+class IGpuKernelComponent
+{
+public:
+ using Properties = KernelProperties;
+
+public:
+ /** Constructor
+ *
+ * @param[in] id Component id
+ * @param[in] properties Kernel component properties
+ * @param[in] tensors Tensor arguments to the components
+ */
+ IGpuKernelComponent(
+ ComponentId id,
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors)
+ : _id{ id },
+ _properties{ properties },
+ _tensors{ tensors }
+ {
+ }
+ /** Destructor */
+ virtual ~IGpuKernelComponent()
+ {
+ }
+ /** Get component id */
+ ComponentId id() const
+ {
+ return _id;
+ }
+ /** Get tensor arguments */
+ ArgumentPack<ITensorInfo> tensors() const
+ {
+ return _tensors;
+ }
+ /** Get properties */
+ Properties properties() const
+ {
+ return _properties;
+ }
+ /** Get template writer for the component */
+ virtual const IGpuTemplateComponentWriter *template_writer() const = 0;
+ /** Get component type */
+ virtual GpuComponentType type() const = 0;
+
+private:
+ ComponentId _id{ -1 };
+ Properties _properties{};
+ ArgumentPack<ITensorInfo> _tensors{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_IGPUKERNELCOMPONENT */
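A minimal sketch of configuring component properties before handing them to a concrete component; here only the unit workload stage is set:

const KernelProperties properties = KernelProperties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });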
diff --git a/src/dynamic_fusion/sketch/gpu/components/Types.h b/src/dynamic_fusion/sketch/gpu/components/Types.h
new file mode 100644
index 0000000000..54b3a69057
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/Types.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_TYPES
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_TYPES
+
+#include <cstdint>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Uniquely identifies a kernel component within a workload
+ */
+using ComponentId = int32_t;
+
+/** Component type in the context of fusion
+ * Its main purpose is to inform the optimizer how to perform fusion.
+ */
+enum class GpuComponentType
+{
+ Complex,
+ Simple,
+ Unfusable,
+ Output
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_TYPES */
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp
new file mode 100644
index 0000000000..e94cfd1581
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ClComponentDirectConv2d.h"
+
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/dynamic_fusion/sketch/OperatorAttributes.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+using Settings = ClComponentDirectConv2dSettings;
+
+Settings &Settings::export_to_cl_image(bool cl_image)
+{
+ _export_to_cl_image = cl_image;
+ return *this;
+}
+
+bool Settings::export_to_cl_image() const
+{
+ return _export_to_cl_image;
+}
+
+Settings &Settings::fast_relaxed_math(bool fast_relaxed_math)
+{
+ _fast_relaxed_math = fast_relaxed_math;
+ return *this;
+}
+
+bool Settings::fast_relaxed_math() const
+{
+ return _fast_relaxed_math;
+}
+
+Status ClComponentDirectConv2d::validate(
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings)
+{
+ ARM_COMPUTE_UNUSED(properties, settings);
+ const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ const auto wei = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ const auto bia = tensors.get_const_tensor(TensorType::ACL_SRC_2);
+ const auto dst = tensors.get_const_tensor(TensorType::ACL_DST_0);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, wei, dst);
+
+ // 1. Check validity
+ // Matching data type
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, wei);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ if(bia != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bia);
+ }
+
+ // Matching data layout
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, wei);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
+ if(bia != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, bia);
+ }
+
+ // All tensor infos are initialized
+ ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(wei->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
+ if(bia != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(bia->tensor_shape().total_size() == 0);
+ }
+ // Device requirements are met
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+ // wei shape is correct
+ const DataLayout data_layout = src->data_layout();
+ const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(wei->dimension(channel_idx) != src->dimension(channel_idx), "Weights feature map dimension should match the respective src's one");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(wei->num_dimensions() > 4, "Weights can be at most 4 dimensional");
+
+ // dst shape is correct
+ PadStrideInfo legacy_pad_stride(attributes.stride().x(), attributes.stride().y(), attributes.pad().left, attributes.pad().right, attributes.pad().top,
+ attributes.pad().bottom, DimensionRoundingType{});
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(),
+ misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, legacy_pad_stride));
+
+ // bia shape is correct
+ if(bia != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(bia->dimension(0) != wei->dimension(3),
+ "Biases size and number of dst feature maps should match");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(bia->num_dimensions() > 1,
+ "Biases should be one dimensional");
+ }
+
+ // 2. Check support level
+ // Data type
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+ // Data layout
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC);
+
+ return Status{};
+}
+
+ClComponentDirectConv2d::ClComponentDirectConv2d(
+ ComponentId id,
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings)
+ : IGpuKernelComponent{ id, properties, tensors },
+ _component_writer{ std::make_unique<ClTemplateDirectConv2d>(id, tensors, attributes, settings) }
+{
+}
+ClComponentDirectConv2d::~ClComponentDirectConv2d()
+{
+}
+const IGpuTemplateComponentWriter *ClComponentDirectConv2d::template_writer() const
+{
+ return _component_writer.get();
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
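A minimal validation sketch, assuming the tensor infos and Conv2dAttributes come from the calling operator code; the argument pack uses the tensor names documented in the component header:

ArgumentPack<ITensorInfo> tensors;
tensors.add_const_tensor(ACL_SRC_0, &src_info);
tensors.add_const_tensor(ACL_SRC_1, &wei_info);
tensors.add_const_tensor(ACL_SRC_2, &bia_info); // bias is optional and may be omitted
tensors.add_const_tensor(ACL_DST_0, &dst_info);
const Status status = ClComponentDirectConv2d::validate(properties, tensors, conv2d_attributes, ClComponentDirectConv2dSettings{});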
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h
new file mode 100644
index 0000000000..fec22b84a5
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDIRECTCONV2D
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDIRECTCONV2D
+
+#include "arm_compute/core/Error.h"
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+#include <memory>
+
+namespace arm_compute
+{
+/** Forward declaration */
+class ITensorInfo;
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+template <typename T>
+class ArgumentPack;
+class Conv2dAttributes;
+
+/** Component specific settings
+ */
+class ClComponentDirectConv2dSettings
+{
+public:
+ /** Set export_to_cl_image flag */
+ ClComponentDirectConv2dSettings &export_to_cl_image(bool cl_image);
+ /** Get export_to_cl_image flag */
+ bool export_to_cl_image() const;
+
+ /** Set fast_relaxed_math flag */
+ ClComponentDirectConv2dSettings &fast_relaxed_math(bool fast_relaxed_math);
+ /** Get fast_relaxed_math flag */
+ bool fast_relaxed_math() const;
+
+private:
+ bool _export_to_cl_image{ false };
+ bool _fast_relaxed_math{ true };
+};
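+
+// Usage sketch (illustrative only): both setters return a reference to the settings object, so they
+// can be chained, e.g.
+//
+//   ClComponentDirectConv2dSettings settings{};
+//   settings.export_to_cl_image(true).fast_relaxed_math(false);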
+
+/** Forward declaration */
+class ClTemplateDirectConv2d;
+
+class ClComponentDirectConv2d final : public IGpuKernelComponent
+{
+public:
+ /** Attributes are a set of backend-agnostic parameters that define what a component does */
+ using Attributes = Conv2dAttributes;
+ /** Settings are a set of backend-specific parameters that influence the implementation of a component */
+ using Settings = ClComponentDirectConv2dSettings;
+
+public:
+ /** Validate the component
+ *
+ * @param[in] properties Component properties
+ * @param[in,out] tensors Tensor arguments to the component
+ * @param[in] attributes Component attributes
+ * @param[in] settings Component settings
+ *
+ * @return Status Validation results
+ *
+ * Tensor argument names:
+ * - ACL_SRC_0: Input
+ * - ACL_SRC_1: Weight
+ * - ACL_SRC_2: Bias (Optional)
+ * - ACL_DST_0: Output
+ *
+ * Tensor argument constness:
+ * - ACL_SRC_0: Const
+ * - ACL_SRC_1: Const
+ * - ACL_SRC_2: Const
+ * - ACL_DST_0: Const
+ *
+ * Valid data layouts:
+ * - NHWC
+ *
+ * Valid data type configurations:
+ * |ACL_SRC_0 |ACL_SRC_1 |ACL_SRC_2 |ACL_DST_0 |
+ * |:--------------|:--------------|:--------------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ */
+ static Status validate(
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings);
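+ // Example (illustrative) of how the tensor arguments are packed for validate(), mirroring what
+ // GpuConv2d::validate_op() does:
+ //
+ //   ArgumentPack<ITensorInfo> tensors;
+ //   tensors.add_const_tensor(ACL_SRC_0, src);
+ //   tensors.add_const_tensor(ACL_SRC_1, wei);
+ //   tensors.add_const_tensor(ACL_SRC_2, bia); // bia may be nullptr
+ //   tensors.add_const_tensor(ACL_DST_0, dst);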
+
+ /** Constructor
+ *
+ * Similar to @ref ClComponentDirectConv2d::validate()
+ */
+ ClComponentDirectConv2d(
+ ComponentId id,
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings);
+
+ /** Destructor */
+ ~ClComponentDirectConv2d() override;
+ /** Prevent instances of this class from being copy constructed */
+ ClComponentDirectConv2d(const ClComponentDirectConv2d &component) = delete;
+ /** Prevent instances of this class from being copied */
+ ClComponentDirectConv2d &operator=(const ClComponentDirectConv2d &component) = delete;
+ /** Allow instances of this class to be move constructed */
+ ClComponentDirectConv2d(ClComponentDirectConv2d &&component) = default;
+ /** Allow instances of this class to be moved */
+ ClComponentDirectConv2d &operator=(ClComponentDirectConv2d &&component) = default;
+ /** Get template writer for the component */
+ const IGpuTemplateComponentWriter *template_writer() const override;
+ /** Get component type */
+ GpuComponentType type() const override
+ {
+ return GpuComponentType::Complex;
+ }
+
+private:
+ std::unique_ptr<ClTemplateDirectConv2d> _component_writer;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDIRECTCONV2D */
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp
new file mode 100644
index 0000000000..f49f397ec1
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ClComponentStore.h"
+
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+Status ClComponentStore::validate(
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors)
+{
+ ARM_COMPUTE_UNUSED(properties, tensors);
+ return Status{};
+}
+ClComponentStore::ClComponentStore(ComponentId id, const Properties &properties, const ArgumentPack<ITensorInfo> &tensors)
+ : IGpuKernelComponent{ id, properties, tensors }, _component_writer{ std::make_unique<ClTemplateStore>(id, tensors) }
+{
+}
+ClComponentStore::~ClComponentStore()
+{
+}
+const IGpuTemplateComponentWriter *ClComponentStore::template_writer() const
+{
+ return _component_writer.get();
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h
new file mode 100644
index 0000000000..bf8c9f031e
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTSTORE
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTSTORE
+
+#include "arm_compute/core/Error.h"
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h"
+#include <memory>
+
+namespace arm_compute
+{
+/** Forward declaration */
+class ITensorInfo;
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+template <typename T>
+class ArgumentPack;
+
+class ClComponentStore final : public IGpuKernelComponent
+{
+public:
+ /** Validate the component
+ *
+ * @param[in] properties Component properties
+ * @param[in] tensors Tensor arguments to the component
+ *
+ * @return Status Validation results
+ *
+ * Tensor argument names:
+ * - ACL_SRC_0: Input
+ * - ACL_DST_0: Output
+ *
+ * Tensor argument constness:
+ * - ACL_SRC_0: Const
+ * - ACL_DST_0: Const
+ *
+ * Valid data layouts:
+ * - NHWC
+ *
+ * Valid data type configurations:
+ * |ACL_SRC_0 |ACL_DST_0 |
+ * |:--------------|:--------------|
+ * |All |All |
+ */
+ static Status validate(
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors);
+ /** Constructor
+ *
+ * Similar to @ref ClComponentStore::validate()
+ */
+ ClComponentStore(ComponentId id, const Properties &properties, const ArgumentPack<ITensorInfo> &tensors);
+ /** Destructor */
+ ~ClComponentStore() override;
+ /** Prevent instances of this class from being copy constructed */
+ ClComponentStore(const ClComponentStore &component) = delete;
+ /** Prevent instances of this class from being copied */
+ ClComponentStore &operator=(const ClComponentStore &component) = delete;
+ /** Allow instances of this class to be move constructed */
+ ClComponentStore(ClComponentStore &&component) = default;
+ /** Allow instances of this class to be moved */
+ ClComponentStore &operator=(ClComponentStore &&component) = default;
+ /** Get template writer for the component */
+ const IGpuTemplateComponentWriter *template_writer() const override;
+ /** Get component type */
+ GpuComponentType type() const override
+ {
+ return GpuComponentType::Output;
+ }
+
+private:
+ std::unique_ptr<ClTemplateStore> _component_writer;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTSTORE */
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp
new file mode 100644
index 0000000000..98c1cc3939
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp
@@ -0,0 +1,255 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h"
+
+#include "arm_compute/core/CL/CLCompileContext.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"
+#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+bool export_to_cl_image_support(const ITensorInfo *tensor, GPUTarget gpu_target, const cl::Device &device, DataLayout data_layout)
+{
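+ // Only NHWC tensors whose channel count is a multiple of 4 can be exported to a cl_image,
+ // since a cl_image texel packs 4 elements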
+ if(tensor->tensor_shape()[0] % 4 || (data_layout != DataLayout::NHWC))
+ {
+ return false;
+ }
+
+ // If not floating point
+ if(!is_data_type_float(tensor->data_type()))
+ {
+ return false;
+ }
+
+ if(gpu_target == GPUTarget::G71 || get_arch_from_target(gpu_target) == GPUTarget::MIDGARD)
+ {
+ return false;
+ }
+
+ // Check if the cl_khr_image2d_from_buffer extension is supported on the target platform
+ if(!image2d_from_buffer_supported(device))
+ {
+ return false;
+ }
+
+ // Check cl image pitch alignment
+ if(get_cl_image_pitch_alignment(device) == 0)
+ {
+ return false;
+ }
+
+ const size_t image_w = tensor->tensor_shape()[0] / 4;
+ const size_t image_h = tensor->tensor_shape()[1] * tensor->tensor_shape()[2] * tensor->tensor_shape()[3];
+ const size_t max_image_w = device.getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>();
+ const size_t max_image_h = device.getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>();
+
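+ // Example (illustrative): an NHWC tensor of shape [C = 64, W = 3, H = 3, N = 32] maps to a
+ // cl_image of 64 / 4 = 16 texels per row and 3 * 3 * 32 = 288 rows, which must not exceed the
+ // device limits queried above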
+ if(image_w > max_image_w || image_h > max_image_h)
+ {
+ return false;
+ }
+
+ return true;
+}
+
+GpuOperatorType operator_type = GpuOperatorType::Complex;
+} // namespace
+
+Status GpuConv2d::validate_op(const GpuWorkloadSketch &sketch,
+ const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const ITensorInfo *bia,
+ const ITensorInfo *dst,
+ const Conv2dAttributes &attributes)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, wei, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ !src->has_valid_id() || !wei->has_valid_id() || !dst->has_valid_id());
+ if(bia != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(!bia->has_valid_id());
+ }
+
+ // Perform fusion test
+ // Pack tensor infos
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC_0, src);
+ tensors.add_const_tensor(ACL_SRC_1, wei);
+ tensors.add_const_tensor(ACL_SRC_2, bia);
+ tensors.add_const_tensor(ACL_DST_0, dst);
+ const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!sketch.implementation().operator_group().try_add_operator(op),
+ "Operator fusion test failed. This operator cannot be fused into the workload");
+
+ // Auto initialize dst tensor info
+ TensorInfo dst_info_to_validate = *dst;
+ const auto data_layout = src->data_layout();
+
+ {
+ auto shape = misc::shape_calculator::compute_deep_convolution_shape(src->tensor_shape(), data_layout, wei->tensor_shape(),
+ PadStrideInfo(attributes.stride().x(), attributes.stride().y(), attributes.pad().left,
+ attributes.pad().right,
+ attributes.pad().top, attributes.pad().bottom, DimensionRoundingType::FLOOR)); // use the default DimensionRoundingType
+
+ auto_init_if_empty(dst_info_to_validate, src->clone()->set_tensor_shape(shape));
+ }
+
+ // Check support level
+ // Data type
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+ // Data layout
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC);
+
+ const auto sketch_ctx = sketch.implementation().context();
+
+ const auto gpu_target = sketch_ctx->gpu_target();
+
+ if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+ {
+ const auto cl_compile_ctx = sketch_ctx->cl_compile_context();
+ ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr);
+ // Validate Direct Conv2d Component
+ {
+ const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
+ auto settings = ClComponentDirectConv2d::Settings();
+
+ settings.export_to_cl_image(
+ export_to_cl_image_support(src, gpu_target, cl_compile_ctx->get_device(), data_layout));
+
+ settings.fast_relaxed_math(
+ (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST)
+ && (dst_info_to_validate.data_type() == DataType::F32 || dst_info_to_validate.data_type() == DataType::F16));
+
+ ArgumentPack<ITensorInfo> arguments;
+ arguments.add_const_tensor(ACL_SRC_0, src);
+ arguments.add_const_tensor(ACL_SRC_1, wei);
+ arguments.add_const_tensor(ACL_SRC_2, bia);
+ arguments.add_const_tensor(ACL_DST_0, &dst_info_to_validate);
+ ARM_COMPUTE_RETURN_ON_ERROR(ClComponentDirectConv2d::validate(properties, arguments, attributes, settings));
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_MSG("Unimplemented Gpu language");
+ }
+ return Status{};
+}
+
+void GpuConv2d::create_op(GpuWorkloadSketch &sketch,
+ ITensorInfo *src,
+ ITensorInfo *wei,
+ ITensorInfo *bia,
+ ITensorInfo *dst,
+ const Conv2dAttributes &attributes)
+{
+ // Assert validation
+ ARM_COMPUTE_ERROR_THROW_ON(GpuConv2d::validate_op(sketch, src, wei, bia, dst, attributes));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, wei, dst);
+ const auto data_layout = src->data_layout();
+
+ // Auto initialize dst tensor
+ {
+ auto shape = misc::shape_calculator::compute_deep_convolution_shape(src->tensor_shape(), data_layout, wei->tensor_shape(),
+ PadStrideInfo(attributes.stride().x(), attributes.stride().y(), attributes.pad().left,
+ attributes.pad().right,
+ attributes.pad().top, attributes.pad().bottom, DimensionRoundingType::FLOOR)); // use the default DimensionRoundingType
+
+ auto_init_if_empty(*dst, src->clone()->set_tensor_shape(shape));
+ }
+
+ // Translate into components and add to component graph
+ auto &comp_graph = sketch.implementation().component_graph();
+
+ const auto sketch_ctx = sketch.implementation().context();
+
+ const auto gpu_target = sketch_ctx->gpu_target();
+
+ if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+ {
+ const auto cl_compile_ctx = sketch_ctx->cl_compile_context();
+ ARM_COMPUTE_ERROR_ON(cl_compile_ctx == nullptr);
+
+ // Add Direct Conv2d Component
+ {
+ auto properties = IGpuKernelComponent::Properties();
+ properties.stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
+
+ auto settings = ClComponentDirectConv2d::Settings();
+
+ settings.export_to_cl_image(
+ export_to_cl_image_support(src, gpu_target, cl_compile_ctx->get_device(), data_layout));
+
+ settings.fast_relaxed_math(
+ (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST)
+ && (dst->data_type() == DataType::F32 || dst->data_type() == DataType::F16));
+
+ if(settings.export_to_cl_image())
+ {
+ arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(wei);
+ }
+
+ ArgumentPack<ITensorInfo> arguments;
+ arguments.add_const_tensor(ACL_SRC_0, src);
+ arguments.add_const_tensor(ACL_SRC_1, wei);
+ arguments.add_const_tensor(ACL_SRC_2, bia);
+ arguments.add_const_tensor(ACL_DST_0, dst);
+ comp_graph.add_new_component<ClComponentDirectConv2d>(properties, arguments, attributes, settings);
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unimplemented Gpu language");
+ }
+
+ // Set up fusion test by adding to the Operator Group
+ // Note this has to be performed after all the components have been successfully added to the component graph
+
+ // Pack tensor infos
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC_0, src);
+ tensors.add_tensor(ACL_SRC_1, wei);
+ tensors.add_const_tensor(ACL_SRC_2, bia);
+ tensors.add_tensor(ACL_DST_0, dst);
+
+ const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+ sketch.implementation().operator_group().add_operator(op);
+}
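+
+// A usage sketch (illustrative; it assumes a GpuWorkloadContext and the tensor infos are created
+// elsewhere, and that Conv2dAttributes exposes fluent setters as declared in OperatorAttributes.h):
+//
+//   GpuWorkloadSketch sketch{ &context };
+//   const Conv2dAttributes attributes = Conv2dAttributes().stride({ 1, 1 }).pad({ 0, 0, 0, 0 });
+//   const Status status = GpuConv2d::validate_op(sketch, &src, &wei, &bia, &dst, attributes);
+//   if(status.error_code() == ErrorCode::OK)
+//   {
+//       GpuConv2d::create_op(sketch, &src, &wei, &bia, &dst, attributes);
+//   }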
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp
new file mode 100644
index 0000000000..13c0b141a5
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "GpuKernelVariableTable.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/ITensorInfo.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+void GpuKernelVariableTable::declare_variable(const ITensorInfo *tensor, GpuKernelArgumentInfo argument_info, bool is_interm, const std::string &alias)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(!tensor->has_valid_id(), "Tensor info with valid id expected");
+ // Do not re-declare if the variable associated with the tensor has already been declared
+ if(get_variable(tensor).has_valid_id())
+ {
+ ARM_COMPUTE_ERROR_ON(!(get_variable(tensor).kernel_argument_info == argument_info));
+ return;
+ }
+ // Declare variable associated with the tensor
+ std::stringstream ss;
+ ss << alias << "_t" << tensor->id();
+ const auto uniq_name = ss.str();
+ TensorVariable var{ tensor->id(), uniq_name, argument_info };
+
+ if(is_interm)
+ {
+ _interm_var = var;
+ _interm_tensors.insert(tensor->id());
+ }
+ else
+ {
+ _vars.emplace(tensor->id(), var);
+ }
+}
+
+GpuKernelVariableTable::TensorVariable GpuKernelVariableTable::get_variable(const ITensorInfo *tensor) const
+{
+ const TensorVariable empty_var{};
+ if(_vars.find(tensor->id()) != _vars.end())
+ {
+ return _vars.at(tensor->id());
+ }
+ if(_interm_tensors.find(tensor->id()) != _interm_tensors.end())
+ {
+ return _interm_var;
+ }
+ return empty_var;
+}
+
+GpuKernelVariableTable::VariableList GpuKernelVariableTable::get_variable_list(const std::vector<const ITensorInfo *> &tensors) const
+{
+ VariableList vars{};
+ for(const auto &tensor : tensors)
+ {
+ if(!tensor->has_valid_id())
+ {
+ continue;
+ }
+ vars.push_back(get_variable(tensor));
+ }
+ return vars;
+}
+
+TagVal::TagVal(const GpuKernelVariableTable::TensorVariable &var)
+ : value{ var.uniq_name }
+{
+}
+
+TagVal::TagVal(const std::string &val)
+ : value{ val }
+{
+}
+
+TagVal::TagVal(const char *val)
+ : value{ std::string(val) }
+{
+}
+
+TagVal::TagVal(const DataType &data_type)
+ : value{ get_cl_type_from_data_type(data_type) }
+{
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h b/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h
new file mode 100644
index 0000000000..4eee3963c2
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_GPUKERNELVARIABLETABLE
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_GPUKERNELVARIABLETABLE
+
+#include "arm_compute/core/ITensorInfo.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "support/Requires.h"
+#include "support/StringSupport.h"
+
+#include <set>
+#include <string>
+#include <type_traits>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** A table of all the variables used in the kernel
+ * Since fusion is restricted to a linear sequence of components in a kernel, only a single "intermediate variable" (the accumulator) is allowed.
+ * Each kernel has exactly one variable table
+ */
+class GpuKernelVariableTable
+{
+public:
+ /** A tensor variable whose main purposes are:
+ * - Hold the newly assigned @ref GpuKernelArgumentInfo for the associated tensor info
+ * - Hold the generated variable name for the associated tensor info
+ */
+ struct TensorVariable
+ {
+ public:
+ TensorVariable() = default;
+ TensorVariable(const TensorVariable &) = default;
+ TensorVariable &operator=(const TensorVariable &) = default;
+ ITensorInfo::Id id{ ITensorInfo::invalid_tensor_id };
+ std::string uniq_name{ "empty" }; // Unique name, also the final variable name used in the built code
+ GpuKernelArgumentInfo kernel_argument_info{};
+ bool has_valid_id() const
+ {
+ return id != ITensorInfo::invalid_tensor_id;
+ }
+ };
+ using VariableList = std::vector<TensorVariable>;
+
+public:
+ /** Declare a @ref TensorVariable for a corresponding tensor info.
+ *
+ * @note: Later re-declaration of the intermediate variable will overwrite the previous association to the @ref ITensorInfo
+     * Therefore, the order of declaration is important. It is assumed that the components declaring the variable are already in the correct order
+ *
+ * @param[in] tensor Tensor info with which the new variable is associated
+ * @param[in] argument_info Kernel argument information
+ * @param[in] is_interm If the new variable is an intermediate variable
+ * @param[in] alias Alias for the variable. Will be used as part of the variable name
+ */
+ void declare_variable(const ITensorInfo *tensor, GpuKernelArgumentInfo argument_info, bool is_interm = false, const std::string &alias = "unnamed");
+ /** Get the @ref TensorVariable associated with @p tensor
+ *
+ * @param[in] tensor Tensor info to be queried
+ *
+ * @return TensorVariable
+ */
+ TensorVariable get_variable(const ITensorInfo *tensor) const;
+ /** Get the @ref TensorVariable list associated with @p tensors
+ * @note Empty tensors are skipped
+ *
+ * @param[in] tensors List of tensor infos to be queried
+ *
+ * @return VariableList
+ */
+ VariableList get_variable_list(const std::vector<const ITensorInfo *> &tensors) const;
+
+private:
+ std::map<ITensorInfo::Id, TensorVariable> _vars{}; /**< Non-intermediate (function parameter) variables*/
+ TensorVariable _interm_var{}; /**< Intermediate variable */
+ std::set<ITensorInfo::Id> _interm_tensors{}; /**< Tensors associated with the single intermediate variable */
+};
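+
+// Usage sketch (illustrative; assumes a valid, non-intermediate ITensorInfo *src):
+//
+//   GpuKernelVariableTable vtable{};
+//   vtable.declare_variable(src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), false, "src");
+//   const auto var = vtable.get_variable(src); // var.uniq_name is "src_t<id of src>"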
+
+/** A tag value will substitute a tag in a string template during its instantiation */
+struct TagVal
+{
+ /** Default constructor */
+ TagVal() = default;
+ /** Construct a @ref TagVal from a @ref GpuKernelVariableTable::TensorVariable */
+ TagVal(const GpuKernelVariableTable::TensorVariable &var);
+ /** Construct a @ref TagVal from an integral type */
+ template <typename T, ARM_COMPUTE_REQUIRES_TA(std::is_integral<T>::value)>
+ TagVal(T val)
+ : value{ support::cpp11::to_string(val) }
+ {
+ }
+ /** Construct a @ref TagVal from a string */
+ TagVal(const std::string &val);
+ /** Construct a @ref TagVal from a c-style string */
+ TagVal(const char *val);
+ /** Construct a @ref TagVal from a @ref DataType */
+ TagVal(const DataType &data_type);
+    /** The value of the TagVal, converted to a string */
+ std::string value{};
+};
+
+/** A tag used in a string template is a placeholder string to be substituted by real values during template instantiation */
+using Tag = std::string;
+
+/** Tag lookup table. It is used to instantiate a string template */
+using TagLUT = std::unordered_map<Tag, TagVal>;
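+
+// Example (illustrative): for a template fragment such as "TILE({{DATA_TYPE}}, M0, N0, {{dst}});",
+// a lookup table populated as
+//
+//   TagLUT lut{};
+//   lut["DATA_TYPE"] = DataType::F32;            // stored as the string "float"
+//   lut["dst"]       = vtable.get_variable(dst); // stored as the variable's unique name
+//
+// supplies the values substituted for the {{...}} tags when the template is instantiated.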
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_GPUKERNELVARIABLETABLE */
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h b/src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h
new file mode 100644
index 0000000000..c85ddf5a2c
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_IGPUTEMPLATECOMPONENTWRITER
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_IGPUTEMPLATECOMPONENTWRITER
+
+#include "arm_compute/core/CL/CLCompileContext.h"
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/Window.h"
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/components/Types.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+class GpuKernelComponentGroup;
+class GpuKernelVariableTable;
+
+/** An interface used by @ref ClTemplateWriter to write source code for a kernel component
+ */
+class IGpuTemplateComponentWriter
+{
+public:
+ using ComponentGroup = GpuKernelComponentGroup;
+
+public:
+ /** Constructor
+ *
+ * @param[in] id Component id
+     * @param[in] tensors Tensor arguments to the component
+ */
+ IGpuTemplateComponentWriter(ComponentId id, const ArgumentPack<ITensorInfo> tensors)
+ : _id{ id }, _tensors{ tensors }
+ {
+ }
+ /** Destructor */
+ virtual ~IGpuTemplateComponentWriter()
+ {
+ }
+ /** Generate kernel component name */
+ virtual std::string get_name() const = 0;
+ /** Generate kernel component code template
+ *
+     * @param[in] comp_group Component group of which the component is a part
+ *
+ * @return std::string Component code
+ */
+ virtual std::string get_component_code(const ComponentGroup &comp_group) const = 0;
+ /** Declare all variables used by the component in the @p vtable
+ *
+ * @param[out] vtable Variable table
+     * @param[in] comp_group Component group of which the component is a part
+ */
+ virtual void declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const = 0;
+ /** Generate the tag look-up table used to instantiate the component code.
+ *
+ * @param[in] vtable Variable table
+     * @param[in] comp_group Component group of which the component is a part
+ *
+ * @return TagLUT Tag lookup table
+ */
+ virtual TagLUT get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const = 0;
+ /** Generate additional macros used in the component */
+ virtual std::string get_additional_macros() const
+ {
+ return "";
+ }
+ /** Generate the build options used in the component
+ *
+     * @param[in] comp_group Component group of which the component is a part
+ *
+ * @return CLBuildOptions Build options
+ */
+ virtual CLBuildOptions get_build_options(const ComponentGroup &comp_group) const
+ {
+ ARM_COMPUTE_UNUSED(comp_group);
+ return CLBuildOptions{};
+ }
+ /** Generate the component config id string used for tuning */
+ virtual std::string get_config_id() const
+ {
+ return "";
+ }
+ /** Generate the header list used in the component */
+ virtual std::set<std::string> get_headers_list() const
+ {
+ return std::set<std::string> {};
+ }
+ /** Generate the execution window for the component */
+ virtual Window get_window() const
+ {
+ return Window{};
+ }
+ /** Get tensor arguments */
+ ArgumentPack<ITensorInfo> tensors() const
+ {
+ return _tensors;
+ }
+ /** Get component id */
+ ComponentId id() const
+ {
+ return _id;
+ }
+
+private:
+ ComponentId _id{ -1 };
+ ArgumentPack<ITensorInfo> _tensors{};
+};
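+
+// A minimal writer sketch (illustrative only, not part of this patch), showing the four pure
+// virtual methods a concrete writer has to provide:
+//
+//   class ClTemplatePassthrough final : public IGpuTemplateComponentWriter
+//   {
+//   public:
+//       using IGpuTemplateComponentWriter::IGpuTemplateComponentWriter;
+//       std::string get_name() const override { return "passthrough"; }
+//       std::string get_component_code(const ComponentGroup &comp_group) const override
+//       {
+//           ARM_COMPUTE_UNUSED(comp_group);
+//           return "// {{meta_kernel_id}}: pass-through\n";
+//       }
+//       void declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override
+//       {
+//           const auto src = tensors().get_const_tensor(TensorType::ACL_SRC_0);
+//           vtable.declare_variable(src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+//                                   comp_group.is_intermediate_tensor(src), "src");
+//       }
+//       TagLUT get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override
+//       {
+//           ARM_COMPUTE_UNUSED(vtable, comp_group);
+//           TagLUT lut{};
+//           lut["meta_kernel_id"] = id();
+//           return lut;
+//       }
+//   };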
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_IGPUTEMPLATECOMPONENTWRITER */
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp
new file mode 100644
index 0000000000..870de64eb8
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp
@@ -0,0 +1,400 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ClTemplateDirectConv2d.h"
+
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"
+
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+ClTemplateDirectConv2d::ClTemplateDirectConv2d(ComponentId id,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings)
+ : IGpuTemplateComponentWriter{ id, tensors },
+ _src{},
+ _weight{},
+ _bias{},
+ _dst{},
+ _attributes{ attributes },
+ _settings{ settings }
+{
+ _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
+ _weight = this->tensors().get_const_tensor(TensorType::ACL_SRC_1);
+ if(this->tensors().get_const_tensor(TensorType::ACL_SRC_2))
+ {
+ _bias = this->tensors().get_const_tensor(TensorType::ACL_SRC_2);
+ }
+ _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _weight, _dst);
+}
+
+std::string ClTemplateDirectConv2d::get_name() const
+{
+ return "direct_conv2d";
+}
+
+std::string ClTemplateDirectConv2d::get_component_code(const ComponentGroup &comp_group) const
+{
+ ARM_COMPUTE_UNUSED(comp_group);
+
+ const auto channel_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::CHANNEL);
+ const auto k0 = adjust_vec_size(is_data_type_quantized(_src->data_type()) ? 16u : 8u, _src->dimension(channel_idx));
+ const bool leftover_loop = (_src->dimension(channel_idx) % k0) != 0;
+
+ std::string code = R"_(
+//------------------ START KERNEL {{meta_kernel_id}} ---------------------
+// IN_0(src) {{src}}
+// IN_1(wei) {{weight}}
+)_";
+ if(_bias && _bias->has_valid_id())
+ {
+ code += R"_(
+// IN_1(bia) {{bias}}
+)_";
+ }
+ code += R"_(
+// OUT(dst, accum) {{dst}}
+
+// Initialize the accumulators
+TILE({{ACC_DATA_TYPE}}, M0, N0, {{dst}});
+{
+ // All the tensor dimensions are passed at compile time.
+ // In case of dynamic tensor support, the following dimensions should be passed as function argument.
+#define _IWEI_WIDTH {{WEI_WIDTH}}
+#define _IWEI_HEIGHT {{WEI_HEIGHT}}
+#define _ISRC_WIDTH {{src}}_w
+#define _ISRC_HEIGHT {{src}}_h
+#define _ISRC_CHANNELS {{src}}_c
+#define _IDST_WIDTH {{arg_dst}}_w
+#define _IDST_HEIGHT {{arg_dst}}_h
+#define _IDST_CHANNELS {{arg_dst}}_c
+#define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT)
+
+ // .v = access the whole vector (OpenCL vector)
+ // .s[x] = access the vector element at position x (scalar access)
+ TILE(int, M0, 1, xi);
+ TILE(int, M0, 1, yi);
+
+ // Convert the linear index to coordinate
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ xi[i].v = ((g_ind_1 + i) % _IDST_WIDTH) * {{STRIDE_X}};
+ yi[i].v = ((g_ind_1 + i) / _IDST_WIDTH) * {{STRIDE_Y}};
+ xi[i].v -= {{PAD_LEFT}};
+ yi[i].v -= {{PAD_TOP}};
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ {{dst}}[i].v = 0;
+ })
+
+ for(int i = 0; i < (_IWEI_WIDTH * _IWEI_HEIGHT); ++i)
+ {
+ int ck = 0;
+ int xk = i % _IWEI_WIDTH;
+ int yk = i / _IWEI_WIDTH;
+
+ int k = 0;
+ for(; k <= (_ISRC_CHANNELS - K0); k += K0)
+ {
+ TILE({{SRC_DATA_TYPE}}, M0, K0, a);
+ TILE({{WEI_DATA_TYPE}}, N0, K0, b);
+
+ // Initialize tiles
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = {{ZERO_VALUE}};
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ b[i].v = {{ZERO_VALUE}};
+ })
+
+ // Load tile from the src tensor
+ T_LOAD_NHWC_INDIRECT({{SRC_DATA_TYPE}}, M0, K0, {{SRC_TENSOR_TYPE}}, {{src}}, g_ind_2, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, {{src}}_stride_y, xi, yi, a);
+
+ // Load tile from the weights tensor
+ T_LOAD({{WEI_DATA_TYPE}}, N0, K0, {{WEI_TENSOR_TYPE}}, {{weight}}, ck, g_ind_0 * _IY_MULTIPLIER + i, _IY_MULTIPLIER, {{weight}}_stride_y, b);
+
+ // Compute the matrix multiplication between two tiles
+ T_MMUL({{SRC_DATA_TYPE}}, {{WEI_DATA_TYPE}}, {{ACC_DATA_TYPE}}, M0, N0, K0, NT, T, a, b, {{dst}});
+
+ ck += K0;
+ }
+
+        // The left-over channel accumulations below are generated only when the src channel count is not a
+        // multiple of K0 (see leftover_loop); this should be revisited in case of dynamic tensor support
+)_";
+
+ if(leftover_loop)
+ {
+ code += R"_(
+ // Left-over accumulations
+ for(; k < _ISRC_CHANNELS; ++k)
+ {
+ TILE({{SRC_DATA_TYPE}}, M0, 1, a);
+ TILE({{WEI_DATA_TYPE}}, N0, 1, b);
+
+ // Initialize tiles
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = {{ZERO_VALUE}};
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ b[i].v = {{ZERO_VALUE}};
+ })
+
+ // Load tile from the src tensor
+ T_LOAD_NHWC_INDIRECT({{SRC_DATA_TYPE}}, M0, 1, {{SRC_TENSOR_TYPE}}, {{src}}, g_ind_2, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, {{src}}_stride_y, xi, yi, a);
+
+ // Load tile from the weights tensor
+ // The T_LOAD for the left-over elements can only use BUFFER because we load one element per iteration
+ T_LOAD({{WEI_DATA_TYPE}}, N0, 1, BUFFER, {{weight}}, ck, g_ind_0 * _IY_MULTIPLIER + i, _IY_MULTIPLIER, {{weight}}_stride_y, b);
+
+ // Compute the matrix multiplication between two tiles
+ T_MMUL({{SRC_DATA_TYPE}}, {{WEI_DATA_TYPE}}, {{ACC_DATA_TYPE}}, M0, N0, 1, NT, T, a, b, {{dst}});
+
+ ++ck;
+ }
+ )_";
+    }
+
+    code += R"_(
+#undef _IWEI_WIDTH
+#undef _IWEI_HEIGHT
+#undef _ISRC_WIDTH
+#undef _ISRC_HEIGHT
+#undef _ISRC_CHANNELS
+#undef _IDST_WIDTH
+#undef _IDST_HEIGHT
+#undef _IDST_CHANNELS
+#undef _IY_MULTIPLIER
+
+ }
+)_";
+
+ if(_bias && _bias->has_valid_id())
+ {
+ code += R"_(
+ TILE({{BIA_DATA_TYPE}}, 1, N0, bias0);
+
+ T_LOAD({{BIA_DATA_TYPE}}, 1, N0, BUFFER, {{bias}}, g_ind_0, 0, 1, 0, bias0);
+
+ // c = c + bias[broadcasted]
+ T_ELTWISE_BROADCAST_ADD_X({{ACC_DATA_TYPE}}, M0, N0, {{dst}}, bias0, {{dst}});
+ )_";
+    }
+
+    code += R"_(
+}
+//------------------ END KERNEL {{meta_kernel_id}} ---------------------
+)_";
+ return code;
+}
+
+void ClTemplateDirectConv2d::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
+{
+ vtable.declare_variable(
+ _src,
+ GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+ comp_group.is_intermediate_tensor(_src),
+ "src");
+
+ const GpuKernelArgumentInfo::Type weight_type = _settings.export_to_cl_image() ? GpuKernelArgumentInfo::Type::Tensor_4D_t_Image : GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer;
+ vtable.declare_variable(
+ _weight,
+ GpuKernelArgumentInfo(weight_type),
+ comp_group.is_intermediate_tensor(_weight),
+ "weight");
+
+ if(_bias && _bias->has_valid_id()) // optional bias
+ {
+ vtable.declare_variable(
+ _bias,
+ GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Vector),
+ comp_group.is_intermediate_tensor(_bias),
+ "bias");
+ }
+ vtable.declare_variable(
+ _dst,
+ GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+ comp_group.is_intermediate_tensor(_dst),
+ "dst");
+}
+
+TagLUT ClTemplateDirectConv2d::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
+{
+ TagLUT lut{};
+ // Arguments and global shared variables
+ lut["src"] = vtable.get_variable(_src);
+ lut["weight"] = vtable.get_variable(_weight);
+
+ if(_bias && _bias->has_valid_id()) // optional bias
+ {
+ lut["bias"] = vtable.get_variable(_bias);
+ lut["BIA_DATA_TYPE"] = get_cl_type_from_data_type(_bias->data_type());
+ }
+ lut["dst"] = vtable.get_variable(_dst);
+
+ const auto dst_argument = vtable.get_variable(comp_group.get_dst_tensors()[0]);
+ lut["arg_dst"] = dst_argument.uniq_name;
+
+ // Local build options
+ lut["meta_kernel_id"] = id();
+ lut["ACC_DATA_TYPE"] = _src->data_type();
+ lut["SRC_DATA_TYPE"] = _src->data_type();
+ lut["WEI_DATA_TYPE"] = _weight->data_type();
+
+ lut["SRC_TENSOR_TYPE"] = "BUFFER";
+ switch(vtable.get_variable(_weight).kernel_argument_info.type)
+ {
+ case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D:
+ case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D:
+ case GpuKernelArgumentInfo::Type::Tensor_4D_t_Image:
+ {
+ lut["WEI_TENSOR_TYPE"] = "IMAGE";
+ break;
+ }
+ default:
+ {
+ lut["WEI_TENSOR_TYPE"] = "BUFFER";
+ break;
+ }
+ }
+ const auto width_idx = 1;
+ const auto height_idx = 2;
+ lut["WEI_WIDTH"] = _weight->dimension(width_idx);
+ lut["WEI_HEIGHT"] = _weight->dimension(height_idx);
+
+ lut["STRIDE_X"] = _attributes.stride().x();
+ lut["STRIDE_Y"] = _attributes.stride().y();
+
+ lut["PAD_LEFT"] = _attributes.pad().left;
+ lut["PAD_TOP"] = _attributes.pad().top;
+
+ lut["ZERO_VALUE"] = 0;
+
+ return lut;
+}
+
+CLBuildOptions ClTemplateDirectConv2d::get_build_options(const ComponentGroup &comp_group) const
+{
+ const unsigned int channel_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::CHANNEL);
+ const DataType data_type = _src->data_type();
+
+    /// NOTE: For now the tile sizes (n0, m0) are set by the execution window. This may change in the future
+ const auto root_window = comp_group.get_root_component()->template_writer()->get_window();
+ const unsigned int n0 = root_window.x().step();
+ const unsigned int m0 = root_window.y().step();
+ const unsigned int k0 = adjust_vec_size(is_data_type_quantized(data_type) ? 16u : 8u, _src->dimension(channel_idx));
+ const unsigned int partial_store_n0 = _dst->dimension(0) % n0;
+
+ CLBuildOptions build_opts{};
+ if(_settings.fast_relaxed_math())
+ {
+ build_opts.add_option("-cl-fast-relaxed-math");
+ }
+ else
+ {
+ // -cl-fast-relaxed-math also sets -cl-finite-math-only and -cl-unsafe-math-optimizations
+ // to disable -cl-finite-math-only, we only include -cl-unsafe-math-optimizations
+ build_opts.add_option("-cl-unsafe-math-optimizations");
+ }
+ build_opts.add_option("-DIS_TILED");
+ build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
+ build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
+ build_opts.add_option("-DK0=" + support::cpp11::to_string(k0));
+ build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));
+
+ return build_opts;
+}
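+
+// For example (illustrative): with a root window stepping n0 = 4 and m0 = 2 and a dst channel count
+// of 30, the emitted options include -DN0=4 -DM0=2 -DPARTIAL_N0=2 (i.e. 30 % 4), alongside
+// -DIS_TILED, -DK0=<k0> and the math option selected above.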
+
+std::string ClTemplateDirectConv2d::get_config_id() const
+{
+ const DataType data_type = _src->data_type();
+ const DataLayout data_layout = _src->data_layout();
+
+ const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+ const unsigned int kernel_size = _weight->dimension(width_idx);
+
+ std::string config_id{};
+ config_id += lower_string(string_from_data_type(data_type));
+ config_id += "_";
+ config_id += support::cpp11::to_string(kernel_size);
+ config_id += "_";
+ config_id += support::cpp11::to_string(_attributes.stride().x());
+ config_id += "_";
+ config_id += support::cpp11::to_string(_attributes.stride().y());
+ config_id += "_";
+ config_id += support::cpp11::to_string(_dst->dimension(width_idx));
+ config_id += "_";
+ config_id += support::cpp11::to_string(_dst->dimension(height_idx));
+ config_id += "_";
+ config_id += lower_string(string_from_data_layout(data_layout));
+ return config_id;
+}
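+
+// For example (illustrative): an F32 NHWC convolution with a 3x3 kernel, stride 1x1 and a 112x112
+// output produces the config id "f32_3_1_1_112_112_nhwc".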
+
+std::set<std::string> ClTemplateDirectConv2d::get_headers_list() const
+{
+ return std::set<std::string>{ "helpers.h", "tile_helpers.h" };
+}
+
+Window ClTemplateDirectConv2d::get_window() const
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
+
+ const auto output_shape = _dst->tensor_shape();
+
+ const unsigned int vec_size = std::min(static_cast<unsigned int>(output_shape[0]), 4u);
+ const unsigned int num_rows = (_dst->tensor_shape()[0] > 16) ? ((_src->data_type() == DataType::F32) ? 2U : 4U) : 1U;
+
+ // Create and configure kernel window
+ Window win = calculate_max_window(output_shape, Steps(vec_size, num_rows));
+
+ const size_t dim_y_collapsed = ceil_to_multiple(output_shape[1] * output_shape[2], num_rows);
+ win.set(Window::DimY, Window::Dimension(0, dim_y_collapsed, num_rows));
+ win.set(Window::DimZ, Window::Dimension(0, output_shape.total_size_upper(3), 1));
+
+ return win;
+}
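+
+// Worked example (illustrative): for an NHWC dst of shape [C = 32, W = 7, H = 7, N = 1] with an F32
+// src, vec_size = 4 and num_rows = 2, so DimX steps over the channels in chunks of 4 and DimY covers
+// ceil_to_multiple(7 * 7, 2) = 50 collapsed width * height positions in steps of 2.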
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h
new file mode 100644
index 0000000000..48027a9b8d
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEDIRECTCONV2D
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEDIRECTCONV2D
+
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/dynamic_fusion/sketch/OperatorAttributes.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class ClTemplateDirectConv2d final : public IGpuTemplateComponentWriter
+{
+public:
+ using Attributes = ClComponentDirectConv2d::Attributes;
+ using Settings = ClComponentDirectConv2d::Settings;
+ /** Constructor
+ *
+ * Similar to @ref ClComponentDirectConv2d::validate()
+ *
+ * @param[in] id Component id
+     * @param[in] tensors Tensor arguments to the component
+ * @param[in] attributes Component attributes
+ * @param[in] settings Component settings
+ */
+ ClTemplateDirectConv2d(ComponentId id,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings);
+ /** Prevent instances of this class from being copy constructed */
+ ClTemplateDirectConv2d(const ClTemplateDirectConv2d &direct_conv2d) = delete;
+ /** Prevent instances of this class from being copied */
+ ClTemplateDirectConv2d &operator=(const ClTemplateDirectConv2d &direct_conv2d) = delete;
+ /** Allow instances of this class to be move constructed */
+ ClTemplateDirectConv2d(ClTemplateDirectConv2d &&direct_conv2d) = default;
+ /** Allow instances of this class to be moved */
+ ClTemplateDirectConv2d &operator=(ClTemplateDirectConv2d &&direct_conv2d) = default;
+ /** Generate kernel component name */
+ std::string get_name() const override;
+ /** Generate kernel component code template
+ *
+     * @param[in] comp_group Component group of which the component is a part
+ *
+ * @return std::string Component code
+ */
+ std::string get_component_code(const ComponentGroup &comp_group) const override;
+ /** Declare all variables used by the component in the @p vtable
+ *
+ * @param[out] vtable Variable table
+     * @param[in] comp_group Component group of which the component is a part
+ */
+ void declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
+ /** Generate the tag look-up table used to instantiate the component code.
+ *
+ * @param[in] vtable Variable table
+     * @param[in] comp_group Component group of which the component is a part
+ *
+ * @return TagLUT Tag lookup table
+ */
+ TagLUT get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
+ /** Generate the build options used in the component
+ *
+     * @param[in] comp_group Component group of which the component is a part
+ *
+ * @return CLBuildOptions Build options
+ */
+ CLBuildOptions get_build_options(const ComponentGroup &comp_group) const override;
+ /** Generate the component config id string used for tuning */
+ std::string get_config_id() const override;
+ /** Generate the header list used in the component */
+ std::set<std::string> get_headers_list() const override;
+ /** Generate the execution window for the component */
+ Window get_window() const override;
+
+private:
+ const ITensorInfo *_src;
+ const ITensorInfo *_weight;
+ const ITensorInfo *_bias;
+ const ITensorInfo *_dst;
+ Attributes _attributes;
+ Settings _settings;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEDIRECTCONV2D */
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp
new file mode 100644
index 0000000000..6c4b8f52f2
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ClTemplateStore.h"
+
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+ClTemplateStore::ClTemplateStore(ComponentId id, const ArgumentPack<ITensorInfo> &tensors)
+ : IGpuTemplateComponentWriter{ id, tensors }, _src{}, _dst{}
+{
+ _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
+ _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
+}
+
+std::string ClTemplateStore::get_name() const
+{
+ return "store";
+}
+
+std::string ClTemplateStore::get_component_code(const ComponentGroup &comp_group) const
+{
+ ARM_COMPUTE_UNUSED(comp_group);
+ return R"_(
+//------------------ START KERNEL {{meta_kernel_id}} STORE ---------------------
+{
+// This also follows NHWC layout
+// g_ind_0 maps to global_id(0) maps to Channel
+// g_ind_1 maps to global_id(1) maps to Height and Width (Collapsed Window)
+// g_ind_2 maps to global_id(2) maps to N / Batch
+#define _IDST_WIDTH {{dst}}_w
+#define _IDST_HEIGHT {{dst}}_h
+ TILE(uint, M0, 1, dst_indirect_y);
+
+ // Calculate the destination indirect Y
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ dst_indirect_y[i].v = (uint)min(g_ind_1 + i, (int)(_IDST_WIDTH * _IDST_HEIGHT) - 1);
+ dst_indirect_y[i].v += g_ind_2 * (int)(_IDST_WIDTH * _IDST_HEIGHT);
+ })
+
+ bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
+
+ T_STORE_INDIRECT_WIDTH_SELECT({{DST_DATA_TYPE}}, M0, N0, PARTIAL_N0, {{DST_TENSOR_TYPE}}, {{dst}}, g_ind_0, {{dst}}_stride_y, x_cond, {{src}}, dst_indirect_y);
+
+#undef _IDST_WIDTH
+#undef _IDST_HEIGHT
+ //------------------ END KERNEL {{meta_kernel_id}} STORE ---------------------
+}
+
+)_";
+}
+
+void ClTemplateStore::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
+{
+ vtable.declare_variable(
+ _src,
+ GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+ comp_group.is_intermediate_tensor(_src),
+ "src");
+ vtable.declare_variable(
+ _dst,
+ GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+ comp_group.is_intermediate_tensor(_dst),
+ "dst");
+}
+
+TagLUT ClTemplateStore::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
+{
+ TagLUT lut{};
+
+ // Arguments and global shared variables
+ lut["src"] = vtable.get_variable(_src);
+ lut["dst"] = vtable.get_variable(_dst);
+
+ // Local build options
+ lut["meta_kernel_id"] = id();
+ lut["DST_TENSOR_TYPE"] = "BUFFER";
+ const auto dst_info = comp_group.get_dst_tensors()[0];
+ lut["DST_DATA_TYPE"] = dst_info->data_type();
+
+ return lut;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h
new file mode 100644
index 0000000000..3f97a82204
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATESTORE
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATESTORE
+
+#include "arm_compute/core/experimental/Types.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class ClTemplateStore final : public IGpuTemplateComponentWriter
+{
+public:
+ /** Constructor
+ *
+ * @param[in] id Component id
+ * @param[in] tensors Tensor arguments to the components
+ */
+ ClTemplateStore(ComponentId id, const ArgumentPack<ITensorInfo> &tensors);
+ /** Prevent instances of this class from being copy constructed */
+ ClTemplateStore(const ClTemplateStore &store) = delete;
+ /** Prevent instances of this class from being copied */
+ ClTemplateStore &operator=(const ClTemplateStore &store) = delete;
+ /** Allow instances of this class to be move constructed */
+ ClTemplateStore(ClTemplateStore &&store) = default;
+ /** Allow instances of this class to be moved */
+ ClTemplateStore &operator=(ClTemplateStore &&store) = default;
+ /** Generate kernel component name */
+ std::string get_name() const override;
+ /** Generate kernel component code template
+ *
+     * @param[in] comp_group Component group to which the component belongs
+ *
+ * @return std::string Component code
+ */
+ std::string get_component_code(const ComponentGroup &comp_group) const override;
+ /** Declare all variables used by the component in the @p vtable
+ *
+ * @param[out] vtable Variable table
+     * @param[in]  comp_group Component group to which the component belongs
+ */
+ void declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
+ /** Generate the tag look-up table used to instantiate the component code.
+ *
+ * @param[in] vtable Variable table
+     * @param[in] comp_group Component group to which the component belongs
+ *
+ * @return TagLUT Tag lookup table
+ */
+ TagLUT get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
+
+private:
+ const ITensorInfo *_src;
+ const ITensorInfo *_dst;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATESTORE */
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp
new file mode 100644
index 0000000000..cb643a741d
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp
@@ -0,0 +1,297 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ClTemplateWriter.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/// @note Some tags may be left unused, since a tag may be required only by the additional macros or only by the component code
+std::string ClTemplateWriter::replace_tags(const std::string &code_template, const TagLUT &tags)
+{
+ std::string replaced_code = "";
+ bool scanning_pattern = false;
+ std::string pattern_found = "";
+    // Stop at size() - 1 because each iteration may look ahead at code_template[i + 1]
+    for(size_t i = 0; i < code_template.size() - 1; ++i)
+ {
+ if(!scanning_pattern)
+ {
+ if(code_template[i] == '{' && code_template[i + 1] == '{')
+ {
+ i += 1;
+ scanning_pattern = true;
+ pattern_found = "";
+ }
+ else
+ {
+ replaced_code += code_template[i];
+ }
+ }
+ else
+ {
+ if(code_template[i] == '}' && code_template[i + 1] == '}')
+ {
+ i += 1;
+ scanning_pattern = false;
+ std::string err = "Pattern " + pattern_found + " not found in tags";
+ ARM_COMPUTE_ERROR_ON_MSG(tags.find(pattern_found) == tags.end(), err.c_str());
+ replaced_code += tags.find(pattern_found)->second.value;
+ }
+ else
+ {
+ pattern_found += code_template[i];
+ }
+ }
+ }
+
+ return replaced_code;
+}
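A self-contained sketch of the same double-brace substitution, using a plain std::map in place of the TagLUT (the tag names and values below are assumptions chosen for illustration):

// Simplified stand-in for ClTemplateWriter::replace_tags: every "{{tag}}" is looked up
// in the map and replaced by its value.
#include <cassert>
#include <cstddef>
#include <map>
#include <string>

std::string replace_tags_sketch(const std::string &tmpl, const std::map<std::string, std::string> &tags)
{
    std::string out;
    for(size_t i = 0; i < tmpl.size(); ++i)
    {
        if(i + 1 < tmpl.size() && tmpl[i] == '{' && tmpl[i + 1] == '{')
        {
            const size_t end = tmpl.find("}}", i + 2);
            out += tags.at(tmpl.substr(i + 2, end - i - 2));
            i = end + 1; // skip past "}}"
        }
        else
        {
            out += tmpl[i];
        }
    }
    return out;
}

int main()
{
    const std::map<std::string, std::string> tags{ { "dst", "G0_T_dst" }, { "meta_kernel_id", "1" } };
    assert(replace_tags_sketch("// KERNEL {{meta_kernel_id}}: {{dst}}_stride_y", tags)
           == "// KERNEL 1: G0_T_dst_stride_y");
    return 0;
}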
+ClTemplateWriter::~ClTemplateWriter()
+{
+}
+ClTemplateWriter::ClTemplateWriter(const GpuKernelComponentGroup &components)
+ : _components{ components }
+{
+}
+std::string ClTemplateWriter::get_name()
+{
+ return write_kernel_name();
+}
+std::string ClTemplateWriter::get_code()
+{
+ return write_code();
+}
+std::string ClTemplateWriter::get_config_id()
+{
+ std::string config_id = get_name();
+ for(const auto &comp : _components)
+ {
+ config_id += "--" + comp->template_writer()->get_config_id() + "--";
+ }
+
+ return config_id;
+}
+
+CLBuildOptions ClTemplateWriter::get_build_options()
+{
+ CLBuildOptions build_opts{};
+
+ for(const auto &comp : _components)
+ {
+ build_opts.add_options(comp->template_writer()->get_build_options(_components).options());
+ }
+
+ return build_opts;
+}
+
+Window ClTemplateWriter::get_window() const
+{
+ const auto root_comp = _components.get_root_component();
+ ARM_COMPUTE_ERROR_ON_MSG(root_comp == nullptr, "No root component found");
+ return root_comp->template_writer()->get_window();
+}
+
+std::map<ITensorInfo::Id, GpuKernelArgument> ClTemplateWriter::get_tensors()
+{
+ // Assemble GpuKernelArguments
+ std::map<ITensorInfo::Id, GpuKernelArgument> tensors;
+ for(const auto t : _components.get_argument_tensors())
+ {
+ tensors.emplace(
+ t->id(),
+ GpuKernelArgument{ *t, _vtable.get_variable(t).kernel_argument_info });
+ }
+ return tensors;
+}
+
+std::string ClTemplateWriter::write_code()
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_components.empty(), "No components found");
+
+ // These data structures will hold the data from all the components in the blueprint
+ std::set<std::string> headers_list{};
+ std::set<std::string> additional_macros{};
+ std::vector<std::string> component_codes{}; // vector because order matters
+
+ // Pass 1: Declare all kernel variables
+ for(auto &component : _components)
+ {
+ component->template_writer()->declare_variables(_vtable, _components);
+ }
+ // Pass 2: Generate component codes
+ for(auto &component : _components)
+ {
+ const auto component_writer = component->template_writer();
+ auto curr_headers_list = component_writer->get_headers_list();
+ auto curr_additional_macros = component_writer->get_additional_macros();
+ auto curr_component_code = component_writer->get_component_code(_components);
+ const auto var_lut = component_writer->get_tag_lut(_vtable, _components); // Ideally can be merged with get_component_code once we have finer-grained code generation technique
+ component_codes.push_back(replace_tags(curr_component_code, var_lut));
+
+ headers_list.insert(curr_headers_list.begin(), curr_headers_list.end());
+        if(!curr_additional_macros.empty()) // Some components might not have any additional macros
+ {
+ additional_macros.insert(replace_tags(curr_additional_macros, var_lut));
+ }
+ }
+
+    // Pass 3: Assemble the data gathered by traversing the graph into the string "code"
+ std::string code = "";
+
+ for(auto &header : headers_list)
+ {
+#if defined(EMBEDDED_KERNELS)
+ code += CLKernelLibrary::get().get_program(header).first;
+#else // defined(EMBEDDED_KERNELS)
+ code += "#include \"" + header + "\"\n";
+#endif // defined(EMBEDDED_KERNELS)
+ }
+
+ for(auto &macros : additional_macros)
+ {
+ code += macros;
+ }
+
+ code += write_kernel_signature(_vtable.get_variable_list(_components.get_argument_tensors()));
+
+ code += "\n{\n\n";
+
+ code += " //------------------ START KERNEL_BUILDER_COORDINATE ---------------------\n\n";
+ code += write_global_section();
+ code += " //------------------ END KERNEL_BUILDER_COORDINATE ---------------------\n";
+
+ for(const auto &component_code : component_codes)
+ {
+ code += component_code;
+ }
+
+ code += "}\n";
+
+ return code;
+}
+std::string ClTemplateWriter::write_global_section() const
+{
+ const auto dst_tensors = _components.get_dst_tensors();
+ ARM_COMPUTE_ERROR_ON_MSG(dst_tensors.size() != 1, "Only one destination tensor per kernel is allowed");
+ const auto dst_info = dst_tensors[0];
+ const auto dst_w = dst_info->dimension(0);
+ const auto tile_w = std::max(1, get_window().x().step());
+ const auto tile_h = std::max(1, get_window().y().step());
+ auto leftover_w = dst_w % tile_w;
+
+ std::string code = "";
+ code += std::string(" int g_ind_0 = GET_SPATIAL_IDX(0, ") + std::to_string(tile_w) + ", " + std::to_string(leftover_w) + ");\n";
+ code += std::string(" int g_ind_1 = GET_SPATIAL_IDX(1, ") + std::to_string(tile_h) + ", " + "0);\n";
+ code += std::string(" int g_ind_2 = GET_SPATIAL_IDX(2, 1, 0);\n\n");
+
+ code += " const bool g_cond_x = (g_ind_0 == 0);\n";
+ code += " const bool g_cond_y = (g_ind_1 == 0);\n";
+
+ return code;
+}
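The global section therefore encodes the tiling directly into the generated source: dimension 0 is stepped by the execution window's x step with a leftover when the extent is not a multiple of the tile, and the collapsed dimension 1 is stepped by the y step. A small sketch of the values that would be emitted, for assumed extents and window steps:

// Illustrative sketch only: assumed destination extent and window steps.
#include <algorithm>
#include <cstdio>

int main()
{
    const int dst_dim0   = 17;
    const int tile_w     = std::max(1, 4); // window x step (N0)
    const int tile_h     = std::max(1, 5); // window y step (M0)
    const int leftover_w = dst_dim0 % tile_w;

    // Mirrors the GET_SPATIAL_IDX(...) lines produced by write_global_section()
    std::printf("    int g_ind_0 = GET_SPATIAL_IDX(0, %d, %d);\n", tile_w, leftover_w);
    std::printf("    int g_ind_1 = GET_SPATIAL_IDX(1, %d, 0);\n", tile_h);
    std::printf("    int g_ind_2 = GET_SPATIAL_IDX(2, 1, 0);\n");
    return 0;
}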
+std::string ClTemplateWriter::write_argument_declaration(const GpuKernelVariableTable::TensorVariable &var) const
+{
+ std::string code;
+ switch(var.kernel_argument_info.type)
+ {
+ case GpuKernelArgumentInfo::Type::Vector:
+ {
+ code += "\n VECTOR_DECLARATION(" + var.uniq_name + ")";
+ break;
+ }
+ case GpuKernelArgumentInfo::Type::Image:
+ {
+ code += "\n IMAGE_DECLARATION(" + var.uniq_name + ")";
+ break;
+ }
+ case GpuKernelArgumentInfo::Type::Image_3D:
+ {
+ code += "\n IMAGE_DECLARATION(" + var.uniq_name + "),";
+ code += "\n unsigned int " + var.uniq_name + "_stride_z";
+ break;
+ }
+ case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D:
+ {
+ code += "\n __read_only image2d_t " + var.uniq_name + "_img,";
+ code += "\n unsigned int " + var.uniq_name + "_stride_z";
+ break;
+ }
+ case GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer:
+ {
+ code += "\n TENSOR4D_T(" + var.uniq_name + ", BUFFER)";
+ break;
+ }
+ case GpuKernelArgumentInfo::Type::Tensor_4D_t_Image:
+ {
+ code += "\n TENSOR4D_T(" + var.uniq_name + ", IMAGE)";
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Unsupported declaration generation for GpuKernelArgumentInfo::Type");
+ }
+ }
+ return code;
+}
+std::string ClTemplateWriter::write_kernel_signature(const GpuKernelVariableTable::VariableList &argument_list) const
+{
+ std::string code = "\n__kernel void " + write_kernel_name() + "(";
+
+ for(int i = 0; i < static_cast<int>(argument_list.size()) - 1; ++i)
+ {
+ code += write_argument_declaration(argument_list[i]) + ",";
+ }
+ if(static_cast<int>(argument_list.size()) - 1 >= 0)
+ {
+ code += write_argument_declaration(argument_list[argument_list.size() - 1]);
+ }
+
+ code += ')';
+
+ return code;
+}
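For illustration, two Tensor_4D_t_Buffer arguments with assumed unique names "src" and "dst" would produce a signature of roughly the following shape; the kernel name shown is also an assumption, based on write_kernel_name() joining the component names with "___":

// Illustrative only: approximate text assembled by write_kernel_signature() for two
// Tensor_4D_t_Buffer arguments; the names "src", "dst" and the kernel name are assumptions.
const char *example_signature =
    "\n__kernel void direct_conv2d___store("
    "\n    TENSOR4D_T(src, BUFFER),"
    "\n    TENSOR4D_T(dst, BUFFER))";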
+std::string ClTemplateWriter::write_kernel_name() const
+{
+ if(_components.empty())
+ {
+ return "empty_kernel";
+ }
+    std::string name = _components[0]->template_writer()->get_name();
+ for(size_t i = 1; i < _components.size(); ++i)
+ {
+ name += "___";
+ name += _components[i]->template_writer()->get_name();
+ }
+
+ return name;
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.h
new file mode 100644
index 0000000000..83f617b6c6
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEWRITER
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEWRITER
+
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
+
+#include <map>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Kernel writer that uses a string-template-based approach to generate kernel code.
+ * It stitches the component code templates together according to the valid fusion configuration,
+ * then instantiates the actual kernel code from the stitched template and the generated tag lookup table.
+ */
+class ClTemplateWriter : public IGpuKernelWriter
+{
+public:
+ /** Instantiates a kernel code string from the kernel code template
+     * @note Some tags may be left unused, since a tag may be required only by the additional macros or only by the component code
+ *
+ * @param[in] code_template Kernel code template
+ * @param[in] tags Tag lookup table
+ *
+ * @return std::string Instantiated kernel string
+ */
+ static std::string replace_tags(const std::string &code_template, const TagLUT &tags);
+ /** Default constructor */
+ ClTemplateWriter() = default;
+ /** Constructor
+ *
+ * @param[in] components Kernel component group from which the kernel will be generated
+ */
+ ClTemplateWriter(const GpuKernelComponentGroup &components);
+ /** Destructor */
+ ~ClTemplateWriter() override;
+ /** Generate kernel name */
+ std::string get_name() override;
+ /** Generate kernel code */
+ std::string get_code() override;
+ /** Generate build options */
+ CLBuildOptions get_build_options() override;
+ /** Generate config id string of the entire kernel. This is used for tuning */
+ std::string get_config_id() override;
+ /** Generate execution window */
+ Window get_window() const override;
+    /** Get the kernel argument list of the kernel */
+ std::map<ITensorInfo::Id, GpuKernelArgument> get_tensors() override;
+
+private:
+ std::string write_kernel_name() const;
+ std::string write_code();
+ std::string write_global_section() const;
+ std::string write_argument_declaration(const GpuKernelVariableTable::TensorVariable &var) const;
+ std::string write_kernel_signature(const GpuKernelVariableTable::VariableList &argument_list) const;
+
+private:
+ GpuKernelComponentGroup _components{};
+ GpuKernelVariableTable _vtable{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEWRITER */
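A minimal usage sketch of the writer, assuming a GpuKernelComponentGroup `group` has already been assembled elsewhere (e.g. by the kernel component graph); it only exercises the interface declared above:

// Minimal sketch, assuming `group` is a populated GpuKernelComponentGroup.
#include <string>

#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.h"

using namespace arm_compute::experimental::dynamic_fusion;

std::string build_kernel_source(const GpuKernelComponentGroup &group)
{
    ClTemplateWriter writer{ group };

    const std::string                 name       = writer.get_name();          // fused kernel name
    const arm_compute::CLBuildOptions build_opts = writer.get_build_options(); // merged per-component options
    static_cast<void>(name);
    static_cast<void>(build_opts);

    return writer.get_code(); // fully instantiated OpenCL source for the fused kernel
}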
diff --git a/src/dynamic_fusion/sketch/utils/DependencyGraph.h b/src/dynamic_fusion/sketch/utils/DependencyGraph.h
new file mode 100644
index 0000000000..55eb4c5c77
--- /dev/null
+++ b/src/dynamic_fusion/sketch/utils/DependencyGraph.h
@@ -0,0 +1,658 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_UTILS_DEPENDENCYGRAPH
+#define SRC_DYNAMIC_FUSION_SKETCH_UTILS_DEPENDENCYGRAPH
+
+#include "arm_compute/core/Error.h"
+#include <algorithm>
+#include <cstdint>
+#include <deque>
+#include <map>
+#include <set>
+#include <tuple>
+#include <vector>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+template <typename T>
+bool is_in(const T &v, const std::vector<T> &vec)
+{
+ return std::find(std::begin(vec), std::end(vec), v) != std::end(vec);
+}
+} // namespace
+
+/** A multi-input (tensors), multi-output (tensors) directed acyclic graph
+ * Represented as a doubly-linked adjacency list that distinguishes between source and destination
+ */
+class DependencyGraph
+{
+public:
+ using Id = int32_t;
+ using TensorId = Id;
+ using OperatorId = Id;
+ /** Adjacency list
+ *
+ */
+ using AdjList = std::map<Id, std::vector<Id>>;
+
+    /** A pack of an operator and its input and output tensors, used when traversing the graph in topological order
+ *
+ */
+ struct OpPack
+ {
+ OperatorId op{};
+ std::vector<TensorId> inputs{};
+ std::vector<TensorId> outputs{};
+ friend bool operator==(const OpPack &opp0, const OpPack &opp1)
+ {
+ return std::make_tuple(
+ opp0.op, opp0.inputs, opp0.outputs)
+ == std::make_tuple(
+ opp1.op, opp1.inputs, opp1.outputs);
+ }
+ };
+
+public:
+ DependencyGraph() = default;
+ friend std::ostream &operator<<(std::ostream &os, const DependencyGraph &);
+
+ /** Try adding an operator (without actually adding it), while keeping the graph as a "linear sequence" / list
+ * @note The list is expected to only grow from head to tail
+ *
+ * PRECONDITION: The current graph is already linear
+ *
+ * @return true If the operator can be added while keeping the graph as a linear sequence
+ * @return false Otherwise
+ */
+ bool try_add_operator_as_linear(OperatorId op, const std::vector<TensorId> &inputs, const std::vector<TensorId> &outputs) const
+ {
+ ARM_COMPUTE_UNUSED(op, outputs);
+ if(all_ops().empty())
+ {
+ return true;
+ }
+ std::vector<TensorId> common_tensors{};
+ auto existing_tensors = all_tensors();
+ std::sort(existing_tensors.begin(), existing_tensors.end()); // To use std::set_intersection, both input sets must be sorted
+ std::vector<TensorId> sorted_inputs{ inputs };
+ std::sort(sorted_inputs.begin(), sorted_inputs.end());
+ std::set_intersection(existing_tensors.begin(), existing_tensors.end(),
+ sorted_inputs.begin(), sorted_inputs.end(), std::back_inserter(common_tensors));
+ if(common_tensors.size() != 1U)
+ {
+ return false;
+ }
+ const auto linked_tensor = common_tensors[0];
+ const auto tail_ops = get_dst_ops();
+ ARM_COMPUTE_ERROR_ON(tail_ops.size() != 1U); // PRECONDITION
+ const auto tail = tail_ops[0];
+
+        return is_in(linked_tensor, dst_tensors(tail));
+ }
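In other words, an operator can only be appended if exactly one of its input tensors already exists in the graph and that tensor is an output of the current tail operator. A small sketch with assumed integer ids (operator and tensor ids are kept in separate adjacency maps, so overlapping numeric values are fine):

// Sketch with assumed ids; not part of the library's test suite.
#include <cassert>

#include "src/dynamic_fusion/sketch/utils/DependencyGraph.h"

using arm_compute::experimental::dynamic_fusion::DependencyGraph;

void linear_growth_sketch()
{
    DependencyGraph g{};
    g.add_operator_as_linear(/* op */ 0, /* inputs */ { 0, 1 }, /* outputs */ { 2 });

    // Accepted: tensor 2 is the single tensor shared with the existing graph and it is
    // an output of the current tail operator 0
    assert(g.try_add_operator_as_linear(1, { 2, 3 }, { 4 }));

    // Rejected: tensor 0 is shared, but it is an input of operator 0, not the tail's output
    assert(!g.try_add_operator_as_linear(2, { 0, 5 }, { 6 }));
}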
+ /** Add an operator, while keeping the graph as a "linear sequence"
+ *
+ * PRECONDITION: The current graph is already linear
+ * INVARIANT: The list can only grow from head to tail
+     * POSTCONDITION: The graph remains linear
+ */
+ void add_operator_as_linear(OperatorId op, const std::vector<TensorId> &inputs, const std::vector<TensorId> &outputs)
+ {
+ ARM_COMPUTE_ERROR_ON(!try_add_operator_as_linear(op, inputs, outputs));
+ auto success = add_operator(op, inputs, outputs);
+ ARM_COMPUTE_ERROR_ON(!success);
+ }
+ /** Add a new operator
+     * Returns false if adding the operator would violate the DAG (acyclicity) invariant;
+     * a failed insertion leaves the graph unchanged
+ *
+ * @param[in] op Operator to add
+ * @param[in] inputs Input tensors to the operator
+ * @param[in] outputs Output tensors to the operator
+ */
+ bool add_operator(OperatorId op, const std::vector<TensorId> &inputs, const std::vector<TensorId> &outputs)
+ {
+ if(operator_exists(op))
+ {
+ return false;
+ }
+ _adj_src_tensors[op] = {};
+ _adj_dst_tensors[op] = {};
+ for(auto in_tensor : inputs)
+ {
+ // Linking input tensor to operator node will never create a cycle / loop because we guarantee
+ // each op is newly created, so every <input, op> pair / edge is new
+ link_input(op, in_tensor);
+ }
+ for(auto out_tensor : outputs)
+ {
+ // If there exists a back path from op's output tensor to op already, then linking the two will create a loop / cycle
+ if(path_exists_from_tensor_to_op(out_tensor, op))
+ {
+ remove_operator(op);
+ return false;
+ }
+ else
+ {
+ link_output(op, out_tensor);
+ }
+ }
+
+ return true;
+ }
+
+ /** Sort the graph in a topological order
+ *
+ * @return std::vector<OpPack>
+ */
+ std::vector<OpPack> topological_sort() const
+ {
+        // In-degree (number of source operators feeding an op)
+ std::map<OperatorId, unsigned int> in_degree{};
+ std::set<OperatorId> visited_ops{};
+ std::deque<OperatorId> zero_in_degree_ops{};
+ std::vector<OpPack> sorted_op_packs{};
+ for(auto op : all_ops())
+ {
+ const auto degree = src_ops(op).size();
+ in_degree[op] = degree;
+ if(degree == 0)
+ {
+ zero_in_degree_ops.push_back(op);
+ visited_ops.insert(op);
+ }
+ }
+
+ while(!zero_in_degree_ops.empty())
+ {
+ const OperatorId op = zero_in_degree_ops.front();
+ zero_in_degree_ops.pop_front();
+ sorted_op_packs.push_back(OpPack{ op, src_tensors(op), dst_tensors(op) });
+
+ for(const auto next_op : dst_ops(op))
+ {
+ if(in_degree[next_op] > 0)
+ {
+ in_degree[next_op]--;
+ }
+ if(in_degree[next_op] == 0 && visited_ops.find(next_op) == visited_ops.end())
+ {
+ zero_in_degree_ops.push_back(next_op);
+                    visited_ops.insert(next_op);
+ }
+ }
+ }
+
+ return sorted_op_packs;
+ }
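This is Kahn's algorithm: an operator is emitted once all of its producers have been emitted, so for a simple chain the packs come out in graph order. A small sketch with assumed ids:

// Sketch with assumed ids; not part of the library's test suite.
#include <cassert>

#include "src/dynamic_fusion/sketch/utils/DependencyGraph.h"

using arm_compute::experimental::dynamic_fusion::DependencyGraph;

void topological_sort_sketch()
{
    // Assumed ids: op 0 -> tensor 1 -> op 1 -> tensor 2 -> op 2 -> tensor 3
    DependencyGraph g{};
    g.add_operator(0, { 0 }, { 1 });
    g.add_operator(1, { 1 }, { 2 });
    g.add_operator(2, { 2 }, { 3 });

    const auto packs = g.topological_sort();
    assert(packs.size() == 3);
    assert(packs[0].op == 0 && packs[1].op == 1 && packs[2].op == 2);
}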
+
+ void find_independent_paths_util(OperatorId op, std::vector<std::vector<OperatorId>> &paths, std::vector<OperatorId> cur_path,
+ const std::map<OperatorId, unsigned int> &in_degree) const
+ {
+ // We have found an unresolved dependency
+ if(in_degree.at(op) > 1)
+ {
+ paths.push_back(cur_path);
+ return;
+ }
+ const auto child_ops = dst_ops(op);
+
+ cur_path.push_back(op);
+ // Hit the leaf op
+ if(child_ops.empty())
+ {
+ paths.push_back(cur_path);
+ return;
+ }
+ for(const auto child_op : child_ops)
+ {
+ find_independent_paths_util(child_op, paths, cur_path, in_degree);
+ }
+ }
+    /** Find all independent linear paths starting from op, i.e. paths that do not depend on any other unresolved op
+     *
+     * @return std::vector<std::vector<OperatorId>> The independent linear paths
+ */
+ std::vector<std::vector<OperatorId>> find_independent_paths(OperatorId op,
+ const std::map<OperatorId, unsigned int> &in_degree) const
+ {
+ std::vector<std::vector<OperatorId>> paths;
+ std::vector<OperatorId> cur_path;
+ find_independent_paths_util(op, paths, cur_path, in_degree);
+ return paths;
+ }
+    /** Find a longest independent linear path starting from op
+     *
+     * @return std::vector<OperatorId> The longest independent linear path
+ */
+ std::vector<OperatorId> find_longest_independent_path(OperatorId op,
+ const std::map<OperatorId, unsigned int> &in_degree) const
+ {
+ const auto &paths = find_independent_paths(op, in_degree);
+ ARM_COMPUTE_ERROR_ON(paths.empty());
+ size_t max_len = 0;
+ const std::vector<OperatorId> *max_path = nullptr;
+
+ for(const auto &path : paths)
+ {
+ if(path.size() >= max_len)
+ {
+ max_path = &path;
+ max_len = path.size();
+ }
+ }
+ return *max_path;
+ }
+ std::vector<OperatorId> propose_next_path(std::set<OperatorId> &candidate_ops,
+ const std::map<OperatorId, unsigned int> &in_degree) const
+ {
+ if(candidate_ops.empty())
+ {
+ return {};
+ }
+ size_t max_len = 0;
+ std::vector<OperatorId> max_path;
+ OperatorId chosen_op{};
+ for(auto op : candidate_ops)
+ {
+ const auto path = find_longest_independent_path(op, in_degree);
+ if(path.size() >= max_len)
+ {
+ chosen_op = op;
+ max_path = path;
+ max_len = path.size();
+ }
+ }
+ candidate_ops.erase(chosen_op);
+ return max_path;
+ }
+ /** Partition the graph into a list of linear sub-"graphs", while preserving the topological order, and trying to minimize
+ * the number of partitions
+ */
+ std::vector<std::vector<OpPack>> topological_partition() const
+ {
+        // Initialize in-degrees and collect the initial zero in-degree ops as candidates
+ std::map<OperatorId, unsigned int> in_degree{};
+ std::set<OperatorId> candidate_ops{};
+ for(auto op : all_ops())
+ {
+ const auto degree = src_ops(op).size();
+ in_degree[op] = degree;
+ if(degree == 0)
+ {
+ candidate_ops.insert(op);
+ }
+ }
+
+ std::vector<std::vector<OpPack>> sorted_partitions{};
+ while(!candidate_ops.empty())
+ {
+            // Propose the longest still-independent linear path from the remaining candidate ops
+ const auto path = propose_next_path(candidate_ops, in_degree);
+
+ // Append to sorted_partitions
+ std::vector<OpPack> path_op_pack{};
+ for(auto op : path)
+ {
+ path_op_pack.push_back(OpPack{ op, src_tensors(op), dst_tensors(op) });
+ }
+ sorted_partitions.push_back(path_op_pack);
+            // Consume the whole path (update in_degree and candidate_ops)
+ for(auto op : path)
+ {
+ for(const auto next_op_child : dst_ops(op))
+ {
+ if(in_degree[next_op_child] > 0)
+ {
+ in_degree[next_op_child]--;
+ }
+ if(in_degree[next_op_child] == 0 && !is_in(next_op_child, path)) // We do not want to put the proposed path back into candidates
+ {
+ candidate_ops.insert(next_op_child);
+ }
+ }
+ }
+ }
+ return sorted_partitions;
+ }
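The partitioning repeatedly peels off the longest linear path that is still independent, so a diamond-shaped graph (one producer feeding two branches that rejoin) cannot be covered by a single partition and is split into two. A sketch with assumed ids:

// Sketch with assumed ids; not part of the library's test suite.
#include <cassert>

#include "src/dynamic_fusion/sketch/utils/DependencyGraph.h"

using arm_compute::experimental::dynamic_fusion::DependencyGraph;

void topological_partition_sketch()
{
    // Assumed ids: op 0 feeds ops 1 and 2 through tensor 1; ops 1 and 2 both feed op 3
    DependencyGraph g{};
    g.add_operator(0, { 0 }, { 1 });
    g.add_operator(1, { 1 }, { 2 });
    g.add_operator(2, { 1 }, { 3 });
    g.add_operator(3, { 2, 3 }, { 4 });

    // The diamond is split into two linear partitions of two operators each; which branch
    // ends up in which partition depends on the tie-break between equally long paths
    const auto partitions = g.topological_partition();
    assert(partitions.size() == 2);
    assert(partitions[0].size() == 2 && partitions[1].size() == 2);
    assert(partitions[0].front().op == 0);
}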
+
+ /** Strict equality comparison (all internal ids and order of insertion matter).
+ * In the future this may be replaced with a topological comparison, allowing equivalent graphs with different internal ids to be equal
+ *
+ *
+ * @param[in] g0
+ * @param[in] g1
+ * @return true If the same
+ * @return false Otherwise
+ */
+ friend bool operator==(const DependencyGraph &g0, const DependencyGraph &g1)
+ {
+ // Do not compare id allocators
+ return std::make_tuple(
+ g0._adj_src_tensors, g0._adj_dst_tensors, g0._adj_src_ops, g0._adj_dst_ops)
+ == std::make_tuple(
+ g1._adj_src_tensors, g1._adj_dst_tensors, g1._adj_src_ops, g1._adj_dst_ops);
+ }
+ std::vector<OperatorId> src_ops_from_tensor(TensorId tensor) const
+ {
+ return _adj_src_ops.at(tensor);
+ }
+ std::vector<OperatorId> dst_ops_from_tensor(TensorId tensor) const
+ {
+ return _adj_dst_ops.at(tensor);
+ }
+ /** Get all tensors
+ *
+ * @return std::vector<TensorId>
+ */
+ std::vector<TensorId> all_tensors() const
+ {
+ std::vector<TensorId> tensors{};
+ std::transform(std::begin(_adj_src_ops), std::end(_adj_src_ops), std::back_inserter(tensors), [](const auto & it)
+ {
+ return it.first;
+ });
+ return tensors;
+ }
+ /** Get source tensors of the whole graph
+ *
+ * @return std::vector<TensorId>
+ */
+ std::vector<TensorId> global_src_tensors() const
+ {
+ std::vector<TensorId> tensors;
+ for(auto tensor_src_ops : _adj_src_ops)
+ {
+ if(tensor_src_ops.second.empty())
+ {
+ tensors.push_back(tensor_src_ops.first);
+ }
+ }
+ return tensors;
+ }
+ /** Get destination tensors of the whole graph
+ *
+ * @return std::vector<TensorId>
+ */
+ std::vector<TensorId> global_dst_tensors() const
+ {
+ std::vector<TensorId> tensors;
+ for(auto tensor_dst_ops : _adj_dst_ops)
+ {
+ if(tensor_dst_ops.second.empty())
+ {
+ tensors.push_back(tensor_dst_ops.first);
+ }
+ }
+ return tensors;
+ }
+ /** Get all root ops. Root ops can also be referred to as "src ops" of the whole graph
+ *
+ * @return std::vector<OperatorId>
+ */
+ std::vector<OperatorId> get_root_ops() const
+ {
+ std::vector<OperatorId> ops{};
+ const auto op_list = all_ops();
+
+ for(auto op : op_list)
+ {
+ if(src_ops(op).empty())
+ {
+ ops.emplace_back(op);
+ }
+ }
+ return ops;
+ }
+
+private:
+ void link_input(OperatorId op, TensorId in_tensor)
+ {
+ ARM_COMPUTE_ERROR_ON(!operator_exists(op));
+ if(!tensor_exists(in_tensor))
+ {
+ insert_new_tensor(in_tensor);
+ }
+ ARM_COMPUTE_ERROR_ON(are_connected(op, in_tensor)); // Prevent repetitive linking
+ _adj_src_tensors[op].push_back(in_tensor);
+ _adj_dst_ops[in_tensor].push_back(op);
+ }
+ void link_output(OperatorId op, TensorId out_tensor)
+ {
+ ARM_COMPUTE_ERROR_ON(!operator_exists(op));
+ if(!tensor_exists(out_tensor))
+ {
+ insert_new_tensor(out_tensor);
+ }
+ ARM_COMPUTE_ERROR_ON(are_connected(op, out_tensor)); // Prevent repetitive linking
+ _adj_dst_tensors[op].push_back(out_tensor);
+ _adj_src_ops[out_tensor].push_back(op);
+ }
+
+ std::vector<OperatorId> src_ops(OperatorId op) const
+ {
+ ARM_COMPUTE_ERROR_ON(!operator_exists(op));
+ std::vector<OperatorId> ops{};
+ for(TensorId src_tensor : src_tensors(op))
+ {
+ ops.insert(ops.end(), std::begin(_adj_src_ops.at(src_tensor)), std::end(_adj_src_ops.at(src_tensor)));
+ }
+ return ops;
+ }
+ std::vector<OperatorId> dst_ops(OperatorId op) const
+ {
+ ARM_COMPUTE_ERROR_ON(!operator_exists(op));
+ std::vector<OperatorId> ops{};
+ for(TensorId dst_tensor : _adj_dst_tensors.at(op))
+ {
+ ops.insert(ops.end(), std::begin(_adj_dst_ops.at(dst_tensor)), std::end(_adj_dst_ops.at(dst_tensor)));
+ }
+ return ops;
+ }
+
+    /** Get source tensors of an operator
+ *
+ * @param[in] op
+ * @return std::vector<TensorId>
+ */
+ std::vector<TensorId> src_tensors(OperatorId op) const
+ {
+ ARM_COMPUTE_ERROR_ON(!operator_exists(op));
+ return _adj_src_tensors.at(op);
+ }
+    /** Get destination tensors of an operator
+ *
+ * @param[in] op
+ * @return std::vector<TensorId>
+ */
+ std::vector<TensorId> dst_tensors(OperatorId op) const
+ {
+ ARM_COMPUTE_ERROR_ON(!operator_exists(op));
+ return _adj_dst_tensors.at(op);
+ }
+ /** Get all operators
+ *
+ * @return std::vector<OperatorId>
+ */
+ std::vector<OperatorId> all_ops() const
+ {
+ std::vector<OperatorId> ops{};
+ std::transform(std::begin(_adj_src_tensors), std::end(_adj_src_tensors), std::back_inserter(ops), [](const auto & it)
+ {
+ return it.first;
+ });
+ return ops;
+ }
+ /** Remove an operator from graph.
+ *
+ * @param[in] op
+ */
+ void remove_operator(OperatorId op)
+ {
+ for(auto src_tensor : _adj_src_tensors.at(op))
+ {
+ auto &dst_ops = _adj_dst_ops.at(src_tensor);
+ dst_ops.erase(
+ std::remove(std::begin(dst_ops), std::end(dst_ops), op),
+ std::end(dst_ops));
+ }
+ for(auto dst_tensor : _adj_dst_tensors.at(op))
+ {
+ auto &src_ops = _adj_src_ops.at(dst_tensor);
+ src_ops.erase(
+ std::remove(std::begin(src_ops), std::end(src_ops), op),
+ std::end(src_ops));
+ }
+ // Remove any isolated tensors
+ // An isolated tensor is one where both its _adj_src_ops and _adj_dst_ops are empty
+ for(auto t : all_tensors())
+ {
+ if(_adj_src_ops.at(t).empty() && _adj_dst_ops.at(t).empty())
+ {
+ _adj_src_ops.erase(t);
+ _adj_dst_ops.erase(t);
+ }
+ }
+ _adj_src_tensors.erase(op);
+ _adj_dst_tensors.erase(op);
+ }
+ void insert_new_tensor(TensorId tensor)
+ {
+ _adj_src_ops[tensor] = {};
+ _adj_dst_ops[tensor] = {};
+ }
+ bool tensor_exists(TensorId tensor) const
+ {
+ return _adj_src_ops.find(tensor) != _adj_src_ops.end() && _adj_dst_ops.find(tensor) != _adj_dst_ops.end();
+ }
+ bool operator_exists(OperatorId op) const
+ {
+ return _adj_src_tensors.find(op) != _adj_src_tensors.end() && _adj_dst_tensors.find(op) != _adj_dst_tensors.end();
+ }
+ bool is_src_tensor_of(OperatorId op, TensorId tensor) const
+ {
+ if(!operator_exists(op) || !tensor_exists(tensor))
+ {
+ return false;
+ }
+ const auto op_inputs = src_tensors(op);
+ return std::find(op_inputs.begin(), op_inputs.end(), tensor) != op_inputs.end();
+ }
+ bool is_dst_tensor_of(OperatorId op, TensorId tensor) const
+ {
+ if(!operator_exists(op) || !tensor_exists(tensor))
+ {
+ return false;
+ }
+ const auto op_outputs = dst_tensors(op);
+ return std::find(op_outputs.begin(), op_outputs.end(), tensor) != op_outputs.end();
+ }
+ bool are_connected(OperatorId op, TensorId tensor) const
+ {
+ return is_src_tensor_of(op, tensor) || is_dst_tensor_of(op, tensor);
+ }
+    /** Check if op is a destination / leaf operator of the whole graph
+ *
+ * @param[in] op
+ * @return true
+ * @return false
+ */
+ bool is_dst_op(OperatorId op) const
+ {
+ return dst_ops(op).empty();
+ }
+ std::vector<OperatorId> get_dst_ops() const
+ {
+ std::vector<OperatorId> ops{};
+ const auto op_list = all_ops();
+
+ for(auto op : op_list)
+ {
+ if(is_dst_op(op))
+ {
+ ops.emplace_back(op);
+ }
+ }
+ return ops;
+ }
+ bool path_exists_from_tensor_to_op(TensorId src_tensor, OperatorId dst_op) const
+ {
+ if(!tensor_exists(src_tensor) || !operator_exists(dst_op))
+ {
+ return false;
+ }
+ for(auto child_op : dst_ops_from_tensor(src_tensor))
+ {
+ if(path_exists_from_op_to_op(child_op, dst_op))
+ {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ bool path_exists_from_op_to_op(OperatorId src_op, OperatorId dst_op) const
+ {
+ if(!operator_exists(src_op) || !operator_exists(dst_op))
+ {
+ return false;
+ }
+ if(src_op == dst_op)
+ {
+ return true;
+ }
+ if(is_in(src_op, get_dst_ops()))
+ {
+ return false;
+ }
+ for(auto child_tensor : dst_tensors(src_op))
+ {
+ if(path_exists_from_tensor_to_op(child_tensor, dst_op))
+ {
+ return true;
+ }
+ }
+ return false;
+ }
+
+private:
+ AdjList _adj_src_tensors{};
+ AdjList _adj_dst_tensors{};
+ AdjList _adj_src_ops{};
+ AdjList _adj_dst_ops{};
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_UTILS_DEPENDENCYGRAPH */
diff --git a/tests/SConscript b/tests/SConscript
index b848f27043..95ecd27afa 100644
--- a/tests/SConscript
+++ b/tests/SConscript
@@ -121,7 +121,7 @@ files_validation += Glob('validation/CPP/' + filter_pattern)
if env['opencl']:
if env['experimental_dynamic_fusion']:
test_env.Append(CPPDEFINES = ['ENABLE_EXPERIMENTAL_DYNAMIC_FUSION'])
- files_validation += Glob('validation/CL/UNIT/dynamic_fusion/' + filter_pattern)
+ files_validation += Glob('validation/dynamic_fusion/gpu/' + filter_pattern)
filter_pattern = test_env['test_filter']
@@ -291,19 +291,6 @@ if test_env['benchmark_examples']:
prog = test_env.Program(example, [ test_env.Object(source=file, target=example), graph_utils, graph_params]+ files_benchmark_examples, LIBS = test_env["LIBS"] + ["arm_compute_graph"], LINKFLAGS=test_env["LINKFLAGS"]+['-Wl,--allow-shlib-undefined'])
arm_compute_benchmark_examples += [ prog ]
- # Dynamic fusion examples
- if env['opencl']:
- if env['experimental_dynamic_fusion']:
- for file in Glob("%s/dynamic_fusion/*.cpp" % examples_folder):
- example = "benchmark_" + os.path.basename(os.path.splitext(str(file))[0])
- if env['os'] in ['android', 'macos', 'bare_metal'] or env['standalone']:
- prog = test_env.Program(example, [ test_env.Object(source=file, target=example), graph_utils, graph_params]+ files_benchmark_examples, LIBS = test_env["LIBS"], LINKFLAGS=test_env["LINKFLAGS"]+[load_whole_archive, arm_compute_lib, noload_whole_archive] + bm_link_flags + extra_link_flags)
- arm_compute_benchmark_examples += [ prog ]
- else:
- #-Wl,--allow-shlib-undefined: Ignore dependencies of dependencies
- prog = test_env.Program(example, [ test_env.Object(source=file, target=example), graph_utils, graph_params]+ files_benchmark_examples, LIBS = test_env["LIBS"] + ["arm_compute_graph"], LINKFLAGS=test_env["LINKFLAGS"]+['-Wl,--allow-shlib-undefined'])
- arm_compute_benchmark_examples += [ prog ]
-
arm_compute_benchmark_examples = install_bin(arm_compute_benchmark_examples)
Depends(arm_compute_benchmark_examples, arm_compute_test_framework)
Depends(arm_compute_benchmark_examples, arm_compute_lib)
diff --git a/tests/validation/dynamic_fusion/gpu/Integration.cpp b/tests/validation/dynamic_fusion/gpu/Integration.cpp
new file mode 100644
index 0000000000..87720a629d
--- /dev/null
+++ b/tests/validation/dynamic_fusion/gpu/Integration.cpp
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h"
+#include "arm_compute/dynamic_fusion/sketch/OperatorAttributes.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/gpu/cl/operators/ClAdd.h"
+#include "src/gpu/cl/operators/ClConv2d.h"
+
+#include "tests/CL/CLAccessor.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/validation/CL/UNIT/dynamic_fusion/Utils.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/reference/ConvolutionLayer.h"
+#include "tests/validation/reference/ElementwiseOperations.h"
+#include "tests/validation/reference/Permute.h"
+
+#ifdef ARM_COMPUTE_ASSERTS_ENABLED
+#include "tests/SimpleTensorPrinter.h"
+#endif /* ARM_COMPUTE_ASSERTS_ENABLED */
+
+using namespace arm_compute::experimental::dynamic_fusion;
+using namespace arm_compute::test::validation::utils;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+TEST_SUITE(CL)
+TEST_SUITE(INTEGRATION)
+TEST_SUITE(DYNAMIC_FUSION)
+TEST_CASE(Conv2d, framework::DatasetMode::ALL)
+{
+ /* Computation:
+ * out = conv2d1x1(direct_conv)(input, weights, bias)
+ */
+ CLScheduler::get().default_reinit();
+
+ const auto data_type = DataType::F32;
+ const auto data_layout = DataLayout::NHWC;
+ const auto t_input_shape = TensorShape(384, 12, 12);
+ const auto t_weight_shape = TensorShape(384, 1, 1, 16);
+ const auto t_dst_shape = TensorShape(16, 12, 12);
+
+ // Create a new workload sketch
+ auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+ auto gpu_ctx = GpuWorkloadContext{ &cl_compile_ctx };
+ GpuWorkloadSketch sketch{ &gpu_ctx };
+
+ // Fuse conv2d
+ Conv2dAttributes conv2d_attr{};
+ auto input_info = sketch.create_tensor_info(t_input_shape, 1, data_type, data_layout);
+ auto weight_info = sketch.create_tensor_info(TensorInfo(t_weight_shape, 1, data_type, data_layout));
+ auto dst_info = sketch.create_tensor_info();
+ GpuConv2d::create_op(sketch, &input_info, &weight_info, nullptr, &dst_info, conv2d_attr);
+
+ // Configure runtime
+ ClWorkloadRuntime runtime;
+ runtime.configure(sketch);
+
+    // (Important) Allocate memory for any auxiliary tensors required by the runtime
+    // Instead of using ACL-allocated memory, the user can choose to import memory into the tensors
+ for(auto &data : runtime.get_auxiliary_tensors())
+ {
+ CLTensor *tensor = data.first;
+ AuxMemoryInfo aux_mem_req = data.second;
+ tensor->allocator()->init(*data.first->info(), aux_mem_req.alignment);
+ tensor->allocator()->allocate(); // Use ACL allocated memory
+ // auto buf = cl::Buffer();
+ // tensor->allocator()->import_memory(buf); // Or, import external memory
+ }
+
+ // Construct user tensors
+ CLTensor t_input{};
+ CLTensor t_weight{};
+ CLTensor t_dst{};
+
+ // Initialize user tensors
+ t_input.allocator()->init(input_info);
+ t_weight.allocator()->init(weight_info);
+ t_dst.allocator()->init(dst_info);
+
+ // Allocate and fill user tensors
+ // Instead of using ACL allocator, the user can choose to import memory into the tensors
+ t_input.allocator()->allocate();
+ t_weight.allocator()->allocate();
+ t_dst.allocator()->allocate();
+ fill<float>(CLAccessor(t_input), 0, library.get());
+ fill<float>(CLAccessor(t_weight), 1, library.get());
+
+ // Run runtime
+ runtime.run({ &t_input, &t_weight, &t_dst });
+
+ // Create reference
+ SimpleTensor<float> ref_t_input{ t_input_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC };
+ SimpleTensor<float> ref_t_weight{ t_weight_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC };
+ SimpleTensor<float> ref_t_bias_placeholder{ t_dst_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC };
+
+ // Fill reference
+ fill<float>(ref_t_input, 0, library.get());
+ fill<float>(ref_t_weight, 1, library.get());
+
+ auto ref_t_input_nchw = reference::permute(ref_t_input, PermutationVector(1U, 2U, 0U));
+ auto ref_t_weight_nchw = reference::permute(ref_t_weight, PermutationVector(1U, 2U, 0U));
+ auto ref_t_bias_placeholder_nchw = reference::permute(ref_t_bias_placeholder, PermutationVector(1U, 2U, 0U));
+ auto t_dst_shape_nchw = t_dst_shape;
+ permute(t_dst_shape_nchw, PermutationVector(1U, 2U, 0U));
+
+ PadStrideInfo legacy_pad_stride(conv2d_attr.stride().x(), conv2d_attr.stride().y(), conv2d_attr.pad().left, conv2d_attr.pad().right, conv2d_attr.pad().top, conv2d_attr.pad().bottom,
+ DimensionRoundingType{});
+ auto ref_t_dst_nchw = reference::convolution_layer(ref_t_input_nchw, ref_t_weight_nchw, ref_t_bias_placeholder_nchw, t_dst_shape_nchw, legacy_pad_stride, conv2d_attr.dilation());
+ const auto ref_t_dst = reference::permute(ref_t_dst_nchw, PermutationVector(2U, 0U, 1U));
+
+ RelativeTolerance<float> tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */
+ validate(CLAccessor(t_dst), ref_t_dst_nchw, tolerance_f32);
+}
+TEST_SUITE(Invalid_Fusion_Should_Fail)
+TEST_CASE(Multiple_Complex_Ops_0, framework::DatasetMode::ALL)
+{
+ /* Computation:
+ * out = conv2d(conv2d(l0_input, l0_weight), l1_weight)
+ */
+ CLScheduler::get().default_reinit();
+
+ const auto data_type = DataType::F32;
+ const auto data_layout = DataLayout::NHWC;
+ const auto t_input_shape = TensorShape(384, 12, 12);
+ const auto t_weight_shape = TensorShape(384, 1, 1, 16);
+ const auto t_dst_shape = TensorShape(16, 12, 12);
+ auto t_input_info = TensorInfo(t_input_shape, 1, data_type, data_layout);
+ auto t_weight_info = TensorInfo(t_weight_shape, 1, data_type, data_layout);
+ auto t_dst_info = TensorInfo();
+
+ Conv2dAttributes conv2d_attr{};
+
+ // Create a new workload sketch
+ auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+ auto gpu_ctx = GpuWorkloadContext{ &cl_compile_ctx };
+ GpuWorkloadSketch sketch{ &gpu_ctx };
+
+ // Create tensor infos
+ auto input_info = sketch.create_tensor_info(t_input_shape, 1, data_type, data_layout);
+ auto weight_info = sketch.create_tensor_info(TensorInfo(t_weight_shape, 1, data_type, data_layout));
+ auto dst_info = sketch.create_tensor_info();
+
+ // Fuse conv2d into the workload
+ {
+ // Validate operator
+ const auto success = GpuConv2d::validate_op(sketch, &input_info, &weight_info, nullptr, &dst_info, conv2d_attr);
+ ARM_COMPUTE_EXPECT(bool(success), framework::LogLevel::ERRORS);
+
+ GpuConv2d::create_op(sketch, &input_info, &weight_info, nullptr, &dst_info, conv2d_attr);
+ }
+
+ // Create tensor infos
+ auto weight_info_2 = sketch.create_tensor_info(t_weight_info);
+ auto dst_info_2 = sketch.create_tensor_info();
+
+ // Fuse conv2d into the workload
+ {
+ // Validate operator, should fail
+ const auto success = GpuConv2d::validate_op(sketch, &dst_info, &weight_info_2, nullptr, &dst_info_2, conv2d_attr);
+ ARM_COMPUTE_EXPECT(!bool(success), framework::LogLevel::ERRORS);
+ }
+}
+TEST_SUITE_END() // Invalid_Fusion_Should_Fail
+TEST_SUITE_END() // DYNAMIC_FUSION
+TEST_SUITE_END() // INTEGRATION
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute