Diffstat (limited to 'src/dynamic_fusion')
-rw-r--r--  src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp | 148
-rw-r--r--  src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h | 81
-rw-r--r--  src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp | 380
-rw-r--r--  src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.cpp | 105
-rw-r--r--  src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h | 70
-rw-r--r--  src/dynamic_fusion/sketch/ArgumentPack.h | 237
-rw-r--r--  src/dynamic_fusion/sketch/attributes/CastAttributes.cpp | 56
-rw-r--r--  src/dynamic_fusion/sketch/attributes/ClampAttributes.cpp | 58
-rw-r--r--  src/dynamic_fusion/sketch/attributes/Conv2dAttributes.cpp | 62
-rw-r--r--  src/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.cpp | 86
-rw-r--r--  src/dynamic_fusion/sketch/attributes/MatMulAttributes.cpp | 52
-rw-r--r--  src/dynamic_fusion/sketch/attributes/Pool2dAttributes.cpp | 91
-rw-r--r--  src/dynamic_fusion/sketch/attributes/ReshapeAttributes.cpp | 44
-rw-r--r--  src/dynamic_fusion/sketch/attributes/ResizeAttributes.cpp | 90
-rw-r--r--  src/dynamic_fusion/sketch/attributes/SoftmaxAttributes.cpp | 68
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuComponentServices.h | 54
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h | 160
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp | 71
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h | 111
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp | 368
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h | 156
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp | 73
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h | 93
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h | 127
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp | 62
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.h | 74
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp | 168
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h | 115
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuOperatorProperties.h | 54
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp | 152
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h | 107
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp | 69
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h | 134
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h | 300
-rw-r--r--  src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h | 75
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp | 105
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h | 121
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp | 139
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h | 81
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp | 68
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h | 72
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp | 73
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h | 72
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h | 140
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp | 295
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.h | 68
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp | 256
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.h | 68
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.cpp | 361
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.h | 80
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp | 427
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h | 85
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp | 434
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h | 70
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.cpp | 287
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.h | 86
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp | 405
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h | 78
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.cpp | 576
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.h | 93
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp | 144
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h | 62
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.cpp | 56
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h | 65
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.cpp | 162
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h | 103
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.cpp | 57
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h | 42
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h | 65
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h | 118
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/Types.h | 52
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.cpp | 83
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h | 119
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.cpp | 87
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h | 133
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp | 224
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h | 174
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp | 166
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h | 151
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp | 127
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h | 117
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.cpp | 148
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.h | 123
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp | 107
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h | 131
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.cpp | 69
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h | 102
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.cpp | 91
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h | 126
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp | 57
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h | 100
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/utils/type_printer/ElementwiseBinary.h | 75
-rw-r--r--  src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp | 72
-rw-r--r--  src/dynamic_fusion/sketch/gpu/operators/GpuCast.cpp | 172
-rw-r--r--  src/dynamic_fusion/sketch/gpu/operators/GpuClamp.cpp | 172
-rw-r--r--  src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp | 270
-rw-r--r--  src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp | 278
-rw-r--r--  src/dynamic_fusion/sketch/gpu/operators/GpuMatMul.cpp | 245
-rw-r--r--  src/dynamic_fusion/sketch/gpu/operators/GpuMul.cpp | 70
-rw-r--r--  src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp | 134
-rw-r--r--  src/dynamic_fusion/sketch/gpu/operators/GpuPool2d.cpp | 208
-rw-r--r--  src/dynamic_fusion/sketch/gpu/operators/GpuReshape.cpp | 163
-rw-r--r--  src/dynamic_fusion/sketch/gpu/operators/GpuResize.cpp | 195
-rw-r--r--  src/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.cpp | 164
-rw-r--r--  src/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.cpp | 200
-rw-r--r--  src/dynamic_fusion/sketch/gpu/operators/GpuSub.cpp | 71
-rw-r--r--  src/dynamic_fusion/sketch/gpu/operators/GpuTanh.cpp | 163
-rw-r--r--  src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.cpp | 200
-rw-r--r--  src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h | 117
-rw-r--r--  src/dynamic_fusion/sketch/utils/DependencyGraph.h | 648
-rw-r--r--  src/dynamic_fusion/utils/Utils.h | 83
111 files changed, 15752 insertions, 0 deletions
diff --git a/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp
new file mode 100644
index 0000000000..eab5cddd07
--- /dev/null
+++ b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ClKernelRuntime.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+
+#include "src/core/CL/CLUtils.h"
+#include "src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h"
+#include "src/gpu/cl/ClKernelLibrary.h"
+#include "support/Cast.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+using namespace arm_compute::opencl;
+
+void ClKernelRuntime::configure(const ClCompileContext &compile_ctx, const GpuKernelSourceCode &code)
+{
+ // Create kernel from kernel source string
+ opencl::ClKernelLibrary &klib = opencl::ClKernelLibrary::get();
+ _kernel = static_cast<cl::Kernel>(compile_ctx.create_kernel(
+ code.name(),
+ code.name(), // program name has to be provided to differentiate between different unfusable components' kernels.
+ // Each program contains exactly one kernel
+ code.code(), klib.kernel_path() /* Kernel path: Used in cases of embedded kernels */,
+ code.build_options().options(), false /* Is source binary */));
+
+ // Configure execution window
+ IClKernel::configure_internal(code.window());
+
+ // Set config id for lws tuning
+ _config_id = code.config_id();
+
+ // Set kernel arguments
+ _arguments = code.arguments();
+}
+
+inline void ClKernelRuntime::add_kernel_argument(unsigned int &idx,
+ const GpuKernelArgumentBinding &arg,
+ const ICLTensor *tensor,
+ std::vector<cl::Image2D> &cl_images)
+{
+ switch (arg.type())
+ {
+ case GpuKernelArgumentBinding::Type::TensorStorage:
+ {
+ switch (arg.tensor_storage_type())
+ {
+ case TensorStorageType::ClBufferUint8Ptr:
+ {
+ cl_add_buffer_argument(_kernel, idx, tensor->cl_buffer());
+ break;
+ }
+ case TensorStorageType::ClImage2dReadOnly:
+ {
+ cl::Image2D tensor_image2d = create_image2d_from_tensor(tensor, CLImage2DType::ReadOnly);
+ cl_images.push_back(tensor_image2d);
+ cl_add_texture_argument(_kernel, idx, tensor_image2d);
+ break;
+ }
+ case TensorStorageType::ClImage2dWriteOnly:
+ {
+ cl::Image2D tensor_image2d = create_image2d_from_tensor(tensor, CLImage2DType::WriteOnly);
+ cl_images.push_back(tensor_image2d);
+ cl_add_texture_argument(_kernel, idx, tensor_image2d);
+ break;
+ }
+ default:
+ {
+                ARM_COMPUTE_ERROR("Unsupported TensorStorageType");
+ break;
+ }
+ }
+ break;
+ }
+ case GpuKernelArgumentBinding::Type::TensorComponent:
+ {
+ cl_add_tensor_component_argument(_kernel, idx, tensor, arg.tensor_component_type());
+ break;
+ }
+ default:
+ {
+            ARM_COMPUTE_ERROR("Unsupported kernel argument type");
+ break;
+ }
+ }
+}
+
+void ClKernelRuntime::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_3D();
+
+    /// NOTE: These parameters are taken from the older, non-fused kernels. So far they appear to be constant,
+    /// but they may need to become another configuration passed in from GpuWorkloadSourceCode in the future.
+ constexpr bool skip_sliding_window = false;
+ constexpr bool use_dummy_work_items = false;
+
+ unsigned int idx = 0;
+ do
+ {
+ // Set kernel arguments
+ // CLImages created from tensor arguments. Need to be retained until enqueue
+ std::vector<cl::Image2D> cl_images;
+
+ for (const auto &arg : _arguments)
+ {
+ auto tensor = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(arg.id()));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(tensor->info());
+ add_kernel_argument(idx, arg, tensor, cl_images);
+ }
+
+ // Dispatch kernel
+ enqueue(queue, *this, slice, lws_hint(), use_dummy_work_items);
+ } while (skip_sliding_window && window.slide_window_slice_3D(slice));
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h
new file mode 100644
index 0000000000..148e4db581
--- /dev/null
+++ b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_RUNTIME_GPU_CL_CLKERNELRUNTIME_H
+#define ACL_SRC_DYNAMIC_FUSION_RUNTIME_GPU_CL_CLKERNELRUNTIME_H
+
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+class GpuKernelSourceCode;
+
+/** OpenCL runtime to run a single kernel */
+class ClKernelRuntime final : public opencl::IClKernel
+{
+public:
+ /** Configure the kernel runtime
+ *
+ * @param[in] compile_ctx OpenCL compile context
+ * @param[in] code Kernel source code
+ */
+ void configure(const opencl::ClCompileContext &compile_ctx, const GpuKernelSourceCode &code);
+ /** Run the kernel
+ *
+ * @param[in,out] tensors @ref ITensorPack object containing run-time tensor memories
+ * @param[in] window Execution window
+ * @param[in] queue OpenCL command queue
+ */
+ virtual void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ /** Set a kernel argument as part of a tensor
+ *
+ * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
+ * @param[in] arg Kernel argument binding, as part of @p tensor
+ * @param[in] tensor Tensor of which the kernel argument @p arg is a part of
+ * @param[out] cl_images Extra cl images created from the tensor (will need to be retained until the kernel is enqueued)
+ */
+ inline void add_kernel_argument(unsigned int &idx,
+ const GpuKernelArgumentBinding &arg,
+ const ICLTensor *tensor,
+ std::vector<cl::Image2D> &cl_images);
+
+private:
+ GpuKernelArgumentList _arguments{};
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_SRC_DYNAMIC_FUSION_RUNTIME_GPU_CL_CLKERNELRUNTIME_H
diff --git a/src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp b/src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp
new file mode 100644
index 0000000000..3500a0e60d
--- /dev/null
+++ b/src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp
@@ -0,0 +1,380 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h"
+
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+#include "src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h"
+#include "support/Cast.h"
+
+#include <algorithm>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+/** Holder of any auxiliary @ref CLTensor required by a @ref GpuWorkloadSourceCode.
+ *
+ * @note The tensors are not allocated by default, and require the user to explicitly allocate them using the associated @ref TensorInfo and @ref AuxMemoryInfo
+ *
+ * @note This data holder must remain valid until the @ref ClWorkloadRuntime that uses it is out of scope
+ */
+class ClAuxTensors
+{
+public:
+    /** A view of a single auxiliary tensor and the associated @ref TensorInfo and @ref AuxMemoryInfo
+ */
+ struct DataView
+ {
+ DataView() = default;
+ DataView(CLTensor *tensor, const TensorInfo &tensor_info, const AuxMemoryInfo &memory_info)
+ : tensor{tensor}, tensor_info{tensor_info}, memory_info{memory_info}
+ {
+ }
+ ~DataView() = default;
+ DataView(const DataView &other) = default;
+ DataView &operator=(const DataView &other) = default;
+ DataView(DataView &&other) = default;
+ DataView &operator=(DataView &&other) = default;
+ CLTensor *tensor{}; /**< Pointer to the auxiliary tensor */
+ TensorInfo tensor_info{}; /**< Associated tensor info */
+ AuxMemoryInfo memory_info{}; /**< Memory requirement */
+ };
+
+ /** Get views of all auxiliary tensors. This is mainly used for allocating the auxiliary tensors. */
+ std::vector<DataView> get_tensors()
+ {
+ return _tensors;
+ }
+ std::vector<DataView> get_tensors() const
+ {
+ return _tensors;
+ }
+
+ friend Status create_aux_tensors(ClAuxTensors *aux_tensors, const GpuWorkloadSourceCode &code);
+
+private:
+ /** Add auxiliary tensor.
+ *
+ * @param[in] tensor_info @ref ITensorInfo of the auxiliary tensor
+ * @param[in] memory_info Memory requirements of the auxiliary tensor
+ *
+ * @return CLTensor* Corresponding tensor memory if successfully added, otherwise nullptr
+ */
+ CLTensor *add_aux_tensor(const ITensorInfo &tensor_info, const AuxMemoryInfo &aux_memory_info)
+ {
+ const auto t_id = tensor_info.id();
+ auto find_tensor_pair = _owned_tensors.find(t_id);
+ if (find_tensor_pair != _owned_tensors.end())
+ {
+ return find_tensor_pair->second.get();
+ }
+ else
+ {
+ auto tensor = std::make_unique<CLTensor>();
+ auto inserted_pair = _owned_tensors.emplace(t_id, std::move(tensor)).first;
+ auto new_tensor = inserted_pair->second.get();
+ _tensors.emplace_back(new_tensor, tensor_info, aux_memory_info);
+ return new_tensor;
+ }
+ }
+
+ std::map<ITensorInfo::Id, std::unique_ptr<CLTensor>> _owned_tensors{};
+ std::vector<DataView> _tensors{};
+};
+/** Construct auxiliary tensors required by @ref GpuWorkloadSourceCode
+ *
+ * @note This is the only recommended way for users to create a @ref ClAuxTensors object
+ *
+ * @param[out] aux_tensors Auxiliary tensors required by the workload code
+ * @param[in] code @ref GpuWorkloadSourceCode which all tensors bind to
+ *
+ * @return Status
+ */
+Status create_aux_tensors(ClAuxTensors *aux_tensors, const GpuWorkloadSourceCode &code)
+{
+ for (auto t_id : code.tensors())
+ {
+ // Get tensor object
+ const auto workload_arg = code.query_tensor(t_id);
+ ICLTensor *tensor_object = nullptr;
+ if (workload_arg->memory_descriptor()->memory_type == MemoryType::Auxiliary)
+ {
+ // Create aux tensor CLTensor object
+ const TensorInfo tensor_info = *workload_arg->tensor_info();
+ ARM_COMPUTE_ERROR_ON(tensor_info.id() != t_id);
+ const auto aux_memory_info = workload_arg->memory_descriptor()->aux_memory_info;
+ tensor_object = aux_tensors->add_aux_tensor(tensor_info, aux_memory_info);
+
+ if (tensor_object == nullptr)
+ {
+ return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Failed to construct an auxiliary tensor");
+ }
+ }
+ }
+ return Status{};
+}
+
+/** A fast lookup table for retrieving runtime tensor objects
+ */
+class ClTensorLUT
+{
+public:
+ /** Find a tensor pack associated with the @ref UnitWorkloadId @p uwk_id
+ *
+ * @param[in] uwk_id @ref UnitWorkloadId associated with the tensor pack
+ *
+ * @return ITensorPack*
+ */
+ ITensorPack *find_tensor_pack(UnitWorkloadId uwk_id)
+ {
+ auto tensor_pack = _tensor_packs.find(uwk_id);
+ if (tensor_pack != _tensor_packs.end())
+ {
+ return &(tensor_pack->second);
+ }
+ return nullptr;
+ }
+    /** Get the tensor pack associated with @p uwk_id. Throws an exception if it cannot be found.
+ *
+ * @param[in] uwk_id @ref UnitWorkloadId associated with the tensor pack
+ *
+ * @return ITensorPack*
+ */
+ ITensorPack &get_tensor_pack(UnitWorkloadId uwk_id)
+ {
+ return _tensor_packs.at(uwk_id);
+ }
+
+ friend Status create_tensor_lut(ClTensorLUT *tensor_lut,
+ const GpuWorkloadSourceCode &code,
+ const std::vector<CLTensor *> &user_tensors,
+ const ClAuxTensors &aux_tensors);
+
+private:
+ /** Add a tensor pack and associate it with @ref UnitWorkloadId @p uwk_id
+ *
+ * @param[in] uwk_id @ref UnitWorkloadId associated with the tensor pack
+ * @param[in] tensor_pack Tensor pack to be added
+ */
+ void add_tensor_pack(UnitWorkloadId uwk_id, const ITensorPack &tensor_pack)
+ {
+ _tensor_packs[uwk_id] = tensor_pack;
+ }
+ std::map<UnitWorkloadId, ITensorPack> _tensor_packs{};
+};
+
+/** Create a fast tensor lookup table for runtime tensor retrieval
+ *
+ * @param[out] tensor_lut @ref ClTensorLUT used by the runtime to feed tensor memories to underlying kernels
+ * @param[in] code @ref GpuWorkloadSourceCode which all tensors bind to
+ * @param[in] user_tensors User tensors
+ * @param[in] aux_tensors Auxiliary tensors required by the workload code
+ *
+ * @return Status
+ */
+Status create_tensor_lut(ClTensorLUT *tensor_lut,
+ const GpuWorkloadSourceCode &code,
+ const std::vector<CLTensor *> &user_tensors,
+ const ClAuxTensors &aux_tensors)
+{
+ // Combine user tensors and aux tensors
+ std::map<ITensorInfo::Id, CLTensor *> tensor_map;
+ for (auto tensor : user_tensors)
+ {
+ const auto t_id = tensor->info()->id();
+
+ if (tensor_map.find(t_id) != tensor_map.end())
+ {
+ // In case of elementwise in-place: give another Id to the In/Out tensor when passed again
+ std::vector<ITensorInfo::Id> ids;
+ for (auto &t : tensor_map)
+ {
+ ids.push_back(t.first);
+ }
+ ITensorInfo::Id new_id = *std::max_element(ids.begin(), ids.end()) + 1;
+ tensor_map[new_id] = tensor;
+ }
+ else
+ {
+ tensor_map[t_id] = tensor;
+ }
+ }
+ for (const auto &data : aux_tensors.get_tensors())
+ {
+ const auto t_id = data.tensor_info.id();
+ const auto tensor = data.tensor;
+ if (tensor_map.find(t_id) != tensor_map.end())
+ {
+ return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Clashing tensor ids");
+ }
+ tensor_map[t_id] = tensor;
+ }
+
+ // Add tensor objects into corresponding tensor packs
+ for (auto id_tensor : tensor_map)
+ {
+ const auto t_id = id_tensor.first;
+ const auto tensor_object = id_tensor.second;
+ if (tensor_object == nullptr)
+ {
+ return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Trying to add a nullptr into the tensor packs");
+ }
+ if (tensor_object->allocator()->info().total_size() == 0U)
+ {
+ return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "No allocated memory found in tensor");
+ }
+
+ for (auto uwk_id : code.get_unit_workloads_from_tensor(t_id))
+ {
+ ITensorPack *tensor_pack = tensor_lut->find_tensor_pack(uwk_id);
+ if (tensor_pack == nullptr)
+ {
+ tensor_lut->add_tensor_pack(uwk_id, ITensorPack{{t_id, tensor_object}});
+ }
+ else
+ {
+ tensor_pack->add_tensor(t_id, tensor_object);
+ }
+ }
+ }
+
+ return Status{};
+}
+
+} // namespace
+
+struct ClWorkloadRuntime::Implementation
+{
+ std::map<UnitWorkloadId, std::unique_ptr<ClKernelRuntime>> _kernels{};
+ std::map<UnitWorkloadId, std::unique_ptr<ClKernelRuntime>> _kernels_prep{};
+ bool _is_configured{false};
+ bool _is_prepared{false};
+ ClTensorLUT _tensor_lut{};
+ ClAuxTensors _aux_tensors{};
+ GpuWorkloadSourceCode _source_code{};
+};
+
+ClWorkloadRuntime::ClWorkloadRuntime() : _impl{std::make_unique<Implementation>()}
+{
+}
+
+ClWorkloadRuntime::~ClWorkloadRuntime() = default;
+
+ClWorkloadRuntime::ClWorkloadRuntime(ClWorkloadRuntime &&) = default;
+
+ClWorkloadRuntime &ClWorkloadRuntime::operator=(ClWorkloadRuntime &&) = default;
+
+Status ClWorkloadRuntime::configure(const GpuWorkloadSketch &sketch)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(_impl->_is_configured, "ClWorkloadRuntime cannot be re-configured");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(sketch.gpu_context()->gpu_language() != GpuLanguage::OpenCL,
+ "ClWorkloadRuntime cannot be configured with non-OpenCL workload sketch");
+ // Generate source code
+ _impl->_source_code = sketch.implementation().generate_source_code();
+ // Configure unit workload from source code
+ for (auto uwk_id : _impl->_source_code.unit_workloads())
+ {
+ const auto work = _impl->_source_code.query_unit_workload(uwk_id);
+ const auto stage = work.stage().stage;
+ auto k = std::make_unique<ClKernelRuntime>();
+ k->configure(*sketch.gpu_context()->cl_compile_context(), work.code());
+
+ switch (stage)
+ {
+ case UnitWorkloadStage::Stage::Run:
+ {
+ _impl->_kernels.emplace(work.id(), std::move(k));
+ break;
+ }
+ case UnitWorkloadStage::Stage::Prepare:
+ {
+ _impl->_kernels_prep.emplace(work.id(), std::move(k));
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Invalid unit workload stage");
+ }
+ }
+ }
+ // Create auxiliary tensor objects
+ create_aux_tensors(&_impl->_aux_tensors, _impl->_source_code);
+ _impl->_is_configured = true;
+ return Status{};
+}
+
+void ClWorkloadRuntime::prepare()
+{
+ if (!_impl->_is_prepared)
+ {
+ for (auto &id_kernel_pair : _impl->_kernels_prep)
+ {
+ const bool flush_queue = false;
+ const auto uwk_id = id_kernel_pair.first;
+ auto kernel = id_kernel_pair.second.get();
+ CLScheduler::get().enqueue_op(*kernel, _impl->_tensor_lut.get_tensor_pack(uwk_id), flush_queue);
+ }
+
+ _impl->_is_prepared = true;
+ }
+}
+
+Status ClWorkloadRuntime::run(const std::vector<CLTensor *> &tensors)
+{
+ // Need to create the tensor lut in every run, unless the user can guarantee the binding remains fixed,
+ // in which case the lut can be cached during prepare
+ const auto st = create_tensor_lut(&_impl->_tensor_lut, _impl->_source_code, tensors, _impl->_aux_tensors);
+ ARM_COMPUTE_RETURN_ON_ERROR(st);
+ prepare();
+ for (auto &id_kernel_pair : _impl->_kernels)
+ {
+        // The command queue is not flushed between kernels (flush_queue is kept false)
+ const bool flush_queue = false;
+ const auto uwk_id = id_kernel_pair.first;
+ auto kernel = id_kernel_pair.second.get();
+ CLScheduler::get().enqueue_op(*kernel, _impl->_tensor_lut.get_tensor_pack(uwk_id), flush_queue);
+ }
+ return Status{};
+}
+
+std::vector<std::tuple<CLTensor *, TensorInfo, AuxMemoryInfo>> ClWorkloadRuntime::get_auxiliary_tensors()
+{
+ std::vector<std::tuple<CLTensor *, TensorInfo, AuxMemoryInfo>> aux_tensors;
+ for (const auto &data : _impl->_aux_tensors.get_tensors())
+ {
+ aux_tensors.emplace_back(data.tensor, data.tensor_info, data.memory_info);
+ }
+ return aux_tensors;
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
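Putting the pieces above together: configure() lowers a GpuWorkloadSketch into per-stage ClKernelRuntime objects, get_auxiliary_tensors() exposes the auxiliary CLTensors that the caller must allocate, and run() builds the tensor lookup table, runs the Prepare-stage kernels once and then enqueues the Run-stage kernels. Below is a minimal calling sketch, assuming the sketch has already been built through the operator API; the helper name run_workload and the two user tensors are illustrative and not part of this patch.

    #include "arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h"
    #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"

    using namespace arm_compute;
    using namespace arm_compute::experimental::dynamic_fusion;

    void run_workload(const GpuWorkloadSketch &sketch, CLTensor &src, CLTensor &dst)
    {
        ClWorkloadRuntime runtime;
        runtime.configure(sketch); // lower the sketch into Prepare/Run stage kernels

        // Auxiliary tensors are reported, but not allocated, by the runtime.
        for (auto &data : runtime.get_auxiliary_tensors())
        {
            CLTensor     *tensor      = std::get<0>(data);
            TensorInfo    tensor_info = std::get<1>(data);
            AuxMemoryInfo aux_mem_req = std::get<2>(data);
            tensor->allocator()->init(tensor_info, aux_mem_req.alignment);
            tensor->allocator()->allocate();
        }

        // User tensors are passed in the order their infos were added to the sketch;
        // run() rebuilds the tensor LUT, prepares once, then enqueues the Run-stage kernels.
        runtime.run({&src, &dst});
        CLScheduler::get().sync();
    }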
diff --git a/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.cpp b/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.cpp
new file mode 100644
index 0000000000..7044b0ea66
--- /dev/null
+++ b/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.cpp
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "GpuCkwKernelArgumentsHelpers.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+void cl_add_tensor_component_argument(cl::Kernel &kernel,
+ unsigned int &idx,
+ const ICLTensor *tensor,
+ TensorComponentType component)
+{
+ ARM_COMPUTE_ERROR_ON(tensor == nullptr);
+
+ const auto *info = tensor->info();
+ const auto &strides = info->strides_in_bytes();
+
+ switch (component)
+ {
+ case TensorComponentType::OffsetFirstElement:
+ kernel.setArg<cl_uint>(idx++, info->offset_first_element_in_bytes());
+ break;
+ case TensorComponentType::Stride0:
+ kernel.setArg<cl_uint>(idx++, strides[0]);
+ break;
+ case TensorComponentType::Stride1:
+ kernel.setArg<cl_uint>(idx++, strides[1]);
+ break;
+ case TensorComponentType::Stride2:
+ kernel.setArg<cl_uint>(idx++, strides[2]);
+ break;
+ case TensorComponentType::Stride3:
+ kernel.setArg<cl_uint>(idx++, strides[3]);
+ break;
+ case TensorComponentType::Stride4:
+ kernel.setArg<cl_uint>(idx++, strides[4]);
+ break;
+ case TensorComponentType::Dim0:
+ kernel.setArg<cl_uint>(idx++, info->dimension(0));
+ break;
+ case TensorComponentType::Dim1:
+ kernel.setArg<cl_uint>(idx++, info->dimension(1));
+ break;
+ case TensorComponentType::Dim2:
+ kernel.setArg<cl_uint>(idx++, info->dimension(2));
+ break;
+ case TensorComponentType::Dim3:
+ kernel.setArg<cl_uint>(idx++, info->dimension(3));
+ break;
+ case TensorComponentType::Dim4:
+ kernel.setArg<cl_uint>(idx++, info->dimension(4));
+ break;
+ case TensorComponentType::Dim1xDim2:
+ kernel.setArg<cl_uint>(idx++, info->dimension(1) * info->dimension(2));
+ break;
+ case TensorComponentType::Dim2xDim3:
+ kernel.setArg<cl_uint>(idx++, info->dimension(2) * info->dimension(3));
+ break;
+ case TensorComponentType::Dim1xDim2xDim3:
+ kernel.setArg<cl_uint>(idx++, info->dimension(1) * info->dimension(2) * info->dimension(3));
+ break;
+ case TensorComponentType::Unknown:
+ default:
+ ARM_COMPUTE_ERROR("Unknown tensor component");
+ }
+}
+
+void cl_add_buffer_argument(cl::Kernel &kernel, unsigned int &idx, const cl::Buffer &buffer)
+{
+ kernel.setArg(idx++, buffer);
+}
+
+void cl_add_texture_argument(cl::Kernel &kernel, unsigned int &idx, const cl::Image &image)
+{
+ kernel.setArg(idx++, image);
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h b/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h
new file mode 100644
index 0000000000..306d547acb
--- /dev/null
+++ b/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_SRC_DYNAMIC_FUSION_RUNTIME_GPU_CL_CKW_DRIVER_GPUCKWKERNELARGUMENTSHELPERS
+#define ACL_SRC_DYNAMIC_FUSION_RUNTIME_GPU_CL_CKW_DRIVER_GPUCKWKERNELARGUMENTSHELPERS
+
+#include "arm_compute/core/CL/ICLTensor.h"
+
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Select a Compute Kernel Writer tensor component from a tensor and add it to the kernel's arguments at the specified index @p idx.
+ *
+ * @param[in,out] kernel OpenCL kernel to configure with the provided argument.
+ * @param[in,out] idx Index at which to add the argument.
+ * @param[in] tensor Tensor from which to access the tensor component.
+ * @param[in] component Tensor component to select such as tensor dimensions, strides, etc.
+ */
+void cl_add_tensor_component_argument(cl::Kernel &kernel,
+ unsigned int &idx,
+ const ICLTensor *tensor,
+ TensorComponentType component);
+
+/** Add an OpenCL buffer object to the kernel's arguments at the specified index @p idx.
+ *
+ * @param[in,out] kernel OpenCL kernel to configure with the provided argument.
+ * @param[in,out] idx Index at which to add the argument.
+ * @param[in] buffer OpenCL buffer containing the tensor's data.
+ */
+void cl_add_buffer_argument(cl::Kernel &kernel, unsigned int &idx, const cl::Buffer &buffer);
+
+/** Add an OpenCL image object to the kernel's arguments at the specified index @p idx.
+ *
+ * @param[in,out] kernel OpenCL kernel to configure with the provided argument.
+ * @param[in,out] idx Index at which to add the argument.
+ * @param[in]     image  OpenCL image object containing the tensor's data.
+ */
+void cl_add_texture_argument(cl::Kernel &kernel, unsigned int &idx, const cl::Image &image);
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif /* ACL_SRC_DYNAMIC_FUSION_RUNTIME_GPU_CL_CKW_DRIVER_GPUCKWKERNELARGUMENTSHELPERS */
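These helpers are the bridge used by ClKernelRuntime::add_kernel_argument (earlier in this patch) to turn a GpuKernelArgumentBinding into concrete clSetKernelArg calls: the buffer and texture overloads pass the tensor storage, while the component overload passes individual strides and dimensions as cl_uint values. A small illustrative sketch follows; the argument order here is arbitrary, since in practice it is dictated by the generated kernel's GpuKernelArgumentList.

    #include "src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h"

    using namespace arm_compute;
    using namespace arm_compute::experimental::dynamic_fusion;

    // Bind one tensor as a buffer followed by two of its components (illustrative order only).
    void bind_tensor_example(cl::Kernel &kernel, unsigned int &idx, const ICLTensor *tensor)
    {
        cl_add_buffer_argument(kernel, idx, tensor->cl_buffer());
        cl_add_tensor_component_argument(kernel, idx, tensor, TensorComponentType::Dim0);
        cl_add_tensor_component_argument(kernel, idx, tensor, TensorComponentType::Stride1);
    }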
diff --git a/src/dynamic_fusion/sketch/ArgumentPack.h b/src/dynamic_fusion/sketch/ArgumentPack.h
new file mode 100644
index 0000000000..d030bc3d45
--- /dev/null
+++ b/src/dynamic_fusion/sketch/ArgumentPack.h
@@ -0,0 +1,237 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_ARGUMENTPACK_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_ARGUMENTPACK_H
+
+#include "arm_compute/core/experimental/Types.h"
+
+#include <unordered_map>
+#include <vector>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** This is a generic class that packs the arguments of an operator. For now, it is only used for tensor-related types.
+ * Examples of "tensor-related types": @ref ITensorInfo, @ref ITensor, @ref ICLTensor
+ *
+ * The argument id is the position of the argument within the pack, and is represented by @ref TensorType
+ *
+ * @tparam T Tensor-related type
+ */
+template <typename T>
+class ArgumentPack
+{
+public:
+ /** @ref arm_compute::TensorType encodes the position of a tensor argument within the pack */
+ using Id = TensorType;
+ /** A single argument element within the pack
+     * It contains either a const pointer or a non-const pointer to the Tensor-related type T, but never both at the same time
+ */
+ struct PackElement
+ {
+ PackElement() = default;
+ PackElement(const PackElement &elem) = default;
+ PackElement &operator=(const PackElement &elem) = default;
+ PackElement(PackElement &&elem) = default;
+ PackElement &operator=(PackElement &&elem) = default;
+ PackElement(Id id, T *tensor) : id(id), tensor(tensor), ctensor(nullptr)
+ {
+ }
+ PackElement(Id id, const T *ctensor) : id(id), tensor(nullptr), ctensor(ctensor)
+ {
+ }
+
+ Id id{ACL_UNKNOWN}; /**< Argument id within the pack */
+ T *tensor{nullptr}; /**< Non-const pointer to tensor-related object */
+ const T *ctensor{nullptr}; /**< Const pointer to tensor-related object */
+ };
+
+public:
+ /** Default constructor */
+ ArgumentPack() = default;
+ /** Destructor */
+ ~ArgumentPack() = default;
+ /** Allow instances of this class to be copy constructed */
+ ArgumentPack<T>(const ArgumentPack<T> &other) = default;
+ /** Allow instances of this class to be copied */
+ ArgumentPack<T> &operator=(const ArgumentPack<T> &other) = default;
+ /** Allow instances of this class to be move constructed */
+ ArgumentPack<T>(ArgumentPack<T> &&other) = default;
+ /** Allow instances of this class to be moved */
+ ArgumentPack<T> &operator=(ArgumentPack<T> &&other) = default;
+ /** Initializer list Constructor */
+ ArgumentPack(const std::initializer_list<PackElement> &l) : _pack{}
+ {
+ for (const auto &e : l)
+ {
+ _pack[e.id] = e;
+ }
+ }
+ /** Add tensor to the pack
+ *
+ * @param[in] id ID of the tensor to add
+ * @param[in] tensor Tensor to add
+ */
+ void add_tensor(Id id, T *tensor)
+ {
+ _pack[id] = PackElement(id, tensor);
+ }
+ /** Add const tensor to the pack
+ *
+ * @param[in] id ID of the tensor to add
+ * @param[in] tensor Tensor to add
+ */
+ void add_const_tensor(Id id, const T *tensor)
+ {
+ _pack[id] = PackElement(id, tensor);
+ }
+ /** Get tensor of a given id from the pack
+ *
+ * @param[in] id ID of tensor to extract
+ *
+     * @return Pointer to the tensor if it exists and is non-const, otherwise nullptr
+ */
+ T *get_tensor(Id id)
+ {
+ auto it = _pack.find(id);
+ return it != _pack.end() ? it->second.tensor : nullptr;
+ }
+ /** Get constant tensor of a given id
+ *
+ * @param[in] id ID of tensor to extract
+ *
+     * @return Pointer to the tensor (const or non-const) if it exists, otherwise nullptr
+ */
+ const T *get_const_tensor(Id id) const
+ {
+ auto it = _pack.find(id);
+ if (it != _pack.end())
+ {
+ return it->second.ctensor != nullptr ? it->second.ctensor : it->second.tensor;
+ }
+ return nullptr;
+ }
+ /** Remove the tensor stored with the given id
+ *
+ * @param[in] id ID of tensor to remove
+ */
+ void remove_tensor(Id id)
+ {
+ _pack.erase(id);
+ }
+ /** Pack size accessor
+ *
+ * @return Number of tensors registered to the pack
+ */
+ size_t size() const
+ {
+ return _pack.size();
+ }
+ /** Checks if pack is empty
+ *
+ * @return True if empty else false
+ */
+ bool empty() const
+ {
+ return _pack.empty();
+ }
+ /** Get the ACL_SRC_* tensors
+ *
+ * @return std::vector<T *>
+ */
+ std::vector<T *> get_src_tensors()
+ {
+ std::vector<T *> src_tensors{};
+ for (int id = static_cast<int>(TensorType::ACL_SRC); id <= static_cast<int>(TensorType::ACL_SRC_END); ++id)
+ {
+ auto tensor = get_tensor(static_cast<TensorType>(id));
+ if (tensor != nullptr)
+ {
+ src_tensors.push_back(tensor);
+ }
+ }
+ return src_tensors;
+ }
+ /** Get the const ACL_SRC_* tensors
+ *
+ * @return std::vector<const T *>
+ */
+ std::vector<const T *> get_const_src_tensors() const
+ {
+ std::vector<const T *> src_tensors{};
+ for (int id = static_cast<int>(TensorType::ACL_SRC); id <= static_cast<int>(TensorType::ACL_SRC_END); ++id)
+ {
+ auto tensor = get_const_tensor(static_cast<TensorType>(id));
+ if (tensor != nullptr)
+ {
+ src_tensors.push_back(tensor);
+ }
+ }
+ return src_tensors;
+ }
+ /** Get the ACL_DST_* tensors
+ *
+ * @return std::vector<T *>
+ */
+ std::vector<T *> get_dst_tensors()
+ {
+ std::vector<T *> dst_tensors{};
+ for (int id = static_cast<int>(TensorType::ACL_DST); id <= static_cast<int>(TensorType::ACL_DST_END); ++id)
+ {
+ auto tensor = get_tensor(static_cast<TensorType>(id));
+ if (tensor != nullptr)
+ {
+ dst_tensors.push_back(tensor);
+ }
+ }
+ return dst_tensors;
+ }
+ /** Get the const ACL_DST_* tensors
+ *
+ * @return std::vector<const T *>
+ */
+ std::vector<const T *> get_const_dst_tensors() const
+ {
+ std::vector<const T *> dst_tensors{};
+ for (int id = static_cast<int>(TensorType::ACL_DST); id <= static_cast<int>(TensorType::ACL_DST_END); ++id)
+ {
+ auto tensor = get_const_tensor(static_cast<TensorType>(id));
+ if (tensor != nullptr)
+ {
+ dst_tensors.push_back(tensor);
+ }
+ }
+ return dst_tensors;
+ }
+
+private:
+ std::unordered_map<int, PackElement> _pack{}; /**< Container with the packed tensors */
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_ARGUMENTPACK_H
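As an illustration of how an operator might populate and query the pack, consider the sketch below; the tensor infos and the ids assigned to them are made up for this example, while in the real code the packs are built when operators are added to a sketch.

    #include "arm_compute/core/TensorInfo.h"

    #include "src/dynamic_fusion/sketch/ArgumentPack.h"

    using namespace arm_compute;
    using namespace arm_compute::experimental::dynamic_fusion;

    void argument_pack_example()
    {
        TensorInfo src_info{}, wei_info{}, dst_info{}; // contents are irrelevant for this sketch

        // The argument id is the TensorType position within the pack.
        ArgumentPack<ITensorInfo> pack{{ACL_SRC_0, &src_info}, {ACL_SRC_1, &wei_info}, {ACL_DST_0, &dst_info}};

        const ITensorInfo *src = pack.get_const_tensor(ACL_SRC_0); // falls back to the non-const pointer
        std::vector<ITensorInfo *> dsts = pack.get_dst_tensors();  // every ACL_DST_* entry added so far
        (void)src;
        (void)dsts;
    }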
diff --git a/src/dynamic_fusion/sketch/attributes/CastAttributes.cpp b/src/dynamic_fusion/sketch/attributes/CastAttributes.cpp
new file mode 100644
index 0000000000..4ad94268f4
--- /dev/null
+++ b/src/dynamic_fusion/sketch/attributes/CastAttributes.cpp
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/dynamic_fusion/sketch/attributes/CastAttributes.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+CastAttributes &CastAttributes::data_type(const DataType &data_type)
+{
+ _data_type = data_type;
+ return *this;
+}
+
+DataType CastAttributes::data_type() const
+{
+ return _data_type;
+}
+
+CastAttributes &CastAttributes::convert_policy(const ConvertPolicy &convert_policy)
+{
+ _convert_policy = convert_policy;
+ return *this;
+}
+
+ConvertPolicy CastAttributes::convert_policy() const
+{
+ return _convert_policy;
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/attributes/ClampAttributes.cpp b/src/dynamic_fusion/sketch/attributes/ClampAttributes.cpp
new file mode 100644
index 0000000000..b177f760df
--- /dev/null
+++ b/src/dynamic_fusion/sketch/attributes/ClampAttributes.cpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/dynamic_fusion/sketch/attributes/ClampAttributes.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+
+ClampAttributes &ClampAttributes::min_val(const float &min_val)
+{
+ _min_val = min_val;
+ return *this;
+}
+
+float ClampAttributes::min_val() const
+{
+ return _min_val;
+}
+
+ClampAttributes &ClampAttributes::max_val(const float &max_val)
+{
+ _max_val = max_val;
+ return *this;
+}
+
+float ClampAttributes::max_val() const
+{
+ return _max_val;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/attributes/Conv2dAttributes.cpp b/src/dynamic_fusion/sketch/attributes/Conv2dAttributes.cpp
new file mode 100644
index 0000000000..97e74f742d
--- /dev/null
+++ b/src/dynamic_fusion/sketch/attributes/Conv2dAttributes.cpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+Conv2dAttributes &Conv2dAttributes::pad(const Padding2D &pad)
+{
+ _pad = pad;
+ return *this;
+}
+Padding2D Conv2dAttributes::pad() const
+{
+ return _pad;
+}
+Conv2dAttributes &Conv2dAttributes::stride(const Size2D &stride)
+{
+ _stride = stride;
+ return *this;
+}
+Size2D Conv2dAttributes::stride() const
+{
+ return _stride;
+}
+Conv2dAttributes &Conv2dAttributes::dilation(const Size2D &dilation)
+{
+ _dilation = dilation;
+ return *this;
+}
+Size2D Conv2dAttributes::dilation() const
+{
+ return _dilation;
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
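All of these attribute classes follow the same fluent pattern: each setter stores the value and returns the object, so an operator's attributes can be built up in a single chained expression. A short sketch using Conv2dAttributes is shown below; the padding, stride and dilation values are arbitrary example values.

    #include "arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h"

    using namespace arm_compute;
    using namespace arm_compute::experimental::dynamic_fusion;

    Conv2dAttributes make_example_conv2d_attributes()
    {
        // Setters return *this, so the attributes can be chained in one expression.
        return Conv2dAttributes{}
            .pad(Padding2D{1, 1, 1, 1}) // left, right, top, bottom (example values)
            .stride(Size2D{1, 1})
            .dilation(Size2D{1, 1});
    }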
diff --git a/src/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.cpp b/src/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.cpp
new file mode 100644
index 0000000000..6f3816568c
--- /dev/null
+++ b/src/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.cpp
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+DepthwiseConv2dAttributes &DepthwiseConv2dAttributes::pad(const Padding2D &pad)
+{
+ _pad = pad;
+ return *this;
+}
+Padding2D DepthwiseConv2dAttributes::pad() const
+{
+ return _pad;
+}
+DepthwiseConv2dAttributes &DepthwiseConv2dAttributes::stride(const Size2D &stride)
+{
+ _stride = stride;
+ return *this;
+}
+Size2D DepthwiseConv2dAttributes::stride() const
+{
+ return _stride;
+}
+DepthwiseConv2dAttributes &DepthwiseConv2dAttributes::dilation(const Size2D &dilation)
+{
+ _dilation = dilation;
+ return *this;
+}
+Size2D DepthwiseConv2dAttributes::dilation() const
+{
+ return _dilation;
+}
+
+DepthwiseConv2dAttributes &DepthwiseConv2dAttributes::depth_multiplier(const uint32_t &depth_multiplier)
+{
+ _depth_multiplier = depth_multiplier;
+ return *this;
+}
+
+uint32_t DepthwiseConv2dAttributes::depth_multiplier() const
+{
+ return _depth_multiplier;
+}
+
+DepthwiseConv2dAttributes &
+DepthwiseConv2dAttributes::dimension_rounding_type(const DimensionRoundingType &dimension_rounding_type)
+{
+ _dimension_rounding_type = dimension_rounding_type;
+ return *this;
+}
+
+DimensionRoundingType DepthwiseConv2dAttributes::dimension_rounding_type() const
+{
+ return _dimension_rounding_type;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/attributes/MatMulAttributes.cpp b/src/dynamic_fusion/sketch/attributes/MatMulAttributes.cpp
new file mode 100644
index 0000000000..027b550377
--- /dev/null
+++ b/src/dynamic_fusion/sketch/attributes/MatMulAttributes.cpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/dynamic_fusion/sketch/attributes/MatMulAttributes.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+MatMulAttributes MatMulAttributes::adj_lhs(bool adj_lhs)
+{
+ _adj_lhs = adj_lhs;
+ return *this;
+}
+MatMulAttributes MatMulAttributes::adj_rhs(bool adj_rhs)
+{
+ _adj_rhs = adj_rhs;
+ return *this;
+}
+bool MatMulAttributes::adj_lhs() const
+{
+ return _adj_lhs;
+}
+bool MatMulAttributes::adj_rhs() const
+{
+ return _adj_rhs;
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/attributes/Pool2dAttributes.cpp b/src/dynamic_fusion/sketch/attributes/Pool2dAttributes.cpp
new file mode 100644
index 0000000000..80f65f926a
--- /dev/null
+++ b/src/dynamic_fusion/sketch/attributes/Pool2dAttributes.cpp
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h"
+
+#include "arm_compute/core/Size2D.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+PoolingType Pool2dAttributes::pool_type() const
+{
+ return _pool_type;
+}
+
+Pool2dAttributes Pool2dAttributes::pool_type(PoolingType pool_type)
+{
+ _pool_type = pool_type;
+ return *this;
+}
+
+Padding2D Pool2dAttributes::pad() const
+{
+ return _pad;
+}
+
+Pool2dAttributes Pool2dAttributes::pad(const Padding2D &pad)
+{
+ _pad = pad;
+ return *this;
+}
+
+Size2D Pool2dAttributes::pool_size() const
+{
+ return _pool_size;
+}
+
+Pool2dAttributes Pool2dAttributes::pool_size(const Size2D &pool_size)
+{
+ _pool_size = pool_size;
+ return *this;
+}
+
+Size2D Pool2dAttributes::stride() const
+{
+ return _stride;
+}
+
+Pool2dAttributes Pool2dAttributes::stride(const Size2D &stride)
+{
+ _stride = stride;
+ return *this;
+}
+
+bool Pool2dAttributes::exclude_padding() const
+{
+ return _exclude_padding;
+}
+
+Pool2dAttributes Pool2dAttributes::exclude_padding(bool exclude_padding)
+{
+ _exclude_padding = exclude_padding;
+ return *this;
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/attributes/ReshapeAttributes.cpp b/src/dynamic_fusion/sketch/attributes/ReshapeAttributes.cpp
new file mode 100644
index 0000000000..0938c0df84
--- /dev/null
+++ b/src/dynamic_fusion/sketch/attributes/ReshapeAttributes.cpp
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/dynamic_fusion/sketch/attributes/ReshapeAttributes.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+ReshapeAttributes &ReshapeAttributes::shape(const TensorShape &shape)
+{
+ _shape = shape;
+ return *this;
+}
+TensorShape ReshapeAttributes::shape() const
+{
+ return _shape;
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/attributes/ResizeAttributes.cpp b/src/dynamic_fusion/sketch/attributes/ResizeAttributes.cpp
new file mode 100644
index 0000000000..1919dbc72d
--- /dev/null
+++ b/src/dynamic_fusion/sketch/attributes/ResizeAttributes.cpp
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/dynamic_fusion/sketch/attributes/ResizeAttributes.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+ResizeAttributes &ResizeAttributes::output_width(int32_t output_width)
+{
+ _output_width = output_width;
+ return *this;
+}
+
+int32_t ResizeAttributes::output_width() const
+{
+ return _output_width;
+}
+
+ResizeAttributes &ResizeAttributes::output_height(int32_t output_height)
+{
+ _output_height = output_height;
+ return *this;
+}
+
+int32_t ResizeAttributes::output_height() const
+{
+ return _output_height;
+}
+
+ResizeAttributes &ResizeAttributes::interpolation_policy(InterpolationPolicy interpolation_policy)
+{
+ _interpolation_policy = interpolation_policy;
+ return *this;
+}
+
+InterpolationPolicy ResizeAttributes::interpolation_policy() const
+{
+ return _interpolation_policy;
+}
+
+ResizeAttributes &ResizeAttributes::sampling_policy(SamplingPolicy sampling_policy)
+{
+ _sampling_policy = sampling_policy;
+ return *this;
+}
+
+SamplingPolicy ResizeAttributes::sampling_policy() const
+{
+ return _sampling_policy;
+}
+
+ResizeAttributes &ResizeAttributes::align_corners(bool align_corners)
+{
+ _align_corners = align_corners;
+ return *this;
+}
+
+bool ResizeAttributes::align_corners() const
+{
+ return _align_corners;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/attributes/SoftmaxAttributes.cpp b/src/dynamic_fusion/sketch/attributes/SoftmaxAttributes.cpp
new file mode 100644
index 0000000000..5d4d666263
--- /dev/null
+++ b/src/dynamic_fusion/sketch/attributes/SoftmaxAttributes.cpp
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/dynamic_fusion/sketch/attributes/SoftmaxAttributes.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+SoftmaxAttributes &SoftmaxAttributes::beta(float beta)
+{
+ _beta = beta;
+ return *this;
+}
+
+float SoftmaxAttributes::beta() const
+{
+ return _beta;
+}
+
+SoftmaxAttributes &SoftmaxAttributes::is_log_softmax(bool is_log_softmax)
+{
+ _is_log_softmax = is_log_softmax;
+ return *this;
+}
+
+bool SoftmaxAttributes::is_log_softmax() const
+{
+ return _is_log_softmax;
+}
+
+SoftmaxAttributes &SoftmaxAttributes::axis(int axis)
+{
+ _axis = axis;
+ return *this;
+}
+
+int SoftmaxAttributes::axis() const
+{
+ return _axis;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/GpuComponentServices.h b/src/dynamic_fusion/sketch/gpu/GpuComponentServices.h
new file mode 100644
index 0000000000..93881508bb
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuComponentServices.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTSERVICES
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTSERVICES
+
+#include "src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Services that are used throughout the creation phase of workload code
+ */
+class GpuComponentServices
+{
+public:
+ /** Default constructor */
+ GpuComponentServices() = default;
+ /** Get reference to component factory */
+ GpuKernelComponentFactory &component_factory()
+ {
+ return _comp_factory;
+ }
+
+private:
+ GpuKernelComponentFactory _comp_factory{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTSERVICES */
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h b/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h
new file mode 100644
index 0000000000..c923bf9c16
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELARGUMENT_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELARGUMENT_H
+
+#include "arm_compute/core/TensorInfo.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Describe how the tensor runtime memory can be accessed
+ *
+ * Please see documentation under @ref GpuKernelArgumentBinding
+ */
+enum class TensorStorageType
+{
+ Unknown,
+ ClBufferUint8Ptr,
+ ClImage2dReadOnly,
+ ClImage2dWriteOnly,
+};
+
+/** Describe additional runtime information about the tensor
+ *
+ * Please see documentation under @ref GpuKernelArgumentBinding
+ */
+enum class TensorComponentType
+{
+ Unknown,
+ OffsetFirstElement,
+ Stride0,
+ Stride1,
+ Stride2,
+ Stride3,
+ Stride4,
+ Dim0,
+ Dim1,
+ Dim2,
+ Dim3,
+ Dim4,
+ Dim1xDim2,
+ Dim2xDim3,
+ Dim1xDim2xDim3,
+};
+
+/** Describe how to extract information from a runtime Gpu tensor, and set it as an argument to a gpu kernel at runtime
+ *
+ * A kernel argument is just an argument to the gpu kernel as shown in the argument list below. This contrasts with a "workload argument" which is a tensor (@ref GpuWorkloadArgument)
+ * void kernel(arg0, arg1, ... argN)
+ *
+ * In a kernel generated using dynamic fusion (@ref GpuKernelSourceCode), every kernel argument describes part of a tensor.
+ * A tensor is described as: **storages** followed by **components**
+ *
+ * A storage (@ref TensorStorageType) describes how the tensor runtime memory can be accessed (e.g. via a global uint8 pointer to a CL buffer)
+ * A component (@ref TensorComponentType) describes additional runtime information about the tensor (e.g. the dimensions of the tensor)
+ *
+ * The arguments are arranged in the order of use in the generated kernel code:
+ *
+ *   arg0   , arg1      , arg2      , ...            , argN
+ *   storage, component0, component1, ..., componentX, storage, component0, component1, ..., componentY
+ *   |                   tensor0                     |                    tensor1                      |
+ *
+ * An example argument list:
+ *
+ * void kernel(
+ * image2d_t t0_image, // TensorStorageType::ClImage2dReadOnly
+ * uint8_t* t0_ptr, // TensorStorageType::ClBufferUint8Ptr
+ * uint t0_dim0, // TensorComponentType::Dim0
+ * uint t0_stride1, // TensorComponentType::Stride1
+ * image2d_t t1_ptr, // TensorStorageType::ClImage2dReadOnly
+ * uint t1_dim1xdim2, // TensorComponentType::Dim1xDim2
+ * uint t1_stride1, // TensorComponentType::Stride1
+ * uint t1_stride2, // TensorComponentType::Stride2
+ * )
+ *
+ */
+class GpuKernelArgumentBinding
+{
+public:
+ enum class Type : int32_t
+ {
+ TensorStorage, /** @ref TensorStorageType */
+ TensorComponent /** @ref TensorComponentType */
+ };
+ GpuKernelArgumentBinding(ITensorInfo::Id id, TensorStorageType storage)
+ : _type{Type::TensorStorage}, _id{id}, _value{}
+ {
+ _value.tensor_storage_type = storage;
+ }
+ GpuKernelArgumentBinding(ITensorInfo::Id id, TensorComponentType component)
+ : _type{Type::TensorComponent}, _id{id}, _value{}
+ {
+ _value.tensor_component_type = component;
+ }
+ /** Storage type of the tensor
+ */
+ TensorStorageType tensor_storage_type() const
+ {
+ ARM_COMPUTE_ERROR_ON(_type != Type::TensorStorage);
+ return _value.tensor_storage_type;
+ }
+ /** Component of the tensor
+ */
+ TensorComponentType tensor_component_type() const
+ {
+ ARM_COMPUTE_ERROR_ON(_type != Type::TensorComponent);
+ return _value.tensor_component_type;
+ }
+ /** Id of the tensor this kernel argument belongs to
+ */
+ ITensorInfo::Id id() const
+ {
+ return _id;
+ }
+ /** Type of the kernel argument
+ */
+ Type type() const
+ {
+ return _type;
+ }
+
+private:
+ Type _type;
+ ITensorInfo::Id _id;
+ union Value
+ {
+ TensorStorageType tensor_storage_type;
+ TensorComponentType tensor_component_type;
+ };
+ Value _value;
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELARGUMENT_H
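The argument ordering documented above (one storage binding followed by that tensor's component bindings, repeated per tensor) maps directly onto a sequence of GpuKernelArgumentBinding objects. A minimal sketch, assuming tensor ids 0 and 1 and an arbitrary pick of components; the GpuKernelArgumentList alias over std::deque used for this purpose is introduced later in GpuKernelSourceCode.h:

    std::deque<GpuKernelArgumentBinding> args{};
    // Tensor 0: its storage binding comes first, then its runtime components.
    args.emplace_back(0, TensorStorageType::ClBufferUint8Ptr);
    args.emplace_back(0, TensorComponentType::Dim0);
    args.emplace_back(0, TensorComponentType::Stride1);
    // Tensor 1: same layout, different storage and components.
    args.emplace_back(1, TensorStorageType::ClImage2dReadOnly);
    args.emplace_back(1, TensorComponentType::Dim1xDim2);
    args.emplace_back(1, TensorComponentType::Stride1);
    args.emplace_back(1, TensorComponentType::Stride2);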
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp
new file mode 100644
index 0000000000..1a458c9862
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "GpuKernelComponentGraph.h"
+
+#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+std::vector<DependencyGraph::TensorId>
+GpuKernelComponentGraph::get_tensor_ids(const std::vector<const ITensorInfo *> tensors)
+{
+ std::vector<DependencyGraph::TensorId> tensor_ids{};
+ std::transform(std::begin(tensors), std::end(tensors), std::back_inserter(tensor_ids),
+ [](const auto &t) { return t->id(); });
+ return tensor_ids;
+}
+
+GpuKernelComponentGraph::GpuKernelComponentGraph(GpuWorkloadContext *context, GpuComponentServices *services)
+ : _context{context}, _services{services}, _components{}, _tensors{}, _dependency_graph{}
+{
+}
+
+GpuKernelComponentStream GpuKernelComponentGraph::fuse(const MemoryDescriptorMap &mem_map) const
+{
+ GpuKernelComponentStream stream{_context, _services, mem_map};
+ const auto op_seq = _dependency_graph.build_operators_sequence();
+
+ stream.new_component_group();
+ for (auto op : op_seq)
+ {
+ const auto component = _components.at(op.op).get();
+ const auto success = stream.add_component(component);
+ if (!success) // Assume first failure was because the root component is unfusable
+ {
+ stream.new_component_group();
+ const auto success = stream.add_component(component);
+ ARM_COMPUTE_ERROR_ON(!success);
+ ARM_COMPUTE_UNUSED(success);
+ }
+ }
+
+ return stream;
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h
new file mode 100644
index 0000000000..6f871a3c90
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGRAPH
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGRAPH
+
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuComponentServices.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h"
+#include "src/dynamic_fusion/sketch/utils/DependencyGraph.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class IGpuKernelComponent;
+
+/** A multi-input (tensors), multi-output (tensors) acyclic directed graph of gpu kernel components
+ * Its main purposes are:
+ * - Perform "graph-level" optimizations like fusion of kernel components (not the fusion of operators)
+ * - Automatically assign memory descriptions @ref MemoryDescriptor of all tensors based on graph topology
+ */
+class GpuKernelComponentGraph
+{
+public:
+ /** Constructor
+ *
+ * @param[in] context @ref GpuWorkloadContext to be used by the graph
+ * @param[in] services @ref GpuComponentServices to be used by the graph
+ */
+ GpuKernelComponentGraph(GpuWorkloadContext *context, GpuComponentServices *services);
+ /** Prevent instances of this class from being copy constructed */
+ GpuKernelComponentGraph(const GpuKernelComponentGraph &graph) = delete;
+ /** Prevent instances of this class from being copied */
+ GpuKernelComponentGraph &operator=(const GpuKernelComponentGraph &graph) = delete;
+ /** Allow instances of this class to be move constructed */
+ GpuKernelComponentGraph(GpuKernelComponentGraph &&graph) = default;
+ /** Allow instances of this class to be moved */
+ GpuKernelComponentGraph &operator=(GpuKernelComponentGraph &&graph) = default;
+ /** Create a new component and add it to the component graph
+ * Component id is automatically allocated
+ *
+ * @tparam T Component type
+ * @tparam Args Component argument types
+ *
+ * @param[in] args Component arguments except for component id, which is auto-allocated
+ */
+ template <typename T, typename... Args>
+ void add_new_component(Args &&...args)
+ {
+ auto comp = _services->component_factory().create<T>(std::forward<Args>(args)...);
+ ArgumentPack<ITensorInfo> tensors = comp->tensors();
+ const auto src_tensor_ids = get_tensor_ids(tensors.get_const_src_tensors());
+ const auto dst_tensor_ids = get_tensor_ids(tensors.get_const_dst_tensors());
+ bool success = _dependency_graph.add_operator(comp->id(), src_tensor_ids, dst_tensor_ids);
+ ARM_COMPUTE_UNUSED(success);
+ ARM_COMPUTE_ERROR_ON(!success);
+ _components[comp->id()] = std::move(comp);
+ for (auto t : tensors.get_const_src_tensors())
+ {
+ _tensors[t->id()] = t;
+ }
+ for (auto t : tensors.get_const_dst_tensors())
+ {
+ _tensors[t->id()] = t;
+ }
+ }
+ /** Perform component fusion and serialize the graph into a stream of component groups
+ *
+ * @param[in] mem_map MemoryDescriptorMap for all the tensors in the component graph
+ *
+ * @return GpuKernelComponentStream
+ */
+ GpuKernelComponentStream fuse(const MemoryDescriptorMap &mem_map) const;
+
+private:
+ static std::vector<DependencyGraph::TensorId> get_tensor_ids(const std::vector<const ITensorInfo *> tensors);
+ GpuWorkloadContext *_context;
+ GpuComponentServices *_services;
+ std::map<ComponentId, std::unique_ptr<IGpuKernelComponent>> _components;
+ std::map<ITensorInfo::Id, const ITensorInfo *> _tensors;
+ DependencyGraph _dependency_graph{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGRAPH */
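add_new_component() and fuse() together define the intended call sequence for this class. A sketch of that flow, under the assumption that a GpuWorkloadContext, a GpuComponentServices instance, suitable component construction arguments and a MemoryDescriptorMap already exist (ClComponentStore is only one possible component type):

    GpuKernelComponentGraph graph(&context, &services);
    graph.add_new_component<ClComponentStore>(properties, tensors); // component id is allocated by the factory
    GpuKernelComponentStream stream  = graph.fuse(mem_map);         // fusion + serialization into component groups
    GpuWorkloadSourceCode  src_code  = stream.write_workload_code();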
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp
new file mode 100644
index 0000000000..5a6d125d96
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp
@@ -0,0 +1,368 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "GpuKernelComponentGroup.h"
+
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+
+#include <algorithm>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+bool GpuKernelComponentGroup::add_component(ComponentPtr component)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_finalized, "The component group has been finalized and cannot be altered.");
+
+ // note: Constraint 1 is guaranteed as a precondition
+ // Constraint 2
+ if (component->type() != GpuComponentType::Output && _components.size() >= max_fused_components)
+ {
+ return false;
+ }
+ // Constraint 3.1: Pattern: (Unfusable + Output)
+ if (!_components.empty() && get_root_component()->type() == GpuComponentType::Unfusable &&
+ component->type() != GpuComponentType::Output)
+ {
+ return false;
+ }
+ // Constraint 3.2
+ if (!_components.empty() &&
+ (component->type() != GpuComponentType::Simple && component->type() != GpuComponentType::Output))
+ {
+ return false;
+ }
+ // Constraint 4
+ if (component->type() != GpuComponentType::Unfusable && component->tensors().get_const_dst_tensors().size() != 1U)
+ {
+ return false;
+ }
+ // Constraint 5
+ if (!_components.empty() && !(get_root_component()->properties() == component->properties()))
+ {
+ return false;
+ }
+ // Constraint 7
+ if (!_components.empty())
+ {
+ const auto root_dst_tensors = get_root_component()->tensors().get_const_dst_tensors();
+ ARM_COMPUTE_ERROR_ON(root_dst_tensors.empty());
+ const auto first_dst_tensor = root_dst_tensors[0];
+ const auto dst_tensors = component->tensors().get_const_dst_tensors();
+ for (const auto &t : root_dst_tensors)
+ {
+ if (detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0))
+ {
+ return false;
+ }
+ }
+ for (const auto &t : dst_tensors)
+ {
+ if (detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0))
+ {
+ return false;
+ }
+ }
+ }
+ // Constraint 8
+ if (!_components.empty())
+ {
+ const auto root_dst_tensors = get_root_component()->tensors().get_const_dst_tensors();
+ ARM_COMPUTE_ERROR_ON(root_dst_tensors.empty());
+ const auto first_dst_tensor_layout = root_dst_tensors[0]->data_layout();
+ const auto dst_tensors = component->tensors().get_const_dst_tensors();
+ for (const auto &t : root_dst_tensors)
+ {
+ if (t->data_layout() != first_dst_tensor_layout)
+ {
+ return false;
+ }
+ }
+ for (const auto &t : dst_tensors)
+ {
+ if (t->data_layout() != first_dst_tensor_layout)
+ {
+ return false;
+ }
+ }
+ }
+ // Constraint 9
+ if (component->tensors().get_const_dst_tensors().size() >= max_dst_tensors)
+ {
+ return false;
+ }
+ // Constraint 9 corollary
+ if (component->type() == GpuComponentType::Output && _components.size() >= max_fused_components + max_dst_tensors)
+ {
+ return false;
+ }
+ _components.push_back(component);
+ return true;
+}
+
+void GpuKernelComponentGroup::finalize()
+{
+ if (_finalized)
+ {
+ return;
+ }
+
+ _finalized = true;
+
+ std::set<const ITensorInfo *> output_tensors;
+ std::map<const ITensorInfo *, std::vector<const ITensorInfo *>> possible_tile_map;
+ std::map<const ITensorInfo *, int32_t> tile_usages;
+
+ for (auto component : _components)
+ {
+ const auto tensors = component->tensors();
+ const auto src_tensors = tensors.get_const_src_tensors();
+ const auto dst_tensors = tensors.get_const_dst_tensors();
+
+ // Detect input, output and intermediate tensors.
+ for (auto tensor : src_tensors)
+ {
+ const auto output_tensors_it = output_tensors.find(tensor);
+
+ if (output_tensors_it != output_tensors.end())
+ {
+ // This tensor is the output of another operator.
+ // It must be marked as intermediate tensor.
+ output_tensors.erase(output_tensors_it);
+ _interm_tensors.insert(tensor);
+ }
+ else if (_interm_tensors.find(tensor) == _interm_tensors.end())
+ {
+ _input_tensors.insert(tensor);
+
+ tile_usages[tensor] = 0;
+ possible_tile_map.emplace(tensor, std::vector<const ITensorInfo *>());
+ }
+ }
+
+ for (auto tensor : dst_tensors)
+ {
+ ARM_COMPUTE_ERROR_ON(_input_tensors.find(tensor) != _input_tensors.end());
+ ARM_COMPUTE_ERROR_ON(output_tensors.find(tensor) != output_tensors.end());
+ ARM_COMPUTE_ERROR_ON(_interm_tensors.find(tensor) != _interm_tensors.end());
+ output_tensors.insert(tensor);
+
+ tile_usages[tensor] = 0;
+ possible_tile_map.emplace(tensor, std::vector<const ITensorInfo *>());
+ }
+
+ // Check if the output can overwrite the input tile.
+ const auto component_type = component->type();
+ if (component_type == GpuComponentType::Simple || component_type == GpuComponentType::Output)
+ {
+ ARM_COMPUTE_ERROR_ON(dst_tensors.size() != 1);
+
+ const auto dst_tensor = dst_tensors[0];
+ const auto &dst_shape = dst_tensor->tensor_shape();
+ const auto &dst_type = dst_tensor->data_type();
+
+ tile_usages[dst_tensor] = 0;
+
+ for (auto src_tensor : src_tensors)
+ {
+ const auto &src_shape = src_tensor->tensor_shape();
+ const auto &src_type = src_tensor->data_type();
+
+ if (src_shape == dst_shape && src_type == dst_type)
+ {
+ const auto tile_usages_it = tile_usages.find(src_tensor);
+ ARM_COMPUTE_ERROR_ON(tile_usages_it == tile_usages.end());
+
+ if (component_type == GpuComponentType::Simple || tile_usages_it->second > 0)
+ {
+ // Increase the number of tile usages unless this component is an output
+ // and the tile has not been shared with any component.
+ // (Reason: output component doesn't change the content of the tile)
+ ++tile_usages_it->second;
+ }
+
+ possible_tile_map[dst_tensor].push_back(src_tensor);
+ }
+ }
+ }
+ else
+ {
+ // Outputs of complex and unfusable components need dedicated tile.
+ for (auto tensor : dst_tensors)
+ {
+ tile_usages[tensor] = 0;
+ }
+ }
+ }
+
+ // Find the smallest list of tiles that the intermediate tensors need to write to.
+ for (auto tensor : _input_tensors)
+ {
+ _tile_map[tensor] = tensor;
+ }
+
+ for (auto component : _components)
+ {
+ const auto dst_tensors = component->tensors().get_const_dst_tensors();
+
+ for (auto tensor : dst_tensors)
+ {
+ const auto target_tiles = possible_tile_map.at(tensor);
+ _tile_map[tensor] = tensor;
+
+ for (auto target : target_tiles)
+ {
+ const auto num_usage = tile_usages[target];
+
+ if (num_usage <= 1)
+ {
+ // The target tile is consumed by only this operator, so we can reuse it
+ // for the destination tensor data.
+ _tile_map[tensor] = _tile_map.at(target);
+ break;
+ }
+ }
+ }
+ }
+
+ for (auto tensor : output_tensors)
+ {
+ _tile_map[tensor] = tensor;
+ }
+
+ // All intermediate tensors that cannot be shared with any previous tensor
+ // will need to be declared as tile variables.
+ for (auto tensor_tile : _tile_map)
+ {
+ if (tensor_tile.first == tensor_tile.second && _interm_tensors.find(tensor_tile.first) != _interm_tensors.end())
+ {
+ _tiles.push_back(tensor_tile.first);
+ }
+ }
+
+ std::set_union(_input_tensors.begin(), _input_tensors.end(), output_tensors.begin(), output_tensors.end(),
+ std::back_inserter(_argument_tensors));
+ _any_output_tensor = *output_tensors.begin();
+}
+
+std::vector<const ITensorInfo *> GpuKernelComponentGroup::get_tiles() const
+{
+ ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized.");
+ return _tiles;
+}
+
+const ITensorInfo *GpuKernelComponentGroup::get_tile_for_tensor(const ITensorInfo *tensor) const
+{
+ ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized.");
+
+ if (_tile_map.find(tensor) != _tile_map.end())
+ {
+ return _tile_map.at(tensor);
+ }
+
+ return tensor;
+}
+
+const ITensorInfo *GpuKernelComponentGroup::get_any_dst_tensor() const
+{
+ ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized.");
+ return _any_output_tensor;
+}
+
+std::vector<const ITensorInfo *> GpuKernelComponentGroup::get_argument_tensors() const
+{
+ ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized.");
+ return _argument_tensors;
+}
+
+GpuKernelComponentGroup::ComponentPtr GpuKernelComponentGroup::get_root_component() const
+{
+ if (empty())
+ {
+ return nullptr;
+ }
+ return _components[0];
+}
+
+bool GpuKernelComponentGroup::is_intermediate_tensor(const ITensorInfo *tensor) const
+{
+ ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized.");
+ return _interm_tensors.find(tensor) != _interm_tensors.end();
+}
+
+bool GpuKernelComponentGroup::is_input_tensor(const ITensorInfo *tensor) const
+{
+ ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized.");
+ return _input_tensors.find(tensor) != _input_tensors.end();
+}
+
+size_t GpuKernelComponentGroup::size() const
+{
+ return _components.size();
+}
+bool GpuKernelComponentGroup::empty() const
+{
+ return _components.empty();
+}
+GpuKernelComponentGroup::ComponentPtr &GpuKernelComponentGroup::operator[](size_t index)
+{
+ return _components[index];
+}
+const GpuKernelComponentGroup::ComponentPtr &GpuKernelComponentGroup::operator[](size_t index) const
+{
+ return _components[index];
+}
+typename std::vector<GpuKernelComponentGroup::ComponentPtr>::iterator GpuKernelComponentGroup::begin()
+{
+ return _components.begin();
+}
+typename std::vector<GpuKernelComponentGroup::ComponentPtr>::iterator GpuKernelComponentGroup::end()
+{
+ return _components.end();
+}
+typename std::vector<GpuKernelComponentGroup::ComponentPtr>::const_iterator GpuKernelComponentGroup::begin() const
+{
+ return _components.cbegin();
+}
+typename std::vector<GpuKernelComponentGroup::ComponentPtr>::const_iterator GpuKernelComponentGroup::end() const
+{
+ return _components.cend();
+}
+typename std::vector<GpuKernelComponentGroup::ComponentPtr>::const_iterator GpuKernelComponentGroup::cbegin() const
+{
+ return _components.cbegin();
+}
+typename std::vector<GpuKernelComponentGroup::ComponentPtr>::const_iterator GpuKernelComponentGroup::cend() const
+{
+ return _components.cend();
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h
new file mode 100644
index 0000000000..6ad71abb39
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h
@@ -0,0 +1,156 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGROUP
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGROUP
+
+#include "components/Types.h"
+#include <cstdint>
+#include <cstdlib>
+#include <map>
+#include <set>
+#include <vector>
+
+namespace arm_compute
+{
+/** Forward declaration */
+class ITensorInfo;
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class IGpuKernelComponent;
+/** A group of gpu kernel components to be fused together
+ * PRECONDITIONS:
+ * 1. Fusion is limited to a linear sequence of kernel components
+ * INVARIANTS:
+ * @note These preconditions and invariants are exactly the same as fusion constraints for kernel components
+ * 2. Max number of components that can be fused is @ref GpuKernelComponentGroup::max_fused_components (
+ * excluding any output or input (if any) components.
+ * The max number of output components is bounded by the maximum number of dst tensors allowed for a component / component group
+ * )
+ * 3. The fusion is subject to the pattern: (Complex + Simple * | Simple + Simple * | Un-fusable) + Output?
+ * 4. All components but unfusable ones have exactly 1 dst tensor
+ * 5. All fused components share the same @ref IGpuKernelComponent::Properties ( @ref UnitWorkloadStage etc. )
+ * 6. All fused components share the same tunable parameters like tile size
+ * 7. All fused components share the same dst tensor shape
+ * 8. All fused components' tensors share the same @ref DataLayout
+ * 9. Maximum number of dst tensors allowed for a component (including unfusable) / component group is @ref GpuKernelComponentGroup::max_dst_tensors
+ * This has an impact on the total number of components supported, which is max_fused_components + max_dst_tensors
+ */
+class GpuKernelComponentGroup
+{
+public:
+ using ComponentPtr = IGpuKernelComponent *;
+ /** Maximum number of components that can be fused into the same component group
+ */
+ static constexpr size_t max_fused_components = 64;
+ /** Maximum number of dst tensors allowed for a component / component group
+ */
+ static constexpr size_t max_dst_tensors = 8;
+
+public:
+ /** Default constructor */
+ GpuKernelComponentGroup() = default;
+ /** Allow instances of this class to be copy constructed */
+ GpuKernelComponentGroup(const GpuKernelComponentGroup &) = default;
+ /** Allow instances of this class to be copied */
+ GpuKernelComponentGroup &operator=(const GpuKernelComponentGroup &) = default;
+ /** Allow instances of this class to be move constructed */
+ GpuKernelComponentGroup(GpuKernelComponentGroup &&) = default;
+ /** Allow instances of this class to be moved */
+ GpuKernelComponentGroup &operator=(GpuKernelComponentGroup &&) = default;
+ /** Add a component pointer into the group
+ * If the operation fails, then no change is made to the group
+ *
+ * @param[in] component Pointer to the component to be added
+ *
+ * @return true If the operation is successful
+ * @return false If the operation fails
+ */
+ bool add_component(ComponentPtr component);
+ /** Optimize and pre-compute information about the component group */
+ void finalize();
+ /** Get one of the destination tensors of this group */
+ const ITensorInfo *get_any_dst_tensor() const;
+ /** Get the tensor arguments of this group
+ * A tensor is an argument if it is a source or destination tensor of the group
+ */
+ std::vector<const ITensorInfo *> get_argument_tensors() const;
+ /** Get the root (first) component of this group */
+ ComponentPtr get_root_component() const;
+ /** Check if a @ref ITensorInfo is an "intermediate" tensor of the group
+ *
+ * An intermediate tensor is any tensor that is not an argument.
+ *
+ * @param[in] tensor @ref ITensorInfo to be looked up
+ *
+ * @return true If @p tensor is an intermediate tensor
+ * @return false Otherwise
+ */
+ bool is_intermediate_tensor(const ITensorInfo *tensor) const;
+ /** Check if an @ref ITensorInfo is an input tensor of the group.
+ *
+ * @param[in] tensor @ref ITensorInfo to be looked up.
+ *
+ * @return true if @p tensor is an input tensor of the group, otherwise false.
+ */
+ bool is_input_tensor(const ITensorInfo *tensor) const;
+ /** Get the list of temporary tiles that need to be declared */
+ std::vector<const ITensorInfo *> get_tiles() const;
+ /** Get the shared tile that can be used to store temporary data of the specified tensor.
+ *
+ * @param[in] tensor @ref ITensorInfo to be looked up.
+ *
+ * @return @ref ITensorInfo that is used to store temporary data of @p tensor.
+ **/
+ const ITensorInfo *get_tile_for_tensor(const ITensorInfo *tensor) const;
+ /** Get the number of components within the group */
+ size_t size() const;
+ /** Check if the component group is empty */
+ bool empty() const;
+ ComponentPtr &operator[](size_t index);
+ const ComponentPtr &operator[](size_t index) const;
+ typename std::vector<ComponentPtr>::iterator begin();
+ typename std::vector<ComponentPtr>::iterator end();
+ typename std::vector<ComponentPtr>::const_iterator begin() const;
+ typename std::vector<ComponentPtr>::const_iterator end() const;
+ typename std::vector<ComponentPtr>::const_iterator cbegin() const;
+ typename std::vector<ComponentPtr>::const_iterator cend() const;
+
+private:
+ std::vector<ComponentPtr> _components{};
+
+ bool _finalized{false};
+
+ std::vector<const ITensorInfo *> _argument_tensors{};
+ std::set<const ITensorInfo *> _input_tensors{};
+ std::set<const ITensorInfo *> _interm_tensors{};
+ const ITensorInfo *_any_output_tensor{nullptr};
+ std::vector<const ITensorInfo *> _tiles{};
+ std::map<const ITensorInfo *, const ITensorInfo *> _tile_map{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGROUP */
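The numbered constraints in the class comment are checked one by one inside add_component(), which is why callers test its return value before finalizing, as GpuKernelComponentGraph::fuse() does. A small usage sketch, with the component pointers assumed to come from an already-built component graph:

    GpuKernelComponentGroup group{};
    if (group.add_component(root_component) && group.add_component(activation_component))
    {
        group.finalize(); // precompute argument/intermediate tensor sets and tile reuse
        const std::vector<const ITensorInfo *> kernel_args = group.get_argument_tensors();
        const std::vector<const ITensorInfo *> tiles       = group.get_tiles();
    }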
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp
new file mode 100644
index 0000000000..8042e3dd08
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "GpuKernelComponentStream.h"
+
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+GpuKernelComponentStream::GpuKernelComponentStream(GpuWorkloadContext *context,
+ GpuComponentServices *services,
+ const MemoryDescriptorMap &mem_map)
+ : _context{context}, _services{services}, _component_groups{}, _mem_map{mem_map}
+{
+}
+
+GpuWorkloadSourceCode GpuKernelComponentStream::write_workload_code()
+{
+ GpuWorkloadSourceCode source_code;
+ // Traverse through component groups and assemble workload together
+ for (auto &&group : _component_groups)
+ {
+ group.finalize();
+
+ // Write kernel code
+ GpuLogicalKernel logical_kernel(_services, group);
+ const GpuKernelSourceCode kernel_code = logical_kernel.write_kernel_code();
+ // The whole unit workload stage is determined by the root component
+ const auto unit_workload_stage = group.get_root_component()->properties().stage();
+ source_code.add_unit_workload(kernel_code, unit_workload_stage, _mem_map, _context);
+ }
+ return source_code;
+}
+
+void GpuKernelComponentStream::new_component_group()
+{
+ _component_groups.emplace_back();
+}
+
+bool GpuKernelComponentStream::add_component(IGpuKernelComponent *component)
+{
+ ARM_COMPUTE_ERROR_ON(_component_groups.empty());
+ return _component_groups.back().add_component(component);
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h
new file mode 100644
index 0000000000..ef8a8a15b0
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTSTREAM
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTSTREAM
+
+#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h"
+
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class GpuComponentServices;
+class IGpuKernelComponent;
+
+/** A linear sequence of component groups serialized from the @ref GpuKernelComponentGraph
+ * Each component group in the stream denotes a complete kernel that may consist of multiple components
+ *
+ * The main purposes of this class are:
+ * - Facilitate component fusion algorithm by allowing insertions of new component groups into the stream
+ * - Invoke kernel writer and assemble the final @ref GpuWorkloadSourceCode
+ */
+class GpuKernelComponentStream
+{
+public:
+ /** Constructor
+ *
+ * @param[in] context @ref GpuWorkloadContext to be used throughout the stream
+ * @param[in] services @ref GpuComponentServices to be used throughout the stream
+ * @param[in] mem_map @ref MemoryDescriptor map used to assemble the @ref GpuWorkloadSourceCode
+ */
+ GpuKernelComponentStream(GpuWorkloadContext *context,
+ GpuComponentServices *services,
+ const MemoryDescriptorMap &mem_map);
+ /** Allow instances of this class to be copy constructed */
+ GpuKernelComponentStream(const GpuKernelComponentStream &stream) = default;
+ /** Allow instances of this class to be copied */
+ GpuKernelComponentStream &operator=(const GpuKernelComponentStream &stream) = default;
+ /** Allow instances of this class to be move constructed */
+ GpuKernelComponentStream(GpuKernelComponentStream &&stream) = default;
+ /** Allow instances of this class to be moved */
+ GpuKernelComponentStream &operator=(GpuKernelComponentStream &&stream) = default;
+ /** Generate and assemble @ref GpuWorkloadSourceCode from the stream */
+ GpuWorkloadSourceCode write_workload_code();
+ /** Insert a new component group in the stream.
+ * Subsequent components are added to this group until the end of the stream or until new_component_group() is called again
+ */
+ void new_component_group();
+ /** Add a component to the previously created component group
+ * Throw an error if no component group is present in the stream
+ *
+ * @param[in] component Component to be inserted
+ *
+ * @return true If the operation is successful
+ * @return false Otherwise
+ */
+ bool add_component(IGpuKernelComponent *component);
+
+private:
+ GpuWorkloadContext *_context;
+ GpuComponentServices *_services;
+ std::vector<GpuKernelComponentGroup> _component_groups{};
+ MemoryDescriptorMap _mem_map{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTSTREAM */
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h b/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h
new file mode 100644
index 0000000000..11d916eec9
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELSOURCECODE_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELSOURCECODE_H
+
+#include "arm_compute/core/CL/CLCompileContext.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+
+#include <deque>
+#include <string>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** The argument list of a @ref GpuKernelSourceCode */
+using GpuKernelArgumentList = std::deque<GpuKernelArgumentBinding>;
+
+/** Container of kernel code to be compiled and run in a @ref GpuUnitWorkload
+ */
+class GpuKernelSourceCode
+{
+public:
+ /** Set kernel name */
+ GpuKernelSourceCode &name(const std::string &n)
+ {
+ _name = n;
+ return *this;
+ }
+ /** Set kernel code */
+ GpuKernelSourceCode &code(const std::string &c)
+ {
+ _code = c;
+ return *this;
+ }
+ /** Set kernel config id string */
+ GpuKernelSourceCode &config_id(const std::string &c_id)
+ {
+ _config_id = c_id;
+ return *this;
+ }
+ /** Set kernel build options */
+ GpuKernelSourceCode &build_options(const CLBuildOptions &b_options)
+ {
+ _build_options = b_options;
+ return *this;
+ }
+ /** Set kernel execution window */
+ GpuKernelSourceCode &window(const Window &window)
+ {
+ _window = window;
+ return *this;
+ }
+ /** Set kernel argument list */
+ GpuKernelSourceCode &arguments(const GpuKernelArgumentList &arguments)
+ {
+ _arguments = arguments;
+ return *this;
+ }
+ /** Get kernel name */
+ std::string name() const
+ {
+ return _name;
+ }
+ /** Get kernel code */
+ std::string code() const
+ {
+ return _code;
+ }
+ /** Get kernel config id string */
+ std::string config_id() const
+ {
+ return _config_id;
+ }
+ /** Get kernel build options */
+ const CLBuildOptions &build_options() const
+ {
+ return _build_options;
+ }
+ /** Get kernel execution window */
+ const Window &window() const
+ {
+ return _window;
+ }
+ /** Get kernel argument list */
+ const GpuKernelArgumentList &arguments() const
+ {
+ return _arguments;
+ }
+
+private:
+ std::string _name{};
+ std::string _code{};
+ std::string _config_id{};
+ CLBuildOptions _build_options{};
+ Window _window{};
+ GpuKernelArgumentList _arguments{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELSOURCECODE_H
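Since every setter returns *this, a kernel writer can populate a GpuKernelSourceCode in a single chained expression. A sketch with placeholder values; in practice the values are produced by a kernel writer, as GpuLogicalKernel::write_kernel_code() in the next file shows:

    GpuKernelSourceCode code{};
    code.name("example_fused_kernel")            // placeholder kernel name
        .code("/* generated OpenCL C source */") // placeholder source string
        .config_id("example_config_id")
        .build_options(CLBuildOptions{})
        .window(Window{})
        .arguments(GpuKernelArgumentList{});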
diff --git a/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp b/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp
new file mode 100644
index 0000000000..725a46e91c
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "GpuLogicalKernel.h"
+
+#include "arm_compute/core/experimental/Types.h"
+
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h"
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuComponentServices.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+GpuLogicalKernel::GpuLogicalKernel(GpuComponentServices *services, GpuKernelComponentGroup components) // NOLINT
+ : _comp_group{std::move(components)}, _store_components{}
+{
+ ARM_COMPUTE_UNUSED(services);
+}
+
+GpuKernelSourceCode GpuLogicalKernel::write_kernel_code()
+{
+ GpuKernelSourceCode code;
+ GpuCkwDriver writer{_comp_group};
+
+ code.name(writer.get_name());
+ code.code(writer.get_code());
+ code.arguments(writer.get_kernel_arguments());
+ code.build_options(writer.get_build_options());
+ code.config_id(writer.get_config_id());
+ code.window(writer.get_window());
+
+ return code;
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.h b/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.h
new file mode 100644
index 0000000000..e2bc83b286
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2022, 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPULOGICALKERNEL_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPULOGICALKERNEL_H
+
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h"
+
+#include <memory>
+#include <vector>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declarations */
+class GpuComponentServices;
+class IGpuKernelComponent;
+
+/** A wrapper-processor of a @ref GpuKernelComponentGroup
+ * It adds the load (if any) and store components to the component group
+ * The @ref GpuLogicalKernel represents a complete kernel, and can proceed to invoke any kernel writer to generate the full kernel code
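+ *
+ * Illustrative usage (sketch only; assumes a prepared @ref GpuComponentServices and @ref GpuKernelComponentGroup):
+ * @code
+ * GpuLogicalKernel logical_kernel(&services, component_group);
+ * GpuKernelSourceCode kernel_code = logical_kernel.write_kernel_code();
+ * @endcode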
+ */
+class GpuLogicalKernel
+{
+public:
+ /** Constructor
+ *
+ * @param[in] services @ref GpuComponentServices to be used
+ * @param[in] components Component group from which this logical kernel is initialized
+ */
+ explicit GpuLogicalKernel(GpuComponentServices *services, GpuKernelComponentGroup components); // NOLINT
+ /** Allow instances of this class to be copy constructed */
+ GpuLogicalKernel(const GpuLogicalKernel &) = default;
+ /** Allow instances of this class to be copied */
+ GpuLogicalKernel &operator=(const GpuLogicalKernel &) = default;
+ /** Allow instances of this class to be move constructed */
+ GpuLogicalKernel(GpuLogicalKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ GpuLogicalKernel &operator=(GpuLogicalKernel &&) = default;
+ /** Generate a @ref GpuKernelSourceCode */
+ GpuKernelSourceCode write_kernel_code();
+
+private:
+ GpuKernelComponentGroup _comp_group{};
+ std::vector<std::unique_ptr<IGpuKernelComponent>> _store_components{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPULOGICALKERNEL_H
diff --git a/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp
new file mode 100644
index 0000000000..aec8b9db4f
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h"
+
+#include "arm_compute/core/Validate.h"
+
+#include <algorithm>
+#include <iterator>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+std::vector<DependencyGraph::TensorId> get_tensor_ids(const std::vector<const ITensorInfo *> &tensors)
+{
+ std::vector<DependencyGraph::TensorId> tensor_ids{};
+ std::transform(std::begin(tensors), std::end(tensors), std::back_inserter(tensor_ids),
+ [](const auto &t) { return t->id(); });
+ return tensor_ids;
+}
+
+} // namespace
+
+Operator::Operator(OperatorId id, GpuOperatorType operator_type, const ArgumentPack<ITensorInfo> &tensors)
+ : _id{id}, _operator_type{operator_type}, _tensors{tensors}
+{
+}
+
+OperatorId Operator::id() const
+{
+ return _id;
+}
+
+GpuOperatorType Operator::operator_type() const
+{
+ return _operator_type;
+}
+
+ArgumentPack<ITensorInfo> Operator::tensors() const
+{
+ return _tensors;
+}
+
+bool GpuOperatorGroup::try_add_operator(const Operator &op, bool is_output) const
+{
+ const auto src_tensor_ids = get_tensor_ids(op.tensors().get_const_src_tensors());
+ const auto dst_tensor_ids = get_tensor_ids(op.tensors().get_const_dst_tensors());
+    // Constraint 1: the fused operators must form a linear sequence in the dependency graph
+ if (!_graph.try_add_operator_as_linear(op.id(), src_tensor_ids, dst_tensor_ids, is_output))
+ {
+ return false;
+ }
+    // Constraint 2: the number of fused operators must not exceed max_fused_operators
+ if (_operators.size() >= max_fused_operators)
+ {
+ return false;
+ }
+    // Constraint 3.1: an Unfusable root operator cannot be fused with any other operator
+ if (_operators.size() > 0 && get_root_operator()->operator_type() == GpuOperatorType::Unfusable)
+ {
+ return false;
+ }
+    // Constraint 3.2: only Simple operators can be fused onto an existing root (pattern: Complex + Simple* | Simple + Simple*)
+ if (_operators.size() > 0 && (op.operator_type() != GpuOperatorType::Simple))
+ {
+ return false;
+ }
+    // Constraint 4: all operators except Unfusable ones must have exactly 1 dst tensor
+ if (op.operator_type() != GpuOperatorType::Unfusable && op.tensors().get_const_dst_tensors().size() != 1U)
+ {
+ return false;
+ }
+    // Constraint 5: all fused operators must share the same dst tensor shape
+ if (_operators.size() > 0)
+ {
+ const auto root_dst_tensors = get_root_operator()->tensors().get_const_dst_tensors();
+ ARM_COMPUTE_ERROR_ON(root_dst_tensors.empty());
+ const auto first_dst_tensor = root_dst_tensors[0];
+ const auto dst_tensors = op.tensors().get_const_dst_tensors();
+ for (const auto &t : root_dst_tensors)
+ {
+ if (detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0))
+ {
+ return false;
+ }
+ }
+ for (const auto &t : dst_tensors)
+ {
+ if (detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0))
+ {
+ return false;
+ }
+ }
+ }
+    // Constraint 6: all fused operators' dst tensors must share the same data layout
+ if (_operators.size() > 0)
+ {
+ const auto root_dst_tensors = get_root_operator()->tensors().get_const_dst_tensors();
+ ARM_COMPUTE_ERROR_ON(root_dst_tensors.empty());
+ const auto first_dst_tensor_layout = root_dst_tensors[0]->data_layout();
+ const auto dst_tensors = op.tensors().get_const_dst_tensors();
+ for (const auto &t : root_dst_tensors)
+ {
+ if (t->data_layout() != first_dst_tensor_layout)
+ {
+ return false;
+ }
+ }
+ for (const auto &t : dst_tensors)
+ {
+ if (t->data_layout() != first_dst_tensor_layout)
+ {
+ return false;
+ }
+ }
+ }
+ return true;
+}
+void GpuOperatorGroup::add_operator(const Operator &op, bool is_output)
+{
+ ARM_COMPUTE_ERROR_ON(!try_add_operator(op, is_output));
+ const auto src_tensor_ids = get_tensor_ids(op.tensors().get_const_src_tensors());
+ const auto dst_tensor_ids = get_tensor_ids(op.tensors().get_const_dst_tensors());
+ _graph.add_operator_as_linear(op.id(), src_tensor_ids, dst_tensor_ids, is_output);
+ _operators[op.id()] = op;
+}
+Operator GpuOperatorGroup::new_operator(const GpuOperatorType &operator_type,
+ const ArgumentPack<ITensorInfo> &tensors) const
+{
+ auto new_id = static_cast<OperatorId>(_operators.size());
+ return Operator{new_id, operator_type, tensors};
+}
+const Operator *GpuOperatorGroup::get_root_operator() const
+{
+ const auto roots = _graph.get_root_ops();
+ ARM_COMPUTE_ERROR_ON(roots.size() > 1);
+ if (roots.empty())
+ {
+ return nullptr;
+ }
+ return &_operators.at(roots[0]);
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h
new file mode 100644
index 0000000000..0a2369d357
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUOPERATORGROUP
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUOPERATORGROUP
+
+#include "arm_compute/core/ITensorInfo.h"
+
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuOperatorProperties.h"
+#include "src/dynamic_fusion/sketch/utils/DependencyGraph.h"
+
+#include <map>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+using OperatorId = DependencyGraph::OperatorId;
+
+/** An operator for the sole purpose of validating fusion
+ */
+class Operator
+{
+public:
+ /** Default constructor */
+ Operator() = default;
+ /** Get Operator Id */
+ OperatorId id() const;
+ /** Get operator type */
+ GpuOperatorType operator_type() const;
+ /** Get tensor arguments */
+ ArgumentPack<ITensorInfo> tensors() const;
+ friend class GpuOperatorGroup;
+
+private:
+ Operator(OperatorId id, GpuOperatorType operator_type, const ArgumentPack<ITensorInfo> &tensors);
+ OperatorId _id{};
+ GpuOperatorType _operator_type{};
+ ArgumentPack<ITensorInfo> _tensors{};
+};
+
+/** A linear sequence of operators to be fused in a workload
+ * For the time being, this class is only used for validating operator fusion
+ * INVARIANTS:
+ * @note These invariants are exactly the same as operator fusion constraints
+ * 1. Fusion is limited to a linear sequence of operators
+ * 2. Max number of operators that can be fused is @ref GpuOperatorGroup::max_fused_operators
+ * 3. The fusion is subject to the pattern: Complex + Simple* | Simple + Simple* | Unfusable
+ * 4. All operators, except Unfusable ones, have exactly 1 dst tensor
+ * 5. All fused operators share the same dst tensor shape
+ * 6. All fused operators' tensors share the same @ref DataLayout
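+ *
+ * Illustrative usage (sketch only; the @ref ArgumentPack objects conv_tensors and act_tensors are assumed to be prepared by the caller):
+ * @code
+ * GpuOperatorGroup group{};
+ * const auto conv = group.new_operator(GpuOperatorType::Complex, conv_tensors);
+ * if (group.try_add_operator(conv))
+ * {
+ *     group.add_operator(conv);
+ * }
+ * const auto act = group.new_operator(GpuOperatorType::Simple, act_tensors);
+ * if (group.try_add_operator(act, true))
+ * {
+ *     group.add_operator(act, true);
+ * }
+ * @endcode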
+ */
+class GpuOperatorGroup
+{
+public:
+ static constexpr size_t max_fused_operators = 32;
+    /** Check whether an operator can be added to the group, without actually adding it
+ *
+ * @param[in] op Operator to be added
+ * @param[in] is_output Whether this operator is the output operator.
+ *
+ * @return true If @p op can be added while maintaining the invariants
+ * @return false Otherwise
+ */
+ bool try_add_operator(const Operator &op, bool is_output = false) const;
+ /** Add an operator to the group
+ *
+ * @param[in] op Operator to be added
+ * @param[in] is_output Whether this operator is the output operator.
+ */
+ void add_operator(const Operator &op, bool is_output = false);
+ /** Create a new operator
+ *
+ * @param[in] operator_type @ref GpuOperatorType of the new operator
+ * @param[in] tensors Tensor arguments to the new operator
+ *
+ * @return Operator
+ */
+ Operator new_operator(const GpuOperatorType &operator_type, const ArgumentPack<ITensorInfo> &tensors) const;
+ /** Get the "root operator" of the group, which is the first operator in a linear sequence
+ * @return const Operator* Pointer to the root operator
+ */
+ const Operator *get_root_operator() const;
+
+private:
+ DependencyGraph _graph{};
+ std::map<OperatorId, Operator> _operators{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUOPERATORGROUP */
diff --git a/src/dynamic_fusion/sketch/gpu/GpuOperatorProperties.h b/src/dynamic_fusion/sketch/gpu/GpuOperatorProperties.h
new file mode 100644
index 0000000000..c77697c343
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuOperatorProperties.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUOPERATORPROPERTIES
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUOPERATORPROPERTIES
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Contain properties common to all operator types */
+
+/** Operator type in the context of fusion
+ */
+enum class GpuOperatorType
+{
+ /** Simple operators are operators that:
+ * 1. Have a 1-to-1 mapping between the input elements and output elements, like elementwise
+ * 2. Have exactly 1 output
+ */
+ Simple,
+ /** Complex operators are operators that are not simple but are still fusable with simple ones
+ */
+ Complex,
+ /** Unfusable operators are operators that cannot be fused with any other types of operators
+ */
+ Unfusable
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUOPERATORPROPERTIES */
diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp b/src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp
new file mode 100644
index 0000000000..fab18aabb4
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h"
+
+#include "arm_compute/core/CL/CLCompileContext.h"
+
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+GpuWorkloadContext::GpuWorkloadContext(CLCompileContext *cl_compile_ctx)
+ : _impl{std::make_unique<Impl>(GpuLanguage::OpenCL, cl_compile_ctx)}
+{
+}
+
+GpuWorkloadContext::~GpuWorkloadContext() = default;
+
+GpuWorkloadContext::GpuWorkloadContext(GpuWorkloadContext &&other) = default;
+
+GpuWorkloadContext &GpuWorkloadContext::operator=(GpuWorkloadContext &&other) = default;
+
+GpuTarget GpuWorkloadContext::gpu_target() const
+{
+ return _impl->cl_compile_context()->get_gpu_target();
+}
+
+GpuLanguage GpuWorkloadContext::gpu_language() const
+{
+ return _impl->gpu_language();
+}
+
+const CLCompileContext *GpuWorkloadContext::cl_compile_context() const
+{
+ return _impl->cl_compile_context();
+}
+
+void GpuWorkloadContext::register_user_tensor(std::unique_ptr<TensorInfo> &&tensor_info)
+{
+ _impl->register_user_tensor(std::move(tensor_info));
+}
+
+GpuWorkloadContext::Impl &GpuWorkloadContext::implementation()
+{
+ return *_impl;
+}
+
+const GpuWorkloadContext::Impl &GpuWorkloadContext::implementation() const
+{
+ return *_impl;
+}
+
+GpuWorkloadContext::Impl::Impl(GpuLanguage gpu_language, CLCompileContext *cl_compile_ctx)
+ : _gpu_language(gpu_language),
+ _cl_compile_ctx(cl_compile_ctx),
+ _next_tensor_id(1),
+ _mem_map(),
+ _managed_tensor_info()
+{
+}
+
+GpuLanguage GpuWorkloadContext::Impl::gpu_language() const
+{
+ return _gpu_language;
+}
+
+const CLCompileContext *GpuWorkloadContext::Impl::cl_compile_context() const
+{
+ return _cl_compile_ctx;
+}
+
+const MemoryDescriptorMap &GpuWorkloadContext::Impl::mem_map() const
+{
+ return _mem_map;
+}
+
+void GpuWorkloadContext::Impl::register_user_tensor(std::unique_ptr<TensorInfo> &&tensor_info)
+{
+ ARM_COMPUTE_ERROR_ON(tensor_info->has_valid_id());
+
+ const auto tensor_id = next_tensor_id();
+
+ tensor_info->set_id(tensor_id);
+ _mem_map[tensor_id] = MemoryDescriptor{MemoryType::User};
+    // Store the tensor info in the workload context for future reference
+    // Note that the context takes ownership of @p tensor_info, as documented on register_user_tensor()
+ _managed_tensor_info.emplace(tensor_info->id(), std::move(tensor_info));
+}
+
+ITensorInfo *GpuWorkloadContext::Impl::create_virtual_tensor()
+{
+ auto tensor_info = std::make_unique<TensorInfo>();
+ const auto tensor_id = -next_tensor_id();
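+    // Virtual tensors are assigned negative ids so they can never collide with user/auxiliary tensors, which use positive ids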
+ tensor_info->set_id(tensor_id);
+ _mem_map[tensor_id] = MemoryDescriptor{MemoryType::Virtual};
+ auto inserted = _managed_tensor_info.emplace(tensor_info->id(), std::move(tensor_info));
+ return inserted.first->second.get();
+}
+
+ITensorInfo *GpuWorkloadContext::Impl::create_auxiliary_tensor(const ITensorInfo &itensor_info)
+{
+ auto tensor_info = std::make_unique<TensorInfo>(itensor_info);
+ const auto tensor_id = next_tensor_id();
+ tensor_info->set_id(tensor_id);
+ _mem_map[tensor_id] = MemoryDescriptor{MemoryType::Auxiliary, AuxMemoryInfo{tensor_info->total_size()}};
+ auto inserted = _managed_tensor_info.emplace(tensor_info->id(), std::move(tensor_info));
+ return inserted.first->second.get();
+}
+
+ITensorInfo *GpuWorkloadContext::Impl::get_tensor_info(ITensorInfo::Id id)
+{
+ return _managed_tensor_info.at(id).get();
+}
+
+const ITensorInfo *GpuWorkloadContext::Impl::get_tensor_info(ITensorInfo::Id id) const
+{
+ return _managed_tensor_info.at(id).get();
+}
+
+ITensorInfo::Id GpuWorkloadContext::Impl::next_tensor_id()
+{
+ return _next_tensor_id++;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h b/src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h
new file mode 100644
index 0000000000..b3571a6480
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADCONTEXTIMPL_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADCONTEXTIMPL_H
+
+#include "arm_compute/core/CL/CLCompileContext.h"
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h"
+#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Internal implementation of workload context. */
+class GpuWorkloadContext::Impl
+{
+public:
+ /** Constructor
+ *
+ * @param[in] gpu_language Target GPU language.
+ * @param[in] cl_compile_ctx CL compile context.
+ */
+ Impl(GpuLanguage gpu_language, CLCompileContext *cl_compile_ctx);
+
+ /** Copy constructor */
+ Impl(Impl &) = default;
+
+ /** Assignment */
+ Impl &operator=(Impl &) = default;
+
+ /** Get target GPU language. */
+ GpuLanguage gpu_language() const;
+
+ /** Get CL compile context. */
+ const CLCompileContext *cl_compile_context() const;
+
+ /** Get memory descriptor registry. */
+ const MemoryDescriptorMap &mem_map() const;
+
+ /** Set a new ID and register the user tensor info.
+ *
+     * The ownership of the tensor info object will be transferred to this context object.
+ *
+ * @param[in] tensor_info The tensor info to be registered.
+ */
+ void register_user_tensor(std::unique_ptr<TensorInfo> &&tensor_info);
+
+ /** Create a virtual (see @ref MemoryType) tensor info and save it
+ *
+ * @return ITensorInfo* The created virtual tensor info object pointer
+ */
+ ITensorInfo *create_virtual_tensor();
+ /** Create an auxiliary (see @ref MemoryType) tensor info and save it
+ *
+ * @param[in] tensor_info @ref ITensorInfo to copy from
+ *
+ * @return ITensorInfo* The created auxiliary tensor info object pointer
+ */
+ ITensorInfo *create_auxiliary_tensor(const ITensorInfo &tensor_info);
+
+ /** Get tensor info created by this context, from id */
+ ITensorInfo *get_tensor_info(ITensorInfo::Id id);
+
+ /** Get tensor info created by this context, from id */
+ const ITensorInfo *get_tensor_info(ITensorInfo::Id id) const;
+
+private:
+ ITensorInfo::Id next_tensor_id();
+
+ GpuLanguage _gpu_language;
+ CLCompileContext *_cl_compile_ctx;
+
+ ITensorInfo::Id _next_tensor_id;
+ MemoryDescriptorMap _mem_map;
+ std::map<ITensorInfo::Id, std::unique_ptr<TensorInfo>> _managed_tensor_info;
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADCONTEXTIMPL_H
diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp
new file mode 100644
index 0000000000..357cb48a84
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+
+GpuWorkloadSketch::GpuWorkloadSketch(Context *context) : _impl{std::make_unique<Implementation>(context)}
+{
+}
+
+GpuWorkloadSketch::~GpuWorkloadSketch()
+{
+}
+
+GpuWorkloadSketch::GpuWorkloadSketch(GpuWorkloadSketch &&) = default;
+
+GpuWorkloadSketch &GpuWorkloadSketch::operator=(GpuWorkloadSketch &&) = default;
+
+const GpuWorkloadSketch::Context *GpuWorkloadSketch::gpu_context() const
+{
+ return _impl->context();
+}
+
+GpuWorkloadSketch::Context *GpuWorkloadSketch::gpu_context()
+{
+ return _impl->context();
+}
+
+GpuWorkloadSketch::Implementation &GpuWorkloadSketch::implementation()
+{
+ return *_impl;
+}
+
+const GpuWorkloadSketch::Implementation &GpuWorkloadSketch::implementation() const
+{
+ return *_impl;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h
new file mode 100644
index 0000000000..04e294eacc
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSKETCHIMPL_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSKETCHIMPL_H
+
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h"
+
+#include "src/dynamic_fusion/sketch/gpu/GpuComponentServices.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Internal implementation of @ref GpuWorkloadSketch */
+class GpuWorkloadSketch::Implementation
+{
+public:
+ /** Constructor
+ *
+ * @param[in] context global workload creation context
+ */
+ explicit Implementation(Context *context)
+ : _context{context}, _comp_services{}, _component_graph{_context, &_comp_services}, _operator_group{}
+ {
+ }
+ /** Prevent instances of this class from being copy constructed */
+ Implementation(const Implementation &impl) = delete;
+ /** Prevent instances of this class from being copied */
+ Implementation &operator=(const Implementation &impl) = delete;
+ /** Allow instances of this class to be move constructed */
+ Implementation(Implementation &&impl) = default;
+ /** Allow instances of this class to be moved */
+ Implementation &operator=(Implementation &&impl) = default;
+ /** Get workload context */
+ const Context *context() const
+ {
+ return _context;
+ }
+ /** Get workload context */
+ Context *context()
+ {
+ return _context;
+ }
+ /** Get component graph */
+ const GpuKernelComponentGraph &component_graph() const
+ {
+ return _component_graph;
+ }
+ /** Get component graph */
+ GpuKernelComponentGraph &component_graph()
+ {
+ return _component_graph;
+ }
+ /** Get operator group */
+ const GpuOperatorGroup &operator_group() const
+ {
+ return _operator_group;
+ }
+ /** Get operator group */
+ GpuOperatorGroup &operator_group()
+ {
+ return _operator_group;
+ }
+ /** Generate @ref GpuWorkloadSourceCode from the workload sketch
+     * @note The sketch must be valid. Any error encountered during code generation will be thrown as an exception.
+ *
+ * @return GpuWorkloadSourceCode The generated workload code
+ */
+ GpuWorkloadSourceCode generate_source_code() const
+ {
+ const auto mem_map = _context->implementation().mem_map();
+ return component_graph().fuse(mem_map).write_workload_code();
+ }
+ /** Create a virtual (see @ref MemoryType) tensor info and save it
+ *
+ * @return ITensorInfo* The created virtual tensor info object pointer
+ */
+ ITensorInfo *create_virtual_tensor()
+ {
+ return _context->implementation().create_virtual_tensor();
+ }
+ /** Create an auxiliary (see @ref MemoryType) tensor info and save it
+ *
+ * @param[in] tensor_info @ref ITensorInfo to copy from
+ *
+ * @return ITensorInfo* The created auxiliary tensor info object pointer
+ */
+ ITensorInfo *create_auxiliary_tensor(const ITensorInfo &tensor_info)
+ {
+ return _context->implementation().create_auxiliary_tensor(tensor_info);
+ }
+
+ ITensorInfo *get_tensor_info(ITensorInfo::Id id)
+ {
+ return _context->implementation().get_tensor_info(id);
+ }
+
+private:
+ Context *_context;
+ GpuComponentServices _comp_services;
+ GpuKernelComponentGraph _component_graph;
+ GpuOperatorGroup _operator_group;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSKETCHIMPL_H
diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h
new file mode 100644
index 0000000000..5d75bcaaa0
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h
@@ -0,0 +1,300 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSOURCECODE_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSOURCECODE_H
+
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h"
+
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+/** Extract the kernel arguments of a single tensor from a flat list of kernel arguments.
+ *
+ * The arguments belonging to the first tensor in the list are removed from @p flat_kernel_args and returned.
+ *
+ * @param[in,out] flat_kernel_args Flat list of kernel arguments; the extracted arguments are popped from its front
+ *
+ * @return GpuKernelArgumentList Kernel arguments of the first tensor in @p flat_kernel_args
+ */
+GpuKernelArgumentList extract_kernel_args_for_one_tensor(GpuKernelArgumentList &flat_kernel_args)
+{
+ if (flat_kernel_args.empty())
+ {
+ return {};
+ }
+ GpuKernelArgumentList tensor_kargs{};
+
+ const GpuKernelArgumentBinding &karg_head = flat_kernel_args.front();
+ tensor_kargs.push_back(karg_head);
+ flat_kernel_args.pop_front();
+ const auto tensor_id = karg_head.id();
+
+ while (!flat_kernel_args.empty())
+ {
+ const GpuKernelArgumentBinding &karg = flat_kernel_args.front();
+ if (karg.id() != tensor_id) // Encounter the next tensor, return the current tensor's kernel arguments
+ {
+ return tensor_kargs;
+ }
+ tensor_kargs.push_back(karg);
+ flat_kernel_args.pop_front();
+ }
+ return tensor_kargs;
+}
+} // namespace
+/** Uniquely identifies a @ref GpuUnitWorkload within a @ref GpuWorkloadSourceCode */
+using UnitWorkloadId = int32_t;
+
+/** Describes all the info related to a **workload argument** (tensor) in order to:
+ * - be used by runtime to configure gpu kernel argument
+ * - be used by memory managers to allocate required memory
+ */
+class GpuWorkloadArgument
+{
+public:
+ /** Default constructor */
+ GpuWorkloadArgument() = default;
+ /** Constructor
+ *
+ * @param[in] tensor_info @ref ITensorInfo of the workload argument
+ * @param[in] mem_desc @ref MemoryDescriptor of the workload argument
+ * @param[in] kernel_args @ref GpuKernelArgumentList of the workload argument
+ */
+ GpuWorkloadArgument(const ITensorInfo &tensor_info,
+ const MemoryDescriptor &mem_desc,
+ const GpuKernelArgumentList &kernel_args)
+ : _tensor_info{tensor_info}, _mem_desc{mem_desc}, _kernel_args{kernel_args}
+ {
+ }
+ /** Get tensor id within workload */
+ ITensorInfo::Id id() const
+ {
+ return _tensor_info.id();
+ }
+ /** Get @ref ITensorInfo of the argument */
+ ITensorInfo *tensor_info()
+ {
+ return &_tensor_info;
+ }
+ /** Get @ref ITensorInfo of the argument */
+ const ITensorInfo *tensor_info() const
+ {
+ return &_tensor_info;
+ }
+ /** Get @ref MemoryDescriptor of the argument */
+ MemoryDescriptor *memory_descriptor()
+ {
+ return &_mem_desc;
+ }
+ /** Get @ref MemoryDescriptor of the argument */
+ const MemoryDescriptor *memory_descriptor() const
+ {
+ return &_mem_desc;
+ }
+ /** Get @ref GpuKernelArgumentList of the workload tensor */
+ GpuKernelArgumentList *kernel_argument_list()
+ {
+ return &_kernel_args;
+ }
+ /** Get @ref GpuKernelArgumentList of the workload tensor */
+ const GpuKernelArgumentList *kernel_argument_list() const
+ {
+ return &_kernel_args;
+ }
+ /** Check if the workload argument has valid id
+ *
+ * @return true If has valid id
+ * @return false Otherwise
+ */
+ bool has_valid_id() const
+ {
+ return _tensor_info.has_valid_id();
+ }
+
+private:
+ TensorInfo _tensor_info{};
+ MemoryDescriptor _mem_desc{};
+ GpuKernelArgumentList _kernel_args{};
+};
+
+/** Describes when a unit workload is run.
+ */
+struct UnitWorkloadStage
+{
+ enum class Stage
+ {
+ Prepare, /**< Only run once at the beginning. */
+ Run, /**< Run every time after the first time. */
+ };
+ Stage stage{Stage::Run};
+};
+
+inline bool operator==(const UnitWorkloadStage &stage0, const UnitWorkloadStage &stage1)
+{
+ return stage0.stage == stage1.stage;
+}
+
+/** The atomic unit in a Gpu workload. It contains exactly one kernel to run.
+ */
+class GpuUnitWorkload
+{
+public:
+ /** Default constructor */
+ GpuUnitWorkload() = default;
+ /** Constructor
+ *
+ * @param[in] id Id that uniquely identifies this unit workload in a workload
+ * @param[in] kernel_code @ref GpuKernelSourceCode contained within
+ * @param[in] stage Stage of the unit workload
+ */
+ GpuUnitWorkload(UnitWorkloadId id, const GpuKernelSourceCode &kernel_code, const UnitWorkloadStage &stage)
+ : _id{id}, _kernel_code{kernel_code}, _stage{stage}
+ {
+ }
+ /** Get the id of the unit workload */
+ UnitWorkloadId id() const
+ {
+ return _id;
+ }
+ /** Get reference to the underlying @ref GpuKernelSourceCode */
+ const GpuKernelSourceCode &code() const
+ {
+ return _kernel_code;
+ }
+ /** Get the stage of the unit workload */
+ UnitWorkloadStage stage() const
+ {
+ return _stage;
+ }
+
+private:
+ UnitWorkloadId _id{};
+ GpuKernelSourceCode _kernel_code{};
+ UnitWorkloadStage _stage{};
+};
+
+/** Hold the generated kernel source code and other information required to compile and run the workload.
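+ *
+ * Illustrative flow (sketch only; kernel_code, mem_map and context are assumed to come from kernel generation):
+ * @code
+ * GpuWorkloadSourceCode workload_code{};
+ * const UnitWorkloadId id = workload_code.add_unit_workload(kernel_code, UnitWorkloadStage{}, mem_map, context);
+ * const GpuUnitWorkload &unit = workload_code.query_unit_workload(id);
+ * @endcode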
+ */
+class GpuWorkloadSourceCode
+{
+public:
+ /** Default constructor */
+ GpuWorkloadSourceCode() = default;
+ /** Add a unit workload to the workload code
+ *
+ * @param[in] kernel_code @ref GpuKernelSourceCode to be contained within the unit workload
+ * @param[in] stage Stage of the unit workload
+ * @param[in] mem_map @ref MemoryDescriptor map for all tensors within the unit workload
+ * @param[in] context @ref GpuWorkloadContext associated with the unit workload
+ *
+ * @return UnitWorkloadId Allocated unit workload id
+ */
+ UnitWorkloadId add_unit_workload(const GpuKernelSourceCode &kernel_code,
+ const UnitWorkloadStage &stage,
+ const MemoryDescriptorMap &mem_map,
+ const GpuWorkloadContext *context)
+ {
+        // Use the current number of unit workloads as the Id
+ const auto uwk_id = static_cast<UnitWorkloadId>(_unit_workloads.size());
+ const auto unit_work = GpuUnitWorkload(uwk_id, kernel_code, stage);
+ _unit_workloads.push_back(unit_work);
+
+ GpuKernelArgumentList flat_kernel_args = kernel_code.arguments();
+ GpuKernelArgumentList tensor_kargs{};
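+        // Group the flat kernel argument list per tensor, and record which unit workloads reference each tensor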
+ while (true)
+ {
+ tensor_kargs = extract_kernel_args_for_one_tensor(flat_kernel_args);
+ if (tensor_kargs.empty())
+ {
+ break;
+ }
+ else
+ {
+ const auto tensor_id = tensor_kargs.at(0).id();
+ _workload_arguments[tensor_id] = GpuWorkloadArgument{
+ *context->implementation().get_tensor_info(tensor_id), mem_map.at(tensor_id), tensor_kargs};
+ if (_tensor_uwork_map.find(tensor_id) == _tensor_uwork_map.end())
+ {
+ _tensor_uwork_map[tensor_id] = std::set<UnitWorkloadId>();
+ }
+ _tensor_uwork_map[tensor_id].insert(uwk_id);
+ }
+ }
+
+ return uwk_id;
+ }
+ /** Get a unit workload from its id */
+ const GpuUnitWorkload &query_unit_workload(UnitWorkloadId id) const
+ {
+ ARM_COMPUTE_ERROR_ON(id < 0);
+ return _unit_workloads.at(id);
+ }
+ /** Get all unit workloads sorted in topological order */
+ std::vector<UnitWorkloadId> unit_workloads() const
+ {
+ std::vector<UnitWorkloadId> ids{};
+
+ for (const auto &uwk : _unit_workloads)
+ {
+ ids.push_back(uwk.id());
+ }
+ return ids;
+ }
+ /** Get a @ref GpuWorkloadArgument from its associated tensor id */
+ const GpuWorkloadArgument *query_tensor(ITensorInfo::Id t_id) const
+ {
+ return &_workload_arguments.at(t_id);
+ }
+ /** Get all tensors in the entire workload */
+ std::vector<ITensorInfo::Id> tensors() const
+ {
+ std::vector<ITensorInfo::Id> ids{};
+ for (const auto &id_tensor : _workload_arguments)
+ {
+ ids.push_back(id_tensor.first);
+ }
+ return ids;
+ }
+ /** Get all unit workloads connected to the tensor with @p t_id */
+ std::vector<UnitWorkloadId> get_unit_workloads_from_tensor(ITensorInfo::Id t_id) const
+ {
+ const auto unit_work_set = _tensor_uwork_map.at(t_id);
+ return std::vector<UnitWorkloadId>(unit_work_set.begin(), unit_work_set.end());
+ }
+
+private:
+ std::vector<GpuUnitWorkload> _unit_workloads{};
+ std::map<ITensorInfo::Id, GpuWorkloadArgument> _workload_arguments{};
+ std::map<ITensorInfo::Id, std::set<UnitWorkloadId>> _tensor_uwork_map{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSOURCECODE_H
diff --git a/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h b/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h
new file mode 100644
index 0000000000..84972501de
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_IGPUKERNELWRITER_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_IGPUKERNELWRITER_H
+
+#include "arm_compute/core/CL/CLCompileContext.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h"
+
+#include <map>
+#include <string>
+#include <vector>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** An interface that can write a gpu kernel
+ */
+class IGpuKernelWriter
+{
+public:
+ /** Destructor */
+ virtual ~IGpuKernelWriter()
+ {
+ }
+ /** Generate kernel name */
+ virtual std::string get_name() = 0;
+ /** Generate kernel code */
+ virtual std::string get_code() = 0;
+ /** Generate build options */
+ virtual CLBuildOptions get_build_options()
+ {
+ return {};
+ }
+ /** Generate config id string of the entire kernel. This is used for tuning */
+ virtual std::string get_config_id() = 0;
+ /** Generate execution window */
+ virtual Window get_window() const = 0;
+    /** Get the flat list of arguments of the kernel */
+ virtual GpuKernelArgumentList get_kernel_arguments()
+ {
+ return {};
+ }
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_IGPUKERNELWRITER_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp
new file mode 100644
index 0000000000..a42b39700c
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h"
+
+#include "compute_kernel_writer/include/ckw/Error.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+
+GpuCkwComponentArgument::GpuCkwComponentArgument(ckw::TensorOperand tensor) : _tensor(tensor)
+{
+}
+
+GpuCkwComponentArgument &GpuCkwComponentArgument::init_virtual_tensor(ckw::TileOperand &tile,
+ const ckw::TensorSampler &sampler)
+{
+    CKW_ASSERT(!_tile.is_valid());
+
+ _tile = tile;
+ _sampler = sampler;
+
+ return *this;
+}
+
+bool GpuCkwComponentArgument::has_tensor() const
+{
+ return _tensor.is_valid();
+}
+
+ckw::TensorOperand &GpuCkwComponentArgument::tensor()
+{
+ CKW_ASSERT(_tensor.is_valid());
+
+ return _tensor;
+}
+
+const ckw::TensorOperand &GpuCkwComponentArgument::tensor() const
+{
+ CKW_ASSERT(_tensor.is_valid());
+
+ return _tensor;
+}
+
+bool GpuCkwComponentArgument::has_tile() const
+{
+ return _tile.is_valid();
+}
+
+ckw::TileOperand &GpuCkwComponentArgument::tile()
+{
+ CKW_ASSERT(_tile.is_valid());
+
+ return _tile;
+}
+
+const ckw::TileOperand &GpuCkwComponentArgument::tile() const
+{
+ CKW_ASSERT(_tile.is_valid());
+
+ return _tile;
+}
+
+ckw::TensorSampler &GpuCkwComponentArgument::tensor_sampler()
+{
+ CKW_ASSERT(_tile.is_valid());
+
+ return _sampler;
+}
+
+const ckw::TensorSampler &GpuCkwComponentArgument::tensor_sampler() const
+{
+ CKW_ASSERT(_tile.is_valid());
+
+ return _sampler;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h
new file mode 100644
index 0000000000..7a57c81e5f
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWCOMPONENTARGUMENT_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWCOMPONENTARGUMENT_H
+
+#include "compute_kernel_writer/include/ckw/TensorOperand.h"
+#include "compute_kernel_writer/include/ckw/TensorSampler.h"
+#include "compute_kernel_writer/include/ckw/TileOperand.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+
+/** The argument of a dynamic fusion component which can be either user tensor or virtual tensor. */
+class GpuCkwComponentArgument
+{
+public:
+ /** Default constructor */
+ GpuCkwComponentArgument() = default;
+
+ /** Initialize a new instance of @ref GpuCkwComponentArgument class for user tensor.
+ *
+ * @param[in] tensor The user tensor.
+ */
+ explicit GpuCkwComponentArgument(ckw::TensorOperand tensor);
+
+ /** Bind the tile and sampler to the tensor argument.
+ *
+ * This method can be used to share a tile and sampler associated to a tensor
+ * among different kernel components. For example, when we create the destination
+ * tile and destination sampler for the first time (root component), this method can be
+ * used to bind these two information to the destination tensor so that the following
+ * simple components know the tile size and how to access the elements from memory.
+ *
+ * @param[in] tile The tile that has been loaded.
+ * @param[in] sampler The tensor sampling information that has been used to load the tile.
+ */
+ GpuCkwComponentArgument &init_virtual_tensor(ckw::TileOperand &tile, const ckw::TensorSampler &sampler);
+
+ /** Get whether the argument is a user tensor. */
+ bool has_tensor() const;
+
+ /** Get the tensor operand.
+ *
+ * If the tensor is not available, throw an error.
+ */
+ ckw::TensorOperand &tensor();
+
+ /** Get the tensor operand.
+ *
+ * If the tensor is not available, throw an error.
+ */
+ const ckw::TensorOperand &tensor() const;
+
+ /** Get whether the argument contains a tile.
+ *
+ * The argument can be either a user tensor that has been loaded,
+ * or a virtual tensor (i.e. a tile with tensor sampling information).
+ */
+ bool has_tile() const;
+
+ /** Get the tile operand.
+ *
+ * If the tile is not available, throw an error.
+ */
+ ckw::TileOperand &tile();
+
+ /** Get the tile operand.
+ *
+ * If the tile is not available, throw an error.
+ */
+ const ckw::TileOperand &tile() const;
+
+ /** Get the tensor sampling information for the tile.
+ *
+ * If the tile is not available, throw an error.
+ */
+ ckw::TensorSampler &tensor_sampler();
+
+ /** Get the tensor sampling information for the tile.
+ *
+ * If the tile is not available, throw an error.
+ */
+ const ckw::TensorSampler &tensor_sampler() const;
+
+private:
+ ckw::TensorOperand _tensor{};
+ ckw::TileOperand _tile{};
+ ckw::TensorSampler _sampler{};
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWCOMPONENTARGUMENT_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp
new file mode 100644
index 0000000000..a0e5e16aa0
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/common/utils/Log.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h"
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+
+#include "compute_kernel_writer/include/ckw/KernelWriter.h"
+#include "compute_kernel_writer/include/ckw/types/TargetArchitecture.h"
+#include "compute_kernel_writer/include/ckw/types/TargetLanguage.h"
+
+using namespace ckw;
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+GpuCkwDriver::GpuCkwDriver(const GpuKernelComponentGroup &components)
+{
+ _components = components;
+
+ // Generate kernel name
+ std::string kernel_name;
+ for (auto &comp : _components)
+ {
+ auto ckw_driver = comp->ckw_component_driver();
+ ARM_COMPUTE_ERROR_ON(ckw_driver == nullptr);
+ kernel_name += ckw_driver->get_name(_components) + "__";
+ }
+
+ // Generate kernel code
+ auto root_writer =
+ KernelWriter::create_instance(ckw::TargetArchitecture::GpuArmMaliValhall, ckw::TargetLanguage::OpenCL);
+ GpuCkwScopedKernelWriter writer(root_writer.get());
+ GpuCkwVariableTable vtable{};
+
+ for (auto &comp : _components)
+ {
+ auto ckw_driver = comp->ckw_component_driver();
+ ARM_COMPUTE_ERROR_ON(ckw_driver == nullptr);
+ ckw_driver->write_component_code(_components, vtable, writer);
+ }
+ auto kernel = root_writer->emit_kernel(kernel_name);
+
+ // Set the kernel name, kernel arguments and source code
+ _kernel_name = kernel_name;
+ _kernel_args = kernel->arguments();
+ _kernel_code = kernel->source_code();
+}
+
+std::string GpuCkwDriver::get_name()
+{
+ return _kernel_name;
+}
+
+std::string GpuCkwDriver::get_code()
+{
+ return _kernel_code;
+}
+
+std::string GpuCkwDriver::get_config_id()
+{
+ std::string id;
+ for (auto &comp : _components)
+ {
+ auto ckw_driver = comp->ckw_component_driver();
+ ARM_COMPUTE_ERROR_ON(ckw_driver == nullptr);
+        id += ckw_driver->get_tuner_id(_components) + "__";
+ }
+ return id;
+}
+
+Window GpuCkwDriver::get_window() const
+{
+ const auto root_comp = _components.get_root_component();
+ ARM_COMPUTE_ERROR_ON_MSG(root_comp == nullptr, "No root component found");
+ return root_comp->ckw_component_driver()->get_window();
+}
+
+GpuKernelArgumentList GpuCkwDriver::get_kernel_arguments()
+{
+ GpuKernelArgumentList args{};
+ for (const auto &arg : _kernel_args)
+ {
+ switch (arg.type())
+ {
+ case KernelArgument::Type::TensorStorage:
+ {
+ args.emplace_back(static_cast<ITensorInfo::Id>(arg.id()), from_ckw(arg.tensor_storage_type()));
+ break;
+ }
+ case KernelArgument::Type::TensorComponent:
+ {
+ args.emplace_back(static_cast<ITensorInfo::Id>(arg.id()), from_ckw(arg.tensor_component_type()));
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Unsupported KernelArgument Type");
+ break;
+ }
+ }
+ }
+ return args;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h
new file mode 100644
index 0000000000..f8770920b7
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWDRIVER_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWDRIVER_H
+
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h"
+
+#include "compute_kernel_writer/include/ckw/Kernel.h"
+#include "compute_kernel_writer/include/ckw/KernelArgument.h"
+#include <string>
+
+namespace arm_compute
+{
+/** Forward declarations */
+class Window;
+
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Use the Compute Kernel Writer (CKW) to write the kernel code
+ * Used by the dynamic_fusion module
+ */
+class GpuCkwDriver : public IGpuKernelWriter
+{
+public:
+ /** Deleted default constructor */
+ GpuCkwDriver() = delete;
+ /** Constructor
+ *
+ * @param[in] components Kernel component group from which the kernel will be generated
+ */
+ GpuCkwDriver(const GpuKernelComponentGroup &components);
+ /** Destructor */
+ ~GpuCkwDriver() override = default;
+ /** Generate kernel name */
+ std::string get_name() override;
+ /** Generate kernel code */
+ std::string get_code() override;
+ /** Generate config id string of the entire kernel. This is used for tuning */
+ std::string get_config_id() override;
+ /** Generate execution window */
+ Window get_window() const override;
+ /** Get the flat list of arguments of the kernel */
+ GpuKernelArgumentList get_kernel_arguments() override;
+
+private:
+ GpuKernelComponentGroup _components{};
+ std::string _kernel_name{};
+ std::vector<ckw::KernelArgument> _kernel_args{};
+ std::string _kernel_code{};
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWDRIVER_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp
new file mode 100644
index 0000000000..ae12d13e5a
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+
+GpuCkwScopedKernelWriter::GpuCkwScopedKernelWriter(ckw::KernelWriter *writer)
+ : _writer(writer), _parent_id_space(writer->id_space())
+{
+ _writer->new_id_space();
+}
+
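+// Note: copying the scoped writer opens a fresh id space on the shared ckw::KernelWriter. This is why
+// component drivers receive the writer by value: each copy gives the component its own naming scope
+// for the tiles and variables it declares.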
+GpuCkwScopedKernelWriter::GpuCkwScopedKernelWriter(const GpuCkwScopedKernelWriter &other)
+ : _writer(other._writer), _parent_id_space(other._writer->id_space())
+{
+ _writer->new_id_space();
+}
+
+ckw::KernelWriter *GpuCkwScopedKernelWriter::operator->()
+{
+ return _writer;
+}
+
+const ckw::KernelWriter *GpuCkwScopedKernelWriter::operator->() const
+{
+ return _writer;
+}
+
+ckw::KernelWriter *GpuCkwScopedKernelWriter::writer()
+{
+ return _writer;
+}
+
+const ckw::KernelWriter *GpuCkwScopedKernelWriter::writer() const
+{
+ return _writer;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h
new file mode 100644
index 0000000000..84dd706cd0
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWSCOPEDKERNELWRITER_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWSCOPEDKERNELWRITER_H
+
+#include "compute_kernel_writer/include/ckw/KernelWriter.h"
+#include <cstdint>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+
+/** Helper to automatically manage kernel writer ID space. */
+class GpuCkwScopedKernelWriter
+{
+public:
+ /** Initialize a new instance of the @ref GpuCkwScopedKernelWriter class. */
+ explicit GpuCkwScopedKernelWriter(ckw::KernelWriter *writer);
+
+ /** Create a new scope from the specified scoped kernel writer. */
+ GpuCkwScopedKernelWriter(const GpuCkwScopedKernelWriter &other);
+
+ /** Assignment is disallowed. */
+ GpuCkwScopedKernelWriter &operator=(const GpuCkwScopedKernelWriter &) = delete;
+
+ /** Access the underlying kernel writer. */
+ ckw::KernelWriter *operator->();
+
+ /** Access the underlying kernel writer. */
+ const ckw::KernelWriter *operator->() const;
+
+ /** Get the kernel writer. */
+ ckw::KernelWriter *writer();
+
+ /** Get the kernel writer. */
+ const ckw::KernelWriter *writer() const;
+
+private:
+ ckw::KernelWriter *_writer;
+ int32_t _parent_id_space;
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWSCOPEDKERNELWRITER_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp
new file mode 100644
index 0000000000..66ccc1ac34
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
+
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+
+#include <sstream>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+GpuCkwComponentArgument *GpuCkwVariableTable::declare_variable(const GpuKernelComponentGroup &comp_group,
+ GpuCkwScopedKernelWriter &writer,
+ const ITensorInfo *tensor,
+ const std::string &alias)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(!tensor->has_valid_id(), "Tensor info with valid id expected");
+
+ // Do not re-declare if the variable associated with the tensor has already been declared
+ auto it = _vars.find(tensor->id());
+
+ if (it != _vars.end())
+ {
+ return &it->second;
+ }
+ if (comp_group.is_intermediate_tensor(tensor))
+ {
+ // Create a virtual tensor variable
+ GpuCkwComponentArgument var;
+ auto &&inserted = _vars.emplace(tensor->id(), var);
+ return &(inserted.first->second);
+ }
+ else
+ {
+ // Create a user tensor variable
+ std::stringstream ss;
+ ss << alias << "_t" << abs(tensor->id());
+ const auto uniq_name = ss.str();
+ GpuCkwComponentArgument var{writer->declare_tensor_argument(uniq_name, to_ckw(*tensor))};
+ auto &&inserted = _vars.emplace(tensor->id(), var);
+ return &(inserted.first->second);
+ }
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h
new file mode 100644
index 0000000000..fc8764c3e2
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWVARIABLETABLE_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWVARIABLETABLE_H
+
+#include "arm_compute/core/ITensorInfo.h"
+
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h"
+
+#include <map>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class GpuKernelComponentGroup;
+class GpuCkwScopedKernelWriter;
+
+/** A table of all the variables used in the kernel.
+ *
+ * It determines whether to create a virtual tensor variable or a user tensor variable.
+ * It avoids duplicating variables for the same tensors (tensors with the same id).
+ * Each kernel has exactly one variable table.
+ */
+class GpuCkwVariableTable
+{
+public:
+ /** Declare a kernel component variable (argument) for the corresponding tensor info.
+ *
+ * @param[in] comp_group Component group the tensor belongs to
+ * @param[in] writer Compute Kernel Writer
+ * @param[in] tensor Tensor info with which the new variable is associated
+ * @param[in] alias Alias for the variable. Will be used as part of the variable name
+ *
+ * @return GpuCkwComponentArgument*
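+ *
+ * Typical use inside a component driver's write_component_code() (sketch only; _src and _dst stand for
+ * the component's own tensor infos):
+ *
+ *   GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src");
+ *   GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");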
+ */
+ GpuCkwComponentArgument *declare_variable(const GpuKernelComponentGroup &comp_group,
+ GpuCkwScopedKernelWriter &writer,
+ const ITensorInfo *tensor,
+ const std::string &alias = "unnamed");
+
+private:
+ std::map<ITensorInfo::Id, GpuCkwComponentArgument> _vars{};
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWVARIABLETABLE_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h
new file mode 100644
index 0000000000..52e56e2e35
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_IGPUCKWCOMPONENTDRIVER
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_IGPUCKWCOMPONENTDRIVER
+
+#include "arm_compute/core/Window.h"
+
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/components/Types.h"
+
+namespace arm_compute
+{
+class ITensorInfo;
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+class GpuKernelComponentGroup;
+class GpuCkwVariableTable;
+class GpuCkwScopedKernelWriter;
+
+/** An interface used by @ref GpuCkwDriver to write source code for a kernel component
+ *
+ * There are 3 main architecture layers for using Compute Kernel Writer (Ckw) inside ACL's dynamic fusion module
+ * From top level to bottom level:
+ * | Layer | Library
+ * ===========================
+ * | dynamic_fusion | acl
+ * | ckw_driver | acl
+ * | ckw | ckw
+ *
+ * ckw_driver is a glue layer that directs how fused code is produced using the ckw library
+ *
+ * There are two main groups within ckw_driver:
+ * - @ref GpuCkwDriver is a global driver that coordinates how the final fused code along with all the info necessary
+ * for run time execution is produced using ckw
+ * - Various classes implementing @ref IGpuCkwComponentDriver are component drivers that direct ckw to generate kernel component code (e.g. activation, store, etc.)
+ *
+ * The overall flow goes like this:
+ * In dynamic_fusion module, @ref GpuLogicalKernel instantiates a @ref GpuCkwDriver from a @ref GpuKernelComponentGroup
+ * The logical kernel then uses the global driver's various interfaces to retrieve the generated code and execution info.
+ * In particular, while generating the fused kernel, @ref GpuCkwDriver calls into each @ref IGpuCkwComponentDriver::write_component_code()
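+ *
+ * A minimal sketch of that flow (illustrative only; "components" stands for an already-populated @ref GpuKernelComponentGroup):
+ *
+ *   GpuCkwDriver driver(components);                             // each component's code is written via write_component_code()
+ *   const std::string     name = driver.get_name();              // concatenated component names
+ *   const std::string     code = driver.get_code();              // fused kernel source code
+ *   GpuKernelArgumentList args = driver.get_kernel_arguments();  // flat argument list for runtime binding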
+ */
+class IGpuCkwComponentDriver
+{
+public:
+ using ComponentGroup = GpuKernelComponentGroup;
+
+public:
+ /** Constructor
+ *
+ * @param[in] id Component id
+ * @param[in] tensors Tensor arguments to the components
+ */
+ IGpuCkwComponentDriver(ComponentId id, const ArgumentPack<ITensorInfo> &tensors) : _id{id}, _tensors{tensors}
+ {
+ }
+ /** Destructor */
+ virtual ~IGpuCkwComponentDriver()
+ {
+ }
+ /** Generate kernel component code
+ *
+ * @param[in] comp_group Component group of which the component is a part
+ * @param[in, out] vtable Table of variables declared by each component
+ * @param[in, out] writer CKW writer that writes code scoped to this kernel component.
+ *
+ * @note @p writer can only be passed by value since the new scope is created in the copy constructor
+ */
+ virtual void write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const = 0;
+ /** Get tensor arguments */
+ ArgumentPack<ITensorInfo> tensors() const
+ {
+ return _tensors;
+ }
+ /** Generate the execution window for the component */
+ virtual Window get_window() const
+ {
+ return Window{};
+ }
+ /** Generate the name of the component
+ *
+ * This will be concatenated with other components' names to form the name of the kernel
+ */
+ virtual std::string get_name(const ComponentGroup &comp_group) const
+ {
+ ARM_COMPUTE_UNUSED(comp_group);
+ return "unnamed";
+ }
+ /** Generate the tuner id of the component
+ * This id should capture all the parameters that distinguish one kernel's lws tuning from another.
+ * e.g. two components that are identical in every other way, but have different output tensor dimensions, should
+ * have different tuner ids, because the lws of one may not be optimal on the other.
+ *
+ * This will be concatenated with other components' tuner id to form the tuner id of the kernel
+ */
+ virtual std::string get_tuner_id(const ComponentGroup &comp_group) const
+ {
+ ARM_COMPUTE_UNUSED(comp_group);
+ return "";
+ }
+ /** Get component id */
+ ComponentId id() const
+ {
+ return _id;
+ }
+
+private:
+ ComponentId _id{-1};
+ ArgumentPack<ITensorInfo> _tensors{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif /* ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_IGPUCKWCOMPONENTDRIVER */
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp
new file mode 100644
index 0000000000..18fda5bd6b
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp
@@ -0,0 +1,295 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "GpuCkwActivation.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+
+#include "compute_kernel_writer/include/ckw/KernelWriter.h"
+#include <cstdint>
+#include <string>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+
+GpuCkwActivation::GpuCkwActivation(ComponentId id,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes) // NOLINT
+ : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{}, _attributes{attributes}
+{
+ _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
+ _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
+}
+
+void GpuCkwActivation::write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const
+{
+ /********************************************************************************
+ * 1 - Define tensors
+ ********************************************************************************/
+ GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src");
+ GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
+
+ /********************************************************************************
+ * 2 - Define CKW constants
+ ********************************************************************************/
+ const auto dst_h = static_cast<int32_t>(_dst->dimension(1));
+ const auto dst_dt = to_ckw(_dst->data_type());
+
+ // CKW constants
+ auto const_dst_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_h}}, ckw::DataType::Int32));
+ auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32));
+ auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32));
+ auto const_neg_1_fp = writer->declare_constant_tile(ckw::ConstantData({{-1.0f}}, dst_dt));
+ auto const_pos_1_fp = writer->declare_constant_tile(ckw::ConstantData({{1.0f}}, dst_dt));
+ auto const_0_fp = writer->declare_constant_tile(ckw::ConstantData({{0.0f}}, dst_dt));
+ auto const_A_fp = writer->declare_constant_tile(ckw::ConstantData({{_attributes.a()}}, dst_dt));
+ auto const_B_fp = writer->declare_constant_tile(ckw::ConstantData({{_attributes.b()}}, dst_dt));
+
+ /********************************************************************************
+ * 3 - Define the compute block parameters and destination tile (if not root component)
+ * Bind the tile to the tensor to share it among different components and
+ * initialize the compute block parameters
+ ********************************************************************************/
+ // The compute block parameters depend on the employed tensor format
+
+ // Destination compute block size
+ int32_t dst_n0 = -1;
+ int32_t dst_m0 = -1;
+
+ // Destination compute block size left-over
+ int32_t dst_n0_partial = -1;
+ int32_t dst_m0_partial = -1;
+
+ // Shift-back for the overlapping-min strategy
+ int32_t dst_shift_back = -1;
+
+ if (!dst->has_tile())
+ {
+ // If ROOT component, we use ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1
+ // as tensor format
+ const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
+
+ dst_n0 = root_window.x().step();
+ dst_m0 = root_window.y().step();
+ dst_n0_partial = _dst->dimension(0) % dst_n0;
+ dst_m0_partial = (_dst->dimension(1) * _dst->dimension(2)) % dst_m0;
+ dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0;
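+ // Illustrative example: with dimension(0) = 10 and dst_n0 = 4, dst_n0_partial = 2 and dst_shift_back = 2,
+ // i.e. the last compute block is shifted back by two columns so that it overlaps the previous block
+ // instead of accessing out-of-bounds elements (overlapping-min strategy)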
+
+ ckw::TensorSampler sampler_dst;
+ sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1);
+
+ if (dst_n0_partial == 0)
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin);
+ }
+
+ if (dst_m0_partial == 0)
+ {
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::ClampToBorderMaxOnly);
+ }
+
+ sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+ // Declare destination tile
+ auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, dst_m0, dst_n0));
+
+ // Bind tile to the tensor
+ dst->init_virtual_tensor(tile_dst, sampler_dst);
+ }
+ else
+ {
+ // dst_m0_partial depends on the TensorSamplerFormat
+ dst_n0 = dst->tile().tile_info().width();
+ dst_m0 = dst->tile().tile_info().height();
+ dst_n0_partial = _dst->dimension(0) % dst_n0;
+
+ ckw::TensorSampler sampler_dst = dst->tensor_sampler();
+
+ if (sampler_dst.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1)
+ {
+ dst_m0_partial = (_dst->dimension(1) * _dst->dimension(2)) % dst_m0;
+ }
+ else if (sampler_dst.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2)
+ {
+ dst_m0_partial = _dst->dimension(1) % dst_m0;
+ }
+
+ // Shift-back for the overlapping-min strategy
+ dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0;
+ }
+
+ const auto &tile_dst = dst->tile();
+
+ /********************************************************************************
+ * 4 - Define the compute block parameters CKW constants
+ ********************************************************************************/
+ // Only now can we declare N0 and M0 as constants
+ auto const_dst_n0 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32));
+ auto const_dst_m0 = writer->declare_constant_tile(ckw::ConstantData({{dst_m0}}, ckw::DataType::Int32));
+ auto const_dst_shift_back_n0 =
+ writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32));
+
+ /********************************************************************************
+ * 5 - Define the sampler for the input tensor
+ ********************************************************************************/
+ if (!src->has_tile())
+ {
+ // Sampler
+ ckw::TensorSampler sampler_src = dst->tensor_sampler();
+
+ auto tile_gid_0 = writer->declare_tile("gid_0_src", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_1 = writer->declare_tile("gid_1_src", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_2 = writer->declare_tile("gid_2_src", ckw::TileInfo(ckw::DataType::Int32));
+
+ writer->op_get_global_id(tile_gid_0, 0);
+ writer->op_get_global_id(tile_gid_1, 1);
+ writer->op_get_global_id(tile_gid_2, 2);
+
+ auto tile_nout0 = writer->declare_tile("nout0_src", ckw::TileInfo(ckw::DataType::Int32)); // OFM
+ auto tile_mout0 =
+ writer->declare_tile("mout0_src", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH or WIDTH x HEIGHT
+ auto tile_mout1 = writer->declare_tile("mout1_src", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT or 0
+ auto tile_bout0 = writer->declare_tile("bout0_src", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX
+
+ get_coordinate_from_gws_overlapping_min(writer, tile_nout0, tile_gid_0, const_dst_n0, const_dst_shift_back_n0,
+ const_0_i32);
+ get_coordinate_from_gws(writer, tile_mout0, tile_gid_1, const_dst_m0);
+
+ // Get the boundary aware coordinates at each global dimension index
+ if (sampler_src.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1)
+ {
+ writer->op_assign(tile_mout1, const_0_i32);
+ get_coordinate_from_gws(writer, tile_bout0, tile_gid_2, const_pos_1_i32);
+ }
+ else if (sampler_src.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2)
+ {
+ writer->op_binary(tile_mout1, ckw::BinaryOp::Mod, tile_gid_2, const_dst_h_i32);
+ writer->op_binary(tile_bout0, ckw::BinaryOp::Div, tile_gid_2, const_dst_h_i32);
+ }
+
+ auto tile_src = writer->declare_tile("src", ckw::TileInfo(dst_dt, dst_m0, dst_n0));
+
+ writer->op_load(tile_src, src->tensor(), sampler_src, tile_nout0, tile_mout0, tile_mout1, tile_bout0);
+
+ // Here, init_virtual_tensor() is used to bring tile_src outside the compound statement
+ src->init_virtual_tensor(tile_src, sampler_src);
+ }
+
+ const auto &tile_src = src->tile();
+
+ /********************************************************************************
+ * 7 - Write the rest of the code
+ ********************************************************************************/
+ switch (_attributes.activation())
+ {
+ case ActivationLayerInfo::ActivationFunction::LOGISTIC:
+ {
+ // dst = src * -1
+ writer->op_binary(tile_dst, ckw::BinaryOp::Mul, tile_src, const_neg_1_fp);
+ // dst = exp(src * -1)
+ writer->op_unary(tile_dst, ckw::UnaryOp::Exp, tile_dst);
+ // dst = 1 + (exp(src * -1))
+ writer->op_binary(tile_dst, ckw::BinaryOp::Add, tile_dst, const_pos_1_fp);
+ // dst = 1 / (1 + exp(src * -1))
+ writer->op_binary(tile_dst, ckw::BinaryOp::Div, const_pos_1_fp, tile_dst);
+ break;
+ }
+ case ActivationLayerInfo::ActivationFunction::TANH:
+ {
+ writer->op_unary(tile_dst, ckw::UnaryOp::Tanh, tile_src);
+ break;
+ }
+ case ActivationLayerInfo::ActivationFunction::RELU:
+ {
+ // dst = max(src, 0)
+ writer->op_binary(tile_dst, ckw::BinaryOp::Max, tile_src, const_0_fp);
+ break;
+ }
+ case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
+ {
+ // dst = max(src, 0)
+ writer->op_binary(tile_dst, ckw::BinaryOp::Max, tile_src, const_0_fp);
+ // dst = min(max(src, 0), A_VAL)
+ writer->op_binary(tile_dst, ckw::BinaryOp::Min, tile_dst, const_A_fp);
+ break;
+ }
+ case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
+ {
+ // dst = max(src, B_VAL)
+ writer->op_binary(tile_dst, ckw::BinaryOp::Max, tile_src, const_B_fp);
+ // dst = min(max(src, B_VAL), A_VAL)
+ writer->op_binary(tile_dst, ckw::BinaryOp::Min, tile_dst, const_A_fp);
+ break;
+ }
+ default:
+ CKW_ASSERT(false);
+ break;
+ }
+ ARM_COMPUTE_ERROR_ON_MSG(dst->has_tile() == false, "You must bind a tile before appending another component");
+}
+
+Window GpuCkwActivation::get_window() const
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
+
+ TensorShape output_shape = _dst->tensor_shape();
+ // Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) unchanged
+ // This is in line with the collapsing convention used by operators like Conv2d
+ output_shape.collapse(2U, 1U);
+ constexpr uint32_t vector_size_byte_opencl = 16;
+ const uint32_t num_elems_processed_per_iteration =
+ adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0));
+ Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration));
+
+ return win;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.h
new file mode 100644
index 0000000000..386e933a72
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWACTIVATION
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWACTIVATION
+
+#include "src/core/common/Macros.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class GpuCkwActivation : public IGpuCkwComponentDriver
+{
+public:
+ using Attributes = ClComponentActivation::Attributes;
+ /** Constructor
+ *
+ * For supported configurations please refer to @ref ClComponentActivation::validate()
+ *
+ * @param[in] id Component id
+ * @param[in] tensors Tensor arguments to the component
+ * @param[in] attributes Component attributes
+ */
+ GpuCkwActivation(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes);
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwActivation);
+ /** Destructor */
+ ~GpuCkwActivation() override = default;
+ // Inherited methods overridden:
+ virtual void write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const override;
+ Window get_window() const override;
+
+private:
+ const ITensorInfo *_src;
+ const ITensorInfo *_dst;
+ Attributes _attributes;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif /* ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWACTIVATION */
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp
new file mode 100644
index 0000000000..d3e0dbafd4
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp
@@ -0,0 +1,256 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "GpuCkwCast.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+
+#include "compute_kernel_writer/include/ckw/KernelWriter.h"
+#include <cstdint>
+#include <string>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+
+GpuCkwCast::GpuCkwCast(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes)
+ : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{}, _attributes{attributes}
+{
+ _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
+ _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
+ ARM_COMPUTE_ERROR_ON_MSG(is_data_type_float(_src->data_type()) == false,
+ "The source data type must be a floating-point data type");
+}
+
+void GpuCkwCast::write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const
+{
+ /********************************************************************************
+ * 1 - Define tensors
+ ********************************************************************************/
+ GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src");
+ GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
+
+ /********************************************************************************
+ * 2 - Define CKW constants
+ ********************************************************************************/
+ const auto dst_h = static_cast<int32_t>(_dst->dimension(1));
+
+ // CKW constants
+ auto const_dst_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_h}}, ckw::DataType::Int32));
+ auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32));
+ auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32));
+
+ /********************************************************************************
+ * 3 - Define the compute block parameters and destination tile (if not root component)
+ * Bind the tile to the tensor to share it among different components and
+ * initialize the compute block parameters
+ ********************************************************************************/
+ // The compute block parameters depend on the employed tensor format
+
+ // Destination compute block size
+ int32_t dst_n0 = -1;
+ int32_t dst_m0 = -1;
+
+ // Destination compute block size left-over
+ int32_t dst_n0_partial = -1;
+ int32_t dst_m0_partial = -1;
+
+ // Shift-back for the overlapping-min strategy
+ int32_t dst_shift_back = -1;
+
+ if (!dst->has_tile())
+ {
+ // If ROOT component, we use ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1
+ // as tensor format
+ const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
+
+ dst_n0 = root_window.x().step();
+ dst_m0 = root_window.y().step();
+ dst_n0_partial = _dst->dimension(0) % dst_n0;
+ dst_m0_partial = (_dst->dimension(1) * _dst->dimension(2)) % dst_m0;
+ dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0;
+
+ ckw::TensorSampler sampler_dst;
+ sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1);
+ if (dst_n0_partial == 0)
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin);
+ }
+
+ if (dst_m0_partial == 0)
+ {
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::ClampToBorderMaxOnly);
+ }
+
+ sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+ // Declare destination tile
+ ckw::DataType dst_dt = to_ckw(_dst->data_type());
+ auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, dst_m0, dst_n0));
+
+ // Bind tile to the tensor
+ dst->init_virtual_tensor(tile_dst, sampler_dst);
+ }
+ else
+ {
+ // Change dst_n0 and dst_m0 if NOT root component!
+ // ATTENTION:
+ // dst_m0_partial depends on the TensorSamplerFormat
+ dst_n0 = dst->tile().tile_info().width();
+ dst_m0 = dst->tile().tile_info().height();
+ dst_n0_partial = _dst->dimension(0) % dst_n0;
+
+ ckw::TensorSampler sampler_dst = dst->tensor_sampler();
+
+ if (sampler_dst.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1)
+ {
+ dst_m0_partial = (_dst->dimension(1) * _dst->dimension(2)) % dst_m0;
+ }
+ else if (sampler_dst.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2)
+ {
+ dst_m0_partial = _dst->dimension(1) % dst_m0;
+ }
+
+ // Shift-back for the overlapping-min strategy
+ dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0;
+ }
+
+ const auto &tile_dst = dst->tile();
+
+ /********************************************************************************
+ * 4 - Define the compute block parameters CKW constants
+ ********************************************************************************/
+ // Only now can we declare N0 and M0 as constants
+ auto const_dst_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32));
+ auto const_dst_m0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_m0}}, ckw::DataType::Int32));
+ auto const_dst_shift_back_n0_i32 =
+ writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32));
+
+ /********************************************************************************
+ * 5 - Define the sampler for the input tensor
+ ********************************************************************************/
+ if (!src->has_tile())
+ {
+ // Sampler
+ ckw::TensorSampler sampler_src = dst->tensor_sampler();
+
+ auto tile_gid_0 = writer->declare_tile("gid_0", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_1 = writer->declare_tile("gid_1", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_2 = writer->declare_tile("gid_2", ckw::TileInfo(ckw::DataType::Int32));
+
+ writer->op_get_global_id(tile_gid_0, 0);
+ writer->op_get_global_id(tile_gid_1, 1);
+ writer->op_get_global_id(tile_gid_2, 2);
+
+ auto tile_cout0 = writer->declare_tile("cout0", ckw::TileInfo(ckw::DataType::Int32)); // OFM
+ auto tile_mout0 = writer->declare_tile("mout0", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH or WIDTH x HEIGHT
+ auto tile_mout1 = writer->declare_tile("mout1", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT or 0
+ auto tile_bout0 = writer->declare_tile("bout0", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX
+
+ // Calculate coordinates
+ get_coordinate_from_gws_overlapping_min(writer, tile_cout0, tile_gid_0, const_dst_n0_i32,
+ const_dst_shift_back_n0_i32, const_0_i32);
+ get_coordinate_from_gws(writer, tile_mout0, tile_gid_1, const_dst_m0_i32);
+
+ // Get the boundary aware coordinates at each global dimension index
+ if (sampler_src.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1)
+ {
+ writer->op_assign(tile_mout1, const_0_i32);
+ get_coordinate_from_gws(writer, tile_bout0, tile_gid_2, const_pos_1_i32);
+ }
+ else if (sampler_src.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2)
+ {
+ writer->op_binary(tile_mout1, ckw::BinaryOp::Mod, tile_gid_2, const_dst_h_i32);
+ writer->op_binary(tile_bout0, ckw::BinaryOp::Div, tile_gid_2, const_dst_h_i32);
+ }
+ ckw::DataType src_dt = to_ckw(_src->data_type());
+ auto tile_src = writer->declare_tile("src", ckw::TileInfo(src_dt, dst_m0, dst_n0));
+
+ writer->op_load(tile_src, src->tensor(), sampler_src, tile_cout0, tile_mout0, tile_mout1, tile_bout0);
+
+ // Here, init_virtual_tensor() is used to bring tile_src outside the compound statement
+ src->init_virtual_tensor(tile_src, sampler_src);
+ }
+
+ auto tile_src = src->tile();
+
+ /********************************************************************************
+ * 6 - Extra operations required before writing the main code (optional)
+ ********************************************************************************/
+
+ // Not required
+
+ /********************************************************************************
+ * 7 - Write the rest of the code
+ ********************************************************************************/
+ // Only None ConvertPolicy is supported for floating-point data types
+ ckw::ConvertPolicy convert_policy = ckw::ConvertPolicy::None;
+
+ writer->op_cast(tile_dst, tile_src, convert_policy);
+ ARM_COMPUTE_ERROR_ON_MSG(dst->has_tile() == false, "You must bind a tile before appending another component");
+}
+
+Window GpuCkwCast::get_window() const
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
+
+ TensorShape output_shape = _dst->tensor_shape();
+ // Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) unchanged
+ // This is in line with the collapsing convention used by operators like Conv2d
+ output_shape.collapse(2U, 1U);
+ constexpr uint32_t vector_size_byte_opencl = 16;
+ const uint32_t num_elems_processed_per_iteration =
+ adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0));
+ Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration));
+
+ return win;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.h
new file mode 100644
index 0000000000..2389301196
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWCAST
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWCAST
+
+#include "src/core/common/Macros.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class GpuCkwCast : public IGpuCkwComponentDriver
+{
+public:
+ using Attributes = ClComponentCast::Attributes;
+ /** Constructor
+ *
+ * For supported configurations please refer to @ref ClComponentCast::validate()
+ *
+ * @param[in] id Component id
+ * @param[in] tensors Tensor arguments to the component
+ * @param[in] attributes Component attributes
+ */
+ GpuCkwCast(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes);
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwCast);
+ /** Destructor */
+ ~GpuCkwCast() override = default;
+ // Inherited methods overridden:
+ virtual void write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const override;
+ Window get_window() const override;
+
+private:
+ const ITensorInfo *_src;
+ const ITensorInfo *_dst;
+ Attributes _attributes;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif /* ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWCAST */
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.cpp
new file mode 100644
index 0000000000..cfccab186b
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.cpp
@@ -0,0 +1,361 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+
+#include "compute_kernel_writer/include/ckw/KernelWriter.h"
+#include <cstdint>
+#include <string>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+GpuCkwDepthwiseConv2d::GpuCkwDepthwiseConv2d(ComponentId id,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings)
+ : IGpuCkwComponentDriver{id, tensors}, _src{}, _wei{}, _bia{}, _dst{}, _attributes{attributes}, _settings{settings}
+{
+ _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
+ _wei = this->tensors().get_const_tensor(TensorType::ACL_SRC_1);
+ if (this->tensors().get_const_tensor(TensorType::ACL_SRC_2))
+ {
+ _bia = this->tensors().get_const_tensor(TensorType::ACL_SRC_2);
+ }
+ _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _wei, _bia, _dst);
+}
+
+void GpuCkwDepthwiseConv2d::write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const
+{
+ // Data Layout is NHWC
+ const uint32_t width_idx = get_data_layout_dimension_index(_wei->data_layout(), DataLayoutDimension::WIDTH);
+ const uint32_t height_idx = get_data_layout_dimension_index(_wei->data_layout(), DataLayoutDimension::HEIGHT);
+
+ /********************************************************************************
+ * 1 - Define tensors
+ ********************************************************************************/
+ GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src");
+ GpuCkwComponentArgument *wei = vtable.declare_variable(comp_group, writer, _wei, "wei");
+ GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
+ GpuCkwComponentArgument *bia = nullptr;
+
+ const bool using_bias = _bia != nullptr;
+
+ if (using_bias)
+ {
+ bia = vtable.declare_variable(comp_group, writer, _bia, "bia");
+ }
+
+ /********************************************************************************
+ * 2 - Define CKW constants
+ ********************************************************************************/
+ const auto dst_dt = to_ckw(_dst->data_type());
+ const auto kernel_height = static_cast<int32_t>(_wei->dimension(height_idx));
+ const auto kernel_width = static_cast<int32_t>(_wei->dimension(width_idx));
+ const auto src_w = static_cast<int32_t>(_src->dimension(width_idx));
+ const auto src_h = static_cast<int32_t>(_src->dimension(height_idx));
+ const auto dst_h = static_cast<int32_t>(_dst->dimension(height_idx));
+ const auto stride_x = static_cast<int32_t>(_attributes.stride().x());
+ const auto stride_y = static_cast<int32_t>(_attributes.stride().y());
+ const auto pad_x = static_cast<int32_t>(_attributes.pad().left);
+ const auto pad_y = static_cast<int32_t>(_attributes.pad().top);
+ const auto depth_multiplier = static_cast<int32_t>(_attributes.depth_multiplier());
+ const auto dilation_x = static_cast<int32_t>(_attributes.dilation().x());
+ const auto dilation_y = static_cast<int32_t>(_attributes.dilation().y());
+ const auto kernel_size = kernel_width * kernel_height;
+
+ // CKW constants
+ auto const_kernel_w_i32 = writer->declare_constant_tile(ckw::ConstantData({{kernel_width}}, ckw::DataType::Int32));
+ auto const_kernel_size_i32 =
+ writer->declare_constant_tile(ckw::ConstantData({{kernel_size}}, ckw::DataType::Int32));
+ auto const_dst_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_h}}, ckw::DataType::Int32));
+ auto const_src_w_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_w}}, ckw::DataType::Int32));
+ auto const_src_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_h}}, ckw::DataType::Int32));
+ auto const_stride_x_i32 = writer->declare_constant_tile(ckw::ConstantData({{stride_x}}, ckw::DataType::Int32));
+ auto const_stride_y_i32 = writer->declare_constant_tile(ckw::ConstantData({{stride_y}}, ckw::DataType::Int32));
+ auto const_pad_x_i32 = writer->declare_constant_tile(ckw::ConstantData({{pad_x}}, ckw::DataType::Int32));
+ auto const_pad_y_i32 = writer->declare_constant_tile(ckw::ConstantData({{pad_y}}, ckw::DataType::Int32));
+ auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32));
+ auto const_neg_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{-1}}, ckw::DataType::Int32));
+ auto const_depth_multiplier_i32 =
+ writer->declare_constant_tile(ckw::ConstantData({{depth_multiplier}}, ckw::DataType::Int32));
+ auto const_dilation_x_i32 = writer->declare_constant_tile(ckw::ConstantData({{dilation_x}}, ckw::DataType::Int32));
+ auto const_dilation_y_i32 = writer->declare_constant_tile(ckw::ConstantData({{dilation_y}}, ckw::DataType::Int32));
+ auto const_0_fp = writer->declare_constant_tile(ckw::ConstantData({{0.0f}}, dst_dt));
+
+ /********************************************************************************
+ * 3 - Define the compute block parameters and destination tile (if not root component)
+ * Bind the tile to the tensor to share it among different components and
+ * initialize the compute block parameters
+ ********************************************************************************/
+ // The compute block parameters depend on the employed tensor format
+ const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
+
+ // Destination compute block size
+ const int32_t dst_n0 = root_window.x().step();
+ const int32_t dst_m0 = root_window.y().step();
+
+ // Destination compute block size left-over
+ const int32_t dst_n0_partial = _dst->dimension(0) % dst_n0;
+ const int32_t dst_m0_partial = _dst->dimension(1) % dst_m0;
+
+ // Shift-back for the overlapping-min strategy
+ const int32_t dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0;
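+ // e.g. with dst_n0 = 4 and a destination channel dimension of 10, dst_n0_partial = 2 and
+ // dst_shift_back = (4 - 2) % 4 = 2: the last compute block starts 2 elements earlier and
+ // overlaps the previous one instead of stepping out of bounds.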
+
+ const int32_t src_m0 = kernel_width + (dst_m0 - 1);
+ const int32_t src_n0 = depth_multiplier > 1 ? 1 : dst_n0;
+ const int32_t wei_m0 = kernel_width;
+ const int32_t wei_n0 = dst_n0;
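+ // e.g. kernel_width = 3 and dst_m0 = 2 give src_m0 = 4: the two output positions along the
+ // width read overlapping windows of 3 input values each, i.e. 4 distinct values per row.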
+
+ ckw::TensorSampler sampler_dst;
+ sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+ if (dst_n0_partial == 0)
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin);
+ }
+
+ if (dst_m0_partial == 0)
+ {
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::ClampToBorderMaxOnly);
+ }
+
+ sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+ // Declare destination tile
+ auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, dst_m0, dst_n0));
+
+ // Initialize the destination tile
+ writer->op_assign(tile_dst, const_0_fp);
+
+ // Bind tile to the tensor
+ dst->init_virtual_tensor(tile_dst, sampler_dst);
+
+ /********************************************************************************
+ * 4 - Define the compute block parameters CKW constants
+ ********************************************************************************/
+ // Only now can we declare N0 and M0 as constants
+ auto const_dst_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32));
+ auto const_dst_m0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_m0}}, ckw::DataType::Int32));
+ auto const_shift_back_dst_n0_i32 =
+ writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32));
+
+ /********************************************************************************
+ * 5 - Define the sampler for the input tensors
+ ********************************************************************************/
+ // SOURCE SAMPLER
+ ckw::TensorSampler sampler_src;
+ sampler_src.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+ sampler_src.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ sampler_src.address_mode_y(ckw::TensorSamplerAddressModeY::SkipLessThanZero);
+ sampler_src.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_src.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+ // WEIGHTS SAMPLER
+ // We cannot have out-of-bounds accesses for the weights
+ ckw::TensorSampler sampler_wei;
+ sampler_wei.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+ sampler_wei.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ sampler_wei.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ sampler_wei.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ if (_settings.export_weights_to_cl_image())
+ {
+ sampler_wei.storage(ckw::TensorStorageType::Texture2dReadOnly);
+ }
+ else
+ {
+ sampler_wei.storage(ckw::TensorStorageType::BufferUint8Ptr);
+ }
+
+ // BIAS SAMPLER
+ ckw::TensorSampler sampler_bia;
+ sampler_bia.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+ sampler_bia.address_mode_x(sampler_dst.address_mode_x());
+ sampler_bia.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ sampler_bia.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_bia.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+ /********************************************************************************
+ * 6 - Extra operations required before writing the main code (Optional)
+ ********************************************************************************/
+ // Not required
+
+ /********************************************************************************
+ * 7 - Get the coordinates of the destination tile
+ ********************************************************************************/
+ auto tile_gid_0 = writer->declare_tile("gid_0", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_1 = writer->declare_tile("gid_1", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_2 = writer->declare_tile("gid_2", ckw::TileInfo(ckw::DataType::Int32));
+
+ writer->op_get_global_id(tile_gid_0, 0);
+ writer->op_get_global_id(tile_gid_1, 1);
+ writer->op_get_global_id(tile_gid_2, 2);
+
+ auto tile_cout0 = writer->declare_tile("cout0", ckw::TileInfo(ckw::DataType::Int32)); // OFM
+ auto tile_mout0 = writer->declare_tile("mout0", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH
+ auto tile_mout1 = writer->declare_tile("mout1", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT
+ auto tile_bout0 = writer->declare_tile("bout0", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX
+
+ // Calculate coordinates
+ get_coordinate_from_gws_overlapping_min(writer, tile_cout0, tile_gid_0, const_dst_n0_i32,
+ const_shift_back_dst_n0_i32, const_0_i32);
+ get_coordinate_from_gws(writer, tile_mout0, tile_gid_1, const_dst_m0_i32);
+ writer->op_binary(tile_mout1, ckw::BinaryOp::Mod, tile_gid_2, const_dst_h_i32);
+ writer->op_binary(tile_bout0, ckw::BinaryOp::Div, tile_gid_2, const_dst_h_i32);
+
+ auto tile_src_ci = writer->declare_tile("src_ci", ckw::DataType::Int32);
+ writer->op_binary(tile_src_ci, ckw::BinaryOp::Div, tile_cout0, const_depth_multiplier_i32);
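+ // e.g. with depth_multiplier = 2, output channels 2k and 2k + 1 both read input channel k
+ // (src_ci = cout0 / 2); src_n0 was set to 1 above so each input channel is loaded separately.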
+
+ auto tile_src_xi = writer->declare_tile("src_xi", ckw::DataType::Int32);
+ writer->op_binary(tile_src_xi, ckw::BinaryOp::Mul, tile_mout0, const_stride_x_i32);
+ writer->op_binary(tile_src_xi, ckw::BinaryOp::Sub, tile_src_xi, const_pad_x_i32);
+
+ auto tile_src_yi = writer->declare_tile("src_yi", ckw::DataType::Int32);
+ writer->op_binary(tile_src_yi, ckw::BinaryOp::Mul, tile_mout1, const_stride_y_i32);
+ writer->op_binary(tile_src_yi, ckw::BinaryOp::Sub, tile_src_yi, const_pad_y_i32);
+
+ // Loop variables
+ auto tile_yk = writer->declare_tile("yk", ckw::DataType::Int32);
+
+ writer->op_assign(tile_yk, const_0_i32);
+
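+ // The loop below walks the kernel rows: yk goes from 0 to kernel_size - 1 in steps of
+ // kernel_width, e.g. for a 3x3 kernel (kernel_size = 9) yk takes the values 0, 3 and 6.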
+ // clang-format off
+ writer->op_for_loop(tile_yk, ckw::BinaryOp::Less, const_kernel_size_i32, tile_yk, ckw::AssignmentOp::Increment, const_kernel_w_i32,
+ [&]()
+ {
+ auto tile_src = writer->declare_tile("a", ckw::TileInfo(to_ckw(_src->data_type()), src_m0, src_n0));
+ auto tile_wei = writer->declare_tile("b", ckw::TileInfo(to_ckw(_wei->data_type()), wei_m0, wei_n0));
+
+ writer->op_assign(tile_src, const_0_fp);
+
+ auto tile_x_gte_0 = writer->declare_tile("x_gte_0", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_y_gte_0 = writer->declare_tile("y_gte_0", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_x_lt_w = writer->declare_tile("x_lt_w", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_y_lt_h = writer->declare_tile("y_lt_h", ckw::TileInfo(ckw::DataType::Int32));
+
+ // Check if yi + yk * DILATION_Y is out-of-bounds
+ writer->op_binary(tile_y_gte_0, ckw::BinaryOp::GreaterEqual, tile_src_yi, const_0_i32);
+ writer->op_binary(tile_y_lt_h, ckw::BinaryOp::Less, tile_src_yi, const_src_h_i32);
+
+ auto tile_src_mi = writer->declare_tile("src_mi", ckw::TileInfo(ckw::DataType::Int32));
+
+ // Load src
+ for(int32_t xk = 0; xk < src_m0; ++xk)
+ {
+ auto const_xk_i32 = writer->declare_constant_tile(ckw::ConstantData({{xk}}, ckw::DataType::Int32));
+
+ // xi + xk * DILATION_X
+ writer->op_binary(tile_src_mi, ckw::BinaryOp::Mul, const_xk_i32, const_dilation_x_i32);
+ writer->op_binary(tile_src_mi, ckw::BinaryOp::Add, tile_src_mi, tile_src_xi);
+
+ // Check if xi + xk * DILATION_X is out-of-bounds
+ writer->op_binary(tile_x_gte_0, ckw::BinaryOp::GreaterEqual, tile_src_mi, const_0_i32);
+ writer->op_binary(tile_x_lt_w, ckw::BinaryOp::Less, tile_src_mi, const_src_w_i32);
+
+ // Set mi to -1 if we have out-of-bounds memory accesses
+ writer->op_ternary(tile_src_mi, ckw::TernaryOp::Select, const_neg_1_i32, tile_src_mi, tile_x_gte_0);
+ writer->op_ternary(tile_src_mi, ckw::TernaryOp::Select, const_neg_1_i32, tile_src_mi, tile_x_lt_w);
+ writer->op_ternary(tile_src_mi, ckw::TernaryOp::Select, const_neg_1_i32, tile_src_mi, tile_y_gte_0);
+ writer->op_ternary(tile_src_mi, ckw::TernaryOp::Select, const_neg_1_i32, tile_src_mi, tile_y_lt_h);
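+ // e.g. with src_w = 8: xi + xk * DILATION_X = 3 keeps mi = 3, whereas -2 or 9 collapses mi
+ // to -1, so the row is skipped by the source sampler's SkipLessThanZero address mode.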
+
+ writer->op_load(tile_src.row(xk), src->tensor(), sampler_src, tile_src_ci, tile_src_mi, tile_src_yi, tile_bout0);
+ }
+
+ // Load wei
+ writer->op_load(tile_wei, wei->tensor(), sampler_wei, tile_cout0, tile_yk, const_0_i32, const_0_i32);
+
+ // Attention: the MAC (Multiply-and-Accumulate) ternary operator is currently unsupported in CKW.
+ // Therefore, this part should be replaced with the MAC ternary operator when available.
+ auto tile_tmp = writer->declare_tile("tmp", ckw::TileInfo(to_ckw(_src->data_type()), 1, dst_n0));
+ for(int32_t m0 = 0; m0 < dst_m0; ++m0)
+ {
+ for(int32_t xk = 0; xk < kernel_width; ++xk)
+ {
+ auto tile_a = tile_src.row(m0 + xk);
+ auto tile_b = tile_wei.row(xk);
+ auto tile_c = tile_dst.row(m0);
+
+ writer->op_binary(tile_tmp, ckw::BinaryOp::Mul, tile_a, tile_b);
+ writer->op_binary(tile_c, ckw::BinaryOp::Add, tile_c, tile_tmp);
+ }
+ }
+ writer->op_binary(tile_src_yi, ckw::BinaryOp::Add, tile_src_yi, const_dilation_y_i32);
+ });
+ // clang-format on
+
+ // Bias addition
+ // NOTE: This operation will be removed from this kernel as the interface is standardized. The intended way of
+ // performing bias addition is to fuse this convolution kernel with a following elementwise addition kernel.
+ if (using_bias)
+ {
+ if (!bia->has_tile())
+ {
+ auto tile_bia = writer->declare_tile("bia", ckw::TileInfo(to_ckw(_src->data_type()), 1, dst_n0));
+ writer->op_load(tile_bia, bia->tensor(), sampler_bia, tile_cout0, const_0_i32, const_0_i32, const_0_i32);
+ bia->init_virtual_tensor(tile_bia, sampler_bia);
+ }
+ auto &tile_bia = bia->tile();
+
+ writer->op_binary(tile_dst, ckw::BinaryOp::Add, tile_dst, tile_bia);
+ }
+
+ ARM_COMPUTE_ERROR_ON_MSG(dst->has_tile() == false, "You must bind a tile before appending another component");
+}
+
+Window GpuCkwDepthwiseConv2d::get_window() const
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
+ TensorShape output_shape = _dst->tensor_shape();
+
+ Window win = calculate_max_window(output_shape, Steps(_settings.n0(), _settings.m0()));
+ return win.collapse(win, Window::DimZ);
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.h
new file mode 100644
index 0000000000..a15d3ee710
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWDEPTHWISECONV2D_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWDEPTHWISECONV2D_H
+
+#include "arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h"
+
+#include "src/core/common/Macros.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+
+class GpuCkwDepthwiseConv2d : public IGpuCkwComponentDriver
+{
+public:
+ using Attributes = ClComponentDepthwiseConv2d::Attributes;
+ using Settings = ClComponentDepthwiseConv2d::Settings;
+
+ /** Constructor
+ *
+ * For supported configurations please refer to @ref ClComponentDepthwiseConv2d::validate()
+ *
+ * @param[in] id Component id
+ * @param[in] tensors Tensor arguments to the component
+ * @param[in] attributes Component attributes
+ * @param[in] settings Component settings
+ */
+ GpuCkwDepthwiseConv2d(ComponentId id,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings);
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwDepthwiseConv2d);
+ /** Destructor */
+ ~GpuCkwDepthwiseConv2d() override = default;
+ // Inherited methods overridden:
+ virtual void write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const override;
+ Window get_window() const override;
+
+private:
+ const ITensorInfo *_src;
+ const ITensorInfo *_wei;
+ const ITensorInfo *_bia;
+ const ITensorInfo *_dst;
+ Attributes _attributes;
+ Settings _settings;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWDEPTHWISECONV2D_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp
new file mode 100644
index 0000000000..eb4f644eb6
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp
@@ -0,0 +1,427 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
+
+#include "compute_kernel_writer/include/ckw/KernelWriter.h"
+#include <cstdint>
+#include <string>
+#include <vector>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+
+using TileContainer = std::vector<std::vector<int32_t>>;
+
+GpuCkwDirectConv2d::GpuCkwDirectConv2d(ComponentId id,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings)
+ : IGpuCkwComponentDriver{id, tensors}, _src{}, _wei{}, _bia{}, _dst{}, _attributes{attributes}, _settings{settings}
+{
+ _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
+ _wei = this->tensors().get_const_tensor(TensorType::ACL_SRC_1);
+ _bia = this->tensors().get_const_tensor(TensorType::ACL_SRC_2);
+ _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _wei, _dst); // Bias can be null
+}
+
+void GpuCkwDirectConv2d::write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const
+{
+ const auto desc = _settings.direct_conv_descriptor();
+ ARM_COMPUTE_ERROR_ON_MSG(desc.export_input_to_cl_image || desc.export_output_to_cl_image,
+ "Only the weights tensor can be exported to cl_image");
+
+ const uint32_t channel_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::CHANNEL);
+ const uint32_t width_idx = get_data_layout_dimension_index(_wei->data_layout(), DataLayoutDimension::WIDTH);
+ const uint32_t height_idx = get_data_layout_dimension_index(_wei->data_layout(), DataLayoutDimension::HEIGHT);
+
+ /********************************************************************************
+ * 1 - Define tensors
+ ********************************************************************************/
+ GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src");
+ GpuCkwComponentArgument *wei = vtable.declare_variable(comp_group, writer, _wei, "wei");
+ GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
+ GpuCkwComponentArgument *bia = nullptr;
+
+ const bool using_bias = _bia != nullptr;
+
+ if (using_bias)
+ {
+ bia = vtable.declare_variable(comp_group, writer, _bia, "bia");
+ }
+
+ /********************************************************************************
+ * 2 - Define CKW constants
+ ********************************************************************************/
+ const auto dst_dt = to_ckw(_dst->data_type());
+ const auto kernel_height = static_cast<int32_t>(_wei->dimension(height_idx));
+ const auto kernel_width = static_cast<int32_t>(_wei->dimension(width_idx));
+ const auto src_c = static_cast<int32_t>(_src->dimension(channel_idx));
+ const auto src_w = static_cast<int32_t>(_src->dimension(width_idx));
+ const auto src_h = static_cast<int32_t>(_src->dimension(height_idx));
+ const auto dst_w = static_cast<int32_t>(_dst->dimension(width_idx));
+ const auto stride_x = static_cast<int32_t>(_attributes.stride().x());
+ const auto stride_y = static_cast<int32_t>(_attributes.stride().y());
+ const auto pad_x = static_cast<int32_t>(_attributes.pad().left);
+ const auto pad_y = static_cast<int32_t>(_attributes.pad().top);
+ const auto kernel_size = kernel_width * kernel_height;
+ const auto k0 =
+ static_cast<int32_t>(adjust_vec_size(_settings.direct_conv_descriptor().k0, _src->dimension(channel_idx)));
+
+ // CKW constants
+ auto const_kernel_w_i32 = writer->declare_constant_tile(ckw::ConstantData({{kernel_width}}, ckw::DataType::Int32));
+ auto const_kernel_size_i32 =
+ writer->declare_constant_tile(ckw::ConstantData({{kernel_size}}, ckw::DataType::Int32));
+ auto const_src_c_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_c}}, ckw::DataType::Int32));
+ auto const_src_w_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_w}}, ckw::DataType::Int32));
+ auto const_src_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_h}}, ckw::DataType::Int32));
+ auto const_dst_w_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_w}}, ckw::DataType::Int32));
+ auto const_stride_x_i32 = writer->declare_constant_tile(ckw::ConstantData({{stride_x}}, ckw::DataType::Int32));
+ auto const_stride_y_i32 = writer->declare_constant_tile(ckw::ConstantData({{stride_y}}, ckw::DataType::Int32));
+ auto const_pad_x_i32 = writer->declare_constant_tile(ckw::ConstantData({{pad_x}}, ckw::DataType::Int32));
+ auto const_pad_y_i32 = writer->declare_constant_tile(ckw::ConstantData({{pad_y}}, ckw::DataType::Int32));
+ auto const_k0_i32 = writer->declare_constant_tile(ckw::ConstantData({{k0}}, ckw::DataType::Int32));
+ auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32));
+ auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32));
+ auto const_neg_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{-1}}, ckw::DataType::Int32));
+ auto const_0_fp = writer->declare_constant_tile(ckw::ConstantData({{0.0f}}, dst_dt));
+ auto const_src_c_i32_minus_k0_i32 =
+ writer->declare_constant_tile(ckw::ConstantData({{src_c - k0}}, ckw::DataType::Int32));
+
+ /********************************************************************************
+ * 3 - Define the compute block parameters and destination tile (if not root component)
+ * Bind the tile to the tensor to share it among different components and
+ * initialize the compute block parameters
+ ********************************************************************************/
+ // The compute block parameters depend on the employed tensor format
+ const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
+
+ // Destination compute block size
+ const int32_t dst_n0 = root_window.x().step();
+ const int32_t dst_m0 = root_window.y().step();
+
+ // Destination compute block size left-over
+ const int32_t dst_n0_partial = _dst->dimension(0) % dst_n0;
+ const int32_t dst_m0_partial = (_dst->dimension(1) * _dst->dimension(2)) % dst_m0;
+
+ // Shift-back for the overlapping-min strategy
+ const int32_t dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0;
+
+ ckw::TensorSampler sampler_dst;
+ sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1);
+ if (dst_n0_partial == 0)
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin);
+ }
+
+ if (dst_m0_partial == 0)
+ {
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::ClampToBorderMaxOnly);
+ }
+
+ sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+ // Declare destination tile
+ auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, dst_m0, dst_n0));
+
+ // Initialize destination tile
+ writer->op_assign(tile_dst, const_0_fp);
+
+ // Bind tile to the tensor
+ dst->init_virtual_tensor(tile_dst, sampler_dst);
+
+ /********************************************************************************
+ * 4 - Define the compute block parameters CKW constants
+ ********************************************************************************/
+ // Only now can we declare N0 and M0 as constants
+ auto const_dst_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32));
+ auto const_dst_m0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_m0}}, ckw::DataType::Int32));
+ auto const_shift_back_dst_n0_i32 =
+ writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32));
+
+ /********************************************************************************
+ * 5 - Define the samplers for the input tensors
+ ********************************************************************************/
+ // Exporting the weights tensor to an OpenCL image object is currently only supported when:
+ // a) k0 is equal to 4
+ // The current implementation expects to read a vector of 4 float values into the OpenCL image object.
+ // b) K is a multiple of 4
+ // This is a limitation in the current interface due to the variable table being responsible for maintaining
+ // information about the TensorStorageType rather than the TensorTileSampler. As a result, TensorStorageType cannot
+ // be reassigned, and we cannot use a texture object for the weights tensor in cases where we expect to have an
+ // extra loop to compute the left-over elements.
+ const bool use_cl_image_for_weights = desc.export_weights_to_cl_image && (k0 == 4) && (src_c % 4 == 0);
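+ // e.g. with the export requested, k0 = 4 and src_c = 24 the weights are read through a cl_image;
+ // with src_c = 10 the left-over loop is required, so a plain buffer is used instead.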
+
+ // SOURCE SAMPLER
+ // - We cannot have out-of-bounds reads in the X dimension (mapped to the IFMs) as we have an extra loop to
+ // compute left-over elements
+ // - We cannot have out-of-bounds reads in the Y dimension when the kernel height is equal to 1. In all other
+ // cases, out-of-bounds rows are marked with -1 in the indirection buffer mi and skipped by the
+ // SkipLessThanZero address mode.
+ auto address_mode_y_src =
+ kernel_height == 1 ? ckw::TensorSamplerAddressModeY::None : ckw::TensorSamplerAddressModeY::SkipLessThanZero;
+ ckw::TensorSampler sampler_src;
+ sampler_src.format(ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1); // 3rd dimension collapsed with 2nd dimension
+ sampler_src.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ sampler_src.address_mode_y(address_mode_y_src);
+ sampler_src.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_src.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+ // WEIGHTS SAMPLER
+ // We cannot have out-of-bounds accesses for the weights
+ ckw::TensorSampler sampler_wei;
+ sampler_wei.format(ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1); // 3rd dimension collapsed with 2nd dimension
+ sampler_wei.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ sampler_wei.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ sampler_wei.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ if (use_cl_image_for_weights)
+ {
+ sampler_wei.storage(ckw::TensorStorageType::Texture2dReadOnly);
+ }
+ else
+ {
+ sampler_wei.storage(ckw::TensorStorageType::BufferUint8Ptr);
+ }
+
+ // BIAS SAMPLER
+ ckw::TensorSampler sampler_bia;
+
+ if (using_bias)
+ {
+ sampler_bia.format(ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1);
+ sampler_bia.address_mode_x(sampler_dst.address_mode_x());
+ sampler_bia.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ sampler_bia.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_bia.storage(ckw::TensorStorageType::BufferUint8Ptr);
+ }
+
+ /********************************************************************************
+ * 6 - Extra operations required before writing the main code (optional)
+ ********************************************************************************/
+
+ // Not required
+
+ /********************************************************************************
+ * 7 - Get the coordinates of the destination tile
+ ********************************************************************************/
+ auto tile_gid_0 = writer->declare_tile("gid_0", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_1 = writer->declare_tile("gid_1", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_2 = writer->declare_tile("gid_2", ckw::TileInfo(ckw::DataType::Int32));
+
+ writer->op_get_global_id(tile_gid_0, 0);
+ writer->op_get_global_id(tile_gid_1, 1);
+ writer->op_get_global_id(tile_gid_2, 2);
+
+ auto tile_cout = writer->declare_tile("cout", ckw::TileInfo(ckw::DataType::Int32)); // OFM
+ auto tile_mout = writer->declare_tile("mout", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH x HEIGHT
+ auto tile_bout = writer->declare_tile("bout", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX
+
+ // Calculate coordinates
+ get_coordinate_from_gws_overlapping_min(writer, tile_cout, tile_gid_0, const_dst_n0_i32,
+ const_shift_back_dst_n0_i32, const_0_i32);
+ get_coordinate_from_gws(writer, tile_mout, tile_gid_1, const_dst_m0_i32);
+ get_coordinate_from_gws(writer, tile_bout, tile_gid_2, const_pos_1_i32);
+
+ /********************************************************************************
+ * 8 - Write the rest of the code
+ ********************************************************************************/
+ // We create a 2d container of size (dst_m0, 1) to store the indices for iteration
+ TileContainer it;
+ for (int32_t m = 0; m < dst_m0; ++m)
+ {
+ std::vector<int32_t> idx{m};
+ it.push_back({idx});
+ }
+
+ const auto &const_idxs = writer->declare_constant_tile(ckw::ConstantData(it, ckw::DataType::Int32));
+
+ auto tile_xi = writer->declare_tile("xi", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1));
+ auto tile_yi = writer->declare_tile("yi", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1));
+
+ // Convert the linear index to coordinate
+ // xi = ((mout + i) % dst_w) * stride_x - pad_x
+ // yi = ((mout + i) / dst_w) * stride_y - pad_y
+ writer->op_binary(tile_xi, ckw::BinaryOp::Add, tile_mout, const_idxs);
+ writer->op_binary(tile_yi, ckw::BinaryOp::Add, tile_mout, const_idxs);
+ writer->op_binary(tile_xi, ckw::BinaryOp::Mod, tile_xi, const_dst_w_i32);
+ writer->op_binary(tile_yi, ckw::BinaryOp::Div, tile_yi, const_dst_w_i32);
+ writer->op_binary(tile_xi, ckw::BinaryOp::Mul, tile_xi, const_stride_x_i32);
+ writer->op_binary(tile_yi, ckw::BinaryOp::Mul, tile_yi, const_stride_y_i32);
+ writer->op_binary(tile_xi, ckw::BinaryOp::Sub, tile_xi, const_pad_x_i32);
+ writer->op_binary(tile_yi, ckw::BinaryOp::Sub, tile_yi, const_pad_y_i32);
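+ // e.g. with dst_w = 8, stride = (2, 2), pad = (1, 1) and mout + i = 11:
+ // xi = (11 % 8) * 2 - 1 = 5 and yi = (11 / 8) * 2 - 1 = 1.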
+
+ auto tile_y_b = writer->declare_tile("y_b", ckw::TileInfo(ckw::DataType::Int32));
+ writer->op_binary(tile_y_b, ckw::BinaryOp::Mul, tile_cout, const_kernel_size_i32);
+
+ auto tile_i = writer->declare_tile("i", ckw::TileInfo(ckw::DataType::Int32));
+ writer->op_assign(tile_i, const_0_i32);
+
+ // clang-format off
+ writer->op_for_loop(tile_i, ckw::BinaryOp::Less, const_kernel_size_i32, tile_i, ckw::AssignmentOp::Increment, const_pos_1_i32, [&]()
+ {
+ auto tile_x_k = writer->declare_tile("x_k", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_y_k = writer->declare_tile("y_k", ckw::TileInfo(ckw::DataType::Int32));
+
+ writer->op_binary(tile_x_k, ckw::BinaryOp::Mod, tile_i, const_kernel_w_i32);
+ writer->op_binary(tile_y_k, ckw::BinaryOp::Div, tile_i, const_kernel_w_i32);
+
+ auto tile_ck = writer->declare_tile("ck", ckw::TileInfo(ckw::DataType::Int32));
+ writer->op_assign(tile_ck, const_0_i32);
+
+ // Construct an indirection buffer containing the precalculated addresses of elements in the source tensor
+ // x_s = xi + x_k
+ // y_s = yi + y_k
+ // mi = x_s + y_s * width;
+ // mi = select(-1, mi, x_s >= 0);
+ // mi = select(-1, mi, x_s < width);
+ // mi = select(-1, mi, y_s >= 0);
+ // mi = select(-1, mi, y_s < height);
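+ // e.g. with src_w = 8 and src_h = 6: (x_s, y_s) = (3, 2) gives mi = 3 + 2 * 8 = 19, while
+ // (x_s, y_s) = (-1, 2) or (3, 6) collapses mi to -1 to mark the row as out-of-bounds.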
+ auto tile_xs = writer->declare_tile("xs", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1));
+ auto tile_ys = writer->declare_tile("ys", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1));
+ auto tile_mi = writer->declare_tile("mi", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1));
+
+ auto tile_xs_gte_0 = writer->declare_tile("xs_gte_0", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1));
+ auto tile_ys_gte_0 = writer->declare_tile("ys_gte_0", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1));
+ auto tile_xs_lt_w = writer->declare_tile("xs_lt_w", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1));
+ auto tile_ys_lt_h = writer->declare_tile("ys_lt_h", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1));
+
+ writer->op_binary(tile_xs, ckw::BinaryOp::Add, tile_xi, tile_x_k);
+ writer->op_binary(tile_ys, ckw::BinaryOp::Add, tile_yi, tile_y_k);
+ writer->op_binary(tile_mi, ckw::BinaryOp::Mul, tile_ys, const_src_w_i32);
+ writer->op_binary(tile_mi, ckw::BinaryOp::Add, tile_mi, tile_xs);
+ writer->op_binary(tile_xs_gte_0, ckw::BinaryOp::GreaterEqual, tile_xs, const_0_i32);
+ writer->op_binary(tile_ys_gte_0, ckw::BinaryOp::GreaterEqual, tile_ys, const_0_i32);
+ writer->op_binary(tile_xs_lt_w, ckw::BinaryOp::Less, tile_xs, const_src_w_i32);
+ writer->op_binary(tile_ys_lt_h, ckw::BinaryOp::Less, tile_ys, const_src_h_i32);
+ writer->op_ternary(tile_mi, ckw::TernaryOp::Select, const_neg_1_i32, tile_mi, tile_xs_gte_0);
+ writer->op_ternary(tile_mi, ckw::TernaryOp::Select, const_neg_1_i32, tile_mi, tile_ys_gte_0);
+ writer->op_ternary(tile_mi, ckw::TernaryOp::Select, const_neg_1_i32, tile_mi, tile_xs_lt_w);
+ writer->op_ternary(tile_mi, ckw::TernaryOp::Select, const_neg_1_i32, tile_mi, tile_ys_lt_h);
+
+ writer->op_for_loop(tile_ck, ckw::BinaryOp::LessEqual, const_src_c_i32_minus_k0_i32, tile_ck, ckw::AssignmentOp::Increment, const_k0_i32, [&]()
+ {
+ auto tile_lhs = writer->declare_tile("lhs", ckw::TileInfo(to_ckw(_src->data_type()), dst_m0, k0));
+ auto tile_rhs = writer->declare_tile("rhs", ckw::TileInfo(to_ckw(_wei->data_type()), dst_n0, k0));
+ writer->op_assign(tile_lhs, const_0_fp);
+ writer->op_assign(tile_rhs, const_0_fp);
+
+ writer->op_load_indirect(tile_lhs, src->tensor(), sampler_src, tile_ck, tile_mi, const_0_i32, tile_bout);
+ writer->op_load_dilated(tile_rhs, wei->tensor(), sampler_wei, tile_ck, tile_y_b, const_0_i32, const_0_i32, const_pos_1_i32, const_kernel_size_i32);
+
+ writer->op_binary(tile_dst, ckw::BinaryOp::MatMul_Nt_T, tile_lhs, tile_rhs);
+ });
+
+ // Left-over accumulations for when K is not a multiple of k0
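+ // e.g. with src_c = 10 and k0 = 4, the main loop above covers channels 0-7 in two steps of 4
+ // and this loop then accumulates channels 8 and 9 one at a time.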
+ if((src_c % k0) != 0)
+ {
+ writer->op_for_loop(tile_ck, ckw::BinaryOp::Less, const_src_c_i32, tile_ck, ckw::AssignmentOp::Increment, const_pos_1_i32, [&]()
+ {
+ auto tile_lhs = writer->declare_tile("lhs_leftover", ckw::TileInfo(to_ckw(_src->data_type()), dst_m0, 1));
+ auto tile_rhs = writer->declare_tile("rhs_leftover", ckw::TileInfo(to_ckw(_wei->data_type()), dst_n0, 1));
+ writer->op_assign(tile_lhs, const_0_fp);
+ writer->op_assign(tile_rhs, const_0_fp);
+
+ writer->op_load_indirect(tile_lhs, src->tensor(), sampler_src, tile_ck, tile_mi, const_0_i32, tile_bout);
+ writer->op_load_dilated(tile_rhs, wei->tensor(), sampler_wei, tile_ck, tile_y_b, const_0_i32, const_0_i32, const_pos_1_i32, const_kernel_size_i32);
+
+ writer->op_binary(tile_dst, ckw::BinaryOp::MatMul_Nt_T, tile_lhs, tile_rhs);
+ });
+ }
+
+ writer->op_binary(tile_y_b, ckw::BinaryOp::Add, tile_y_b, const_pos_1_i32);
+ });
+ // clang-format on
+
+ // NOTE: The bias addition will be removed from this kernel as the interface is standardized. The intended way of
+ // performing bias addition is to fuse this convolution kernel with a following elementwise addition kernel.
+ if (using_bias)
+ {
+ if (!bia->has_tile())
+ {
+ auto tile_bia = writer->declare_tile("bia", ckw::TileInfo(to_ckw(_src->data_type()), 1, dst_n0));
+ writer->op_load(tile_bia, bia->tensor(), sampler_bia, tile_cout, const_0_i32, const_0_i32, const_0_i32);
+ bia->init_virtual_tensor(tile_bia, sampler_bia);
+ }
+ auto &tile_bia = bia->tile();
+
+ writer->op_binary(tile_dst, ckw::BinaryOp::Add, tile_dst, tile_bia);
+ }
+
+ ARM_COMPUTE_ERROR_ON_MSG(dst->has_tile() == false, "You must bind a tile before appending another component");
+}
+
+Window GpuCkwDirectConv2d::get_window() const
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
+
+ const auto dst_shape = _dst->tensor_shape();
+ const auto desc = _settings.direct_conv_descriptor();
+
+ const uint32_t dst_n0 = adjust_vec_size(desc.n0, dst_shape[0]);
+ const uint32_t dst_m0 = adjust_vec_size(desc.m0, dst_shape[1] * dst_shape[2]);
+
+ Window win = calculate_max_window(dst_shape, Steps(dst_n0, dst_m0));
+
+ const size_t dim_y_collapsed = ceil_to_multiple(dst_shape[1] * dst_shape[2], dst_m0);
+ win.set(Window::DimY, Window::Dimension(0, dim_y_collapsed, dst_m0));
+ win.set(Window::DimZ, Window::Dimension(0, dst_shape.total_size_upper(3), 1));
+
+ return win;
+}
+
+std::string GpuCkwDirectConv2d::get_name(const ComponentGroup &comp_group) const
+{
+ ARM_COMPUTE_UNUSED(comp_group);
+
+ return "direct_conv2d";
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h
new file mode 100644
index 0000000000..139cf620e2
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWDIRECTCONV2D_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWDIRECTCONV2D_H
+
+#include "arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h"
+
+#include "src/core/common/Macros.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class GpuCkwDirectConv2d : public IGpuCkwComponentDriver
+{
+public:
+ using Attributes = ClComponentDirectConv2d::Attributes;
+ using Settings = ClComponentDirectConv2d::Settings;
+
+public:
+ /** Constructor
+ *
+ * For supported configurations please refer to @ref ClComponentDirectConv2d::validate()
+ *
+ * @param[in] id Component id
+ * @param[in] tensors Tensor arguments to the component
+ * @param[in] attributes Component attributes. Attributes are a set of parameters that define what a component does
+ * @param[in] settings Component settings. Settings are a set of parameters that influence the implementation of a component
+ */
+ GpuCkwDirectConv2d(ComponentId id,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings);
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwDirectConv2d);
+ /** Destructor */
+ ~GpuCkwDirectConv2d() override = default;
+
+ // Inherited methods overridden
+ virtual void write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const override;
+ Window get_window() const override;
+ std::string get_name(const ComponentGroup &comp_group) const override;
+
+private:
+ const ITensorInfo *_src;
+ const ITensorInfo *_wei;
+ const ITensorInfo *_bia;
+ const ITensorInfo *_dst;
+
+ Attributes _attributes;
+ Settings _settings;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWDIRECTCONV2D_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp
new file mode 100644
index 0000000000..fb55acad53
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp
@@ -0,0 +1,434 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "GpuCkwElementwiseBinary.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
+#include "src/dynamic_fusion/sketch/gpu/components/utils/type_printer/ElementwiseBinary.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "support/StringSupport.h"
+
+#include "compute_kernel_writer/include/ckw/KernelWriter.h"
+#include "compute_kernel_writer/include/ckw/types/ConstantData.h"
+#include "compute_kernel_writer/include/ckw/types/TensorSamplerTypes.h"
+#include <cstdint>
+#include <string>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+GpuCkwElementwiseBinary::GpuCkwElementwiseBinary(ComponentId id,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes)
+ : IGpuCkwComponentDriver{id, tensors}, _lhs{}, _rhs{}, _dst{}, _attributes{attributes}
+{
+ _lhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
+ _rhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_1);
+ _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_lhs, _rhs, _dst);
+}
+
+void GpuCkwElementwiseBinary::write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const
+{
+ /********************************************************************************
+ * 1 - Define tensors
+ ********************************************************************************/
+ GpuCkwComponentArgument *lhs = vtable.declare_variable(comp_group, writer, _lhs, "lhs");
+ GpuCkwComponentArgument *rhs = vtable.declare_variable(comp_group, writer, _rhs, "rhs");
+ GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
+
+ /********************************************************************************
+ * 2 - Define CKW constants
+ ********************************************************************************/
+ const auto dst_h = static_cast<int32_t>(_dst->dimension(1));
+
+ // CKW constants
+ auto const_dst_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_h}}, ckw::DataType::Int32));
+ auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32));
+ auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32));
+
+ /********************************************************************************
+ * 3 - Define the compute block parameters and destination tile (if not root component)
+ * Bind the tile to the tensor to share it among different components and
+ * initialize the compute block parameters
+ ********************************************************************************/
+ // The compute block parameters depend on the employed tensor format
+
+ // Destination compute block size
+ int32_t dst_n0 = -1;
+ int32_t dst_m0 = -1;
+
+ // Destination compute block size left-over
+ int32_t dst_n0_partial = -1;
+ int32_t dst_m0_partial = -1;
+
+ if (!dst->has_tile())
+ {
+ // If ROOT component, we use ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1
+ // as tensor format
+ const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
+
+ dst_n0 = root_window.x().step();
+ dst_m0 = root_window.y().step();
+ dst_n0_partial = _dst->dimension(0) % dst_n0;
+ dst_m0_partial = (_dst->dimension(1) * _dst->dimension(2)) % dst_m0;
+
+ ckw::TensorSampler sampler_dst;
+ sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1);
+ if (dst_n0_partial == 0)
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin);
+ }
+
+ if (dst_m0_partial == 0)
+ {
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::ClampToBorderMaxOnly);
+ }
+ sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+ // Declare destination tile
+ ckw::DataType dst_dt = to_ckw(_dst->data_type());
+ auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, dst_m0, dst_n0));
+
+ // Bind tile to the tensor
+ dst->init_virtual_tensor(tile_dst, sampler_dst);
+ }
+ else
+ {
+ // Change dst_n0 and dst_m0 if NOT root component!
+ dst_n0 = dst->tile().tile_info().width();
+ dst_m0 = dst->tile().tile_info().height();
+
+ // Here, dst_n0_partial and dst_m0_partial do not need to be calculated because,
+ // if we reach this branch, the element-wise op is not the root component and the
+ // address modes have already been set.
+
+ const auto &tile_dst = dst->tile();
+
+ /********************************************************************************
+ * 4 - Define the compute block parameters CKW constants
+ ********************************************************************************/
+ // Not required
+
+ /********************************************************************************
+ * 5 - Define the samplers for the input tensors
+ ********************************************************************************/
+ // Check whether the lhs operand already has a tile or is still a tensor
+ // If it is still a tensor, create a sampler and load its content into a tile
+ if (!lhs->has_tile())
+ {
+ // Sampler
+ ckw::TensorSampler sampler_lhs = dst->tensor_sampler();
+
+ bool broadcast_x = false;
+ bool broadcast_y = false;
+
+ int32_t lhs_n0 = dst_n0;
+ int32_t lhs_m0 = dst_m0;
+
+ // Check whether we have broadcasting
+ // In case of broadcast, lhs can only be a vector or scalar.
+ // Broadcasting in other dimensions is not supported
+ if (_dst->dimension(0) != _lhs->dimension(0))
+ {
+ broadcast_x = true;
+ lhs_n0 = 1;
+ }
+
+ if (sampler_lhs.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1)
+ {
+ if (_dst->dimension(1) * _dst->dimension(2) != _lhs->dimension(1) * _lhs->dimension(2))
+ {
+ broadcast_y = true;
+ lhs_m0 = 1;
+ }
+ }
+ else if (sampler_lhs.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2)
+ {
+ if (_dst->dimension(1) != _lhs->dimension(1))
+ {
+ broadcast_y = true;
+ lhs_m0 = 1;
+ }
+ }
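+ // e.g. an lhs of shape (32, 1, 1) combined with a dst of shape (32, 16, 4) sets broadcast_y
+ // to true and lhs_m0 to 1, so a single row is loaded and reused for the whole destination tile.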
+
+ const int32_t lhs_partial_n0 = _lhs->dimension(0) % lhs_n0;
+ const int32_t lhs_shift_back = (lhs_n0 - lhs_partial_n0) % lhs_n0;
+
+ // Constants
+ auto const_lhs_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{lhs_n0}}, ckw::DataType::Int32));
+ auto const_lhs_m0_i32 = writer->declare_constant_tile(ckw::ConstantData({{lhs_m0}}, ckw::DataType::Int32));
+ auto const_lhs_shift_back_n0_i32 =
+ writer->declare_constant_tile(ckw::ConstantData({{lhs_shift_back}}, ckw::DataType::Int32));
+
+ auto tile_gid_0 = writer->declare_tile("gid_0_lhs", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_1 = writer->declare_tile("gid_1_lhs", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_2 = writer->declare_tile("gid_2_lhs", ckw::TileInfo(ckw::DataType::Int32));
+
+ writer->op_get_global_id(tile_gid_0, 0);
+ writer->op_get_global_id(tile_gid_1, 1);
+ writer->op_get_global_id(tile_gid_2, 2);
+
+ auto tile_cout0 = writer->declare_tile("cout0_lhs", ckw::TileInfo(ckw::DataType::Int32)); // OFM
+ auto tile_mout0 =
+ writer->declare_tile("mout0_lhs", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH or WIDTH x HEIGHT
+ auto tile_mout1 = writer->declare_tile("mout1_lhs", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT or 0
+ auto tile_bout0 = writer->declare_tile("bout0_lhs", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX
+
+ // Calculate coordinates
+ if (!broadcast_x)
+ {
+ get_coordinate_from_gws_overlapping_min(writer, tile_cout0, tile_gid_0, const_lhs_n0_i32,
+ const_lhs_shift_back_n0_i32, const_0_i32);
+ }
+ else
+ {
+ writer->op_assign(tile_cout0, const_0_i32);
+ }
+
+ if (!broadcast_y)
+ {
+ get_coordinate_from_gws(writer, tile_mout0, tile_gid_1, const_lhs_m0_i32);
+ }
+ else
+ {
+ writer->op_assign(tile_mout0, const_0_i32);
+ }
+
+ // Get the boundary aware coordinates at each global dimension index
+ if (sampler_lhs.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1)
+ {
+ writer->op_assign(tile_mout1, const_0_i32);
+ get_coordinate_from_gws(writer, tile_bout0, tile_gid_2, const_pos_1_i32);
+ }
+ else if (sampler_lhs.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2)
+ {
+ // For tile_mout1 and tile_bout0 the step can only be 1
+ if (!broadcast_y)
+ {
+ writer->op_binary(tile_mout1, ckw::BinaryOp::Mod, tile_gid_2, const_dst_h_i32);
+ }
+ else
+ {
+ // If broadcast_y == true, it means that we have either a scalar or vector
+ // because broadcasting in other dimensions is not supported
+ writer->op_assign(tile_mout1, const_0_i32);
+ }
+
+ writer->op_binary(tile_bout0, ckw::BinaryOp::Div, tile_gid_2, const_dst_h_i32);
+ }
+
+ ckw::DataType lhs_dt = to_ckw(_lhs->data_type());
+ auto tile_lhs = writer->declare_tile("lhs", ckw::TileInfo(lhs_dt, lhs_m0, lhs_n0));
+
+ writer->op_load(tile_lhs, lhs->tensor(), sampler_lhs, tile_cout0, tile_mout0, tile_mout1, tile_bout0);
+
+ // Here, init_virtual_tensor() is used to bring the tile_lhs outside the compound statement
+ lhs->init_virtual_tensor(tile_lhs, sampler_lhs);
+ }
+
+ // Check whether the rhs operand already has a tile or is still a tensor
+ // If it is still a tensor, create a sampler and load its content into a tile
+ if (!rhs->has_tile())
+ {
+ // Sampler
+ ckw::TensorSampler sampler_rhs = dst->tensor_sampler();
+
+ bool broadcast_x = false;
+ bool broadcast_y = false;
+
+ int32_t rhs_n0 = dst_n0;
+ int32_t rhs_m0 = dst_m0;
+
+ // Check whether we have broadcasting
+ // In case of broadcast, rhs can only be a vector or scalar.
+ // Broadcasting in other dimensions is not supported
+ if (_dst->dimension(0) != _rhs->dimension(0))
+ {
+ broadcast_x = true;
+ rhs_n0 = 1;
+ }
+
+ if (sampler_rhs.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1)
+ {
+ if (_dst->dimension(1) * _dst->dimension(2) != _rhs->dimension(1) * _rhs->dimension(2))
+ {
+ broadcast_y = true;
+ rhs_m0 = 1;
+ }
+ }
+ else if (sampler_rhs.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2)
+ {
+ if (_dst->dimension(1) != _rhs->dimension(1))
+ {
+ broadcast_y = true;
+ rhs_m0 = 1;
+ }
+ }
+
+ const int32_t rhs_partial_n0 = _rhs->dimension(0) % rhs_n0;
+ const int32_t rhs_shift_back = (rhs_n0 - rhs_partial_n0) % rhs_n0;
+
+ // Constants
+ auto const_rhs_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{rhs_n0}}, ckw::DataType::Int32));
+ auto const_rhs_m0_i32 = writer->declare_constant_tile(ckw::ConstantData({{rhs_m0}}, ckw::DataType::Int32));
+ auto const_rhs_shift_back_n0_i32 =
+ writer->declare_constant_tile(ckw::ConstantData({{rhs_shift_back}}, ckw::DataType::Int32));
+
+ auto tile_gid_0 = writer->declare_tile("gid_0_rhs", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_1 = writer->declare_tile("gid_1_rhs", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_2 = writer->declare_tile("gid_2_rhs", ckw::TileInfo(ckw::DataType::Int32));
+
+ writer->op_get_global_id(tile_gid_0, 0);
+ writer->op_get_global_id(tile_gid_1, 1);
+ writer->op_get_global_id(tile_gid_2, 2);
+
+ auto tile_cout0 = writer->declare_tile("cout0_rhs", ckw::TileInfo(ckw::DataType::Int32)); // OFM
+ auto tile_mout0 =
+ writer->declare_tile("mout0_rhs", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH or WIDTH x HEIGHT
+ auto tile_mout1 = writer->declare_tile("mout1_rhs", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT or 0
+ auto tile_bout0 = writer->declare_tile("bout0_rhs", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX
+
+ // Calculate coordinates
+ if (!broadcast_x)
+ {
+ get_coordinate_from_gws_overlapping_min(writer, tile_cout0, tile_gid_0, const_rhs_n0_i32,
+ const_rhs_shift_back_n0_i32, const_0_i32);
+ }
+ else
+ {
+ writer->op_assign(tile_cout0, const_0_i32);
+ }
+
+ if (!broadcast_y)
+ {
+ get_coordinate_from_gws(writer, tile_mout0, tile_gid_1, const_rhs_m0_i32);
+ }
+ else
+ {
+ writer->op_assign(tile_mout0, const_0_i32);
+ }
+
+ // Get the boundary aware coordinates at each global dimension index
+ if (sampler_rhs.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1)
+ {
+ writer->op_assign(tile_mout1, const_0_i32);
+ get_coordinate_from_gws(writer, tile_bout0, tile_gid_2, const_pos_1_i32);
+ }
+ else if (sampler_rhs.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2)
+ {
+ // For tile_mout1 and tile_bout0 the step can only be 1
+ const auto src_w = static_cast<int32_t>(_rhs->dimension(1));
+ auto const_src_w = writer->declare_constant_tile(ckw::ConstantData({{src_w}}, ckw::DataType::Int32));
+ if (!broadcast_y)
+ {
+ writer->op_binary(tile_mout1, ckw::BinaryOp::Mod, tile_gid_2, const_src_w);
+ }
+ else
+ {
+ // If broadcast_y == true, it means that we have either a scalar or vector
+ // because broadcasting in other dimensions is not supported
+ writer->op_assign(tile_mout1, const_0_i32);
+ }
+
+ writer->op_binary(tile_bout0, ckw::BinaryOp::Div, tile_gid_2, const_src_w);
+ }
+
+ ckw::DataType rhs_dt = to_ckw(_rhs->data_type());
+ auto tile_rhs = writer->declare_tile("rhs", ckw::TileInfo(rhs_dt, rhs_m0, rhs_n0));
+
+ writer->op_load(tile_rhs, rhs->tensor(), sampler_rhs, tile_cout0, tile_mout0, tile_mout1, tile_bout0);
+
+ // Here, init_virtual_tensor() is used to bring the tile_rhs outside the compound statement
+ rhs->init_virtual_tensor(tile_rhs, sampler_rhs);
+ }
+
+ const auto &tile_lhs = lhs->tile();
+ const auto &tile_rhs = rhs->tile();
+
+ /********************************************************************************
+ * 7 - Write the rest of the code
+ ********************************************************************************/
+ // Perform the element-wise operation
+ writer->op_binary(tile_dst, to_ckw(_attributes), tile_lhs, tile_rhs);
+
+ ARM_COMPUTE_ERROR_ON_MSG(dst->has_tile() == false, "You must bind a tile before appending another component");
+}
+
+Window GpuCkwElementwiseBinary::get_window() const
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
+
+ TensorShape output_shape = _dst->tensor_shape();
+ // Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) unchanged
+ // This is in line with the collapsing convention used by operators like Conv2d
+ output_shape.collapse(2U, 1U);
+ constexpr uint32_t vector_size_byte_opencl = 16;
+ const uint32_t num_elems_processed_per_iteration =
+ adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0));
+ Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration));
+
+ return win;
+}
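+// Sizing example (illustrative only): for an F32 destination, element_size() is 4 bytes, so up to
+// 16 / 4 = 4 elements are processed per iteration; adjust_vec_size() lowers this further when
+// dimension 0 of the destination is smaller than 4.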
+
+std::string GpuCkwElementwiseBinary::get_name(const ComponentGroup &comp_group) const
+{
+ ARM_COMPUTE_UNUSED(comp_group);
+ const std::vector<std::string> build_params = {
+ "elementwise_binary",
+ "op",
+ to_string(_attributes.operation()),
+ "dt",
+ lower_string(string_from_data_type(_dst->data_type())),
+ };
+ return join(build_params, "_");
+}
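+// Naming example (illustrative only): for an addition on an F32 destination the generated name has
+// the form "elementwise_binary_op_<Op>_dt_f32", where the <Op> token is whatever
+// to_string(_attributes.operation()) returns for the configured operation.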
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h
new file mode 100644
index 0000000000..c6cbba28d3
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWELEMENTWISEBINARY_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWELEMENTWISEBINARY_H
+
+#include "src/core/common/Macros.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class GpuCkwElementwiseBinary : public IGpuCkwComponentDriver
+{
+public:
+ using Attributes = ClComponentElementwiseBinary::Attributes;
+ /** Constructor
+ *
+ * For supported configurations please refer to @ref ClComponentElementwiseBinary::validate()
+ *
+ * @param[in] id Component id
+ * @param[in] tensors Tensor arguments to the component
+ * @param[in] attributes Component attributes
+ */
+ GpuCkwElementwiseBinary(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes);
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwElementwiseBinary);
+ /** Destructor */
+ ~GpuCkwElementwiseBinary() override = default;
+    // Inherited methods overridden:
+ virtual void write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const override;
+ Window get_window() const override;
+ std::string get_name(const ComponentGroup &comp_group) const override;
+
+private:
+ const ITensorInfo *_lhs;
+ const ITensorInfo *_rhs;
+ const ITensorInfo *_dst;
+ Attributes _attributes;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWELEMENTWISEBINARY_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.cpp
new file mode 100644
index 0000000000..14ad3847fc
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.cpp
@@ -0,0 +1,287 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
+#include "support/StringSupport.h"
+
+#include "compute_kernel_writer/include/ckw/KernelWriter.h"
+#include <cstdint>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+
+GpuCkwMatMul::GpuCkwMatMul(ComponentId id,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings)
+ : IGpuCkwComponentDriver{id, tensors}, _lhs{}, _rhs{}, _dst{}, _attributes{attributes}, _settings{settings}
+{
+ _lhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
+ _rhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_1);
+ _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_lhs, _rhs, _dst);
+}
+
+void GpuCkwMatMul::write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const
+{
+ /********************************************************************************
+ * 1 - Define tensors
+ ********************************************************************************/
+ GpuCkwComponentArgument *lhs = vtable.declare_variable(comp_group, writer, _lhs, "lhs");
+ GpuCkwComponentArgument *rhs = vtable.declare_variable(comp_group, writer, _rhs, "rhs");
+ GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
+
+ /********************************************************************************
+ * 2 - Define CKW constants
+ ********************************************************************************/
+ const auto k =
+ _attributes.adj_lhs() ? static_cast<int32_t>(_lhs->dimension(1)) : static_cast<int32_t>(_lhs->dimension(0));
+ const auto k0 = static_cast<int32_t>(adjust_vec_size(_settings.k0(), k));
+ const auto dst_dt = to_ckw(_dst->data_type());
+
+ // CKW constants
+ auto const_k_i32 = writer->declare_constant_tile(ckw::ConstantData({{k}}, ckw::DataType::Int32));
+ auto const_k0_i32 = writer->declare_constant_tile(ckw::ConstantData({{k0}}, ckw::DataType::Int32));
+ auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32));
+ auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32));
+ auto const_0_fp = writer->declare_constant_tile(ckw::ConstantData({{0.0f}}, dst_dt));
+ auto const_k_minus_k0_i32 = writer->declare_constant_tile(ckw::ConstantData({{k - k0}}, ckw::DataType::Int32));
+
+ /********************************************************************************
+ * 3 - Define the compute block parameters and destination tile (if not root component)
+ * Bind the tile to the tensor to share it among different components and
+ * initialize the compute block parameters
+ ********************************************************************************/
+    // The n0 and m0 parameters from root_window only refer to the output
+ const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
+
+ // Destination compute block size
+ const int32_t dst_n0 = root_window.x().step();
+ const int32_t dst_m0 = root_window.y().step();
+
+ // Destination compute block size left-over
+ const int32_t dst_n0_partial = _dst->dimension(0) % dst_n0;
+ const int32_t dst_m0_partial = _dst->dimension(1) % dst_m0;
+
+ // Shift-back for the overlapping-min strategy
+ const int32_t dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0;
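+    // Worked example (illustrative only, assuming the helper emits the usual
+    // coord = max(gid * n0 - shift_back, 0) pattern): with N = 10 and dst_n0 = 4 we get
+    // dst_n0_partial = 2 and dst_shift_back = 2, so the compute blocks start at columns 0, 2 and 6;
+    // every 4-wide block stays inside [0, 10) at the cost of recomputing two overlapping columns.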
+
+ ckw::TensorSampler sampler_dst;
+ sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+ if (dst_n0_partial == 0)
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin);
+ }
+
+ if (dst_m0_partial == 0)
+ {
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::ClampToBorderMaxOnly);
+ }
+
+ sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+ // Declare destination tile
+ auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, dst_m0, dst_n0));
+
+ // Initialize destination tile
+ writer->op_assign(tile_dst, const_0_fp);
+
+ // Bind tile to the tensor
+ dst->init_virtual_tensor(tile_dst, sampler_dst);
+
+ /********************************************************************************
+ * 4 - Define the compute block parameters CKW constants
+ ********************************************************************************/
+    // Only now can we declare N0 and M0 as constants
+ auto const_dst_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32));
+ auto const_dst_m0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_m0}}, ckw::DataType::Int32));
+ auto const_shift_back_dst_n0_i32 =
+ writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32));
+
+ /********************************************************************************
+ * 5 - Define the samplers for the input tensors
+ ********************************************************************************/
+ // LHS SAMPLER
+    // The assumption here is that M is a multiple of M0. This limitation will be removed once
+    // OverlappingMin is supported as the address mode for the Y direction
+ ckw::TensorSampler sampler_lhs;
+ sampler_lhs.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+ sampler_lhs.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ sampler_lhs.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ sampler_lhs.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_lhs.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+ // RHS SAMPLER
+ ckw::TensorSampler sampler_rhs;
+ sampler_rhs.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+ sampler_rhs.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ sampler_rhs.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ sampler_rhs.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_rhs.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+ /********************************************************************************
+ * 6 - Extra operations required before writing the main code (optional)
+ ********************************************************************************/
+
+ // Not required
+
+ /********************************************************************************
+ * 7 - Get the coordinates of the destination tile
+ ********************************************************************************/
+ auto tile_gid_0 = writer->declare_tile("gid_0", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_1 = writer->declare_tile("gid_1", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_2 = writer->declare_tile("gid_2", ckw::TileInfo(ckw::DataType::Int32));
+
+ writer->op_get_global_id(tile_gid_0, 0);
+ writer->op_get_global_id(tile_gid_1, 1);
+ writer->op_get_global_id(tile_gid_2, 2);
+
+ auto tile_idx_n = writer->declare_tile("idx_n", ckw::TileInfo(ckw::DataType::Int32)); // N index
+ auto tile_idx_m = writer->declare_tile("idx_m", ckw::TileInfo(ckw::DataType::Int32)); // M index
+ auto tile_idx_b = writer->declare_tile("idx_b", ckw::TileInfo(ckw::DataType::Int32)); // BATCH index
+
+ // Calculate coordinates
+ get_coordinate_from_gws_overlapping_min(writer, tile_idx_n, tile_gid_0, const_dst_n0_i32,
+ const_shift_back_dst_n0_i32, const_0_i32);
+ get_coordinate_from_gws(writer, tile_idx_m, tile_gid_1, const_dst_m0_i32);
+ get_coordinate_from_gws(writer, tile_idx_b, tile_gid_2, const_pos_1_i32);
+
+ /********************************************************************************
+ * 8 - Write the rest of the code
+ ********************************************************************************/
+ auto tile_idx_k = writer->declare_tile("idx_k", ckw::TileInfo(ckw::DataType::Int32)); // K index
+
+ writer->op_assign(tile_idx_k, const_0_i32);
+
+ // clang-format off
+ writer->op_for_loop(tile_idx_k, ckw::BinaryOp::LessEqual, const_k_minus_k0_i32, tile_idx_k, ckw::AssignmentOp::Increment, const_k0_i32,
+ [&]()
+ {
+ auto tile_lhs = writer->declare_tile("lhs", ckw::TileInfo(to_ckw(_lhs->data_type()), dst_m0, k0));
+ auto tile_rhs = writer->declare_tile("rhs", ckw::TileInfo(to_ckw(_rhs->data_type()), dst_n0, k0));
+ writer->op_assign(tile_lhs, const_0_fp);
+ writer->op_assign(tile_rhs, const_0_fp);
+
+ writer->op_load(tile_lhs, lhs->tensor(), sampler_lhs, tile_idx_k, tile_idx_m, tile_idx_b, const_0_i32);
+ writer->op_load(tile_rhs, rhs->tensor(), sampler_rhs, tile_idx_k, tile_idx_n, tile_idx_b, const_0_i32);
+
+ writer->op_binary(tile_dst, ckw::BinaryOp::MatMul_Nt_T, tile_lhs, tile_rhs);
+
+ });
+
+ // Left-over accumulations for when K is not a multiple of k0
+    if ((k % k0) != 0)
+ {
+ writer->op_for_loop(tile_idx_k, ckw::BinaryOp::Less, const_k_i32, tile_idx_k, ckw::AssignmentOp::Increment, const_pos_1_i32, [&]()
+ {
+ auto tile_lhs = writer->declare_tile("lhs", ckw::TileInfo(to_ckw(_lhs->data_type()), dst_m0, 1));
+ auto tile_rhs = writer->declare_tile("rhs", ckw::TileInfo(to_ckw(_rhs->data_type()), dst_n0, 1));
+ writer->op_assign(tile_lhs, const_0_fp);
+ writer->op_assign(tile_rhs, const_0_fp);
+
+ writer->op_load(tile_lhs, lhs->tensor(), sampler_lhs, tile_idx_k, tile_idx_m, tile_idx_b, const_0_i32);
+ writer->op_load(tile_rhs, rhs->tensor(), sampler_rhs, tile_idx_k, tile_idx_n, tile_idx_b, const_0_i32);
+
+ writer->op_binary(tile_dst, ckw::BinaryOp::MatMul_Nt_T, tile_lhs, tile_rhs);
+ });
+ }
+ // clang-format on
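+    // Loop-bound example (illustrative only): with K = 10 and k0 = 4 the main loop runs for
+    // k = 0 and k = 4 (condition k <= K - k0 = 6), accumulating 8 values, and the left-over loop
+    // then covers k = 8 and k = 9 with a step of 1.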
+}
+
+Window GpuCkwMatMul::get_window() const
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
+
+ const int32_t m = _dst->dimension(1);
+ const int32_t n = _dst->dimension(0);
+ const bool adj_lhs = _attributes.adj_lhs();
+
+ const int32_t m0 = adj_lhs ? adjust_vec_size(_settings.m0(), m) : std::min(_settings.m0(), m);
+ const int32_t n0 = adjust_vec_size(_settings.n0(), n);
+
+ // Configure kernel window
+ Window win = calculate_max_window(_dst->tensor_shape(), Steps(n0, m0));
+ win = win.collapse(win, Window::DimZ);
+
+ return win;
+}
+
+std::string GpuCkwMatMul::get_name(const ComponentGroup &comp_group) const
+{
+ ARM_COMPUTE_UNUSED(comp_group);
+
+ std::string kernel_name("mat_mul_native");
+
+ const int32_t m = _dst->dimension(1);
+ const int32_t n = _dst->dimension(0);
+ const int32_t k = _attributes.adj_lhs() ? _lhs->tensor_shape().y() : _lhs->tensor_shape().x();
+
+ kernel_name += _attributes.adj_lhs() ? "_t" : "_nt";
+ kernel_name += _attributes.adj_rhs() ? "_t" : "_nt";
+ kernel_name += "_";
+ kernel_name += support::cpp11::to_string(m);
+ kernel_name += "_";
+ kernel_name += support::cpp11::to_string(n);
+ kernel_name += "_";
+ kernel_name += support::cpp11::to_string(k);
+ kernel_name += "_";
+ kernel_name += support::cpp11::to_string(_dst->dimension(2));
+ kernel_name += "_";
+ kernel_name += support::cpp11::to_string(_settings.m0());
+ kernel_name += "_";
+ kernel_name += support::cpp11::to_string(_settings.n0());
+ kernel_name += "_";
+ kernel_name += support::cpp11::to_string(_settings.k0());
+
+ return kernel_name;
+}
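+// Naming example (illustrative only, hypothetical sizes): a non-transposed LHS / transposed RHS case
+// with M = 64, N = 32, K = 128, batch = 1 and m0 = n0 = k0 = 4 produces
+// "mat_mul_native_nt_t_64_32_128_1_4_4_4".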
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.h
new file mode 100644
index 0000000000..790418bf50
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWMATMUL_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWMATMUL_H
+
+#include "arm_compute/dynamic_fusion/sketch/attributes/MatMulAttributes.h"
+
+#include "src/core/common/Macros.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class GpuCkwMatMul final : public IGpuCkwComponentDriver
+{
+public:
+ using Attributes = ClComponentMatMul::Attributes;
+ using Settings = ClComponentMatMul::Settings;
+
+public:
+ /** Constructor
+ *
+ * For supported configurations please refer to @ref ClComponentMatMul::validate()
+ *
+ * @param[in] id Component id
+ * @param[in] tensors Tensor arguments to the component
+ * @param[in] attributes Component attributes. Attributes are a set of parameters that define what a component does
+ * @param[in] settings Component settings. Settings are a set of parameters that influence the implementation of a component
+ */
+ GpuCkwMatMul(ComponentId id,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings);
+
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwMatMul);
+
+ /** Destructor */
+ ~GpuCkwMatMul() override = default;
+
+    // Inherited methods overridden
+ virtual void write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const override;
+ Window get_window() const override;
+ std::string get_name(const ComponentGroup &comp_group) const override;
+
+private:
+ const ITensorInfo *_lhs;
+ const ITensorInfo *_rhs;
+ const ITensorInfo *_dst;
+
+ Attributes _attributes;
+ Settings _settings;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWMATMUL_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp
new file mode 100644
index 0000000000..d027f348ef
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp
@@ -0,0 +1,405 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+
+#include "compute_kernel_writer/include/ckw/KernelWriter.h"
+#include <cstdint>
+#include <limits>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+GpuCkwPool2d::GpuCkwPool2d(ComponentId id,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings)
+ : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{}, _attributes{attributes}, _settings{settings}
+{
+ _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
+ _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
+}
+
+void GpuCkwPool2d::write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const
+{
+ const uint32_t width_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::WIDTH);
+ const uint32_t height_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::HEIGHT);
+
+ /********************************************************************************
+ * 1 - Define tensors
+ ********************************************************************************/
+ GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src");
+ GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
+
+ /********************************************************************************
+ * 2 - Define CKW constants
+ ********************************************************************************/
+ const auto dst_dt = to_ckw(_dst->data_type());
+ const auto pool_sz_x = static_cast<int32_t>(_attributes.pool_size().x());
+ const auto pool_sz_y = static_cast<int32_t>(_attributes.pool_size().y());
+ const auto pad_x = static_cast<int32_t>(_attributes.pad().left);
+ const auto pad_y = static_cast<int32_t>(_attributes.pad().top);
+ const auto stride_x = static_cast<int32_t>(_attributes.stride().x());
+ const auto stride_y = static_cast<int32_t>(_attributes.stride().y());
+ const auto src_w = static_cast<int32_t>(_src->dimension(width_idx));
+ const auto src_h = static_cast<int32_t>(_src->dimension(height_idx));
+ const auto dst_h = static_cast<int32_t>(_dst->dimension(height_idx));
+
+ // CKW constants
+ auto const_pool_sz_x_i32 = writer->declare_constant_tile(ckw::ConstantData({{pool_sz_x}}, ckw::DataType::Int32));
+ auto const_pool_sz_y_i32 = writer->declare_constant_tile(ckw::ConstantData({{pool_sz_y}}, ckw::DataType::Int32));
+ auto const_pad_x_i32 = writer->declare_constant_tile(ckw::ConstantData({{pad_x}}, ckw::DataType::Int32));
+ auto const_pad_y_i32 = writer->declare_constant_tile(ckw::ConstantData({{pad_y}}, ckw::DataType::Int32));
+ auto const_stride_x_i32 = writer->declare_constant_tile(ckw::ConstantData({{stride_x}}, ckw::DataType::Int32));
+ auto const_stride_y_i32 = writer->declare_constant_tile(ckw::ConstantData({{stride_y}}, ckw::DataType::Int32));
+ auto const_src_w_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_w}}, ckw::DataType::Int32));
+ auto const_src_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_h}}, ckw::DataType::Int32));
+ auto const_dst_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_h}}, ckw::DataType::Int32));
+ auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32));
+ auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32));
+ auto const_0_fp = writer->declare_constant_tile(ckw::ConstantData({{0.0f}}, dst_dt));
+ auto const_lowest_val_fp =
+ writer->declare_constant_tile(ckw::ConstantData({{std::numeric_limits<float>::lowest()}}, ckw::DataType::Fp32));
+    auto const_neg_inf_val_fp = writer->declare_constant_tile(
+        ckw::ConstantData({{-std::numeric_limits<float>::infinity()}}, ckw::DataType::Fp32));
+
+ /********************************************************************************
+ * 3 - Define the compute block parameters and destination tile (if not root component)
+ * Bind the tile to the tensor to share it among different components and
+ * initialize the compute block parameters
+ ********************************************************************************/
+    // The n0 and m0 parameters from root_window only refer to the output
+ const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
+
+ // Destination compute block size
+ const int32_t dst_n0 = root_window.x().step();
+ const int32_t dst_m0 = root_window.y().step();
+
+ // Destination compute block size left-over
+ const int32_t dst_n0_partial = _dst->dimension(0) % dst_n0;
+ const int32_t dst_m0_partial = _dst->dimension(1) % dst_m0;
+
+ // Shift-back for the overlapping-min strategy
+ const int32_t dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0;
+
+ ckw::TensorSampler sampler_dst;
+ sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+ if (dst_n0_partial == 0)
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin);
+ }
+
+ if (dst_m0_partial == 0)
+ {
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::ClampToBorderMaxOnly);
+ }
+
+ sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+ // Declare destination tile
+ auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, dst_m0, dst_n0));
+
+ // Initialize destination tile
+ writer->op_assign(tile_dst, const_0_fp);
+
+ // Bind tile to the tensor
+ dst->init_virtual_tensor(tile_dst, sampler_dst);
+
+ /********************************************************************************
+ * 4 - Define the compute block parameters CKW constants
+ ********************************************************************************/
+    // Only now can we declare N0 and M0 as constants
+ auto const_dst_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32));
+ auto const_dst_m0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_m0}}, ckw::DataType::Int32));
+ auto const_shift_back_dst_n0_i32 =
+ writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32));
+
+ /********************************************************************************
+ * 5 - Define the sampler for the input tensor
+ ********************************************************************************/
+ ckw::TensorSampler sampler_src;
+ sampler_src.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+ sampler_src.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ sampler_src.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ sampler_src.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+
+ /********************************************************************************
+ * 6 - Extra operations required before writing the main code
+ ********************************************************************************/
+ // Check if it is global pooling
+ const bool is_global_pooling = (pool_sz_x == src_w) && (pool_sz_y == src_h) && (pad_x == 0) && (pad_y == 0);
+
+    // Accumulate in F32 unless the destination is F16 and the pooling type is MAX
+ const bool acc_f32 = (dst_dt == ckw::DataType::Fp32) ||
+ ((dst_dt == ckw::DataType::Fp16) && _attributes.pool_type() != PoolingType::MAX);
+
+ const auto acc_dt = acc_f32 ? ckw::DataType::Fp32 : ckw::DataType::Fp16;
+
+ const bool is_wider_acc = dst_dt != acc_dt;
+
+ /********************************************************************************
+ * 7 - Get the coordinates of the destination tile
+ ********************************************************************************/
+ auto tile_gid_0 = writer->declare_tile("gid_0", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_1 = writer->declare_tile("gid_1", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_2 = writer->declare_tile("gid_2", ckw::TileInfo(ckw::DataType::Int32));
+
+ writer->op_get_global_id(tile_gid_0, 0);
+ writer->op_get_global_id(tile_gid_1, 1);
+ writer->op_get_global_id(tile_gid_2, 2);
+
+ auto tile_cout0 = writer->declare_tile("cout0", ckw::TileInfo(ckw::DataType::Int32)); // OFM
+ auto tile_mout0 = writer->declare_tile("mout0", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH
+ auto tile_mout1 = writer->declare_tile("mout1", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT
+ auto tile_bout0 = writer->declare_tile("bout0", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX
+
+ // Calculate coordinates
+ get_coordinate_from_gws_overlapping_min(writer, tile_cout0, tile_gid_0, const_dst_n0_i32,
+ const_shift_back_dst_n0_i32, const_0_i32);
+ get_coordinate_from_gws(writer, tile_mout0, tile_gid_1, const_dst_m0_i32);
+ writer->op_binary(tile_mout1, ckw::BinaryOp::Mod, tile_gid_2, const_dst_h_i32);
+ writer->op_binary(tile_bout0, ckw::BinaryOp::Div, tile_gid_2, const_dst_h_i32);
+
+ /********************************************************************************
+ * 8 - Write the rest of the code
+ ********************************************************************************/
+ // A tile used to temporarily store results or as an accumulator in case of AVG and L2 pooling.
+ auto tile_res = writer->declare_tile("tile_res", ckw::TileInfo(acc_dt, dst_m0, dst_n0));
+
+    // Initialize the result tile with the appropriate value
+ if (_attributes.pool_type() == PoolingType::MAX)
+ {
+ if (_settings.use_inf_as_limit())
+ {
+ writer->op_cast(tile_res, const_neg_inf_val_fp, ckw::ConvertPolicy::None);
+ }
+ else
+ {
+ writer->op_cast(tile_res, const_lowest_val_fp, ckw::ConvertPolicy::None);
+ }
+ }
+ else
+ {
+ writer->op_cast(tile_res, const_0_fp, ckw::ConvertPolicy::None);
+ }
+
+ // tile_idx_in_w = tile_mout0 * STRIDE_X - PAD_X
+ auto tile_src_coord_x_start = writer->declare_tile("idx_in_w", ckw::DataType::Int32);
+ writer->op_binary(tile_src_coord_x_start, ckw::BinaryOp::Mul, tile_mout0, const_stride_x_i32);
+ writer->op_binary(tile_src_coord_x_start, ckw::BinaryOp::Sub, tile_src_coord_x_start, const_pad_x_i32);
+
+ // tile_idx_in_h = tile_mout1 * STRIDE_Y - PAD_Y
+ auto tile_src_coord_y_start = writer->declare_tile("idx_in_h", ckw::DataType::Int32);
+ writer->op_binary(tile_src_coord_y_start, ckw::BinaryOp::Mul, tile_mout1, const_stride_y_i32);
+ writer->op_binary(tile_src_coord_y_start, ckw::BinaryOp::Sub, tile_src_coord_y_start, const_pad_y_i32);
+
+ auto tile_neg_src_coord_x_start = writer->declare_tile("neg_src_coord_x_start", ckw::DataType::Int32);
+ auto tile_neg_src_coord_y_start = writer->declare_tile("neg_src_coord_y_start", ckw::DataType::Int32);
+
+ writer->op_binary(tile_neg_src_coord_x_start, ckw::BinaryOp::Sub, const_0_i32, tile_src_coord_x_start);
+ writer->op_binary(tile_neg_src_coord_y_start, ckw::BinaryOp::Sub, const_0_i32, tile_src_coord_y_start);
+
+ // int pool_x_s = max((int)0, -idx_in_w);
+ // int pool_x_e = min((int)POOL_SIZE_X, (int)SRC_WIDTH - idx_in_w);
+ // int pool_y_s = max((int)0, -idx_in_h);
+ // int pool_y_e = min((int)POOL_SIZE_Y, (int)SRC_HEIGHT - idx_in_h);
+ auto tile_pool_x_s = writer->declare_tile("pool_x_s", ckw::DataType::Int32);
+ auto tile_pool_y_s = writer->declare_tile("pool_y_s", ckw::DataType::Int32);
+ auto tile_pool_x_e = writer->declare_tile("pool_x_e", ckw::DataType::Int32);
+ auto tile_pool_y_e = writer->declare_tile("pool_y_e", ckw::DataType::Int32);
+
+ writer->op_binary(tile_pool_x_s, ckw::BinaryOp::Max, const_0_i32, tile_neg_src_coord_x_start);
+ writer->op_binary(tile_pool_x_e, ckw::BinaryOp::Add, const_src_w_i32, tile_neg_src_coord_x_start);
+ writer->op_binary(tile_pool_x_e, ckw::BinaryOp::Min, const_pool_sz_x_i32, tile_pool_x_e);
+ writer->op_binary(tile_pool_y_s, ckw::BinaryOp::Max, const_0_i32, tile_neg_src_coord_y_start);
+ writer->op_binary(tile_pool_y_e, ckw::BinaryOp::Add, const_src_h_i32, tile_neg_src_coord_y_start);
+ writer->op_binary(tile_pool_y_e, ckw::BinaryOp::Min, const_pool_sz_y_i32, tile_pool_y_e);
+
+ // #if defined(EXCLUDE_PADDING)
+ // int filter_size = (pool_y_e - pool_y_s) * (pool_x_e - pool_x_s);
+ // #else // defined(EXCLUDE_PADDING)
+ // int filter_size = POOL_SIZE_X * POOL_SIZE_Y;
+ // #endif // defined(EXCLUDE_PADDING)
+ auto tile_filter_size = writer->declare_tile("filter_size", ckw::DataType::Int32);
+ if (_attributes.exclude_padding())
+ {
+ auto tile_x_diff = writer->declare_tile("x_diff", ckw::DataType::Int32);
+ auto tile_y_diff = writer->declare_tile("y_diff", ckw::DataType::Int32);
+
+ writer->op_binary(tile_x_diff, ckw::BinaryOp::Sub, tile_pool_x_e, tile_pool_x_s);
+ writer->op_binary(tile_y_diff, ckw::BinaryOp::Sub, tile_pool_y_e, tile_pool_y_s);
+ writer->op_binary(tile_filter_size, ckw::BinaryOp::Mul, tile_x_diff, tile_y_diff);
+ }
+ else
+ {
+ writer->op_binary(tile_filter_size, ckw::BinaryOp::Mul, const_pool_sz_x_i32, const_pool_sz_y_i32);
+ }
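+    // Worked example (illustrative only, hypothetical values): with SRC_WIDTH = 10, POOL_SIZE_X = 3,
+    // PAD_X = 1, STRIDE_X = 2 and output column mout0 = 0, idx_in_w = -1, so pool_x_s = max(0, 1) = 1
+    // and pool_x_e = min(3, 10 + 1) = 3; the loops therefore visit source columns 0 and 1 only, and
+    // with exclude_padding the x-extent contributing to filter_size is 3 - 1 = 2.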
+
+ auto tile_x = writer->declare_tile("x", ckw::DataType::Int32);
+ auto tile_y = writer->declare_tile("y", ckw::DataType::Int32);
+
+ if (is_global_pooling)
+ {
+ writer->op_assign(tile_y, const_0_i32);
+ writer->op_assign(tile_pool_y_e, const_pool_sz_y_i32);
+ }
+ else
+ {
+ writer->op_assign(tile_y, tile_pool_y_s);
+ }
+
+ // Y dim for-loop
+ writer->op_for_loop(
+ tile_y, ckw::BinaryOp::Less, tile_pool_y_e, tile_y, ckw::AssignmentOp::Increment, const_pos_1_i32,
+ [&]()
+ {
+ // Reset the iterator for the inner loop
+ if (is_global_pooling)
+ {
+ writer->op_assign(tile_x, const_0_i32);
+ writer->op_assign(tile_pool_x_e, const_pool_sz_x_i32);
+ }
+ else
+ {
+ writer->op_assign(tile_x, tile_pool_x_s);
+ }
+
+ auto tile_src_coord_y = writer->declare_tile("src_coord_y", ckw::DataType::Int32);
+ writer->op_binary(tile_src_coord_y, ckw::BinaryOp::Add, tile_src_coord_y_start, tile_y);
+
+ // X dim for-loop
+ writer->op_for_loop(
+ tile_x, ckw::BinaryOp::Less, tile_pool_x_e, tile_x, ckw::AssignmentOp::Increment, const_pos_1_i32,
+ [&]()
+ {
+ auto tile_src_coord_x = writer->declare_tile("src_coord_x", ckw::DataType::Int32);
+ writer->op_binary(tile_src_coord_x, ckw::BinaryOp::Add, tile_src_coord_x_start, tile_x);
+
+ ckw::DataType src_dt = to_ckw(_src->data_type());
+ auto tile_src = writer->declare_tile("tile_src", ckw::TileInfo(acc_dt, dst_m0, dst_n0));
+
+ // Load src tile
+ if (is_wider_acc)
+ {
+ auto tile_src0 = writer->declare_tile("src_tile0", ckw::TileInfo(src_dt, dst_m0, dst_n0));
+ writer->op_load(tile_src0, src->tensor(), sampler_src, tile_cout0, tile_src_coord_x,
+ tile_src_coord_y, tile_bout0);
+ writer->op_cast(tile_src, tile_src0, ckw::ConvertPolicy::None);
+ }
+ else
+ {
+ writer->op_load(tile_src, src->tensor(), sampler_src, tile_cout0, tile_src_coord_x,
+ tile_src_coord_y, tile_bout0);
+ }
+
+ // Take the square of the input, for L2 Pooling
+ if (_attributes.pool_type() == PoolingType::L2)
+ {
+ writer->op_binary(tile_src, ckw::BinaryOp::Mul, tile_src, tile_src);
+ }
+
+                // Perform the pooling op
+ if (_attributes.pool_type() == PoolingType::MAX)
+ {
+ writer->op_binary(tile_res, ckw::BinaryOp::Max, tile_res, tile_src);
+ }
+ else
+ {
+ writer->op_binary(tile_res, ckw::BinaryOp::Add, tile_res, tile_src);
+ }
+ });
+ });
+
+ if ((_attributes.pool_type() == PoolingType::AVG) || (_attributes.pool_type() == PoolingType::L2))
+ {
+        // filter_size is automatically broadcast in the operation
+ auto tile_filter_size_fp = writer->declare_tile("filter_size_fp", ckw::TileInfo(acc_dt));
+ writer->op_cast(tile_filter_size_fp, tile_filter_size, ckw::ConvertPolicy::None);
+ writer->op_binary(tile_res, ckw::BinaryOp::Div, tile_res, tile_filter_size_fp);
+ }
+
+ // Take square root of the result in L2 pooling
+ if (_attributes.pool_type() == PoolingType::L2)
+ {
+ writer->op_unary(tile_res, ckw::UnaryOp::Sqrt, tile_res);
+ }
+
+ // Store the results and do casting if mixed precision
+ if (is_wider_acc)
+ {
+ writer->op_cast(tile_dst, tile_res, ckw::ConvertPolicy::None);
+ }
+ else
+ {
+ writer->op_assign(tile_dst, tile_res);
+ }
+}
+
+Window GpuCkwPool2d::get_window() const
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
+
+ TensorShape output_shape = _dst->tensor_shape();
+ const uint32_t vec_size = adjust_vec_size(((_dst->data_type() == DataType::F32) ? 2 : 4), _dst->dimension(0));
+ // Create and configure kernel window
+ auto win = calculate_max_window(output_shape, Steps(vec_size));
+ win = win.collapse_if_possible(win, Window::DimZ); // collapse window on batch size.
+ return win;
+}
+
+std::string GpuCkwPool2d::get_name(const ComponentGroup &comp_group) const
+{
+ ARM_COMPUTE_UNUSED(comp_group);
+
+ return "pool2dMxN";
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h
new file mode 100644
index 0000000000..822282a108
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWPOOL2D_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWPOOL2D_H
+
+#include "src/core/common/Macros.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h"
+
+#include <string>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class GpuCkwPool2d : public IGpuCkwComponentDriver
+{
+public:
+ using Attributes = ClComponentPool2d::Attributes;
+ using Settings = ClComponentPool2d::Settings;
+
+ /** Constructor
+ *
+     * For supported configurations please refer to @ref ClComponentPool2d::validate()
+ *
+ * @param[in] id Component id
+ * @param[in] tensors Tensor arguments to the component
+ * @param[in] attributes Component attributes
+ * @param[in] settings Component settings
+ */
+ GpuCkwPool2d(ComponentId id,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings);
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwPool2d);
+ /** Destructor */
+ ~GpuCkwPool2d() override = default;
+    // Inherited methods overridden:
+ virtual void write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const override;
+ Window get_window() const override;
+ std::string get_name(const ComponentGroup &comp_group) const override;
+
+private:
+ const ITensorInfo *_src;
+ const ITensorInfo *_dst;
+ Attributes _attributes;
+ Settings _settings;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWPOOL2D_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.cpp
new file mode 100644
index 0000000000..edd7ea9a38
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.cpp
@@ -0,0 +1,576 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/utils/ScaleUtils.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "support/StringSupport.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+constexpr uint32_t opencl_vector_size_in_bytes = 16;
+} // namespace
+
+GpuCkwResize::GpuCkwResize(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes)
+ : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{}, _attributes{attributes}
+{
+ _src = this->tensors().get_const_tensor(TensorType::ACL_SRC);
+ _dst = this->tensors().get_const_tensor(TensorType::ACL_DST);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
+}
+
+void GpuCkwResize::do_nearest_neighbor_resize(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const
+{
+ const uint32_t width_idx = get_data_layout_dimension_index(_dst->data_layout(), DataLayoutDimension::WIDTH);
+ const uint32_t height_idx = get_data_layout_dimension_index(_dst->data_layout(), DataLayoutDimension::HEIGHT);
+
+ /********************************************************************************
+ * 1 - Define tensors
+ ********************************************************************************/
+ GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src");
+ GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
+
+ /********************************************************************************
+ * 2 - Define CKW constants
+ ********************************************************************************/
+ const auto dst_dt = to_ckw(_dst->data_type());
+ const float scale_x = scale_utils::calculate_resize_ratio(_src->dimension(width_idx), _dst->dimension(width_idx),
+ _attributes.align_corners());
+ const float scale_y = scale_utils::calculate_resize_ratio(_src->dimension(height_idx), _dst->dimension(height_idx),
+ _attributes.align_corners());
+ const auto src_w = static_cast<int32_t>(_src->dimension(width_idx));
+ const auto src_h = static_cast<int32_t>(_src->dimension(height_idx));
+ const auto dst_h = static_cast<int32_t>(_dst->dimension(height_idx));
+
+ // CKW constants
+ auto const_src_w_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_w}}, ckw::DataType::Int32));
+ auto const_src_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_h}}, ckw::DataType::Int32));
+ auto const_dst_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_h}}, ckw::DataType::Int32));
+ auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32));
+ auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32));
+ auto const_0_fp = writer->declare_constant_tile(ckw::ConstantData({{0.0f}}, dst_dt));
+ auto const_pos_0_5_fp = writer->declare_constant_tile(ckw::ConstantData({{0.5f}}, ckw::DataType::Fp32));
+ auto const_scale_x_fp = writer->declare_constant_tile(ckw::ConstantData({{scale_x}}, ckw::DataType::Fp32));
+ auto const_scale_y_fp = writer->declare_constant_tile(ckw::ConstantData({{scale_y}}, ckw::DataType::Fp32));
+
+ /********************************************************************************
+ * 3 - Define the compute block parameters and destination tile (if not root component)
+ * Bind the tile to the tensor to share it among different components and
+ * initialize the compute block parameters
+ ********************************************************************************/
+    // The n0 and m0 parameters from root_window only refer to the output
+ const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
+
+ // Destination compute block size
+ const int32_t dst_n0 = root_window.x().step();
+
+ // dst_m0 must be 1
+ ARM_COMPUTE_ERROR_ON(root_window.y().step() != 1);
+
+ // Destination compute block size left-over
+ const int32_t dst_n0_partial = _dst->dimension(0) % dst_n0;
+
+ // Shift-back for the overlapping-min strategy
+ const int32_t dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0;
+
+ ckw::TensorSampler sampler_dst;
+ sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+ if (dst_n0_partial == 0)
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin);
+ }
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+ // Declare destination tile
+ auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, 1, dst_n0));
+
+ // Initialize destination tile
+ writer->op_assign(tile_dst, const_0_fp);
+
+ // Bind tile to the tensor
+ dst->init_virtual_tensor(tile_dst, sampler_dst);
+
+ /********************************************************************************
+ * 4 - Define the compute block parameters CKW constants
+ ********************************************************************************/
+ auto const_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32));
+ auto const_shift_back_n0_i32 =
+ writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32));
+
+ /********************************************************************************
+ * 5 - Define the samplers for the input tensor
+ ********************************************************************************/
+ ckw::TensorSampler sampler_src;
+ sampler_src.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+ sampler_src.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ sampler_src.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ sampler_src.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+
+ /********************************************************************************
+ * 6 - Extra operations required before writing the main code
+ ********************************************************************************/
+
+ // ....
+
+ /********************************************************************************
+ * 7 - Get the coordinates of the destination tile
+ ********************************************************************************/
+ auto tile_gid_0 = writer->declare_tile("gid_0", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_1 = writer->declare_tile("gid_1", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_2 = writer->declare_tile("gid_2", ckw::TileInfo(ckw::DataType::Int32));
+
+ writer->op_get_global_id(tile_gid_0, 0);
+ writer->op_get_global_id(tile_gid_1, 1);
+ writer->op_get_global_id(tile_gid_2, 2);
+
+ auto tile_co = writer->declare_tile("co", ckw::TileInfo(ckw::DataType::Int32)); // OFM
+ auto tile_xo = writer->declare_tile("xo", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH
+ auto tile_yo = writer->declare_tile("yo", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT
+ auto tile_bo = writer->declare_tile("bo", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX
+
+ // Calculate coordinates
+ get_coordinate_from_gws_overlapping_min(writer, tile_co, tile_gid_0, const_n0_i32, const_shift_back_n0_i32,
+ const_0_i32);
+ writer->op_assign(tile_xo, tile_gid_1);
+ writer->op_binary(tile_yo, ckw::BinaryOp::Mod, tile_gid_2, const_dst_h_i32);
+ writer->op_binary(tile_bo, ckw::BinaryOp::Div, tile_gid_2, const_dst_h_i32);
+
+ /********************************************************************************
+ * 8 - Write the rest of the code
+ ********************************************************************************/
+ auto tile_xi_f = writer->declare_tile("xi_f", ckw::DataType::Fp32);
+ auto tile_yi_f = writer->declare_tile("yi_f", ckw::DataType::Fp32);
+
+ switch (_attributes.sampling_policy())
+ {
+ case SamplingPolicy::TOP_LEFT:
+ // xi_f = (xo * scale_x)
+ // yi_f = (yo * scale_y)
+ writer->op_cast(tile_xi_f, tile_xo, ckw::ConvertPolicy::None);
+ writer->op_cast(tile_yi_f, tile_yo, ckw::ConvertPolicy::None);
+ writer->op_binary(tile_xi_f, ckw::BinaryOp::Mul, tile_xi_f, const_scale_x_fp);
+ writer->op_binary(tile_yi_f, ckw::BinaryOp::Mul, tile_yi_f, const_scale_y_fp);
+ break;
+ case SamplingPolicy::CENTER:
+ {
+ // xi_f = ((xo + 0.5f) * scale_x)
+ // yi_f = ((yo + 0.5f) * scale_y)
+ const auto &tile_xo_plus_half = writer->declare_tile("xo_plus_half", ckw::DataType::Fp32);
+ const auto &tile_yo_plus_half = writer->declare_tile("yo_plus_half", ckw::DataType::Fp32);
+
+ writer->op_cast(tile_xo_plus_half, tile_xo, ckw::ConvertPolicy::None);
+ writer->op_cast(tile_yo_plus_half, tile_yo, ckw::ConvertPolicy::None);
+ writer->op_binary(tile_xo_plus_half, ckw::BinaryOp::Add, tile_xo_plus_half, const_pos_0_5_fp);
+ writer->op_binary(tile_yo_plus_half, ckw::BinaryOp::Add, tile_yo_plus_half, const_pos_0_5_fp);
+ writer->op_binary(tile_xi_f, ckw::BinaryOp::Mul, tile_xo_plus_half, const_scale_x_fp);
+ writer->op_binary(tile_yi_f, ckw::BinaryOp::Mul, tile_yo_plus_half, const_scale_y_fp);
+ }
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported sampling policy");
+ }
+
+ if (_attributes.align_corners())
+ {
+ writer->op_unary(tile_xi_f, ckw::UnaryOp::Round, tile_xi_f);
+ writer->op_unary(tile_yi_f, ckw::UnaryOp::Round, tile_yi_f);
+ }
+
+ // xi0 = clamp((int)xi_f, 0, (int)src_w - 1)
+ // yi0 = clamp((int)yi_f, 0, (int)src_h - 1)
+ auto tile_xi_f_int = writer->declare_tile("xi_f_int", ckw::DataType::Int32);
+ auto tile_yi_f_int = writer->declare_tile("yi_f_int", ckw::DataType::Int32);
+
+ writer->op_cast(tile_xi_f_int, tile_xi_f, ckw::ConvertPolicy::None);
+ writer->op_cast(tile_yi_f_int, tile_yi_f, ckw::ConvertPolicy::None);
+
+ auto tile_src_w_minus_1 = writer->declare_tile("src_w_minus_1", ckw::DataType::Int32);
+ auto tile_src_h_minus_1 = writer->declare_tile("src_h_minus_1", ckw::DataType::Int32);
+
+ writer->op_binary(tile_src_w_minus_1, ckw::BinaryOp::Sub, const_src_w_i32, const_pos_1_i32);
+ writer->op_binary(tile_src_h_minus_1, ckw::BinaryOp::Sub, const_src_h_i32, const_pos_1_i32);
+
+ auto tile_xi0 = writer->declare_tile("xi0", ckw::DataType::Int32);
+ auto tile_yi0 = writer->declare_tile("yi0", ckw::DataType::Int32);
+
+ writer->op_ternary(tile_xi0, ckw::TernaryOp::Clamp, tile_xi_f_int, const_0_i32, tile_src_w_minus_1);
+ writer->op_ternary(tile_yi0, ckw::TernaryOp::Clamp, tile_yi_f_int, const_0_i32, tile_src_h_minus_1);
+
+ auto tile_src = writer->declare_tile("src_tile", ckw::TileInfo(dst_dt, 1, dst_n0));
+ writer->op_load(tile_src, src->tensor(), sampler_src, tile_co, tile_xi0, tile_yi0, tile_bo);
+
+ writer->op_assign(tile_dst, tile_src);
+}
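+// Coordinate example (illustrative only, assuming the Fp32 -> Int32 cast truncates towards zero):
+// resizing a 4x4 source to 8x8 with SamplingPolicy::CENTER and align_corners = false gives
+// scale_x = 4 / 8 = 0.5; for xo = 5, xi_f = (5 + 0.5) * 0.5 = 2.75, which truncates to xi0 = 2,
+// already inside the clamp range [0, src_w - 1] = [0, 3].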
+
+void GpuCkwResize::do_bilinear_resize(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const
+{
+ const size_t width_idx = get_data_layout_dimension_index(_dst->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t height_idx = get_data_layout_dimension_index(_dst->data_layout(), DataLayoutDimension::HEIGHT);
+
+ /********************************************************************************
+ * 1 - Define tensors
+ ********************************************************************************/
+ GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src");
+ GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
+
+ /********************************************************************************
+ * 2 - Define CKW constants
+ ********************************************************************************/
+ const auto dst_dt = to_ckw(_dst->data_type());
+ const float scale_x = scale_utils::calculate_resize_ratio(_src->dimension(width_idx), _dst->dimension(width_idx),
+ _attributes.align_corners());
+ const float scale_y = scale_utils::calculate_resize_ratio(_src->dimension(height_idx), _dst->dimension(height_idx),
+ _attributes.align_corners());
+ const auto src_w = static_cast<int32_t>(_src->dimension(width_idx));
+ const auto src_h = static_cast<int32_t>(_src->dimension(height_idx));
+ const auto dst_h = static_cast<int32_t>(_dst->dimension(height_idx));
+
+ // CKW constants
+ auto const_src_w_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_w}}, ckw::DataType::Int32));
+ auto const_src_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_h}}, ckw::DataType::Int32));
+ auto const_dst_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_h}}, ckw::DataType::Int32));
+ auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32));
+ auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32));
+ auto const_0_fp = writer->declare_constant_tile(ckw::ConstantData({{0.0f}}, dst_dt));
+ auto const_pos_1_fp = writer->declare_constant_tile(ckw::ConstantData({{1.0f}}, ckw::DataType::Fp32));
+ auto const_pos_0_5_fp = writer->declare_constant_tile(ckw::ConstantData({{0.5f}}, ckw::DataType::Fp32));
+ auto const_scale_x_fp = writer->declare_constant_tile(ckw::ConstantData({{scale_x}}, ckw::DataType::Fp32));
+ auto const_scale_y_fp = writer->declare_constant_tile(ckw::ConstantData({{scale_y}}, ckw::DataType::Fp32));
+
+ /********************************************************************************
+ * 3 - Define the compute block parameters and destination tile (if not root component)
+ * Bind the tile to the tensor to share it among different components and
+ * initialize the compute block parameters
+ ********************************************************************************/
+    // The n0 and m0 parameters from root_window refer only to the output
+ const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
+
+ // Destination compute block size
+ const int32_t dst_n0 = root_window.x().step();
+
+ // dst_m0 must be 1
+ ARM_COMPUTE_ERROR_ON(root_window.y().step() != 1);
+
+ // Destination compute block size left-over
+ const int32_t dst_n0_partial = _dst->dimension(0) % dst_n0;
+
+ // Shift-back for the overlapping-min strategy
+ const int32_t dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0;
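+    // For example, a destination dimension 0 of 10 with dst_n0 = 4 leaves a partial block of 2, so dst_shift_back = 2
+    // and the overlapping-min addressing re-processes two elements instead of stepping past the tensor edge.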
+
+ ckw::TensorSampler sampler_dst;
+ sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+ if (dst_n0_partial == 0)
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin);
+ }
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+ // Declare destination tile
+ auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, 1, dst_n0));
+
+ // Initialize destination tile
+ writer->op_assign(tile_dst, const_0_fp);
+
+ // Bind tile to the tensor
+ dst->init_virtual_tensor(tile_dst, sampler_dst);
+
+ /********************************************************************************
+ * 4 - Define the compute block parameters CKW constants
+ ********************************************************************************/
+ auto const_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32));
+ auto const_shift_back_n0_i32 =
+ writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32));
+
+ /********************************************************************************
+ * 5 - Define the sampler for the input tensor
+ ********************************************************************************/
+ ckw::TensorSampler sampler_src;
+ sampler_src.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+ sampler_src.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ sampler_src.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ sampler_src.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+
+ /********************************************************************************
+ * 6 - Extra operations required before writing the main code
+ ********************************************************************************/
+
+    // Not required
+
+ /********************************************************************************
+ * 7 - Get the coordinates of the destination tile
+ ********************************************************************************/
+ auto tile_gid_0 = writer->declare_tile("gid_0", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_1 = writer->declare_tile("gid_1", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_2 = writer->declare_tile("gid_2", ckw::TileInfo(ckw::DataType::Int32));
+
+ writer->op_get_global_id(tile_gid_0, 0);
+ writer->op_get_global_id(tile_gid_1, 1);
+ writer->op_get_global_id(tile_gid_2, 2);
+
+ auto tile_co = writer->declare_tile("co", ckw::TileInfo(ckw::DataType::Int32)); // OFM
+ auto tile_xo = writer->declare_tile("xo", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH
+ auto tile_yo = writer->declare_tile("yo", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT
+ auto tile_bo = writer->declare_tile("bo", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX
+
+ // Calculate coordinates
+ get_coordinate_from_gws_overlapping_min(writer, tile_co, tile_gid_0, const_n0_i32, const_shift_back_n0_i32,
+ const_0_i32);
+ writer->op_assign(tile_xo, tile_gid_1);
+ writer->op_binary(tile_yo, ckw::BinaryOp::Mod, tile_gid_2, const_dst_h_i32);
+ writer->op_binary(tile_bo, ckw::BinaryOp::Div, tile_gid_2, const_dst_h_i32);
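+    // gid_2 iterates over HEIGHT x BATCH, hence the Mod/Div split above to recover yo and bo.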
+
+ /********************************************************************************
+ * 8 - Write the rest of the code
+ ********************************************************************************/
+ auto tile_xi_f = writer->declare_tile("xi_f", ckw::DataType::Fp32);
+ auto tile_yi_f = writer->declare_tile("yi_f", ckw::DataType::Fp32);
+
+ switch (_attributes.sampling_policy())
+ {
+ case SamplingPolicy::TOP_LEFT:
+ // xi_f = (xo * scale_x)
+ // yi_f = (yo * scale_y)
+ writer->op_cast(tile_xi_f, tile_xo, ckw::ConvertPolicy::None);
+ writer->op_cast(tile_yi_f, tile_yo, ckw::ConvertPolicy::None);
+ writer->op_binary(tile_xi_f, ckw::BinaryOp::Mul, tile_xi_f, const_scale_x_fp);
+ writer->op_binary(tile_yi_f, ckw::BinaryOp::Mul, tile_yi_f, const_scale_y_fp);
+ break;
+ case SamplingPolicy::CENTER:
+ {
+ // xi_f = ((xo + 0.5f) * scale_x - 0.5f)
+ // yi_f = ((yo + 0.5f) * scale_y - 0.5f)
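+            // Subtracting 0.5f after scaling maps the destination pixel centre onto the source grid, so the 2x2
+            // neighbourhood loaded below is centred on the true sampling position.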
+ const auto &tile_xo_plus_half = writer->declare_tile("xo_plus_half", ckw::DataType::Fp32);
+ const auto &tile_yo_plus_half = writer->declare_tile("yo_plus_half", ckw::DataType::Fp32);
+
+ writer->op_cast(tile_xo_plus_half, tile_xo, ckw::ConvertPolicy::None);
+ writer->op_cast(tile_yo_plus_half, tile_yo, ckw::ConvertPolicy::None);
+ writer->op_binary(tile_xo_plus_half, ckw::BinaryOp::Add, tile_xo_plus_half, const_pos_0_5_fp);
+ writer->op_binary(tile_yo_plus_half, ckw::BinaryOp::Add, tile_yo_plus_half, const_pos_0_5_fp);
+ writer->op_binary(tile_xi_f, ckw::BinaryOp::Mul, tile_xo_plus_half, const_scale_x_fp);
+ writer->op_binary(tile_yi_f, ckw::BinaryOp::Mul, tile_yo_plus_half, const_scale_y_fp);
+
+ writer->op_binary(tile_xi_f, ckw::BinaryOp::Sub, tile_xi_f, const_pos_0_5_fp);
+ writer->op_binary(tile_yi_f, ckw::BinaryOp::Sub, tile_yi_f, const_pos_0_5_fp);
+ }
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported sampling policy");
+ }
+
+ // xi = (int)floor(xi_f);
+ // yi = (int)floor(yi_f);
+ auto tile_xi_f_floor = writer->declare_tile("xi_f_floor", ckw::DataType::Fp32);
+ auto tile_yi_f_floor = writer->declare_tile("yi_f_floor", ckw::DataType::Fp32);
+ writer->op_unary(tile_xi_f_floor, ckw::UnaryOp::Floor, tile_xi_f);
+ writer->op_unary(tile_yi_f_floor, ckw::UnaryOp::Floor, tile_yi_f);
+
+ auto tile_xi = writer->declare_tile("xi", ckw::DataType::Int32);
+ auto tile_yi = writer->declare_tile("yi", ckw::DataType::Int32);
+ writer->op_cast(tile_xi, tile_xi_f_floor, ckw::ConvertPolicy::None);
+ writer->op_cast(tile_yi, tile_yi_f_floor, ckw::ConvertPolicy::None);
+
+ // xi0 = clamp(xi, 0, (int)src_w - 1);
+ // yi0 = clamp(yi, 0, (int)src_h - 1);
+ // xi1 = clamp(xi + 1, 0, (int)src_w - 1);
+ // yi1 = clamp(yi + 1, 0, (int)src_h - 1);
+ auto tile_src_w_minus_1 = writer->declare_tile("src_w_minus_1", ckw::DataType::Int32);
+ auto tile_src_h_minus_1 = writer->declare_tile("src_h_minus_1", ckw::DataType::Int32);
+ writer->op_binary(tile_src_w_minus_1, ckw::BinaryOp::Sub, const_src_w_i32, const_pos_1_i32);
+ writer->op_binary(tile_src_h_minus_1, ckw::BinaryOp::Sub, const_src_h_i32, const_pos_1_i32);
+
+ auto tile_xi_plus_1 = writer->declare_tile("xi_plus_1", ckw::DataType::Int32);
+ auto tile_yi_plus_1 = writer->declare_tile("yi_plus_1", ckw::DataType::Int32);
+ writer->op_binary(tile_xi_plus_1, ckw::BinaryOp::Add, tile_xi, const_pos_1_i32);
+ writer->op_binary(tile_yi_plus_1, ckw::BinaryOp::Add, tile_yi, const_pos_1_i32);
+
+ auto tile_xi0 = writer->declare_tile("xi0", ckw::DataType::Int32);
+ auto tile_yi0 = writer->declare_tile("yi0", ckw::DataType::Int32);
+ auto tile_xi1 = writer->declare_tile("xi1", ckw::DataType::Int32);
+ auto tile_yi1 = writer->declare_tile("yi1", ckw::DataType::Int32);
+
+ writer->op_ternary(tile_xi0, ckw::TernaryOp::Clamp, tile_xi, const_0_i32, tile_src_w_minus_1);
+ writer->op_ternary(tile_yi0, ckw::TernaryOp::Clamp, tile_yi, const_0_i32, tile_src_h_minus_1);
+ writer->op_ternary(tile_xi1, ckw::TernaryOp::Clamp, tile_xi_plus_1, const_0_i32, tile_src_w_minus_1);
+ writer->op_ternary(tile_yi1, ckw::TernaryOp::Clamp, tile_yi_plus_1, const_0_i32, tile_src_h_minus_1);
+
+ auto tile_in00 = writer->declare_tile("in00", ckw::TileInfo(dst_dt, 1, dst_n0));
+ auto tile_in01 = writer->declare_tile("in01", ckw::TileInfo(dst_dt, 1, dst_n0));
+ auto tile_in10 = writer->declare_tile("in10", ckw::TileInfo(dst_dt, 1, dst_n0));
+ auto tile_in11 = writer->declare_tile("in11", ckw::TileInfo(dst_dt, 1, dst_n0));
+
+ writer->op_load(tile_in00, src->tensor(), sampler_src, tile_co, tile_xi0, tile_yi0, tile_bo);
+ writer->op_load(tile_in01, src->tensor(), sampler_src, tile_co, tile_xi1, tile_yi0, tile_bo);
+ writer->op_load(tile_in10, src->tensor(), sampler_src, tile_co, tile_xi0, tile_yi1, tile_bo);
+ writer->op_load(tile_in11, src->tensor(), sampler_src, tile_co, tile_xi1, tile_yi1, tile_bo);
+
+ // Weights of each nearest pixel
+ auto tile_a = writer->declare_tile("a", ckw::DataType::Fp32);
+ auto tile_b = writer->declare_tile("b", ckw::DataType::Fp32);
+ auto tile_a1 = writer->declare_tile("a1", ckw::DataType::Fp32);
+ auto tile_b1 = writer->declare_tile("b1", ckw::DataType::Fp32);
+
+ // a = (xi_f - (float)xi)
+ // b = (1.f - a)
+ // a1 = (yi_f - (float)yi)
+ // b1 = (1.f - a1)
+ auto tile_xi_float = writer->declare_tile("xi_float", ckw::DataType::Fp32);
+ auto tile_yi_float = writer->declare_tile("yi_float", ckw::DataType::Fp32);
+ writer->op_cast(tile_xi_float, tile_xi, ckw::ConvertPolicy::None);
+ writer->op_cast(tile_yi_float, tile_yi, ckw::ConvertPolicy::None);
+
+ writer->op_binary(tile_a, ckw::BinaryOp::Sub, tile_xi_f, tile_xi_float);
+ writer->op_binary(tile_b, ckw::BinaryOp::Sub, const_pos_1_fp, tile_a);
+ writer->op_binary(tile_a1, ckw::BinaryOp::Sub, tile_yi_f, tile_yi_float);
+ writer->op_binary(tile_b1, ckw::BinaryOp::Sub, const_pos_1_fp, tile_a1);
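+    // The blend below computes dst = b * b1 * in00 + a * b1 * in01 + b * a1 * in10 + a * a1 * in11; the four weights sum to 1.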
+
+ // Cast weights to source type
+ const auto &tile_a_src_type = writer->declare_tile("a_src_t", to_ckw(_src->data_type()));
+ const auto &tile_b_src_type = writer->declare_tile("b_src_t", to_ckw(_src->data_type()));
+ const auto &tile_a1_src_type = writer->declare_tile("a1_src_t", to_ckw(_src->data_type()));
+ const auto &tile_b1_src_type = writer->declare_tile("b1_src_t", to_ckw(_src->data_type()));
+
+ writer->op_cast(tile_a_src_type, tile_a, ckw::ConvertPolicy::None);
+ writer->op_cast(tile_b_src_type, tile_b, ckw::ConvertPolicy::None);
+ writer->op_cast(tile_a1_src_type, tile_a1, ckw::ConvertPolicy::None);
+ writer->op_cast(tile_b1_src_type, tile_b1, ckw::ConvertPolicy::None);
+
+ // in00 * b * b1
+ writer->op_binary(tile_in00, ckw::BinaryOp::Mul, tile_in00, tile_b_src_type);
+ writer->op_binary(tile_in00, ckw::BinaryOp::Mul, tile_in00, tile_b1_src_type);
+
+ // in01 * a * b1
+ writer->op_binary(tile_in01, ckw::BinaryOp::Mul, tile_in01, tile_a_src_type);
+ writer->op_binary(tile_in01, ckw::BinaryOp::Mul, tile_in01, tile_b1_src_type);
+
+ // in10 * b * a1
+ writer->op_binary(tile_in10, ckw::BinaryOp::Mul, tile_in10, tile_b_src_type);
+ writer->op_binary(tile_in10, ckw::BinaryOp::Mul, tile_in10, tile_a1_src_type);
+
+ // in11 * a * a1
+ writer->op_binary(tile_in11, ckw::BinaryOp::Mul, tile_in11, tile_a_src_type);
+ writer->op_binary(tile_in11, ckw::BinaryOp::Mul, tile_in11, tile_a1_src_type);
+
+ // Summation of above terms
+ writer->op_assign(tile_dst, tile_in00);
+ writer->op_binary(tile_dst, ckw::BinaryOp::Add, tile_dst, tile_in01);
+ writer->op_binary(tile_dst, ckw::BinaryOp::Add, tile_dst, tile_in10);
+ writer->op_binary(tile_dst, ckw::BinaryOp::Add, tile_dst, tile_in11);
+}
+
+void GpuCkwResize::write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const
+{
+ switch (_attributes.interpolation_policy())
+ {
+ case InterpolationPolicy::NEAREST_NEIGHBOR:
+ do_nearest_neighbor_resize(comp_group, vtable, writer);
+ break;
+ case InterpolationPolicy::BILINEAR:
+ do_bilinear_resize(comp_group, vtable, writer);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported interpolation policy");
+ }
+}
+
+Window GpuCkwResize::get_window() const
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
+
+ const uint32_t n0 = adjust_vec_size(opencl_vector_size_in_bytes / _src->element_size(), _src->dimension(0));
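+    // As an illustration, assuming the usual 16-byte OpenCL vector size, an F16 source gives n0 = 8 unless dimension 0 is smaller.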
+ Window win = calculate_max_window(*_dst, Steps(n0));
+ return win.collapse(win, Window::DimZ);
+}
+
+std::string GpuCkwResize::get_tuner_id(const ComponentGroup &comp_group) const
+{
+ ARM_COMPUTE_UNUSED(comp_group);
+
+ std::string tuner_id = "resize_";
+ tuner_id += _attributes.interpolation_policy() == InterpolationPolicy::NEAREST_NEIGHBOR ? "nearest_neighbor" : "";
+ tuner_id += _attributes.interpolation_policy() == InterpolationPolicy::BILINEAR ? "bilinear" : "";
+ tuner_id += "_";
+ tuner_id += _attributes.sampling_policy() == SamplingPolicy::CENTER ? "center" : "topleft";
+ tuner_id += "_";
+ tuner_id += support::cpp11::to_string(_dst->dimension(0));
+ tuner_id += "_";
+ tuner_id += support::cpp11::to_string(_dst->dimension(1));
+ tuner_id += "_";
+ tuner_id += support::cpp11::to_string(_dst->dimension(2));
+ tuner_id += "_";
+ tuner_id += support::cpp11::to_string(_dst->dimension(3));
+
+ return tuner_id;
+}
+
+std::string GpuCkwResize::get_name(const ComponentGroup &comp_group) const
+{
+ ARM_COMPUTE_UNUSED(comp_group);
+
+ std::string name = "resize_";
+ name += _attributes.interpolation_policy() == InterpolationPolicy::NEAREST_NEIGHBOR ? "nearest_neighbor" : "";
+ name += _attributes.interpolation_policy() == InterpolationPolicy::BILINEAR ? "bilinear" : "";
+
+ return name;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.h
new file mode 100644
index 0000000000..1266c05921
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWRESIZE_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWRESIZE_H
+
+#include "src/core/common/Macros.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class GpuCkwResize final : public IGpuCkwComponentDriver
+{
+public:
+ using Attributes = ClComponentResize::Attributes;
+
+public:
+ /** Constructor
+ *
+ * @param[in] id Component id
+     * @param[in] tensors    Tensor arguments to the component
+ * @param[in] attributes Component attributes
+ */
+ GpuCkwResize(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes);
+
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwResize);
+
+ /** Destructor */
+ ~GpuCkwResize() override = default;
+
+    // Inherited methods overridden
+ virtual void write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const override;
+ Window get_window() const override;
+ std::string get_name(const ComponentGroup &comp_group) const override;
+ std::string get_tuner_id(const ComponentGroup &comp_group) const override;
+
+private:
+ /** Resize using nearest neighbor interpolation
+ *
+     * @param[in]      comp_group Component group to which this component belongs
+ * @param[in, out] vtable Table of variables declared by this component
+ * @param[in, out] writer CKW writer that writes code scoped to this kernel component
+ */
+ void do_nearest_neighbor_resize(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const;
+
+ /** Resize using bilinear interpolation
+ *
+     * @param[in]      comp_group Component group to which this component belongs
+ * @param[in, out] vtable Table of variables declared by this component
+ * @param[in, out] writer CKW writer that writes code scoped to this kernel component
+ */
+ void do_bilinear_resize(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const;
+
+ const ITensorInfo *_src;
+ const ITensorInfo *_dst;
+ Attributes _attributes;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWRESIZE_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp
new file mode 100644
index 0000000000..d9d741fea5
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "GpuCkwStore.h"
+
+#include "arm_compute/core/Error.h"
+
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
+
+#include <cstdint>
+#include <string>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+GpuCkwStore::GpuCkwStore(ComponentId id, const ArgumentPack<ITensorInfo> &tensors)
+ : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{}
+{
+ _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
+ _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
+}
+void GpuCkwStore::write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const
+{
+ /********************************************************************************
+ * 1 - Define tensors
+ ********************************************************************************/
+ GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src");
+ GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
+
+ /********************************************************************************
+ * 2 - Define CKW constants
+ ********************************************************************************/
+ const auto dst_h = static_cast<int32_t>(_dst->dimension(2));
+
+ auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32));
+ auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32));
+ auto const_dst_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_h}}, ckw::DataType::Int32));
+
+ /********************************************************************************
+ * 3 - Define the compute block parameters and destination tile (if not root component)
+ * Bind the tile to the tensor to share it among different components and
+ * initialize the compute block parameters
+ ********************************************************************************/
+ const auto &tile_src = src->tile();
+ auto &sampler_src = src->tensor_sampler();
+
+ const auto dst_n0 = static_cast<int32_t>(tile_src.tile_info().width());
+ const auto dst_m0 = static_cast<int32_t>(tile_src.tile_info().height());
+ const int32_t dst_n0_partial = _dst->dimension(0) % dst_n0;
+ const int32_t dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0;
+
+ /********************************************************************************
+ * 4 - Define the compute block parameters CKW constants
+ ********************************************************************************/
+ auto const_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32));
+ auto const_m0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_m0}}, ckw::DataType::Int32));
+ auto const_shift_back_n0_i32 =
+ writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32));
+
+ /********************************************************************************
+ * 5 - Define the samplers for the input tensor
+ ********************************************************************************/
+ // Not required
+
+ /********************************************************************************
+ * 6 - Extra operations required before writing the main code
+ ********************************************************************************/
+ // Not required
+
+ /********************************************************************************
+ * 7 - Get the coordinates of the destination tile
+ ********************************************************************************/
+ auto tile_gid_0 = writer->declare_tile("gid_0", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_1 = writer->declare_tile("gid_1", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_2 = writer->declare_tile("gid_2", ckw::TileInfo(ckw::DataType::Int32));
+
+ writer->op_get_global_id(tile_gid_0, 0);
+ writer->op_get_global_id(tile_gid_1, 1);
+ writer->op_get_global_id(tile_gid_2, 2);
+
+ auto tile_nout0 = writer->declare_tile("cout0", ckw::TileInfo(ckw::DataType::Int32)); // OFM
+ auto tile_mout0 = writer->declare_tile("mout0", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH or WIDTH x HEIGHT
+ auto tile_mout1 = writer->declare_tile("mout1", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT or 0
+ auto tile_bout0 = writer->declare_tile("bout0", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX
+
+ // Calculate coordinates
+ get_coordinate_from_gws_overlapping_min(writer, tile_nout0, tile_gid_0, const_n0_i32, const_shift_back_n0_i32,
+ const_0_i32);
+ get_coordinate_from_gws(writer, tile_mout0, tile_gid_1, const_m0_i32);
+
+ // Get the boundary aware coordinates at each global dimension index
+ if (sampler_src.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1)
+ {
+ writer->op_assign(tile_mout1, const_0_i32);
+ get_coordinate_from_gws(writer, tile_bout0, tile_gid_2, const_pos_1_i32);
+ }
+ else if (sampler_src.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2)
+ {
+        // For tile_mout1 and tile_bout0 the step can only be 1.
+        // gid_2 packs HEIGHT and BATCH together, so the row index is gid_2 % dst_h and the batch index is gid_2 / dst_h.
+ writer->op_binary(tile_mout1, ckw::BinaryOp::Mod, tile_gid_2, const_dst_h_i32);
+ writer->op_binary(tile_bout0, ckw::BinaryOp::Div, tile_gid_2, const_dst_h_i32);
+ }
+
+ /********************************************************************************
+ * 8 - Write the rest of the code
+ ********************************************************************************/
+ writer->op_store(dst->tensor(), tile_src, sampler_src, tile_nout0, tile_mout0, tile_mout1, tile_bout0);
+}
+
+std::string GpuCkwStore::get_name(const ComponentGroup &comp_group) const
+{
+ ARM_COMPUTE_UNUSED(comp_group);
+ return "store";
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h
new file mode 100644
index 0000000000..c9ce7eb269
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWSTORE_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWSTORE_H
+
+#include "src/core/common/Macros.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class GpuCkwStore : public IGpuCkwComponentDriver
+{
+public:
+ /** Constructor
+ *
+ * @param[in] id Component id
+ * @param[in] tensors Tensor arguments to the component
+ */
+ GpuCkwStore(ComponentId id, const ArgumentPack<ITensorInfo> &tensors);
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwStore);
+ /** Destructor */
+ ~GpuCkwStore() override = default;
+    // Inherited methods overridden:
+ virtual void write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const override;
+ std::string get_name(const ComponentGroup &comp_group) const override;
+
+private:
+ const ITensorInfo *_src;
+ const ITensorInfo *_dst;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWSTORE_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.cpp
new file mode 100644
index 0000000000..1e6f0841ad
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.cpp
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "CkwHelper.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+void get_coordinate_from_gws(GpuCkwScopedKernelWriter writer,
+ ckw::TileOperand &coord,
+ const ckw::TileOperand &gid,
+ ckw::TileOperand &step)
+{
+ writer->op_binary(coord, ckw::BinaryOp::Mul, gid, step);
+}
+
+void get_coordinate_from_gws_overlapping_min(GpuCkwScopedKernelWriter writer,
+ ckw::TileOperand &coord,
+ const ckw::TileOperand &gid,
+ ckw::TileOperand &step,
+ ckw::TileOperand &shift_back,
+ ckw::TileOperand &const_0)
+{
+ // Applied formula: max((gid * step) - shift_back, 0)
+ // where the shift_back operand is: (step - leftover_step) % step
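+    // Example: 10 elements processed with a step of 4 give leftover_step = 2 and shift_back = 2, so gids 0, 1, 2 map to
+    // coordinates 0, 2 and 6; the last block overlaps the previous one instead of running out of bounds.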
+
+ writer->op_binary(coord, ckw::BinaryOp::Mul, gid, step);
+ writer->op_binary(coord, ckw::BinaryOp::Sub, coord, shift_back);
+ writer->op_binary(coord, ckw::BinaryOp::Max, coord, const_0);
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h
new file mode 100644
index 0000000000..956e7c8ecb
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_CKWHELPER_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_CKWHELPER_H
+
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Get coordinate along one axis.
+ *
+ * @param[in,out] writer Writer
+ * @param[out] coord Resultant coordinate
+ * @param[in] gid Global work item id
+ * @param[in] step Step size / vector size
+ */
+void get_coordinate_from_gws(GpuCkwScopedKernelWriter writer,
+ ckw::TileOperand &coord,
+ const ckw::TileOperand &gid,
+ ckw::TileOperand &step);
+
+/** Get boundary aware coordinate along one axis.
+ *
+ * @param[in,out] writer Writer
+ * @param[out] coord Resultant coordinate
+ * @param[in] gid Global work item id
+ * @param[in] step Step size / vector size
+ * @param[in]     shift_back Shift-back amount, equal to (step - leftover_step) % step
+ * @param[in] const_0 Constant tile of value 0
+ */
+void get_coordinate_from_gws_overlapping_min(GpuCkwScopedKernelWriter writer,
+ ckw::TileOperand &coord,
+ const ckw::TileOperand &gid,
+ ckw::TileOperand &step,
+ ckw::TileOperand &shift_back,
+ ckw::TileOperand &const_0);
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_CKWHELPER_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.cpp
new file mode 100644
index 0000000000..ad31b06362
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.cpp
@@ -0,0 +1,162 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Common.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+ckw::DataType to_ckw(DataType dt)
+{
+ switch (dt)
+ {
+ case DataType::F32:
+ return ckw::DataType::Fp32;
+ case DataType::F16:
+ return ckw::DataType::Fp16;
+ case DataType::S32:
+ return ckw::DataType::Int32;
+ case DataType::S16:
+ return ckw::DataType::Int16;
+ case DataType::S8:
+ case DataType::QASYMM8_SIGNED:
+ return ckw::DataType::Int8;
+ case DataType::U32:
+ return ckw::DataType::Uint32;
+ case DataType::U16:
+ return ckw::DataType::Uint16;
+ case DataType::U8:
+ case DataType::QASYMM8:
+ return ckw::DataType::Uint8;
+ default:
+ return ckw::DataType::Unknown;
+ }
+}
+
+ckw::TensorShape to_ckw(const TensorShape &shape)
+{
+ ARM_COMPUTE_ERROR_ON(shape.num_max_dimensions < std::tuple_size<ckw::TensorShape>{});
+ ARM_COMPUTE_ERROR_ON(std::tuple_size<ckw::TensorShape>{} != 5);
+ /// NOTE: Overflow danger. Use size_t?
+ return ckw::TensorShape{static_cast<int32_t>(shape[0]), static_cast<int32_t>(shape[1]),
+ static_cast<int32_t>(shape[2]), static_cast<int32_t>(shape[3]),
+ static_cast<int32_t>(shape[4])};
+}
+
+ckw::TensorDataLayout to_ckw(DataLayout dl)
+{
+ switch (dl)
+ {
+ case DataLayout::NHWC:
+ return ckw::TensorDataLayout::Nhwc;
+ case DataLayout::NDHWC:
+ return ckw::TensorDataLayout::Ndhwc;
+ default:
+ return ckw::TensorDataLayout::Unknown;
+ }
+}
+
+ckw::TensorInfo to_ckw(const ITensorInfo &tensor_info)
+{
+ return ckw::TensorInfo{to_ckw(tensor_info.data_type()), to_ckw(tensor_info.tensor_shape()),
+ to_ckw(tensor_info.data_layout()), tensor_info.id()};
+}
+
+ckw::TensorStorageType to_ckw(const TensorStorageType &storage)
+{
+ switch (storage)
+ {
+ case TensorStorageType::ClBufferUint8Ptr:
+ return ckw::TensorStorageType::BufferUint8Ptr;
+ case TensorStorageType::ClImage2dReadOnly:
+ return ckw::TensorStorageType::Texture2dReadOnly;
+ case TensorStorageType::ClImage2dWriteOnly:
+ return ckw::TensorStorageType::Texture2dWriteOnly;
+ case TensorStorageType::Unknown:
+ return ckw::TensorStorageType::Unknown;
+ default:
+ ARM_COMPUTE_ERROR("Unknown tensor storage type");
+ }
+}
+
+TensorComponentType from_ckw(const ckw::TensorComponentType &component)
+{
+ switch (component)
+ {
+ case ckw::TensorComponentType::OffsetFirstElement:
+ return TensorComponentType::OffsetFirstElement;
+ case ckw::TensorComponentType::Stride0:
+ return TensorComponentType::Stride0;
+ case ckw::TensorComponentType::Stride1:
+ return TensorComponentType::Stride1;
+ case ckw::TensorComponentType::Stride2:
+ return TensorComponentType::Stride2;
+ case ckw::TensorComponentType::Stride3:
+ return TensorComponentType::Stride3;
+ case ckw::TensorComponentType::Stride4:
+ return TensorComponentType::Stride4;
+ case ckw::TensorComponentType::Dim0:
+ return TensorComponentType::Dim0;
+ case ckw::TensorComponentType::Dim1:
+ return TensorComponentType::Dim1;
+ case ckw::TensorComponentType::Dim2:
+ return TensorComponentType::Dim2;
+ case ckw::TensorComponentType::Dim3:
+ return TensorComponentType::Dim3;
+ case ckw::TensorComponentType::Dim4:
+ return TensorComponentType::Dim4;
+ case ckw::TensorComponentType::Dim1xDim2:
+ return TensorComponentType::Dim1xDim2;
+ case ckw::TensorComponentType::Dim2xDim3:
+ return TensorComponentType::Dim2xDim3;
+ case ckw::TensorComponentType::Dim1xDim2xDim3:
+ return TensorComponentType::Dim1xDim2xDim3;
+ case ckw::TensorComponentType::Unknown:
+ return TensorComponentType::Unknown;
+ default:
+ ARM_COMPUTE_ERROR("Unknown CKW tensor component");
+ }
+}
+
+TensorStorageType from_ckw(const ckw::TensorStorageType &storage)
+{
+ switch (storage)
+ {
+ case ckw::TensorStorageType::BufferUint8Ptr:
+ return TensorStorageType::ClBufferUint8Ptr;
+ case ckw::TensorStorageType::Texture2dReadOnly:
+ return TensorStorageType::ClImage2dReadOnly;
+ case ckw::TensorStorageType::Texture2dWriteOnly:
+ return TensorStorageType::ClImage2dWriteOnly;
+ case ckw::TensorStorageType::Unknown:
+ return TensorStorageType::Unknown;
+ default:
+ ARM_COMPUTE_ERROR("Unknown CKW tensor storage type");
+ }
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h
new file mode 100644
index 0000000000..26740cdd04
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_COMMON_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_COMMON_H
+
+#include "arm_compute/core/CoreTypes.h"
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/TensorShape.h"
+
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+
+#include "compute_kernel_writer/include/ckw/TensorInfo.h"
+#include "compute_kernel_writer/include/ckw/types/DataType.h"
+#include "compute_kernel_writer/include/ckw/types/TensorComponentType.h"
+#include "compute_kernel_writer/include/ckw/types/TensorStorageType.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Convert the Compute Library data type to Compute Kernel Writer data type
+ *
+ * @param[in] dt The Compute Library data type
+ *
+ * @return the Compute Kernel Writer data type (ckw::DataType)
+ */
+ckw::DataType to_ckw(DataType dt);
+
+/** Convert the Compute Library tensor shape to Compute Kernel Writer tensor shape
+ *
+ * @param[in] shape The Compute Library tensor shape
+ *
+ * @return the Compute Kernel Writer tensor shape (ckw::TensorShape)
+ */
+ckw::TensorShape to_ckw(const TensorShape &shape);
+
+/** Convert the Compute Library data layout to Compute Kernel Writer data layout
+ *
+ * @param[in] dl The Compute Library data layout
+ *
+ * @return the Compute Kernel Writer data layout (ckw::TensorDataLayout)
+ */
+ckw::TensorDataLayout to_ckw(DataLayout dl);
+
+/** Convert the Compute Library tensor info to Compute Kernel Writer tensor info
+ *
+ * @param[in] tensor_info The Compute Library tensor info
+ *
+ * @return the Compute Kernel Writer tensor info (ckw::TensorInfo)
+ */
+ckw::TensorInfo to_ckw(const ITensorInfo &tensor_info);
+
+/** Convert the Compute Library tensor storage to Compute Kernel Writer tensor storage
+ *
+ * @param[in] storage The Compute Library tensor storage
+ *
+ * @return the Compute Kernel Writer tensor storage (ckw::TensorStorageType)
+ */
+ckw::TensorStorageType to_ckw(const TensorStorageType &storage);
+
+/** Convert the Compute Kernel Writer tensor component to Compute Library tensor component
+ *
+ * @param[in] component The Compute Kernel Writer tensor component
+ *
+ * @return the Compute Library tensor component
+ */
+TensorComponentType from_ckw(const ckw::TensorComponentType &component);
+
+/** Convert the Compute Kernel Writer tensor storage to Compute Library tensor storage
+ *
+ * @param[in] storage The Compute Kernel Writer tensor storage
+ *
+ * @return the Compute Library tensor storage
+ */
+TensorStorageType from_ckw(const ckw::TensorStorageType &storage);
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_COMMON_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.cpp
new file mode 100644
index 0000000000..5630e390d5
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h"
+
+#include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+ckw::BinaryOp to_ckw(const ElementwiseBinaryCommonAttributes &attributes)
+{
+ switch (attributes.operation())
+ {
+ case ElementwiseBinaryCommonAttributes::ElementwiseOp::Add:
+ return ckw::BinaryOp::Add;
+ case ElementwiseBinaryCommonAttributes::ElementwiseOp::Sub:
+ return ckw::BinaryOp::Sub;
+ case ElementwiseBinaryCommonAttributes::ElementwiseOp::Div:
+ return ckw::BinaryOp::Div;
+ case ElementwiseBinaryCommonAttributes::ElementwiseOp::Mul:
+ return ckw::BinaryOp::Mul;
+ case ElementwiseBinaryCommonAttributes::ElementwiseOp::Min:
+ case ElementwiseBinaryCommonAttributes::ElementwiseOp::Max:
+ case ElementwiseBinaryCommonAttributes::ElementwiseOp::Power:
+ case ElementwiseBinaryCommonAttributes::ElementwiseOp::Prelu:
+ case ElementwiseBinaryCommonAttributes::ElementwiseOp::SquaredDiff:
+ default:
+ ARM_COMPUTE_ERROR("Cannot convert ElementwiseBinaryCommonAttributes to corresponding ckw::BinaryOp");
+ }
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h
new file mode 100644
index 0000000000..644a407702
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_ELEMENTWISEBINARY_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_ELEMENTWISEBINARY_H
+
+#include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h"
+
+#include "compute_kernel_writer/include/ckw/types/Operators.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+ckw::BinaryOp to_ckw(const ElementwiseBinaryCommonAttributes &attributes);
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_ELEMENTWISEBINARY_H
diff --git a/src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h b/src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h
new file mode 100644
index 0000000000..ee109a7e2b
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_GPUKERNELCOMPONENTFACTORY
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_GPUKERNELCOMPONENTFACTORY
+
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+
+#include "Types.h"
+#include <memory>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Factory class that creates new instances of @ref IGpuKernelComponent by assigning new component ids
+ */
+class GpuKernelComponentFactory
+{
+public:
+ /** Create a new kernel component
+ *
+ * @tparam T Any polymorphic type descending from @ref IGpuKernelComponent
+ * @tparam Args Argument types to construct the kernel component
+ *
+ * @param[in] args Arguments to construct the kernel component
+ *
+ * @return std::unique_ptr<IGpuKernelComponent>
+ */
+ template <typename T, typename... Args>
+ std::unique_ptr<IGpuKernelComponent> create(Args &&...args)
+ {
+ return std::make_unique<T>(_count++, std::forward<Args>(args)...);
+ }
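+
+    // Usage sketch (illustrative only; MyComponent stands for any concrete IGpuKernelComponent):
+    //   GpuKernelComponentFactory factory;
+    //   auto comp0 = factory.create<MyComponent>(/* ctor arguments after the id */); // assigned id 0
+    //   auto comp1 = factory.create<MyComponent>(/* ctor arguments after the id */); // assigned id 1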
+
+private:
+ ComponentId _count{0};
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_GPUKERNELCOMPONENTFACTORY */
diff --git a/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h b/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h
new file mode 100644
index 0000000000..6678c929e9
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_IGPUKERNELCOMPONENT_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_IGPUKERNELCOMPONENT_H
+
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h"
+
+#include "Types.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Properties common to all kernel component types */
+class KernelProperties
+{
+public:
+ KernelProperties &stage(const UnitWorkloadStage &stage)
+ {
+ _stage = stage;
+ return *this;
+ }
+ UnitWorkloadStage stage() const
+ {
+ return _stage;
+ }
+
+private:
+ UnitWorkloadStage _stage{};
+};
+
+inline bool operator==(const KernelProperties &config0, const KernelProperties &config1)
+{
+ return config0.stage() == config1.stage();
+}
+
+/** Forward declaration */
+class IGpuTemplateComponentWriter;
+class IGpuCkwComponentDriver;
+
+/** An abstract interface of a kernel component. It allows the component graph to manipulate components, for example when performing fusion.
+ */
+class IGpuKernelComponent
+{
+public:
+ using Properties = KernelProperties;
+
+public:
+ /** Constructor
+ *
+ * @param[in] id Component id
+ * @param[in] properties Kernel component properties
+     * @param[in] tensors    Tensor arguments to the component
+ */
+ IGpuKernelComponent(ComponentId id, const Properties &properties, const ArgumentPack<ITensorInfo> &tensors)
+ : _id{id}, _properties{properties}, _tensors{tensors}
+ {
+ }
+ /** Destructor */
+ virtual ~IGpuKernelComponent()
+ {
+ }
+ /** Get component id */
+ ComponentId id() const
+ {
+ return _id;
+ }
+ /** Get tensor arguments */
+ ArgumentPack<ITensorInfo> tensors() const
+ {
+ return _tensors;
+ }
+ /** Get properties */
+ Properties properties() const
+ {
+ return _properties;
+ }
+ /** Get writer for the component */
+ virtual const IGpuCkwComponentDriver *ckw_component_driver() const
+ {
+ return nullptr;
+ }
+ /** Get component type */
+ virtual GpuComponentType type() const = 0;
+
+private:
+ ComponentId _id{-1};
+ Properties _properties{};
+ ArgumentPack<ITensorInfo> _tensors{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_IGPUKERNELCOMPONENT_H
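A minimal usage sketch of the KernelProperties fluent setter and the stage-based operator== defined above (not part of this patch; it assumes the headers introduced here are on the include path and that UnitWorkloadStage, pulled in from GpuWorkloadSourceCode.h, is default-constructible and equality-comparable):

#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"

using namespace arm_compute::experimental::dynamic_fusion;

void kernel_properties_sketch()
{
    // Build properties with the fluent setter; the stage defaults to UnitWorkloadStage{}.
    const KernelProperties p0 = KernelProperties().stage(UnitWorkloadStage{});
    const KernelProperties p1{};

    // operator== above compares the stage only, so these two compare equal.
    const bool same = (p0 == p1);
    (void)same;
}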
diff --git a/src/dynamic_fusion/sketch/gpu/components/Types.h b/src/dynamic_fusion/sketch/gpu/components/Types.h
new file mode 100644
index 0000000000..54b3a69057
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/Types.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_TYPES
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_TYPES
+
+#include <cstdint>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Uniquely identifies a kernel component within a workload
+ */
+using ComponentId = int32_t;
+
+/** Component type in the context of fusion
+ * Its main purpose is to inform the optimizer how to perform fusion.
+ */
+enum class GpuComponentType
+{
+ Complex,
+ Simple,
+ Unfusable,
+ Output
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_TYPES */
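GpuComponentType is the only fusion-related classification a component carries; the actual fusion rules live elsewhere in the component group logic. The helper below is a purely illustrative sketch (not part of this patch) that simply maps each enumerator to a printable name:

#include "src/dynamic_fusion/sketch/gpu/components/Types.h"

// Illustrative helper only: name each GpuComponentType enumerator.
inline const char *to_string(arm_compute::experimental::dynamic_fusion::GpuComponentType type)
{
    using arm_compute::experimental::dynamic_fusion::GpuComponentType;
    switch (type)
    {
        case GpuComponentType::Complex:
            return "Complex";
        case GpuComponentType::Simple:
            return "Simple";
        case GpuComponentType::Unfusable:
            return "Unfusable";
        case GpuComponentType::Output:
            return "Output";
        default:
            return "Unknown";
    }
}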
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.cpp
new file mode 100644
index 0000000000..e316bdf46d
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.cpp
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ClComponentActivation.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+Status ClComponentActivation::validate(const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes)
+{
+ ARM_COMPUTE_UNUSED(properties, attributes);
+
+ const ITensorInfo *const src = tensors.get_const_tensor(TensorType::ACL_SRC);
+ const ITensorInfo *const dst = tensors.get_const_tensor(TensorType::ACL_DST);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
+
+ // All tensor infos are initialized
+ ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
+
+ // Device requirements are met
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(dst);
+
+ return Status{};
+}
+
+ClComponentActivation::ClComponentActivation(ComponentId id,
+ const IGpuKernelComponent::Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes)
+ : IGpuKernelComponent{id, properties, tensors},
+ _component_writer{std::make_unique<GpuCkwActivation>(id, tensors, attributes)}
+{
+}
+
+ClComponentActivation::~ClComponentActivation()
+{
+}
+
+const IGpuCkwComponentDriver *ClComponentActivation::ckw_component_driver() const
+{
+ return _component_writer.get();
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
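A minimal sketch of how a caller might drive ClComponentActivation::validate() (not part of this patch). It assumes ArgumentPack<ITensorInfo>, declared in ArgumentPack.h of this series, exposes an add_const_tensor(TensorType, const ITensorInfo *) method; the tensor shapes and the RELU attribute are illustrative:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"

#include "src/dynamic_fusion/sketch/ArgumentPack.h"
#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h"

using namespace arm_compute;
using namespace arm_compute::experimental::dynamic_fusion;

Status validate_relu_sketch()
{
    // Activation requires matching shape, data type and layout between src and dst.
    const TensorInfo src(TensorShape(16U, 8U, 4U), 1, DataType::F32);
    const TensorInfo dst(TensorShape(16U, 8U, 4U), 1, DataType::F32);

    ArgumentPack<ITensorInfo> tensors{};
    tensors.add_const_tensor(TensorType::ACL_SRC, &src); // assumed ArgumentPack API
    tensors.add_const_tensor(TensorType::ACL_DST, &dst);

    const ClComponentActivation::Attributes attributes =
        ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU);

    return ClComponentActivation::validate(KernelProperties{}, tensors, attributes);
}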
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h
new file mode 100644
index 0000000000..b8185158f3
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTACTIVATION_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTACTIVATION_H
+
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+
+namespace arm_compute
+{
+/** Forward declaration */
+class ITensorInfo;
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+template <typename T>
+class ArgumentPack;
+
+/** Forward declaration */
+class GpuCkwActivation;
+
+class ClComponentActivation final : public IGpuKernelComponent
+{
+public:
+ /** Attributes are a set of backend-agnostic parameters that define what a component does */
+ using Attributes = ActivationLayerInfo;
+
+ /** Validate the component
+ *
+ * @param[in] properties Component properties @ref Properties
+ * @param[in,out] tensors Tensor arguments to the component
+ * @param[in] attributes Component attributes @ref Attributes
+ *
+ * @return Status Validation results
+ *
+ * Tensor argument names:
+ * - ACL_SRC: Input
+ * - ACL_DST: Output
+ *
+ * Tensor argument constness:
+ * - ACL_SRC: Const
+ * - ACL_DST: Const
+ *
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |ACL_SRC |ACL_DST |
+ * |:--------------|:--------------|
+ * |F16 |F16 |
+ * |F32 |F32 |
+ */
+ static Status
+ validate(const Properties &properties, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes);
+
+ /** Constructor
+ *
+ * Similar to @ref ClComponentActivation::validate()
+ */
+ ClComponentActivation(ComponentId id,
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes);
+
+ /** Destructor */
+ ~ClComponentActivation() override;
+
+ /** Prevent instances of this class from being copy constructed */
+ ClComponentActivation(const ClComponentActivation &component) = delete;
+
+ /** Prevent instances of this class from being copied */
+ ClComponentActivation &operator=(const ClComponentActivation &component) = delete;
+
+ /** Allow instances of this class to be move constructed */
+ ClComponentActivation(ClComponentActivation &&component) = default;
+
+ /** Allow instances of this class to be moved */
+ ClComponentActivation &operator=(ClComponentActivation &&component) = default;
+
+ /** Get writer for the component */
+ const IGpuCkwComponentDriver *ckw_component_driver() const override;
+
+ /** Get component type */
+ GpuComponentType type() const override
+ {
+ return GpuComponentType::Simple;
+ }
+
+private:
+ std::unique_ptr<GpuCkwActivation> _component_writer;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTACTIVATION_H
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.cpp
new file mode 100644
index 0000000000..e1850d78c4
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.cpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ClComponentCast.h"
+
+#include "arm_compute/core/Error.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+Status ClComponentCast::validate(const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings)
+{
+ ARM_COMPUTE_UNUSED(properties, attributes, settings);
+
+ const ITensorInfo *src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ const ITensorInfo *dst = tensors.get_const_tensor(TensorType::ACL_DST_0);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON(src == dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == attributes.data_type(),
+ "input and target data types should be different");
+
+ // Validate in case of configured dst
+ if (dst->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != attributes.data_type(),
+ "dst and target data types should be same");
+ }
+
+ return Status{};
+}
+ClComponentCast::ClComponentCast(ComponentId id,
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings)
+ : IGpuKernelComponent{id, properties, tensors},
+ _component_writer{std::make_unique<GpuCkwCast>(id, tensors, attributes)}
+{
+ ARM_COMPUTE_UNUSED(attributes, settings);
+}
+
+ClComponentCast::~ClComponentCast()
+{
+}
+
+const IGpuCkwComponentDriver *ClComponentCast::ckw_component_driver() const
+{
+ return _component_writer.get();
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
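The type-related part of the validation above boils down to two rules: the source type must differ from the requested target type, and a pre-configured destination must already carry the target type. A standalone paraphrase (illustrative only, using arm_compute::DataType):

#include "arm_compute/core/Types.h"

// Illustrative restatement of the type rules enforced by ClComponentCast::validate above.
// dst_is_configured corresponds to dst->total_size() > 0 in the component code.
inline bool cast_type_rules_ok(arm_compute::DataType src_type,
                               arm_compute::DataType dst_type,
                               arm_compute::DataType target_type,
                               bool                  dst_is_configured)
{
    if (src_type == target_type)
    {
        return false; // input and target data types must be different
    }
    if (dst_is_configured && dst_type != target_type)
    {
        return false; // a configured dst must already have the target data type
    }
    return true;
}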
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h
new file mode 100644
index 0000000000..201dacc288
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTCAST_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTCAST_H
+
+#include "arm_compute/dynamic_fusion/sketch/attributes/CastAttributes.h"
+
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+
+namespace arm_compute
+{
+/** Forward declaration */
+class ITensorInfo;
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+template <typename T>
+class ArgumentPack;
+
+/** Component specific settings
+ */
+class ClComponentCastSettings
+{
+public:
+private:
+};
+
+/** Forward declaration */
+class GpuCkwCast;
+
+class ClComponentCast final : public IGpuKernelComponent
+{
+public:
+ /** Attributes are a set of backend-agnostic parameters that define what a component does */
+ using Attributes = CastAttributes;
+ /** Settings are a set of backend-specific parameters that influence the implementation of a component */
+ using Settings = ClComponentCastSettings;
+
+ /** Validate the component
+ *
+ * @param[in] properties Component properties @ref Properties
+ * @param[in,out] tensors Tensor arguments to the component
+ * @param[in] attributes Component attributes @ref Attributes
+ * @param[in] settings Component settings @ref Settings
+ *
+ * @return Status Validation results
+ *
+ * Tensor argument names:
+ * - ACL_SRC_0: Input
+ * - ACL_DST_0: Output
+ *
+ * Tensor argument constness:
+ * - ACL_SRC_0: Const
+ * - ACL_DST_0: Const
+ *
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |ACL_SRC_0 |ACL_DST_0 |
+ * |:--------------|:--------------------------------------|
+ * |U8 | S8, U16, S16, U32, S32, F16, F32 |
+ * |U16 | U8, S8, S16, U32, S32, F16, F32 |
+ * |S16 | U8, S8, U16, U32, S32, F16, F32 |
+ * |U32 | U8, S8, U16, S16, S32, F16, F32 |
+ * |S32 | U8, S8, U16, S16, U32, F16, F32 |
+ * |F16 | U8, S8, U16, S16, U32, S32, F32 |
+ * |F32 | U8, S8, U16, S16, U32, S32, F16 |
+ */
+ static Status validate(const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings);
+
+ /** Constructor
+ *
+ * Similar to @ref ClComponentCast::validate()
+ */
+ ClComponentCast(ComponentId id,
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings);
+
+ /** Destructor */
+ ~ClComponentCast() override;
+ /** Prevent instances of this class from being copy constructed */
+ ClComponentCast(const ClComponentCast &component) = delete;
+ /** Prevent instances of this class from being copied */
+ ClComponentCast &operator=(const ClComponentCast &component) = delete;
+ /** Allow instances of this class to be move constructed */
+ ClComponentCast(ClComponentCast &&component) = default;
+ /** Allow instances of this class to be moved */
+ ClComponentCast &operator=(ClComponentCast &&component) = default;
+ /** Get writer for the component */
+ const IGpuCkwComponentDriver *ckw_component_driver() const override;
+ /** Get component type */
+ GpuComponentType type() const override
+ {
+ return GpuComponentType::Simple;
+ }
+
+private:
+ std::unique_ptr<GpuCkwCast> _component_writer;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTCAST_H
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp
new file mode 100644
index 0000000000..7cd23d6115
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ClComponentDepthwiseConv2d.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+using Settings = ClComponentDepthwiseConv2dSettings;
+
+Settings &Settings::export_input_to_cl_image(bool cl_image)
+{
+ _export_input_to_cl_image = cl_image;
+ return *this;
+}
+
+bool Settings::export_input_to_cl_image() const
+{
+ return _export_input_to_cl_image;
+}
+
+Settings &Settings::export_weights_to_cl_image(bool cl_image)
+{
+ _export_weights_to_cl_image = cl_image;
+ return *this;
+}
+
+bool Settings::export_weights_to_cl_image() const
+{
+ return _export_weights_to_cl_image;
+}
+
+Settings &Settings::fast_relaxed_math(bool fast_relaxed_math)
+{
+ _fast_relaxed_math = fast_relaxed_math;
+ return *this;
+}
+
+bool Settings::fast_relaxed_math() const
+{
+ return _fast_relaxed_math;
+}
+
+Settings &Settings::is_fma_available(bool is_fma_available)
+{
+ _is_fma_available = is_fma_available;
+ return *this;
+}
+
+bool Settings::is_fma_available() const
+{
+ return _is_fma_available;
+}
+
+Settings &Settings::n0(unsigned int n0)
+{
+ _n0 = n0;
+ return *this;
+}
+
+unsigned int Settings::n0() const
+{
+ return _n0;
+}
+
+Settings &Settings::m0(unsigned int m0)
+{
+ _m0 = m0;
+ return *this;
+}
+
+unsigned int Settings::m0() const
+{
+ return _m0;
+}
+
+Status ClComponentDepthwiseConv2d::validate(const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings)
+{
+ ARM_COMPUTE_UNUSED(properties, settings);
+ const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ const auto wei = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ const auto bia = tensors.get_const_tensor(TensorType::ACL_SRC_2);
+ const auto dst = tensors.get_const_tensor(TensorType::ACL_DST_0);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, wei, dst);
+
+ // 1. Check validity
+ // Matching data type
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, wei);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ if (bia != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bia);
+ }
+
+ // Matching data layout
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, wei);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
+ if (bia != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, bia);
+ }
+
+ // All tensor infos are initialized
+ ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(wei->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
+ if (bia != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(bia->tensor_shape().total_size() == 0);
+ }
+ // Device requirements are met
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+ // wei shape is correct
+ const DataLayout data_layout = src->data_layout();
+ const size_t channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+ ARM_COMPUTE_RETURN_ERROR_ON(wei->dimension(channel_idx) !=
+ (src->dimension(channel_idx) * attributes.depth_multiplier()));
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(wei->num_dimensions() > 3, "Weights can be at most 3 dimensional");
+
+ // dst shape is correct
+ const PadStrideInfo pad_stride_info =
+ PadStrideInfo(attributes.stride().x(), attributes.stride().y(), attributes.pad().left, attributes.pad().right,
+ attributes.pad().top, attributes.pad().bottom, attributes.dimension_rounding_type());
+ const ConvolutionInfo conv_info{pad_stride_info, attributes.depth_multiplier(), ActivationLayerInfo(),
+ attributes.dilation()};
+ const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *wei, conv_info);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), output_shape);
+
+ // Check strides and dilation
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().first < 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().second < 1);
+ ARM_COMPUTE_RETURN_ERROR_ON((conv_info.dilation.x() < 1) || (conv_info.dilation.y() < 1));
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().first > 1 && settings.m0() != 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.dilation.x() > 1 && settings.m0() != 1);
+
+ if (conv_info.depth_multiplier > 1 && settings.n0() > 1)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON((conv_info.depth_multiplier % settings.n0()) != 0);
+ }
+
+ // Check export weights to cl image
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((settings.export_weights_to_cl_image() == true) &&
+ (export_to_cl_image(wei) == false),
+ "Weights cannot be exported to cl_image!");
+ ARM_COMPUTE_RETURN_ERROR_ON((settings.export_weights_to_cl_image() == true) && ((settings.n0() % 4) != 0));
+
+ ARM_COMPUTE_RETURN_ERROR_ON(wei->dimension(channel_idx) !=
+ (src->dimension(channel_idx) * conv_info.depth_multiplier));
+
+ // bia shape is correct
+ if (bia != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(bia->dimension(0) != output_shape[channel_idx],
+ "Biases size and number of dst feature maps should match");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(bia->num_dimensions() > 1, "Biases should be one dimensional");
+ }
+
+ // 2. Check support level
+ // Data type
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+ // Data layout
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC);
+ // Texture in the input tensor
+ ARM_COMPUTE_RETURN_ERROR_ON((settings.export_input_to_cl_image() == true));
+
+ return Status{};
+}
+
+ClComponentDepthwiseConv2d::ClComponentDepthwiseConv2d(ComponentId id,
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings)
+ : IGpuKernelComponent{id, properties, tensors},
+ _component_writer{std::make_unique<GpuCkwDepthwiseConv2d>(id, tensors, attributes, settings)}
+{
+ ARM_COMPUTE_UNUSED(attributes, settings);
+}
+ClComponentDepthwiseConv2d::~ClComponentDepthwiseConv2d()
+{
+}
+const IGpuCkwComponentDriver *ClComponentDepthwiseConv2d::ckw_component_driver() const
+{
+ return _component_writer.get();
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
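Several of the checks above reduce to simple arithmetic relations between tensor shapes and the settings. The sketch below restates them in isolation (illustrative only; the component expresses them via the ARM_COMPUTE_RETURN_ERROR_ON macros):

#include <cstddef>

// Illustrative restatement of the shape/settings relations checked above.
struct DepthwiseConv2dChecks
{
    // Weight channels must equal src channels times the depth multiplier.
    static bool channels_ok(std::size_t src_channels, std::size_t wei_channels, std::size_t depth_multiplier)
    {
        return wei_channels == src_channels * depth_multiplier;
    }

    // M0 must stay at 1 whenever stride_x or dilation_x exceeds 1.
    static bool m0_ok(unsigned int m0, unsigned int stride_x, unsigned int dilation_x)
    {
        return (stride_x <= 1 && dilation_x <= 1) || m0 == 1;
    }

    // With a depth multiplier > 1, N0 > 1 is only allowed if it divides the multiplier.
    static bool n0_ok(unsigned int n0, unsigned int depth_multiplier)
    {
        return depth_multiplier <= 1 || n0 <= 1 || (depth_multiplier % n0) == 0;
    }
};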
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h
new file mode 100644
index 0000000000..7526361f1c
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDEPTHWISECONV2D_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDEPTHWISECONV2D_H
+
+#include "arm_compute/core/Error.h"
+
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+/** Forward declaration */
+class ITensorInfo;
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+template <typename T>
+class ArgumentPack;
+class DepthwiseConv2dAttributes;
+
+/** Forward declaration */
+class GpuCkwDepthwiseConv2d;
+
+/** Component specific settings
+ */
+class ClComponentDepthwiseConv2dSettings
+{
+public:
+ /** Set export_input_to_cl_image flag */
+ ClComponentDepthwiseConv2dSettings &export_input_to_cl_image(bool cl_image);
+ /** Get export_input_to_cl_image flag */
+ bool export_input_to_cl_image() const;
+
+ /** Set export_weights_to_cl_image flag */
+ ClComponentDepthwiseConv2dSettings &export_weights_to_cl_image(bool cl_image);
+ /** Get export_weights_to_cl_image flag */
+ bool export_weights_to_cl_image() const;
+
+ /** Set fast_relaxed_math flag */
+ ClComponentDepthwiseConv2dSettings &fast_relaxed_math(bool fast_relaxed_math);
+ /** Get fast_relaxed_math flag */
+ bool fast_relaxed_math() const;
+
+ /** Set is_fma_available flag */
+ ClComponentDepthwiseConv2dSettings &is_fma_available(bool is_fma_available);
+ /** Get is_fma_available flag */
+ bool is_fma_available() const;
+
+ /** Set N0: number of columns processed by each thread */
+ ClComponentDepthwiseConv2dSettings &n0(unsigned int n0);
+ /** Get N0: number of columns processed by each thread */
+ unsigned int n0() const;
+
+ /** Set M0: number of rows processed by each thread */
+ ClComponentDepthwiseConv2dSettings &m0(unsigned int m0);
+ /** Get M0: number of rows processed by each thread */
+ unsigned int m0() const;
+
+private:
+ bool _export_input_to_cl_image{false}; /**< Export input to cl_image */
+ bool _export_weights_to_cl_image{false}; /**< Export the weights to cl_image */
+ bool _fast_relaxed_math{true}; /**< Enable/disable -cl-fast-relaxed-math flag */
+ bool _is_fma_available{false}; /**< Is fma instruction available */
+ unsigned int _n0{0}; /**< Number of columns processed by each thread */
+ unsigned int _m0{0}; /**< Number of rows processed by each thread */
+};
+
+/** Forward declaration */
+class ClTemplateDepthwiseConv2d;
+
+class ClComponentDepthwiseConv2d final : public IGpuKernelComponent
+{
+public:
+ /** Attributes are a set of backend-agnostic parameters that define what a component does */
+ using Attributes = DepthwiseConv2dAttributes;
+ /** Settings are a set of backend-specific parameters that influence the implementation of a component */
+ using Settings = ClComponentDepthwiseConv2dSettings;
+
+public:
+ /** Validate the component
+ *
+ * @param[in] properties Component properties @ref Properties
+ * @param[in,out] tensors Tensor arguments to the component
+ * @param[in] attributes Component attributes @ref Attributes
+ * @param[in] settings Component settings @ref Settings
+ *
+ * @return Status Validation results
+ *
+ * Tensor argument names:
+ * - ACL_SRC_0: Input
+ * - ACL_SRC_1: Weight
+ * - ACL_SRC_2: Bias (Optional)
+ * - ACL_DST_0: Output
+ *
+ * Tensor argument constness:
+ * - ACL_SRC_0: Const
+ * - ACL_SRC_1: Const
+ * - ACL_SRC_2: Const
+ * - ACL_DST_0: Const
+ *
+ * Valid data layouts:
+ * - NHWC
+ *
+ * Valid data type configurations:
+ * |ACL_SRC_0 |ACL_SRC_1 |ACL_SRC_2 |ACL_DST_0 |
+ * |:--------------|:--------------|:--------------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ */
+ static Status validate(const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings);
+
+ /** Constructor
+ *
+ * Similar to @ref ClComponentDepthwiseConv2d::validate()
+ */
+ ClComponentDepthwiseConv2d(ComponentId id,
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings);
+
+ /** Destructor */
+ ~ClComponentDepthwiseConv2d() override;
+ /** Prevent instances of this class from being copy constructed */
+ ClComponentDepthwiseConv2d(const ClComponentDepthwiseConv2d &component) = delete;
+ /** Prevent instances of this class from being copied */
+ ClComponentDepthwiseConv2d &operator=(const ClComponentDepthwiseConv2d &component) = delete;
+ /** Allow instances of this class to be move constructed */
+ ClComponentDepthwiseConv2d(ClComponentDepthwiseConv2d &&component) = default;
+ /** Allow instances of this class to be moved */
+ ClComponentDepthwiseConv2d &operator=(ClComponentDepthwiseConv2d &&component) = default;
+ /** Get writer for the component */
+ const IGpuCkwComponentDriver *ckw_component_driver() const override;
+ /** Get component type */
+ GpuComponentType type() const override
+ {
+ return GpuComponentType::Complex;
+ }
+
+private:
+ std::unique_ptr<GpuCkwDepthwiseConv2d> _component_writer;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDEPTHWISECONV2D_H
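ClComponentDepthwiseConv2dSettings follows the same fluent-setter pattern as KernelProperties. A minimal configuration sketch (not part of this patch; the values are illustrative rather than tuned defaults):

#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h"

using arm_compute::experimental::dynamic_fusion::ClComponentDepthwiseConv2dSettings;

ClComponentDepthwiseConv2dSettings make_depthwise_settings_sketch()
{
    // Chain the setters declared above; unset fields keep their defaults.
    return ClComponentDepthwiseConv2dSettings()
        .export_weights_to_cl_image(false)
        .is_fma_available(true)
        .n0(4U)
        .m0(1U);
}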
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp
new file mode 100644
index 0000000000..783a17df30
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ClComponentDirectConv2d.h"
+
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+bool ClComponentDirectConv2dSettings::export_to_cl_image() const
+{
+ return _desc.export_weights_to_cl_image;
+}
+
+ClComponentDirectConv2dSettings &ClComponentDirectConv2dSettings::fast_relaxed_math(bool fast_relaxed_math)
+{
+ _fast_relaxed_math = fast_relaxed_math;
+ return *this;
+}
+
+bool ClComponentDirectConv2dSettings::fast_relaxed_math() const
+{
+ return _fast_relaxed_math;
+}
+
+ClComponentDirectConv2dSettings &
+ClComponentDirectConv2dSettings::direct_conv_descriptor(const DirectConvComputeKernelInfo &desc)
+{
+ _desc = desc;
+ return *this;
+}
+
+DirectConvComputeKernelInfo ClComponentDirectConv2dSettings::direct_conv_descriptor() const
+{
+ return _desc;
+}
+
+Status ClComponentDirectConv2d::validate(const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings)
+{
+ ARM_COMPUTE_UNUSED(properties);
+ const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ const auto wei = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ const auto bia = tensors.get_const_tensor(TensorType::ACL_SRC_2);
+ const auto dst = tensors.get_const_tensor(TensorType::ACL_DST_0);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, wei, dst);
+
+ // 1. Check validity
+ // Matching data type
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, wei);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ if (bia != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bia);
+ }
+
+ // Matching data layout
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, wei);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
+ if (bia != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, bia);
+ }
+
+ // All tensor infos are initialized
+ ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(wei->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
+ if (bia != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(bia->tensor_shape().total_size() == 0);
+ }
+ // Device requirements are met
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+ // wei shape is correct
+ const DataLayout data_layout = src->data_layout();
+ const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(wei->dimension(channel_idx) != src->dimension(channel_idx),
+ "Weights feature map dimension should match the respective src's one");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(wei->num_dimensions() > 4, "Weights can be at most 4 dimensional");
+
+ // dst shape is correct
+ PadStrideInfo legacy_pad_stride(attributes.stride().x(), attributes.stride().y(), attributes.pad().left,
+ attributes.pad().right, attributes.pad().top, attributes.pad().bottom,
+ DimensionRoundingType{});
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+ dst->tensor_shape(), misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, legacy_pad_stride));
+
+ // bia shape is correct
+ if (bia != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(bia->dimension(0) != wei->dimension(3),
+ "Biases size and number of dst feature maps should match");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(bia->num_dimensions() > 1, "Biases should be one dimensional");
+ }
+
+ // 2. Check support level
+ // Data type
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+ // Data layout
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC);
+
+ const auto desc = settings.direct_conv_descriptor();
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.n0 != 1 && desc.n0 != 2 && desc.n0 != 3 && desc.n0 != 4 && desc.n0 != 8 &&
+ desc.n0 != 16,
+ "N0 can only be: 1, 2, 3, 4, 8, and 16");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 1 && desc.k0 != 2 && desc.k0 != 3 && desc.k0 != 4 && desc.k0 != 8 &&
+ desc.k0 != 16,
+ "K0 can only be: 1, 2, 3, 4, 8, and 16");
+ return Status{};
+}
+
+ClComponentDirectConv2d::ClComponentDirectConv2d(ComponentId id,
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings)
+ : IGpuKernelComponent{id, properties, tensors},
+ _component_writer{std::make_unique<GpuCkwDirectConv2d>(id, tensors, attributes, settings)}
+{
+}
+
+ClComponentDirectConv2d::~ClComponentDirectConv2d()
+{
+}
+
+const IGpuCkwComponentDriver *ClComponentDirectConv2d::ckw_component_driver() const
+{
+ return _component_writer.get();
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
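The N0/K0 constraint at the end of the validation above is membership in {1, 2, 3, 4, 8, 16}, i.e. either 3 or a power of two no greater than 16; the MatMul component later encodes the same set with the bit trick ((v & (v - 1)) && (v != 3)). A standalone sketch of the predicate:

// Illustrative check: the block sizes accepted above are exactly {1, 2, 3, 4, 8, 16}.
inline bool direct_conv_block_size_ok(int v)
{
    if (v < 1 || v > 16)
    {
        return false;
    }
    const bool is_power_of_two = (v & (v - 1)) == 0;
    return is_power_of_two || v == 3;
}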
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h
new file mode 100644
index 0000000000..c50b0fa0ce
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDIRECTCONV2D_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDIRECTCONV2D_H
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/KernelDescriptors.h"
+
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+/** Forward declaration */
+class ITensorInfo;
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+template <typename T>
+class ArgumentPack;
+class Conv2dAttributes;
+
+/** Component specific settings
+ */
+class ClComponentDirectConv2dSettings
+{
+public:
+ /** Get export_to_cl_image flag */
+ bool export_to_cl_image() const;
+
+ /** Set fast_relaxed_math flag */
+ ClComponentDirectConv2dSettings &fast_relaxed_math(bool fast_relaxed_math);
+ /** Get fast_relaxed_math flag */
+ bool fast_relaxed_math() const;
+
+ /** Set direct convolution descriptor */
+ ClComponentDirectConv2dSettings &direct_conv_descriptor(const DirectConvComputeKernelInfo &desc);
+ /** Get direct convolution descriptor */
+ DirectConvComputeKernelInfo direct_conv_descriptor() const;
+
+private:
+ bool _fast_relaxed_math{true};
+ DirectConvComputeKernelInfo _desc{}; // Direct convolution descriptor
+};
+
+/** Forward declaration */
+class GpuCkwDirectConv2d;
+
+class ClComponentDirectConv2d final : public IGpuKernelComponent
+{
+public:
+ /** Attributes are a set of backend-agnostic parameters that define what a component does */
+ using Attributes = Conv2dAttributes;
+ /** Settings are a set of backend-specific parameters that influence the implementation of a component */
+ using Settings = ClComponentDirectConv2dSettings;
+
+public:
+ /** Validate the component
+ *
+ * @param[in] properties Component properties
+ * @param[in,out] tensors Tensor arguments to the component
+ * @param[in] attributes Component attributes
+ * @param[in] settings Component settings
+ *
+ * @return Status Validation results
+ *
+ * Tensor argument names:
+ * - ACL_SRC_0: Input
+ * - ACL_SRC_1: Weight
+ * - ACL_SRC_2: Bias (Optional)
+ * - ACL_DST_0: Output
+ *
+ * Tensor argument constness:
+ * - ACL_SRC_0: Const
+ * - ACL_SRC_1: Const
+ * - ACL_SRC_2: Const
+ * - ACL_DST_0: Const
+ *
+ * Valid data layouts:
+ * - NHWC
+ *
+ * Valid data type configurations:
+ * |ACL_SRC_0 |ACL_SRC_1 |ACL_SRC_2 |ACL_DST_0 |
+ * |:--------------|:--------------|:--------------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ */
+ static Status validate(const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings);
+
+ /** Constructor
+ *
+ * Similar to @ref ClComponentDirectConv2d::validate()
+ */
+ ClComponentDirectConv2d(ComponentId id,
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings);
+
+ /** Destructor */
+ ~ClComponentDirectConv2d() override;
+ /** Prevent instances of this class from being copy constructed */
+ ClComponentDirectConv2d(const ClComponentDirectConv2d &component) = delete;
+ /** Prevent instances of this class from being copied */
+ ClComponentDirectConv2d &operator=(const ClComponentDirectConv2d &component) = delete;
+ /** Allow instances of this class to be move constructed */
+ ClComponentDirectConv2d(ClComponentDirectConv2d &&component) = default;
+ /** Allow instances of this class to be moved */
+ ClComponentDirectConv2d &operator=(ClComponentDirectConv2d &&component) = default;
+ /** Get writer for the component */
+ const IGpuCkwComponentDriver *ckw_component_driver() const override;
+ /** Get component type */
+ GpuComponentType type() const override
+ {
+ return GpuComponentType::Complex;
+ }
+
+private:
+ std::unique_ptr<GpuCkwDirectConv2d> _component_writer;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDIRECTCONV2D_H
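A minimal sketch of configuring ClComponentDirectConv2dSettings (not part of this patch). Only the DirectConvComputeKernelInfo fields referenced by the validation above are set, and the values are illustrative:

#include "arm_compute/core/KernelDescriptors.h"

#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"

using namespace arm_compute;
using namespace arm_compute::experimental::dynamic_fusion;

ClComponentDirectConv2dSettings make_direct_conv_settings_sketch()
{
    DirectConvComputeKernelInfo desc{};
    desc.n0                         = 4; // must be one of 1, 2, 3, 4, 8, 16
    desc.k0                         = 4; // must be one of 1, 2, 3, 4, 8, 16
    desc.export_weights_to_cl_image = false;

    return ClComponentDirectConv2dSettings().fast_relaxed_math(true).direct_conv_descriptor(desc);
}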
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp
new file mode 100644
index 0000000000..209c73dbee
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ClComponentElementwiseBinary.h"
+
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+std::set<ElementwiseBinaryCommonAttributes::ElementwiseOp> supported_ops{
+ ElementwiseBinaryCommonAttributes::ElementwiseOp::Add, ElementwiseBinaryCommonAttributes::ElementwiseOp::Sub,
+ ElementwiseBinaryCommonAttributes::ElementwiseOp::Mul};
+}
+
+Status ClComponentElementwiseBinary::validate(const ArgumentPack<ITensorInfo> &tensors,
+ const ElementwiseBinaryCommonAttributes &attributes)
+{
+ const auto lhs = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ const auto rhs = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ const auto dst = tensors.get_const_tensor(TensorType::ACL_DST_0);
+
+ // Check operator type
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(supported_ops.find(attributes.operation()) == supported_ops.end(),
+ "Provided Elementwise operation not supported.");
+
+ // Check validity
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst);
+
+ // Check data type for different elementwise operators
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16, DataType::S32,
+ DataType::S16, DataType::U8);
+
+ // dst shape is correct
+ const TensorShape out_shape = TensorShape::broadcast_shape(lhs->tensor_shape(), rhs->tensor_shape());
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0),
+ "Wrong shape for dst.");
+
+ const auto &lhs_shape = lhs->tensor_shape();
+ const auto &rhs_shape = rhs->tensor_shape();
+ const auto &dst_shape = dst->tensor_shape();
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(lhs_shape, dst_shape, 0) &&
+ detail::have_different_dimensions(rhs_shape, dst_shape, 0),
+ "Only LHS or RHS can be broadcasting, not both.");
+
+ // Dimensions Y and Z are collapsed together in the current kernel implementation,
+ // hence they cannot be independently broadcast or non-broadcast.
+ // See: ClTemplateElementwiseBinary::get_window
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((lhs_shape[1] != dst_shape[1] || rhs_shape[1] != dst_shape[1]) !=
+ (lhs_shape[2] != dst_shape[2] || rhs_shape[2] != dst_shape[2]),
+ "Dimension Y and Z must both be either broadcast or non-broadcast.");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(lhs_shape, dst_shape, 3),
+ "LHS broadcast in dimension 3 or higher is not supported.");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(rhs_shape, dst_shape, 3),
+ "RHS broadcast in dimension 3 or higher is not supported.");
+
+ // Matching data type
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst);
+
+ // Matching data layout
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(lhs, rhs);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(lhs, dst);
+
+ // All tensor infos are initialized
+ ARM_COMPUTE_RETURN_ERROR_ON(lhs->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(rhs->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
+
+ // Device requirements are met
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(lhs);
+
+ return Status{};
+}
+
+ClComponentElementwiseBinary::~ClComponentElementwiseBinary()
+{
+}
+ClComponentElementwiseBinary::ClComponentElementwiseBinary(ComponentId id,
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes)
+ : IGpuKernelComponent{id, properties, tensors},
+ _component_writer{std::make_unique<GpuCkwElementwiseBinary>(id, tensors, attributes)}
+{
+}
+
+const IGpuCkwComponentDriver *ClComponentElementwiseBinary::ckw_component_driver() const
+{
+ return _component_writer.get();
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
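The least obvious check above is the Y/Z constraint: because the kernel collapses dimensions 1 and 2 into a single window, either both must be broadcast or neither may be. A standalone restatement over raw dimension sizes (illustrative only):

#include <cstddef>

// Illustrative restatement of the collapsed Y/Z broadcast rule checked above.
// A dimension counts as broadcast if either input differs from dst in that dimension.
inline bool yz_broadcast_rule_ok(const std::size_t lhs[3], const std::size_t rhs[3], const std::size_t dst[3])
{
    const bool y_broadcast = (lhs[1] != dst[1]) || (rhs[1] != dst[1]);
    const bool z_broadcast = (lhs[2] != dst[2]) || (rhs[2] != dst[2]);
    return y_broadcast == z_broadcast; // both or neither, never just one
}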
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h
new file mode 100644
index 0000000000..a4395a6219
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTELEMENTWISEBINARY_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTELEMENTWISEBINARY_H
+
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+#include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h"
+
+namespace arm_compute
+{
+/** Forward declaration */
+class ITensorInfo;
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+template <typename T>
+class ArgumentPack;
+
+/** Forward declaration */
+class GpuCkwElementwiseBinary;
+
+class ClComponentElementwiseBinary final : public IGpuKernelComponent
+{
+public:
+ /** Attributes are a set of backend-agnostic parameters that define what a component does */
+ using Attributes = ElementwiseBinaryCommonAttributes;
+
+public:
+ /** Validate the component
+ *
+ * @param[in,out] tensors Tensor arguments to the component
+ * @param[in] attributes Component attributes
+ *
+ * @return Status Validation results
+ *
+ * Tensor argument names:
+ * - ACL_SRC_0: lhs
+ * - ACL_SRC_1: rhs
+ * - ACL_DST_0: dst
+ *
+ * Tensor argument constness:
+ * - ACL_SRC_0: Const
+ * - ACL_SRC_1: Const
+ * - ACL_DST_0: Const
+ *
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations (DIV supports F32/F16/S32; POWER supports only F32/F16):
+ * |ACL_SRC_0 |ACL_SRC_1 |ACL_DST_0 |
+ * |:--------------|:--------------|:--------------|
+ * |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |
+ * |S32 |S32 |S32 |
+ * |S16 |S16 |S16 |
+ * |U8 |U8 |U8 |
+ */
+ static Status validate(const ArgumentPack<ITensorInfo> &tensors,
+ const ElementwiseBinaryCommonAttributes &attributes);
+
+ /** Constructor
+ *
+ * Similar to @ref ClComponentElementwiseBinary::validate()
+ */
+ ClComponentElementwiseBinary(ComponentId id,
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes);
+
+ /** Destructor */
+ ~ClComponentElementwiseBinary() override;
+ /** Prevent instances of this class from being copy constructed */
+ ClComponentElementwiseBinary(const ClComponentElementwiseBinary &component) = delete;
+ /** Prevent instances of this class from being copied */
+ ClComponentElementwiseBinary &operator=(const ClComponentElementwiseBinary &component) = delete;
+ /** Allow instances of this class to be move constructed */
+ ClComponentElementwiseBinary(ClComponentElementwiseBinary &&component) = default;
+ /** Allow instances of this class to be moved */
+ ClComponentElementwiseBinary &operator=(ClComponentElementwiseBinary &&component) = default;
+ /** Get writer for the component */
+ const IGpuCkwComponentDriver *ckw_component_driver() const override;
+ /** Get component type */
+ GpuComponentType type() const override
+ {
+ return GpuComponentType::Simple;
+ }
+
+private:
+ std::unique_ptr<GpuCkwElementwiseBinary> _component_writer;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTELEMENTWISEBINARY_H
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.cpp
new file mode 100644
index 0000000000..53ac8da41f
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.cpp
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.h"
+
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/dynamic_fusion/sketch/attributes/MatMulAttributes.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.h"
+#include "src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+using Attributes = MatMulAttributes;
+using Settings = GpuMatMulSettings;
+
+Status validate_matmul_kernel_info(Attributes attributes, Settings settings)
+{
+ const bool adj_lhs = attributes.adj_lhs();
+ const bool adj_rhs = attributes.adj_rhs();
+ const int m0 = settings.m0();
+ const int n0 = settings.n0();
+ const int k0 = settings.k0();
+
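+    // Note: "(x & (x - 1)) == 0" is the usual power-of-two test, so the block-size checks
+    // below accept values in {1, 2, 4, 8, 16}, with 3 allowed as an explicit exception.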
+ // Validate M0
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(m0 < 1, "Only positive integers are supported for M0");
+
+ if (adj_lhs)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((m0 & (m0 - 1)) && (m0 != 3)) || (m0 > 16),
+ "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed");
+ }
+
+ // Validate N0
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(n0 < 1, "Only positive integers are supported for N0");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((n0 & (n0 - 1)) && (n0 != 3)) || (n0 > 16),
+ "Only 1,2,3,4,8,16 are supported for N0");
+
+ // Validate K0
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(k0 < 1, "Only positive integers are supported for K0");
+ if (!adj_lhs || adj_rhs)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((k0 & (k0 - 1)) && (k0 != 3)) || (k0 > 16),
+ "Only 1,2,3,4,8,16 are supported for K0");
+ }
+
+ return Status{};
+}
+
+} // namespace
+
+Status ClComponentMatMul::validate(const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings)
+{
+ ARM_COMPUTE_UNUSED(properties);
+ ARM_COMPUTE_UNUSED(attributes);
+
+ const auto lhs = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ const auto rhs = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ const auto dst = tensors.get_const_tensor(TensorType::ACL_DST_0);
+
+ // Currently, the only supported case is when adj_lhs = false and adj_rhs = true
+    ARM_COMPUTE_RETURN_ERROR_ON((attributes.adj_lhs() != false) || (attributes.adj_rhs() != true));
+
+    // Check that the data types match
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst);
+
+ // Data type
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32);
+
+ // All tensor infos are initialized
+ ARM_COMPUTE_RETURN_ERROR_ON(lhs->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(rhs->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
+
+ // Device requirements are met
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(lhs);
+
+ // Check if block sizes are supported
+ MatMulKernelInfo matmul_kernel_info =
+ MatMulKernelInfo(attributes.adj_lhs(), attributes.adj_rhs(), settings.m0(), settings.n0(), settings.k0());
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_kernel_info(attributes, settings));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ opencl::kernels::validate_matmul_input_shapes(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info));
+
+ // Check if dst shape is correct
+ const auto expected_dst_shape =
+ misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), expected_dst_shape);
+
+ return Status{};
+}
+
+ClComponentMatMul::ClComponentMatMul(ComponentId id,
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings)
+ : IGpuKernelComponent{id, properties, tensors},
+ _component_writer{std::make_unique<GpuCkwMatMul>(id, tensors, attributes, settings)}
+{
+}
+
+ClComponentMatMul::~ClComponentMatMul()
+{
+}
+
+const IGpuCkwComponentDriver *ClComponentMatMul::ckw_component_driver() const
+{
+ return _component_writer.get();
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.h
new file mode 100644
index 0000000000..41833e4adb
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTMATMUL_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTMATMUL_H
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMatMul.h"
+
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+/** Forward declaration */
+class ITensorInfo;
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+template <typename T>
+class ArgumentPack;
+class MatMulAttributes;
+class GpuCkwMatMul;
+
+class ClComponentMatMul final : public IGpuKernelComponent
+{
+public:
+ /** Attributes are a set of backend-agnostic parameters that define what a component does */
+ using Attributes = MatMulAttributes;
+ /** Settings are a set of backend-specific parameters that influence the implementation of a component */
+ using Settings = GpuMatMulSettings;
+
+ /** Validate the component
+ *
+ * @param[in] properties Component properties
+ * @param[in,out] tensors Tensor arguments to the component
+ * @param[in] attributes Component attributes
+ * @param[in] settings Component settings
+ *
+ * @return Status Validation results
+ *
+ * Tensor argument names:
+ * - ACL_SRC_0: LHS
+ * - ACL_SRC_1: RHS
+ * - ACL_DST_0: Output
+ *
+ * Tensor argument constness:
+ * - ACL_SRC_0: Const
+ * - ACL_SRC_1: Const
+ * - ACL_DST_0: Const
+ *
+ * Valid data layouts:
+ * - NHWC
+ *
+ * Valid data type configurations:
+ * |ACL_SRC_0 |ACL_SRC_1 |ACL_DST_0 |
+ * |:--------------|:--------------|:--------------|
+ * |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |
+ */
+ static Status validate(const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings);
+
+ /** Constructor
+ *
+ * Similar to @ref ClComponentMatMul::validate()
+ */
+ ClComponentMatMul(ComponentId id,
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings);
+ /** Destructor */
+ ~ClComponentMatMul() override;
+ /** Prevent instances of this class from being copy constructed */
+ ClComponentMatMul(const ClComponentMatMul &component) = delete;
+ /** Prevent instances of this class from being copied */
+ ClComponentMatMul &operator=(const ClComponentMatMul &component) = delete;
+ /** Allow instances of this class to be move constructed */
+ ClComponentMatMul(ClComponentMatMul &&component) = default;
+ /** Allow instances of this class to be moved */
+ ClComponentMatMul &operator=(ClComponentMatMul &&component) = default;
+ /** Get writer for the component */
+ const IGpuCkwComponentDriver *ckw_component_driver() const override;
+ /** Get component type */
+ GpuComponentType type() const override
+ {
+ return GpuComponentType::Complex;
+ }
+
+private:
+ std::unique_ptr<GpuCkwMatMul> _component_writer;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTMATMUL_H
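As with the elementwise component, validation takes an ArgumentPack plus an Attributes/Settings pair. A hedged configuration sketch that satisfies the constraints enforced in ClComponentMatMul.cpp above (adj_lhs = false, adj_rhs = true, block sizes from {1, 2, 3, 4, 8, 16}); the fluent setters on MatMulAttributes and GpuMatMulSettings are assumed from their headers, which are outside this excerpt:

    // Hypothetical MatMul configuration: lhs not transposed, rhs transposed.
    const MatMulAttributes  attributes = MatMulAttributes().adj_lhs(false).adj_rhs(true);
    const GpuMatMulSettings settings   = GpuMatMulSettings().m0(4).n0(4).k0(4);

    const auto properties =
        IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});

    // lhs/rhs/dst tensor infos are packed as ACL_SRC_0 / ACL_SRC_1 / ACL_DST_0 exactly as in
    // the elementwise example above, and then passed to:
    //   ClComponentMatMul::validate(properties, tensors, attributes, settings);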
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp
new file mode 100644
index 0000000000..6e7243dc04
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ClComponentPool2d.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h"
+#include "src/dynamic_fusion/utils/Utils.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+Status ClComponentPool2d::validate(const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings)
+{
+ ARM_COMPUTE_UNUSED(properties, settings);
+ const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ const auto dst = tensors.get_const_tensor(TensorType::ACL_DST_0);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        (attributes.pool_type() != PoolingType::AVG && attributes.pool_type() != PoolingType::MAX),
+        "Unsupported Pooling type");
+
+ // 1. Check validity
+ // Check if pooling is valid
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ is_pool_region_entirely_outside_input(convert_pool_attr_to_pool_info(attributes, true)),
+ "Pooling region that is entirely outside input tensor is unsupported");
+
+ // Matching data type
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+
+ // Matching data layout
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
+
+ // All tensor infos are initialized
+ ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
+
+ // Device requirements are met
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+ dst->tensor_shape(),
+ misc::shape_calculator::compute_pool_shape(*src, convert_pool_attr_to_pool_info(attributes, true)));
+
+ // 2. Check support level
+ // Data type
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+
+ // Data layout
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC);
+
+ return Status{};
+}
+
+ClComponentPool2d::ClComponentPool2d(ComponentId id,
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings)
+ : IGpuKernelComponent{id, properties, tensors},
+ _component_writer{std::make_unique<GpuCkwPool2d>(id, tensors, attributes, settings)}
+{
+}
+ClComponentPool2d::~ClComponentPool2d()
+{
+}
+const IGpuCkwComponentDriver *ClComponentPool2d::ckw_component_driver() const
+{
+ return _component_writer.get();
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h
new file mode 100644
index 0000000000..d33e601f18
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTPOOL2D_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTPOOL2D_H
+
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h"
+
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+
+namespace arm_compute
+{
+/** Forward declaration */
+class ITensorInfo;
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+template <typename T>
+class ArgumentPack;
+class Pool2dAttributes;
+
+/** Forward declaration */
+class GpuCkwPool2d;
+
+class ClComponentPool2d final : public IGpuKernelComponent
+{
+public:
+ /** Attributes are a set of backend-agnostic parameters that define what a component does */
+ using Attributes = Pool2dAttributes;
+ /** Settings are a set of backend-specific parameters that influence the implementation of a component */
+ using Settings = GpuPool2dSettings;
+
+public:
+ /** Validate the component
+ *
+ * @param[in] properties Component properties
+ * @param[in,out] tensors Tensor arguments to the component
+ * @param[in] attributes Component attributes
+ * @param[in] settings Component settings
+ *
+ * @return Status Validation results
+ *
+ * Tensor argument names:
+ * - ACL_SRC_0: Input
+ * - ACL_DST_0: Output
+ *
+ * Tensor argument constness:
+ * - ACL_SRC_0: Const
+ * - ACL_DST_0: Const
+ *
+ * Valid data layouts:
+ * - NHWC
+ *
+ * Valid data type configurations:
+ * |ACL_SRC_0 |ACL_DST_0 |
+ * |:--------------|:--------------|
+ * |F16 |F16 |
+ * |F32 |F32 |
+ */
+ static Status validate(const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings);
+
+ /** Constructor
+ *
+ * @param[in] id Unique Component Identifier within a workload
+ * @param[in] properties Component properties
+ * @param[in,out] tensors Tensor arguments to the component
+ * @param[in] attributes Component attributes
+ * @param[in] settings Component settings
+ */
+ ClComponentPool2d(ComponentId id,
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings);
+
+ /** Destructor */
+ ~ClComponentPool2d() override;
+
+ /** Prevent instances of this class from being copy constructed */
+ ClComponentPool2d(const ClComponentPool2d &component) = delete;
+
+ /** Prevent instances of this class from being copied */
+ ClComponentPool2d &operator=(const ClComponentPool2d &component) = delete;
+
+ /** Allow instances of this class to be move constructed */
+ ClComponentPool2d(ClComponentPool2d &&component) = default;
+
+ /** Allow instances of this class to be moved */
+ ClComponentPool2d &operator=(ClComponentPool2d &&component) = default;
+
+ /** Get GPU kernel writer for the component */
+ const IGpuCkwComponentDriver *ckw_component_driver() const override;
+
+ /** Get component type */
+ GpuComponentType type() const override
+ {
+ return GpuComponentType::Complex;
+ }
+
+private:
+ std::unique_ptr<GpuCkwPool2d> _component_writer;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTPOOL2D_H
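A short sketch of how the pooling attributes documented above might be assembled for validation; the Pool2dAttributes setters and the Size2D/Padding2D helpers are assumed from headers outside this excerpt, and the values are purely illustrative:

    // Hypothetical 3x3 MAX pooling, stride 2, 1-pixel padding, on NHWC F32 tensors.
    const Pool2dAttributes attributes = Pool2dAttributes()
                                            .pool_type(PoolingType::MAX)
                                            .pool_size(Size2D(3, 3))
                                            .stride(Size2D(2, 2))
                                            .pad(Padding2D(1, 1, 1, 1));

    const auto properties =
        IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
    const auto settings = ClComponentPool2d::Settings();

    // src/dst tensor infos are packed as ACL_SRC_0 / ACL_DST_0 and passed to
    //   ClComponentPool2d::validate(properties, tensors, attributes, settings);
    // note that the dst shape must equal compute_pool_shape() for the same attributes.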
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.cpp
new file mode 100644
index 0000000000..dce85c424e
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.cpp
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ClComponentReshape.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/CL/CLValidate.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+Status ClComponentReshape::validate(const ArgumentPack<ITensorInfo> &tensors)
+{
+ const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ const auto dst = tensors.get_const_tensor(TensorType::ACL_DST_0);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() != dst->tensor_shape().total_size());
+
+ return Status{};
+}
+
+ClComponentReshape::ClComponentReshape(ComponentId id,
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors)
+ : IGpuKernelComponent{id, properties, tensors}
+{
+}
+ClComponentReshape::~ClComponentReshape()
+{
+}
+const IGpuCkwComponentDriver *ClComponentReshape::ckw_component_driver() const
+{
+ /* NOT IMPLEMENTED */
+ return nullptr;
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h
new file mode 100644
index 0000000000..fd0f966da1
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTRESHAPE_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTRESHAPE_H
+
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+
+namespace arm_compute
+{
+/** Forward declaration */
+class ITensorInfo;
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+template <typename T>
+class ArgumentPack;
+
+/** Forward declaration */
+class ClTemplateReshape;
+
+class ClComponentReshape final : public IGpuKernelComponent
+{
+public:
+ /** Validate the component
+ *
+ * @param[in,out] tensors Tensor arguments to the component
+ *
+ * @return Status Validation results
+ *
+ * Tensor argument names:
+ * - ACL_SRC_0: src
+ * - ACL_DST_0: dst
+ *
+ * Tensor argument constness:
+ * - ACL_SRC_0: Const
+ * - ACL_DST_0: Const
+ *
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * - All
+ */
+ static Status validate(const ArgumentPack<ITensorInfo> &tensors);
+
+ /** Constructor
+ *
+ * @param[in] id Component id
+ * @param[in] properties Component properties @ref Properties
+ * @param[in] tensors Tensor arguments to the component
+ */
+ ClComponentReshape(ComponentId id, const Properties &properties, const ArgumentPack<ITensorInfo> &tensors);
+
+ /** Destructor */
+ ~ClComponentReshape() override;
+ /** Prevent instances of this class from being copy constructed */
+ ClComponentReshape(const ClComponentReshape &component) = delete;
+ /** Prevent instances of this class from being copied */
+ ClComponentReshape &operator=(const ClComponentReshape &component) = delete;
+ /** Allow instances of this class to be move constructed */
+ ClComponentReshape(ClComponentReshape &&component) = default;
+ /** Allow instances of this class to be moved */
+ ClComponentReshape &operator=(ClComponentReshape &&component) = default;
+ /** Get writer for the component */
+ const IGpuCkwComponentDriver *ckw_component_driver() const override;
+ /** Get component type */
+ GpuComponentType type() const override
+ {
+ return GpuComponentType::Complex;
+ }
+
+private:
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTRESHAPE_H
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.cpp
new file mode 100644
index 0000000000..411eeca802
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.cpp
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ClComponentResize.h"
+
+#include "arm_compute/core/Error.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/utils/ScaleUtils.h"
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+class GpuCkwResize;
+
+Status ClComponentResize::validate(const IGpuKernelComponent::Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const ClComponentResize::Attributes &attributes)
+{
+ ARM_COMPUTE_UNUSED(properties);
+
+ const ITensorInfo *src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ const ITensorInfo *dst = tensors.get_const_tensor(TensorType::ACL_DST_0);
+
+ // Mismatching data types and quantization info
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
+
+ // Device requirements met
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+
+ // Align corners and sampling policy conformance
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ attributes.align_corners() &&
+ !arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(attributes.sampling_policy()));
+
+ // All tensor infos are initialized
+ ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
+
+ return Status();
+}
+
+ClComponentResize::ClComponentResize(ComponentId id,
+ const IGpuKernelComponent::Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const ClComponentResize::Attributes &attributes)
+ : IGpuKernelComponent{id, properties, tensors},
+ _component_writer{std::make_unique<GpuCkwResize>(id, tensors, attributes)}
+{
+}
+
+ClComponentResize::~ClComponentResize()
+{
+}
+
+const IGpuCkwComponentDriver *ClComponentResize::ckw_component_driver() const
+{
+ return _component_writer.get();
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h
new file mode 100644
index 0000000000..9a1169c45f
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTRESIZE_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTRESIZE_H
+
+#include "arm_compute/dynamic_fusion/sketch/attributes/ResizeAttributes.h"
+
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+
+namespace arm_compute
+{
+/** Forward declaration */
+class ITensorInfo;
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+template <typename T>
+class ArgumentPack;
+
+/** Forward declaration */
+class GpuCkwResize;
+
+class ClComponentResize final : public IGpuKernelComponent
+{
+public:
+ /** Attributes are a set of backend-agnostic parameters that define what a component does */
+ using Attributes = ResizeAttributes;
+
+ /** Validate the component
+ *
+ * @param[in] properties Component properties @ref Properties
+ * @param[in,out] tensors Tensor arguments to the component
+ * @param[in] attributes Component attributes @ref Attributes
+ *
+ * @return Status Validation results
+ *
+ * Tensor argument names:
+ * - ACL_SRC_0: Input
+ * - ACL_DST_0: Output
+ *
+ * Tensor argument constness:
+ * - ACL_SRC_0: Const
+ * - ACL_DST_0: Const
+ *
+ * Valid data layouts:
+ * - NHWC
+ *
+     * Valid data type configurations:
+ * |ACL_SRC_0 |ACL_DST_0 |
+ * |:--------------|:--------------|
+ * |QASYMM8 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |
+ * |F16 |F16 |
+ * |F32 |F32 |
+ * |U8 |U8 |
+ * |S16 |S16 |
+ */
+ static Status
+ validate(const Properties &properties, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes);
+
+ /** Constructor
+ *
+ * Similar to @ref ClComponentResize::validate()
+ */
+ ClComponentResize(ComponentId id,
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes);
+
+ /** Destructor */
+ ~ClComponentResize() override;
+
+ /** Prevent instances of this class from being copy constructed */
+ ClComponentResize(const ClComponentResize &component) = delete;
+
+ /** Prevent instances of this class from being copied */
+ ClComponentResize &operator=(const ClComponentResize &component) = delete;
+
+ /** Allow instances of this class to be move constructed */
+ ClComponentResize(ClComponentResize &&component) = default;
+
+ /** Allow instances of this class to be moved */
+ ClComponentResize &operator=(ClComponentResize &&component) = default;
+
+ /** Get writer for the component */
+ const IGpuCkwComponentDriver *ckw_component_driver() const override;
+
+ /** Get component type */
+ GpuComponentType type() const override
+ {
+ return GpuComponentType::Complex;
+ }
+
+private:
+ std::unique_ptr<GpuCkwResize> _component_writer;
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTRESIZE_H
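The resize component mostly validates the attribute combination, in particular that align_corners is only paired with a sampling policy accepted by scale_utils::is_align_corners_allowed_sampling_policy() (see ClComponentResize.cpp above). A hedged sketch, with the ResizeAttributes setters assumed from its header:

    // Hypothetical bilinear resize to 128x128.
    ResizeAttributes attributes{};
    attributes.output_width(128)
        .output_height(128)
        .interpolation_policy(InterpolationPolicy::BILINEAR)
        .sampling_policy(SamplingPolicy::TOP_LEFT)
        .align_corners(false);

    // src/dst are packed as ACL_SRC_0 / ACL_DST_0, then:
    //   ClComponentResize::validate(properties, tensors, attributes);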
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp
new file mode 100644
index 0000000000..3db6c5cd2d
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ClComponentStore.h"
+
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+Status ClComponentStore::validate(const Properties &properties, const ArgumentPack<ITensorInfo> &tensors)
+{
+ ARM_COMPUTE_UNUSED(properties, tensors);
+ return Status{};
+}
+ClComponentStore::ClComponentStore(ComponentId id,
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors)
+ : IGpuKernelComponent{id, properties, tensors}, _component_writer{std::make_unique<GpuCkwStore>(id, tensors)}
+{
+}
+ClComponentStore::~ClComponentStore()
+{
+}
+const IGpuCkwComponentDriver *ClComponentStore::ckw_component_driver() const
+{
+ return _component_writer.get();
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h
new file mode 100644
index 0000000000..2c1dd0f6fc
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTSTORE_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTSTORE_H
+
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+/** Forward declaration */
+class ITensorInfo;
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+template <typename T>
+class ArgumentPack;
+class GpuCkwStore;
+
+class ClComponentStore final : public IGpuKernelComponent
+{
+public:
+ /** Validate the component
+ *
+ * @param[in] properties Component properties
+     * @param[in] tensors    Tensor arguments to the component
+ *
+ * @return Status Validation results
+ *
+ * Tensor argument names:
+ * - ACL_SRC_0: Input
+ * - ACL_DST_0: Output
+ *
+ * Tensor argument constness:
+ * - ACL_SRC_0: Const
+ * - ACL_DST_0: Const
+ *
+ * Valid data layouts:
+ * - NHWC
+ *
+ * Valid data type configurations:
+ * |ACL_SRC_0 |ACL_DST_0 |
+ * |:--------------|:--------------|
+ * |All |All |
+ */
+ static Status validate(const Properties &properties, const ArgumentPack<ITensorInfo> &tensors);
+ /** Constructor
+ *
+ * Similar to @ref ClComponentStore::validate()
+ */
+ ClComponentStore(ComponentId id, const Properties &properties, const ArgumentPack<ITensorInfo> &tensors);
+ /** Destructor */
+ ~ClComponentStore() override;
+ /** Prevent instances of this class from being copy constructed */
+ ClComponentStore(const ClComponentStore &component) = delete;
+ /** Prevent instances of this class from being copied */
+ ClComponentStore &operator=(const ClComponentStore &component) = delete;
+ /** Allow instances of this class to be move constructed */
+ ClComponentStore(ClComponentStore &&component) = default;
+ /** Allow instances of this class to be moved */
+ ClComponentStore &operator=(ClComponentStore &&component) = default;
+ /** Get writer for the component */
+ const IGpuCkwComponentDriver *ckw_component_driver() const override;
+ /** Get component type */
+ GpuComponentType type() const override
+ {
+ return GpuComponentType::Output;
+ }
+
+private:
+ std::unique_ptr<GpuCkwStore> _component_writer;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTSTORE_H
diff --git a/src/dynamic_fusion/sketch/gpu/components/utils/type_printer/ElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/components/utils/type_printer/ElementwiseBinary.h
new file mode 100644
index 0000000000..4c3e84e59d
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/utils/type_printer/ElementwiseBinary.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_UTILS_TYPE_PRINTER_ELEMENTWISEBINARY
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_UTILS_TYPE_PRINTER_ELEMENTWISEBINARY
+
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h"
+
+#include <map>
+#include <ostream>
+#include <sstream>
+#include <string>
+
+namespace arm_compute
+{
+/** Type printers for all types related to the component @ref ClComponentElementwiseBinary
+ */
+
+using namespace experimental::dynamic_fusion;
+
+/** Formatted output of the arm_compute::experimental::dynamic_fusion::ClComponentElementwiseBinary::Attributes::ElementwiseOp type.
+ *
+ * @param[out] os Output stream.
+ * @param[in] op arm_compute::experimental::dynamic_fusion::ClComponentElementwiseBinary::Attributes::ElementwiseOp type to output.
+ *
+ * @return Modified output stream.
+ */
+inline ::std::ostream &operator<<(::std::ostream &os, const ClComponentElementwiseBinary::Attributes::ElementwiseOp &op)
+{
+ const std::map<ClComponentElementwiseBinary::Attributes::ElementwiseOp, std::string> op_name = {
+ {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Add, "add"},
+ {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Div, "div"},
+ {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Max, "max"},
+ {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Min, "min"},
+ {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Mul, "mul"},
+ {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Power, "power"},
+ {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Prelu, "prelu"},
+ {ClComponentElementwiseBinary::Attributes::ElementwiseOp::SquaredDiff, "squareddiff"},
+ {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Sub, "sub"}};
+ os << op_name.at(op);
+ return os;
+}
+/** Formatted output of the arm_compute::experimental::dynamic_fusion::ClComponentElementwiseBinary::Attributes::ElementwiseOp type.
+ *
+ * @param[in] op arm_compute::experimental::dynamic_fusion::ClComponentElementwiseBinary::Attributes::ElementwiseOp type to output.
+ *
+ * @return Formatted string.
+ */
+inline std::string to_string(const ClComponentElementwiseBinary::Attributes::ElementwiseOp &op)
+{
+ std::stringstream str;
+ str << op;
+ return str.str();
+}
+} // namespace arm_compute
+#endif /* ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_UTILS_TYPE_PRINTER_ELEMENTWISEBINARY */
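Usage follows directly from the definitions above; for example (std::cout additionally requires <iostream>):

    const std::string name = to_string(ClComponentElementwiseBinary::Attributes::ElementwiseOp::Add); // "add"
    std::cout << ClComponentElementwiseBinary::Attributes::ElementwiseOp::Mul << std::endl;           // "mul"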
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp
new file mode 100644
index 0000000000..201c9f243c
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h"
+
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+
+#include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+Status GpuAdd::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *lhs, const ITensorInfo *rhs)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32, DataType::U8,
+ DataType::S16, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->data_type() != rhs->data_type(), "Input tensors must be the same data type");
+
+ // Set the elementwise operation to Add then call the elementwise common validate_op
+ ElementwiseBinaryCommonAttributes common_attributes{};
+ common_attributes.operation(ElementwiseBinaryCommonAttributes::ElementwiseOp::Add);
+ return GpuElementwiseBinaryCommon::validate_op(sketch, lhs, rhs, common_attributes);
+}
+
+Status GpuAdd::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *lhs, const ITensorInfo *rhs)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32, DataType::U8,
+                                                         DataType::S16, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->data_type() != rhs->data_type(), "Input tensors must be the same data type");
+
+ // Set the elementwise operation to Add then call the elementwise common is_supported_op
+ ElementwiseBinaryCommonAttributes common_attributes{};
+ common_attributes.operation(ElementwiseBinaryCommonAttributes::ElementwiseOp::Add);
+ return GpuElementwiseBinaryCommon::is_supported_op(context, lhs, rhs, common_attributes);
+}
+
+ITensorInfo *GpuAdd::create_op(GpuWorkloadSketch &sketch, ITensorInfo *lhs, ITensorInfo *rhs)
+{
+ // No need to log or validate as they'll be handled inside GpuElementwiseBinaryCommon::create_op()
+ // Set the elementwise operation to Add then call the elementwise common create_op
+ ElementwiseBinaryCommonAttributes common_attributes{};
+ common_attributes.operation(ElementwiseBinaryCommonAttributes::ElementwiseOp::Add);
+ return GpuElementwiseBinaryCommon::create_op(sketch, lhs, rhs, common_attributes);
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuCast.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuCast.cpp
new file mode 100644
index 0000000000..d25a2a3153
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuCast.cpp
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuCast.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+Status is_supported_op_helper(const GpuWorkloadContext &context,
+ const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const CastAttributes &attributes)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON(src == dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
+
+ TensorInfo dst_info_to_validate;
+ const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate;
+
+ if (dst != nullptr)
+ {
+ dst_info_to_validate_ptr = dst;
+ }
+
+ auto_init_if_empty(dst_info_to_validate, src->clone()->set_data_type(attributes.data_type()));
+
+ // Check support level
+ // Data Type
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst_info_to_validate_ptr, 1, DataType::F16, DataType::F32);
+
+ if (context.gpu_language() == GpuLanguage::OpenCL)
+ {
+ const auto cl_compile_ctx = context.cl_compile_context();
+ ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr);
+ // Validate Cast Component
+ {
+ const auto properties =
+ IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+ auto settings = ClComponentCast::Settings();
+
+ ArgumentPack<ITensorInfo> arguments;
+ arguments.add_const_tensor(ACL_SRC_0, src);
+ arguments.add_const_tensor(ACL_DST_0, dst_info_to_validate_ptr);
+ ARM_COMPUTE_RETURN_ON_ERROR(ClComponentCast::validate(properties, arguments, attributes, settings));
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_MSG("Unimplemented Gpu language");
+ }
+
+ return Status{};
+}
+constexpr GpuOperatorType operator_type = GpuOperatorType::Simple;
+} // namespace
+
+Status
+GpuCast::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const CastAttributes &attributes)
+{
+ return is_supported_op_helper(context, src, nullptr, attributes);
+}
+
+Status GpuCast::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const CastAttributes &attributes)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
+ ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id());
+
+ // Refer to GpuConv2d::validate_op() for id-validness of this TensorInfo object
+ TensorInfo dst_info_to_validate;
+
+ // Auto initialize dst tensor info
+ auto_init_if_empty(dst_info_to_validate, src->clone()->set_data_type(attributes.data_type()));
+
+ // Perform fusion test
+ // Pack tensor infos
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC_0, src);
+ tensors.add_const_tensor(ACL_DST_0, &dst_info_to_validate);
+ const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!sketch.implementation().operator_group().try_add_operator(op),
+ "Operator fusion test failed. This operator cannot be fused into the workload");
+
+ // Check if configuration is supported
+ return is_supported_op_helper(*sketch.gpu_context(), src, &dst_info_to_validate, attributes);
+}
+
+ITensorInfo *GpuCast::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, const CastAttributes &attributes)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src);
+ ARM_COMPUTE_LOG_PARAMS(src, attributes);
+ ARM_COMPUTE_ERROR_THROW_ON(GpuCast::validate_op(sketch, src, attributes));
+
+ ITensorInfo *dst = sketch.implementation().create_virtual_tensor();
+ ARM_COMPUTE_ERROR_ON_NULLPTR(dst);
+
+ // Auto initialize dst tensor info if empty
+ auto_init_if_empty(*dst, src->clone()->set_data_type(attributes.data_type()));
+
+ // Translate into components and add to component graph
+ GpuKernelComponentGraph &comp_graph = sketch.implementation().component_graph();
+ const auto *sketch_ctx = sketch.implementation().context();
+
+ if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+ {
+ ARM_COMPUTE_ERROR_ON(sketch_ctx->cl_compile_context() == nullptr);
+
+        // Add Cast Component
+ {
+ const auto properties =
+ IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+ auto settings = ClComponentCast::Settings();
+
+ ArgumentPack<ITensorInfo> arguments;
+ arguments.add_const_tensor(ACL_SRC_0, src);
+ arguments.add_const_tensor(ACL_DST_0, dst);
+ comp_graph.add_new_component<ClComponentCast>(properties, arguments, attributes, settings);
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unimplemented Gpu language");
+ }
+
+ // Set up fusion test by adding to the Operator Group
+ // Note this has to be performed after all the components have been successfully added to the component graph
+
+ // Pack tensor infos
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC_0, src);
+ tensors.add_const_tensor(ACL_DST_0, dst);
+
+ const Operator op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+ sketch.implementation().operator_group().add_operator(op);
+
+ return dst;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuClamp.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuClamp.cpp
new file mode 100644
index 0000000000..4d6e7f81bb
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuClamp.cpp
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuClamp.h"
+
+#include "arm_compute/core/experimental/Types.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+Status is_supported_op_helper(const GpuWorkloadContext &context,
+ const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const ClampAttributes &attributes)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(attributes.max_val() < attributes.min_val(),
+ "Maximum clamp value cannot be lower than minimum value");
+
+ TensorInfo dst_info_to_validate;
+ const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate;
+
+ if (dst != nullptr)
+ {
+ dst_info_to_validate_ptr = dst;
+ }
+
+ auto_init_if_empty(dst_info_to_validate, *src->clone());
+
+ // CLAMP operator is implemented as LU_BOUNDED_RELU with the alpha and beta variables swapped
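+    // (LU_BOUNDED_RELU computes min(a, max(b, x)), so a = max_val and b = min_val yields
+    // the [min_val, max_val] clamp.)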
+ const ClComponentActivation::Attributes act_info{ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ attributes.max_val(), attributes.min_val()};
+
+ // Check components
+ if (context.gpu_language() == GpuLanguage::OpenCL)
+ {
+ // Validate Activation Component
+ const auto properties =
+ IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+
+ ArgumentPack<ITensorInfo> arguments;
+ arguments.add_const_tensor(ACL_SRC, src);
+ arguments.add_const_tensor(ACL_DST, dst_info_to_validate_ptr);
+ ARM_COMPUTE_RETURN_ON_ERROR(ClComponentActivation::validate(properties, arguments, act_info));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_MSG("Unimplemented Gpu language");
+ }
+ return Status{};
+}
+
+constexpr GpuOperatorType operator_type = GpuOperatorType::Simple;
+} // namespace
+
+Status
+GpuClamp::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const ClampAttributes &attributes)
+{
+ return is_supported_op_helper(context, src, nullptr, attributes);
+}
+
+Status GpuClamp::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const ClampAttributes &attributes)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
+
+    // Check if tensors have valid ids, i.e. they are created from a sketch
+ ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id());
+
+ // Refer to GpuConv2d::validate_op() for id-validness of this TensorInfo object
+ TensorInfo dst_info_to_validate;
+
+ // Auto initialize dst tensor info
+ auto_init_if_empty(dst_info_to_validate, *src->clone());
+
+ // Perform fusion test to check if the operator meets fusion constraints
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC, src);
+ tensors.add_const_tensor(ACL_DST, &dst_info_to_validate);
+ const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!sketch.implementation().operator_group().try_add_operator(op),
+ "Operator fusion test failed. This operator cannot be fused into the workload");
+
+ // Check if configuration is supported
+ return is_supported_op_helper(*sketch.gpu_context(), src, &dst_info_to_validate, attributes);
+}
+
+ITensorInfo *GpuClamp::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, const ClampAttributes &attributes)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src);
+ ARM_COMPUTE_LOG_PARAMS(src, attributes);
+ ARM_COMPUTE_ERROR_THROW_ON(GpuClamp::validate_op(sketch, src, attributes));
+
+ ITensorInfo *dst = sketch.implementation().create_virtual_tensor();
+ ARM_COMPUTE_ERROR_ON_NULLPTR(dst);
+
+ // Auto initialize dst tensor
+ auto_init_if_empty(*dst, *src->clone());
+
+ // Translate into components and add to component graph
+ GpuKernelComponentGraph &comp_graph = sketch.implementation().component_graph();
+
+    // The CLAMP operator is implemented as LU_BOUNDED_RELU, with alpha set to the clamp's maximum value and beta to its minimum value
+ const ClComponentActivation::Attributes act_info{ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ attributes.max_val(), attributes.min_val()};
+
+ const auto *const sketch_ctx = sketch.implementation().context();
+
+ if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+ {
+ // Add Activation Component
+ auto properties = IGpuKernelComponent::Properties();
+ properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+
+ ArgumentPack<ITensorInfo> arguments;
+ arguments.add_const_tensor(ACL_SRC, src);
+ arguments.add_const_tensor(ACL_DST, dst);
+ comp_graph.add_new_component<ClComponentActivation>(properties, arguments, act_info);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unimplemented Gpu language");
+ }
+
+ // Set up fusion test by adding to the Operator Group
+ // Note this has to be performed after all the components have been successfully added to the component graph
+
+ // Pack tensor infos
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC, src);
+ tensors.add_const_tensor(ACL_DST, dst);
+
+ const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+ sketch.implementation().operator_group().add_operator(op);
+
+ return dst;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp
new file mode 100644
index 0000000000..aaeec543f8
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp
@@ -0,0 +1,270 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h"
+
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
+#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+#include "src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h"
+#include "src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+DirectConvComputeKernelInfo
+config_direct_convolution_nhwc(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info)
+{
+ // Get GPU target
+ GPUTarget gpu_target = CLScheduler::get().target();
+
+ std::unique_ptr<arm_compute::cl_direct_conv::IClDirectConvKernelConfig> t =
+ arm_compute::cl_direct_conv::ClDirectConvKernelConfigurationFactory::create(gpu_target);
+
+ return t->configure(src, weights, conv_info);
+}
+
+void calculate_and_init_dst_if_empty(ITensorInfo *dst,
+ const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const Conv2dAttributes &attributes)
+{
+ if (dst->total_size() == 0U)
+ {
+ const auto shape = misc::shape_calculator::compute_deep_convolution_shape(
+ src->tensor_shape(), src->data_layout(), wei->tensor_shape(),
+ PadStrideInfo(attributes.stride().x(), attributes.stride().y(), attributes.pad().left,
+ attributes.pad().right, attributes.pad().top, attributes.pad().bottom,
+ DimensionRoundingType::FLOOR)); // use the default DimensionRoundingType
+
+ auto_init_if_empty(*dst, src->clone()->set_tensor_shape(shape));
+ }
+}
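+
+// Worked example (illustrative only): for an NHWC input of 32x32 pixels with 16 channels, 64
+// filters of size 3x3, stride 1 and padding 1 on every side, compute_deep_convolution_shape()
+// yields a 32x32 output with 64 channels ((32 + 2 * 1 - 3) / 1 + 1 = 32 per spatial dimension),
+// and the batch dimension is carried over unchanged. The dst info is then auto-initialised with
+// that shape and the input's data type and layout.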
+
+/* A helper method to reduce the duplication in dst tensor initialization
+ * when calling validate()
+ */
+Status is_supported_op_helper(const GpuWorkloadContext &context,
+ const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const ITensorInfo *bia,
+ const ITensorInfo *dst,
+ const Conv2dAttributes &attributes)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, wei);
+
+ TensorInfo dst_info_to_validate;
+ const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate;
+
+ if (dst != nullptr)
+ {
+ dst_info_to_validate_ptr = dst;
+ }
+
+ calculate_and_init_dst_if_empty(&dst_info_to_validate, src, wei, attributes);
+
+ // Check support level
+ // Data type
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+ // Data layout
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC);
+
+ // Check components
+ const auto gpu_target = context.gpu_target();
+ if (context.gpu_language() == GpuLanguage::OpenCL)
+ {
+ const auto cl_compile_ctx = context.cl_compile_context();
+ ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr);
+ // Validate Direct Conv2d Component
+ {
+ const auto properties =
+ IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+ auto settings = ClComponentDirectConv2d::Settings();
+
+ settings.fast_relaxed_math(
+ (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) &&
+ (dst_info_to_validate_ptr->data_type() == DataType::F32 ||
+ dst_info_to_validate_ptr->data_type() == DataType::F16));
+
+ ArgumentPack<ITensorInfo> arguments;
+ arguments.add_const_tensor(ACL_SRC_0, src);
+ arguments.add_const_tensor(ACL_SRC_1, wei);
+ arguments.add_const_tensor(ACL_SRC_2, bia);
+ arguments.add_const_tensor(ACL_DST_0, dst_info_to_validate_ptr);
+ ARM_COMPUTE_RETURN_ON_ERROR(ClComponentDirectConv2d::validate(properties, arguments, attributes, settings));
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_MSG("Unimplemented Gpu language");
+ }
+ return Status{};
+}
+
+constexpr GpuOperatorType operator_type = GpuOperatorType::Complex;
+} // namespace
+
+Status GpuConv2d::is_supported_op(const GpuWorkloadContext &context,
+ const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const ITensorInfo *bia,
+ const Conv2dAttributes &attributes)
+{
+ return is_supported_op_helper(context, src, wei, bia, nullptr, attributes);
+}
+
+Status GpuConv2d::validate_op(const GpuWorkloadSketch &sketch,
+ const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const ITensorInfo *bia,
+ const Conv2dAttributes &attributes)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, wei);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!wei->are_values_constant(), "Dynamic weights are not supported");
+
+    // Check if tensors have valid ids, i.e. they are created from a sketch
+ ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id() || !wei->has_valid_id());
+ if (bia != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(!bia->has_valid_id());
+ }
+
+    // This tensor info will have an invalid id, but because all the existing tensors in the
+    // sketch have valid ids and the DependencyGraph implementation has no notion of id
+    // validness, it is treated as just another tensor id and will pass validation.
+    // Additionally, a new dst id is created on every create_op call, so there is no need to validate it here
+ TensorInfo dst_info_to_validate;
+
+ // Auto initialize dst tensor info
+ calculate_and_init_dst_if_empty(&dst_info_to_validate, src, wei, attributes);
+
+ // Perform fusion test
+ // Check if operator meets fusion constraints
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC_0, src);
+ tensors.add_const_tensor(ACL_SRC_1, wei);
+ tensors.add_const_tensor(ACL_SRC_2, bia);
+ tensors.add_const_tensor(ACL_DST_0, &dst_info_to_validate);
+ const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!sketch.implementation().operator_group().try_add_operator(op),
+ "Operator fusion test failed. This operator cannot be fused into the workload");
+
+ // Check if configuration is supported
+ return is_supported_op_helper(*sketch.gpu_context(), src, wei, bia, &dst_info_to_validate, attributes);
+}
+
+ITensorInfo *GpuConv2d::create_op(
+ GpuWorkloadSketch &sketch, ITensorInfo *src, ITensorInfo *wei, ITensorInfo *bia, const Conv2dAttributes &attributes)
+{
+ ARM_COMPUTE_LOG_PARAMS(src, wei, bia, attributes);
+ PadStrideInfo conv_info(attributes.stride().x(), attributes.stride().y(), attributes.pad().left,
+ attributes.pad().right, attributes.pad().top, attributes.pad().bottom,
+ DimensionRoundingType::FLOOR);
+ // Initialize the direct convolution descriptor
+ const DirectConvComputeKernelInfo desc = config_direct_convolution_nhwc(src, wei, conv_info);
+
+ ITensorInfo *dst = sketch.implementation().create_virtual_tensor();
+
+ // Assert validation
+ ARM_COMPUTE_ERROR_THROW_ON(GpuConv2d::validate_op(sketch, src, wei, bia, attributes));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, wei, dst);
+
+ // Auto initialize dst tensor
+ calculate_and_init_dst_if_empty(dst, src, wei, attributes);
+
+ // Translate into components and add to component graph
+ auto &comp_graph = sketch.implementation().component_graph();
+
+ const auto sketch_ctx = sketch.implementation().context();
+
+ const auto gpu_target = sketch_ctx->gpu_target();
+
+ if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+ {
+ const auto cl_compile_ctx = sketch_ctx->cl_compile_context();
+ ARM_COMPUTE_ERROR_ON(cl_compile_ctx == nullptr);
+ ARM_COMPUTE_UNUSED(cl_compile_ctx);
+
+ // Add Direct Conv2d Component
+ {
+ auto properties = IGpuKernelComponent::Properties();
+ properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+
+ auto settings = ClComponentDirectConv2d::Settings();
+
+ settings.fast_relaxed_math(
+ (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) &&
+ (dst->data_type() == DataType::F32 || dst->data_type() == DataType::F16));
+
+ settings.direct_conv_descriptor(desc);
+
+ if (settings.export_to_cl_image())
+ {
+ arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(wei);
+ }
+
+ ArgumentPack<ITensorInfo> arguments;
+ arguments.add_const_tensor(ACL_SRC_0, src);
+ arguments.add_const_tensor(ACL_SRC_1, wei);
+ arguments.add_const_tensor(ACL_SRC_2, bia);
+ arguments.add_const_tensor(ACL_DST_0, dst);
+ comp_graph.add_new_component<ClComponentDirectConv2d>(properties, arguments, attributes, settings);
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unimplemented Gpu language");
+ }
+
+ // Set up fusion test by adding to the Operator Group
+ // Note this has to be performed after all the components have been successfully added to the component graph
+
+ // Pack tensor infos
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC_0, src);
+ tensors.add_const_tensor(ACL_SRC_1, wei);
+ tensors.add_const_tensor(ACL_SRC_2, bia);
+ tensors.add_const_tensor(ACL_DST_0, dst);
+
+ const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+ sketch.implementation().operator_group().add_operator(op);
+
+ return dst;
+}
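+
+// Usage sketch (illustrative only; `context`, `input_info`, `weights_info`, `bias_info`, `conv_attr`
+// and `clamp_attr` are assumed to exist and are not defined in this file):
+//
+//   GpuWorkloadSketch sketch{&context};
+//   ITensorInfo *conv_out = GpuConv2d::create_op(sketch, input_info, weights_info, bias_info, conv_attr);
+//   // conv_out is a virtual (sketch-internal) tensor info; it can feed further fused operators,
+//   // e.g. a clamp, before being written to a real tensor with GpuOutput::create_op().
+//   ITensorInfo *act_out = GpuClamp::create_op(sketch, conv_out, clamp_attr);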
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp
new file mode 100644
index 0000000000..e2b673bd43
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp
@@ -0,0 +1,278 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.h"
+
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
+#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+#include "src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h"
+#include "src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+void calculate_and_init_dst_if_empty(ITensorInfo *dst,
+ const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const DepthwiseConv2dAttributes &attributes)
+{
+ if (dst->total_size() == 0U)
+ {
+ const PadStrideInfo pad_stride_info(attributes.stride().x(), attributes.stride().y(), attributes.pad().left,
+ attributes.pad().right, attributes.pad().top, attributes.pad().bottom,
+ attributes.dimension_rounding_type());
+
+ const ConvolutionInfo conv_info{pad_stride_info, attributes.depth_multiplier(), ActivationLayerInfo(),
+ attributes.dilation()};
+ const TensorShape shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *wei, conv_info);
+
+ auto_init_if_empty(*dst, src->clone()->set_tensor_shape(shape));
+ }
+}
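+
+// Worked example (illustrative only): a depthwise convolution does not mix channels, so the output
+// channel count is input_channels * depth_multiplier, e.g. 16 input channels with a depth
+// multiplier of 2 give 32 output channels, while the spatial dimensions follow the usual
+// stride/padding/dilation arithmetic captured in pad_stride_info above.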
+
+/* A helper method to reduce the duplication in dst tensor initialization
+ * when calling validate()
+ */
+Status is_supported_op_helper(const GpuWorkloadContext &context,
+ const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const ITensorInfo *bia,
+ const ITensorInfo *dst,
+ const DepthwiseConv2dAttributes &attributes)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, wei);
+
+ TensorInfo dst_info_to_validate;
+ const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate;
+
+ if (dst != nullptr)
+ {
+ dst_info_to_validate_ptr = dst;
+ }
+
+ calculate_and_init_dst_if_empty(&dst_info_to_validate, src, wei, attributes);
+
+ // Check support level
+ // Data type
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+ // Data layout
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC);
+
+ const GpuTarget gpu_target = context.gpu_target();
+
+ if (context.gpu_language() == GpuLanguage::OpenCL)
+ {
+ const CLCompileContext *cl_compile_ctx = context.cl_compile_context();
+ ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr);
+
+ // Validate Depthwise Conv2d Component
+ {
+ const auto properties =
+ IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+ auto settings = ClComponentDepthwiseConv2d::Settings();
+
+ const PadStrideInfo legacy_conv_info(attributes.stride().x(), attributes.stride().y(),
+ attributes.pad().left, attributes.pad().right, attributes.pad().top,
+ attributes.pad().bottom, DimensionRoundingType::FLOOR);
+
+ // Get the depthwise convolution compute parameters
+ auto t = arm_compute::cl_dwc::ClDWCNativeKernelConfigurationFactory::create(gpu_target);
+ const DWCComputeKernelInfo dwc_info =
+ t->configure(src, wei, legacy_conv_info, attributes.dilation(), attributes.depth_multiplier());
+
+ settings.fast_relaxed_math(
+ (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) &&
+ (dst_info_to_validate_ptr->data_type() == DataType::F32 ||
+ dst_info_to_validate_ptr->data_type() == DataType::F16));
+
+            settings.is_fma_available(get_arch_from_target(gpu_target) != GPUTarget::MIDGARD)
+ .m0(dwc_info.m0)
+ .n0(dwc_info.n0)
+ .export_input_to_cl_image(dwc_info.export_input_to_cl_image)
+ .export_weights_to_cl_image(dwc_info.export_weights_to_cl_image);
+
+ ArgumentPack<ITensorInfo> arguments;
+ arguments.add_const_tensor(ACL_SRC_0, src);
+ arguments.add_const_tensor(ACL_SRC_1, wei);
+ arguments.add_const_tensor(ACL_SRC_2, bia);
+ arguments.add_const_tensor(ACL_DST_0, dst_info_to_validate_ptr);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ ClComponentDepthwiseConv2d::validate(properties, arguments, attributes, settings));
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_MSG("Unimplemented Gpu language");
+ }
+
+ return Status{};
+}
+
+constexpr GpuOperatorType operator_type = GpuOperatorType::Complex;
+} // namespace
+
+Status GpuDepthwiseConv2d::is_supported_op(const GpuWorkloadContext &context,
+ const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const ITensorInfo *bia,
+ const DepthwiseConv2dAttributes &attributes)
+{
+ return is_supported_op_helper(context, src, wei, bia, nullptr, attributes);
+}
+
+Status GpuDepthwiseConv2d::validate_op(const GpuWorkloadSketch &sketch,
+ const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const ITensorInfo *bia,
+ const DepthwiseConv2dAttributes &attributes)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, wei);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!wei->are_values_constant(), "Dynamic weights are not supported");
+
+ ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id() || !wei->has_valid_id());
+
+ if (bia != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(!bia->has_valid_id());
+ }
+
+ // Refer to GpuConv2d::validate_op() for id-validness of this TensorInfo object
+ TensorInfo dst_info_to_validate;
+
+ // Auto initialize dst tensor info
+ calculate_and_init_dst_if_empty(&dst_info_to_validate, src, wei, attributes);
+
+ // Perform fusion test
+ // Pack tensor infos
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC_0, src);
+ tensors.add_const_tensor(ACL_SRC_1, wei);
+ tensors.add_const_tensor(ACL_SRC_2, bia);
+ tensors.add_const_tensor(ACL_DST_0, &dst_info_to_validate);
+ const Operator op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!sketch.implementation().operator_group().try_add_operator(op),
+ "Operator fusion test failed. This operator cannot be fused into the workload");
+
+ // Check if configuration is supported
+ return is_supported_op_helper(*sketch.gpu_context(), src, wei, bia, &dst_info_to_validate, attributes);
+}
+
+ITensorInfo *GpuDepthwiseConv2d::create_op(GpuWorkloadSketch &sketch,
+ ITensorInfo *src,
+ ITensorInfo *wei,
+ ITensorInfo *bia,
+ const DepthwiseConv2dAttributes &attributes)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, wei);
+ ARM_COMPUTE_LOG_PARAMS(src, wei, bia, attributes);
+ ARM_COMPUTE_ERROR_THROW_ON(GpuDepthwiseConv2d::validate_op(sketch, src, wei, bia, attributes));
+
+ ITensorInfo *dst = sketch.implementation().create_virtual_tensor();
+ ARM_COMPUTE_ERROR_ON_NULLPTR(dst);
+
+ calculate_and_init_dst_if_empty(dst, src, wei, attributes);
+
+ // Translate into components and add to component graph
+ GpuKernelComponentGraph &comp_graph = sketch.implementation().component_graph();
+ const auto *sketch_ctx = sketch.implementation().context();
+ const GpuTarget gpu_target = sketch_ctx->gpu_target();
+
+ if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR(sketch_ctx->cl_compile_context());
+
+ // Add Depthwise Conv2d Component
+ {
+ const auto properties =
+ IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+ auto settings = ClComponentDepthwiseConv2d::Settings();
+
+ const PadStrideInfo legacy_conv_info(attributes.stride().x(), attributes.stride().y(),
+ attributes.pad().left, attributes.pad().right, attributes.pad().top,
+ attributes.pad().bottom, DimensionRoundingType::FLOOR);
+
+ // Get the depthwise convolution compute parameters
+ auto t = arm_compute::cl_dwc::ClDWCNativeKernelConfigurationFactory::create(gpu_target);
+ const DWCComputeKernelInfo dwc_info =
+ t->configure(src, wei, legacy_conv_info, attributes.dilation(), attributes.depth_multiplier());
+
+ settings.is_fma_available(get_arch_from_target(gpu_target) != GPUTarget::MIDGARD)
+ .m0(dwc_info.m0)
+ .n0(dwc_info.n0)
+ .export_input_to_cl_image(dwc_info.export_input_to_cl_image)
+ .export_weights_to_cl_image(dwc_info.export_weights_to_cl_image);
+
+ if (settings.export_input_to_cl_image())
+ {
+ arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(src);
+ }
+
+ if (settings.export_weights_to_cl_image())
+ {
+ arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(wei);
+ }
+
+ ArgumentPack<ITensorInfo> arguments;
+ arguments.add_const_tensor(ACL_SRC_0, src);
+ arguments.add_const_tensor(ACL_SRC_1, wei);
+ arguments.add_const_tensor(ACL_SRC_2, bia);
+ arguments.add_const_tensor(ACL_DST_0, dst);
+ comp_graph.add_new_component<ClComponentDepthwiseConv2d>(properties, arguments, attributes, settings);
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unimplemented Gpu language");
+ }
+
+ // Set up fusion test by adding to the Operator Group
+ // Note this has to be performed after all the components have been successfully added to the component graph
+
+ // Pack tensor infos
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC_0, src);
+ tensors.add_const_tensor(ACL_SRC_1, wei);
+ tensors.add_const_tensor(ACL_SRC_2, bia);
+ tensors.add_const_tensor(ACL_DST_0, dst);
+
+ const Operator op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+ sketch.implementation().operator_group().add_operator(op);
+
+ return dst;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuMatMul.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuMatMul.cpp
new file mode 100644
index 0000000000..2997b28ec1
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuMatMul.cpp
@@ -0,0 +1,245 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMatMul.h"
+
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+void calculate_and_init_dst_if_empty(ITensorInfo *dst,
+ const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const MatMulAttributes &attributes,
+ const GpuMatMulSettings &settings)
+{
+ if (dst->total_size() == 0U)
+ {
+ const auto dst_shape = misc::shape_calculator::compute_matmul_shape(
+ lhs->tensor_shape(), rhs->tensor_shape(),
+ MatMulKernelInfo(attributes.adj_lhs(), attributes.adj_rhs(), settings.m0(), settings.n0(), settings.k0()));
+
+ auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(dst_shape));
+ }
+}
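+
+// Worked example (illustrative only): with adj_lhs = adj_rhs = false, an lhs of shape [M, K] and an
+// rhs of shape [K, N] produce a dst of shape [M, N]; any batch dimensions of lhs are carried over.
+// The m0/n0/k0 values in GpuMatMulSettings only tune the kernel tiling and do not affect the
+// output shape.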
+
+/* A helper method to reduce the duplication in dst tensor initialization
+ * when calling validate()
+ */
+Status is_supported_op_helper(const GpuWorkloadContext &context,
+ const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *dst,
+ const MatMulAttributes &attributes,
+ const GpuMatMulSettings &settings)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs);
+
+ TensorInfo dst_info_to_validate;
+ const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate;
+
+ if (dst != nullptr)
+ {
+ dst_info_to_validate_ptr = dst;
+ }
+
+ calculate_and_init_dst_if_empty(&dst_info_to_validate, lhs, rhs, attributes, settings);
+
+ // Check support level
+ // Data type
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32);
+
+ // Check components
+ if (context.gpu_language() == GpuLanguage::OpenCL)
+ {
+ const auto cl_compile_ctx = context.cl_compile_context();
+ ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr);
+ // Validate MatMul Component
+ {
+ const auto properties =
+ IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+
+ ArgumentPack<ITensorInfo> arguments;
+ arguments.add_const_tensor(ACL_SRC_0, lhs);
+ arguments.add_const_tensor(ACL_SRC_1, rhs);
+ arguments.add_const_tensor(ACL_DST_0, dst_info_to_validate_ptr);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(ClComponentMatMul::validate(properties, arguments, attributes, settings));
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_MSG("Unimplemented Gpu language");
+ }
+ return Status{};
+}
+
+constexpr GpuOperatorType operator_type = GpuOperatorType::Complex;
+} // namespace
+
+int GpuMatMulSettings::n0() const
+{
+ return _n0;
+}
+
+GpuMatMulSettings &GpuMatMulSettings::n0(int n0)
+{
+ _n0 = n0;
+ return *this;
+}
+
+int GpuMatMulSettings::m0() const
+{
+ return _m0;
+}
+
+GpuMatMulSettings &GpuMatMulSettings::m0(int m0)
+{
+ _m0 = m0;
+ return *this;
+}
+
+int GpuMatMulSettings::k0() const
+{
+ return _k0;
+}
+
+GpuMatMulSettings &GpuMatMulSettings::k0(int k0)
+{
+ _k0 = k0;
+ return *this;
+}
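+
+// The setters above return *this, so the settings can be built fluently. A minimal sketch
+// (the tile sizes below are purely illustrative, not recommended values):
+//
+//   GpuMatMulSettings settings{};
+//   settings.m0(4).n0(4).k0(4);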
+
+Status GpuMatMul::is_supported_op(const GpuWorkloadContext &context,
+ const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const MatMulAttributes &attributes,
+ const GpuMatMulSettings &settings)
+{
+ return is_supported_op_helper(context, lhs, rhs, nullptr, attributes, settings);
+}
+
+Status GpuMatMul::validate_op(const GpuWorkloadSketch &sketch,
+ const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const MatMulAttributes &attributes,
+ const GpuMatMulSettings &settings)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs);
+
+    // Check if tensors have valid ids, i.e. they are created from a sketch
+ ARM_COMPUTE_RETURN_ERROR_ON(!lhs->has_valid_id() || !rhs->has_valid_id());
+
+    // Refer to GpuConv2d::validate_op() for id-validness of this TensorInfo object
+ TensorInfo dst_info_to_validate;
+
+ // Auto initialize dst tensor info
+ calculate_and_init_dst_if_empty(&dst_info_to_validate, lhs, rhs, attributes, settings);
+
+ // Perform fusion test
+ // Check if operator meets fusion constraints
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC_0, lhs);
+ tensors.add_const_tensor(ACL_SRC_1, rhs);
+ tensors.add_const_tensor(ACL_DST_0, &dst_info_to_validate);
+ const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!sketch.implementation().operator_group().try_add_operator(op),
+ "Operator fusion test failed. This operator cannot be fused into the workload");
+
+ // Check if configuration is supported
+ return is_supported_op_helper(*sketch.gpu_context(), lhs, rhs, &dst_info_to_validate, attributes, settings);
+}
+
+ITensorInfo *GpuMatMul::create_op(GpuWorkloadSketch &sketch,
+ ITensorInfo *lhs,
+ ITensorInfo *rhs,
+ const Attributes &attributes,
+ const Settings &settings)
+{
+ ARM_COMPUTE_LOG_PARAMS(lhs, rhs, attributes, settings);
+
+ ITensorInfo *dst = sketch.implementation().create_virtual_tensor();
+
+ // Assert validation
+ ARM_COMPUTE_ERROR_THROW_ON(GpuMatMul::validate_op(sketch, lhs, rhs, attributes, settings));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
+
+ // Auto initialize dst tensor
+ calculate_and_init_dst_if_empty(dst, lhs, rhs, attributes, settings);
+
+ // Translate into components and add to component graph
+ auto &comp_graph = sketch.implementation().component_graph();
+ const auto sketch_ctx = sketch.implementation().context();
+
+ if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+ {
+ auto properties = IGpuKernelComponent::Properties();
+ properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+
+ ArgumentPack<ITensorInfo> arguments;
+ arguments.add_const_tensor(ACL_SRC_0, lhs);
+ arguments.add_const_tensor(ACL_SRC_1, rhs);
+ arguments.add_const_tensor(ACL_DST_0, dst);
+ comp_graph.add_new_component<ClComponentMatMul>(properties, arguments, attributes, settings);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unimplemented Gpu language");
+ }
+
+ // Set up fusion test by adding to the Operator Group
+ // Note this has to be performed after all the components have been successfully added to the component graph
+
+ // Pack tensor infos
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC_0, lhs);
+ tensors.add_const_tensor(ACL_SRC_1, rhs);
+ tensors.add_const_tensor(ACL_DST_0, dst);
+
+ const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+ sketch.implementation().operator_group().add_operator(op);
+
+ return dst;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuMul.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuMul.cpp
new file mode 100644
index 0000000000..b871171e8d
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuMul.cpp
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMul.h"
+
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+
+#include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+Status GpuMul::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *lhs, const ITensorInfo *rhs)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->data_type() != rhs->data_type(), "Input tensors must have the same data type");
+
+ // Set the elementwise operation to Mul then call the elementwise common validate_op
+ ElementwiseBinaryCommonAttributes common_attributes{};
+ common_attributes.operation(ElementwiseBinaryCommonAttributes::ElementwiseOp::Mul);
+ return GpuElementwiseBinaryCommon::validate_op(sketch, lhs, rhs, common_attributes);
+}
+
+Status GpuMul::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *lhs, const ITensorInfo *rhs)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->data_type() != rhs->data_type(), "Input tensors must have the same data type");
+
+ // Set the elementwise operation to Mul then call the elementwise common is_supported_op
+ ElementwiseBinaryCommonAttributes common_attributes{};
+ common_attributes.operation(ElementwiseBinaryCommonAttributes::ElementwiseOp::Mul);
+ return GpuElementwiseBinaryCommon::is_supported_op(context, lhs, rhs, common_attributes);
+}
+
+ITensorInfo *GpuMul::create_op(GpuWorkloadSketch &sketch, ITensorInfo *lhs, ITensorInfo *rhs)
+{
+ // Set the elementwise operation to Mul then call the elementwise common create_op
+ ElementwiseBinaryCommonAttributes common_attributes{};
+ common_attributes.operation(ElementwiseBinaryCommonAttributes::ElementwiseOp::Mul);
+ return GpuElementwiseBinaryCommon::create_op(sketch, lhs, rhs, common_attributes);
+}
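+
+// GpuMul is a thin wrapper over GpuElementwiseBinaryCommon: it only selects ElementwiseOp::Mul and
+// forwards the call. Other binary operators follow the same pattern with a different enum value,
+// e.g. (illustrative sketch, assuming an Add entry exists in ElementwiseOp):
+//
+//   ElementwiseBinaryCommonAttributes attr{};
+//   attr.operation(ElementwiseBinaryCommonAttributes::ElementwiseOp::Add);
+//   return GpuElementwiseBinaryCommon::create_op(sketch, lhs, rhs, attr);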
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp
new file mode 100644
index 0000000000..f0d368d757
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
+#include "src/dynamic_fusion/utils/Utils.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+constexpr GpuOperatorType operator_type = GpuOperatorType::Simple;
+} // namespace
+
+Status GpuOutput::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+
+ // Initialize the destination tensor info.
+ TensorInfo dst_to_validate = *dst;
+ auto_init_if_empty(dst_to_validate, *src);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, &dst_to_validate);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, &dst_to_validate);
+
+ ARM_COMPUTE_UNUSED(context);
+ return Status{};
+}
+
+Status GpuOutput::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id());
+ ARM_COMPUTE_RETURN_ERROR_ON(!is_alloc_tensor(dst));
+
+ // Initialize the destination tensor info.
+ TensorInfo dst_to_validate = *dst;
+ auto_init_if_empty(dst_to_validate, *src);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, &dst_to_validate);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, &dst_to_validate);
+
+ // Perform fusion test.
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC_0, src);
+ tensors.add_const_tensor(ACL_DST_0, &dst_to_validate);
+
+ const auto group = sketch.implementation().operator_group();
+ const auto op = group.new_operator(operator_type, tensors);
+ const auto success = group.try_add_operator(op, true);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!success, "This operator cannot be fused into the workload.");
+
+ const auto status = is_supported_op(*sketch.gpu_context(), src, dst);
+ return status;
+}
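+
+// Note on the is_alloc_tensor() check above: unlike other operators, GpuOutput writes to a
+// user-provided (allocatable) tensor info rather than a sketch-internal virtual tensor, so dst
+// must not come from create_virtual_tensor(). A minimal sketch (create_tensor_info() is assumed
+// from the public GpuWorkloadContext API and shown for illustration only):
+//
+//   ITensorInfo *dst_info = context.create_tensor_info(); // user-visible tensor, auto-initialised by create_op if empty
+//   GpuOutput::create_op(sketch, conv_out, dst_info);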
+
+void GpuOutput::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, ITensorInfo *dst)
+{
+ ARM_COMPUTE_LOG_PARAMS(src, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(GpuOutput::validate_op(sketch, src, dst));
+
+ // Auto initialize dst tensor info if empty
+ auto_init_if_empty(*dst, *src);
+
+ // Translate into components and add to component graph
+ auto &comp_graph = sketch.implementation().component_graph();
+ const auto sketch_ctx = sketch.implementation().context();
+
+ if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+ {
+ ARM_COMPUTE_ERROR_ON(sketch_ctx->cl_compile_context() == nullptr);
+
+ // Add store component
+ {
+ IGpuKernelComponent::Properties properties;
+ properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+
+ ArgumentPack<ITensorInfo> arguments;
+ arguments.add_const_tensor(ACL_SRC_0, src);
+ arguments.add_const_tensor(ACL_DST_0, dst);
+ comp_graph.add_new_component<ClComponentStore>(properties, arguments);
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unimplemented Gpu language");
+ }
+
+ // Set up fusion test by adding to the Operator Group
+ // Note this has to be performed after all the components have been successfully added to the component graph
+
+ // Pack tensor infos
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC_0, src);
+ tensors.add_const_tensor(ACL_DST_0, dst);
+
+ const Operator op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+ sketch.implementation().operator_group().add_operator(op, true);
+}
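+
+// End-to-end usage sketch for a fused workload (names are illustrative; the runtime calls are
+// assumptions based on the ClWorkloadRuntime API and are not defined in this file):
+//
+//   GpuWorkloadSketch sketch{&context};
+//   ITensorInfo *ans = GpuConv2d::create_op(sketch, input_info, weights_info, bias_info, conv_attr);
+//   GpuOutput::create_op(sketch, ans, output_info); // ends the fused sequence at a real tensor
+//
+//   ClWorkloadRuntime runtime;
+//   runtime.configure(sketch);
+//   runtime.run({&input, &weights, &bias, &output});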
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuPool2d.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuPool2d.cpp
new file mode 100644
index 0000000000..2d04f75610
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuPool2d.cpp
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h"
+
+#include "arm_compute/core/CL/CLCompileContext.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h"
+#include "src/dynamic_fusion/utils/Utils.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+void calculate_and_init_dst_if_empty(ITensorInfo *dst,
+ const ITensorInfo *src,
+ const Pool2dAttributes &attributes,
+ const GpuPool2dSettings &settings)
+{
+ ARM_COMPUTE_UNUSED(settings);
+
+ if (dst->total_size() == 0U)
+ {
+ auto shape = misc::shape_calculator::compute_pool_shape(
+ *src, convert_pool_attr_to_pool_info(attributes, /* mixed_precision */ true));
+ auto_init_if_empty(*dst, src->clone()->set_tensor_shape(shape));
+ }
+}
+
+constexpr GpuOperatorType operator_type = GpuOperatorType::Complex;
+} // namespace
+
+GpuPool2dSettings GpuPool2dSettings::use_inf_as_limit(bool use_inf_as_limit)
+{
+ _use_inf_as_limit = use_inf_as_limit;
+ return *this;
+}
+
+bool GpuPool2dSettings::use_inf_as_limit() const
+{
+ return _use_inf_as_limit;
+}
+
+Status GpuPool2d::validate_op(const GpuWorkloadSketch &sketch,
+ const ITensorInfo *src,
+ const Pool2dAttributes &attributes,
+ const GpuPool2dSettings &settings)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
+ ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id());
+
+ // Auto initialize dst tensor info
+ TensorInfo dst_info_to_validate;
+
+ calculate_and_init_dst_if_empty(&dst_info_to_validate, src, attributes, settings);
+
+ // Perform fusion test
+ // Pack tensor infos
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC_0, src);
+ tensors.add_const_tensor(ACL_DST_0, &dst_info_to_validate);
+
+ const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!sketch.implementation().operator_group().try_add_operator(op),
+ "Operator fusion test failed. This operator cannot be fused into the workload");
+
+ // Check if configuration is supported
+ return is_supported_op(*sketch.gpu_context(), src, attributes, settings);
+}
+
+Status GpuPool2d::is_supported_op(const GpuWorkloadContext &context,
+ const ITensorInfo *src,
+ const Pool2dAttributes &attributes,
+ const GpuPool2dSettings &settings)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
+ // Data type
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+ // Data layout
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC);
+    // Exclude padding must be enabled
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!attributes.exclude_padding(),
+ "Exclude padding must be set to true in Attributes!");
+
+ // Auto initialize dst tensor info
+ TensorInfo dst_info_to_validate;
+
+ calculate_and_init_dst_if_empty(&dst_info_to_validate, src, attributes, settings);
+
+ // Check components
+ if (context.gpu_language() == GpuLanguage::OpenCL)
+ {
+ const auto cl_compile_ctx = context.cl_compile_context();
+ ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr);
+
+ // Validate Component
+ {
+ const KernelProperties properties =
+ IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+
+ ArgumentPack<ITensorInfo> arguments;
+ arguments.add_const_tensor(ACL_SRC_0, src);
+ arguments.add_const_tensor(ACL_DST_0, &dst_info_to_validate);
+ ARM_COMPUTE_RETURN_ON_ERROR(ClComponentPool2d::validate(properties, arguments, attributes, settings));
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_MSG("Unimplemented Gpu language");
+ }
+ return Status{};
+}
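+
+// A minimal attribute sketch for the checks above (the fluent setters on Pool2dAttributes are
+// assumed from the attributes header; the values are illustrative):
+//
+//   Pool2dAttributes pool_attr{};
+//   pool_attr.pool_type(PoolingType::MAX).pool_size(Size2D(2, 2)).stride(Size2D(2, 2));
+//   pool_attr.exclude_padding(true); // required: is_supported_op() rejects exclude_padding == false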
+
+ITensorInfo *GpuPool2d::create_op(GpuWorkloadSketch &sketch,
+ ITensorInfo *src,
+ const Pool2dAttributes &attributes,
+ const GpuPool2dSettings &settings)
+{
+ // Assert validation
+ ARM_COMPUTE_ERROR_THROW_ON(GpuPool2d::validate_op(sketch, src, attributes, settings));
+ ARM_COMPUTE_LOG_PARAMS(src, attributes, settings);
+
+ ITensorInfo *dst = sketch.implementation().create_virtual_tensor();
+ ARM_COMPUTE_ERROR_ON_NULLPTR(dst);
+
+ // Auto initialize dst tensor
+ calculate_and_init_dst_if_empty(dst, src, attributes, settings);
+
+ // Translate into components and add to component graph
+ auto &comp_graph = sketch.implementation().component_graph();
+
+ const auto sketch_ctx = sketch.implementation().context();
+
+ if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+ {
+ const auto cl_compile_ctx = sketch_ctx->cl_compile_context();
+ ARM_COMPUTE_UNUSED(cl_compile_ctx);
+ ARM_COMPUTE_ERROR_ON(cl_compile_ctx == nullptr);
+
+ // Add Component
+ {
+ auto properties = IGpuKernelComponent::Properties();
+ properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+
+ ArgumentPack<ITensorInfo> arguments;
+ arguments.add_const_tensor(ACL_SRC_0, src);
+ arguments.add_const_tensor(ACL_DST_0, dst);
+ comp_graph.add_new_component<ClComponentPool2d>(properties, arguments, attributes, settings);
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unimplemented Gpu language");
+ }
+
+ // Set up fusion test by adding to the Operator Group
+ // Note this has to be performed after all the components have been successfully added to the component graph
+
+ // Pack tensor infos
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC_0, src);
+ tensors.add_tensor(ACL_DST_0, dst);
+
+ const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+ sketch.implementation().operator_group().add_operator(op);
+
+ return dst;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuReshape.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuReshape.cpp
new file mode 100644
index 0000000000..0e1f16e8ff
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuReshape.cpp
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuReshape.h"
+
+#include "arm_compute/core/Error.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+Status is_supported_op_helper(const GpuWorkloadContext &context,
+ const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const ReshapeAttributes &attributes)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
+
+ TensorInfo dst_info_to_validate;
+ const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate;
+
+ if (dst != nullptr)
+ {
+ dst_info_to_validate_ptr = dst;
+ }
+
+ auto_init_if_empty(dst_info_to_validate, src->clone()->set_tensor_shape(attributes.shape()));
+
+ // Check components
+ if (context.gpu_language() == GpuLanguage::OpenCL)
+ {
+ const auto cl_compile_ctx = context.cl_compile_context();
+ ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr);
+
+ // Validate GpuReshape Component
+ ArgumentPack<ITensorInfo> arguments;
+ arguments.add_const_tensor(ACL_SRC_0, src);
+ arguments.add_const_tensor(ACL_DST_0, dst_info_to_validate_ptr);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(ClComponentReshape::validate(arguments));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_MSG("Unimplemented Gpu language");
+ }
+
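+    // Note: regardless of the component validation above, this helper currently reports GpuReshape
+    // as unsupported by returning an error status, so is_supported_op() and validate_op() always fail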
+ return Status{ErrorCode::RUNTIME_ERROR, "GpuReshape is not Supported"};
+}
+
+constexpr GpuOperatorType operator_type = GpuOperatorType::Complex;
+} // namespace
+
+Status
+GpuReshape::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const Attributes &attributes)
+{
+ return is_supported_op_helper(context, src, nullptr, attributes);
+}
+
+Status GpuReshape::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const Attributes &attributes)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
+ ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id());
+
+ // Refer to GpuConv2d::validate_op() for id-validness of this TensorInfo object
+ TensorInfo dst_info_to_validate;
+
+ // Auto initialize dst tensor info
+ auto_init_if_empty(dst_info_to_validate, src->clone()->set_tensor_shape(attributes.shape()));
+
+ // Perform fusion test
+ // Pack tensor infos
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC_0, src);
+ tensors.add_const_tensor(ACL_DST_0, &dst_info_to_validate);
+ const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!sketch.implementation().operator_group().try_add_operator(op),
+ "Operator fusion test failed. This operator cannot be fused into the workload");
+
+ // Check if configuration is supported
+ return is_supported_op_helper(*sketch.gpu_context(), src, &dst_info_to_validate, attributes);
+}
+
+ITensorInfo *GpuReshape::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, const Attributes &attributes)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src);
+ ARM_COMPUTE_LOG_PARAMS(src, attributes.shape());
+ ARM_COMPUTE_ERROR_THROW_ON(GpuReshape::validate_op(sketch, src, attributes));
+
+ ITensorInfo *dst = sketch.implementation().create_virtual_tensor();
+ ARM_COMPUTE_ERROR_ON_NULLPTR(dst);
+
+ auto_init_if_empty(*dst, src->clone()->set_tensor_shape(attributes.shape()));
+
+ // Translate into components and add to component graph
+ auto &comp_graph = sketch.implementation().component_graph();
+ const auto sketch_ctx = sketch.implementation().context();
+ if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+ {
+ const auto cl_compile_ctx = sketch_ctx->cl_compile_context();
+ ARM_COMPUTE_UNUSED(cl_compile_ctx);
+ ARM_COMPUTE_ERROR_ON(cl_compile_ctx == nullptr);
+
+        // Add Reshape Component
+ {
+ auto properties = IGpuKernelComponent::Properties();
+ properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+
+ ArgumentPack<ITensorInfo> arguments;
+ arguments.add_const_tensor(ACL_SRC_0, src);
+ arguments.add_const_tensor(ACL_DST_0, dst);
+ comp_graph.add_new_component<ClComponentReshape>(properties, arguments);
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unimplemented Gpu language");
+ }
+ // Set up fusion test by adding to the Operator Group
+ // Note this has to be performed after all the components have been successfully added to the component graph
+
+ // Pack tensor infos
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC_0, src);
+ tensors.add_tensor(ACL_DST_0, dst);
+
+ const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+ sketch.implementation().operator_group().add_operator(op);
+
+ return dst;
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuResize.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuResize.cpp
new file mode 100644
index 0000000000..8e794c88b2
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuResize.cpp
@@ -0,0 +1,195 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuResize.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+void calculate_and_init_dst_if_empty(ITensorInfo *dst, const ITensorInfo *src, const ResizeAttributes &attributes)
+{
+ if (dst->total_size() == 0U)
+ {
+ TensorShape out_shape = src->tensor_shape();
+
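+ // With the NHWC layout required by this operator, index 1 is the width and index 2 the height dimension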
+ out_shape.set(1, attributes.output_width());
+ out_shape.set(2, attributes.output_height());
+
+ auto_init_if_empty(*dst, src->clone()->set_tensor_shape(out_shape));
+ }
+}
+
+Status is_supported_op_helper(const GpuWorkloadContext &context,
+ const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const ResizeAttributes &attributes)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
+ TensorInfo dst_info_to_validate;
+ const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate;
+
+ if (dst != nullptr)
+ {
+ dst_info_to_validate_ptr = dst;
+ }
+
+ calculate_and_init_dst_if_empty(&dst_info_to_validate, src, attributes);
+
+ // Check support level
+ // Data type
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+ // Data layout
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC);
+ // Interpolation policy
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(attributes.interpolation_policy() != InterpolationPolicy::NEAREST_NEIGHBOR &&
+ attributes.interpolation_policy() != InterpolationPolicy::BILINEAR,
+ "Interpolation policy must be NEAREST_NEIGHBOR or BILINEAR");
+
+ // Check components
+ if (context.gpu_language() == GpuLanguage::OpenCL)
+ {
+ const auto cl_compile_ctx = context.cl_compile_context();
+ ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr);
+
+ // Validate Resize Component
+ {
+ const KernelProperties properties =
+ IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+
+ ArgumentPack<ITensorInfo> arguments;
+ arguments.add_const_tensor(ACL_SRC_0, src);
+ arguments.add_const_tensor(ACL_DST_0, dst_info_to_validate_ptr);
+ ARM_COMPUTE_RETURN_ON_ERROR(ClComponentResize::validate(properties, arguments, attributes));
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_MSG("Unimplemented Gpu language");
+ }
+
+ return Status{};
+}
+
+constexpr GpuOperatorType operator_type = GpuOperatorType::Complex;
+} // namespace
+
+Status
+GpuResize::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const Attributes &attributes)
+{
+ return is_supported_op_helper(context, src, nullptr, attributes);
+}
+
+Status
+GpuResize::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const GpuResize::Attributes &attributes)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
+ ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id());
+
+ // Refer to GpuConv2d::validate_op() for id-validness of this TensorInfo object
+ TensorInfo dst_info_to_validate;
+
+ // Auto initialize dst tensor info
+ calculate_and_init_dst_if_empty(&dst_info_to_validate, src, attributes);
+
+ // Perform fusion test
+ // Pack tensor infos
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC_0, src);
+ tensors.add_const_tensor(ACL_DST_0, &dst_info_to_validate);
+ const Operator op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!sketch.implementation().operator_group().try_add_operator(op),
+ "Operator fusion test failed. This operator cannot be fused into the workload");
+
+ // Check if configuration is supported
+ return is_supported_op_helper(*sketch.gpu_context(), src, &dst_info_to_validate, attributes);
+}
+
+ITensorInfo *GpuResize::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, const GpuResize::Attributes &attributes)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src);
+ ARM_COMPUTE_LOG_PARAMS(src, attributes);
+ ARM_COMPUTE_ERROR_THROW_ON(GpuResize::validate_op(sketch, src, attributes));
+
+ ITensorInfo *dst = sketch.implementation().create_virtual_tensor();
+ ARM_COMPUTE_ERROR_ON_NULLPTR(dst);
+
+ // Auto initialize dst tensor info if empty
+ calculate_and_init_dst_if_empty(dst, src, attributes);
+
+ // Translate into components and add to component graph
+ GpuKernelComponentGraph &comp_graph = sketch.implementation().component_graph();
+ const auto *sketch_ctx = sketch.implementation().context();
+
+ if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR(sketch_ctx->cl_compile_context());
+
+ // Add Resize Component
+ {
+ const auto properties =
+ IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+
+ ArgumentPack<ITensorInfo> arguments;
+ arguments.add_const_tensor(ACL_SRC_0, src);
+ arguments.add_const_tensor(ACL_DST_0, dst);
+ comp_graph.add_new_component<ClComponentResize>(properties, arguments, attributes);
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unimplemented Gpu language");
+ }
+
+ // Set up fusion test by adding to the Operator Group
+ // Note this has to be performed after all the components have been successfully added to the component graph
+
+ // Pack tensor infos
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC_0, src);
+ tensors.add_const_tensor(ACL_DST_0, dst);
+
+ const Operator op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+ sketch.implementation().operator_group().add_operator(op);
+
+ return dst;
+}
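+
+// Illustrative attribute set-up only (assuming the usual fluent setters matching the accessors
+// used above; not part of the compiled operator):
+//
+//   ResizeAttributes attributes;
+//   attributes.output_width(out_w).output_height(out_h).interpolation_policy(InterpolationPolicy::BILINEAR);
+//   ITensorInfo *dst_info = GpuResize::create_op(sketch, src_info, attributes);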
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.cpp
new file mode 100644
index 0000000000..a2260c8c36
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.cpp
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.h"
+
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+Status is_supported_op_helper(const GpuWorkloadContext &context, const ITensorInfo *src, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+
+ TensorInfo dst_info_to_validate;
+ const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate;
+
+ if (dst != nullptr)
+ {
+ dst_info_to_validate_ptr = dst;
+ }
+
+ auto_init_if_empty(dst_info_to_validate, *src->clone());
+
+ const ClComponentActivation::Attributes act_info{ActivationLayerInfo::ActivationFunction::LOGISTIC};
+
+ // Check components
+ if (context.gpu_language() == GpuLanguage::OpenCL)
+ {
+ // Validate Activation Component
+ const auto properties =
+ IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+
+ ArgumentPack<ITensorInfo> arguments;
+ arguments.add_const_tensor(ACL_SRC, src);
+ arguments.add_const_tensor(ACL_DST, dst_info_to_validate_ptr);
+ ARM_COMPUTE_RETURN_ON_ERROR(ClComponentActivation::validate(properties, arguments, act_info));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_MSG("Unimplemented Gpu language");
+ }
+ return Status{};
+}
+
+constexpr GpuOperatorType operator_type = GpuOperatorType::Simple;
+} // namespace
+
+Status GpuSigmoid::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src)
+{
+ return is_supported_op_helper(context, src, nullptr);
+}
+
+Status GpuSigmoid::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
+
+ // Check if tensors have valid id, i.e. they are created from a sketch
+ ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id());
+
+ // Refer to GpuConv2d::validate_op() for id-validness of this TensorInfo object
+ TensorInfo dst_info_to_validate;
+
+ // Auto initialize dst tensor info
+ auto_init_if_empty(dst_info_to_validate, *src->clone());
+
+ // Perform fusion test to check if the operator meets fusion constraints
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC, src);
+ tensors.add_const_tensor(ACL_DST, &dst_info_to_validate);
+ const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!sketch.implementation().operator_group().try_add_operator(op),
+ "Operator fusion test failed. This operator cannot be fused into the workload");
+
+ // Check if configuration is supported
+ return is_supported_op_helper(*sketch.gpu_context(), src, &dst_info_to_validate);
+}
+
+ITensorInfo *GpuSigmoid::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src);
+ ARM_COMPUTE_LOG_PARAMS(src);
+ ARM_COMPUTE_ERROR_THROW_ON(GpuSigmoid::validate_op(sketch, src));
+
+ ITensorInfo *dst = sketch.implementation().create_virtual_tensor();
+ ARM_COMPUTE_ERROR_ON_NULLPTR(dst);
+
+ // Auto initialize dst tensor
+ auto_init_if_empty(*dst, *src->clone());
+
+ // Translate into components and add to component graph
+ GpuKernelComponentGraph &comp_graph = sketch.implementation().component_graph();
+
+ const ClComponentActivation::Attributes act_info{ActivationLayerInfo::ActivationFunction::LOGISTIC};
+
+ const auto *const sketch_ctx = sketch.implementation().context();
+
+ if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+ {
+ // Add Activation Component
+ auto properties = IGpuKernelComponent::Properties();
+ properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+
+ ArgumentPack<ITensorInfo> arguments;
+ arguments.add_const_tensor(ACL_SRC, src);
+ arguments.add_const_tensor(ACL_DST, dst);
+ comp_graph.add_new_component<ClComponentActivation>(properties, arguments, act_info);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unimplemented Gpu language");
+ }
+
+ // Set up fusion test by adding to the Operator Group
+ // Note this has to be performed after all the components have been successfully added to the component graph
+
+ // Pack tensor infos
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC, src);
+ tensors.add_const_tensor(ACL_DST, dst);
+
+ const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+ sketch.implementation().operator_group().add_operator(op);
+
+ return dst;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.cpp
new file mode 100644
index 0000000000..d385752201
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.cpp
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.h"
+
+#include "arm_compute/core/Error.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuOperatorProperties.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+GpuOperatorType operator_type = GpuOperatorType::Unfusable;
+} // namespace
+
+Status GpuSoftmax::is_supported_op(const GpuWorkloadContext &context,
+ const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const Attributes &attributes)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ TensorInfo dst_info_to_validate;
+
+ // Auto initialize dst tensor info
+ if (dst != nullptr)
+ {
+ dst_info_to_validate = *dst;
+ }
+ else
+ {
+ auto_init_if_empty(dst_info_to_validate, *src->clone());
+ }
+ // Check components
+ if (context.gpu_language() == GpuLanguage::OpenCL)
+ {
+ const auto cl_compile_ctx = context.cl_compile_context();
+ ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr);
+ const KernelProperties properties =
+ IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+
+ TensorShape logits_sum_shape = src->tensor_shape();
+ TensorInfo logits(src->clone()->set_tensor_shape(logits_sum_shape));
+
+ // The sum tensor only needs one element along dim0
+ logits_sum_shape.set(0, 1);
+ TensorInfo sum(src->clone()->set_tensor_shape(logits_sum_shape));
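+
+ // The softmax is intended to be lowered into two components: one producing the exponentiated
+ // logits and their per-row sum, and one normalising the logits by that sum, hence the two
+ // argument packs below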
+
+ // Validate Component
+ ArgumentPack<ITensorInfo> arguments_exp_sum;
+ ArgumentPack<ITensorInfo> arguments_norm;
+
+ arguments_exp_sum.add_const_tensor(ACL_SRC_0, src);
+ arguments_exp_sum.add_const_tensor(ACL_DST_0, &sum);
+ arguments_exp_sum.add_const_tensor(ACL_DST_1, &logits);
+
+ arguments_norm.add_const_tensor(ACL_SRC_0, &logits);
+ arguments_norm.add_const_tensor(ACL_SRC_1, &sum);
+ arguments_norm.add_const_tensor(ACL_DST_0, &dst_info_to_validate);
+
+ ARM_COMPUTE_UNUSED(properties, attributes);
+ return Status(ErrorCode::RUNTIME_ERROR, "GpuSoftmax is not implemented");
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_MSG("Unimplemented Gpu language");
+ }
+
+ return Status{ErrorCode::RUNTIME_ERROR, "GpuSoftmax is not supported"};
+}
+
+Status GpuSoftmax::validate_op(const GpuWorkloadSketch &sketch,
+ const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const Attributes &attributes)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id() || !dst->has_valid_id());
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->num_dimensions() > 4, "Only up to 4 dimensions are supported");
+ ARM_COMPUTE_RETURN_ERROR_ON(attributes.axis() < static_cast<int32_t>(-src->num_dimensions()) ||
+ static_cast<int32_t>(src->num_dimensions()) <= attributes.axis());
+
+ // Auto initialize dst tensor info
+ TensorInfo dst_info_to_validate = *dst;
+ auto_init_if_empty(dst_info_to_validate, *src->clone());
+
+ const size_t actual_axis =
+ static_cast<size_t>(wrap_around(attributes.axis(), static_cast<int32_t>(src->num_dimensions())));
+ const bool needs_permute = actual_axis != 0;
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(needs_permute, "Dynamic fusion softmax on axis!=0 not supported yet.");
+
+ // Perform fusion test and check if the operator meets the fusion constraints
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC_0, src);
+ tensors.add_const_tensor(ACL_DST_0, &dst_info_to_validate);
+
+ const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!sketch.implementation().operator_group().try_add_operator(op),
+ "Operator fusion test failed. This operator cannot be fused into the workload");
+
+ // Check if configuration is supported
+ return is_supported_op(*sketch.gpu_context(), src, &dst_info_to_validate, attributes);
+}
+
+void GpuSoftmax::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, ITensorInfo *dst, const Attributes &attributes)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_LOG_PARAMS(src, dst, attributes);
+ TensorShape logits_sum_shape = src->tensor_shape();
+ ITensorInfo *logits = sketch.implementation().create_auxiliary_tensor(
+ src->clone()->set_id(ITensorInfo::invalid_tensor_id).set_tensor_shape(logits_sum_shape));
+ logits_sum_shape.set(0, 1);
+ ITensorInfo *sum = sketch.implementation().create_auxiliary_tensor(
+ src->clone()->set_id(ITensorInfo::invalid_tensor_id).set_tensor_shape(logits_sum_shape));
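+ // The cloned infos are given the invalid id so that the sketch assigns fresh ids to these
+ // auxiliary tensors (assumed behaviour of create_auxiliary_tensor())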
+
+ // Auto initialize dst tensor info and the auxiliary tensor infos as well
+ auto_init_if_empty(*dst, *src->clone());
+
+ // Assert validation
+ ARM_COMPUTE_ERROR_THROW_ON(GpuSoftmax::validate_op(sketch, src, dst, attributes));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(logits, sum);
+
+ // Translate into components and add to component graph
+ auto &comp_graph = sketch.implementation().component_graph();
+ const auto sketch_ctx = sketch.implementation().context();
+
+ if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+ {
+ const auto cl_compile_ctx = sketch_ctx->cl_compile_context();
+ ARM_COMPUTE_UNUSED(cl_compile_ctx);
+ ARM_COMPUTE_ERROR_ON(cl_compile_ctx == nullptr);
+
+ // Add Softmax Components
+ {
+ auto properties = IGpuKernelComponent::Properties();
+ properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+
+ ArgumentPack<ITensorInfo> arguments_exp_sum;
+ ArgumentPack<ITensorInfo> arguments_norm;
+
+ arguments_exp_sum.add_const_tensor(ACL_SRC_0, src);
+ arguments_exp_sum.add_const_tensor(ACL_DST_0, sum);
+ arguments_exp_sum.add_const_tensor(ACL_DST_1, logits);
+
+ arguments_norm.add_const_tensor(ACL_SRC_0, logits);
+ arguments_norm.add_const_tensor(ACL_SRC_1, sum);
+ arguments_norm.add_const_tensor(ACL_DST_0, dst);
+
+ // Add to component graph -- NOT IMPLEMENTED
+ ARM_COMPUTE_UNUSED(comp_graph, attributes);
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unimplemented Gpu language");
+ }
+
+ // Set up fusion test by adding to the Operator Group
+ // Note this has to be performed after all the components have been successfully added to the component graph
+
+ // Pack tensor infos
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC_0, src);
+ tensors.add_const_tensor(ACL_DST_0, dst);
+
+ const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+ sketch.implementation().operator_group().add_operator(op);
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuSub.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuSub.cpp
new file mode 100644
index 0000000000..c53453a15c
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuSub.cpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSub.h"
+
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+
+#include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+Status GpuSub::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *lhs, const ITensorInfo *rhs)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->data_type() != rhs->data_type(), "Input tensors must be the same data type");
+
+ // Set the elementwise operation to Sub then call the elementwise common validate_op
+ ElementwiseBinaryCommonAttributes common_attributes{};
+ common_attributes.operation(ElementwiseBinaryCommonAttributes::ElementwiseOp::Sub);
+ return GpuElementwiseBinaryCommon::validate_op(sketch, lhs, rhs, common_attributes);
+}
+
+Status GpuSub::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *lhs, const ITensorInfo *rhs)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->data_type() != rhs->data_type(), "Input tensors must be the same data type");
+
+ // Set the elementwise operation to Sub then call the elementwise common is_supported_op
+ ElementwiseBinaryCommonAttributes common_attributes{};
+ common_attributes.operation(ElementwiseBinaryCommonAttributes::ElementwiseOp::Sub);
+ return GpuElementwiseBinaryCommon::is_supported_op(context, lhs, rhs, common_attributes);
+}
+
+ITensorInfo *GpuSub::create_op(GpuWorkloadSketch &sketch, ITensorInfo *lhs, ITensorInfo *rhs)
+{
+ // No need to log or validate as they'll be handled inside GpuElementwiseBinaryCommon::create_op()
+ // Set the elementwise operation to Sub then call the elementwise common create_op
+ ElementwiseBinaryCommonAttributes common_attributes{};
+ common_attributes.operation(ElementwiseBinaryCommonAttributes::ElementwiseOp::Sub);
+ return GpuElementwiseBinaryCommon::create_op(sketch, lhs, rhs, common_attributes);
+}
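+
+// Design note: the arithmetic front-ends (e.g. GpuAdd, GpuSub) are thin wrappers that select the
+// corresponding ElementwiseBinaryCommonAttributes::ElementwiseOp and delegate validation, support
+// checks and operator creation to GpuElementwiseBinaryCommon.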
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuTanh.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuTanh.cpp
new file mode 100644
index 0000000000..b9d01966b3
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuTanh.cpp
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuTanh.h"
+
+#include "arm_compute/core/experimental/Types.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+Status is_supported_op_helper(const GpuWorkloadContext &context, const ITensorInfo *src, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+
+ TensorInfo dst_info_to_validate;
+ const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate;
+
+ if (dst != nullptr)
+ {
+ dst_info_to_validate_ptr = dst;
+ }
+
+ auto_init_if_empty(dst_info_to_validate, *src->clone());
+
+ const ClComponentActivation::Attributes act_info{ActivationLayerInfo::ActivationFunction::TANH};
+
+ // Check components
+ if (context.gpu_language() == GpuLanguage::OpenCL)
+ {
+ // Validate Activation Component
+ const auto properties =
+ IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+
+ ArgumentPack<ITensorInfo> arguments;
+ arguments.add_const_tensor(ACL_SRC, src);
+ arguments.add_const_tensor(ACL_DST, dst_info_to_validate_ptr);
+ ARM_COMPUTE_RETURN_ON_ERROR(ClComponentActivation::validate(properties, arguments, act_info));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_MSG("Unimplemented Gpu language");
+ }
+ return Status{};
+}
+
+constexpr GpuOperatorType operator_type = GpuOperatorType::Simple;
+} // namespace
+
+Status GpuTanh::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src)
+{
+ return is_supported_op_helper(context, src, nullptr);
+}
+
+Status GpuTanh::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
+
+ // Check if tensors have valid id, i.e. they are created from a sketch
+ ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id());
+
+ // Refer to GpuConv2d::validate_op() for id-validness of this TensorInfo object
+ TensorInfo dst_info_to_validate;
+
+ // Auto initialize dst tensor info
+ auto_init_if_empty(dst_info_to_validate, *src->clone());
+
+ // Perform fusion test to check if the operator meets fusion constraints
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC, src);
+ tensors.add_const_tensor(ACL_DST, &dst_info_to_validate);
+ const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!sketch.implementation().operator_group().try_add_operator(op),
+ "Operator fusion test failed. This operator cannot be fused into the workload");
+
+ // Check if configuration is supported
+ return is_supported_op_helper(*sketch.gpu_context(), src, &dst_info_to_validate);
+}
+
+ITensorInfo *GpuTanh::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src);
+ ARM_COMPUTE_LOG_PARAMS(src);
+ ARM_COMPUTE_ERROR_THROW_ON(GpuTanh::validate_op(sketch, src));
+
+ ITensorInfo *dst = sketch.implementation().create_virtual_tensor();
+ ARM_COMPUTE_ERROR_ON_NULLPTR(dst);
+
+ // Auto initialize dst tensor
+ auto_init_if_empty(*dst, *src->clone());
+
+ // Translate into components and add to component graph
+ GpuKernelComponentGraph &comp_graph = sketch.implementation().component_graph();
+
+ const ClComponentActivation::Attributes act_info{ActivationLayerInfo::ActivationFunction::TANH};
+
+ const auto *const sketch_ctx = sketch.implementation().context();
+
+ if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+ {
+ // Add Activation Component
+ auto properties = IGpuKernelComponent::Properties();
+ properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+
+ ArgumentPack<ITensorInfo> arguments;
+ arguments.add_const_tensor(ACL_SRC, src);
+ arguments.add_const_tensor(ACL_DST, dst);
+ comp_graph.add_new_component<ClComponentActivation>(properties, arguments, act_info);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unimplemented Gpu language");
+ }
+
+ // Set up fusion test by adding to the Operator Group
+ // Note this has to be performed after all the components have been successfully added to the component graph
+
+ // Pack tensor infos
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC, src);
+ tensors.add_const_tensor(ACL_DST, dst);
+
+ const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+ sketch.implementation().operator_group().add_operator(op);
+
+ return dst;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.cpp b/src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.cpp
new file mode 100644
index 0000000000..d79a4c42c9
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.cpp
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+void calculate_and_init_dst_if_empty(ITensorInfo *dst, const ITensorInfo *lhs, const ITensorInfo *rhs)
+{
+ if (dst->total_size() == 0U)
+ {
+ const std::pair<TensorShape, ValidRegion> broadcast_pair =
+ ITensorInfo::broadcast_shape_and_valid_region(*lhs, *rhs);
+ auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(broadcast_pair.first));
+ }
+}
+
+Status is_supported_op_helper(const GpuWorkloadContext &context,
+ const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *dst,
+ const ElementwiseBinaryCommonAttributes &attributes)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs);
+
+ TensorInfo dst_info_to_validate;
+ const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate;
+
+ if (dst != nullptr)
+ {
+ dst_info_to_validate_ptr = dst;
+ }
+
+ calculate_and_init_dst_if_empty(&dst_info_to_validate, lhs, rhs);
+
+ // Check components
+ if (context.gpu_language() == GpuLanguage::OpenCL)
+ {
+ const auto cl_compile_ctx = context.cl_compile_context();
+ ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr);
+
+ // Validate ElementwiseBinary Component
+ {
+ ArgumentPack<ITensorInfo> arguments;
+ arguments.add_const_tensor(ACL_SRC_0, lhs);
+ arguments.add_const_tensor(ACL_SRC_1, rhs);
+ arguments.add_const_tensor(ACL_DST_0, dst_info_to_validate_ptr);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(ClComponentElementwiseBinary::validate(arguments, attributes));
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_MSG("Unimplemented Gpu language");
+ }
+
+ return Status{};
+}
+
+GpuOperatorType operator_type = GpuOperatorType::Simple;
+} // namespace
+
+ElementwiseBinaryCommonAttributes &
+ElementwiseBinaryCommonAttributes::operation(const ElementwiseBinaryCommonAttributes::ElementwiseOp &operation)
+{
+ _operation = operation;
+ return *this;
+}
+
+ElementwiseBinaryCommonAttributes::ElementwiseOp ElementwiseBinaryCommonAttributes::operation() const
+{
+ return _operation;
+}
+
+Status GpuElementwiseBinaryCommon::is_supported_op(const GpuWorkloadContext &context,
+ const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ElementwiseBinaryCommonAttributes &attributes)
+{
+ return is_supported_op_helper(context, lhs, rhs, nullptr, attributes);
+}
+
+Status GpuElementwiseBinaryCommon::validate_op(const GpuWorkloadSketch &sketch,
+ const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ElementwiseBinaryCommonAttributes &attributes)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs);
+ ARM_COMPUTE_RETURN_ERROR_ON(!lhs->has_valid_id() || !rhs->has_valid_id());
+
+ // Refer to GpuConv2d::validate_op() for id-validness of this TensorInfo object
+ TensorInfo dst_info_to_validate;
+
+ // Auto initialize dst tensor info
+ calculate_and_init_dst_if_empty(&dst_info_to_validate, lhs, rhs);
+
+ // Perform fusion test
+ // Pack tensor infos
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC_0, lhs);
+ tensors.add_const_tensor(ACL_SRC_1, rhs);
+ tensors.add_const_tensor(ACL_DST_0, &dst_info_to_validate);
+ const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!sketch.implementation().operator_group().try_add_operator(op),
+ "Operator fusion test failed. This operator cannot be fused into the workload");
+
+ // Check if configuration is supported
+ return is_supported_op_helper(*sketch.gpu_context(), lhs, rhs, &dst_info_to_validate, attributes);
+}
+
+ITensorInfo *GpuElementwiseBinaryCommon::create_op(GpuWorkloadSketch &sketch,
+ ITensorInfo *lhs,
+ ITensorInfo *rhs,
+ const ElementwiseBinaryCommonAttributes &attributes)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs);
+ ARM_COMPUTE_LOG_PARAMS(lhs, rhs);
+ ARM_COMPUTE_ERROR_THROW_ON(GpuElementwiseBinaryCommon::validate_op(sketch, lhs, rhs, attributes));
+
+ ITensorInfo *dst = sketch.implementation().create_virtual_tensor();
+ ARM_COMPUTE_ERROR_ON_NULLPTR(dst);
+
+ // Auto initialize dst tensor
+ calculate_and_init_dst_if_empty(dst, lhs, rhs);
+
+ // Translate into components and add to component graph
+ auto &comp_graph = sketch.implementation().component_graph();
+
+ const auto sketch_ctx = sketch.implementation().context();
+
+ if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR(sketch_ctx->cl_compile_context());
+
+ // Add ElementwiseBinary Component
+ {
+ auto properties = IGpuKernelComponent::Properties();
+ properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+
+ ArgumentPack<ITensorInfo> arguments;
+ arguments.add_const_tensor(ACL_SRC_0, lhs);
+ arguments.add_const_tensor(ACL_SRC_1, rhs);
+ arguments.add_const_tensor(ACL_DST_0, dst);
+ comp_graph.add_new_component<ClComponentElementwiseBinary>(properties, arguments, attributes);
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unimplemented Gpu language");
+ }
+
+ // Set up fusion test by adding to the Operator Group
+ // Note this has to be performed after all the components have been successfully added to the component graph
+
+ // Pack tensor infos
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC_0, lhs);
+ tensors.add_const_tensor(ACL_SRC_1, rhs);
+ tensors.add_tensor(ACL_DST_0, dst);
+ const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+ sketch.implementation().operator_group().add_operator(op);
+
+ return dst;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h b/src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h
new file mode 100644
index 0000000000..0b58b6eb96
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_INTERNAL_GPUELEMENTWISEBINARYCOMMON
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_INTERNAL_GPUELEMENTWISEBINARYCOMMON
+
+#include "arm_compute/core/Error.h"
+
+namespace arm_compute
+{
+/** Forward declaration */
+class ITensorInfo;
+
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class ElementwiseBinaryCommonAttributes
+{
+public:
+ enum class ElementwiseOp
+ {
+ Add, /**< (x + y) */
+ Sub, /**< (x - y) */
+ Div, /**< (x / y) */
+ Mul, /**< (x * y) */
+ Min, /**< Min(x, y) */
+ Max, /**< Max(x, y) */
+ SquaredDiff, /**< (x - y)^2 */
+ Power, /**< x ^ y */
+ Prelu, /**< y*x if x < 0, x otherwise */
+ };
+ /** Set operation*/
+ ElementwiseBinaryCommonAttributes &operation(const ElementwiseBinaryCommonAttributes::ElementwiseOp &operation);
+ /** Get operation*/
+ ElementwiseOp operation() const;
+
+private:
+ ElementwiseOp _operation; /**< Elementwise operation */
+};
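+
+/* Example (as used by GpuSub::create_op() above):
+ *
+ *   ElementwiseBinaryCommonAttributes attributes{};
+ *   attributes.operation(ElementwiseBinaryCommonAttributes::ElementwiseOp::Sub);
+ */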
+
+/** Forward declaration */
+class GpuWorkloadContext;
+class GpuWorkloadSketch;
+
+/** Operator interface. */
+class GpuElementwiseBinaryCommon final
+{
+public:
+ /** Create an operator and fuse it into the workload sketch.
+ * @note If @ref validate_op() fails, the creation also fails and may throw an error.
+ * @note If @ref validate_op() fails, @p sketch remains unchanged and valid.
+ *
+ * Valid data type configurations are checked at the operator level, e.g. GpuAdd::validate_op(), GpuSub::validate_op(), etc.
+ *
+ * Valid data layouts:
+ * - Any
+ *
+ * @param[in,out] sketch Workload sketch into which the operator will be fused
+ * @param[in] lhs Left hand side tensor info. Data types supported: U8/S16/S32/F16/F32.
+ * @param[in] rhs Right hand side tensor info. Data types supported: U8/S16/S32/F16/F32.
+ * @param[in] attributes ElementwiseBinaryCommonAttributes containing the operator type: ADD, SUB, DIV, ... etc.
+ *
+ * @return Pointer for the destination tensor info
+ */
+ static ITensorInfo *create_op(GpuWorkloadSketch &sketch,
+ ITensorInfo *lhs,
+ ITensorInfo *rhs,
+ const ElementwiseBinaryCommonAttributes &attributes);
+ /** Check if the operator configuration is supported, irrespective of fusion
+ *
+ * @param[in] context Workload context within which the operator is running
+ * @param[in] lhs Left hand side tensor info. Data types supported: U8/S16/S32/F16/F32.
+ * @param[in] rhs Right hand side tensor info. Data types supported: U8/S16/S32/F16/F32.
+ * @param[in] attributes ElementwiseBinaryCommonAttributes containing the operator type: ADD, SUB, DIV, ... etc.
+ *
+ * @return Status
+ */
+ static Status is_supported_op(const GpuWorkloadContext &context,
+ const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ElementwiseBinaryCommonAttributes &attributes);
+ /** Validate the operator and check if it can be fused into the workload sketch.
+ *
+ * Parameters are similar to @ref GpuElementwiseBinaryCommon::create_op()
+ *
+ * @return Status
+ */
+ static Status validate_op(const GpuWorkloadSketch &sketch,
+ const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ElementwiseBinaryCommonAttributes &attributes);
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_INTERNAL_GPUELEMENTWISEBINARYCOMMON */
diff --git a/src/dynamic_fusion/sketch/utils/DependencyGraph.h b/src/dynamic_fusion/sketch/utils/DependencyGraph.h
new file mode 100644
index 0000000000..c157c2b21c
--- /dev/null
+++ b/src/dynamic_fusion/sketch/utils/DependencyGraph.h
@@ -0,0 +1,648 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_UTILS_DEPENDENCYGRAPH
+#define SRC_DYNAMIC_FUSION_SKETCH_UTILS_DEPENDENCYGRAPH
+
+#include "arm_compute/core/Error.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <map>
+#include <set>
+#include <tuple>
+#include <vector>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+template <typename T>
+bool is_in(const T &v, const std::vector<T> &vec)
+{
+ return std::find(std::begin(vec), std::end(vec), v) != std::end(vec);
+}
+} // namespace
+
+/** A multi-input (tensors), multi-output (tensors) acyclic directed graph
+ * Represented as a doubly-linked adjacency list with the differentiation between source and destination
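+ *
+ * For example, a single operator op0 reading tensor t0 and writing tensor t1 is stored as:
+ *   _adj_src_tensors[op0] = {t0}, _adj_dst_tensors[op0] = {t1},
+ *   _adj_dst_ops[t0] = {op0}, _adj_src_ops[t1] = {op0}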
+ */
+class DependencyGraph
+{
+public:
+ using Id = int32_t;
+ using TensorId = Id;
+ using OperatorId = Id;
+ /** Adjacency list
+ *
+ */
+ using AdjList = std::map<Id, std::vector<Id>>;
+
+ /** A pack consisting of an operator and its input and output tensors, used when traversing the graph in topological order
+ *
+ */
+ struct OpPack
+ {
+ OperatorId op{};
+ std::vector<TensorId> inputs{};
+ std::vector<TensorId> outputs{};
+ friend bool operator==(const OpPack &opp0, const OpPack &opp1)
+ {
+ return std::make_tuple(opp0.op, opp0.inputs, opp0.outputs) ==
+ std::make_tuple(opp1.op, opp1.inputs, opp1.outputs);
+ }
+ };
+
+public:
+ DependencyGraph() = default;
+ friend std::ostream &operator<<(std::ostream &os, const DependencyGraph &);
+
+ /** Try adding an operator (without actually adding it), while keeping the graph as a "linear sequence" / list
+ *
+ * Rule: If the new operator is not the first operator, at least one input tensor must be
+ * the output tensor of the last non-output operator. All other input tensors must be
+ * global inputs of the graph (i.e. not the output of any operator).
+ *
+ * Rule: The output tensor of the new operator must not be the input tensor of any previously
+ * added operator.
+ *
+ * PRECONDITION: The current graph is already linear
+ *
+ * @return true If the operator can be added while keeping the graph as a linear sequence
+ * @return false Otherwise
+ */
+ bool try_add_operator_as_linear(OperatorId op,
+ const std::vector<TensorId> &inputs,
+ const std::vector<TensorId> &outputs,
+ bool is_output = false) const
+ {
+ ARM_COMPUTE_UNUSED(op, is_output);
+ if (all_ops().empty())
+ {
+ return true;
+ }
+
+ // If the new operator is not the first operator, at least one input tensor must be
+ // the output tensor of the last non-output operator. All other input tensors must be
+ // the global input of the graph (i.e. not the output of any operator).
+ if (_last_op_available)
+ {
+ auto use_input_from_last_op = false;
+
+ for (auto src_tensor : inputs)
+ {
+ const auto src_ops = _adj_src_ops.find(src_tensor);
+
+ if (src_ops != _adj_src_ops.end())
+ {
+ ARM_COMPUTE_ERROR_ON(src_ops->second.size() > 1);
+
+ if (!src_ops->second.empty())
+ {
+ const auto src_op = src_ops->second[0];
+
+ if (src_op == _last_op)
+ {
+ if (use_input_from_last_op)
+ {
+ // To be safe, we also forbid using the output tensor
+ // of the last operator twice.
+ return false;
+ }
+
+ use_input_from_last_op = true;
+ }
+ else
+ {
+ // The input tensor of this operator must not be the output tensor
+ // of any other operator except the last non-output operator.
+ return false;
+ }
+ }
+ }
+ }
+
+ if (!use_input_from_last_op)
+ {
+ // At least one input tensor must be the output tensor of the last non-output operator.
+ return false;
+ }
+ }
+
+ // The output tensor of the new operator must not be the input tensor of any previously
+ // added operator.
+ for (auto dst_tensor : outputs)
+ {
+ if (_adj_dst_ops.find(dst_tensor) != _adj_dst_ops.end())
+ {
+ return false;
+ }
+ }
+
+ return true;
+ }
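+ /* Example: given the linear graph t0 -> op0 -> t1, adding op1 with inputs {t1, t2} (where t2 is a
+ * new global input) and a fresh output t3 keeps the graph linear, whereas an operator that reads
+ * only t0, or one that writes to an already existing tensor such as t0 or t1, is rejected. */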
+ /** Add an operator, while keeping the graph as a "linear sequence"
+ *
+ * PRECONDITION: The current graph is already linear
+ * INVARIANT: The list can only grow from head to tail
+ * POSTCONDITION: The graph remains linear
+ */
+ void add_operator_as_linear(OperatorId op,
+ const std::vector<TensorId> &inputs,
+ const std::vector<TensorId> &outputs,
+ bool is_output = false)
+ {
+ const auto success = add_operator(op, inputs, outputs, is_output);
+ ARM_COMPUTE_UNUSED(success);
+ ARM_COMPUTE_ERROR_ON(!success);
+ }
+ /** Add a new operator
+ * Returns false if adding the operator would violate the DAG invariant;
+ * an invalid operation does not change the graph
+ *
+ * @param[in] op Operator to add
+ * @param[in] inputs Input tensors to the operator
+ * @param[in] outputs Output tensors to the operator
+ * @param[in] is_output Whether this is an output operator
+ */
+ bool add_operator(OperatorId op,
+ const std::vector<TensorId> &inputs,
+ const std::vector<TensorId> &outputs,
+ bool is_output = false)
+ {
+ if (operator_exists(op))
+ {
+ return false;
+ }
+ _adj_src_tensors[op] = {};
+ _adj_dst_tensors[op] = {};
+ for (auto in_tensor : inputs)
+ {
+ // Linking input tensor to operator node will never create a cycle / loop because we guarantee
+ // each op is newly created, so every <input, op> pair / edge is new
+ link_input(op, in_tensor);
+ }
+ for (auto out_tensor : outputs)
+ {
+ // If there exists a back path from op's output tensor to op already, then linking the two will create a loop / cycle
+ if (path_exists_from_tensor_to_op(out_tensor, op))
+ {
+ remove_operator(op);
+ return false;
+ }
+ else
+ {
+ link_output(op, out_tensor);
+ }
+ }
+
+ if (!is_output)
+ {
+ _last_op_available = true;
+ _last_op = op;
+ }
+
+ return true;
+ }
+
+ /** Build a sequence of operators from the acyclic graph of operators.
+ *
+ * The graph is visited using a depth-first strategy. An operator can only be added to
+ * the sequence once all the operators that supply its input tensors have been added; otherwise
+ * it is skipped and visited again later. In other words, the dependencies between operators
+ * are preserved in the sequence.
+ */
+ std::vector<OpPack> build_operators_sequence() const
+ {
+ std::vector<OpPack> ops_seq;
+ std::set<Id> done_ops;
+ std::set<Id> done_tensors;
+
+ const auto input_tensors = global_src_tensors();
+
+ for (auto tensor : input_tensors)
+ {
+ done_tensors.insert(tensor);
+
+ for (auto op : _adj_dst_ops.at(tensor))
+ {
+ build_operators_sequence_from_op(op, ops_seq, done_ops, done_tensors);
+ }
+ }
+
+ return ops_seq;
+ }
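+
+ /* Example: for the linear graph t0 -> op0 -> t1 -> op1 -> t2, the resulting sequence is
+ * { {op0, {t0}, {t1}}, {op1, {t1}, {t2}} }. */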
+
+ /** Strict equality comparison (all internal ids and order of insertion matter).
+ * In the future this may be replaced with a topological comparison, allowing equivalent graphs with different internal ids to be equal
+ *
+ *
+ * @param[in] g0
+ * @param[in] g1
+ * @return true If the same
+ * @return false Otherwise
+ */
+ friend bool operator==(const DependencyGraph &g0, const DependencyGraph &g1)
+ {
+ // Do not compare id allocators
+ return std::make_tuple(g0._adj_src_tensors, g0._adj_dst_tensors, g0._adj_src_ops, g0._adj_dst_ops) ==
+ std::make_tuple(g1._adj_src_tensors, g1._adj_dst_tensors, g1._adj_src_ops, g1._adj_dst_ops);
+ }
+ std::vector<OperatorId> src_ops_from_tensor(TensorId tensor) const
+ {
+ return _adj_src_ops.at(tensor);
+ }
+ std::vector<OperatorId> dst_ops_from_tensor(TensorId tensor) const
+ {
+ return _adj_dst_ops.at(tensor);
+ }
+ /** Get all tensors
+ *
+ * @return std::vector<TensorId>
+ */
+ std::vector<TensorId> all_tensors() const
+ {
+ std::vector<TensorId> tensors{};
+ std::transform(std::begin(_adj_src_ops), std::end(_adj_src_ops), std::back_inserter(tensors),
+ [](const auto &it) { return it.first; });
+ return tensors;
+ }
+ /** Get source tensors of the whole graph
+ *
+ * @return std::vector<TensorId>
+ */
+ std::vector<TensorId> global_src_tensors() const
+ {
+ std::vector<TensorId> tensors;
+ for (auto tensor_src_ops : _adj_src_ops)
+ {
+ if (tensor_src_ops.second.empty())
+ {
+ tensors.push_back(tensor_src_ops.first);
+ }
+ }
+ return tensors;
+ }
+ /** Get destination tensors of the whole graph
+ *
+ * @return std::vector<TensorId>
+ */
+ std::vector<TensorId> global_dst_tensors() const
+ {
+ std::vector<TensorId> tensors;
+ for (auto tensor_dst_ops : _adj_dst_ops)
+ {
+ if (tensor_dst_ops.second.empty())
+ {
+ tensors.push_back(tensor_dst_ops.first);
+ }
+ }
+ return tensors;
+ }
+ /** Get intermediate tensors of the whole graph.
+ *
+ * @return std::vector<TensorId>
+ */
+ std::vector<TensorId> intermediate_tensors() const
+ {
+ std::vector<TensorId> tensors;
+
+ // A tensor that connects the output of one operator to the input of another is not allocated
+ // in memory; it exists only as a temporary variable
+ for (auto src_tensor : _adj_src_ops)
+ {
+ if (!src_tensor.second.empty())
+ {
+ const auto dst_tensor = _adj_dst_ops.find(src_tensor.first);
+ if (dst_tensor != _adj_dst_ops.end())
+ {
+ if (!dst_tensor->second.empty())
+ {
+ tensors.push_back(src_tensor.first);
+ }
+ }
+ }
+ }
+
+ return tensors;
+ }
+ /** Get all root ops. Root ops can also be referred to as "src ops" of the whole graph
+ *
+ * @return std::vector<OperatorId>
+ */
+ std::vector<OperatorId> get_root_ops() const
+ {
+ std::vector<OperatorId> ops{};
+ const auto op_list = all_ops();
+
+ for (auto op : op_list)
+ {
+ if (src_ops(op).empty())
+ {
+ ops.emplace_back(op);
+ }
+ }
+ return ops;
+ }
+
+private:
+ void link_input(OperatorId op, TensorId in_tensor)
+ {
+ ARM_COMPUTE_ERROR_ON(!operator_exists(op));
+ if (!tensor_exists(in_tensor))
+ {
+ insert_new_tensor(in_tensor);
+ }
+ ARM_COMPUTE_ERROR_ON(are_connected(op, in_tensor)); // Prevent repetitive linking
+ _adj_src_tensors[op].push_back(in_tensor);
+ _adj_dst_ops[in_tensor].push_back(op);
+ }
+ void link_output(OperatorId op, TensorId out_tensor)
+ {
+ ARM_COMPUTE_ERROR_ON(!operator_exists(op));
+ if (!tensor_exists(out_tensor))
+ {
+ insert_new_tensor(out_tensor);
+ }
+ ARM_COMPUTE_ERROR_ON(are_connected(op, out_tensor)); // Prevent repetitive linking
+ _adj_dst_tensors[op].push_back(out_tensor);
+ _adj_src_ops[out_tensor].push_back(op);
+ }
+
+ std::vector<OperatorId> src_ops(OperatorId op) const
+ {
+ ARM_COMPUTE_ERROR_ON(!operator_exists(op));
+ std::vector<OperatorId> ops{};
+ for (TensorId src_tensor : src_tensors(op))
+ {
+ ops.insert(ops.end(), std::begin(_adj_src_ops.at(src_tensor)), std::end(_adj_src_ops.at(src_tensor)));
+ }
+ return ops;
+ }
+ std::vector<OperatorId> dst_ops(OperatorId op) const
+ {
+ ARM_COMPUTE_ERROR_ON(!operator_exists(op));
+ std::vector<OperatorId> ops{};
+ for (TensorId dst_tensor : _adj_dst_tensors.at(op))
+ {
+ ops.insert(ops.end(), std::begin(_adj_dst_ops.at(dst_tensor)), std::end(_adj_dst_ops.at(dst_tensor)));
+ }
+ return ops;
+ }
+
+ /** Get source tensors to an operator
+ *
+ * @param[in] op
+ * @return std::vector<TensorId>
+ */
+ std::vector<TensorId> src_tensors(OperatorId op) const
+ {
+ ARM_COMPUTE_ERROR_ON(!operator_exists(op));
+ return _adj_src_tensors.at(op);
+ }
+ /** Get destination tensors to an operator
+ *
+ * @param[in] op
+ * @return std::vector<TensorId>
+ */
+ std::vector<TensorId> dst_tensors(OperatorId op) const
+ {
+ ARM_COMPUTE_ERROR_ON(!operator_exists(op));
+ return _adj_dst_tensors.at(op);
+ }
+ /** Get all operators
+ *
+ * @return std::vector<OperatorId>
+ */
+ std::vector<OperatorId> all_ops() const
+ {
+ std::vector<OperatorId> ops{};
+ std::transform(std::begin(_adj_src_tensors), std::end(_adj_src_tensors), std::back_inserter(ops),
+ [](const auto &it) { return it.first; });
+ return ops;
+ }
+ /** Remove an operator from graph.
+ *
+ * @param[in] op
+ */
+ void remove_operator(OperatorId op)
+ {
+ for (auto src_tensor : _adj_src_tensors.at(op))
+ {
+ auto &dst_ops = _adj_dst_ops.at(src_tensor);
+ dst_ops.erase(std::remove(std::begin(dst_ops), std::end(dst_ops), op), std::end(dst_ops));
+ }
+ for (auto dst_tensor : _adj_dst_tensors.at(op))
+ {
+ auto &src_ops = _adj_src_ops.at(dst_tensor);
+ src_ops.erase(std::remove(std::begin(src_ops), std::end(src_ops), op), std::end(src_ops));
+ }
+ // Remove any isolated tensors
+ // An isolated tensor is one where both its _adj_src_ops and _adj_dst_ops are empty
+ for (auto t : all_tensors())
+ {
+ if (_adj_src_ops.at(t).empty() && _adj_dst_ops.at(t).empty())
+ {
+ _adj_src_ops.erase(t);
+ _adj_dst_ops.erase(t);
+ }
+ }
+ _adj_src_tensors.erase(op);
+ _adj_dst_tensors.erase(op);
+ }
+ void insert_new_tensor(TensorId tensor)
+ {
+ _adj_src_ops[tensor] = {};
+ _adj_dst_ops[tensor] = {};
+ }
+ bool tensor_exists(TensorId tensor) const
+ {
+ return _adj_src_ops.find(tensor) != _adj_src_ops.end() && _adj_dst_ops.find(tensor) != _adj_dst_ops.end();
+ }
+ bool operator_exists(OperatorId op) const
+ {
+ return _adj_src_tensors.find(op) != _adj_src_tensors.end() &&
+ _adj_dst_tensors.find(op) != _adj_dst_tensors.end();
+ }
+ bool is_src_tensor_of(OperatorId op, TensorId tensor) const
+ {
+ if (!operator_exists(op) || !tensor_exists(tensor))
+ {
+ return false;
+ }
+ const auto op_inputs = src_tensors(op);
+ return std::find(op_inputs.begin(), op_inputs.end(), tensor) != op_inputs.end();
+ }
+ bool is_dst_tensor_of(OperatorId op, TensorId tensor) const
+ {
+ if (!operator_exists(op) || !tensor_exists(tensor))
+ {
+ return false;
+ }
+ const auto op_outputs = dst_tensors(op);
+ return std::find(op_outputs.begin(), op_outputs.end(), tensor) != op_outputs.end();
+ }
+ bool are_connected(OperatorId op, TensorId tensor) const
+ {
+ return is_src_tensor_of(op, tensor) || is_dst_tensor_of(op, tensor);
+ }
+    /** Check if an operator is a destination (leaf) operator of the whole graph,
+     * i.e. none of its destination tensors are consumed by another operator
+     *
+     * @param[in] op Operator to check
+     *
+     * @return true if @p op is a destination operator, false otherwise
+     */
+ bool is_dst_op(OperatorId op) const
+ {
+ return dst_ops(op).empty();
+ }
+ std::vector<OperatorId> get_dst_ops() const
+ {
+ std::vector<OperatorId> ops{};
+ const auto op_list = all_ops();
+
+ for (auto op : op_list)
+ {
+ if (is_dst_op(op))
+ {
+ ops.emplace_back(op);
+ }
+ }
+ return ops;
+ }
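+    /** Check if a path exists in the graph from @p src_tensor to @p dst_op */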
+ bool path_exists_from_tensor_to_op(TensorId src_tensor, OperatorId dst_op) const
+ {
+ if (!tensor_exists(src_tensor) || !operator_exists(dst_op))
+ {
+ return false;
+ }
+ for (auto child_op : dst_ops_from_tensor(src_tensor))
+ {
+ if (path_exists_from_op_to_op(child_op, dst_op))
+ {
+ return true;
+ }
+ }
+ return false;
+ }
+
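+    /** Check if a path exists in the graph from @p src_op to @p dst_op (an operator is considered reachable from itself) */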
+ bool path_exists_from_op_to_op(OperatorId src_op, OperatorId dst_op) const
+ {
+ if (!operator_exists(src_op) || !operator_exists(dst_op))
+ {
+ return false;
+ }
+ if (src_op == dst_op)
+ {
+ return true;
+ }
+ if (is_in(src_op, get_dst_ops()))
+ {
+ return false;
+ }
+ for (auto child_tensor : dst_tensors(src_op))
+ {
+ if (path_exists_from_tensor_to_op(child_tensor, dst_op))
+ {
+ return true;
+ }
+ }
+ return false;
+ }
+
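+    /** Append to @p ops_seq the operators reachable from @p op, in a valid execution order
+     *
+     * An operator is appended only once all of its source tensors are in @p done_tensors; otherwise
+     * the visit returns and the operator is revisited later from one of its remaining source operators.
+     * Single-consumer chains are walked iteratively; the traversal recurses at fan-out points.
+     *
+     * @param[in]     op           Operator to start the traversal from
+     * @param[in,out] ops_seq      Sequence of operator packs built so far
+     * @param[in,out] done_ops     Operators already added to the sequence
+     * @param[in,out] done_tensors Tensors produced by the operators already in the sequence
+     */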
+ void build_operators_sequence_from_op(Id op,
+ std::vector<OpPack> &ops_seq,
+ std::set<Id> &done_ops,
+ std::set<Id> &done_tensors) const
+ {
+ while (true)
+ {
+ // If the operator has been added to the sequence, ignore it.
+ if (done_ops.find(op) != done_ops.end())
+ {
+ return;
+ }
+
+            // If not all the input tensors of the operator are available, this operator cannot be
+            // added to the sequence yet. It will be visited again once its remaining source
+            // operators have been added to the sequence.
+ const auto src_tensors = _adj_src_tensors.at(op);
+
+ for (auto src : src_tensors)
+ {
+ if (done_tensors.find(src) == done_tensors.end())
+ {
+ return;
+ }
+ }
+
+ // This operator is ready to be added to the sequence.
+ const auto dst_tensors = _adj_dst_tensors.at(op);
+
+ done_ops.insert(op);
+
+ OpPack pack{op, src_tensors, dst_tensors};
+ ops_seq.push_back(pack);
+
+ done_tensors.insert(dst_tensors.begin(), dst_tensors.end());
+
+            // Visit all the sink operators.
+            // A single destination tensor with a single consumer is followed iteratively by
+            // continuing the loop; any fan-out is handled by recursing into each consumer.
+ if (dst_tensors.size() == 1 && _adj_dst_ops.at(dst_tensors[0]).size() == 1)
+ {
+ op = _adj_dst_ops.at(dst_tensors[0])[0];
+ }
+ else
+ {
+ for (auto dst_tensor : dst_tensors)
+ {
+ const auto dst_ops = _adj_dst_ops.at(dst_tensor);
+
+ for (auto dst_op : dst_ops)
+ {
+ build_operators_sequence_from_op(dst_op, ops_seq, done_ops, done_tensors);
+ }
+ }
+
+ return;
+ }
+ }
+ }
+
+private:
+    AdjList _adj_src_tensors{}; // operator -> its source (input) tensors
+    AdjList _adj_dst_tensors{}; // operator -> its destination (output) tensors
+    AdjList _adj_src_ops{};     // tensor -> operators that write it
+    AdjList _adj_dst_ops{};     // tensor -> operators that read it
+
+ bool _last_op_available{false};
+ OperatorId _last_op{0};
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_UTILS_DEPENDENCYGRAPH */
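The DependencyGraph above keeps connectivity in four adjacency lists (operator to source tensors, operator to destination tensors, tensor to producing operators, tensor to consuming operators) and sequences operators with an "all source tensors available" rule. The standalone sketch below mirrors that layout and rule with hypothetical names and simplified bookkeeping; it is not the DependencyGraph API, only an illustration of the technique under those assumptions.

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <map>
    #include <set>
    #include <vector>

    using Id = int32_t;

    struct MiniGraph
    {
        std::map<Id, std::vector<Id>> op_to_src_tensors{}; // operator -> tensors it reads
        std::map<Id, std::vector<Id>> op_to_dst_tensors{}; // operator -> tensors it writes
        std::map<Id, std::vector<Id>> tensor_to_src_ops{}; // tensor   -> operators writing it
        std::map<Id, std::vector<Id>> tensor_to_dst_ops{}; // tensor   -> operators reading it

        void add_operator(Id op, const std::vector<Id> &inputs, const std::vector<Id> &outputs)
        {
            op_to_src_tensors[op] = inputs;
            op_to_dst_tensors[op] = outputs;
            for (Id t : inputs)
            {
                tensor_to_dst_ops[t].push_back(op);
                tensor_to_src_ops[t]; // ensure the tensor node exists
            }
            for (Id t : outputs)
            {
                tensor_to_src_ops[t].push_back(op);
                tensor_to_dst_ops[t]; // ensure the tensor node exists
            }
        }

        // Emit operators so that each one appears only after the producers of all its inputs,
        // i.e. the same "all source tensors available" rule applied by build_operators_sequence_from_op.
        std::vector<Id> build_sequence() const
        {
            std::vector<Id> seq{};
            std::set<Id>    done_ops{};
            std::set<Id>    done_tensors{};

            // Graph inputs (tensors with no producer) are available from the start.
            for (const auto &t : tensor_to_src_ops)
            {
                if (t.second.empty())
                {
                    done_tensors.insert(t.first);
                }
            }

            // One pass per operator is enough for an acyclic graph; the bound also guarantees
            // termination if a cycle slips in (cyclic operators are simply never emitted).
            const std::size_t n_ops = op_to_src_tensors.size();
            for (std::size_t pass = 0; pass < n_ops && seq.size() < n_ops; ++pass)
            {
                for (const auto &node : op_to_src_tensors)
                {
                    const Id op = node.first;
                    if (done_ops.count(op) != 0)
                    {
                        continue;
                    }
                    const bool ready = std::all_of(node.second.begin(), node.second.end(),
                                                   [&done_tensors](Id t) { return done_tensors.count(t) != 0; });
                    if (!ready)
                    {
                        continue;
                    }
                    done_ops.insert(op);
                    seq.push_back(op);
                    const auto &outs = op_to_dst_tensors.at(op);
                    done_tensors.insert(outs.begin(), outs.end());
                }
            }
            return seq;
        }
    };

Usage, for a graph where operator 0 writes tensor 10 and operators 1 and 2 both read it:

    MiniGraph g{};
    g.add_operator(0, /* inputs */ {}, /* outputs */ {10});
    g.add_operator(1, {10}, {11});
    g.add_operator(2, {10}, {12});
    const auto seq = g.build_sequence(); // -> {0, 1, 2}
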
diff --git a/src/dynamic_fusion/utils/Utils.h b/src/dynamic_fusion/utils/Utils.h
new file mode 100644
index 0000000000..3f4a2edd03
--- /dev/null
+++ b/src/dynamic_fusion/utils/Utils.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_UTILS_UTILS
+#define SRC_DYNAMIC_FUSION_UTILS_UTILS
+
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Check if a tensor should have backing memory. @ref MemoryType
+ */
+inline bool is_alloc_tensor(const ITensorInfo *tensor_info)
+{
+ return tensor_info->id() > ITensorInfo::invalid_tensor_id;
+}
+
+/** Check if a tensor should not have backing memory. @ref MemoryType
+ */
+inline bool is_noalloc_tensor(const ITensorInfo *tensor_info)
+{
+ return tensor_info->id() < ITensorInfo::invalid_tensor_id;
+}
+
+/** Check if an @ref ITensorInfo has a valid id
+ */
+inline bool is_valid_tensor(const ITensorInfo *tensor_info)
+{
+ return tensor_info->has_valid_id();
+}
+
+/** Check if an @ref ITensorInfo has an invalid id
+ */
+inline bool is_invalid_tensor(const ITensorInfo *tensor_info)
+{
+ return !is_valid_tensor(tensor_info);
+}
+
+/** Inline function to convert @ref Pool2dAttributes to PoolingLayerInfo
+*/
+inline PoolingLayerInfo convert_pool_attr_to_pool_info(const Pool2dAttributes &pool_attr,
+ bool mixed_precision = false,
+ DataLayout data_layout = DataLayout::NHWC)
+{
+ // Create PadStrideInfo
+ const Size2D stride = pool_attr.stride();
+ const Padding2D padding = pool_attr.pad();
+ const PadStrideInfo pad_stride(stride.x(), stride.y(), padding.left, padding.top,
+ arm_compute::DimensionRoundingType::FLOOR);
+
+ return PoolingLayerInfo(pool_attr.pool_type(), pool_attr.pool_size(), data_layout, pad_stride,
+ pool_attr.exclude_padding(), mixed_precision);
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif /* SRC_DYNAMIC_FUSION_UTILS_UTILS */
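
For context, the sketch below uses convert_pool_attr_to_pool_info to build a 2x2 max-pool PoolingLayerInfo. It is a hedged usage example: the fluent setters on Pool2dAttributes (pool_type, pool_size, stride, pad, exclude_padding) are assumed to mirror the getters read by the helper above, and the Padding2D constructor is assumed to take left/right/top/bottom; only the include path of Pool2dAttributes and the helper's behaviour (FLOOR rounding, NHWC default layout, mixed precision off by default) come from the patch itself.

    #include "arm_compute/core/Types.h"
    #include "arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h"
    #include "src/dynamic_fusion/utils/Utils.h"

    using namespace arm_compute;
    using namespace arm_compute::experimental::dynamic_fusion;

    PoolingLayerInfo make_2x2_max_pool_info()
    {
        Pool2dAttributes attr{};
        attr.pool_type(PoolingType::MAX) // reduction applied within each window
            .pool_size(Size2D(2, 2))     // 2x2 pooling window
            .stride(Size2D(2, 2))        // non-overlapping windows
            .pad(Padding2D(0, 0, 0, 0))  // no padding: left, right, top, bottom (assumed order)
            .exclude_padding(true);      // ignore padded elements when averaging (irrelevant for MAX)

        // The helper hard-codes FLOOR rounding; layout defaults to NHWC and mixed precision to false.
        return convert_pool_attr_to_pool_info(attr, /* mixed_precision = */ false);
    }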