Diffstat (limited to 'src/dynamic_fusion/runtime/gpu')
-rw-r--r--  src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp                          148
-rw-r--r--  src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h                             81
-rw-r--r--  src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp                        380
-rw-r--r--  src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.cpp  105
-rw-r--r--  src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h     70
5 files changed, 784 insertions, 0 deletions
diff --git a/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp
new file mode 100644
index 0000000000..eab5cddd07
--- /dev/null
+++ b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ClKernelRuntime.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+
+#include "src/core/CL/CLUtils.h"
+#include "src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h"
+#include "src/gpu/cl/ClKernelLibrary.h"
+#include "support/Cast.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+using namespace arm_compute::opencl;
+
+void ClKernelRuntime::configure(const ClCompileContext &compile_ctx, const GpuKernelSourceCode &code)
+{
+ // Create kernel from kernel source string
+ opencl::ClKernelLibrary &klib = opencl::ClKernelLibrary::get();
+ _kernel = static_cast<cl::Kernel>(compile_ctx.create_kernel(
+ code.name(),
+ code.name(), // The program name must be provided to differentiate between the kernels of different unfusable components.
+ // Each program contains exactly one kernel.
+ code.code(), klib.kernel_path() /* Kernel path: Used in cases of embedded kernels */,
+ code.build_options().options(), false /* Is source binary */));
+
+ // Configure execution window
+ IClKernel::configure_internal(code.window());
+
+ // Set config id for lws tuning
+ _config_id = code.config_id();
+
+ // Set kernel arguments
+ _arguments = code.arguments();
+}
+
+inline void ClKernelRuntime::add_kernel_argument(unsigned int &idx,
+ const GpuKernelArgumentBinding &arg,
+ const ICLTensor *tensor,
+ std::vector<cl::Image2D> &cl_images)
+{
+ switch (arg.type())
+ {
+ case GpuKernelArgumentBinding::Type::TensorStorage:
+ {
+ switch (arg.tensor_storage_type())
+ {
+ case TensorStorageType::ClBufferUint8Ptr:
+ {
+ cl_add_buffer_argument(_kernel, idx, tensor->cl_buffer());
+ break;
+ }
+ case TensorStorageType::ClImage2dReadOnly:
+ {
+ cl::Image2D tensor_image2d = create_image2d_from_tensor(tensor, CLImage2DType::ReadOnly);
+ cl_images.push_back(tensor_image2d);
+ cl_add_texture_argument(_kernel, idx, tensor_image2d);
+ break;
+ }
+ case TensorStorageType::ClImage2dWriteOnly:
+ {
+ cl::Image2D tensor_image2d = create_image2d_from_tensor(tensor, CLImage2DType::WriteOnly);
+ cl_images.push_back(tensor_image2d);
+ cl_add_texture_argument(_kernel, idx, tensor_image2d);
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Do not accept other TensorStorageType");
+ break;
+ }
+ }
+ break;
+ }
+ case GpuKernelArgumentBinding::Type::TensorComponent:
+ {
+ cl_add_tensor_component_argument(_kernel, idx, tensor, arg.tensor_component_type());
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Do not accept other types of kernel arguments");
+ break;
+ }
+ }
+}
+
+void ClKernelRuntime::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_3D();
+
+ /// NOTE: These parameters were extracted from the old kernels. So far they have been constant,
+ /// but they may need to become part of the configuration passed from GpuWorkloadSourceCode in the future.
+ constexpr bool skip_sliding_window = false;
+ constexpr bool use_dummy_work_items = false;
+
+ unsigned int idx = 0;
+ do
+ {
+ // Set kernel arguments
+ // CLImages created from tensor arguments. Need to be retained until enqueue
+ std::vector<cl::Image2D> cl_images;
+
+ for (const auto &arg : _arguments)
+ {
+ auto tensor = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(arg.id()));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(tensor->info());
+ add_kernel_argument(idx, arg, tensor, cl_images);
+ }
+
+ // Dispatch kernel
+ enqueue(queue, *this, slice, lws_hint(), use_dummy_work_items);
+ } while (skip_sliding_window && window.slide_window_slice_3D(slice));
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h
new file mode 100644
index 0000000000..148e4db581
--- /dev/null
+++ b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_RUNTIME_GPU_CL_CLKERNELRUNTIME_H
+#define ACL_SRC_DYNAMIC_FUSION_RUNTIME_GPU_CL_CLKERNELRUNTIME_H
+
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+class GpuKernelSourceCode;
+
+/** OpenCL runtime to run a single kernel */
+class ClKernelRuntime final : public opencl::IClKernel
+{
+public:
+ /** Configure the kernel runtime
+ *
+ * @param[in] compile_ctx OpenCL compile context
+ * @param[in] code Kernel source code
+ */
+ void configure(const opencl::ClCompileContext &compile_ctx, const GpuKernelSourceCode &code);
+ /** Run the kernel
+ *
+ * @param[in,out] tensors @ref ITensorPack object containing run-time tensor memories
+ * @param[in] window Execution window
+ * @param[in] queue OpenCL command queue
+ */
+ virtual void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ /** Set a kernel argument as part of a tensor
+ *
+ * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
+ * @param[in] arg Kernel argument binding, as part of @p tensor
+ * @param[in] tensor Tensor to which the kernel argument @p arg belongs
+ * @param[out] cl_images Extra cl images created from the tensor (will need to be retained until the kernel is enqueued)
+ */
+ inline void add_kernel_argument(unsigned int &idx,
+ const GpuKernelArgumentBinding &arg,
+ const ICLTensor *tensor,
+ std::vector<cl::Image2D> &cl_images);
+
+private:
+ GpuKernelArgumentList _arguments{};
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_SRC_DYNAMIC_FUSION_RUNTIME_GPU_CL_CLKERNELRUNTIME_H
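A minimal sketch (not part of the patch) of how this internal kernel wrapper is driven, assuming a valid opencl::ClCompileContext compile_ctx, a GpuKernelSourceCode code produced by the sketch, and an ITensorPack pack holding the tensors referenced by code.arguments():

    ClKernelRuntime kernel;
    kernel.configure(compile_ctx, code);                            // builds the cl::Kernel and caches the argument list
    CLScheduler::get().enqueue_op(kernel, pack, /* flush */ false); // ends up calling run_op(pack, window, queue)

In practice this wiring is done by ClWorkloadRuntime (see ClWorkloadRuntime.cpp below) rather than by user code.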
diff --git a/src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp b/src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp
new file mode 100644
index 0000000000..3500a0e60d
--- /dev/null
+++ b/src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp
@@ -0,0 +1,380 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h"
+
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+#include "src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h"
+#include "support/Cast.h"
+
+#include <algorithm>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+/** Holder of any auxiliary @ref CLTensor required by a @ref GpuWorkloadSourceCode.
+ *
+ * @note The tensors are not allocated by default, and require the user to explicitly allocate them using the associated @ref TensorInfo and @ref AuxMemoryInfo
+ *
+ * @note This data holder must remain valid until the @ref ClWorkloadRuntime that uses it is out of scope
+ */
+class ClAuxTensors
+{
+public:
+ /** A view of a single auxiliary tensor and its associated @ref TensorInfo and @ref AuxMemoryInfo
+ */
+ struct DataView
+ {
+ DataView() = default;
+ DataView(CLTensor *tensor, const TensorInfo &tensor_info, const AuxMemoryInfo &memory_info)
+ : tensor{tensor}, tensor_info{tensor_info}, memory_info{memory_info}
+ {
+ }
+ ~DataView() = default;
+ DataView(const DataView &other) = default;
+ DataView &operator=(const DataView &other) = default;
+ DataView(DataView &&other) = default;
+ DataView &operator=(DataView &&other) = default;
+ CLTensor *tensor{}; /**< Pointer to the auxiliary tensor */
+ TensorInfo tensor_info{}; /**< Associated tensor info */
+ AuxMemoryInfo memory_info{}; /**< Memory requirement */
+ };
+
+ /** Get views of all auxiliary tensors. This is mainly used for allocating the auxiliary tensors. */
+ std::vector<DataView> get_tensors()
+ {
+ return _tensors;
+ }
+ std::vector<DataView> get_tensors() const
+ {
+ return _tensors;
+ }
+
+ friend Status create_aux_tensors(ClAuxTensors *aux_tensors, const GpuWorkloadSourceCode &code);
+
+private:
+ /** Add an auxiliary tensor.
+ *
+ * @param[in] tensor_info @ref ITensorInfo of the auxiliary tensor
+ * @param[in] aux_memory_info Memory requirements of the auxiliary tensor
+ *
+ * @return CLTensor* Corresponding tensor memory if successfully added, otherwise nullptr
+ */
+ CLTensor *add_aux_tensor(const ITensorInfo &tensor_info, const AuxMemoryInfo &aux_memory_info)
+ {
+ const auto t_id = tensor_info.id();
+ auto find_tensor_pair = _owned_tensors.find(t_id);
+ if (find_tensor_pair != _owned_tensors.end())
+ {
+ return find_tensor_pair->second.get();
+ }
+ else
+ {
+ auto tensor = std::make_unique<CLTensor>();
+ auto inserted_pair = _owned_tensors.emplace(t_id, std::move(tensor)).first;
+ auto new_tensor = inserted_pair->second.get();
+ _tensors.emplace_back(new_tensor, tensor_info, aux_memory_info);
+ return new_tensor;
+ }
+ }
+
+ std::map<ITensorInfo::Id, std::unique_ptr<CLTensor>> _owned_tensors{};
+ std::vector<DataView> _tensors{};
+};
+/** Construct auxiliary tensors required by @ref GpuWorkloadSourceCode
+ *
+ * @note This is the only recommended way for the user to create @ref ClAuxTensors
+ *
+ * @param[out] aux_tensors Auxiliary tensors required by the workload code
+ * @param[in] code @ref GpuWorkloadSourceCode which all tensors bind to
+ *
+ * @return Status
+ */
+Status create_aux_tensors(ClAuxTensors *aux_tensors, const GpuWorkloadSourceCode &code)
+{
+ for (auto t_id : code.tensors())
+ {
+ // Get tensor object
+ const auto workload_arg = code.query_tensor(t_id);
+ ICLTensor *tensor_object = nullptr;
+ if (workload_arg->memory_descriptor()->memory_type == MemoryType::Auxiliary)
+ {
+ // Create aux tensor CLTensor object
+ const TensorInfo tensor_info = *workload_arg->tensor_info();
+ ARM_COMPUTE_ERROR_ON(tensor_info.id() != t_id);
+ const auto aux_memory_info = workload_arg->memory_descriptor()->aux_memory_info;
+ tensor_object = aux_tensors->add_aux_tensor(tensor_info, aux_memory_info);
+
+ if (tensor_object == nullptr)
+ {
+ return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Failed to construct an auxiliary tensor");
+ }
+ }
+ }
+ return Status{};
+}
+
+/** A fast tensor lookup table for runtime tensor object retrieval
+ */
+class ClTensorLUT
+{
+public:
+ /** Find a tensor pack associated with the @ref UnitWorkloadId @p uwk_id
+ *
+ * @param[in] uwk_id @ref UnitWorkloadId associated with the tensor pack
+ *
+ * @return ITensorPack*
+ */
+ ITensorPack *find_tensor_pack(UnitWorkloadId uwk_id)
+ {
+ auto tensor_pack = _tensor_packs.find(uwk_id);
+ if (tensor_pack != _tensor_packs.end())
+ {
+ return &(tensor_pack->second);
+ }
+ return nullptr;
+ }
+ /** Get the tensor pack associated with @p uwk_id. Throws an exception if it cannot be found.
+ *
+ * @param[in] uwk_id @ref UnitWorkloadId associated with the tensor pack
+ *
+ * @return ITensorPack&
+ */
+ ITensorPack &get_tensor_pack(UnitWorkloadId uwk_id)
+ {
+ return _tensor_packs.at(uwk_id);
+ }
+
+ friend Status create_tensor_lut(ClTensorLUT *tensor_lut,
+ const GpuWorkloadSourceCode &code,
+ const std::vector<CLTensor *> &user_tensors,
+ const ClAuxTensors &aux_tensors);
+
+private:
+ /** Add a tensor pack and associate it with @ref UnitWorkloadId @p uwk_id
+ *
+ * @param[in] uwk_id @ref UnitWorkloadId associated with the tensor pack
+ * @param[in] tensor_pack Tensor pack to be added
+ */
+ void add_tensor_pack(UnitWorkloadId uwk_id, const ITensorPack &tensor_pack)
+ {
+ _tensor_packs[uwk_id] = tensor_pack;
+ }
+ std::map<UnitWorkloadId, ITensorPack> _tensor_packs{};
+};
+
+/** Create a fast tensor lookup table for runtime tensor retrieval
+ *
+ * @param[out] tensor_lut @ref ClTensorLUT used by the runtime to feed tensor memories to underlying kernels
+ * @param[in] code @ref GpuWorkloadSourceCode which all tensors bind to
+ * @param[in] user_tensors User tensors
+ * @param[in] aux_tensors Auxiliary tensors required by the workload code
+ *
+ * @return Status
+ */
+Status create_tensor_lut(ClTensorLUT *tensor_lut,
+ const GpuWorkloadSourceCode &code,
+ const std::vector<CLTensor *> &user_tensors,
+ const ClAuxTensors &aux_tensors)
+{
+ // Combine user tensors and aux tensors
+ std::map<ITensorInfo::Id, CLTensor *> tensor_map;
+ for (auto tensor : user_tensors)
+ {
+ const auto t_id = tensor->info()->id();
+
+ if (tensor_map.find(t_id) != tensor_map.end())
+ {
+ // In-place elementwise case: the same in/out tensor is passed twice, so give its second occurrence a new Id
+ std::vector<ITensorInfo::Id> ids;
+ for (auto &t : tensor_map)
+ {
+ ids.push_back(t.first);
+ }
+ ITensorInfo::Id new_id = *std::max_element(ids.begin(), ids.end()) + 1;
+ tensor_map[new_id] = tensor;
+ }
+ else
+ {
+ tensor_map[t_id] = tensor;
+ }
+ }
+ for (const auto &data : aux_tensors.get_tensors())
+ {
+ const auto t_id = data.tensor_info.id();
+ const auto tensor = data.tensor;
+ if (tensor_map.find(t_id) != tensor_map.end())
+ {
+ return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Clashing tensor ids");
+ }
+ tensor_map[t_id] = tensor;
+ }
+
+ // Add tensor objects into corresponding tensor packs
+ for (auto id_tensor : tensor_map)
+ {
+ const auto t_id = id_tensor.first;
+ const auto tensor_object = id_tensor.second;
+ if (tensor_object == nullptr)
+ {
+ return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Trying to add a nullptr into the tensor packs");
+ }
+ if (tensor_object->allocator()->info().total_size() == 0U)
+ {
+ return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "No allocated memory found in tensor");
+ }
+
+ for (auto uwk_id : code.get_unit_workloads_from_tensor(t_id))
+ {
+ ITensorPack *tensor_pack = tensor_lut->find_tensor_pack(uwk_id);
+ if (tensor_pack == nullptr)
+ {
+ tensor_lut->add_tensor_pack(uwk_id, ITensorPack{{t_id, tensor_object}});
+ }
+ else
+ {
+ tensor_pack->add_tensor(t_id, tensor_object);
+ }
+ }
+ }
+
+ return Status{};
+}
+
+} // namespace
+
+struct ClWorkloadRuntime::Implementation
+{
+ std::map<UnitWorkloadId, std::unique_ptr<ClKernelRuntime>> _kernels{};
+ std::map<UnitWorkloadId, std::unique_ptr<ClKernelRuntime>> _kernels_prep{};
+ bool _is_configured{false};
+ bool _is_prepared{false};
+ ClTensorLUT _tensor_lut{};
+ ClAuxTensors _aux_tensors{};
+ GpuWorkloadSourceCode _source_code{};
+};
+
+ClWorkloadRuntime::ClWorkloadRuntime() : _impl{std::make_unique<Implementation>()}
+{
+}
+
+ClWorkloadRuntime::~ClWorkloadRuntime() = default;
+
+ClWorkloadRuntime::ClWorkloadRuntime(ClWorkloadRuntime &&) = default;
+
+ClWorkloadRuntime &ClWorkloadRuntime::operator=(ClWorkloadRuntime &&) = default;
+
+Status ClWorkloadRuntime::configure(const GpuWorkloadSketch &sketch)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(_impl->_is_configured, "ClWorkloadRuntime cannot be re-configured");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(sketch.gpu_context()->gpu_language() != GpuLanguage::OpenCL,
+ "ClWorkloadRuntime cannot be configured with non-OpenCL workload sketch");
+ // Generate source code
+ _impl->_source_code = sketch.implementation().generate_source_code();
+ // Configure unit workload from source code
+ for (auto uwk_id : _impl->_source_code.unit_workloads())
+ {
+ const auto work = _impl->_source_code.query_unit_workload(uwk_id);
+ const auto stage = work.stage().stage;
+ auto k = std::make_unique<ClKernelRuntime>();
+ k->configure(*sketch.gpu_context()->cl_compile_context(), work.code());
+
+ switch (stage)
+ {
+ case UnitWorkloadStage::Stage::Run:
+ {
+ _impl->_kernels.emplace(work.id(), std::move(k));
+ break;
+ }
+ case UnitWorkloadStage::Stage::Prepare:
+ {
+ _impl->_kernels_prep.emplace(work.id(), std::move(k));
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Invalid unit workload stage");
+ }
+ }
+ }
+ // Create auxiliary tensor objects
+ create_aux_tensors(&_impl->_aux_tensors, _impl->_source_code);
+ _impl->_is_configured = true;
+ return Status{};
+}
+
+void ClWorkloadRuntime::prepare()
+{
+ if (!_impl->_is_prepared)
+ {
+ for (auto &id_kernel_pair : _impl->_kernels_prep)
+ {
+ const bool flush_queue = false;
+ const auto uwk_id = id_kernel_pair.first;
+ auto kernel = id_kernel_pair.second.get();
+ CLScheduler::get().enqueue_op(*kernel, _impl->_tensor_lut.get_tensor_pack(uwk_id), flush_queue);
+ }
+
+ _impl->_is_prepared = true;
+ }
+}
+
+Status ClWorkloadRuntime::run(const std::vector<CLTensor *> &tensors)
+{
+ // Need to create the tensor lut in every run, unless the user can guarantee the binding remains fixed,
+ // in which case the lut can be cached during prepare
+ const auto st = create_tensor_lut(&_impl->_tensor_lut, _impl->_source_code, tensors, _impl->_aux_tensors);
+ ARM_COMPUTE_RETURN_ON_ERROR(st);
+ prepare();
+ for (auto &id_kernel_pair : _impl->_kernels)
+ {
+ // Do not flush the command queue here; flushing/synchronisation is left to the caller
+ const bool flush_queue = false;
+ const auto uwk_id = id_kernel_pair.first;
+ auto kernel = id_kernel_pair.second.get();
+ CLScheduler::get().enqueue_op(*kernel, _impl->_tensor_lut.get_tensor_pack(uwk_id), flush_queue);
+ }
+ return Status{};
+}
+
+std::vector<std::tuple<CLTensor *, TensorInfo, AuxMemoryInfo>> ClWorkloadRuntime::get_auxiliary_tensors()
+{
+ std::vector<std::tuple<CLTensor *, TensorInfo, AuxMemoryInfo>> aux_tensors;
+ for (const auto &data : _impl->_aux_tensors.get_tensors())
+ {
+ aux_tensors.emplace_back(data.tensor, data.tensor_info, data.memory_info);
+ }
+ return aux_tensors;
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
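A usage sketch (not part of the patch) of the public ClWorkloadRuntime API implemented above, assuming sketch is a fully built GpuWorkloadSketch and t0, t1, t2 are allocated CLTensors matching the sketch's arguments:

    ClWorkloadRuntime runtime;
    runtime.configure(sketch);

    // Allocate the auxiliary tensors requested by the workload
    for (auto &data : runtime.get_auxiliary_tensors())
    {
        CLTensor     *tensor  = std::get<0>(data);
        TensorInfo    info    = std::get<1>(data);
        AuxMemoryInfo aux_req = std::get<2>(data);
        tensor->allocator()->init(info, aux_req.alignment);
        tensor->allocator()->allocate();
    }

    // Bind the user tensors in the order the sketch expects and run
    runtime.run({&t0, &t1, &t2});

Note that run() rebuilds the tensor lookup table on every call, so the same runtime can be re-run with different tensor bindings.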
diff --git a/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.cpp b/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.cpp
new file mode 100644
index 0000000000..7044b0ea66
--- /dev/null
+++ b/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.cpp
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "GpuCkwKernelArgumentsHelpers.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+void cl_add_tensor_component_argument(cl::Kernel &kernel,
+ unsigned int &idx,
+ const ICLTensor *tensor,
+ TensorComponentType component)
+{
+ ARM_COMPUTE_ERROR_ON(tensor == nullptr);
+
+ const auto *info = tensor->info();
+ const auto &strides = info->strides_in_bytes();
+
+ switch (component)
+ {
+ case TensorComponentType::OffsetFirstElement:
+ kernel.setArg<cl_uint>(idx++, info->offset_first_element_in_bytes());
+ break;
+ case TensorComponentType::Stride0:
+ kernel.setArg<cl_uint>(idx++, strides[0]);
+ break;
+ case TensorComponentType::Stride1:
+ kernel.setArg<cl_uint>(idx++, strides[1]);
+ break;
+ case TensorComponentType::Stride2:
+ kernel.setArg<cl_uint>(idx++, strides[2]);
+ break;
+ case TensorComponentType::Stride3:
+ kernel.setArg<cl_uint>(idx++, strides[3]);
+ break;
+ case TensorComponentType::Stride4:
+ kernel.setArg<cl_uint>(idx++, strides[4]);
+ break;
+ case TensorComponentType::Dim0:
+ kernel.setArg<cl_uint>(idx++, info->dimension(0));
+ break;
+ case TensorComponentType::Dim1:
+ kernel.setArg<cl_uint>(idx++, info->dimension(1));
+ break;
+ case TensorComponentType::Dim2:
+ kernel.setArg<cl_uint>(idx++, info->dimension(2));
+ break;
+ case TensorComponentType::Dim3:
+ kernel.setArg<cl_uint>(idx++, info->dimension(3));
+ break;
+ case TensorComponentType::Dim4:
+ kernel.setArg<cl_uint>(idx++, info->dimension(4));
+ break;
+ case TensorComponentType::Dim1xDim2:
+ kernel.setArg<cl_uint>(idx++, info->dimension(1) * info->dimension(2));
+ break;
+ case TensorComponentType::Dim2xDim3:
+ kernel.setArg<cl_uint>(idx++, info->dimension(2) * info->dimension(3));
+ break;
+ case TensorComponentType::Dim1xDim2xDim3:
+ kernel.setArg<cl_uint>(idx++, info->dimension(1) * info->dimension(2) * info->dimension(3));
+ break;
+ case TensorComponentType::Unknown:
+ default:
+ ARM_COMPUTE_ERROR("Unknown tensor component");
+ }
+}
+
+void cl_add_buffer_argument(cl::Kernel &kernel, unsigned int &idx, const cl::Buffer &buffer)
+{
+ kernel.setArg(idx++, buffer);
+}
+
+void cl_add_texture_argument(cl::Kernel &kernel, unsigned int &idx, const cl::Image &image)
+{
+ kernel.setArg(idx++, image);
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h b/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h
new file mode 100644
index 0000000000..306d547acb
--- /dev/null
+++ b/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_SRC_DYNAMIC_FUSION_RUNTIME_GPU_CL_CKW_DRIVER_GPUCKWKERNELARGUMENTSHELPERS
+#define ACL_SRC_DYNAMIC_FUSION_RUNTIME_GPU_CL_CKW_DRIVER_GPUCKWKERNELARGUMENTSHELPERS
+
+#include "arm_compute/core/CL/ICLTensor.h"
+
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Select a Compute Kernel Writer tensor component from a tensor and add it to the kernel's arguments at the specified index @p idx.
+ *
+ * @param[in,out] kernel OpenCL kernel to configure with the provided argument.
+ * @param[in,out] idx Index at which to add the argument.
+ * @param[in] tensor Tensor from which to access the tensor component.
+ * @param[in] component Tensor component to select, such as tensor dimensions, strides, etc.
+ */
+void cl_add_tensor_component_argument(cl::Kernel &kernel,
+ unsigned int &idx,
+ const ICLTensor *tensor,
+ TensorComponentType component);
+
+/** Add an OpenCL buffer object to the kernel's arguments at the specified index @p idx.
+ *
+ * @param[in,out] kernel OpenCL kernel to configure with the provided argument.
+ * @param[in,out] idx Index at which to add the argument.
+ * @param[in] buffer OpenCL buffer containing the tensor's data.
+ */
+void cl_add_buffer_argument(cl::Kernel &kernel, unsigned int &idx, const cl::Buffer &buffer);
+
+/** Add an OpenCL image object to the kernel's arguments at the specified index @p idx.
+ *
+ * @param[in,out] kernel OpenCL kernel to configure with the provided argument.
+ * @param[in,out] idx Index at which to add the argument.
+ * @param[in] image OpenCL image containing the tensor's data.
+ */
+void cl_add_texture_argument(cl::Kernel &kernel, unsigned int &idx, const cl::Image &image);
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif /* ACL_SRC_DYNAMIC_FUSION_RUNTIME_GPU_CL_CKW_DRIVER_GPUCKWKERNELARGUMENTSHELPERS */
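A short sketch (not part of the patch) of how these helpers compose when binding a tensor by hand, assuming an existing cl::Kernel kernel and an ICLTensor *tensor whose CL buffer is already allocated:

    unsigned int idx = 0;
    cl_add_buffer_argument(kernel, idx, tensor->cl_buffer());                            // arg 0: raw buffer
    cl_add_tensor_component_argument(kernel, idx, tensor, TensorComponentType::Stride1); // arg 1: stride of dimension 1 in bytes
    cl_add_tensor_component_argument(kernel, idx, tensor, TensorComponentType::Dim0);    // arg 2: size of dimension 0
    // idx now points at the next free argument slot

The index is passed by reference and incremented by each helper, which is what lets ClKernelRuntime::add_kernel_argument chain these calls while walking the GpuKernelArgumentList.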