Add GpuKernelArgumentBinding for runtime argument setting

* Add flexible runtime argument setting that accept argument bindings exported from ckw. * Introduce internal build flag ACL_INTERNAL_TEST_CKW_IN_DF. If set to true, ckw will be tested in dynamic fusion validation tests. Otherwise it will not be tested and the dynamic fusion will keep using ClTemplateWriter instead. * Fix CKW sampler for elementwise binary to deal with tile sizes > 1 in both dimensions Resolves: COMPMID-6282 Partially resolves: COMPMID-6260 Signed-off-by: SiCong Li <sicong.li@arm.com> Change-Id: I0ab225a4484eb2119643d900a4e72806558626ee Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9917 Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Jakub Sujak <jakub.sujak@arm.com> Reviewed-by: Anitha Raj <Anitha.Raj@arm.com> Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Benchmark: Arm Jenkins <bsgcomp@arm.com>
author: SiCong Li <sicong.li@arm.com> 2023-06-28 09:49:45 +0100
committer: SiCong Li <sicong.li@arm.com> 2023-07-25 15:48:50 +0000
commit: 23882a9014eb3972bca958206866c8e0d0b829cc (patch)
tree: 9139b91699099160e26a64abd8cf182bd7447278
parent: 0a59e69fd922b02d9e3b5b043ee7f891061df7be (diff)
download: ComputeLibrary-23882a9014eb3972bca958206866c8e0d0b829cc.tar.gz
45 files changed, 906 insertions, 304 deletions
diff --git a/SConscript b/SConscript
index da7683ed6c..868bc9f8f5 100644
--- a/SConscript
+++ b/SConscript
@@ -564,7 +564,14 @@ if env['fixed_format_kernels']:
 # Experimental files
 # Dynamic fusion
 if env['experimental_dynamic_fusion']:
-    lib_files += filelist['experimental']['dynamic_fusion']
+    lib_files += filelist['experimental']['dynamic_fusion']['common']
+    lib_files += filelist['experimental']['dynamic_fusion']['template_writer']
+
+if "ACL_INTERNAL_TEST_CKW_IN_DF" in env["extra_cxx_flags"]:
+    if not env["experimental_dynamic_fusion"]:
+        print("To use ACL_INTERNAL_TEST_CKW_IN_DF experimental_dynamic_fusion must be set to 1")
+        Exit(1)
+    lib_files += filelist['experimental']['dynamic_fusion']['ckw_driver']
 
 # Logging files
 if env["logging"]:
diff --git a/compute_kernel_writer/prototype/include/ckw/Kernel.h b/compute_kernel_writer/prototype/include/ckw/Kernel.h
index 3deb2ace0d..ba31a29ba7 100644
--- a/compute_kernel_writer/prototype/include/ckw/Kernel.h
+++ b/compute_kernel_writer/prototype/include/ckw/Kernel.h
@@ -50,6 +50,11 @@ class Kernel
 public:
     /** Constructor
      *
+     * @param[in] language The programming language to write the kernel.
+     */
+    Kernel(GpuTargetLanguage language);
+    /** Constructor
+     *
      * @param[in] name     The name of the kernel function.
      * @param[in] language The programming language to write the kernel.
      */
@@ -61,6 +66,12 @@ public:
     /** Get the name of the kernel function. */
     const std::string &name() const;
 
+    /** Set the name of the kernel function.
+     *
+     * @param[in] name     The name of the kernel function.
+     */
+    void name(const std::string &name);
+
     /** Get the list of kernel arguments. */
     ::std::vector<KernelArgument> arguments() const;
 
diff --git a/compute_kernel_writer/prototype/src/Kernel.cpp b/compute_kernel_writer/prototype/src/Kernel.cpp
index 884b69afc6..095ac879f1 100644
--- a/compute_kernel_writer/prototype/src/Kernel.cpp
+++ b/compute_kernel_writer/prototype/src/Kernel.cpp
@@ -30,11 +30,17 @@
 namespace ckw
 {
 
+Kernel::Kernel(GpuTargetLanguage language)
+    : Kernel{"unnamed", language}
+{
+}
+
 Kernel::Kernel(const char *name, GpuTargetLanguage language)
     : _name(name), _kernel(std::make_unique<prototype::GpuKernelWriterDataHolder>(language)), _operands{}, _tensor_id_operands{}
 {
 }
 
+
 Kernel::~Kernel()
 {
 }
@@ -44,6 +50,10 @@ const std::string &Kernel::name() const
     return _name;
 }
 
+void Kernel::name(const std::string& name)
+{
+    _name = name;
+}
 std::vector<KernelArgument> Kernel::arguments() const
 {
     std::vector<KernelArgument> arguments;
diff --git a/filelist.json b/filelist.json
index 18fcaa81a0..b7845a760a 100644
--- a/filelist.json
+++ b/filelist.json
@@ -2286,74 +2286,80 @@
     }
   },
   "experimental": {
-    "dynamic_fusion": [
-      "src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp",
-      "src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp",
-      "src/dynamic_fusion/sketch/attributes/CastAttributes.cpp",
-      "src/dynamic_fusion/sketch/attributes/ClampAttributes.cpp",
-      "src/dynamic_fusion/sketch/attributes/Conv2dAttributes.cpp",
-      "src/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.cpp",
-      "src/dynamic_fusion/sketch/attributes/Pool2dAttributes.cpp",
-      "src/dynamic_fusion/sketch/attributes/ResizeAttributes.cpp",
-      "src/dynamic_fusion/sketch/attributes/SoftmaxAttributes.cpp",
-      "src/dynamic_fusion/sketch/attributes/ReshapeAttributes.cpp",
-      "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.cpp",
-      "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp",
-      "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp",
-      "src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp",
-      "src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp",
-      "src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp",
-      "src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp",
-      "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp",
-      "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.cpp",
-      "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.cpp",
-      "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp",
-      "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp",
-      "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp",
-      "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp",
-      "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.cpp",
-      "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.cpp",
-      "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.cpp",
-      "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.cpp",
-      "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp",
-      "src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp",
-      "src/dynamic_fusion/sketch/gpu/operators/GpuCast.cpp",
-      "src/dynamic_fusion/sketch/gpu/operators/GpuClamp.cpp",
-      "src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp",
-      "src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp",
-      "src/dynamic_fusion/sketch/gpu/operators/GpuMul.cpp",
-      "src/dynamic_fusion/sketch/gpu/operators/GpuReshape.cpp",
-      "src/dynamic_fusion/sketch/gpu/operators/GpuPool2d.cpp",
-      "src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp",
-      "src/dynamic_fusion/sketch/gpu/operators/GpuResize.cpp",
-      "src/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.cpp",
-      "src/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.cpp",
-      "src/dynamic_fusion/sketch/gpu/operators/GpuSub.cpp",
-      "src/dynamic_fusion/sketch/gpu/operators/GpuTanh.cpp",
-      "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.cpp",
-      "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp",
-      "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp",
-      "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp",
-      "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp",
-      "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp",
-      "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp",
-      "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.cpp",
-      "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.cpp",
-      "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.cpp",
-      "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp",
-      "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp",
-      "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp",
-      "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp",
-      "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp",
-      "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp",
-      "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp",
-      "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp",
-      "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp",
-      "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp",
-      "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.cpp",
-      "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp",
-      "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp",
-      "src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.cpp"
-    ]
+    "dynamic_fusion": {
+      "common": [
+        "src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp",
+        "src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp",
+        "src/dynamic_fusion/sketch/attributes/CastAttributes.cpp",
+        "src/dynamic_fusion/sketch/attributes/ClampAttributes.cpp",
+        "src/dynamic_fusion/sketch/attributes/Conv2dAttributes.cpp",
+        "src/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.cpp",
+        "src/dynamic_fusion/sketch/attributes/Pool2dAttributes.cpp",
+        "src/dynamic_fusion/sketch/attributes/ResizeAttributes.cpp",
+        "src/dynamic_fusion/sketch/attributes/SoftmaxAttributes.cpp",
+        "src/dynamic_fusion/sketch/attributes/ReshapeAttributes.cpp",
+        "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.cpp",
+        "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp",
+        "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp",
+        "src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp",
+        "src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp",
+        "src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp",
+        "src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp",
+        "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp",
+        "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.cpp",
+        "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.cpp",
+        "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp",
+        "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp",
+        "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp",
+        "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp",
+        "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.cpp",
+        "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.cpp",
+        "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.cpp",
+        "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.cpp",
+        "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp",
+        "src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp",
+        "src/dynamic_fusion/sketch/gpu/operators/GpuCast.cpp",
+        "src/dynamic_fusion/sketch/gpu/operators/GpuClamp.cpp",
+        "src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp",
+        "src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp",
+        "src/dynamic_fusion/sketch/gpu/operators/GpuMul.cpp",
+        "src/dynamic_fusion/sketch/gpu/operators/GpuReshape.cpp",
+        "src/dynamic_fusion/sketch/gpu/operators/GpuPool2d.cpp",
+        "src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp",
+        "src/dynamic_fusion/sketch/gpu/operators/GpuResize.cpp",
+        "src/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.cpp",
+        "src/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.cpp",
+        "src/dynamic_fusion/sketch/gpu/operators/GpuSub.cpp",
+        "src/dynamic_fusion/sketch/gpu/operators/GpuTanh.cpp",
+        "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.cpp"
+      ],
+      "template_writer": [
+        "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp",
+        "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp",
+        "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp",
+        "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp",
+        "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp",
+        "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp",
+        "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.cpp",
+        "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.cpp",
+        "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.cpp",
+        "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp",
+        "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp",
+        "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp",
+        "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp"
+      ],
+      "ckw_driver": [
+        "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp",
+        "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp",
+        "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp",
+        "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp",
+        "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp",
+        "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp",
+        "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.cpp",
+        "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp",
+        "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp",
+        "src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.cpp"
+      ]
+    }
   }
 }
diff --git a/scripts/clang_tidy_rules.py b/scripts/clang_tidy_rules.py
index 11ca6c45f5..1e1ab7f545 100755
--- a/scripts/clang_tidy_rules.py
+++ b/scripts/clang_tidy_rules.py
@@ -43,6 +43,9 @@ def get_list_flags( filename, arch):
     flags.append("-DARM_COMPUTE_OPENCL_ENABLED")
     if arch == "aarch64":
         flags.append("-DARM_COMPUTE_AARCH64_V8_2")
+    if "ckw_driver" in filename:
+        flags.append("-DACL_INTERNAL_TEST_CKW_IN_DF")
+
     return flags
 
 def filter_files( list_files ):
diff --git a/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp
index 6a57482bb2..92ca8557f1 100644
--- a/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp
+++ b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp
@@ -24,6 +24,9 @@
 #include "ClKernelRuntime.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "src/core/CL/CLUtils.h"
+#ifdef ACL_INTERNAL_TEST_CKW_IN_DF
+#include "src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h"
+#endif // ACL_INTERNAL_TEST_CKW_IN_DF
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h"
 #include "src/gpu/cl/ClKernelLibrary.h"
 
@@ -57,6 +60,8 @@ void ClKernelRuntime::configure(const ClCompileContext &compile_ctx, const GpuKe
     _arguments = code.arguments();
 }
 
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
+
 inline void ClKernelRuntime::add_tensor_argument(unsigned int &idx, const GpuKernelArgumentInfo &arg, const ICLTensor *tensor, const Window &arg_slice, std::vector<cl::Image2D> &cl_images)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
@@ -163,21 +168,65 @@ inline void ClKernelRuntime::add_tensor_argument(unsigned int &idx, const GpuKer
     }
 }
 
+#else // ACL_INTERNAL_TEST_CKW_IN_DF
+inline void ClKernelRuntime::add_kernel_argument(unsigned int &idx, const GpuKernelArgumentBinding &arg, const ICLTensor *tensor, std::vector<cl::Image2D> &cl_images)
+{
+    switch(arg.type())
+    {
+        case GpuKernelArgumentBinding::Type::TensorStorage:
+        {
+            switch(arg.tensor_storage_type())
+            {
+                case TensorStorageType::ClBufferUint8Ptr:
+                {
+                    cl_add_buffer_argument(_kernel, idx, tensor->cl_buffer());
+                    break;
+                }
+                case TensorStorageType::ClImage2dReadOnly:
+                {
+                    cl::Image2D tensor_image2d = create_image2d_from_tensor(tensor, CLImage2DType::ReadOnly);
+                    cl_images.push_back(tensor_image2d);
+                    cl_add_texture_argument(_kernel, idx, tensor_image2d);
+                    break;
+                }
+                case TensorStorageType::ClImage2dWriteOnly:
+                {
+                    cl::Image2D tensor_image2d = create_image2d_from_tensor(tensor, CLImage2DType::WriteOnly);
+                    cl_images.push_back(tensor_image2d);
+                    cl_add_texture_argument(_kernel, idx, tensor_image2d);
+                    break;
+                }
+                default:
+                {
+                    ARM_COMPUTE_ERROR("Do not accept other TensorStorageType");
+                    break;
+                }
+            }
+            break;
+        }
+        case GpuKernelArgumentBinding::Type::TensorComponent:
+        {
+            cl_add_tensor_component_argument(_kernel, idx, tensor, arg.tensor_component_type());
+            break;
+        }
+        default:
+        {
+            ARM_COMPUTE_ERROR("Do not accept other types of kernel arguments");
+            break;
+        }
+    }
+}
+
+#endif // ACL_INTERNAL_TEST_CKW_IN_DF
 void ClKernelRuntime::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
     Window slice = window.first_slice_window_3D();
-    // Don't slice matrix along the z dimension if matrix has just 2 dimensions and matrix A more than 2
-    // This scenario can happen when the matrix multiplication is used to perform a convolution operation
-    Window slice_fixed_z = slice;
-    slice_fixed_z.set(Window::DimX, Window::Dimension(0, 1, 1));
-    slice_fixed_z.set(Window::DimY, Window::Dimension(0, 1, 1));
 
     /// NOTE: Parameters extracted from old kernels. So far they seem to be constant
     /// but we may need to make them into another configuration passed from GpuWorkloadSourceCode if needed in the future
-    constexpr bool slide_along_dimz     = true;
     constexpr bool skip_sliding_window  = false;
     constexpr bool use_dummy_work_items = false;
 
@@ -185,23 +234,27 @@ void ClKernelRuntime::run_op(ITensorPack &tensors, const Window &window, cl::Com
     do
     {
         // Set kernel arguments
-        Window arg_slice = slice;
         // CLImages created from tensor arguments. Need to be retained until enqueue
         std::vector<cl::Image2D> cl_images;
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
         for(auto id_arg : _arguments)
         {
             const auto arg    = id_arg.second;
             auto       tensor = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(id_arg.first));
             ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
             ARM_COMPUTE_ERROR_ON_NULLPTR(tensor->info());
-            if(!slide_along_dimz)
-            {
-                // The stride_z for matrix must be zero if we do not slice
-                ARM_COMPUTE_ERROR_ON(tensor->info()->strides_in_bytes()[3] != 0);
-                arg_slice = slice_fixed_z;
-            }
-            add_tensor_argument(idx, *arg.kernel_argument_info(), tensor, arg_slice, cl_images);
+            add_tensor_argument(idx, *arg.kernel_argument_info(), tensor, slice, cl_images);
+        }
+
+#else  // ACL_INTERNAL_TEST_CKW_IN_DF
+        for(const auto &arg : _arguments)
+        {
+            auto tensor = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(arg.id()));
+            ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
+            ARM_COMPUTE_ERROR_ON_NULLPTR(tensor->info());
+            add_kernel_argument(idx, arg, tensor, cl_images);
         }
+#endif // ACL_INTERNAL_TEST_CKW_IN_DF
 
         // Dispatch kernel
         enqueue(queue, *this, slice, lws_hint(), use_dummy_work_items);
diff --git a/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h
index 4787acabcd..92e73503ce 100644
--- a/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h
+++ b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,6 +29,8 @@
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/IClKernel.h"
 
+#include <vector>
+
 namespace arm_compute
 {
 namespace experimental
@@ -57,6 +59,7 @@ public:
     virtual void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
 
 private:
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
     /** Set a kernel tensor argument
      *
      * @param[in,out] idx       Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
@@ -66,9 +69,19 @@ private:
      * @param[out]    cl_images Extra cl images created from the tensor (will need to be retained until the kernel is enqueued)
      */
     inline void add_tensor_argument(unsigned int &idx, const GpuKernelArgumentInfo &arg, const ICLTensor *tensor, const Window &arg_slice, std::vector<cl::Image2D> &cl_images);
+#else  // ACL_INTERNAL_TEST_CKW_IN_DF
+    /** Set a kernel argument as part of a tensor
+     *
+     * @param[in,out] idx       Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
+     * @param[in]     arg       Kernel argument binding, as part of @p tensor
+     * @param[in]     tensor    Tensor of which the kernel argument @p arg is a part of
+     * @param[out]    cl_images Extra cl images created from the tensor (will need to be retained until the kernel is enqueued)
+     */
+    inline void add_kernel_argument(unsigned int &idx, const GpuKernelArgumentBinding &arg, const ICLTensor *tensor, std::vector<cl::Image2D> &cl_images);
+#endif // ACL_INTERNAL_TEST_CKW_IN_DF
 
 private:
-    GpuKernelArgumentList _arguments{}; /** All kernel arguments required by the runtime */
+    GpuKernelArgumentList _arguments{};
 };
 
 } // namespace dynamic_fusion
diff --git a/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.cpp b/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.cpp
index b273c2a20c..84fb279237 100644
--- a/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.cpp
+++ b/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.cpp
@@ -26,7 +26,11 @@
 
 namespace arm_compute
 {
-void cl_add_tensor_component_argument(cl::Kernel &kernel, unsigned int &idx, ICLTensor *tensor, ckw::TensorComponentType component)
+namespace experimental
+{
+namespace dynamic_fusion
+{
+void cl_add_tensor_component_argument(cl::Kernel &kernel, unsigned int &idx, const ICLTensor *tensor, TensorComponentType component)
 {
     ARM_COMPUTE_ERROR_ON(tensor == nullptr);
 
@@ -35,49 +39,49 @@ void cl_add_tensor_component_argument(cl::Kernel &kernel, unsigned int &idx, ICL
 
     switch(component)
     {
-        case ckw::TensorComponentType::OffsetFirstElement:
+        case TensorComponentType::OffsetFirstElement:
             kernel.setArg<cl_uint>(idx++, info->offset_first_element_in_bytes());
             break;
-        case ckw::TensorComponentType::Stride0:
+        case TensorComponentType::Stride0:
             kernel.setArg<cl_uint>(idx++, strides[0]);
             break;
-        case ckw::TensorComponentType::Stride1:
+        case TensorComponentType::Stride1:
             kernel.setArg<cl_uint>(idx++, strides[1]);
             break;
-        case ckw::TensorComponentType::Stride2:
+        case TensorComponentType::Stride2:
             kernel.setArg<cl_uint>(idx++, strides[2]);
             break;
-        case ckw::TensorComponentType::Stride3:
+        case TensorComponentType::Stride3:
             kernel.setArg<cl_uint>(idx++, strides[3]);
             break;
-        case ckw::TensorComponentType::Stride4:
+        case TensorComponentType::Stride4:
             kernel.setArg<cl_uint>(idx++, strides[4]);
             break;
-        case ckw::TensorComponentType::Dim0:
+        case TensorComponentType::Dim0:
             kernel.setArg<cl_uint>(idx++, info->dimension(0));
             break;
-        case ckw::TensorComponentType::Dim1:
+        case TensorComponentType::Dim1:
             kernel.setArg<cl_uint>(idx++, info->dimension(1));
             break;
-        case ckw::TensorComponentType::Dim2:
+        case TensorComponentType::Dim2:
             kernel.setArg<cl_uint>(idx++, info->dimension(2));
             break;
-        case ckw::TensorComponentType::Dim3:
+        case TensorComponentType::Dim3:
             kernel.setArg<cl_uint>(idx++, info->dimension(3));
             break;
-        case ckw::TensorComponentType::Dim4:
+        case TensorComponentType::Dim4:
             kernel.setArg<cl_uint>(idx++, info->dimension(4));
             break;
-        case ckw::TensorComponentType::Dim1xDim2:
+        case TensorComponentType::Dim1xDim2:
             kernel.setArg<cl_uint>(idx++, info->dimension(1) * info->dimension(2));
             break;
-        case ckw::TensorComponentType::Dim2xDim3:
+        case TensorComponentType::Dim2xDim3:
             kernel.setArg<cl_uint>(idx++, info->dimension(2) * info->dimension(3));
             break;
-        case ckw::TensorComponentType::Dim1xDim2xDim3:
+        case TensorComponentType::Dim1xDim2xDim3:
             kernel.setArg<cl_uint>(idx++, info->dimension(1) * info->dimension(2) * info->dimension(3));
             break;
-        case ckw::TensorComponentType::Unknown:
+        case TensorComponentType::Unknown:
         default:
             ARM_COMPUTE_ERROR("Unknown tensor component");
     }
@@ -93,4 +97,6 @@ void cl_add_texture_argument(cl::Kernel &kernel, unsigned int &idx, const cl::Im
     kernel.setArg(idx++, image);
 }
 
+} // namespace dynamic_fusion
+} // namespace experimental
 } // namespace arm_compute
diff --git a/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h b/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h
index 3c785732a5..4cbb157a48 100644
--- a/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h
+++ b/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h
@@ -27,10 +27,14 @@
 
 #include "arm_compute/core/CL/ICLTensor.h"
 
-#include "ckw/TensorInfo.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
 
 namespace arm_compute
 {
+namespace experimental
+{
+namespace dynamic_fusion
+{
 /** Select a Compute Kernel Writer tensor component from a tensor and add to the kernel's arguments at the specified index idx.
  *
  * @param[in,out] kernel    OpenCL kernel to configure with the provided argument.
@@ -38,7 +42,7 @@ namespace arm_compute
  * @param[in]     tensor    Tensor from which to access the tensor component.
  * @param[in]     component Tensor component to select such as tensor dimensions, strides, etc.
  */
-void cl_add_tensor_component_argument(cl::Kernel &kernel, unsigned int &idx, ICLTensor *tensor, ckw::TensorComponentType component);
+void cl_add_tensor_component_argument(cl::Kernel &kernel, unsigned int &idx, const ICLTensor *tensor, TensorComponentType component);
 
 /** Add an OpenCL buffer object to the kernel's arguments at the specified index @p idx.
  *
@@ -56,6 +60,8 @@ void cl_add_buffer_argument(cl::Kernel &kernel, unsigned int &idx, const cl::Buf
  */
 void cl_add_texture_argument(cl::Kernel &kernel, unsigned int &idx, const cl::Image &image);
 
+} // namespace dynamic_fusion
+} // namespace experimental
 } // namespace arm_compute
 
 #endif /* ACL_SRC_DYNAMIC_FUSION_RUNTIME_GPU_CL_CKW_DRIVER_GPUCKWKERNELARGUMENTSHELPERS */
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h b/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h
index 302d4c8562..226e1a2df3 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h
@@ -33,6 +33,7 @@ namespace experimental
 namespace dynamic_fusion
 {
 /** Contain information required to set up a kernel argument at run time
+ * @deprecated To be removed along with ClTemplateWriter
  */
 struct GpuKernelArgumentInfo
 {
@@ -66,10 +67,9 @@ struct GpuKernelArgumentInfo
     }
     Type type{ Type::Tensor_4D_t_Buffer };
 };
-
 bool operator==(const GpuKernelArgumentInfo &info0, const GpuKernelArgumentInfo &info1);
-
 /** Kernel argument information linked with its corresponding @ref ITensorInfo
+ * @deprecated To be removed along with ClTemplateWriter
  */
 class GpuKernelArgument
 {
@@ -124,6 +124,130 @@ private:
     TensorInfo            _tensor_info{};
     GpuKernelArgumentInfo _kernel_arg_info{};
 };
+#ifdef ACL_INTERNAL_TEST_CKW_IN_DF
+/** Describe how the tensor runtime memory can be accessed
+ *
+ * Please see documentation under @ref GpuKernelArgumentBinding
+ */
+enum class TensorStorageType
+{
+    Unknown,
+    ClBufferUint8Ptr,
+    ClImage2dReadOnly,
+    ClImage2dWriteOnly,
+};
+
+/** Describe additional runtime information about the tensor
+ *
+ * Please see documentation under @ref GpuKernelArgumentBinding
+ */
+enum class TensorComponentType
+{
+    Unknown,
+    OffsetFirstElement,
+    Stride0,
+    Stride1,
+    Stride2,
+    Stride3,
+    Stride4,
+    Dim0,
+    Dim1,
+    Dim2,
+    Dim3,
+    Dim4,
+    Dim1xDim2,
+    Dim2xDim3,
+    Dim1xDim2xDim3,
+};
+
+/** Describe how to extract information from a runtime Gpu tensor, and set it as an argument to a gpu kernel at runtime
+ *
+ * A kernel argument is just an argument to the gpu kernel as shown in the argument list below. This contrasts with a "workload argument" which is a tensor (@ref GpuWorkloadArgument)
+ * void kernel(arg0, arg1, ... argN)
+ *
+ * In a kernel generated using dynamic fusion (@ref GpuKernelSourceCode), every kernel argument describes part of a tensor.
+ * A tensor is described as: **storages** followed by **components**
+ *
+ * A storage (@ref TensorStorageType) describes how the tensor runtime memory can be accessed (e.g. via a global uint8 pointer to a CL buffer)
+ * A component (@ref TensorComponentType) describes additional runtime information about the tensor (e.g. the dimensions of the tensor)
+ *
+ * The arguments are arranged in the order of use in the generated kernel code:
+ *
+ *  arg0   , arg1      , arg2      ,                         ...,                         , argN
+ *  storage, component0, component1, ..., componentX, storage, component0, component1, ..., componentY
+ * |                   tensor0                       |                    tensor1                    |
+ *
+ * An example argument list:
+ *
+ * void kernel(
+ *  image2d_t       t0_image,               // TensorStorageType::ClImage2dReadOnly
+ *  uint8_t*        t0_ptr,                 // TensorStorageType::ClBufferUint8Ptr
+ *  uint            t0_dim0,                // TensorComponentType::Dim0
+ *  uint            t0_stride1,             // TensorComponentType::Stride1
+ *  image2d_t       t1_ptr,                 // TensorStorageType::ClImage2dReadOnly
+ *  uint            t1_dim1xdim2,           // TensorComponentType::Dim1xDim2
+ *  uint            t1_stride1,             // TensorComponentType::Stride1
+ *  uint            t1_stride2,             // TensorComponentType:Stride2
+ * )
+ *
+ */
+class GpuKernelArgumentBinding
+{
+public:
+    enum class Type : int32_t
+    {
+        TensorStorage,  /** @ref TensorStorageType */
+        TensorComponent /** @ref TensorComponentType */
+    };
+    GpuKernelArgumentBinding(ITensorInfo::Id id, TensorStorageType storage)
+        : _type{ Type::TensorStorage }, _id{ id }, _value{}
+    {
+        _value.tensor_storage_type = storage;
+    }
+    GpuKernelArgumentBinding(ITensorInfo::Id id, TensorComponentType component)
+        : _type{ Type::TensorComponent }, _id{ id }, _value{}
+    {
+        _value.tensor_component_type = component;
+    }
+    /** Storage type of the tensor
+     */
+    TensorStorageType tensor_storage_type() const
+    {
+        ARM_COMPUTE_ERROR_ON(_type != Type::TensorStorage);
+        return _value.tensor_storage_type;
+    }
+    /** Component of the tensor
+     */
+    TensorComponentType tensor_component_type() const
+    {
+        ARM_COMPUTE_ERROR_ON(_type != Type::TensorComponent);
+        return _value.tensor_component_type;
+    }
+    /** Id of the tensor this kernel argument belongs to
+     */
+    ITensorInfo::Id id() const
+    {
+        return _id;
+    }
+    /** Type of the kernel argument
+     */
+    Type type() const
+    {
+        return _type;
+    }
+
+private:
+    Type            _type;
+    ITensorInfo::Id _id;
+    union Value
+    {
+        TensorStorageType   tensor_storage_type;
+        TensorComponentType tensor_component_type;
+    };
+    Value _value;
+};
+#endif // ACL_INTERNAL_TEST_CKW_IN_DF
+
 } // namespace dynamic_fusion
 } // namespace experimental
 } // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp
index b70a192775..5a65ede38b 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp
@@ -44,14 +44,14 @@ std::vector<DependencyGraph::TensorId> GpuKernelComponentGraph::get_tensor_ids(c
     return tensor_ids;
 }
 
-GpuKernelComponentGraph::GpuKernelComponentGraph(GpuComponentServices *services)
-    : _services{ services }, _components{}, _tensors{}, _dependency_graph{}
+GpuKernelComponentGraph::GpuKernelComponentGraph(GpuWorkloadContext *context, GpuComponentServices *services)
+    : _context{ context }, _services{ services }, _components{}, _tensors{}, _dependency_graph{}
 {
 }
 
 GpuKernelComponentStream GpuKernelComponentGraph::fuse(const MemoryDescriptorMap &mem_map) const
 {
-    GpuKernelComponentStream stream{ _services, mem_map };
+    GpuKernelComponentStream stream{ _context, _services, mem_map };
     const auto               op_seq = _dependency_graph.build_operators_sequence();
 
     stream.new_component_group();
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h
index 8314ea0a50..85c9b45840 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGRAPH
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGRAPH
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGRAPH
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGRAPH
 
 #include "src/dynamic_fusion/sketch/ArgumentPack.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuComponentServices.h"
@@ -49,9 +49,10 @@ class GpuKernelComponentGraph
 public:
     /** Constructor
      *
+     * @param[in] context  @ref GpuWorkloadContext to be used by the graph
      * @param[in] services @ref GpuComponentServices to be used by the graph
      */
-    GpuKernelComponentGraph(GpuComponentServices *services);
+    GpuKernelComponentGraph(GpuWorkloadContext *context, GpuComponentServices *services);
     /** Prevent instances of this class from being copy constructed */
     GpuKernelComponentGraph(const GpuKernelComponentGraph &graph) = delete;
     /** Prevent instances of this class from being copied */
@@ -98,6 +99,7 @@ public:
 
 private:
     static std::vector<DependencyGraph::TensorId> get_tensor_ids(const std::vector<const ITensorInfo *> tensors);
+    GpuWorkloadContext   *_context;
     GpuComponentServices *_services;
     std::map<ComponentId, std::unique_ptr<IGpuKernelComponent>> _components;
     std::map<ITensorInfo::Id, const ITensorInfo *>              _tensors;
@@ -106,4 +108,4 @@ private:
 } // namespace dynamic_fusion
 } // namespace experimental
 } // namespace arm_compute
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGRAPH */
+#endif /* ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGRAPH */
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp
index 8f4eadc477..a2b6623370 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -33,8 +33,8 @@ namespace experimental
 {
 namespace dynamic_fusion
 {
-GpuKernelComponentStream::GpuKernelComponentStream(GpuComponentServices *services, const MemoryDescriptorMap &mem_map)
-    : _services{ services }, _component_groups{}, _mem_map{ mem_map }
+GpuKernelComponentStream::GpuKernelComponentStream(GpuWorkloadContext *context, GpuComponentServices *services, const MemoryDescriptorMap &mem_map)
+    : _context{ context }, _services{ services }, _component_groups{}, _mem_map{ mem_map }
 {
 }
 
@@ -51,7 +51,7 @@ GpuWorkloadSourceCode GpuKernelComponentStream::write_workload_code()
         const GpuKernelSourceCode kernel_code = logical_kernel.write_kernel_code();
         // The whole unit workload stage is determined by the root component
         const auto unit_workload_stage = group.get_root_component()->properties().stage();
-        source_code.add_unit_workload(kernel_code, unit_workload_stage, _mem_map);
+        source_code.add_unit_workload(kernel_code, unit_workload_stage, _mem_map, _context);
     }
     return source_code;
 }
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h
index cbaa7c297b..ba2503a938 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -49,10 +49,11 @@ class GpuKernelComponentStream
 public:
     /** Constructor
      *
+     * @param[in] context  @ref GpuWorkloadContext to be used throughout the stream
      * @param[in] services @ref GpuComponentServices to be used throughout the stream
      * @param[in] mem_map  @ref MemoryDescriptor map used to assemble the @ref GpuWorkloadSourceCode
      */
-    GpuKernelComponentStream(GpuComponentServices *services, const MemoryDescriptorMap &mem_map);
+    GpuKernelComponentStream(GpuWorkloadContext *context, GpuComponentServices *services, const MemoryDescriptorMap &mem_map);
     /** Allow instances of this class to be copy constructed */
     GpuKernelComponentStream(const GpuKernelComponentStream &stream) = default;
     /** Allow instances of this class to be copied */
@@ -78,6 +79,7 @@ public:
     bool add_component(IGpuKernelComponent *component);
 
 private:
+    GpuWorkloadContext                  *_context;
     GpuComponentServices                *_services;
     std::vector<GpuKernelComponentGroup> _component_groups{};
     MemoryDescriptorMap                  _mem_map{};
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h b/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h
index 7479328d7b..64e1cdc3bc 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,14 +21,18 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELSOURCECODE
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELSOURCECODE
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELSOURCECODE
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELSOURCECODE
 
 #include "arm_compute/core/CL/CLCompileContext.h"
 #include "arm_compute/core/Window.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
 
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
 #include <map>
+#else // ACL_INTERNAL_TEST_CKW_IN_DF
+#include <deque>
+#endif // ACL_INTERNAL_TEST_CKW_IN_DF
 #include <string>
 
 namespace arm_compute
@@ -38,7 +42,11 @@ namespace experimental
 namespace dynamic_fusion
 {
 /** The argument list of a @ref GpuKernelSourceCode */
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
 using GpuKernelArgumentList = std::map<ITensorInfo::Id, GpuKernelArgument>;
+#else  // ACL_INTERNAL_TEST_CKW_IN_DF
+using GpuKernelArgumentList = std::deque<GpuKernelArgumentBinding>;
+#endif // ACL_INTERNAL_TEST_CKW_IN_DF
 
 /** Container of kernel code to be compiled and run in a @ref GpuUnitWorkload
  */
@@ -123,4 +131,4 @@ private:
 } // namespace dynamic_fusion
 } // namespace experimental
 } // namespace arm_compute
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELSOURCECODE */
+#endif /* ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELSOURCECODE */
diff --git a/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp b/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp
index 00f625de28..c99984fc0e 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp
+++ b/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,7 +29,11 @@
 #include "src/dynamic_fusion/sketch/gpu/GpuComponentServices.h"
 #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
 #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h"
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
 #include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.h"
+#else // ACL_INTERNAL_TEST_CKW_IN_DF
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h"
+#endif // ACL_INTERNAL_TEST_CKW_IN_DF
 
 namespace arm_compute
 {
@@ -46,11 +50,19 @@ GpuLogicalKernel::GpuLogicalKernel(GpuComponentServices *services, const GpuKern
 GpuKernelSourceCode GpuLogicalKernel::write_kernel_code()
 {
     GpuKernelSourceCode code;
-    ClTemplateWriter    writer{ _comp_group };
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
+    ClTemplateWriter writer { _comp_group };
+#else  // ACL_INTERNAL_TEST_CKW_IN_DF
+    GpuCkwDriver writer { _comp_group };
+#endif // ACL_INTERNAL_TEST_CKW_IN_DF
 
     code.name(writer.get_name());
     code.code(writer.get_code());
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
     code.arguments(writer.get_tensors());
+#else  // ACL_INTERNAL_TEST_CKW_IN_DF
+    code.arguments(writer.get_kernel_arguments());
+#endif // ACL_INTERNAL_TEST_CKW_IN_DF
     code.build_options(writer.get_build_options());
     code.config_id(writer.get_config_id());
     code.window(writer.get_window());
diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp b/src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp
index 50f34d9c14..c2bd012703 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp
+++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp
@@ -32,9 +32,8 @@ namespace experimental
 {
 namespace dynamic_fusion
 {
-
 GpuWorkloadContext::GpuWorkloadContext(CLCompileContext *cl_compile_ctx)
-    : _impl { std::make_unique<Impl>(GpuLanguage::OpenCL, cl_compile_ctx) }
+    : _impl{ std::make_unique<Impl>(GpuLanguage::OpenCL, cl_compile_ctx) }
 {
 }
 
@@ -75,8 +74,7 @@ const GpuWorkloadContext::Impl &GpuWorkloadContext::implementation() const
 }
 
 GpuWorkloadContext::Impl::Impl(GpuLanguage gpu_language, CLCompileContext *cl_compile_ctx)
-    : _gpu_language(gpu_language), _cl_compile_ctx(cl_compile_ctx),
-      _next_tensor_id(1), _mem_map()
+    : _gpu_language(gpu_language), _cl_compile_ctx(cl_compile_ctx), _next_tensor_id(1), _mem_map(), _managed_tensor_info()
 {
 }
 
@@ -103,26 +101,39 @@ void GpuWorkloadContext::Impl::register_user_tensor(ITensorInfo &tensor_info)
 
     tensor_info.set_id(tensor_id);
     _mem_map[tensor_id] = MemoryDescriptor{ MemoryType::User };
+    // Save a *copy* of the user tensor info in workload context for future reference
+    // Note that this means if the user modifies the @p tensor_info, the change will not be reflected in the context
+    _managed_tensor_info.emplace(tensor_info.id(), std::make_unique<TensorInfo>(tensor_info));
 }
 
-void GpuWorkloadContext::Impl::register_aux_tensor(ITensorInfo &tensor_info, const AuxMemoryInfo &mem_info)
+ITensorInfo *GpuWorkloadContext::Impl::create_virtual_tensor()
 {
-    ARM_COMPUTE_ERROR_ON(tensor_info.has_valid_id());
-
-    const auto tensor_id = next_tensor_id();
-
-    tensor_info.set_id(tensor_id);
-    _mem_map[tensor_id] = MemoryDescriptor{ MemoryType::Auxiliary, mem_info };
+    auto       tensor_info = std::make_unique<TensorInfo>();
+    const auto tensor_id   = -next_tensor_id();
+    tensor_info->set_id(tensor_id);
+    _mem_map[tensor_id] = MemoryDescriptor{ MemoryType::Virtual };
+    auto inserted       = _managed_tensor_info.emplace(tensor_info->id(), std::move(tensor_info));
+    return inserted.first->second.get();
 }
 
-void GpuWorkloadContext::Impl::register_virtual_tensor(ITensorInfo &tensor_info)
+ITensorInfo *GpuWorkloadContext::Impl::create_auxiliary_tensor(const ITensorInfo &itensor_info)
 {
-    ARM_COMPUTE_ERROR_ON(tensor_info.has_valid_id());
+    auto       tensor_info = std::make_unique<TensorInfo>(itensor_info);
+    const auto tensor_id   = next_tensor_id();
+    tensor_info->set_id(tensor_id);
+    _mem_map[tensor_id] = MemoryDescriptor{ MemoryType::Auxiliary, AuxMemoryInfo{ tensor_info->total_size() } };
+    auto inserted       = _managed_tensor_info.emplace(tensor_info->id(), std::move(tensor_info));
+    return inserted.first->second.get();
+}
 
-    const auto tensor_id = -next_tensor_id();
+ITensorInfo *GpuWorkloadContext::Impl::get_tensor_info(ITensorInfo::Id id)
+{
+    return _managed_tensor_info.at(id).get();
+}
 
-    tensor_info.set_id(tensor_id);
-    _mem_map[tensor_id] = MemoryDescriptor{ MemoryType::Virtual };
+const ITensorInfo *GpuWorkloadContext::Impl::get_tensor_info(ITensorInfo::Id id) const
+{
+    return _managed_tensor_info.at(id).get();
 }
 
 ITensorInfo::Id GpuWorkloadContext::Impl::next_tensor_id()
diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h b/src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h
index a857932791..c169476a70 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h
+++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h
@@ -36,7 +36,6 @@ namespace experimental
 {
 namespace dynamic_fusion
 {
-
 /** Internal implementation of workload context. */
 class GpuWorkloadContext::Impl
 {
@@ -52,7 +51,7 @@ public:
     Impl(Impl &) = default;
 
     /** Assignment */
-    Impl& operator=(Impl &) = default;
+    Impl &operator=(Impl &) = default;
 
     /** Get target GPU language. */
     GpuLanguage gpu_language() const;
@@ -69,27 +68,34 @@ public:
      */
     void register_user_tensor(ITensorInfo &tensor_info);
 
-    /** Set a new ID and register the auxiliary tensor info.
+    /** Create a virtual (see @ref MemoryType) tensor info and save it
      *
-     * @param[in, out] tensor_info The tensor info to be registered.
-     * @param[in]      mem_info    The auxiliary tensor memory info.
+     * @return ITensorInfo*  The created virtual tensor info object pointer
      */
-    void register_aux_tensor(ITensorInfo &tensor_info, const AuxMemoryInfo &mem_info);
-
-    /** Set a new ID and register the virtual tensor info.
+    ITensorInfo *create_virtual_tensor();
+    /** Create an auxiliary (see @ref MemoryType) tensor info and save it
      *
-     * @param[in, out] tensor_info The tensor info to be registered.
+     * @param[in] tensor_info @ref ITensorInfo to copy from
+     *
+     * @return ITensorInfo*  The created auxiliary tensor info object pointer
      */
-    void register_virtual_tensor(ITensorInfo &tensor_info);
+    ITensorInfo *create_auxiliary_tensor(const ITensorInfo &tensor_info);
+
+    /** Get tensor info created by this context, from id */
+    ITensorInfo *get_tensor_info(ITensorInfo::Id id);
+
+    /** Get tensor info created by this context, from id */
+    const ITensorInfo *get_tensor_info(ITensorInfo::Id id) const;
 
 private:
     ITensorInfo::Id next_tensor_id();
 
-    GpuLanguage _gpu_language;
+    GpuLanguage       _gpu_language;
     CLCompileContext *_cl_compile_ctx;
 
-    ITensorInfo::Id _next_tensor_id;
+    ITensorInfo::Id     _next_tensor_id;
     MemoryDescriptorMap _mem_map;
+    std::map<ITensorInfo::Id, std::unique_ptr<TensorInfo>> _managed_tensor_info;
 };
 
 } // namespace dynamic_fusion
diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h
index 44c99e844b..d3033898e9 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h
+++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h
@@ -26,13 +26,10 @@
 
 #include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h"
 #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuComponentServices.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h"
-
-#include <memory>
-#include <vector>
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h"
 
 namespace arm_compute
 {
@@ -52,9 +49,8 @@ public:
         Context *context)
         : _context{ context },
           _comp_services{},
-          _component_graph{ &_comp_services },
-          _operator_group{},
-          _managed_tensor_info_list{ std::vector<std::unique_ptr<TensorInfo>>() }
+          _component_graph{ _context, &_comp_services },
+          _operator_group{}
     {
     }
     /** Prevent instances of this class from being copy constructed */
@@ -90,10 +86,6 @@ public:
     {
         return _operator_group;
     }
-    ITensorInfo::Id allocate_new_tensor_id()
-    {
-        return ++_next_id;
-    }
     /** Generate @ref GpuWorkloadSourceCode from the workload sketch
      * @note The sketch must be valid. Any error encountered during the building of the code will be thrown.
      *
@@ -110,37 +102,29 @@ public:
      */
     ITensorInfo *create_virtual_tensor()
     {
-        auto uptr = std::make_unique<TensorInfo>();
-        _context->implementation().register_virtual_tensor(*uptr);
-        _managed_tensor_info_list.emplace_back(std::move(uptr));
-        return _managed_tensor_info_list.back().get();
+        return _context->implementation().create_virtual_tensor();
     }
     /** Create an auxiliary (see @ref MemoryType) tensor info and save it
      *
-     * @return ITensorInfo*  The created auxiliary tensor info object pointer
-     */
-
-    /** Create an auxiliary (see @ref MemoryType) tensor info and save it
-     *
      * @param[in] tensor_info @ref ITensorInfo to copy from
      *
      * @return ITensorInfo*  The created auxiliary tensor info object pointer
      */
     ITensorInfo *create_auxiliary_tensor(const ITensorInfo &tensor_info)
     {
-        auto uptr = std::make_unique<TensorInfo>(tensor_info);
-        _context->implementation().register_aux_tensor(*uptr, AuxMemoryInfo{ uptr->total_size() });
-        _managed_tensor_info_list.emplace_back(std::move(uptr));
-        return _managed_tensor_info_list.back().get();
+        return _context->implementation().create_auxiliary_tensor(tensor_info);
+    }
+
+    ITensorInfo *get_tensor_info(ITensorInfo::Id id)
+    {
+        return _context->implementation().get_tensor_info(id);
     }
 
 private:
-    Context                                 *_context;
-    GpuComponentServices                     _comp_services;
-    GpuKernelComponentGraph                  _component_graph;
-    GpuOperatorGroup                         _operator_group;
-    ITensorInfo::Id                          _next_id{ ITensorInfo::invalid_tensor_id };
-    std::vector<std::unique_ptr<TensorInfo>> _managed_tensor_info_list;
+    Context                *_context;
+    GpuComponentServices    _comp_services;
+    GpuKernelComponentGraph _component_graph;
+    GpuOperatorGroup        _operator_group;
 };
 } // namespace dynamic_fusion
 } // namespace experimental
diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h
index d1d0bdf77f..578366daaf 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h
+++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h
@@ -27,6 +27,7 @@
 #include "arm_compute/core/experimental/Types.h"
 #include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h"
 
 namespace arm_compute
 {
@@ -34,10 +35,45 @@ namespace experimental
 {
 namespace dynamic_fusion
 {
+#ifdef ACL_INTERNAL_TEST_CKW_IN_DF
+namespace
+{
+/** Extract kernel arguments of one tensor from a flat list of kernel arguments.
+ *
+ * @param[in] flat_kernel_args
+ * @return GpuKernelArgumentList
+ */
+GpuKernelArgumentList extract_kernel_args_for_one_tensor(GpuKernelArgumentList &flat_kernel_args)
+{
+    if(flat_kernel_args.empty())
+    {
+        return {};
+    }
+    GpuKernelArgumentList tensor_kargs{};
+
+    const GpuKernelArgumentBinding &karg_head = flat_kernel_args.front();
+    tensor_kargs.push_back(karg_head);
+    flat_kernel_args.pop_front();
+    const auto tensor_id = karg_head.id();
+
+    while(!flat_kernel_args.empty())
+    {
+        const GpuKernelArgumentBinding &karg = flat_kernel_args.front();
+        if(karg.id() != tensor_id) // Encounter the next tensor, return the current tensor's kernel arguments
+        {
+            return tensor_kargs;
+        }
+        tensor_kargs.push_back(karg);
+        flat_kernel_args.pop_front();
+    }
+    return tensor_kargs;
+}
+}
+#endif // ACL_INTERNAL_TEST_CKW_IN_DF
 /** Uniquely identifies a @ref GpuUnitWorkload within a @ref GpuWorkloadSourceCode */
 using UnitWorkloadId = int32_t;
 
-/** Describes all the info related to a kernel in order to:
+/** Describes all the info related to a **workload argument** (tensor) in order to:
  *  - be used by runtime to configure gpu kernel argument
  *  - be used by memory managers to allocate required memory
  */
@@ -46,6 +82,7 @@ class GpuWorkloadArgument
 public:
     /** Default constructor */
     GpuWorkloadArgument() = default;
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
     /** Constructor
      *
      * @param[in] tensor_info     @ref ITensorInfo of the workload argument
@@ -60,6 +97,22 @@ public:
           _kernel_arg_info{ kernel_arg_info }
     {
     }
+#else  // ACL_INTERNAL_TEST_CKW_IN_DF
+    /** Constructor
+     *
+     * @param[in] tensor_info     @ref ITensorInfo of the workload argument
+     * @param[in] mem_desc        @ref MemoryDescriptor of the workload argument
+     * @param[in] kernel_arg_list @ref GpuKernelArgumentList of the workload argument
+     */
+    GpuWorkloadArgument(const ITensorInfo           &tensor_info,
+                        const MemoryDescriptor      &mem_desc,
+                        const GpuKernelArgumentList &kernel_args)
+        : _tensor_info{ tensor_info },
+          _mem_desc{ mem_desc },
+          _kernel_args{ kernel_args }
+    {
+    }
+#endif // ACL_INTERNAL_TEST_CKW_IN_DF
     /** Get tensor id within workload */
     ITensorInfo::Id id() const
     {
@@ -85,6 +138,7 @@ public:
     {
         return &_mem_desc;
     }
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
     /** Get @ref GpuKernelArgumentInfo of the argument */
     GpuKernelArgumentInfo *kernel_argument_info()
     {
@@ -95,6 +149,18 @@ public:
     {
         return &_kernel_arg_info;
     }
+#else  // ACL_INTERNAL_TEST_CKW_IN_DF
+    /** Get @ref GpuKernelArgumentList of the workload tensor */
+    GpuKernelArgumentList *kernel_argument_list()
+    {
+        return &_kernel_args;
+    }
+    /** Get @ref GpuKernelArgumentList of the workload tensor */
+    const GpuKernelArgumentList *kernel_argument_list() const
+    {
+        return &_kernel_args;
+    }
+#endif // ACL_INTERNAL_TEST_CKW_IN_DF
     /** Check if the workload argument has valid id
      *
      * @return true   If has valid id
@@ -106,9 +172,13 @@ public:
     }
 
 private:
-    TensorInfo            _tensor_info{};
-    MemoryDescriptor      _mem_desc{};
-    GpuKernelArgumentInfo _kernel_arg_info{};
+    TensorInfo       _tensor_info{};
+    MemoryDescriptor _mem_desc{};
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
+    GpuKernelArgumentInfo _kernel_arg_info {};
+#else  // ACL_INTERNAL_TEST_CKW_IN_DF
+    GpuKernelArgumentList     _kernel_args {};
+#endif // ACL_INTERNAL_TEST_CKW_IN_DF
 };
 
 /** Describes when a unit workload is run.
@@ -179,15 +249,18 @@ public:
      * @param[in] kernel_code @ref GpuKernelSourceCode to be contained within the unit workload
      * @param[in] stage       Stage of the unit workload
      * @param[in] mem_map     @ref MemoryDescriptor map for all tensors within the unit workload
+     * @param[in] context     @ref GpuWorkloadContext associated with the unit workload
      *
      * @return UnitWorkloadId  Allocated unit workload id
      */
-    UnitWorkloadId add_unit_workload(const GpuKernelSourceCode &kernel_code, const UnitWorkloadStage &stage, const MemoryDescriptorMap &mem_map)
+    UnitWorkloadId add_unit_workload(const GpuKernelSourceCode &kernel_code, const UnitWorkloadStage &stage, const MemoryDescriptorMap &mem_map, const GpuWorkloadContext *context)
     {
         // Use the size of the kernel codes as Id
         const auto uwk_id    = static_cast<UnitWorkloadId>(_unit_workloads.size());
         const auto unit_work = GpuUnitWorkload(uwk_id, kernel_code, stage);
         _unit_workloads.push_back(unit_work);
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
+        ARM_COMPUTE_UNUSED(context);
         // Assemble kernel argument with memory descriptor to form workload argument
         for(const auto &id_arg : kernel_code.arguments())
         {
@@ -200,6 +273,28 @@ public:
             }
             _tensor_uwork_map[arg_id].insert(uwk_id);
         }
+#else  // ACL_INTERNAL_TEST_CKW_IN_DF
+        GpuKernelArgumentList flat_kernel_args = kernel_code.arguments();
+        GpuKernelArgumentList tensor_kargs{};
+        while(true)
+        {
+            tensor_kargs = extract_kernel_args_for_one_tensor(flat_kernel_args);
+            if(tensor_kargs.empty())
+            {
+                break;
+            }
+            else
+            {
+                const auto tensor_id           = tensor_kargs.at(0).id();
+                _workload_arguments[tensor_id] = GpuWorkloadArgument{ *context->implementation().get_tensor_info(tensor_id), mem_map.at(tensor_id), tensor_kargs };
+                if(_tensor_uwork_map.find(tensor_id) == _tensor_uwork_map.end())
+                {
+                    _tensor_uwork_map[tensor_id] = std::set<UnitWorkloadId>();
+                }
+                _tensor_uwork_map[tensor_id].insert(uwk_id);
+            }
+        }
+#endif // ACL_INTERNAL_TEST_CKW_IN_DF
         return uwk_id;
     }
     /** Get a unit workload from its id */
diff --git a/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h b/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h
index ae67790b4b..28e5432224 100644
--- a/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h
+++ b/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,9 +27,11 @@
 #include "arm_compute/core/CL/CLCompileContext.h"
 #include "arm_compute/core/Window.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h"
 
 #include <map>
 #include <string>
+#include <vector>
 
 namespace arm_compute
 {
@@ -56,8 +58,20 @@ public:
     virtual std::string get_config_id() = 0;
     /** Generate execution window */
     virtual Window get_window() const = 0;
-    /** Get the kernel argument lists of the kernel*/
-    virtual std::map<ITensorInfo::Id, GpuKernelArgument> get_tensors() = 0;
+    /** Get the kernel argument lists of the kernel
+     * @deprecated To be removed along with ClTemplateWriter
+     */
+    virtual std::map<ITensorInfo::Id, GpuKernelArgument> get_tensors()
+    {
+        return {};
+    }
+#ifdef ACL_INTERNAL_TEST_CKW_IN_DF
+    /** Get the flat list of arguments of the kernel*/
+    virtual GpuKernelArgumentList get_kernel_arguments()
+    {
+        return {};
+    }
+#endif // ACL_INTERNAL_TEST_CKW_IN_DF
 };
 
 } // namespace dynamic_fusion
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp
index d5c03c60c5..d78956f835 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp
@@ -30,6 +30,7 @@
 #include "arm_compute/core/Window.h"
 #include "src/common/utils/Log.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/TypeConverter.h"
 
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
@@ -42,29 +43,24 @@ namespace experimental
 namespace dynamic_fusion
 {
 GpuCkwDriver::GpuCkwDriver(const GpuKernelComponentGroup &components)
-    : _components{ components }
+    : _components{ components }, _kernel{ GpuTargetLanguage::OpenCL }
 {
 }
 
 std::string GpuCkwDriver::get_name()
 {
     ARM_COMPUTE_LOG_PARAMS(std::string("[V1] TODO"));
-    return "todo_get_name";
+    return "unnamed";
 }
 
 std::string GpuCkwDriver::get_code()
 {
-    ARM_COMPUTE_LOG_PARAMS(std::string("[V1] TODO"));
-    ckw::Kernel              kernel(get_name().c_str(), GpuTargetLanguage::OpenCL);
-    GpuCkwKernelWriter       root_writer(kernel);
+    _kernel.name(get_name());
+    GpuCkwKernelWriter       root_writer(_kernel);
     GpuCkwScopedKernelWriter writer(&root_writer);
     GpuCkwVariableTable      vtable{};
 
     // Global Kernel Writer Driver code
-
-    // The following is just an incomplete example of using the kernel writer
-
-    // Iterate over component specific Ckw Driver; generate component code and concatenate them
     for(auto &comp : _components)
     {
         auto ckw_driver = comp->ckw_component_driver();
@@ -96,18 +92,31 @@ Window GpuCkwDriver::get_window() const
     return root_comp->ckw_component_driver()->get_window();
 }
 
-std::map<ITensorInfo::Id, GpuKernelArgument> GpuCkwDriver::get_tensors()
+GpuKernelArgumentList GpuCkwDriver::get_kernel_arguments()
 {
-    ARM_COMPUTE_LOG_PARAMS(std::string("[V1] TODO"));
-    // Assemble GpuKernelArguments
-    std::map<ITensorInfo::Id, GpuKernelArgument> tensors;
-    for(const auto t : _components.get_argument_tensors())
+    GpuKernelArgumentList args{};
+    for(const auto &arg : _kernel.arguments())
     {
-        tensors.emplace(
-            t->id(),
-            GpuKernelArgument{ *t, { GpuKernelArgumentInfo::Type::Tensor_Special_0 } });
+        switch(arg.type())
+        {
+            case KernelArgument::Type::TensorStorage:
+            {
+                args.emplace_back(static_cast<ITensorInfo::Id>(arg.id()), from_ckw(arg.tensor_storage_type()));
+                break;
+            }
+            case KernelArgument::Type::TensorComponent:
+            {
+                args.emplace_back(static_cast<ITensorInfo::Id>(arg.id()), from_ckw(arg.tensor_component_type()));
+                break;
+            }
+            default:
+            {
+                ARM_COMPUTE_ERROR("Unsupported KernelArgument Type");
+                break;
+            }
+        }
     }
-    return tensors;
+    return args;
 }
 
 } // namespace dynamic_fusion
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h
index 2084b72098..c6e03f6e03 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h
@@ -28,6 +28,8 @@
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
 #include "src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h"
 
+#include "ckw/Kernel.h"
+
 #include <map>
 #include <string>
 
@@ -66,11 +68,12 @@ public:
     std::string get_config_id() override;
     /** Generate execution window */
     Window get_window() const override;
-    /** Get the kernel argument lists of the kernel*/
-    std::map<ITensorInfo::Id, GpuKernelArgument> get_tensors() override;
+    /** Get the flat list of arguments of the kernel*/
+    GpuKernelArgumentList get_kernel_arguments() override;
 
 private:
     GpuKernelComponentGroup _components{};
+    ckw::Kernel             _kernel;
 };
 
 } // namespace dynamic_fusion
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp
index 154968775c..6f3eca711d 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp
@@ -23,9 +23,10 @@
  */
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
 
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/TypeConverter.h"
 #include <sstream>
 
@@ -35,7 +36,8 @@ namespace experimental
 {
 namespace dynamic_fusion
 {
-GpuCkwComponentArgument *GpuCkwVariableTable::declare_variable(const GpuKernelComponentGroup &comp_group, GpuCkwScopedKernelWriter &writer, const ITensorInfo *tensor, const std::string &alias)
+GpuCkwComponentArgument *GpuCkwVariableTable::declare_variable(const GpuKernelComponentGroup &comp_group, GpuCkwScopedKernelWriter &writer, const ITensorInfo *tensor, TensorStorageType storage,
+                                                               const std::string &alias)
 {
     ARM_COMPUTE_ERROR_ON_MSG(!tensor->has_valid_id(), "Tensor info with valid id expected");
 
@@ -59,7 +61,7 @@ GpuCkwComponentArgument *GpuCkwVariableTable::declare_variable(const GpuKernelCo
         std::stringstream ss;
         ss << alias << "_t" << abs(tensor->id());
         const auto              uniq_name = ss.str();
-        GpuCkwComponentArgument var{ writer->declare_tensor_argument(uniq_name.c_str(), to_ckw(*tensor)) };
+        GpuCkwComponentArgument var{ writer->declare_tensor_argument(uniq_name, to_ckw(*tensor), to_ckw(storage)) };
         auto                  &&inserted = _vars.emplace(tensor->id(), var);
         return &(inserted.first->second);
     }
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h
index 1c9cb083ea..0649dcba9d 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h
@@ -24,8 +24,8 @@
 #ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWVARIABLETABLE
 #define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWVARIABLETABLE
 
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h"
 #include "arm_compute/core/ITensorInfo.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h"
 
 #include <map>
 
@@ -37,6 +37,7 @@ namespace dynamic_fusion
 {
 class GpuKernelComponentGroup;
 class GpuCkwScopedKernelWriter;
+enum class TensorStorageType;
 
 /** A table of all the variables used in the kernel.
  *
@@ -52,11 +53,13 @@ public:
      * @param[in] comp_group Component group the tensor belongs to
      * @param[in] writer     Compute Kernel Writer
      * @param[in] tensor     Tensor info with which the new variable is associated
+     * @param[in] storage    Tensor storage type associated with the tensor
      * @param[in] alias      Alias for the variable. Will be used as part of the variable name
      *
      * @return GpuCkwComponentArgument*
      */
-    GpuCkwComponentArgument *declare_variable(const GpuKernelComponentGroup &comp_group, GpuCkwScopedKernelWriter &writer, const ITensorInfo *tensor, const std::string &alias = "unnamed");
+    GpuCkwComponentArgument *declare_variable(const GpuKernelComponentGroup &comp_group, GpuCkwScopedKernelWriter &writer, const ITensorInfo *tensor, TensorStorageType storage,
+                                              const std::string &alias = "unnamed");
 
 private:
     std::map<ITensorInfo::Id, GpuCkwComponentArgument> _vars{};
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp
index 224c176a31..c07fac0e0d 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp
@@ -23,14 +23,15 @@
  */
 #include "GpuCkwActivation.h"
 
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "ckw/TensorTileSampler.h"
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h"
 #include <string>
@@ -84,8 +85,8 @@ inline TensorTileSampler create_sampler(GpuCkwScopedKernelWriter &writer, int32_
 } // namespace
 
 GpuCkwActivation::GpuCkwActivation(ComponentId                      id,
-                                                 const ArgumentPack<ITensorInfo> &tensors,
-                                                 const Attributes                &attributes)
+                                   const ArgumentPack<ITensorInfo> &tensors,
+                                   const Attributes                &attributes)
     : IGpuCkwComponentDriver{ id, tensors },
       _src{},
       _dst{},
@@ -102,8 +103,8 @@ void GpuCkwActivation::write_component_code(const ComponentGroup &comp_group, Gp
     const unsigned int n0          = root_window.x().step();
     const unsigned int m0          = root_window.y().step();
 
-    GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src");
-    GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
+    GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src");
+    GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
 
     load_src_dst_tiles_and_prepare_sampler(writer, src, dst, m0, n0, create_sampler);
 
@@ -111,14 +112,14 @@ void GpuCkwActivation::write_component_code(const ComponentGroup &comp_group, Gp
     auto &dst_tile = dst->tile();
 
     // Constants
-    const auto &constant_minus_1     = writer->declare_tile("minus_1", -1);
-    const auto &constant_pos_1       = writer->declare_tile("one", 1);
-    const auto &constant_zero        = writer->declare_tile("zero", 0);
-    const auto &constant_A           = writer->declare_tile("A_VAL", _attributes.a());
-    const auto &constant_B           = writer->declare_tile("B_VAL", _attributes.b());
+    const auto &constant_minus_1 = writer->declare_tile("minus_1", -1);
+    const auto &constant_pos_1   = writer->declare_tile("one", 1);
+    const auto &constant_zero    = writer->declare_tile("zero", 0);
+    const auto &constant_A       = writer->declare_tile("A_VAL", _attributes.a());
+    const auto &constant_B       = writer->declare_tile("B_VAL", _attributes.b());
 
     // Perform the operation.
-    switch (_attributes.activation())
+    switch(_attributes.activation())
     {
         case ActivationLayerInfo::ActivationFunction::LOGISTIC:
         {
@@ -178,9 +179,9 @@ Window GpuCkwActivation::get_window() const
     // Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) unchanged
     // This is in line with the collapsing convention used by operators like Conv2d
     output_shape.collapse(2U, 1U);
-    constexpr unsigned int vector_size_byte_opencl = 16;
-    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0));
-    Window             win                               = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration));
+    constexpr unsigned int vector_size_byte_opencl           = 16;
+    const unsigned int     num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0));
+    Window                 win                               = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration));
 
     return win;
 }
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp
index dd71c55df2..8d7e6a8c37 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp
@@ -23,14 +23,15 @@
  */
 #include "GpuCkwCast.h"
 
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "ckw/TensorTileSampler.h"
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/TypeConverter.h"
 #include <string>
@@ -84,8 +85,8 @@ inline TensorTileSampler create_sampler(GpuCkwScopedKernelWriter &writer, int32_
 } // namespace
 
 GpuCkwCast::GpuCkwCast(ComponentId                      id,
-                                                 const ArgumentPack<ITensorInfo> &tensors,
-                                                 const Attributes                &attributes)
+                       const ArgumentPack<ITensorInfo> &tensors,
+                       const Attributes                &attributes)
     : IGpuCkwComponentDriver{ id, tensors },
       _src{},
       _dst{},
@@ -102,8 +103,8 @@ void GpuCkwCast::write_component_code(const ComponentGroup &comp_group, GpuCkwVa
     const unsigned int n0          = root_window.x().step();
     const unsigned int m0          = root_window.y().step();
 
-    GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src");
-    GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
+    GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src");
+    GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
 
     // Load the source tile and prepare the sampler.
     if(!src->has_tile())
@@ -124,7 +125,7 @@ void GpuCkwCast::write_component_code(const ComponentGroup &comp_group, GpuCkwVa
     if(!dst->has_tile())
     {
         // Get Target datatype and convert it to ckw::DataType.
-        ckw::DataType target_dt =  dynamic_fusion::to_ckw(_attributes.data_type());
+        ckw::DataType target_dt = dynamic_fusion::to_ckw(_attributes.data_type());
 
         // Create dst_tile based on src_tile dimensions and with target DataType.
         const TileInfo src_tile_info = src_tile.tile_info();
@@ -166,9 +167,9 @@ Window GpuCkwCast::get_window() const
     // Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) unchanged
     // This is in line with the collapsing convention used by operators like Conv2d
     output_shape.collapse(2U, 1U);
-    constexpr unsigned int vector_size_byte_opencl = 16;
-    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0));
-    Window             win                               = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration));
+    constexpr unsigned int vector_size_byte_opencl           = 16;
+    const unsigned int     num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0));
+    Window                 win                               = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration));
 
     return win;
 }
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp
index 685bf391dc..15e32e26d5 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp
@@ -23,14 +23,16 @@
  */
 #include "GpuCkwElementwiseBinary.h"
 
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "ckw/TensorTileSampler.h"
 #include "ckw/types/TensorSamplerTypes.h"
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h"
 #include <string>
@@ -54,14 +56,20 @@ inline TensorTileSampler create_simple_sampler(GpuCkwScopedKernelWriter &writer,
     auto &gid_1 = writer->declare_tile("gid_1", ckw::DataType::Int32);
     auto &gid_2 = writer->declare_tile("gid_2", ckw::DataType::Int32);
 
-    auto &const_0 = writer->declare_tile("0", 0);
-
     writer->op_get_global_id(gid_0, 0);
     writer->op_get_global_id(gid_1, 1);
     writer->op_get_global_id(gid_2, 2);
 
-    sampler.x(gid_0);
-    sampler.y(gid_1);
+    auto &x_coord = writer->declare_tile("x_coord", ckw::DataType::Int32);
+    auto &y_coord = writer->declare_tile("y_coord", ckw::DataType::Int32);
+    auto &m0_t    = writer->declare_tile("m0", m0);
+    auto &n0_t    = writer->declare_tile("n0", n0);
+    writer->op_binary_expression(x_coord, gid_0, ckw::BinaryOp::Mul, n0_t);
+    writer->op_binary_expression(y_coord, gid_1, ckw::BinaryOp::Mul, m0_t);
+
+    sampler.x(x_coord);
+    sampler.y(y_coord);
+    auto &const_0 = writer->declare_tile("0", 0);
     sampler.z(const_0); // 3rd dimension collapsed with 2nd dimension
     sampler.b(gid_2);
 
@@ -99,9 +107,9 @@ void GpuCkwElementwiseBinary::write_component_code(const ComponentGroup &comp_gr
     const unsigned int n0          = root_window.x().step();
     const unsigned int m0          = root_window.y().step();
 
-    GpuCkwComponentArgument *lhs = vtable.declare_variable(comp_group, writer, _lhs, "lhs");
-    GpuCkwComponentArgument *rhs = vtable.declare_variable(comp_group, writer, _rhs, "rhs");
-    GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
+    GpuCkwComponentArgument *lhs = vtable.declare_variable(comp_group, writer, _lhs, TensorStorageType::ClBufferUint8Ptr, "lhs");
+    GpuCkwComponentArgument *rhs = vtable.declare_variable(comp_group, writer, _rhs, TensorStorageType::ClBufferUint8Ptr, "rhs");
+    GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
 
     // Load the LHS and RHS tiles and prepare the tensor sampler.
     load_lhs_rhs_tiles_and_prepare_sampler(writer, lhs, rhs, m0, n0, create_simple_sampler);
@@ -131,10 +139,9 @@ Window GpuCkwElementwiseBinary::get_window() const
     // Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) unchanged
     // This is in line with the collapsing convention used by operators like Conv2d
     output_shape.collapse(2U, 1U);
-    // constexpr unsigned int vector_size_byte_opencl = 16;
-    // const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0));
-    const unsigned int num_elems_processed_per_iteration = 1U; // Hard-coded for now
-    Window             win                               = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration));
+    constexpr unsigned int vector_size_byte_opencl           = 16;
+    const unsigned int     num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0));
+    Window                 win                               = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration));
 
     return win;
 }
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp
index 63555e6064..247d1b834f 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp
@@ -24,6 +24,7 @@
 #include "GpuCkwStore.h"
 
 #include "arm_compute/core/Error.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
@@ -43,8 +44,8 @@ GpuCkwStore::GpuCkwStore(ComponentId id, const ArgumentPack<ITensorInfo> &tensor
 }
 void GpuCkwStore::write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const
 {
-    auto src = vtable.declare_variable(comp_group, writer, _src, "src");
-    auto dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
+    auto src = vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src");
+    auto dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
 
     auto       &src_tile   = src->tile();
     const auto &sampler    = src->tile_sampler();
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/TypeConverter.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/TypeConverter.h
index 9027bddd76..8a38d67d80 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/TypeConverter.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/TypeConverter.h
@@ -28,6 +28,7 @@
 #include "arm_compute/core/TensorShape.h"
 #include "arm_compute/core/Types.h"
 #include "ckw/TensorInfo.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
 
 namespace arm_compute
 {
@@ -98,6 +99,103 @@ inline ckw::TensorInfo to_ckw(const ITensorInfo &tensor_info)
         tensor_info.id()
     };
 }
+
+inline TensorComponentType from_ckw(const ckw::TensorComponentType &component)
+{
+    switch(component)
+    {
+        case ckw::TensorComponentType::OffsetFirstElement:
+            return TensorComponentType::OffsetFirstElement;
+            break;
+        case ckw::TensorComponentType::Stride0:
+            return TensorComponentType::Stride0;
+            break;
+        case ckw::TensorComponentType::Stride1:
+            return TensorComponentType::Stride1;
+            break;
+        case ckw::TensorComponentType::Stride2:
+            return TensorComponentType::Stride2;
+            break;
+        case ckw::TensorComponentType::Stride3:
+            return TensorComponentType::Stride3;
+            break;
+        case ckw::TensorComponentType::Stride4:
+            return TensorComponentType::Stride4;
+            break;
+        case ckw::TensorComponentType::Dim0:
+            return TensorComponentType::Dim0;
+            break;
+        case ckw::TensorComponentType::Dim1:
+            return TensorComponentType::Dim1;
+            break;
+        case ckw::TensorComponentType::Dim2:
+            return TensorComponentType::Dim2;
+            break;
+        case ckw::TensorComponentType::Dim3:
+            return TensorComponentType::Dim3;
+            break;
+        case ckw::TensorComponentType::Dim4:
+            return TensorComponentType::Dim4;
+            break;
+        case ckw::TensorComponentType::Dim1xDim2:
+            return TensorComponentType::Dim1xDim2;
+            break;
+        case ckw::TensorComponentType::Dim2xDim3:
+            return TensorComponentType::Dim2xDim3;
+            break;
+        case ckw::TensorComponentType::Dim1xDim2xDim3:
+            return TensorComponentType::Dim1xDim2xDim3;
+            break;
+        case ckw::TensorComponentType::Unknown:
+            return TensorComponentType::Unknown;
+        default:
+            ARM_COMPUTE_ERROR("Unknown CKW tensor component");
+            return TensorComponentType::Unknown;
+    }
+}
+
+inline ckw::TensorStorageType to_ckw(const TensorStorageType &storage)
+{
+    switch(storage)
+    {
+        case TensorStorageType::ClBufferUint8Ptr:
+            return ckw::TensorStorageType::BufferUint8Ptr;
+            break;
+        case TensorStorageType::ClImage2dReadOnly:
+            return ckw::TensorStorageType::Texture2dReadOnly;
+            break;
+        case TensorStorageType::ClImage2dWriteOnly:
+            return ckw::TensorStorageType::Texture2dWriteOnly;
+            break;
+        case TensorStorageType::Unknown:
+            return ckw::TensorStorageType::Unknown;
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Unknown tensor storage type");
+            return ckw::TensorStorageType::Unknown;
+    }
+}
+inline TensorStorageType from_ckw(const ckw::TensorStorageType &storage)
+{
+    switch(storage)
+    {
+        case ckw::TensorStorageType::BufferUint8Ptr:
+            return TensorStorageType::ClBufferUint8Ptr;
+            break;
+        case ckw::TensorStorageType::Texture2dReadOnly:
+            return TensorStorageType::ClImage2dReadOnly;
+            break;
+        case ckw::TensorStorageType::Texture2dWriteOnly:
+            return TensorStorageType::ClImage2dWriteOnly;
+            break;
+        case ckw::TensorStorageType::Unknown:
+            return TensorStorageType::Unknown;
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Unknown CKW tensor storage type");
+            return TensorStorageType::Unknown;
+    }
+}
 } // namespace dynamic_fusion
 } // namespace experimental
 } // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h b/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h
index d600956b4f..af766a7ece 100644
--- a/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h
+++ b/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h
@@ -104,9 +104,11 @@ public:
     {
         return _properties;
     }
-    /** Get template writer for the component */
-    virtual const IGpuTemplateComponentWriter *template_writer() const = 0;
-    /** Get compute kernel writer driver for the component */
+    /** Get writer for the component */
+    virtual const IGpuTemplateComponentWriter *template_writer() const
+    {
+        return nullptr;
+    }
     virtual const IGpuCkwComponentDriver *ckw_component_driver() const
     {
         return nullptr;
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.cpp
index d2cde40a10..c41257d18c 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.cpp
@@ -24,8 +24,11 @@
 #include "ClComponentActivation.h"
 
 #include "src/core/CL/CLValidate.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.h"
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
 #include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.h"
+#else //ACL_INTERNAL_TEST_CKW_IN_DF
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.h"
+#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 
 namespace arm_compute
 {
@@ -66,8 +69,17 @@ ClComponentActivation::ClComponentActivation(ComponentId
                                              const ArgumentPack<ITensorInfo>       &tensors,
                                              const Attributes                      &attributes)
     : IGpuKernelComponent{ id, properties, tensors },
-      _component_writer{ std::make_unique<ClTemplateActivation>(id, tensors, attributes) },
-      _ckw_driver{ std::make_unique<GpuCkwActivation>(id, tensors, attributes) }
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
+      _component_writer
+{
+    std::make_unique<ClTemplateActivation>(id, tensors, attributes)
+}
+#else  //ACL_INTERNAL_TEST_CKW_IN_DF
+      _component_writer
+{
+    std::make_unique<GpuCkwActivation>(id, tensors, attributes)
+}
+#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 {
 }
 
@@ -75,15 +87,15 @@ ClComponentActivation::~ClComponentActivation()
 {
 }
 
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
 const IGpuTemplateComponentWriter *ClComponentActivation::template_writer() const
+#else  //ACL_INTERNAL_TEST_CKW_IN_DF
+const IGpuCkwComponentDriver *ClComponentActivation::ckw_component_driver() const
+#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 {
     return _component_writer.get();
 }
 
-const IGpuCkwComponentDriver *ClComponentActivation::ckw_component_driver() const
-{
-    return _ckw_driver.get();
-}
 } // namespace dynamic_fusion
 } // namespace experimental
 } // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h
index bb6f7c6e30..ebe8719420 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h
@@ -42,8 +42,11 @@ template <typename T>
 class ArgumentPack;
 
 /** Forward declaration */
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
 class ClTemplateActivation;
+#else  //ACL_INTERNAL_TEST_CKW_IN_DF
 class GpuCkwActivation;
+#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 
 class ClComponentActivation final : public IGpuKernelComponent
 {
@@ -106,10 +109,12 @@ public:
     /** Allow instances of this class to be moved */
     ClComponentActivation &operator=(ClComponentActivation &&component) = default;
 
-    /** Get template writer for the component */
+    /** Get writer for the component */
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
     const IGpuTemplateComponentWriter *template_writer() const override;
-
-    const IGpuCkwComponentDriver *ckw_component_driver() const override;
+#else  //ACL_INTERNAL_TEST_CKW_IN_DF
+    const IGpuCkwComponentDriver     *ckw_component_driver() const override;
+#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 
     /** Get component type */
     GpuComponentType type() const override
@@ -118,8 +123,11 @@ public:
     }
 
 private:
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
     std::unique_ptr<ClTemplateActivation> _component_writer;
-    std::unique_ptr<GpuCkwActivation>     _ckw_driver;
+#else  //ACL_INTERNAL_TEST_CKW_IN_DF
+    std::unique_ptr<GpuCkwActivation> _component_writer;
+#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 };
 } // namespace dynamic_fusion
 } // namespace experimental
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.cpp
index 92933ae7a5..635869f817 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.cpp
@@ -26,8 +26,11 @@
 #include "arm_compute/core/Error.h"
 #include "src/core/CL/CLValidate.h"
 #include "src/dynamic_fusion/sketch/ArgumentPack.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.h"
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
 #include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.h"
+#else //ACL_INTERNAL_TEST_CKW_IN_DF
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.h"
+#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 
 namespace arm_compute
 {
@@ -67,23 +70,32 @@ ClComponentCast::ClComponentCast(ComponentId                      id,
                                  const Attributes                &attributes,
                                  const Settings                  &settings)
     : IGpuKernelComponent{ id, properties, tensors },
-      _component_writer{ std::make_unique<ClTemplateCast>(id, tensors, attributes) },
-      _ckw_driver{ std::make_unique<GpuCkwCast>(id, tensors, attributes) }
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
+      _component_writer
+{
+    std::make_unique<ClTemplateCast>(id, tensors, attributes)
+}
+#else  //ACL_INTERNAL_TEST_CKW_IN_DF
+      _component_writer
+{
+    std::make_unique<GpuCkwCast>(id, tensors, attributes)
+}
+#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 {
     ARM_COMPUTE_UNUSED(attributes, settings);
 }
 ClComponentCast::~ClComponentCast()
 {
 }
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
 const IGpuTemplateComponentWriter *ClComponentCast::template_writer() const
+#else  //ACL_INTERNAL_TEST_CKW_IN_DF
+const IGpuCkwComponentDriver *ClComponentCast::ckw_component_driver() const
+#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 {
     return _component_writer.get();
 }
 
-const IGpuCkwComponentDriver *ClComponentCast::ckw_component_driver() const
-{
-    return _ckw_driver.get();
-}
 } // namespace dynamic_fusion
 } // namespace experimental
 } // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h
index 174f9670b3..37b8cbb6c9 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h
@@ -48,8 +48,11 @@ private:
 };
 
 /** Forward declaration */
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
 class ClTemplateCast;
+#else  //ACL_INTERNAL_TEST_CKW_IN_DF
 class GpuCkwCast;
+#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 
 class ClComponentCast final : public IGpuKernelComponent
 {
@@ -116,10 +119,12 @@ public:
     ClComponentCast(ClComponentCast &&component) = default;
     /** Allow instances of this class to be moved */
     ClComponentCast &operator=(ClComponentCast &&component) = default;
-    /** Get template writer for the component */
+    /** Get writer for the component */
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
     const IGpuTemplateComponentWriter *template_writer() const override;
-    /** Get GPU kernel writer for the component */
+#else  //ACL_INTERNAL_TEST_CKW_IN_DF
     const IGpuCkwComponentDriver *ckw_component_driver() const override;
+#endif //ACL_INTERNAL_TEST_CKW_IN_DF
     /** Get component type */
     GpuComponentType type() const override
     {
@@ -127,8 +132,11 @@ public:
     }
 
 private:
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
     std::unique_ptr<ClTemplateCast> _component_writer;
-    std::unique_ptr<GpuCkwCast>     _ckw_driver;
+#else  //ACL_INTERNAL_TEST_CKW_IN_DF
+    std::unique_ptr<GpuCkwCast>   _component_writer;
+#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 };
 } // namespace dynamic_fusion
 } // namespace experimental
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp
index 52739e23c0..88d729170c 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp
@@ -25,8 +25,11 @@
 
 #include "arm_compute/core/Validate.h"
 #include "src/core/CL/CLValidate.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h"
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
 #include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.h"
+#else //ACL_INTERNAL_TEST_CKW_IN_DF
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h"
+#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 
 namespace arm_compute
 {
@@ -106,28 +109,38 @@ Status ClComponentElementwiseBinary::validate(const ArgumentPack<ITensorInfo> &t
     return Status{};
 }
 
+ClComponentElementwiseBinary::~ClComponentElementwiseBinary()
+{
+}
 ClComponentElementwiseBinary::ClComponentElementwiseBinary(
     ComponentId                      id,
     const Properties                &properties,
     const ArgumentPack<ITensorInfo> &tensors,
     const Attributes                &attributes)
     : IGpuKernelComponent{ id, properties, tensors },
-      _component_writer{ std::make_unique<ClTemplateElementwiseBinary>(id, tensors, attributes) },
-      _ckw_driver{ std::make_unique<GpuCkwElementwiseBinary>(id, tensors, attributes) }
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
+      _component_writer
 {
+    std::make_unique<ClTemplateElementwiseBinary>(id, tensors, attributes)
 }
-ClComponentElementwiseBinary::~ClComponentElementwiseBinary()
+#else  //ACL_INTERNAL_TEST_CKW_IN_DF
+      _component_writer
 {
+    std::make_unique<GpuCkwElementwiseBinary>(id, tensors, attributes)
 }
-const IGpuTemplateComponentWriter *ClComponentElementwiseBinary::template_writer() const
+#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 {
-    return _component_writer.get();
 }
 
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
+const IGpuTemplateComponentWriter *ClComponentElementwiseBinary::template_writer() const
+#else  //ACL_INTERNAL_TEST_CKW_IN_DF
 const IGpuCkwComponentDriver *ClComponentElementwiseBinary::ckw_component_driver() const
+#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 {
-    return _ckw_driver.get();
+    return _component_writer.get();
 }
+
 } // namespace dynamic_fusion
 } // namespace experimental
 } // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h
index a56dd8b37d..f7175903d0 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTELEMENTWISEBINARY
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTELEMENTWISEBINARY
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTELEMENTWISEBINARY
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTELEMENTWISEBINARY
 
 #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
 #include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h"
@@ -40,8 +40,11 @@ template <typename T>
 class ArgumentPack;
 
 /** Forward declaration */
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
 class ClTemplateElementwiseBinary;
+#else  //ACL_INTERNAL_TEST_CKW_IN_DF
 class GpuCkwElementwiseBinary;
+#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 
 class ClComponentElementwiseBinary final : public IGpuKernelComponent
 {
@@ -101,10 +104,13 @@ public:
     ClComponentElementwiseBinary(ClComponentElementwiseBinary &&component) = default;
     /** Allow instances of this class to be moved */
     ClComponentElementwiseBinary &operator=(ClComponentElementwiseBinary &&component) = default;
-    /** Get template writer for the component */
+    /** Get writer for the component */
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
     const IGpuTemplateComponentWriter *template_writer() const override;
+#else  //ACL_INTERNAL_TEST_CKW_IN_DF
+    const IGpuCkwComponentDriver            *ckw_component_driver() const override;
+#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 
-    const IGpuCkwComponentDriver *ckw_component_driver() const override;
     /** Get component type */
     GpuComponentType type() const override
     {
@@ -112,10 +118,13 @@ public:
     }
 
 private:
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
     std::unique_ptr<ClTemplateElementwiseBinary> _component_writer;
-    std::unique_ptr<GpuCkwElementwiseBinary>     _ckw_driver;
+#else  //ACL_INTERNAL_TEST_CKW_IN_DF
+    std::unique_ptr<GpuCkwElementwiseBinary> _component_writer;
+#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 };
 } // namespace dynamic_fusion
 } // namespace experimental
 } // namespace arm_compute
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTELEMENTWISEBINARY */
+#endif /* ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTELEMENTWISEBINARY */
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp
index a3283b1866..12b81c3d56 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp
@@ -24,8 +24,11 @@
 #include "ClComponentStore.h"
 
 #include "src/dynamic_fusion/sketch/ArgumentPack.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h"
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
 #include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h"
+#else //ACL_INTERNAL_TEST_CKW_IN_DF
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h"
+#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 
 #include <memory>
 
@@ -43,19 +46,30 @@ Status ClComponentStore::validate(
     return Status{};
 }
 ClComponentStore::ClComponentStore(ComponentId id, const Properties &properties, const ArgumentPack<ITensorInfo> &tensors)
-    : IGpuKernelComponent{ id, properties, tensors }, _component_writer{ std::make_unique<ClTemplateStore>(id, tensors) }, _ckw_driver{ std::make_unique<GpuCkwStore>(id, tensors) }
+    : IGpuKernelComponent{ id, properties, tensors },
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
+      _component_writer
 {
+    std::make_unique<ClTemplateStore>(id, tensors)
 }
-ClComponentStore::~ClComponentStore()
+#else  //ACL_INTERNAL_TEST_CKW_IN_DF
+      _component_writer
 {
+    std::make_unique<GpuCkwStore>(id, tensors)
 }
-const IGpuTemplateComponentWriter *ClComponentStore::template_writer() const
+#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 {
-    return _component_writer.get();
 }
+ClComponentStore::~ClComponentStore()
+{
+}
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
+const IGpuTemplateComponentWriter *ClComponentStore::template_writer() const
+#else  //ACL_INTERNAL_TEST_CKW_IN_DF
 const IGpuCkwComponentDriver *ClComponentStore::ckw_component_driver() const
+#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 {
-    return _ckw_driver.get();
+    return _component_writer.get();
 }
 } // namespace dynamic_fusion
 } // namespace experimental
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h
index f168ccb97e..853ee39012 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h
@@ -25,7 +25,6 @@
 #define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTSTORE
 
 #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h"
 #include <memory>
 
 namespace arm_compute
@@ -39,7 +38,11 @@ namespace dynamic_fusion
 /** Forward declaration */
 template <typename T>
 class ArgumentPack;
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
+class ClTemplateStore;
+#else  //ACL_INTERNAL_TEST_CKW_IN_DF
 class GpuCkwStore;
+#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 
 class ClComponentStore final : public IGpuKernelComponent
 {
@@ -85,10 +88,12 @@ public:
     ClComponentStore(ClComponentStore &&component) = default;
     /** Allow instances of this class to be moved */
     ClComponentStore &operator=(ClComponentStore &&component) = default;
-    /** Get template writer for the component */
+    /** Get writer for the component */
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
     const IGpuTemplateComponentWriter *template_writer() const override;
-
+#else  //ACL_INTERNAL_TEST_CKW_IN_DF
     const IGpuCkwComponentDriver *ckw_component_driver() const override;
+#endif //ACL_INTERNAL_TEST_CKW_IN_DF
     /** Get component type */
     GpuComponentType type() const override
     {
@@ -96,8 +101,11 @@ public:
     }
 
 private:
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
     std::unique_ptr<ClTemplateStore> _component_writer;
-    std::unique_ptr<GpuCkwStore>     _ckw_driver;
+#else  //ACL_INTERNAL_TEST_CKW_IN_DF
+    std::unique_ptr<GpuCkwStore>  _component_writer;
+#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 };
 } // namespace dynamic_fusion
 } // namespace experimental
diff --git a/tests/validation/dynamic_fusion/gpu/cl/DepthwiseConv2d.cpp b/tests/validation/dynamic_fusion/gpu/cl/DepthwiseConv2d.cpp
index 7ab7c8a3fc..71b0114225 100644
--- a/tests/validation/dynamic_fusion/gpu/cl/DepthwiseConv2d.cpp
+++ b/tests/validation/dynamic_fusion/gpu/cl/DepthwiseConv2d.cpp
@@ -22,6 +22,7 @@
  * SOFTWARE.
  */
 
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF // Do not include this test if ACL_INTERNAL_TEST_CKW_IN_DF and the op has not been ported to ckw
 #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.h"
 
 #include "tests/CL/CLAccessor.h"
@@ -432,3 +433,5 @@ TEST_SUITE_END() // CL
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
+
+#endif // ACL_INTERNAL_TEST_CKW_IN_DF
diff --git a/tests/validation/dynamic_fusion/gpu/cl/DirectConv2d.cpp b/tests/validation/dynamic_fusion/gpu/cl/DirectConv2d.cpp
index f27a1796c9..5ab1fafe2f 100644
--- a/tests/validation/dynamic_fusion/gpu/cl/DirectConv2d.cpp
+++ b/tests/validation/dynamic_fusion/gpu/cl/DirectConv2d.cpp
@@ -22,6 +22,7 @@
  * SOFTWARE.
  */
 
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF // Do not include this test if ACL_INTERNAL_TEST_CKW_IN_DF and the op has not been ported to ckw
 #include "tests/AssetsLibrary.h"
 #include "tests/CL/CLAccessor.h"
 #include "tests/framework/Fixture.h"
@@ -249,3 +250,5 @@ TEST_SUITE_END() // CL
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
+
+#endif // ACL_INTERNAL_TEST_CKW_IN_DF
diff --git a/tests/validation/dynamic_fusion/gpu/cl/Pool2d.cpp b/tests/validation/dynamic_fusion/gpu/cl/Pool2d.cpp
index f4478db42b..7f5efd662a 100644
--- a/tests/validation/dynamic_fusion/gpu/cl/Pool2d.cpp
+++ b/tests/validation/dynamic_fusion/gpu/cl/Pool2d.cpp
@@ -22,6 +22,7 @@
  * SOFTWARE.
  */
 
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF // Do not include this test if ACL_INTERNAL_TEST_CKW_IN_DF and the op has not been ported to ckw
 #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h"
 
 #include "tests/CL/CLAccessor.h"
@@ -132,7 +133,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, DynamicFusionGpuPool2dFixture<float>, framework
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
 FIXTURE_DATA_TEST_CASE(RunSpecial, DFSpecialGpuPool2dFixture<float>, framework::DatasetMode::ALL, combine(datasets::PoolingLayerDatasetSpecialDynamicFusion(),
-                                                                                                                  framework::dataset::make("DataType", DataType::F32)))
+                                                                                                          framework::dataset::make("DataType", DataType::F32)))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f32);
@@ -231,3 +232,5 @@ TEST_SUITE_END() // CL
 }
 }
 }
+
+#endif // ACL_INTERNAL_TEST_CKW_IN_DF
diff --git a/tests/validation/dynamic_fusion/gpu/cl/Reshape.cpp b/tests/validation/dynamic_fusion/gpu/cl/Reshape.cpp
index bdaa1be531..4d038b2780 100644
--- a/tests/validation/dynamic_fusion/gpu/cl/Reshape.cpp
+++ b/tests/validation/dynamic_fusion/gpu/cl/Reshape.cpp
@@ -21,6 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF // Do not include this test if ACL_INTERNAL_TEST_CKW_IN_DF and the op has not been ported to ckw
 #include "tests/CL/CLAccessor.h"
 #include "tests/datasets/ReshapeLayerDataset.h"
 #include "tests/framework/Macros.h"
@@ -121,3 +122,5 @@ TEST_SUITE_END() // CL
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
+
+#endif // ACL_INTERNAL_TEST_CKW_IN_DF
diff --git a/tests/validation/dynamic_fusion/gpu/cl/Resize.cpp b/tests/validation/dynamic_fusion/gpu/cl/Resize.cpp
index 5f99cd6d78..9ca1c5f0da 100644
--- a/tests/validation/dynamic_fusion/gpu/cl/Resize.cpp
+++ b/tests/validation/dynamic_fusion/gpu/cl/Resize.cpp
@@ -22,6 +22,7 @@
  * SOFTWARE.
  */
 
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF // Do not include this test if ACL_INTERNAL_TEST_CKW_IN_DF and the op has not been ported to ckw
 #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuResize.h"
 
 #include "tests/CL/CLAccessor.h"
@@ -517,3 +518,5 @@ TEST_SUITE_END() // CL
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
+
+#endif // ACL_INTERNAL_TEST_CKW_IN_DF
diff --git a/tests/validation/dynamic_fusion/gpu/cl/Softmax.cpp b/tests/validation/dynamic_fusion/gpu/cl/Softmax.cpp
index e8314d700d..340f5dc2a3 100644
--- a/tests/validation/dynamic_fusion/gpu/cl/Softmax.cpp
+++ b/tests/validation/dynamic_fusion/gpu/cl/Softmax.cpp
@@ -21,6 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF // Do not include this test if ACL_INTERNAL_TEST_CKW_IN_DF and the op has not been ported to ckw
 #include "arm_compute/core/Types.h"
 #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.h"
 
@@ -196,3 +197,5 @@ TEST_SUITE_END() // CL
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
+
+#endif // ACL_INTERNAL_TEST_CKW_IN_DF
author	SiCong Li <sicong.li@arm.com>	2023-06-28 09:49:45 +0100
committer	SiCong Li <sicong.li@arm.com>	2023-07-25 15:48:50 +0000
commit	23882a9014eb3972bca958206866c8e0d0b829cc (patch)
tree	9139b91699099160e26a64abd8cf182bd7447278
parent	0a59e69fd922b02d9e3b5b043ee7f891061df7be (diff)
download	ComputeLibrary-23882a9014eb3972bca958206866c8e0d0b829cc.tar.gz