From ce3c48c7af02555f81c0f5e7ef2677916cecef34 Mon Sep 17 00:00:00 2001
From: Viet-Hoa Do
Date: Mon, 3 Jul 2023 13:44:43 +0100
Subject: Move CKW prototype to separate directory

Partially resolves: COMPMID-6283

Signed-off-by: Viet-Hoa Do
Change-Id: I7596e3dc357d6f0b9cbe66534523943a73c26d81
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9864
Reviewed-by: SiCong Li
Reviewed-by: Jakub Sujak
Comments-Addressed: Arm Jenkins
Tested-by: Arm Jenkins
Benchmark: Arm Jenkins
---
 Android.bp | 33 +-
 SConscript | 2 +-
 SConstruct | 5 +-
 compute_kernel_writer/CMakeLists.txt | 26 +-
 compute_kernel_writer/README.md | 1 -
 compute_kernel_writer/examples/add_exp_store.cpp | 181 -
 .../include/acl/AclComponentArgument.h | 111 -
 .../include/acl/AclKernelWriter.h | 56 -
 .../include/acl/AclScopedKernelWriter.h | 62 -
 compute_kernel_writer/include/ckw/Error.h | 66 +-
 compute_kernel_writer/include/ckw/Kernel.h | 77 -
 compute_kernel_writer/include/ckw/KernelWriter.h | 217 --
 compute_kernel_writer/include/ckw/OperandBase.h | 76 -
 compute_kernel_writer/include/ckw/ScalarValue.h | 137 -
 compute_kernel_writer/include/ckw/TensorOperand.h | 181 -
 .../include/ckw/TensorTileSampler.h | 163 -
 compute_kernel_writer/include/ckw/TileOperand.h | 110 -
 compute_kernel_writer/include/ckw/Types.h | 119 +-
 compute_kernel_writer/prototype/CMakeLists.txt | 72 +
 .../prototype/examples/add_exp_store.cpp | 181 +
 .../examples/common/ExampleComponentArgument.cpp | 97 +
 .../examples/common/ExampleComponentArgument.h | 111 +
 .../examples/common/ExampleKernelWriter.cpp | 50 +
 .../examples/common/ExampleKernelWriter.h | 56 +
 .../examples/common/ExampleScopedKernelWriter.cpp | 58 +
 .../examples/common/ExampleScopedKernelWriter.h | 62 +
 .../prototype/include/ckw/Error.h | 79 +
 .../prototype/include/ckw/Kernel.h | 77 +
 .../prototype/include/ckw/KernelWriter.h | 217 ++
 .../prototype/include/ckw/OperandBase.h | 76 +
 .../prototype/include/ckw/ScalarValue.h | 137 +
 .../prototype/include/ckw/TensorInfo.h | 145 +
 .../prototype/include/ckw/TensorOperand.h | 181 +
 .../prototype/include/ckw/TensorTileSampler.h | 163 +
 .../prototype/include/ckw/TileInfo.h | 84 +
 .../prototype/include/ckw/TileOperand.h | 110 +
 .../prototype/include/ckw/Types.h | 140 +
 compute_kernel_writer/prototype/src/Kernel.cpp | 61 +
 .../prototype/src/KernelWriter.cpp | 227 ++
 .../prototype/src/OperandBase.cpp | 50 +
 compute_kernel_writer/prototype/src/Prototype.h | 3767 ++++++++++++++++++++
 compute_kernel_writer/prototype/src/TensorInfo.cpp | 77 +
 .../prototype/src/TensorOperand.cpp | 247 ++
 .../prototype/src/TensorTileSampler.cpp | 167 +
 compute_kernel_writer/prototype/src/TileInfo.cpp | 76 +
 .../prototype/src/TileOperand.cpp | 104 +
 compute_kernel_writer/src/Kernel.cpp | 61 -
 compute_kernel_writer/src/KernelWriter.cpp | 227 --
 compute_kernel_writer/src/OperandBase.cpp | 50 -
 compute_kernel_writer/src/Prototype.h | 3767 --------------------
 compute_kernel_writer/src/TensorOperand.cpp | 247 --
 compute_kernel_writer/src/TensorTileSampler.cpp | 167 -
 compute_kernel_writer/src/TileOperand.cpp | 104 -
 .../src/acl/AclComponentArgument.cpp | 97 -
 compute_kernel_writer/src/acl/AclKernelWriter.cpp | 50 -
 .../src/acl/AclScopedKernelWriter.cpp | 58 -
 filelist.json | 3 +
 scripts/clang_tidy_rules.py | 2 +-
 .../gpu/ckw_driver/GpuCkwComponentArgument.cpp | 108 +
 .../gpu/ckw_driver/GpuCkwComponentArgument.h | 122 +
 .../sketch/gpu/ckw_driver/GpuCkwDriver.cpp | 12 +-
 .../sketch/gpu/ckw_driver/GpuCkwKernelWriter.cpp | 61 +
 .../sketch/gpu/ckw_driver/GpuCkwKernelWriter.h | 67 +
 .../gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp | 69 +
 .../gpu/ckw_driver/GpuCkwScopedKernelWriter.h | 73 +
 .../sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp | 16 +-
 .../sketch/gpu/ckw_driver/GpuCkwVariableTable.h | 11 +-
 .../sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h | 5 +-
 .../components/GpuCkwElementwiseBinary.cpp | 23 +-
 .../components/GpuCkwElementwiseBinary.h | 3 +-
 .../gpu/ckw_driver/components/GpuCkwStore.cpp | 6 +-
 .../sketch/gpu/ckw_driver/components/GpuCkwStore.h | 2 +-
 .../gpu/ckw_driver/components/utils/WriterHelper.h | 8 +-
 73 files changed, 7462 insertions(+), 6452 deletions(-)
 delete mode 100644 compute_kernel_writer/examples/add_exp_store.cpp
 delete mode 100644 compute_kernel_writer/include/acl/AclComponentArgument.h
 delete mode 100644 compute_kernel_writer/include/acl/AclKernelWriter.h
 delete mode 100644 compute_kernel_writer/include/acl/AclScopedKernelWriter.h
 delete mode 100644 compute_kernel_writer/include/ckw/Kernel.h
 delete mode 100644 compute_kernel_writer/include/ckw/KernelWriter.h
 delete mode 100644 compute_kernel_writer/include/ckw/OperandBase.h
 delete mode 100644 compute_kernel_writer/include/ckw/ScalarValue.h
 delete mode 100644 compute_kernel_writer/include/ckw/TensorOperand.h
 delete mode 100644 compute_kernel_writer/include/ckw/TensorTileSampler.h
 delete mode 100644 compute_kernel_writer/include/ckw/TileOperand.h
 create mode 100644 compute_kernel_writer/prototype/CMakeLists.txt
 create mode 100644 compute_kernel_writer/prototype/examples/add_exp_store.cpp
 create mode 100644 compute_kernel_writer/prototype/examples/common/ExampleComponentArgument.cpp
 create mode 100644 compute_kernel_writer/prototype/examples/common/ExampleComponentArgument.h
 create mode 100644 compute_kernel_writer/prototype/examples/common/ExampleKernelWriter.cpp
 create mode 100644 compute_kernel_writer/prototype/examples/common/ExampleKernelWriter.h
 create mode 100644 compute_kernel_writer/prototype/examples/common/ExampleScopedKernelWriter.cpp
 create mode 100644 compute_kernel_writer/prototype/examples/common/ExampleScopedKernelWriter.h
 create mode 100644 compute_kernel_writer/prototype/include/ckw/Error.h
 create mode 100644 compute_kernel_writer/prototype/include/ckw/Kernel.h
 create mode 100644 compute_kernel_writer/prototype/include/ckw/KernelWriter.h
 create mode 100644 compute_kernel_writer/prototype/include/ckw/OperandBase.h
 create mode 100644 compute_kernel_writer/prototype/include/ckw/ScalarValue.h
 create mode 100644 compute_kernel_writer/prototype/include/ckw/TensorInfo.h
 create mode 100644 compute_kernel_writer/prototype/include/ckw/TensorOperand.h
 create mode 100644 compute_kernel_writer/prototype/include/ckw/TensorTileSampler.h
 create mode 100644 compute_kernel_writer/prototype/include/ckw/TileInfo.h
 create mode 100644 compute_kernel_writer/prototype/include/ckw/TileOperand.h
 create mode 100644 compute_kernel_writer/prototype/include/ckw/Types.h
 create mode 100644 compute_kernel_writer/prototype/src/Kernel.cpp
 create mode 100644 compute_kernel_writer/prototype/src/KernelWriter.cpp
 create mode 100644 compute_kernel_writer/prototype/src/OperandBase.cpp
 create mode 100644 compute_kernel_writer/prototype/src/Prototype.h
 create mode 100644 compute_kernel_writer/prototype/src/TensorInfo.cpp
 create mode 100644 compute_kernel_writer/prototype/src/TensorOperand.cpp
 create mode 100644 compute_kernel_writer/prototype/src/TensorTileSampler.cpp
 create mode 100644 compute_kernel_writer/prototype/src/TileInfo.cpp
 create mode 100644 compute_kernel_writer/prototype/src/TileOperand.cpp
 delete
mode 100644 compute_kernel_writer/src/Kernel.cpp delete mode 100644 compute_kernel_writer/src/KernelWriter.cpp delete mode 100644 compute_kernel_writer/src/OperandBase.cpp delete mode 100644 compute_kernel_writer/src/Prototype.h delete mode 100644 compute_kernel_writer/src/TensorOperand.cpp delete mode 100644 compute_kernel_writer/src/TensorTileSampler.cpp delete mode 100644 compute_kernel_writer/src/TileOperand.cpp delete mode 100644 compute_kernel_writer/src/acl/AclComponentArgument.cpp delete mode 100644 compute_kernel_writer/src/acl/AclKernelWriter.cpp delete mode 100644 compute_kernel_writer/src/acl/AclScopedKernelWriter.cpp create mode 100644 src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp create mode 100644 src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h create mode 100644 src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.cpp create mode 100644 src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h create mode 100644 src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp create mode 100644 src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h diff --git a/Android.bp b/Android.bp index ccf513a137..fd6ff3b4d3 100644 --- a/Android.bp +++ b/Android.bp @@ -178,8 +178,8 @@ cc_library_static { proprietary: true, local_include_dirs: ["build/android-arm64v8a/src/core", "build/android-arm64v8a/src/core/CL", - "compute_kernel_writer/include", - "compute_kernel_writer", + "compute_kernel_writer/prototype/include", + "compute_kernel_writer/prototype", "src/core/common", "src/core/helpers", "src/core/NEON/kernels/arm_gemm", @@ -189,24 +189,14 @@ cc_library_static { "src/cpu/kernels/assembly"], export_include_dirs: [".", "./include"], srcs: [ - "compute_kernel_writer/src/Error.cpp", - "compute_kernel_writer/src/Helpers.cpp", - "compute_kernel_writer/src/Kernel.cpp", - "compute_kernel_writer/src/KernelWriter.cpp", - "compute_kernel_writer/src/OperandBase.cpp", - "compute_kernel_writer/src/TensorInfo.cpp", - "compute_kernel_writer/src/TensorOperand.cpp", - "compute_kernel_writer/src/TensorTileSampler.cpp", - "compute_kernel_writer/src/TensorUtils.cpp", - "compute_kernel_writer/src/TileInfo.cpp", - "compute_kernel_writer/src/TileOperand.cpp", - "compute_kernel_writer/src/acl/AclComponentArgument.cpp", - "compute_kernel_writer/src/acl/AclKernelWriter.cpp", - "compute_kernel_writer/src/acl/AclScopedKernelWriter.cpp", - "compute_kernel_writer/src/cl/CLConstantTile.cpp", - "compute_kernel_writer/src/cl/CLHelpers.cpp", - "compute_kernel_writer/src/cl/CLTile.cpp", - "compute_kernel_writer/src/cl/ICLTile.cpp", + "compute_kernel_writer/prototype/src/Kernel.cpp", + "compute_kernel_writer/prototype/src/KernelWriter.cpp", + "compute_kernel_writer/prototype/src/OperandBase.cpp", + "compute_kernel_writer/prototype/src/TensorInfo.cpp", + "compute_kernel_writer/prototype/src/TensorOperand.cpp", + "compute_kernel_writer/prototype/src/TensorTileSampler.cpp", + "compute_kernel_writer/prototype/src/TileInfo.cpp", + "compute_kernel_writer/prototype/src/TileOperand.cpp", "src/c/AclContext.cpp", "src/c/AclOperator.cpp", "src/c/AclQueue.cpp", @@ -642,7 +632,10 @@ cc_library_static { "src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp", "src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp", "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp", + "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp", "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp", + 
"src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.cpp", + "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp", "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp", "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp", "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp", diff --git a/SConscript b/SConscript index 9ddffc4273..5504d48e07 100644 --- a/SConscript +++ b/SConscript @@ -128,7 +128,7 @@ def build_library(name, build_env, sources, static=False, libs=[]): cloned_build_env["LINKFLAGS"].remove('-static-libstdc++') if env['experimental_dynamic_fusion']: - libs.append('libckw.a') + libs.append('libckw_prototype.a') if static: obj = cloned_build_env.StaticLibrary(name, source=sources, LIBS = arm_compute_env["LIBS"] + libs) diff --git a/SConstruct b/SConstruct index 4994972172..4ab9fdd66a 100644 --- a/SConstruct +++ b/SConstruct @@ -160,7 +160,7 @@ install_path = env['install_dir'] if not env['install_dir'].startswith('/') and install_path != "": install_path = "%s/%s" % (build_path, install_path) -env.Append(LIBPATH = [build_path]) +env.Append(LIBPATH = [build_path, os.path.join(build_path, "prototype")]) Export('env') Export('vars') @@ -432,7 +432,7 @@ if env['experimental_dynamic_fusion']: CKW_ENABLE_ASSERTS = env['debug'] or env['asserts'] CKW_PROJECT_DIR = Dir('.').path + "/compute_kernel_writer" - CKW_INCLUDE_DIR = CKW_PROJECT_DIR + "/include" + CKW_INCLUDE_DIR = CKW_PROJECT_DIR + "/prototype/include" CKW_BUILD_DIR = build_path.replace("#", "") CKW_CMAKE_CMD = "CC={CKW_CC} CXX={CKW_CXX} cmake -G \"Unix Makefiles\" " \ @@ -440,6 +440,7 @@ if env['experimental_dynamic_fusion']: "-DCMAKE_BUILD_TYPE={CKW_BUILD_TYPE} " \ "-DCKW_ENABLE_OPENCL={CKW_ENABLE_OPENCL} " \ "-DCKW_ENABLE_ASSERTS={CKW_ENABLE_ASSERTS} " \ + "-DCKW_BUILD_PROTOTYPE=ON " \ "-DCKW_CCACHE={CKW_CCACHE} ".format(CKW_CC=CKW_CC, CKW_CXX=CKW_CXX, CKW_PROJECT_DIR=CKW_PROJECT_DIR, diff --git a/compute_kernel_writer/CMakeLists.txt b/compute_kernel_writer/CMakeLists.txt index 2c770e4efb..186ab328f5 100644 --- a/compute_kernel_writer/CMakeLists.txt +++ b/compute_kernel_writer/CMakeLists.txt @@ -43,7 +43,6 @@ message(STATUS "${CMAKE_PROJECT_NAME} ${CMAKE_PROJECT_VERSION}") option(CKW_ENABLE_OPENCL "Enable OpenCL code generation" OFF) option(CKW_ENABLE_ASSERTS "Enable assertions. 
Always enabled in Debug builds" OFF) option(CKW_BUILD_TESTING "Build the Compute Kernel Writer validation test suite" OFF) -option(CKW_BUILD_EXAMPLES "Build the Compute Kernel Writer examples" OFF) option(CKW_CCACHE "Use compiler cache for faster recompilation" OFF) #--------------------------------------------------------------------- @@ -122,17 +121,6 @@ target_sources(ckw PRIVATE src/TensorInfo.cpp src/TensorUtils.cpp src/TileInfo.cpp - - src/Kernel.cpp - src/KernelWriter.cpp - src/OperandBase.cpp - src/TileOperand.cpp - src/TensorOperand.cpp - src/TensorTileSampler.cpp - - src/acl/AclKernelWriter.cpp - src/acl/AclScopedKernelWriter.cpp - src/acl/AclComponentArgument.cpp ) if(CKW_ENABLE_OPENCL) @@ -147,7 +135,7 @@ endif() target_include_directories(ckw PUBLIC ${CMAKE_CURRENT_LIST_DIR}/include PRIVATE ${CMAKE_CURRENT_LIST_DIR} - ) +) #--------------------------------------------------------------------- # Validation tests @@ -172,16 +160,10 @@ if(CKW_BUILD_TESTING) endif() #--------------------------------------------------------------------- -# Examples - -function(add_ckw_example name) - add_executable(${name} ${ARGN}) - target_link_libraries(${name} PUBLIC ckw) -endfunction(add_ckw_example) +# Prototype -if(CKW_BUILD_EXAMPLES) - add_ckw_example(ckw_example_add_exp_store - examples/add_exp_store.cpp) +if(CKW_BUILD_PROTOTYPE) + add_subdirectory(prototype) endif() #--------------------------------------------------------------------- diff --git a/compute_kernel_writer/README.md b/compute_kernel_writer/README.md index 650eae2b48..8a24fe20ec 100644 --- a/compute_kernel_writer/README.md +++ b/compute_kernel_writer/README.md @@ -82,6 +82,5 @@ This project can be configured with the following build options. Enable options | CKW_ENABLE_OPENCL | Enable OpenCL code generation. | | CKW_ENABLE_ASSERTS | Enable assertions. Always enabled for Debug builds. | | CKW_BUILD_TESTING | Build the validation test suite. | -| CKW_BUILD_EXAMPLES | Build the examples. | | CKW_CCACHE | Use compiler cache for faster recompilation. | | CMAKE_TOOLCHAIN_FILE | When cross-compiling, set this variable to the path of the CMake toolchain file. | diff --git a/compute_kernel_writer/examples/add_exp_store.cpp b/compute_kernel_writer/examples/add_exp_store.cpp deleted file mode 100644 index ab2f3e15fb..0000000000 --- a/compute_kernel_writer/examples/add_exp_store.cpp +++ /dev/null @@ -1,181 +0,0 @@ -/* - * Copyright (c) 2023 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "ckw/Error.h" -#include "ckw/KernelWriter.h" -#include "ckw/TensorOperand.h" -#include "ckw/TensorTileSampler.h" -#include "ckw/TileOperand.h" -#include "ckw/Types.h" - -#include "acl/AclComponentArgument.h" -#include "acl/AclKernelWriter.h" -#include "acl/AclScopedKernelWriter.h" - -#include -#include - -using namespace ckw; - -TensorTileSampler create_simple_sampler(AclScopedKernelWriter writer) -{ - TensorTileSampler sampler; - - constexpr int32_t m0 = 4; - constexpr int32_t n0 = 4; - - auto &gid_0 = writer->declare_tile("gid_0", DataType::Int32); - auto &gid_1 = writer->declare_tile("gid_1", DataType::Int32); - auto &gid_2 = writer->declare_tile("gid_2", DataType::Int32); - - auto &const_0 = writer->declare_tile("0", 0); - - writer->op_get_global_id(gid_0, 0); - writer->op_get_global_id(gid_1, 1); - writer->op_get_global_id(gid_2, 2); - - sampler.x(gid_0); - sampler.y(gid_1); - sampler.z(gid_2); - sampler.b(const_0); - - sampler.width(n0); - sampler.height(m0); - - sampler.format(TensorSamplerFormat::C_WH_1); - sampler.address_mode_x(TensorSamplerAddressModeX::None); - sampler.address_mode_y(TensorSamplerAddressModeY::ClampToBorder); - sampler.address_mode_z(TensorSamplerAddressModeZ::Skip); - - return sampler; -} - -void op_binary_elementwise(AclScopedKernelWriter writer, std::vector operands) -{ - auto lhs = operands.at(0); - auto rhs = operands.at(1); - auto dst = operands.at(2); - - // Load the LHS and RHS tile and prepare the tensor sampler. - if(!lhs->has_tile() && !rhs->has_tile()) - { - const auto sampler = create_simple_sampler(writer); - - writer->op_load_once(lhs, sampler); - writer->op_load_once(rhs, sampler); - } - else if(lhs->has_tile()) - { - const auto &sampler = lhs->tile_sampler(); - writer->op_load_once(rhs, sampler); - } - else - { - const auto &sampler = rhs->tile_sampler(); - writer->op_load_once(lhs, sampler); - } - - auto &lhs_tile = lhs->tile(); - auto &rhs_tile = rhs->tile(); - const auto &sampler = lhs->tile_sampler(); - - // Prepare the output tile. - if(!dst->has_tile()) - { - auto &tile = writer->declare_tile("dst_tile", lhs_tile.tile_info()); - dst->init_virtual_tensor(tile, sampler); - } - - auto &dst_tile = dst->tile(); - - // Perform the operation. - writer->op_binary_expression(dst_tile, lhs_tile, rhs_tile, BinaryOp::Add); -} - -void op_exp(AclScopedKernelWriter writer, std::vector operands) -{ - auto src = operands.at(0); - auto dst = operands.at(1); - - // Load the source tile and prepare the sampler. - if(!src->has_tile()) - { - const auto sampler = create_simple_sampler(writer); - writer->op_load_once(src, sampler); - } - - auto &src_tile = src->tile(); - const auto &sampler = src->tile_sampler(); - - // Prepare the output tile. - if(!dst->has_tile()) - { - auto &tile = writer->declare_tile("dst_tile", src_tile.tile_info()); - dst->init_virtual_tensor(tile, sampler); - } - - auto &dst_tile = dst->tile(); - - // Perform the operation. 
- writer->op_scalar_function(dst_tile, src_tile, ScalarUnaryFunction::Exp); -} - -void op_store(AclScopedKernelWriter writer, std::vector operands) -{ - auto src = operands.at(0); - auto dst = operands.at(1); - - auto &src_tile = src->tile(); - const auto &sampler = src->tile_sampler(); - auto &dst_tensor = dst->tensor(); - - writer->op_store(dst_tensor, src_tile, sampler); -} - -int main() -{ - Kernel kernel("example", GpuTargetLanguage::OpenCL); - AclKernelWriter root_writer(kernel); - - AclScopedKernelWriter writer(&root_writer); - - const TensorInfo src0_info(DataType::Fp32, TensorShape({ 3, 10, 20, 1, 1 }), TensorDataLayout::Nhwc, 0); - const TensorInfo src1_info(DataType::Fp32, TensorShape({ 3, 10, 20, 1, 1 }), TensorDataLayout::Nhwc, 1); - const TensorInfo dst_info(DataType::Fp32, TensorShape({ 3, 10, 20, 1, 1 }), TensorDataLayout::Nhwc, 2); - - AclComponentArgument src0(writer->create_tensor_argument("src0", src0_info)); - AclComponentArgument src1(writer->create_tensor_argument("src1", src1_info)); - AclComponentArgument dst(writer->create_tensor_argument("dst", dst_info)); - - AclComponentArgument ans; - - op_binary_elementwise(writer, { &src0, &src1, &ans }); - op_exp(writer, { &ans, &ans }); - op_store(writer, { &ans, &dst }); - - const auto code = root_writer.generate_code(); - std::cout << code; - - return 0; -} diff --git a/compute_kernel_writer/include/acl/AclComponentArgument.h b/compute_kernel_writer/include/acl/AclComponentArgument.h deleted file mode 100644 index 485b7a30bc..0000000000 --- a/compute_kernel_writer/include/acl/AclComponentArgument.h +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Copyright (c) 2023 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef CKW_INCLUDE_ACL_ACLCOMPONENTARGUMENT_H -#define CKW_INCLUDE_ACL_ACLCOMPONENTARGUMENT_H - -#include "ckw/TensorTileSampler.h" - -namespace ckw -{ -class TensorOperand; -class TileOperand; -} // namespace ckw - -/** The argument of a dynamic fusion component which can be either user tensor or virtual tensor. */ -class AclComponentArgument -{ -public: - /** Initialize a new instance of @ref AclComponentArgument class for empty virtual tensor. */ - AclComponentArgument(); - - /** Initialize a new instance of @ref AclComponentArgument class for user tensor. - * - * @param[in] tensor The user tensor. - */ - explicit AclComponentArgument(ckw::TensorOperand &tensor); - - /** Set virtual tensor information (tile, sampler) for the argument. 
- * - * If the component is a user tensor, it can be treated as virtual tensor as well - * and won't be loaded again using @ref AclKernelWriter::op_load_once method. - * - * @param[in] tile The tile that has been loaded. - * @param[in] sampler The tensor sampling information that has been used to load the tile. - */ - AclComponentArgument &init_virtual_tensor(ckw::TileOperand &tile, const ckw::TensorTileSampler &sampler); - - /** Get whether the argument is a user tensor. */ - bool has_tensor() const; - - /** Get the tensor operand. - * - * If the tensor is not available, throw an error. - */ - ckw::TensorOperand &tensor(); - - /** Get the tensor operand. - * - * If the tensor is not available, throw an error. - */ - const ckw::TensorOperand &tensor() const; - - /** Get whether the argument contains a tile. - * - * The argument can be either a user tensor that has been loaded, - * or a virtual tensor (i.e. a tile with tensor sampling information). - */ - bool has_tile() const; - - /** Get the tile operand. - * - * If the tile is not available, throw an error. - */ - ckw::TileOperand &tile(); - - /** Get the tile operand. - * - * If the tile is not available, throw an error. - */ - const ckw::TileOperand &tile() const; - - /** Get the tensor sampling information for the tile. - * - * If the tile is not available, throw an error. - */ - ckw::TensorTileSampler &tile_sampler(); - - /** Get the tensor sampling information for the tile. - * - * If the tile is not available, throw an error. - */ - const ckw::TensorTileSampler &tile_sampler() const; - -private: - ckw::TensorOperand *_tensor{ nullptr }; - ckw::TileOperand *_tile{ nullptr }; - ckw::TensorTileSampler _tile_sampler{}; -}; - -#endif // CKW_INCLUDE_ACL_ACLCOMPONENTARGUMENT_H diff --git a/compute_kernel_writer/include/acl/AclKernelWriter.h b/compute_kernel_writer/include/acl/AclKernelWriter.h deleted file mode 100644 index 27b7add6ef..0000000000 --- a/compute_kernel_writer/include/acl/AclKernelWriter.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (c) 2023 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef CKW_INCLUDE_ACL_ACLKERNELWRITER_H -#define CKW_INCLUDE_ACL_ACLKERNELWRITER_H - -#include "ckw/KernelWriter.h" -#include "ckw/TensorTileSampler.h" - -class AclComponentArgument; - -namespace ckw -{ -class Kernel; -} // namespace ckw - -/** Extended implementation of kernel writer for dynamic fusion. 
*/ -class AclKernelWriter : public ckw::KernelWriter -{ -public: - /** Initialize a new instance of @ref AclKernelWriter class. - * - * @param[in] kernel The kernel to be generated. - */ - explicit AclKernelWriter(ckw::Kernel &kernel); - - /** Load the user tensor to the tile in the same component argument if it hasn't been loaded. - * - * @param[in] tensor_or_tile The component argument that is either a user tensor or a virtual tensor. - * @param[in] sampler The tensor sampling information to load the tile. - */ - void op_load_once(AclComponentArgument *tensor_or_tile, const ckw::TensorTileSampler &sampler); -}; - -#endif // CKW_INCLUDE_ACL_ACLKERNELWRITER_H diff --git a/compute_kernel_writer/include/acl/AclScopedKernelWriter.h b/compute_kernel_writer/include/acl/AclScopedKernelWriter.h deleted file mode 100644 index 6cb957bfb5..0000000000 --- a/compute_kernel_writer/include/acl/AclScopedKernelWriter.h +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (c) 2023 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef CKW_INCLUDE_ACL_ACLSCOPEDKERNELWRITER_H -#define CKW_INCLUDE_ACL_ACLSCOPEDKERNELWRITER_H - -#include - -class AclKernelWriter; - -/** Helper to automatically manage kernel writer ID space. */ -class AclScopedKernelWriter -{ -public: - /** Initialize a new instance of @ref AclScopedKernelWriter class. */ - explicit AclScopedKernelWriter(AclKernelWriter *writer); - - /** Create a new scope from the specified scoped kernel writer. */ - AclScopedKernelWriter(const AclScopedKernelWriter &other); - - /** Assignment is disallowed. */ - AclScopedKernelWriter &operator=(const AclScopedKernelWriter &) = delete; - - /** Access the underlying kernel writer. */ - AclKernelWriter *operator->(); - - /** Access the underlying kernel writer. */ - const AclKernelWriter *operator->() const; - - /** Get the kernel writer. */ - AclKernelWriter *writer(); - - /** Get the kernel writer. */ - const AclKernelWriter *writer() const; - -private: - AclKernelWriter *_writer; - int32_t _parent_id_space; -}; - -#endif // CKW_INCLUDE_ACL_ACLSCOPEDKERNELWRITER_H diff --git a/compute_kernel_writer/include/ckw/Error.h b/compute_kernel_writer/include/ckw/Error.h index 8c4853722b..996893823e 100644 --- a/compute_kernel_writer/include/ckw/Error.h +++ b/compute_kernel_writer/include/ckw/Error.h @@ -21,12 +21,11 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ +#ifndef COMPUTE_KERNEL_WRITER_INCLUDE_CKW_ERROR_H +#define COMPUTE_KERNEL_WRITER_INCLUDE_CKW_ERROR_H -#ifndef CKW_INCLUDE_CKW_ERROR_H -#define CKW_INCLUDE_CKW_ERROR_H - -#include #include +#include namespace ckw { @@ -45,59 +44,16 @@ std::string create_error_msg(const std::string &file, const std::string &func, c * * @param[in] msg Message to display. */ -#define COMPUTE_KERNEL_WRITER_ERROR_ON_MSG(msg) \ - do \ - { \ - const std::string arg0(__FILE__); \ - const std::string arg1(__func__); \ - const std::string arg2(std::to_string(__LINE__)); \ - const std::string arg3(msg); \ +#define COMPUTE_KERNEL_WRITER_ERROR_ON_MSG(msg) \ + do \ + { \ + const std::string arg0(__FILE__); \ + const std::string arg1(__func__); \ + const std::string arg2(std::to_string(__LINE__)); \ + const std::string arg3(msg); \ std::runtime_error(create_error_msg(arg0, arg1, arg2, arg3)); \ } while(false) -/** If the condition is not met, throw an std::runtime_error with the specified message. - * - * @param[in] cond The condition that is expected to be true. - * @param[in] msg The error message when the condition is not met. - */ -#define CKW_ASSERT_MSG(cond, msg) \ - do \ - { \ - if(!(cond)) \ - { \ - throw ::std::runtime_error(msg); \ - } \ - } while(false) - -/** If the condition is not met, throw an std::runtime_error. - * - * @param[in] cond The condition that is expected to be true. - */ -#define CKW_ASSERT(cond) CKW_ASSERT_MSG(cond, #cond) - -/** If the precondition is met but the consequence is not met, throw an std::runtime_error. - * - * @param[in] precond The condition if is met requires the consequence must also be met. - * @param[in] cond The condition that is expected to be true if the precondition is true. - */ -#define CKW_ASSERT_IF(precond, cond) \ - CKW_ASSERT_MSG(!(precond) || ((precond) && (cond)), #precond " |-> " #cond) - -/** Mark the variables as unused. - * - * @param[in] ... Variables which are unused. - */ -#define CKW_UNUSED(...) ::ckw::ignore_unused(__VA_ARGS__) // NOLINT - -/** Mark the variables as unused. - * - * @param[in] ... Variables which are unused. - */ -template -inline void ignore_unused(T &&...) -{ -} - } // namespace ckw -#endif // CKW_INCLUDE_CKW_ERROR_H +#endif /* COMPUTE_KERNEL_WRITER_INCLUDE_CKW_ERROR_H */ diff --git a/compute_kernel_writer/include/ckw/Kernel.h b/compute_kernel_writer/include/ckw/Kernel.h deleted file mode 100644 index cbc7700c22..0000000000 --- a/compute_kernel_writer/include/ckw/Kernel.h +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2023 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef CKW_INCLUDE_CKW_KERNEL_H -#define CKW_INCLUDE_CKW_KERNEL_H - -#include "ckw/OperandBase.h" -#include "ckw/Types.h" - -#include -#include -#include - -namespace ckw -{ - -namespace prototype -{ -class GpuKernelWriterDataHolder; -} // namespace prototype - -/** The target for kernel writer to write into. */ -class Kernel -{ -public: - /** Constructor - * - * @param[in] name The name of the kernel function. - * @param[in] language The programming language to write the kernel. - */ - Kernel(const char *name, GpuTargetLanguage language); - - /** Destructor */ - ~Kernel(); - - /** Get the name of the kernel function. */ - const std::string &name() const; - - /** (Internal use only) Get the map from operand name to the operand declared in this kernel. */ - const ::std::map<::std::string, ::std::unique_ptr> &operands() const; - - /** (Internal use only) Get the map from operand name to the operand declared in this kernel. */ - ::std::map<::std::string, ::std::unique_ptr> &operands(); - - /** (Internal use only) Get the implementation data. */ - prototype::GpuKernelWriterDataHolder *impl(); - -private: - ::std::string _name; - ::std::unique_ptr _kernel; - ::std::map<::std::string, ::std::unique_ptr> _operands; -}; - -} // namespace ckw - -#endif // CKW_INCLUDE_CKW_KERNEL_H diff --git a/compute_kernel_writer/include/ckw/KernelWriter.h b/compute_kernel_writer/include/ckw/KernelWriter.h deleted file mode 100644 index 5dce62a14c..0000000000 --- a/compute_kernel_writer/include/ckw/KernelWriter.h +++ /dev/null @@ -1,217 +0,0 @@ -/* - * Copyright (c) 2023 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef CKW_INCLUDE_CKW_KERNELWRITER_H -#define CKW_INCLUDE_CKW_KERNELWRITER_H - -#include "ckw/Kernel.h" -#include "ckw/TensorInfo.h" -#include "ckw/TensorOperand.h" -#include "ckw/TileInfo.h" -#include "ckw/TileOperand.h" - -#include - -namespace ckw -{ - -namespace prototype -{ -struct GpuKernelWriterAttribute; -class IGpuKernelWriter; -} // namespace prototype - -/** Kernel writer. 
*/ -class KernelWriter -{ -public: - // ============================================================================================= - // Constructors and destructor - // ============================================================================================= - - /** Initialize a new instance of kernel writer. - * - * @param[in] kernel The kernel to be written to. - */ - explicit KernelWriter(Kernel &kernel); - - /** Destructor */ - ~KernelWriter(); - - /** No copy constructor. */ - KernelWriter(const KernelWriter &) = delete; - - /** No copy assignment. */ - KernelWriter &operator=(const KernelWriter &) = delete; - - // ============================================================================================= - // Scope management - // ============================================================================================= - - /** Get the current ID space. */ - int32_t id_space() const; - - /** Set the current ID space. */ - KernelWriter &id_space(int32_t id_space); - - /** Switch to and return a new ID space. */ - int32_t next_id_space(); - - // ============================================================================================= - // Tensor and tile declaration - // ============================================================================================= - - /** Define a tensor argument. - * - * @param[in] name The name of the tensor. - * @param[in] info The tensor info. - * - * @return The @ref TensorOperand object. - */ - TensorOperand &create_tensor_argument(const char *name, const TensorInfo &info); - - /** Define a compile-time constant scalar argument. - * - * @param[in] name The name of the tile. - * @param[in] value The value of the tile. - * - * @return The @ref TileOperand object. - */ - TileOperand &create_tile_argument(const char *name, int32_t value); - - /** Declare a new tile. - * - * The name of the tile must be unique in the current ID space. - * - * @param[in] name The name of the tile. - * @param[in] ... The necessary arguments to create a new @ref TileOperand. - * - * @return The @ref TileOperand object. - */ - template - TileOperand &declare_tile(const char *name, TArgs &&...args) - { - const auto var_name = generate_variable_name(name); - auto operand = new TileOperand(var_name, ::std::forward(args)...); - register_operand(operand, true); - - return *operand; - } - - // ============================================================================================= - // Load and store - // ============================================================================================= - - /** Load the data from the tensor memory to the tile using the sampling information. - * - * @param[out] tile The tile to be loaded. - * @param[in] tensor The tensor to be read. - * @param[in] sampler The tensor sampling information. - */ - void op_load(TileOperand &tile, TensorOperand &tensor, const TensorTileSampler &sampler); - - /** Store the tile to the tensor using the specified sampling information. - * - * @param[out] dst The tensor that the tile is written to. - * @param[in] src The tile to be stored. - * @param[in] sampler The tensor sampling information. - */ - void op_store(TensorOperand &tensor, const TileOperand &tile, const TensorTileSampler &sampler); - - // ============================================================================================= - // Data processing - // ============================================================================================= - - /** Write assignment: ` = `. - * - * @param[in] dst The destination tile. 
- * @param[in] src The source tile. - */ - void op_assign(TileOperand &dst, const TileOperand &src); - - /** Write binary expression: ` = `. - * - * @param[in] dst The destination tile. - * @param[in] lhs The LHS operand. - * @param[in] rhs The RHS operand. - * @param[in] op The binary operator. - */ - void op_binary_expression(TileOperand &dst, const TileOperand &lhs, const TileOperand &rhs, BinaryOp op); - - /** Write function applied to scalar value: ` = ()`. - * - * @param[in] dst The destination tile. - * @param[in] src The source tile. - * @param[in] func The function to be applied to the source tile. - */ - void op_scalar_function(TileOperand &dst, const TileOperand &src, ScalarUnaryFunction func); - - // ============================================================================================= - // Misc - // ============================================================================================= - - /** Set `dst` the global ID of dimension `dim`. - * - * @param[in] dst The tile to be written to. - * @param[in] dim The global ID dimension. - */ - void op_get_global_id(TileOperand &dst, int32_t dim); - - // ============================================================================================= - // Code generation - // ============================================================================================= - - /** Generate the source code of the kernel. */ - ::std::string generate_code(); - -private: - /** Generate the full variable name based on the original name and the ID space. - * - * @param[in] name The name of the variable. - * - * @return The full variable name. - */ - ::std::string generate_variable_name(const char *name) const; - - /** Register the operand to the kernel. - * - * The operand is uniquely owned by the kernel afterward. - * - * @param[in] operand The operand to be registered. - * @param[in] declaring Whether the tile declaration is generated. - */ - void register_operand(OperandBase *operand, bool declaring); - -private: - Kernel *_kernel; - ::std::unique_ptr _impl_attr; - ::std::unique_ptr _impl; - - int32_t _id_space{ 0 }; - int32_t _max_id_space{ 0 }; -}; - -} // namespace ckw - -#endif // CKW_INCLUDE_CKW_KERNELWRITER_H diff --git a/compute_kernel_writer/include/ckw/OperandBase.h b/compute_kernel_writer/include/ckw/OperandBase.h deleted file mode 100644 index 0ea5030968..0000000000 --- a/compute_kernel_writer/include/ckw/OperandBase.h +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright (c) 2023 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef CKW_INCLUDE_CKW_OPERANDBASE_H -#define CKW_INCLUDE_CKW_OPERANDBASE_H - -#include "ckw/Types.h" -#include - -namespace ckw -{ -namespace prototype -{ -class IGpuKernelWriter; -class Operand; -} // namespace prototype - -/** The base class for all operands. */ -class OperandBase -{ -public: - /** Constructor - * - * @param[in] name The name of the operand. - */ - explicit OperandBase(const ::std::string &name); - - /** Destructor */ - virtual ~OperandBase(); - - /** (Internal use only) Create the implementation operand. - * - * @param[in] writer The implementation kernel writer. - */ - virtual prototype::Operand create_impl_operand(prototype::IGpuKernelWriter *writer) const = 0; - - /** Get the name of the operand. */ - const ::std::string &name() const; - - /** Set the name of the operand. */ - OperandBase &name(const ::std::string &name); - - /** Get the data type of the operand. */ - virtual DataType data_type() const = 0; - - /** Get whether the operand is compile-time constant. */ - virtual bool is_constant() const = 0; - -private: - ::std::string _name; -}; - -} // namespace ckw - -#endif // CKW_INCLUDE_CKW_OPERANDBASE_H diff --git a/compute_kernel_writer/include/ckw/ScalarValue.h b/compute_kernel_writer/include/ckw/ScalarValue.h deleted file mode 100644 index cf017d435f..0000000000 --- a/compute_kernel_writer/include/ckw/ScalarValue.h +++ /dev/null @@ -1,137 +0,0 @@ -/* - * Copyright (c) 2023 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef CKW_INCLUDE_CKW_SCALARVALUE_H -#define CKW_INCLUDE_CKW_SCALARVALUE_H - -#include "ckw/Error.h" - -#include - -namespace ckw -{ - -/** The scalar value known at compile-time. */ -class ScalarValue -{ -public: - /** Initialize a new instance of @ref ScalarValue class with integer value 0. */ - ScalarValue() - { - _type = Type::INT; - _value.i64 = 0; - } - - /** Initialize a new instance of @ref ScalarValue class with the specified value. */ - template - ScalarValue(T value) - { - set(value); - } - - /** Set the value. 
*/ - template - void set(T value) - { - CKW_ASSERT(::std::is_integral::value || ::std::is_floating_point::value); - CKW_ASSERT(sizeof(T) <= 8); - - _size = sizeof(T); - - if(::std::is_integral::value) - { - if(::std::is_signed::value) - { - _type = Type::INT; - _value.i64 = value; - } - else - { - _type = Type::UINT; - _value.u64 = value; - } - } - else - { - _type = Type::FLOAT; - _value.f64 = value; - } - } - - /** Get the value. - * - * The caller must make sure that what has been stored in the object must fit - * the output data type without data corruption or loss of accuracy. - */ - template - T get() const - { - CKW_ASSERT(::std::is_integral::value || ::std::is_floating_point::value); - CKW_ASSERT(sizeof(T) >= _size); - - if(::std::is_integral::value) - { - if(::std::is_signed::value) - { - CKW_ASSERT(_type == Type::INT || _type == Type::UINT); - CKW_ASSERT_IF(_type == Type::UINT, sizeof(T) > _size); - - return _value.i64; - } - else - { - CKW_ASSERT(_type == Type::INT); - - return _value.u64; - } - } - else - { - return _value.f64; - } - } - -private: - union Value - { - int64_t i64; - uint64_t u64; - double f64; - }; - - enum class Type : int32_t - { - UINT, - INT, - FLOAT, - }; - - Value _value{}; - Type _type{}; - uint32_t _size{}; -}; - -} // namespace ckw - -#endif // CKW_INCLUDE_CKW_SCALARVALUE_H diff --git a/compute_kernel_writer/include/ckw/TensorOperand.h b/compute_kernel_writer/include/ckw/TensorOperand.h deleted file mode 100644 index 130ab596fb..0000000000 --- a/compute_kernel_writer/include/ckw/TensorOperand.h +++ /dev/null @@ -1,181 +0,0 @@ -/* - * Copyright (c) 2023 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef CKW_INCLUDE_CKW_TENSOROPERAND_H -#define CKW_INCLUDE_CKW_TENSOROPERAND_H - -#include "ckw/OperandBase.h" -#include "ckw/TensorInfo.h" -#include "ckw/TensorTileSampler.h" -#include "ckw/TileOperand.h" -#include "ckw/Types.h" - -#include - -namespace ckw -{ - -class TensorComponentOperand; - -// ================================================================================================= -// TensorOperand -// ================================================================================================= - -/** Tensor operand */ -class TensorOperand : public OperandBase -{ -public: - /** Initialize a new instance of @ref TensorOperand class. - * - * @param[in] name The name of the tensor. - * @param[in] info The tensor info. 
- */ - TensorOperand(const ::std::string &name, const TensorInfo &info); - - /** No copy constructor. */ - TensorOperand(const TensorOperand &other) = delete; - - /** No copy assignment. */ - TensorOperand &operator=(const TensorOperand &other) = delete; - - /** (Internal use only) Create the implementation operand. - * - * @param[in] writer The implementation kernel writer. - */ - virtual prototype::Operand create_impl_operand(prototype::IGpuKernelWriter *writer) const override; - - /** Get the tensor info. */ - const TensorInfo &info() const; - - /** Get the tensor info. */ - TensorInfo &info(); - - /** Get the data type. */ - virtual DataType data_type() const override; - - /** Get whether the tensor is compile-time constant. */ - virtual bool is_constant() const override; - - /** Get the default tile attached to the tensor. */ - const TileOperand &tile() const; - - /** Get the default tile attached to the tensor. */ - TileOperand &tile(); - - /** Set the default tile attached to the tensor. */ - TensorOperand &tile(TileOperand &tile); - - /** Get the tensor sampler of the default tile. */ - const TensorTileSampler &tile_sampler() const; - - /** Get the tensor sampler of the default tile. */ - TensorTileSampler &tile_sampler(); - - /** Set the tensor sampler of the default tile. */ - TensorOperand &tile_sampler(const TensorTileSampler &value); - - /** Get the operand that contains the stride in y dimension of the tensor. */ - TileOperand &stride1(); - - /** Get the operand that contains the stride in z dimension of the tensor. */ - TileOperand &stride2(); - - /** Get the operand that contains the stride in w dimension of the tensor. */ - TileOperand &stride3(); - - /** Get the operand that contains the stride in w dimension of the tensor. */ - TileOperand &stride4(); - - /** Get the operand that contains the size of dimension 0 of the tensor. */ - TileOperand &dim0(); - - /** Get the operand that contains the size of dimension 1 of the tensor. */ - TileOperand &dim1(); - - /** Get the operand that contains the size of dimension 2 of the tensor. */ - TileOperand &dim2(); - - /** Get the operand that contains the size of dimension 3 of the tensor. */ - TileOperand &dim3(); - - /** Get the operand that contains the size of dimension 4 of the tensor. */ - TileOperand &dim4(); - - /** Get the operand that contains the size of dimensions 1 and 2 collapsed. */ - TileOperand &dim1_dim2(); - - /** Get the operand that contains the size of dimensions 1, 2 and 3 collapsed. */ - TileOperand &dim1_dim2_dim3(); - - /** Get the operand that contains the offset in bytes to the first element. 
*/ - TileOperand &offset_first_element_in_bytes(); - -private: - TensorInfo _info; - - TileOperand *_tile{ nullptr }; - TensorTileSampler _tile_sampler{}; - - ::std::unique_ptr _stride1{ nullptr }; - ::std::unique_ptr _stride2{ nullptr }; - ::std::unique_ptr _stride3{ nullptr }; - ::std::unique_ptr _stride4{ nullptr }; - ::std::unique_ptr _dim0{ nullptr }; - ::std::unique_ptr _dim1{ nullptr }; - ::std::unique_ptr _dim2{ nullptr }; - ::std::unique_ptr _dim3{ nullptr }; - ::std::unique_ptr _dim4{ nullptr }; - ::std::unique_ptr _dim1_dim2{ nullptr }; - ::std::unique_ptr _dim1_dim2_dim3{ nullptr }; - ::std::unique_ptr _offset_first_element_in_bytes{ nullptr }; -}; - -// ================================================================================================= -// TensorComponentOperand -// ================================================================================================= - -/** Tile operand that contains tensor information. */ -class TensorComponentOperand : public TileOperand -{ -public: - /** Initialize a new instance of @ref TensorComponentOperand class. - * - * @param[in] name The name of the operand. - * @param[in] component The tensor info component. - */ - TensorComponentOperand(const ::std::string &name, TensorComponent component); - - /** (Internal use only) Create the implementation operand. - * - * @param[in] writer The implementation kernel writer. - */ - virtual prototype::Operand create_impl_operand(prototype::IGpuKernelWriter *writer) const override; - -private: - TensorComponent _component; -}; - -} // namespace ckw - -#endif // CKW_INCLUDE_CKW_TENSOROPERAND_H diff --git a/compute_kernel_writer/include/ckw/TensorTileSampler.h b/compute_kernel_writer/include/ckw/TensorTileSampler.h deleted file mode 100644 index 5ef7bca647..0000000000 --- a/compute_kernel_writer/include/ckw/TensorTileSampler.h +++ /dev/null @@ -1,163 +0,0 @@ -/* - * Copyright (c) 2023 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef CKW_INCLUDE_CKW_TENSORTILESAMPLER_H -#define CKW_INCLUDE_CKW_TENSORTILESAMPLER_H - -#include "ckw/Types.h" -#include - -namespace ckw -{ - -class TileOperand; - -/** Tensor sampler - * - * It contains information about how the result tile should be stored to tensor memory. - * It can also be used to dictate how the subsequent operators fetch the input tensor. - */ -class TensorTileSampler -{ -public: - /** Initialize a new instance of @ref TensorSampler class. 
*/ - TensorTileSampler(); - - /** Initialize a new instance of @ref TensorSampler class. - * - * @param[in] x The coordinate in the x dimension. - * @param[in] y The coordinate in the y dimension. - * @param[in] z The coordinate in the z dimension. - * @param[in] b The coordinate in the batch dimension. - * @param[in] format The tensor data format. - * @param[in] address_mode_x The address mode of the x dimension. - * @param[in] address_mode_y The address mode of the y dimension. - * @param[in] address_mode_z The address mode of the z dimension. - */ - TensorTileSampler( - TileOperand &x, TileOperand &y, TileOperand &z, TileOperand &b, - TensorSamplerFormat format, - TensorSamplerAddressModeX address_mode_x, - TensorSamplerAddressModeY address_mode_y, - TensorSamplerAddressModeZ address_mode_z); - - /** Initialize a new instance of @ref TensorSampler class. - * - * @param[in] x The coordinate in the x dimension. - * @param[in] y The coordinate in the y dimension. - * @param[in] z The coordinate in the z dimension. - * @param[in] b The coordinate in the batch dimension. - * @param[in] height The height of the tile. - * @param[in] width The width of the tile. - * @param[in] format The tensor data format. - * @param[in] address_mode_x The address mode of the x dimension. - * @param[in] address_mode_y The address mode of the y dimension. - * @param[in] address_mode_z The address mode of the z dimension. - */ - TensorTileSampler( - TileOperand &x, TileOperand &y, TileOperand &z, TileOperand &b, - int32_t height, int32_t width, - TensorSamplerFormat format, - TensorSamplerAddressModeX address_mode_x, - TensorSamplerAddressModeY address_mode_y, - TensorSamplerAddressModeZ address_mode_z); - - /** Get the coordinate in the x dimension. */ - const TileOperand &x() const; - - /** Set the coordinate in the x dimension. */ - TensorTileSampler &x(TileOperand &x); - - /** Get the coordinate in the y dimension. */ - const TileOperand &y() const; - - /** Set the coordinate in the y dimension. */ - TensorTileSampler &y(TileOperand &y); - - /** Get the coordinate in the z dimension. */ - const TileOperand &z() const; - - /** Set the coordinate in the z dimension. */ - TensorTileSampler &z(TileOperand &z); - - /** Get the coordinate in the batch dimension. */ - const TileOperand &b() const; - - /** Set the coordinate in the batch dimension. */ - TensorTileSampler &b(TileOperand &b); - - /** Get the width of the tile. */ - int32_t width() const; - - /** Set the width of the tile. */ - TensorTileSampler &width(int32_t width); - - /** Get the height of the tile. */ - int32_t height() const; - - /** Set the height of the tile. */ - TensorTileSampler &height(int32_t height); - - /** Get the format of the tensor. */ - TensorSamplerFormat format() const; - - /** Set the format of the tensor. */ - TensorTileSampler &format(TensorSamplerFormat format); - - /** Get the address mode of the x dimension. */ - TensorSamplerAddressModeX address_mode_x() const; - - /** Set the address mode of the x-dimension. */ - TensorTileSampler &address_mode_x(TensorSamplerAddressModeX address_mode_x); - - /** Get the address mode of the y dimension. */ - TensorSamplerAddressModeY address_mode_y() const; - - /** Set the address mode of the y dimension. */ - TensorTileSampler &address_mode_y(TensorSamplerAddressModeY address_mode_y); - - /** Get the address mode of the z dimension. */ - TensorSamplerAddressModeZ address_mode_z() const; - - /** Set the address mode of the z dimension. 
*/ - TensorTileSampler &address_mode_z(TensorSamplerAddressModeZ address_mode_z); - -private: - TileOperand *_x{ nullptr }; - TileOperand *_y{ nullptr }; - TileOperand *_z{ nullptr }; - TileOperand *_b{ nullptr }; - - int32_t _height{ 0 }; - int32_t _width{ 0 }; - - TensorSamplerFormat _format{ TensorSamplerFormat::Unknown }; - TensorSamplerAddressModeX _address_mode_x{ TensorSamplerAddressModeX::Unknown }; - TensorSamplerAddressModeY _address_mode_y{ TensorSamplerAddressModeY::Unknown }; - TensorSamplerAddressModeZ _address_mode_z{ TensorSamplerAddressModeZ::Unknown }; -}; - -} // namespace ckw - -#endif // CKW_INCLUDE_CKW_TENSORTILESAMPLER_H diff --git a/compute_kernel_writer/include/ckw/TileOperand.h b/compute_kernel_writer/include/ckw/TileOperand.h deleted file mode 100644 index 1eee18589f..0000000000 --- a/compute_kernel_writer/include/ckw/TileOperand.h +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Copyright (c) 2023 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef CKW_INCLUDE_CKW_TILEOPERAND_H -#define CKW_INCLUDE_CKW_TILEOPERAND_H - -#include "ckw/Error.h" -#include "ckw/OperandBase.h" -#include "ckw/ScalarValue.h" -#include "ckw/TileInfo.h" - -#include - -namespace ckw -{ - -class Kernel; - -/** Tile operand which can be either scalar, vector or 2D tile. */ -class TileOperand : public OperandBase -{ -public: - /** Initialize a new instance of @ref TileOperand class with the tile information. - * - * @param[in] name The name of the tile. - * @param[in] tile_info The tile info. - */ - TileOperand(const ::std::string &name, const TileInfo &tile_info); - - /** Initialize a new instance of @ref TileOperand for scalar variable. - * - * @param[in] name The name of the tile. - * @param[in] data_type The data type of the tile. - */ - TileOperand(const ::std::string &name, DataType data_type); - - /** Initialize a new instance of @ref TileOperand for compile-time constant scalar variable. - * - * @param[in] name The name of the tile. - * @param[in] value The value of the tile. - */ - TileOperand(const ::std::string &name, int32_t value); - - /** Initialize a new instance of @ref TileOperand for compile-time constant scalar variable. - * - * @param[in] name The name of the tile. - * @param[in] value The value of the tile. - */ - TileOperand(const ::std::string &name, float value); - - /** Prohibit copy of tile operand. */ - TileOperand(const TileOperand &) = delete; - - /** Prohibit copy of tile operand. 
*/ - TileOperand &operator=(const TileOperand &) = delete; - - /** (Internal use only) Create the implementation operand. - * - * @param[in] writer The implementation kernel writer. - */ - virtual prototype::Operand create_impl_operand(prototype::IGpuKernelWriter *writer) const override; - - /** Get the tile info. */ - const TileInfo &tile_info() const; - - /** Get the data type of the tile. */ - virtual DataType data_type() const override; - - /** Get whether the tile is compile-time constant. */ - virtual bool is_constant() const override; - - /** Get whether the tile is a scalar value. */ - bool is_scalar() const; - - /** Get the scalar value of the tile. - * - * The tile must have the shape of 1, 1 (i.e. scalar). - */ - ScalarValue scalar_value() const; - -private: - TileInfo _info; - ScalarValue _value{}; - bool _constant; -}; - -} // namespace ckw - -#endif // CKW_INCLUDE_CKW_TILEOPERAND_H diff --git a/compute_kernel_writer/include/ckw/Types.h b/compute_kernel_writer/include/ckw/Types.h index 5516718e54..c9f80b65e0 100644 --- a/compute_kernel_writer/include/ckw/Types.h +++ b/compute_kernel_writer/include/ckw/Types.h @@ -21,120 +21,25 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ - -#ifndef CKW_INCLUDE_CKW_TYPES_H -#define CKW_INCLUDE_CKW_TYPES_H - -#include -#include +#ifndef COMPUTE_KERNEL_WRITER_INCLUDE_CKW_TYPES_H +#define COMPUTE_KERNEL_WRITER_INCLUDE_CKW_TYPES_H namespace ckw { - /** Compute Kernel Writer data types. This data type is used by the code variables and tensor arguments. */ enum class DataType -{ - Unknown = 0x00, - Fp32 = 0x11, - Fp16 = 0x12, - Int32 = 0x21, - Int16 = 0x22, - Int8 = 0x24, - Uint32 = 0x31, - Uint16 = 0x32, - Uint8 = 0x34, - Bool = 0x41 -}; - -enum class GpuTargetLanguage { Unknown, - OpenCL -}; - -/* Binary operations -*/ -enum class BinaryOp : int32_t -{ - // Elementwise - Add = 0x0000, // + - Sub = 0x0001, // - - Mul = 0x0002, // * - Div = 0x0003, // / - Mod = 0x0004, // % - // Relational - Equal = 0x1000, // == - Less = 0x1001, // < - LessEqual = 0x1002, // <= - Greater = 0x1003, // > - GreaterEqual = 0x1004, // >= - // Algebra - MatMul_Nt_Nt = 0x2000, // X - MatMul_Nt_T = 0x2001, // X - MatMul_T_Nt = 0x2002, // X - MatMul_T_T = 0x2003, // X - Dot = 0x2004, // . - // Logical - LogicalAnd = 0x3000, // && - LogicalOr = 0x3001, // || - LogicalNot = 0x3002 // ! -}; - -enum class AssignmentOp : int32_t -{ - // Unary - Increment = 0x0000, // += - Decrement = 0x0001, // -= -}; - -enum class ScalarUnaryFunction : int32_t -{ - Exp, -}; - -enum class TensorSamplerFormat : int32_t -{ - Unknown = 0, - C_WH_1 = 1, - C_W_H = 2 -}; - -enum class TensorSamplerAddressModeX : int32_t -{ - Unknown = 0, - None = 1, // The user guarantees that the X coordinate is always in-bound - OverlappingMin = 2 // (FIXED shapes only) Reduce the load/store length when x == 0 (MIN). The load length will be width % original length - // Leftover elements can be handled using overlapping. This involves processing some of the elements in the array twice. + Fp32, + Fp16, + Int32, + Int16, + Int8, + Uint32, + Uint16, + Uint8, + Bool }; - -enum class TensorSamplerAddressModeY : int32_t -{ - Unknown = 0, - None = 1, // The user guarantees that the Y coordinate is always in-bound - OverlappingMin = 2, // (FIXED shapes only) Reduce the load/store length when x == 0 (MIN). The load length will be width % original length - Skip = 3, // Skip the read/write - SkipMinEdgeOnly = 4, // Skip greater than or equal to max only. 
The user guarantees that the Y coordinate is always >= 0 - SkipMaxEdgeOnly = 5, // Skip less than 0 only - ClampToNearest = 6, // Clamp the coordinate to nearest edge (0 or max value allowed on Y) - ClampToMinEdgeOnly = 7, // Clamp the negative coordinate to 0 only. Therefore, we expect Y to be always < MAX - ClampToMaxEdgeOnly = 8, // Clamp the coordinate to the max value allowed on Y only. We expect Y to be always >= 0 - ClampToBorder = 9, // Clamp to border which always has 0 value - ClampToBorderMinEdgeOnly = 10, - ClampToBorderMaxEdgeOnly = 11 -}; - -enum class TensorSamplerAddressModeZ : int32_t -{ - Unknown = 0, - None = 1, // The user guarantees that the Y coordinate is always in-bound - Skip = 3, // Skip the read/write - SkipMinEdgeOnly = 4, // Skip greater than or equal to max only. The user guarantees that the Y coordinate is always >= 0 - SkipMaxEdgeOnly = 5, // Skip less than 0 only - ClampToNearest = 6, // Clamp the coordinate to nearest edge (0 or max value allowed on Y) - ClampToMinEdgeOnly = 7, // Clamp the negative coordinate to 0 only. Therefore, we expect Y to be always < MAX - ClampToMaxEdgeOnly = 8, // Clamp the coordinate to the max value allowed on Y only. We expect Y to be always >= 0 -}; - } // namespace ckw -#endif // CKW_INCLUDE_CKW_TYPES_H +#endif /* COMPUTE_KERNEL_WRITER_INCLUDE_CKW_TYPES_H */ diff --git a/compute_kernel_writer/prototype/CMakeLists.txt b/compute_kernel_writer/prototype/CMakeLists.txt new file mode 100644 index 0000000000..84436a95d6 --- /dev/null +++ b/compute_kernel_writer/prototype/CMakeLists.txt @@ -0,0 +1,72 @@ +# Copyright (c) 2023 Arm Limited. +# +# SPDX-License-Identifier: MIT +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to +# deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +# sell copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
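+
+# This CMakeLists builds the prototype CKW library (ckw_prototype), the common example
+# helpers library (ckw_prototype_examples_common) and the add_exp_store example executable.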
+ +cmake_minimum_required(VERSION 3.14 FATAL_ERROR) + +#--------------------------------------------------------------------- +# Prototype + +add_library(ckw_prototype + src/TileInfo.cpp + src/TensorInfo.cpp + src/Kernel.cpp + src/KernelWriter.cpp + src/OperandBase.cpp + src/TileOperand.cpp + src/TensorOperand.cpp + src/TensorTileSampler.cpp +) + +target_compile_options(ckw_prototype + PUBLIC + ${CKW_CXX_FLAGS} + "$<$:${GNU_WARNINGS}>" + "$<$:${CKW_ASSERTS_OPTS}>" + "$<$:${CKW_ASSERTS_OPTS}>" + ${CMAKE_CXX_FLAGS} +) + +target_compile_definitions(ckw_prototype PUBLIC + $<$:COMPUTE_KERNEL_WRITER_DEBUG_ENABLED> + $<$:COMPUTE_KERNEL_WRITER_ASSERTS_ENABLED> + $<$:COMPUTE_KERNEL_WRITER_ASSERTS_ENABLED> + $<$:COMPUTE_KERNEL_WRITER_OPENCL_ENABLED> +) + +target_include_directories(ckw_prototype + PUBLIC ${CMAKE_CURRENT_LIST_DIR}/include + PRIVATE ${CMAKE_CURRENT_LIST_DIR} +) + +#--------------------------------------------------------------------- +# Examples + +add_library(ckw_prototype_examples_common + examples/common/ExampleKernelWriter.cpp + examples/common/ExampleScopedKernelWriter.cpp + examples/common/ExampleComponentArgument.cpp +) + +target_link_libraries(ckw_prototype_examples_common PUBLIC ckw_prototype) + +add_executable(ckw_prototype_examples_add_exp_store examples/add_exp_store.cpp) +target_link_libraries(ckw_prototype_examples_add_exp_store PUBLIC ckw_prototype_examples_common) diff --git a/compute_kernel_writer/prototype/examples/add_exp_store.cpp b/compute_kernel_writer/prototype/examples/add_exp_store.cpp new file mode 100644 index 0000000000..9ee21957f1 --- /dev/null +++ b/compute_kernel_writer/prototype/examples/add_exp_store.cpp @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "ckw/Error.h" +#include "ckw/KernelWriter.h" +#include "ckw/TensorOperand.h" +#include "ckw/TensorTileSampler.h" +#include "ckw/TileOperand.h" +#include "ckw/Types.h" + +#include "common/ExampleComponentArgument.h" +#include "common/ExampleKernelWriter.h" +#include "common/ExampleScopedKernelWriter.h" + +#include +#include + +using namespace ckw; + +TensorTileSampler create_simple_sampler(ExampleScopedKernelWriter writer) +{ + TensorTileSampler sampler; + + constexpr int32_t m0 = 4; + constexpr int32_t n0 = 4; + + auto &gid_0 = writer->declare_tile("gid_0", DataType::Int32); + auto &gid_1 = writer->declare_tile("gid_1", DataType::Int32); + auto &gid_2 = writer->declare_tile("gid_2", DataType::Int32); + + auto &const_0 = writer->declare_tile("0", 0); + + writer->op_get_global_id(gid_0, 0); + writer->op_get_global_id(gid_1, 1); + writer->op_get_global_id(gid_2, 2); + + sampler.x(gid_0); + sampler.y(gid_1); + sampler.z(const_0); + sampler.b(gid_2); + + sampler.width(n0); + sampler.height(m0); + + sampler.format(TensorSamplerFormat::C_WH_1); + sampler.address_mode_x(TensorSamplerAddressModeX::None); + sampler.address_mode_y(TensorSamplerAddressModeY::ClampToBorder); + sampler.address_mode_z(TensorSamplerAddressModeZ::Skip); + + return sampler; +} + +void op_binary_elementwise(ExampleScopedKernelWriter writer, std::vector operands) +{ + auto lhs = operands.at(0); + auto rhs = operands.at(1); + auto dst = operands.at(2); + + // Load the LHS and RHS tile and prepare the tensor sampler. + if(!lhs->has_tile() && !rhs->has_tile()) + { + const auto sampler = create_simple_sampler(writer); + + writer->op_load_once(lhs, sampler); + writer->op_load_once(rhs, sampler); + } + else if(lhs->has_tile()) + { + const auto &sampler = lhs->tile_sampler(); + writer->op_load_once(rhs, sampler); + } + else + { + const auto &sampler = rhs->tile_sampler(); + writer->op_load_once(lhs, sampler); + } + + auto &lhs_tile = lhs->tile(); + auto &rhs_tile = rhs->tile(); + const auto &sampler = lhs->tile_sampler(); + + // Prepare the output tile. + if(!dst->has_tile()) + { + auto &tile = writer->declare_tile("dst_tile", lhs_tile.tile_info()); + dst->init_virtual_tensor(tile, sampler); + } + + auto &dst_tile = dst->tile(); + + // Perform the operation. + writer->op_binary_expression(dst_tile, lhs_tile, rhs_tile, BinaryOp::Add); +} + +void op_exp(ExampleScopedKernelWriter writer, std::vector operands) +{ + auto src = operands.at(0); + auto dst = operands.at(1); + + // Load the source tile and prepare the sampler. + if(!src->has_tile()) + { + const auto sampler = create_simple_sampler(writer); + writer->op_load_once(src, sampler); + } + + auto &src_tile = src->tile(); + const auto &sampler = src->tile_sampler(); + + // Prepare the output tile. + if(!dst->has_tile()) + { + auto &tile = writer->declare_tile("dst_tile", src_tile.tile_info()); + dst->init_virtual_tensor(tile, sampler); + } + + auto &dst_tile = dst->tile(); + + // Perform the operation. 
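+    // ScalarUnaryFunction::Exp applies the exponential element-wise, i.e. dst_tile[i] = exp(src_tile[i]).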
+ writer->op_scalar_function(dst_tile, src_tile, ScalarUnaryFunction::Exp); +} + +void op_store(ExampleScopedKernelWriter writer, std::vector operands) +{ + auto src = operands.at(0); + auto dst = operands.at(1); + + auto &src_tile = src->tile(); + const auto &sampler = src->tile_sampler(); + auto &dst_tensor = dst->tensor(); + + writer->op_store(dst_tensor, src_tile, sampler); +} + +int main() +{ + Kernel kernel("example", GpuTargetLanguage::OpenCL); + ExampleKernelWriter root_writer(kernel); + + ExampleScopedKernelWriter writer(&root_writer); + + const TensorInfo src0_info(DataType::Fp32, TensorShape({ 3, 10, 20, 1, 1 }), TensorDataLayout::Nhwc, 0); + const TensorInfo src1_info(DataType::Fp32, TensorShape({ 3, 10, 20, 1, 1 }), TensorDataLayout::Nhwc, 1); + const TensorInfo dst_info(DataType::Fp32, TensorShape({ 3, 10, 20, 1, 1 }), TensorDataLayout::Nhwc, 2); + + ExampleComponentArgument src0(writer->create_tensor_argument("src0", src0_info)); + ExampleComponentArgument src1(writer->create_tensor_argument("src1", src1_info)); + ExampleComponentArgument dst(writer->create_tensor_argument("dst", dst_info)); + + ExampleComponentArgument ans; + + op_binary_elementwise(writer, { &src0, &src1, &ans }); + op_exp(writer, { &ans, &ans }); + op_store(writer, { &ans, &dst }); + + const auto code = root_writer.generate_code(); + std::cout << code; + + return 0; +} diff --git a/compute_kernel_writer/prototype/examples/common/ExampleComponentArgument.cpp b/compute_kernel_writer/prototype/examples/common/ExampleComponentArgument.cpp new file mode 100644 index 0000000000..da5156ca8c --- /dev/null +++ b/compute_kernel_writer/prototype/examples/common/ExampleComponentArgument.cpp @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "ExampleComponentArgument.h" +#include "ckw/Error.h" + +ExampleComponentArgument::ExampleComponentArgument() +{ +} + +ExampleComponentArgument::ExampleComponentArgument(ckw::TensorOperand &tensor) + : _tensor(&tensor) +{ +} + +ExampleComponentArgument &ExampleComponentArgument::init_virtual_tensor(ckw::TileOperand &tile, const ckw::TensorTileSampler &tile_sampler) +{ + CKW_ASSERT(_tile == nullptr); + + _tile = &tile; + _tile_sampler = tile_sampler; + + return *this; +} + +bool ExampleComponentArgument::has_tensor() const +{ + return _tensor != nullptr; +} + +ckw::TensorOperand &ExampleComponentArgument::tensor() +{ + CKW_ASSERT(_tensor != nullptr); + + return *_tensor; +} + +const ckw::TensorOperand &ExampleComponentArgument::tensor() const +{ + CKW_ASSERT(_tensor != nullptr); + + return *_tensor; +} + +bool ExampleComponentArgument::has_tile() const +{ + return _tile != nullptr; +} + +ckw::TileOperand &ExampleComponentArgument::tile() +{ + CKW_ASSERT(_tile != nullptr); + + return *_tile; +} + +const ckw::TileOperand &ExampleComponentArgument::tile() const +{ + CKW_ASSERT(_tile != nullptr); + + return *_tile; +} + +ckw::TensorTileSampler &ExampleComponentArgument::tile_sampler() +{ + CKW_ASSERT(_tile != nullptr); + + return _tile_sampler; +} + +const ckw::TensorTileSampler &ExampleComponentArgument::tile_sampler() const +{ + CKW_ASSERT(_tile != nullptr); + + return _tile_sampler; +} diff --git a/compute_kernel_writer/prototype/examples/common/ExampleComponentArgument.h b/compute_kernel_writer/prototype/examples/common/ExampleComponentArgument.h new file mode 100644 index 0000000000..2de9042691 --- /dev/null +++ b/compute_kernel_writer/prototype/examples/common/ExampleComponentArgument.h @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef CKW_PROTOTYPE_EXAMPLES_COMMON_EXAMPLECOMPONENTARGUMENT_H +#define CKW_PROTOTYPE_EXAMPLES_COMMON_EXAMPLECOMPONENTARGUMENT_H + +#include "ckw/TensorTileSampler.h" + +namespace ckw +{ +class TensorOperand; +class TileOperand; +} // namespace ckw + +/** The argument of a dynamic fusion component which can be either user tensor or virtual tensor. */ +class ExampleComponentArgument +{ +public: + /** Initialize a new instance of @ref ExampleComponentArgument class for empty virtual tensor. */ + ExampleComponentArgument(); + + /** Initialize a new instance of @ref ExampleComponentArgument class for user tensor. 
+ * + * @param[in] tensor The user tensor. + */ + explicit ExampleComponentArgument(ckw::TensorOperand &tensor); + + /** Set virtual tensor information (tile, sampler) for the argument. + * + * If the component is a user tensor, it can be treated as virtual tensor as well + * and won't be loaded again using @ref ExampleKernelWriter::op_load_once method. + * + * @param[in] tile The tile that has been loaded. + * @param[in] sampler The tensor sampling information that has been used to load the tile. + */ + ExampleComponentArgument &init_virtual_tensor(ckw::TileOperand &tile, const ckw::TensorTileSampler &sampler); + + /** Get whether the argument is a user tensor. */ + bool has_tensor() const; + + /** Get the tensor operand. + * + * If the tensor is not available, throw an error. + */ + ckw::TensorOperand &tensor(); + + /** Get the tensor operand. + * + * If the tensor is not available, throw an error. + */ + const ckw::TensorOperand &tensor() const; + + /** Get whether the argument contains a tile. + * + * The argument can be either a user tensor that has been loaded, + * or a virtual tensor (i.e. a tile with tensor sampling information). + */ + bool has_tile() const; + + /** Get the tile operand. + * + * If the tile is not available, throw an error. + */ + ckw::TileOperand &tile(); + + /** Get the tile operand. + * + * If the tile is not available, throw an error. + */ + const ckw::TileOperand &tile() const; + + /** Get the tensor sampling information for the tile. + * + * If the tile is not available, throw an error. + */ + ckw::TensorTileSampler &tile_sampler(); + + /** Get the tensor sampling information for the tile. + * + * If the tile is not available, throw an error. + */ + const ckw::TensorTileSampler &tile_sampler() const; + +private: + ckw::TensorOperand *_tensor{ nullptr }; + ckw::TileOperand *_tile{ nullptr }; + ckw::TensorTileSampler _tile_sampler{}; +}; + +#endif // CKW_PROTOTYPE_EXAMPLES_COMMON_EXAMPLECOMPONENTARGUMENT_H diff --git a/compute_kernel_writer/prototype/examples/common/ExampleKernelWriter.cpp b/compute_kernel_writer/prototype/examples/common/ExampleKernelWriter.cpp new file mode 100644 index 0000000000..2c11ae36e5 --- /dev/null +++ b/compute_kernel_writer/prototype/examples/common/ExampleKernelWriter.cpp @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "ExampleKernelWriter.h" +#include "ExampleComponentArgument.h" +#include "ckw/Error.h" +#include "ckw/TileInfo.h" + +ExampleKernelWriter::ExampleKernelWriter(ckw::Kernel &kernel) + : KernelWriter(kernel) +{ +} + +void ExampleKernelWriter::op_load_once(ExampleComponentArgument *tensor_or_tile, const ckw::TensorTileSampler &sampler) +{ + if(!tensor_or_tile->has_tile()) + { + CKW_ASSERT(tensor_or_tile->has_tensor()); + + auto &tensor = tensor_or_tile->tensor(); + + const auto tile_name = tensor.name() + "_tile"; + auto &tile = declare_tile(tile_name.c_str(), ckw::TileInfo(tensor.data_type(), sampler.height(), sampler.width())); + + op_load(tile, tensor, sampler); + + tensor_or_tile->init_virtual_tensor(tile, sampler); + } +} diff --git a/compute_kernel_writer/prototype/examples/common/ExampleKernelWriter.h b/compute_kernel_writer/prototype/examples/common/ExampleKernelWriter.h new file mode 100644 index 0000000000..1528c3d933 --- /dev/null +++ b/compute_kernel_writer/prototype/examples/common/ExampleKernelWriter.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef CKW_PROTOTYPE_EXAMPLES_COMMON_EXAMPLEKERNELWRITER_H +#define CKW_PROTOTYPE_EXAMPLES_COMMON_EXAMPLEKERNELWRITER_H + +#include "ckw/KernelWriter.h" +#include "ckw/TensorTileSampler.h" + +class ExampleComponentArgument; + +namespace ckw +{ +class Kernel; +} // namespace ckw + +/** Extended implementation of kernel writer for dynamic fusion. */ +class ExampleKernelWriter : public ckw::KernelWriter +{ +public: + /** Initialize a new instance of @ref ExampleKernelWriter class. + * + * @param[in] kernel The kernel to be generated. + */ + explicit ExampleKernelWriter(ckw::Kernel &kernel); + + /** Load the user tensor to the tile in the same component argument if it hasn't been loaded. + * + * @param[in] tensor_or_tile The component argument that is either a user tensor or a virtual tensor. + * @param[in] sampler The tensor sampling information to load the tile. 
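+     *
+     * If the argument already holds a tile (i.e. it is a virtual tensor), this method does nothing.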
+ */ + void op_load_once(ExampleComponentArgument *tensor_or_tile, const ckw::TensorTileSampler &sampler); +}; + +#endif // CKW_PROTOTYPE_EXAMPLES_COMMON_EXAMPLEKERNELWRITER_H diff --git a/compute_kernel_writer/prototype/examples/common/ExampleScopedKernelWriter.cpp b/compute_kernel_writer/prototype/examples/common/ExampleScopedKernelWriter.cpp new file mode 100644 index 0000000000..7c44fa8749 --- /dev/null +++ b/compute_kernel_writer/prototype/examples/common/ExampleScopedKernelWriter.cpp @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ExampleScopedKernelWriter.h" +#include "ExampleKernelWriter.h" + +ExampleScopedKernelWriter::ExampleScopedKernelWriter(ExampleKernelWriter *writer) + : _writer(writer), _parent_id_space(writer->id_space()) +{ + _writer->next_id_space(); +} + +ExampleScopedKernelWriter::ExampleScopedKernelWriter(const ExampleScopedKernelWriter &other) + : _writer(other._writer), _parent_id_space(other._writer->id_space()) +{ + _writer->next_id_space(); +} + +ExampleKernelWriter *ExampleScopedKernelWriter::operator->() +{ + return _writer; +} + +const ExampleKernelWriter *ExampleScopedKernelWriter::operator->() const +{ + return _writer; +} + +ExampleKernelWriter *ExampleScopedKernelWriter::writer() +{ + return _writer; +} + +const ExampleKernelWriter *ExampleScopedKernelWriter::writer() const +{ + return _writer; +} diff --git a/compute_kernel_writer/prototype/examples/common/ExampleScopedKernelWriter.h b/compute_kernel_writer/prototype/examples/common/ExampleScopedKernelWriter.h new file mode 100644 index 0000000000..1aa0242c51 --- /dev/null +++ b/compute_kernel_writer/prototype/examples/common/ExampleScopedKernelWriter.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef CKW_PROTOTYPE_EXAMPLES_COMMON_EXAMPLESCOPEDKERNELWRITER_H +#define CKW_PROTOTYPE_EXAMPLES_COMMON_EXAMPLESCOPEDKERNELWRITER_H + +#include + +class ExampleKernelWriter; + +/** Helper to automatically manage kernel writer ID space. */ +class ExampleScopedKernelWriter +{ +public: + /** Initialize a new instance of @ref ExampleScopedKernelWriter class. */ + explicit ExampleScopedKernelWriter(ExampleKernelWriter *writer); + + /** Create a new scope from the specified scoped kernel writer. */ + ExampleScopedKernelWriter(const ExampleScopedKernelWriter &other); + + /** Assignment is disallowed. */ + ExampleScopedKernelWriter &operator=(const ExampleScopedKernelWriter &) = delete; + + /** Access the underlying kernel writer. */ + ExampleKernelWriter *operator->(); + + /** Access the underlying kernel writer. */ + const ExampleKernelWriter *operator->() const; + + /** Get the kernel writer. */ + ExampleKernelWriter *writer(); + + /** Get the kernel writer. */ + const ExampleKernelWriter *writer() const; + +private: + ExampleKernelWriter *_writer; + int32_t _parent_id_space; +}; + +#endif // CKW_PROTOTYPE_EXAMPLES_COMMON_EXAMPLESCOPEDKERNELWRITER_H diff --git a/compute_kernel_writer/prototype/include/ckw/Error.h b/compute_kernel_writer/prototype/include/ckw/Error.h new file mode 100644 index 0000000000..b18944eac5 --- /dev/null +++ b/compute_kernel_writer/prototype/include/ckw/Error.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef CKW_PROTOTYPE_INCLUDE_CKW_ERROR_H +#define CKW_PROTOTYPE_INCLUDE_CKW_ERROR_H + +#include +#include + +namespace ckw +{ + +/** If the condition is not met, throw an std::runtime_error with the specified message. + * + * @param[in] cond The condition that is expected to be true. + * @param[in] msg The error message when the condition is not met. 
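+ *
+ * Illustrative usage (identifiers are placeholders): CKW_ASSERT_MSG(tile_height > 0, "Tile height must be positive");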
+ */ +#define CKW_ASSERT_MSG(cond, msg) \ + do \ + { \ + if(!(cond)) \ + { \ + throw ::std::runtime_error(msg); \ + } \ + } while(false) + +/** If the condition is not met, throw an std::runtime_error. + * + * @param[in] cond The condition that is expected to be true. + */ +#define CKW_ASSERT(cond) CKW_ASSERT_MSG(cond, #cond) + +/** If the precondition is met but the consequence is not met, throw an std::runtime_error. + * + * @param[in] precond The condition if is met requires the consequence must also be met. + * @param[in] cond The condition that is expected to be true if the precondition is true. + */ +#define CKW_ASSERT_IF(precond, cond) \ + CKW_ASSERT_MSG(!(precond) || ((precond) && (cond)), #precond " |-> " #cond) + +/** Mark the variables as unused. + * + * @param[in] ... Variables which are unused. + */ +#define CKW_UNUSED(...) ::ckw::ignore_unused(__VA_ARGS__) // NOLINT + +/** Mark the variables as unused. + * + * @param[in] ... Variables which are unused. + */ +template +inline void ignore_unused(T &&...) +{ +} + +} // namespace ckw + +#endif // CKW_INCLUDE_CKW_ERROR_H diff --git a/compute_kernel_writer/prototype/include/ckw/Kernel.h b/compute_kernel_writer/prototype/include/ckw/Kernel.h new file mode 100644 index 0000000000..57a8a40341 --- /dev/null +++ b/compute_kernel_writer/prototype/include/ckw/Kernel.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef CKW_PROTOTYPE_INCLUDE_CKW_KERNEL_H +#define CKW_PROTOTYPE_INCLUDE_CKW_KERNEL_H + +#include "ckw/OperandBase.h" +#include "ckw/Types.h" + +#include +#include +#include + +namespace ckw +{ + +namespace prototype +{ +class GpuKernelWriterDataHolder; +} // namespace prototype + +/** The target for kernel writer to write into. */ +class Kernel +{ +public: + /** Constructor + * + * @param[in] name The name of the kernel function. + * @param[in] language The programming language to write the kernel. + */ + Kernel(const char *name, GpuTargetLanguage language); + + /** Destructor */ + ~Kernel(); + + /** Get the name of the kernel function. */ + const std::string &name() const; + + /** (Internal use only) Get the map from operand name to the operand declared in this kernel. */ + const ::std::map<::std::string, ::std::unique_ptr> &operands() const; + + /** (Internal use only) Get the map from operand name to the operand declared in this kernel. 
*/ + ::std::map<::std::string, ::std::unique_ptr> &operands(); + + /** (Internal use only) Get the implementation data. */ + prototype::GpuKernelWriterDataHolder *impl(); + +private: + ::std::string _name; + ::std::unique_ptr _kernel; + ::std::map<::std::string, ::std::unique_ptr> _operands; +}; + +} // namespace ckw + +#endif // CKW_PROTOTYPE_INCLUDE_CKW_KERNEL_H diff --git a/compute_kernel_writer/prototype/include/ckw/KernelWriter.h b/compute_kernel_writer/prototype/include/ckw/KernelWriter.h new file mode 100644 index 0000000000..a2778a9485 --- /dev/null +++ b/compute_kernel_writer/prototype/include/ckw/KernelWriter.h @@ -0,0 +1,217 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef CKW_PROTOTYPE_INCLUDE_CKW_KERNELWRITER_H +#define CKW_PROTOTYPE_INCLUDE_CKW_KERNELWRITER_H + +#include "ckw/Kernel.h" +#include "ckw/TensorInfo.h" +#include "ckw/TensorOperand.h" +#include "ckw/TileInfo.h" +#include "ckw/TileOperand.h" + +#include + +namespace ckw +{ + +namespace prototype +{ +struct GpuKernelWriterAttribute; +class IGpuKernelWriter; +} // namespace prototype + +/** Kernel writer. */ +class KernelWriter +{ +public: + // ============================================================================================= + // Constructors and destructor + // ============================================================================================= + + /** Initialize a new instance of kernel writer. + * + * @param[in] kernel The kernel to be written to. + */ + explicit KernelWriter(Kernel &kernel); + + /** Destructor */ + ~KernelWriter(); + + /** No copy constructor. */ + KernelWriter(const KernelWriter &) = delete; + + /** No copy assignment. */ + KernelWriter &operator=(const KernelWriter &) = delete; + + // ============================================================================================= + // Scope management + // ============================================================================================= + + /** Get the current ID space. */ + int32_t id_space() const; + + /** Set the current ID space. */ + KernelWriter &id_space(int32_t id_space); + + /** Switch to and return a new ID space. 
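+     *
+     * Each ID space produces a distinct set of generated variable names, so tiles declared
+     * by different components in the same kernel do not clash.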
*/ + int32_t next_id_space(); + + // ============================================================================================= + // Tensor and tile declaration + // ============================================================================================= + + /** Define a tensor argument. + * + * @param[in] name The name of the tensor. + * @param[in] info The tensor info. + * + * @return The @ref TensorOperand object. + */ + TensorOperand &create_tensor_argument(const char *name, const TensorInfo &info); + + /** Define a compile-time constant scalar argument. + * + * @param[in] name The name of the tile. + * @param[in] value The value of the tile. + * + * @return The @ref TileOperand object. + */ + TileOperand &create_tile_argument(const char *name, int32_t value); + + /** Declare a new tile. + * + * The name of the tile must be unique in the current ID space. + * + * @param[in] name The name of the tile. + * @param[in] ... The necessary arguments to create a new @ref TileOperand. + * + * @return The @ref TileOperand object. + */ + template + TileOperand &declare_tile(const char *name, TArgs &&...args) + { + const auto var_name = generate_variable_name(name); + auto operand = new TileOperand(var_name, ::std::forward(args)...); + register_operand(operand, true); + + return *operand; + } + + // ============================================================================================= + // Load and store + // ============================================================================================= + + /** Load the data from the tensor memory to the tile using the sampling information. + * + * @param[out] tile The tile to be loaded. + * @param[in] tensor The tensor to be read. + * @param[in] sampler The tensor sampling information. + */ + void op_load(TileOperand &tile, TensorOperand &tensor, const TensorTileSampler &sampler); + + /** Store the tile to the tensor using the specified sampling information. + * + * @param[out] dst The tensor that the tile is written to. + * @param[in] src The tile to be stored. + * @param[in] sampler The tensor sampling information. + */ + void op_store(TensorOperand &tensor, const TileOperand &tile, const TensorTileSampler &sampler); + + // ============================================================================================= + // Data processing + // ============================================================================================= + + /** Write assignment: ` = `. + * + * @param[in] dst The destination tile. + * @param[in] src The source tile. + */ + void op_assign(TileOperand &dst, const TileOperand &src); + + /** Write binary expression: ` = `. + * + * @param[in] dst The destination tile. + * @param[in] lhs The LHS operand. + * @param[in] rhs The RHS operand. + * @param[in] op The binary operator. + */ + void op_binary_expression(TileOperand &dst, const TileOperand &lhs, const TileOperand &rhs, BinaryOp op); + + /** Write function applied to scalar value: ` = ()`. + * + * @param[in] dst The destination tile. + * @param[in] src The source tile. + * @param[in] func The function to be applied to the source tile. + */ + void op_scalar_function(TileOperand &dst, const TileOperand &src, ScalarUnaryFunction func); + + // ============================================================================================= + // Misc + // ============================================================================================= + + /** Set `dst` the global ID of dimension `dim`. + * + * @param[in] dst The tile to be written to. 
+ * @param[in] dim The global ID dimension. + */ + void op_get_global_id(TileOperand &dst, int32_t dim); + + // ============================================================================================= + // Code generation + // ============================================================================================= + + /** Generate the source code of the kernel. */ + ::std::string generate_code(); + +private: + /** Generate the full variable name based on the original name and the ID space. + * + * @param[in] name The name of the variable. + * + * @return The full variable name. + */ + ::std::string generate_variable_name(const char *name) const; + + /** Register the operand to the kernel. + * + * The operand is uniquely owned by the kernel afterward. + * + * @param[in] operand The operand to be registered. + * @param[in] declaring Whether the tile declaration is generated. + */ + void register_operand(OperandBase *operand, bool declaring); + +private: + Kernel *_kernel; + ::std::unique_ptr _impl_attr; + ::std::unique_ptr _impl; + + int32_t _id_space{ 0 }; + int32_t _max_id_space{ 0 }; +}; + +} // namespace ckw + +#endif // CKW_PROTOTYPE_INCLUDE_CKW_KERNELWRITER_H diff --git a/compute_kernel_writer/prototype/include/ckw/OperandBase.h b/compute_kernel_writer/prototype/include/ckw/OperandBase.h new file mode 100644 index 0000000000..f4825e16a7 --- /dev/null +++ b/compute_kernel_writer/prototype/include/ckw/OperandBase.h @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef CKW_PROTOTYPE_INCLUDE_CKW_OPERANDBASE_H +#define CKW_PROTOTYPE_INCLUDE_CKW_OPERANDBASE_H + +#include "ckw/Types.h" +#include + +namespace ckw +{ +namespace prototype +{ +class IGpuKernelWriter; +class Operand; +} // namespace prototype + +/** The base class for all operands. */ +class OperandBase +{ +public: + /** Constructor + * + * @param[in] name The name of the operand. + */ + explicit OperandBase(const ::std::string &name); + + /** Destructor */ + virtual ~OperandBase(); + + /** (Internal use only) Create the implementation operand. + * + * @param[in] writer The implementation kernel writer. + */ + virtual prototype::Operand create_impl_operand(prototype::IGpuKernelWriter *writer) const = 0; + + /** Get the name of the operand. */ + const ::std::string &name() const; + + /** Set the name of the operand. 
*/ + OperandBase &name(const ::std::string &name); + + /** Get the data type of the operand. */ + virtual DataType data_type() const = 0; + + /** Get whether the operand is compile-time constant. */ + virtual bool is_constant() const = 0; + +private: + ::std::string _name; +}; + +} // namespace ckw + +#endif // CKW_PROTOTYPE_INCLUDE_CKW_OPERANDBASE_H diff --git a/compute_kernel_writer/prototype/include/ckw/ScalarValue.h b/compute_kernel_writer/prototype/include/ckw/ScalarValue.h new file mode 100644 index 0000000000..16c3f6d441 --- /dev/null +++ b/compute_kernel_writer/prototype/include/ckw/ScalarValue.h @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef CKW_PROTOTYPE_INCLUDE_CKW_SCALARVALUE_H +#define CKW_PROTOTYPE_INCLUDE_CKW_SCALARVALUE_H + +#include "ckw/Error.h" + +#include + +namespace ckw +{ + +/** The scalar value known at compile-time. */ +class ScalarValue +{ +public: + /** Initialize a new instance of @ref ScalarValue class with integer value 0. */ + ScalarValue() + { + _type = Type::INT; + _value.i64 = 0; + } + + /** Initialize a new instance of @ref ScalarValue class with the specified value. */ + template + ScalarValue(T value) + { + set(value); + } + + /** Set the value. */ + template + void set(T value) + { + CKW_ASSERT(::std::is_integral::value || ::std::is_floating_point::value); + CKW_ASSERT(sizeof(T) <= 8); + + _size = sizeof(T); + + if(::std::is_integral::value) + { + if(::std::is_signed::value) + { + _type = Type::INT; + _value.i64 = value; + } + else + { + _type = Type::UINT; + _value.u64 = value; + } + } + else + { + _type = Type::FLOAT; + _value.f64 = value; + } + } + + /** Get the value. + * + * The caller must make sure that what has been stored in the object must fit + * the output data type without data corruption or loss of accuracy. 
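+     *
+     * For example, a value stored as int32_t can be read back as int64_t, whereas reading it
+     * as int8_t would fail the internal size assertion.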
+ */ + template + T get() const + { + CKW_ASSERT(::std::is_integral::value || ::std::is_floating_point::value); + CKW_ASSERT(sizeof(T) >= _size); + + if(::std::is_integral::value) + { + if(::std::is_signed::value) + { + CKW_ASSERT(_type == Type::INT || _type == Type::UINT); + CKW_ASSERT_IF(_type == Type::UINT, sizeof(T) > _size); + + return _value.i64; + } + else + { + CKW_ASSERT(_type == Type::INT); + + return _value.u64; + } + } + else + { + return _value.f64; + } + } + +private: + union Value + { + int64_t i64; + uint64_t u64; + double f64; + }; + + enum class Type : int32_t + { + UINT, + INT, + FLOAT, + }; + + Value _value{}; + Type _type{}; + uint32_t _size{}; +}; + +} // namespace ckw + +#endif // CKW_PROTOTYPE_INCLUDE_CKW_SCALARVALUE_H diff --git a/compute_kernel_writer/prototype/include/ckw/TensorInfo.h b/compute_kernel_writer/prototype/include/ckw/TensorInfo.h new file mode 100644 index 0000000000..00bb60a444 --- /dev/null +++ b/compute_kernel_writer/prototype/include/ckw/TensorInfo.h @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef CKW_PROTOTYPE_INCLUDE_CKW_TENSORINFO_H +#define CKW_PROTOTYPE_INCLUDE_CKW_TENSORINFO_H + +#include "ckw/Types.h" + +#include +#include + +namespace ckw +{ +/** Compute Kernel Writer tensor data layout (or memory format) */ +enum class TensorDataLayout +{ + Unknown, + Nhwc, + Ndhwc +}; + +/** Compute Kernel Writer tensor data layout component */ +enum class TensorDataLayoutComponent +{ + Unknown, + N, + D, + H, + W, + C, +}; + +/** Compute Kernel Writer tensor component bitmask. The bitmask can be used to retrieve + * the info from @ref TensorComponent. + */ +enum class TensorComponentBitmask : uint32_t +{ + OffsetFirstElement = 0x01000000, // For example, OffsetFirstElement in @ref TensorComponent + Stride = 0x02000000, // For example, stride0 in @ref TensorComponent + Dimension = 0x04000000, // For example, Dim0 in @ref TensorComponent + FoldedDimensions = 0x08000000, // For example, Dim0xDim1 in @ref TensorComponent +}; + +/** Compute Kernel Writer tensor component. The tensor components are used to access specific backend-agnostic tensor arguments, + * such as the tensor dimensions and tensor strides. + * The data type is represented as an integer. The value of the integer value + * is assigned to retrieve the information through the @ref TensorComponentBitmask. 
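+ * For example, Stride1 (0x02000010) carries the Stride bitmask (0x02000000) in its most
+ * significant byte, so masking a value against @ref TensorComponentBitmask reveals which
+ * kind of information it encodes.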
+ */
+enum class TensorComponent : uint32_t
+{
+    Unknown            = 0x00000000,
+    OffsetFirstElement = 0x01000000,
+    Stride0            = 0x02000001,
+    Stride1            = 0x02000010,
+    Stride2            = 0x02000100,
+    Stride3            = 0x02001000,
+    Stride4            = 0x02010000,
+    Dim0               = 0x04000001,
+    Dim1               = 0x04000010,
+    Dim2               = 0x04000100,
+    Dim3               = 0x04001000,
+    Dim4               = 0x04010000,
+    Dim1xDim2          = 0x08000110,
+    Dim2xDim3          = 0x08001100,
+    Dim1xDim2xDim3     = 0x08001110
+};
+
+/** Compute Kernel Writer tensor storage. The tensor storage represents the type of tensor memory object.
+ */
+enum class TensorStorage : uint32_t
+{
+    Unknown            = 0x00000000,
+    BufferUint8Ptr     = 0x01000000,
+    Texture2dReadOnly  = 0x02000001,
+    Texture2dWriteOnly = 0x02000010,
+};
+
+/** Compute Kernel Writer tensor shape
+ *  Negative dimensions can be interpreted as dynamic dimensions by the Compute Kernel Writer
+ */
+using TensorShape = std::array<int32_t, 5>;
+
+/** Compute Kernel Writer tensor info */
+class TensorInfo
+{
+public:
+    /** Constructor
+     *
+     * @param[in] dt    Tensor data type
+     * @param[in] shape Tensor shape
+     * @param[in] dl    Tensor data layout
+     * @param[in] id    Tensor id. The id is used to keep track of the bound user tensor. Through the id,
+     *                  the user can know what tensor has been used by the Compute Kernel Writer.
+     *                  Possible id values:
+     *                  - greater than or equal to 0: bind a user-specific tensor
+     *                  - less than 0: bind a virtual tensor (tile)
+     */
+    TensorInfo(DataType dt, const TensorShape &shape, TensorDataLayout dl, int32_t id);
+    /** Set shape */
+    TensorInfo &shape(const TensorShape &shape);
+    /** Get shape */
+    TensorShape shape() const;
+    /** Set data type */
+    TensorInfo &data_type(DataType dt);
+    /** Get data type */
+    DataType data_type() const;
+    /** Set data layout */
+    TensorInfo &data_layout(TensorDataLayout dl);
+    /** Get data layout */
+    TensorDataLayout data_layout() const;
+    /** Set id */
+    TensorInfo &id(int32_t id);
+    /** Get id */
+    int32_t id() const;
+
+private:
+    TensorShape      _shape{ { 0 } };
+    DataType         _dt{ DataType::Unknown };
+    TensorDataLayout _dl{ TensorDataLayout::Unknown };
+    int32_t          _id{ -1 };
+};
+} // namespace ckw
+
+#endif /* CKW_PROTOTYPE_INCLUDE_CKW_TENSORINFO_H */
diff --git a/compute_kernel_writer/prototype/include/ckw/TensorOperand.h b/compute_kernel_writer/prototype/include/ckw/TensorOperand.h
new file mode 100644
index 0000000000..2fc5044d1c
--- /dev/null
+++ b/compute_kernel_writer/prototype/include/ckw/TensorOperand.h
@@ -0,0 +1,181 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef CKW_PROTOTYPE_INCLUDE_CKW_TENSOROPERAND_H +#define CKW_PROTOTYPE_INCLUDE_CKW_TENSOROPERAND_H + +#include "ckw/OperandBase.h" +#include "ckw/TensorInfo.h" +#include "ckw/TensorTileSampler.h" +#include "ckw/TileOperand.h" +#include "ckw/Types.h" + +#include + +namespace ckw +{ + +class TensorComponentOperand; + +// ================================================================================================= +// TensorOperand +// ================================================================================================= + +/** Tensor operand */ +class TensorOperand : public OperandBase +{ +public: + /** Initialize a new instance of @ref TensorOperand class. + * + * @param[in] name The name of the tensor. + * @param[in] info The tensor info. + */ + TensorOperand(const ::std::string &name, const TensorInfo &info); + + /** No copy constructor. */ + TensorOperand(const TensorOperand &other) = delete; + + /** No copy assignment. */ + TensorOperand &operator=(const TensorOperand &other) = delete; + + /** (Internal use only) Create the implementation operand. + * + * @param[in] writer The implementation kernel writer. + */ + virtual prototype::Operand create_impl_operand(prototype::IGpuKernelWriter *writer) const override; + + /** Get the tensor info. */ + const TensorInfo &info() const; + + /** Get the tensor info. */ + TensorInfo &info(); + + /** Get the data type. */ + virtual DataType data_type() const override; + + /** Get whether the tensor is compile-time constant. */ + virtual bool is_constant() const override; + + /** Get the default tile attached to the tensor. */ + const TileOperand &tile() const; + + /** Get the default tile attached to the tensor. */ + TileOperand &tile(); + + /** Set the default tile attached to the tensor. */ + TensorOperand &tile(TileOperand &tile); + + /** Get the tensor sampler of the default tile. */ + const TensorTileSampler &tile_sampler() const; + + /** Get the tensor sampler of the default tile. */ + TensorTileSampler &tile_sampler(); + + /** Set the tensor sampler of the default tile. */ + TensorOperand &tile_sampler(const TensorTileSampler &value); + + /** Get the operand that contains the stride in y dimension of the tensor. */ + TileOperand &stride1(); + + /** Get the operand that contains the stride in z dimension of the tensor. */ + TileOperand &stride2(); + + /** Get the operand that contains the stride in w dimension of the tensor. */ + TileOperand &stride3(); + + /** Get the operand that contains the stride in w dimension of the tensor. */ + TileOperand &stride4(); + + /** Get the operand that contains the size of dimension 0 of the tensor. */ + TileOperand &dim0(); + + /** Get the operand that contains the size of dimension 1 of the tensor. */ + TileOperand &dim1(); + + /** Get the operand that contains the size of dimension 2 of the tensor. */ + TileOperand &dim2(); + + /** Get the operand that contains the size of dimension 3 of the tensor. */ + TileOperand &dim3(); + + /** Get the operand that contains the size of dimension 4 of the tensor. */ + TileOperand &dim4(); + + /** Get the operand that contains the size of dimensions 1 and 2 collapsed. 
*/ + TileOperand &dim1_dim2(); + + /** Get the operand that contains the size of dimensions 1, 2 and 3 collapsed. */ + TileOperand &dim1_dim2_dim3(); + + /** Get the operand that contains the offset in bytes to the first element. */ + TileOperand &offset_first_element_in_bytes(); + +private: + TensorInfo _info; + + TileOperand *_tile{ nullptr }; + TensorTileSampler _tile_sampler{}; + + ::std::unique_ptr _stride1{ nullptr }; + ::std::unique_ptr _stride2{ nullptr }; + ::std::unique_ptr _stride3{ nullptr }; + ::std::unique_ptr _stride4{ nullptr }; + ::std::unique_ptr _dim0{ nullptr }; + ::std::unique_ptr _dim1{ nullptr }; + ::std::unique_ptr _dim2{ nullptr }; + ::std::unique_ptr _dim3{ nullptr }; + ::std::unique_ptr _dim4{ nullptr }; + ::std::unique_ptr _dim1_dim2{ nullptr }; + ::std::unique_ptr _dim1_dim2_dim3{ nullptr }; + ::std::unique_ptr _offset_first_element_in_bytes{ nullptr }; +}; + +// ================================================================================================= +// TensorComponentOperand +// ================================================================================================= + +/** Tile operand that contains tensor information. */ +class TensorComponentOperand : public TileOperand +{ +public: + /** Initialize a new instance of @ref TensorComponentOperand class. + * + * @param[in] name The name of the operand. + * @param[in] component The tensor info component. + */ + TensorComponentOperand(const ::std::string &name, TensorComponent component); + + /** (Internal use only) Create the implementation operand. + * + * @param[in] writer The implementation kernel writer. + */ + virtual prototype::Operand create_impl_operand(prototype::IGpuKernelWriter *writer) const override; + +private: + TensorComponent _component; +}; + +} // namespace ckw + +#endif // CKW_PROTOTYPE_INCLUDE_CKW_TENSOROPERAND_H diff --git a/compute_kernel_writer/prototype/include/ckw/TensorTileSampler.h b/compute_kernel_writer/prototype/include/ckw/TensorTileSampler.h new file mode 100644 index 0000000000..2ea65bce9e --- /dev/null +++ b/compute_kernel_writer/prototype/include/ckw/TensorTileSampler.h @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef CKW_PROTOTYPE_INCLUDE_CKW_TENSORTILESAMPLER_H +#define CKW_PROTOTYPE_INCLUDE_CKW_TENSORTILESAMPLER_H + +#include "ckw/Types.h" +#include + +namespace ckw +{ + +class TileOperand; + +/** Tensor sampler + * + * It contains information about how the result tile should be stored to tensor memory. + * It can also be used to dictate how the subsequent operators fetch the input tensor. + */ +class TensorTileSampler +{ +public: + /** Initialize a new instance of @ref TensorSampler class. */ + TensorTileSampler(); + + /** Initialize a new instance of @ref TensorSampler class. + * + * @param[in] x The coordinate in the x dimension. + * @param[in] y The coordinate in the y dimension. + * @param[in] z The coordinate in the z dimension. + * @param[in] b The coordinate in the batch dimension. + * @param[in] format The tensor data format. + * @param[in] address_mode_x The address mode of the x dimension. + * @param[in] address_mode_y The address mode of the y dimension. + * @param[in] address_mode_z The address mode of the z dimension. + */ + TensorTileSampler( + TileOperand &x, TileOperand &y, TileOperand &z, TileOperand &b, + TensorSamplerFormat format, + TensorSamplerAddressModeX address_mode_x, + TensorSamplerAddressModeY address_mode_y, + TensorSamplerAddressModeZ address_mode_z); + + /** Initialize a new instance of @ref TensorSampler class. + * + * @param[in] x The coordinate in the x dimension. + * @param[in] y The coordinate in the y dimension. + * @param[in] z The coordinate in the z dimension. + * @param[in] b The coordinate in the batch dimension. + * @param[in] height The height of the tile. + * @param[in] width The width of the tile. + * @param[in] format The tensor data format. + * @param[in] address_mode_x The address mode of the x dimension. + * @param[in] address_mode_y The address mode of the y dimension. + * @param[in] address_mode_z The address mode of the z dimension. + */ + TensorTileSampler( + TileOperand &x, TileOperand &y, TileOperand &z, TileOperand &b, + int32_t height, int32_t width, + TensorSamplerFormat format, + TensorSamplerAddressModeX address_mode_x, + TensorSamplerAddressModeY address_mode_y, + TensorSamplerAddressModeZ address_mode_z); + + /** Get the coordinate in the x dimension. */ + const TileOperand &x() const; + + /** Set the coordinate in the x dimension. */ + TensorTileSampler &x(TileOperand &x); + + /** Get the coordinate in the y dimension. */ + const TileOperand &y() const; + + /** Set the coordinate in the y dimension. */ + TensorTileSampler &y(TileOperand &y); + + /** Get the coordinate in the z dimension. */ + const TileOperand &z() const; + + /** Set the coordinate in the z dimension. */ + TensorTileSampler &z(TileOperand &z); + + /** Get the coordinate in the batch dimension. */ + const TileOperand &b() const; + + /** Set the coordinate in the batch dimension. */ + TensorTileSampler &b(TileOperand &b); + + /** Get the width of the tile. */ + int32_t width() const; + + /** Set the width of the tile. */ + TensorTileSampler &width(int32_t width); + + /** Get the height of the tile. */ + int32_t height() const; + + /** Set the height of the tile. */ + TensorTileSampler &height(int32_t height); + + /** Get the format of the tensor. */ + TensorSamplerFormat format() const; + + /** Set the format of the tensor. */ + TensorTileSampler &format(TensorSamplerFormat format); + + /** Get the address mode of the x dimension. */ + TensorSamplerAddressModeX address_mode_x() const; + + /** Set the address mode of the x-dimension. 
*/ + TensorTileSampler &address_mode_x(TensorSamplerAddressModeX address_mode_x); + + /** Get the address mode of the y dimension. */ + TensorSamplerAddressModeY address_mode_y() const; + + /** Set the address mode of the y dimension. */ + TensorTileSampler &address_mode_y(TensorSamplerAddressModeY address_mode_y); + + /** Get the address mode of the z dimension. */ + TensorSamplerAddressModeZ address_mode_z() const; + + /** Set the address mode of the z dimension. */ + TensorTileSampler &address_mode_z(TensorSamplerAddressModeZ address_mode_z); + +private: + TileOperand *_x{ nullptr }; + TileOperand *_y{ nullptr }; + TileOperand *_z{ nullptr }; + TileOperand *_b{ nullptr }; + + int32_t _height{ 0 }; + int32_t _width{ 0 }; + + TensorSamplerFormat _format{ TensorSamplerFormat::Unknown }; + TensorSamplerAddressModeX _address_mode_x{ TensorSamplerAddressModeX::Unknown }; + TensorSamplerAddressModeY _address_mode_y{ TensorSamplerAddressModeY::Unknown }; + TensorSamplerAddressModeZ _address_mode_z{ TensorSamplerAddressModeZ::Unknown }; +}; + +} // namespace ckw + +#endif // CKW_PROTOTYPE_INCLUDE_CKW_TENSORTILESAMPLER_H diff --git a/compute_kernel_writer/prototype/include/ckw/TileInfo.h b/compute_kernel_writer/prototype/include/ckw/TileInfo.h new file mode 100644 index 0000000000..8fba8bb827 --- /dev/null +++ b/compute_kernel_writer/prototype/include/ckw/TileInfo.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef CKW_PROTOTYPE_INCLUDE_CKW_TILEINFO_H +#define CKW_PROTOTYPE_INCLUDE_CKW_TILEINFO_H + +#include "ckw/Types.h" + +#include +#include + +namespace ckw +{ +// Constants to access the tile width and height in the TileShape +constexpr int32_t kTileWidthIdx = 0; +constexpr int32_t kTileHeightIdx = 1; + +/** Compute Kernel Writer tile shape. It is used to define the shape of the tile */ +using TileShape = std::array; + +/** Compute Kernel Writer tile info */ +class TileInfo +{ +public: + /** Constructor used to initialize a scalar variable with a given data type + * + * @param[in] dt Tile data type + */ + TileInfo(DataType dt); + /** Constructor used to initialize a vector with a given data type and vector length. + * + * @param[in] dt Tile data type + * @param[in] w Tile width (or vector length) + */ + TileInfo(DataType dt, int32_t w); + /** Constructor used to initialize a tile with a given data type and tile sizes. 
+ * + * @param[in] dt Tile data type + * @param[in] h Tile height + * @param[in] w Tile width + */ + TileInfo(DataType dt, int32_t h, int32_t w); + /** Set width */ + TileInfo &width(int32_t w); + /** Get width */ + int32_t width() const; + /** Set height */ + TileInfo &height(int32_t h); + /** Get height */ + int32_t height() const; + /** Set data type */ + TileInfo &data_type(DataType dt); + /** Get data type */ + DataType data_type() const; + +private: + DataType _dt{ DataType::Unknown }; + TileShape _shape{}; +}; + +} // namespace ckw + +#endif /* COMPUTE_KERNEL_WRITER_INCLUDE_CKW_TILEINFO_H */ diff --git a/compute_kernel_writer/prototype/include/ckw/TileOperand.h b/compute_kernel_writer/prototype/include/ckw/TileOperand.h new file mode 100644 index 0000000000..c071707a45 --- /dev/null +++ b/compute_kernel_writer/prototype/include/ckw/TileOperand.h @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef CKW_PROTOTYPE_INCLUDE_CKW_TILEOPERAND_H +#define CKW_PROTOTYPE_INCLUDE_CKW_TILEOPERAND_H + +#include "ckw/Error.h" +#include "ckw/OperandBase.h" +#include "ckw/ScalarValue.h" +#include "ckw/TileInfo.h" + +#include + +namespace ckw +{ + +class Kernel; + +/** Tile operand which can be either scalar, vector or 2D tile. */ +class TileOperand : public OperandBase +{ +public: + /** Initialize a new instance of @ref TileOperand class with the tile information. + * + * @param[in] name The name of the tile. + * @param[in] tile_info The tile info. + */ + TileOperand(const ::std::string &name, const TileInfo &tile_info); + + /** Initialize a new instance of @ref TileOperand for scalar variable. + * + * @param[in] name The name of the tile. + * @param[in] data_type The data type of the tile. + */ + TileOperand(const ::std::string &name, DataType data_type); + + /** Initialize a new instance of @ref TileOperand for compile-time constant scalar variable. + * + * @param[in] name The name of the tile. + * @param[in] value The value of the tile. + */ + TileOperand(const ::std::string &name, int32_t value); + + /** Initialize a new instance of @ref TileOperand for compile-time constant scalar variable. + * + * @param[in] name The name of the tile. + * @param[in] value The value of the tile. + */ + TileOperand(const ::std::string &name, float value); + + /** Prohibit copy of tile operand. */ + TileOperand(const TileOperand &) = delete; + + /** Prohibit copy of tile operand. 
*/ + TileOperand &operator=(const TileOperand &) = delete; + + /** (Internal use only) Create the implementation operand. + * + * @param[in] writer The implementation kernel writer. + */ + virtual prototype::Operand create_impl_operand(prototype::IGpuKernelWriter *writer) const override; + + /** Get the tile info. */ + const TileInfo &tile_info() const; + + /** Get the data type of the tile. */ + virtual DataType data_type() const override; + + /** Get whether the tile is compile-time constant. */ + virtual bool is_constant() const override; + + /** Get whether the tile is a scalar value. */ + bool is_scalar() const; + + /** Get the scalar value of the tile. + * + * The tile must have the shape of 1, 1 (i.e. scalar). + */ + ScalarValue scalar_value() const; + +private: + TileInfo _info; + ScalarValue _value{}; + bool _constant; +}; + +} // namespace ckw + +#endif // CKW_PROTOTYPE_INCLUDE_CKW_TILEOPERAND_H diff --git a/compute_kernel_writer/prototype/include/ckw/Types.h b/compute_kernel_writer/prototype/include/ckw/Types.h new file mode 100644 index 0000000000..bb5d7ce077 --- /dev/null +++ b/compute_kernel_writer/prototype/include/ckw/Types.h @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef CKW_PROTOTYPE_INCLUDE_CKW_TYPES_H +#define CKW_PROTOTYPE_INCLUDE_CKW_TYPES_H + +#include +#include + +namespace ckw +{ + +/** Compute Kernel Writer data types. This data type is used by the code variables and tensor arguments. */ +enum class DataType +{ + Unknown = 0x00, + Fp32 = 0x11, + Fp16 = 0x12, + Int32 = 0x21, + Int16 = 0x22, + Int8 = 0x24, + Uint32 = 0x31, + Uint16 = 0x32, + Uint8 = 0x34, + Bool = 0x41 +}; + +enum class GpuTargetLanguage +{ + Unknown, + OpenCL +}; + +/* Binary operations +*/ +enum class BinaryOp : int32_t +{ + // Elementwise + Add = 0x0000, // + + Sub = 0x0001, // - + Mul = 0x0002, // * + Div = 0x0003, // / + Mod = 0x0004, // % + // Relational + Equal = 0x1000, // == + Less = 0x1001, // < + LessEqual = 0x1002, // <= + Greater = 0x1003, // > + GreaterEqual = 0x1004, // >= + // Algebra + MatMul_Nt_Nt = 0x2000, // X + MatMul_Nt_T = 0x2001, // X + MatMul_T_Nt = 0x2002, // X + MatMul_T_T = 0x2003, // X + Dot = 0x2004, // . + // Logical + LogicalAnd = 0x3000, // && + LogicalOr = 0x3001, // || + LogicalNot = 0x3002 // ! 
+}; + +enum class AssignmentOp : int32_t +{ + // Unary + Increment = 0x0000, // += + Decrement = 0x0001, // -= +}; + +enum class ScalarUnaryFunction : int32_t +{ + Exp, +}; + +enum class TensorSamplerFormat : int32_t +{ + Unknown = 0, + C_WH_1 = 1, + C_W_H = 2 +}; + +enum class TensorSamplerAddressModeX : int32_t +{ + Unknown = 0, + None = 1, // The user guarantees that the X coordinate is always in-bound + OverlappingMin = 2 // (FIXED shapes only) Reduce the load/store length when x == 0 (MIN). The load length will be width % original length + // Leftover elements can be handled using overlapping. This involves processing some of the elements in the array twice. +}; + +enum class TensorSamplerAddressModeY : int32_t +{ + Unknown = 0, + None = 1, // The user guarantees that the Y coordinate is always in-bound + OverlappingMin = 2, // (FIXED shapes only) Reduce the load/store length when x == 0 (MIN). The load length will be width % original length + Skip = 3, // Skip the read/write + SkipMinEdgeOnly = 4, // Skip greater than or equal to max only. The user guarantees that the Y coordinate is always >= 0 + SkipMaxEdgeOnly = 5, // Skip less than 0 only + ClampToNearest = 6, // Clamp the coordinate to nearest edge (0 or max value allowed on Y) + ClampToMinEdgeOnly = 7, // Clamp the negative coordinate to 0 only. Therefore, we expect Y to be always < MAX + ClampToMaxEdgeOnly = 8, // Clamp the coordinate to the max value allowed on Y only. We expect Y to be always >= 0 + ClampToBorder = 9, // Clamp to border which always has 0 value + ClampToBorderMinEdgeOnly = 10, + ClampToBorderMaxEdgeOnly = 11 +}; + +enum class TensorSamplerAddressModeZ : int32_t +{ + Unknown = 0, + None = 1, // The user guarantees that the Y coordinate is always in-bound + Skip = 3, // Skip the read/write + SkipMinEdgeOnly = 4, // Skip greater than or equal to max only. The user guarantees that the Y coordinate is always >= 0 + SkipMaxEdgeOnly = 5, // Skip less than 0 only + ClampToNearest = 6, // Clamp the coordinate to nearest edge (0 or max value allowed on Y) + ClampToMinEdgeOnly = 7, // Clamp the negative coordinate to 0 only. Therefore, we expect Y to be always < MAX + ClampToMaxEdgeOnly = 8, // Clamp the coordinate to the max value allowed on Y only. We expect Y to be always >= 0 +}; + +} // namespace ckw + +#endif // CKW_PROTOTYPE_INCLUDE_CKW_TYPES_H diff --git a/compute_kernel_writer/prototype/src/Kernel.cpp b/compute_kernel_writer/prototype/src/Kernel.cpp new file mode 100644 index 0000000000..bbf5c440a7 --- /dev/null +++ b/compute_kernel_writer/prototype/src/Kernel.cpp @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ckw/Kernel.h"
+#include "ckw/Types.h"
+#include "src/Prototype.h"
+
+namespace ckw
+{
+
+Kernel::Kernel(const char *name, GpuTargetLanguage language)
+    : _name(name), _kernel(std::make_unique<prototype::GpuKernelWriterDataHolder>(language)), _operands{}
+{
+}
+
+Kernel::~Kernel()
+{
+}
+
+const std::string &Kernel::name() const
+{
+    return _name;
+}
+
+const std::map<std::string, std::unique_ptr<OperandBase>> &Kernel::operands() const
+{
+    return _operands;
+}
+
+std::map<std::string, std::unique_ptr<OperandBase>> &Kernel::operands()
+{
+    return _operands;
+}
+
+prototype::GpuKernelWriterDataHolder *Kernel::impl()
+{
+    return _kernel.get();
+}
+
+} // namespace ckw
diff --git a/compute_kernel_writer/prototype/src/KernelWriter.cpp b/compute_kernel_writer/prototype/src/KernelWriter.cpp
new file mode 100644
index 0000000000..5d79985e87
--- /dev/null
+++ b/compute_kernel_writer/prototype/src/KernelWriter.cpp
@@ -0,0 +1,227 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ */ + +#include "ckw/KernelWriter.h" +#include "ckw/Error.h" +#include "ckw/TensorOperand.h" +#include "src/Prototype.h" + +#include + +namespace ckw +{ + +namespace +{ + +inline prototype::TensorInfo create_impl_tensor_info(const TensorInfo &info) +{ + return prototype::TensorInfo{ info.shape(), info.data_type(), info.data_layout(), info.id() }; +} + +} // namespace + +// ================================================================================================= +// Constructors and destructor +// ================================================================================================= + +KernelWriter::KernelWriter(Kernel &kernel) + : _kernel(&kernel), + _impl_attr(std::make_unique()), + _impl(prototype::GpuKernelWriterFactory::create(_impl_attr.get(), kernel.impl())) +{ + _impl->set_IdSpace(1); +} + +KernelWriter::~KernelWriter() +{ +} + +// ================================================================================================= +// Scope management +// ================================================================================================= + +int32_t KernelWriter::id_space() const +{ + return _id_space; +} + +KernelWriter &KernelWriter::id_space(int32_t id_space) +{ + CKW_ASSERT(id_space <= _max_id_space); + + _id_space = id_space; + return *this; +} + +int32_t KernelWriter::next_id_space() +{ + id_space(++_max_id_space); + return _id_space; +} + +// ================================================================================================= +// Tensor and tile declaration +// ================================================================================================= + +TensorOperand &KernelWriter::create_tensor_argument(const char *name, const TensorInfo &info) +{ + const auto var_name = generate_variable_name(name); + + _impl->declare_argument(var_name, create_impl_tensor_info(info)); + + auto operand = new TensorOperand(var_name, info); + register_operand(operand, false); + + return *operand; +} + +TileOperand &KernelWriter::create_tile_argument(const char *name, int32_t value) +{ + const auto var_name = generate_variable_name(name); + + auto operand = new TileOperand(var_name, value); + register_operand(operand, false); + + return *operand; +} + +std::string KernelWriter::generate_variable_name(const char *name) const +{ + std::stringstream var_name; + + var_name << "_" << _id_space << "_" << name; + + return var_name.str(); +} + +void KernelWriter::register_operand(OperandBase *operand, bool declaring) +{ + const auto &name = operand->name(); + auto &operands = _kernel->operands(); + + CKW_ASSERT(operands.find(name) == operands.end()); + operands[name] = std::unique_ptr(operand); + + if(declaring && !operand->is_constant()) + { + const auto tile = reinterpret_cast(operand); + + const auto &info = tile->tile_info(); + _impl->declare_tile(tile->name(), prototype::TileInfo(info.data_type(), info.width(), info.height())); + } +} + +// ================================================================================================= +// Load and store +// ================================================================================================= + +void KernelWriter::op_load(TileOperand &tile, TensorOperand &tensor, const TensorTileSampler &sampler) +{ + prototype::TensorOperand impl_tensor( + tensor.name(), + prototype::GpuSampler{ + sampler.format(), + prototype::GpuSamplerTensorStorage::BufferUint8Ptr, + sampler.address_mode_x(), + sampler.address_mode_y(), + sampler.address_mode_z() }); + + auto impl_x = 
sampler.x().create_impl_operand(_impl.get()); + auto impl_y = sampler.y().create_impl_operand(_impl.get()); + auto impl_z = sampler.z().create_impl_operand(_impl.get()); + auto impl_b = sampler.b().create_impl_operand(_impl.get()); + + auto impl_dst = tile.create_impl_operand(_impl.get()); + + _impl->op_load_immediate(impl_tensor, impl_dst, impl_x, impl_y, impl_z, impl_b); +} + +void KernelWriter::op_store(TensorOperand &tensor, const TileOperand &tile, const TensorTileSampler &sampler) +{ + prototype::TensorOperand impl_tensor( + tensor.name(), + prototype::GpuSampler{ + sampler.format(), + prototype::GpuSamplerTensorStorage::BufferUint8Ptr, + sampler.address_mode_x(), + sampler.address_mode_y(), + sampler.address_mode_z() }); + auto impl_src = tile.create_impl_operand(_impl.get()); + auto impl_x = sampler.x().create_impl_operand(_impl.get()); + auto impl_y = sampler.y().create_impl_operand(_impl.get()); + auto impl_z = sampler.z().create_impl_operand(_impl.get()); + auto impl_b = sampler.b().create_impl_operand(_impl.get()); + + _impl->op_store_immediate(impl_tensor, impl_src, impl_x, impl_y, impl_z, impl_b); +} + +// ================================================================================================= +// Data processing +// ================================================================================================= + +void KernelWriter::op_assign(TileOperand &dst, const TileOperand &src) +{ + auto impl_dst = dst.create_impl_operand(_impl.get()); + auto impl_src = src.create_impl_operand(_impl.get()); + + _impl->op_assign(impl_dst, impl_src); +} + +void KernelWriter::op_binary_expression(TileOperand &dst, const TileOperand &lhs, const TileOperand &rhs, BinaryOp op) +{ + auto impl_lhs = lhs.create_impl_operand(_impl.get()); + auto impl_rhs = rhs.create_impl_operand(_impl.get()); + auto impl_dst = dst.create_impl_operand(_impl.get()); + + _impl->op_binary_expression(impl_dst, impl_lhs, op, impl_rhs); +} + +void KernelWriter::op_scalar_function(TileOperand &dst, const TileOperand &src, ScalarUnaryFunction opcode) +{ + auto impl_dst = dst.create_impl_operand(_impl.get()); + auto impl_src = src.create_impl_operand(_impl.get()); + + _impl->op_scalar_function(impl_dst, impl_src, opcode); +} + +// ================================================================================================= +// Misc +// ================================================================================================= + +void KernelWriter::op_get_global_id(TileOperand &dst, int32_t dim) +{ + _impl->op_get_global_id(prototype::Operand(dst.name()), dim); +} + +// ================================================================================================= +// Code generation +// ================================================================================================= + +std::string KernelWriter::generate_code() +{ + return prototype::generate_code(*_kernel->impl(), _kernel->name()); +} + +} // namespace ckw diff --git a/compute_kernel_writer/prototype/src/OperandBase.cpp b/compute_kernel_writer/prototype/src/OperandBase.cpp new file mode 100644 index 0000000000..59cf846cc7 --- /dev/null +++ b/compute_kernel_writer/prototype/src/OperandBase.cpp @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ckw/OperandBase.h" + +namespace ckw +{ + +OperandBase::OperandBase(const std::string &name) + : _name(name) +{ +} + +OperandBase::~OperandBase() +{ +} + +const std::string &OperandBase::name() const +{ + return _name; +} + +OperandBase &OperandBase::name(const std::string &name) +{ + _name = name; + return *this; +} + +} // namespace ckw diff --git a/compute_kernel_writer/prototype/src/Prototype.h b/compute_kernel_writer/prototype/src/Prototype.h new file mode 100644 index 0000000000..b17481537f --- /dev/null +++ b/compute_kernel_writer/prototype/src/Prototype.h @@ -0,0 +1,3767 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef CKW_PROTOTYPE_SRC_PROTOTYPE_H +#define CKW_PROTOTYPE_SRC_PROTOTYPE_H + +#include +#include +#include +#include // int32_t +#include // cout (to be removed) +#include // assert (to be removed) +#include +#include +#include +#include +#include +#include +#include + +#include "ckw/Types.h" +#include "ckw/TensorInfo.h" +#include "ckw/Error.h" + +namespace ckw +{ +namespace prototype { + +// Dummy data structure for Size2D +using Size2D = std::vector; + +// Dummy Status +using Status = void; + +enum class ComponentType : int32_t +{ + Complex = 0, + Simple = 1, + Unfusable = 2 +}; + +enum class GpuCompilationSpeed +{ + Fast = 0x00, // fast compilation may increase the latency of the network + Slow = 0x01 // slow compilation may decrease the latency of the network +}; + +enum class GpuExtensions +{ + Fp16, + Dot8, + Mmul, + FastMath +}; + +struct TensorInfo +{ + TensorShape shape { {0} }; + DataType data_type { DataType::Unknown }; + TensorDataLayout data_layout { TensorDataLayout::Nhwc }; + int32_t id { -1 }; +}; + +struct ComponentAttribute +{ + GpuCompilationSpeed compilation_speed {GpuCompilationSpeed::Fast}; + bool overwrite_tile { true }; +}; + +inline std::string data_type_to_cl_type(DataType dt) +{ + switch(dt) + { + case DataType::Fp32: + return "float"; + case DataType::Fp16: + return "half"; + case DataType::Int8: + return "char"; + case DataType::Uint8: + return "uchar"; + case DataType::Uint16: + return "ushort"; + case DataType::Int16: + return "short"; + case DataType::Uint32: + return "uint"; + case DataType::Int32: + return "int"; + case DataType::Bool: + return "bool"; + default: + assert(false); + return ""; + } +} + +inline int32_t width_to_cl_vector_size(int32_t width) +{ + switch(width) + { + case 1: + return 1; + case 2: + return 2; + case 3: + return 3; + case 4: + return 4; + case 5: + case 6: + case 7: + case 8: + return 8; + case 9: + case 10: + case 11: + case 12: + case 13: + case 14: + case 15: + case 16: + return 16; + default: + assert(false); + return 0; + } +} + +inline std::string get_cl_data_type(DataType dt, int32_t width) +{ + std::string data_type; + int32_t w = width_to_cl_vector_size(width); + data_type += data_type_to_cl_type(dt); + if(w != 1) + { + data_type += std::to_string(w); + } + return data_type; +} + +inline std::string to_opencl_store(int32_t vector_length) +{ + if(vector_length != 1) + { + return "vstore" + std::to_string(vector_length) + "("; + } + else + { + return "*("; + } +} + +struct TileInfo +{ + TileInfo() {} + TileInfo(DataType dt) : dt(dt), w(1), h(1) {} + TileInfo(DataType dt, int32_t width) : dt(dt), w(width), h(1) {} + TileInfo(DataType dt, int32_t width, int32_t height) : dt(dt), w(width), h(height) {} + DataType dt{ DataType::Unknown }; // Data type of the tile + int32_t w{ 0 }; // Width (i.e. c0 - portion of the channels) + int32_t h{ 0 }; // Height (i.e. s0 - portion of the spatial dimensions) +}; + +inline std::ostream& operator << (std::ostream& o, const TileInfo& a) +{ + o << a.w << " x " << a.h; + return o; +} + +struct DataTypeAsString +{ + std::string str { "" }; + DataType dt { DataType::Unknown }; + int32_t size { 1 }; +}; + +struct ValueAsString +{ + std::string str { "" }; + DataTypeAsString type { }; +}; + +// https://stackoverflow.com/questions/51515378/storing-and-accessing-tile-properties-in-c +// A Tile is a collection of variables used to express a 2D data. 
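// --------------------------------------------------------------------------------------
// Editorial sketch (not part of the patch): what the string helpers above produce,
// assuming only the definitions shown in this header. The function name
// check_cl_type_helpers_sketch() is illustrative and does not exist in the library.
// --------------------------------------------------------------------------------------
inline void check_cl_type_helpers_sketch()
{
    // get_cl_data_type() appends the rounded-up vector width to the scalar CL type name.
    assert(get_cl_data_type(DataType::Fp16, 3) == "half3");
    assert(get_cl_data_type(DataType::Fp32, 6) == "float8"); // widths 5-8 round up to 8
    assert(get_cl_data_type(DataType::Int32, 1) == "int");   // no suffix for a scalar

    // to_opencl_store() selects between a vstoreN call and a plain pointer dereference.
    assert(to_opencl_store(4) == "vstore4(");
    assert(to_opencl_store(1) == "*(");
}
// The tile abstractions below build their OpenCL variable names and types on top of
// these helpers.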
+class IScalarTile +{ +public: + virtual ~IScalarTile() = default; + /** Method to get the scalar variable from a tile + * @param[in] x X coordinate on the width of the tile. If out-of-bound, the coordinate is clamped to the nearest valid edge + * @param[in] y Y coordinate on the height of the tile. If out-of-bound, the coordinate is clamped to the nearest valid edge + * + * @return the scalar variable as a string + */ + virtual ValueAsString scalar(int32_t x, int32_t y) const = 0; + /** Method to get the list of underlying variable names used by the tile + * + * @return the list of variable names + */ + virtual std::vector underlying_source_variables() const = 0; + /** Method to get the name of the tile. + * + * @return the name of the tile + */ + std::string name() const + { + return _basename; + } + /** Method to get the tile format + * + * @return the format + */ + TileInfo format() const + { + return _format; + } + /** Method to know whether the tile is assignable or not (constant) + * + * @return true if the tile is assignable + */ + virtual bool is_assignable() const = 0; + /** Method to know whether the tile needs to be declared + * + * @return true if the tile needs to be declared in the code before being used + */ + virtual bool need_declaration() const = 0; +protected: + TileInfo _format { }; // Tile format + std::string _basename { "" }; // Tile name +}; + +// A tile is a collection of variables used to express a 2D data. The variables are vectors in the GPU context. +// The vector size is given by the width of the tile. The number of vectors height by depth defines the number of vectors +class IVectorTile : public IScalarTile +{ +public: + virtual ~IVectorTile() = default; + /** Method to get the vector variable from a tile. A vector is an ordered homogeneous collection of two or more scalars. + * The user can query the list of supported width for the vectors through preferred_vector_sizes(). + * + * @param[in] y Y coordinate on the height of the tile. If out-of-bound, the coordinate is clamped to the nearest valid edge + * + * @return the vector variable as a string + */ + virtual ValueAsString vector(int32_t y) const = 0; + /** Method to get a vector variable from a tile. A vector is an ordered homogeneous collection of two or more scalars. + * + * @return the vector variable as a string + */ + virtual ValueAsString vector(int32_t x_start, int32_t width, int32_t y) const = 0; + /** Method to get the preferred vector sizes. 
+ * + * @return a vector with the preferred vector sizes + */ + //virtual std::vector preferred_vector_sizes() const = 0; +}; + +class ClTile : public IVectorTile +{ +public: + ClTile(const std::string& name, TileInfo format) + { + _format = format; + _basename = name; + } + + ValueAsString scalar(int32_t x, int32_t y) const override + { + x = std::max(std::min(x, _format.w - 1), static_cast(0)); + y = std::max(std::min(y, _format.h - 1), static_cast(0)); + + ValueAsString t; + t.str = build_variable_name(y); + t.type.str = get_cl_data_type(_format.dt, 1); + t.type.dt = _format.dt; + t.type.size = 1; + + // Check required because if the width has only one element, we cannot use .s0 + if(_format.w != 1) + { + // Automatic broadcasting + t.str += ".s" + std::to_string(x); + } + + return t; + } + + ValueAsString vector(int32_t y) const override + { + y = std::max(std::min(y, _format.h - 1), static_cast(0)); + + ValueAsString t; + t.str = build_variable_name(y); + t.type.str = get_cl_data_type(_format.dt, _format.w); + t.type.dt = _format.dt; + t.type.size = _format.w; + return t; + } + + ValueAsString vector(int32_t x_start, int32_t width, int32_t y) const override + { + y = std::max(std::min(y, _format.h - 1), static_cast(0)); + + ValueAsString t; + t.str = build_variable_name(y); + t.type.str = get_cl_data_type(_format.dt, width); + t.type.dt = _format.dt; + t.type.size = width; + + if(_format.w != 1) + { + t.str += ".s"; + for(int i = 0; i < width; ++i) + { + t.str += to_scalar_hex(x_start + i); + } + } + return t; + } + + std::vector underlying_source_variables() const override + { + std::vector vars; + for(int32_t y = 0; y < _format.h; ++y) + { + ValueAsString t; + t.str = build_variable_name(y); + t.type.str = get_cl_data_type(_format.dt, _format.w); + t.type.dt = _format.dt; + t.type.size = _format.w; + vars.push_back(t); + } + return vars; + } + + bool is_assignable() const override + { + return true; + } + + bool need_declaration() const override + { + return true; + } + +private: + std::string build_variable_name(int32_t y) const + { + std::string var_name = _basename; + + if(_format.h == 1) + { + return var_name; + + } + else + { + var_name += "_"; + var_name += std::to_string(y); + } + + return var_name; + } + + std::string to_scalar_hex(int32_t x) const + { + switch(x) + { + case 0: + case 1: + case 2: + case 3: + case 4: + case 5: + case 6: + case 7: + case 8: + case 9: + return std::to_string(x); + case 10: + return "A"; + case 11: + return "B"; + case 12: + return "C"; + case 13: + return "D"; + case 14: + return "E"; + case 15: + return "F"; + default: + std::cout << "Unsupported hexadecimal value" << std::endl; + assert(false); + return ""; + } + } +}; + +// Unique features: It contains values in the form of string. The name used for this object is misleading since the variables can change the value over time. 
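// --------------------------------------------------------------------------------------
// Editorial sketch (not part of the patch): how ClTile (declared above) maps tile
// coordinates to OpenCL variable names, assuming only the code shown in this header.
// The function name check_cl_tile_naming_sketch() is illustrative only; the
// ClConstantTile class that follows instead embeds literal value strings.
// --------------------------------------------------------------------------------------
inline void check_cl_tile_naming_sketch()
{
    // A 4 (width) x 2 (height) fp32 tile is backed by one CL vector per row: dst_0, dst_1.
    ClTile tile("dst", TileInfo(DataType::Fp32, 4, 2));

    assert(tile.scalar(1, 0).str == "dst_0.s1");     // column 1 of row 0, type "float"
    assert(tile.vector(1).str == "dst_1");           // whole row 1, type "float4"
    assert(tile.vector(0, 2, 1).str == "dst_1.s01"); // columns 0-1 of row 1, type "float2"
}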
+class ClConstantTile : public IVectorTile +{ +public: + ClConstantTile(const std::vector> &in, DataType dt) + { + _format.w = in[0].size(); + _format.h = in.size(); + _format.dt = dt; + + _data = std::vector>(_format.h, std::vector(_format.w)); + + for(int32_t y = 0; y < _format.h; ++y) + { + for(int32_t x = 0; x < _format.w; ++x) + { + _data[y][x] = in[y][x]; + } + } + } + + ValueAsString scalar(int32_t x, int32_t y) const override + { + x = std::max(std::min(x, _format.w - 1), static_cast(0)); + y = std::max(std::min(y, _format.h - 1), static_cast(0)); + + ValueAsString t; + t.str = _data[y][x]; + t.type.str = get_cl_data_type(_format.dt, 1); + t.type.dt = _format.dt; + t.type.size = 1; + + return t; + } + + ValueAsString vector(int32_t y) const override + { + y = std::max(std::min(y, _format.h - 1), static_cast(0)); + + return vector(0, _format.w, y); + } + + ValueAsString vector(int32_t x_start, int32_t width, int32_t y) const override + { + y = std::max(std::min(y, _format.h - 1), static_cast(0)); + + ValueAsString t; + t.str = ""; + t.type.str = get_cl_data_type(_format.dt, width); + t.type.dt = _format.dt; + t.type.size = width; + + if(width > 1) + { + t.str += "((" + get_cl_data_type(_format.dt, width) + ")("; + } + + int32_t x = x_start; + for(; x < width - 1; ++x) + { + t.str += scalar(x, y).str; + t.str += ", "; + } + t.str += scalar(x, y).str; + + if(width > 1) + { + t.str += "))"; + } + + return t; + } + + std::vector underlying_source_variables() const override + { + std::vector vars; + + for(int32_t y = 0; y < _format.h; ++y) + { + for(int32_t x = 0; x < _format.w; ++x) + { + ValueAsString t; + t.str = _data[y][x]; + t.type.str = get_cl_data_type(_format.dt, 1); + t.type.dt = _format.dt; + t.type.size = 1; + vars.push_back(t); + } + } + + return vars; + } + + bool is_assignable() const override + { + return false; + } + + bool need_declaration() const override + { + return false; + } + +private: + std::vector> _data{}; +}; + +enum class TensorComponentIndex : int32_t +{ + IndexMask = 0x0000000f, +}; + +enum class TensorComponentType : int32_t +{ + OffsetFirstElement = 0x00000100, + Stride = 0x00001000, + Dimension = 0x00010000, + FoldedDimension = 0x00100000, + Constant = 0x01000000 +}; + +enum class TensorComponent : int32_t +{ + Unknown = 0x00000000, + OffsetFirstElement = 0x00000100, + Stride1 = 0x00001001, + Stride2 = 0x00001002, + Stride3 = 0x00001003, + Stride4 = 0x00001004, + Dim0 = 0x00010000, + Dim1 = 0x00010001, + Dim2 = 0x00010002, + Dim3 = 0x00010003, + Dim4 = 0x00010004, + C = 0x00010000, // Dim0 + W = 0x00010001, // Dim1 + H = 0x00010002, // Dim2 + D = 0x00010003, + N = 0x00010004, + Dim1xDim2 = 0x00100021, + Dim1xDim2xDim3 = 0x00100321, + WxH = 0x00100021, + WxHxD = 0x00100321 +}; + +inline std::string to_string(TensorComponent x) +{ + switch(x) + { + case TensorComponent::Unknown: + return "Unknown"; + case TensorComponent::OffsetFirstElement: + return "OffsetFirstElement"; + case TensorComponent::Stride1: + return "Stride1"; + case TensorComponent::Stride2: + return "Stride2"; + case TensorComponent::Stride3: + return "Stride3"; + case TensorComponent::Stride4: + return "Stride4"; + case TensorComponent::Dim0: + return "Dim0"; + case TensorComponent::Dim1: + return "Dim1"; + case TensorComponent::Dim2: + return "Dim2"; + case TensorComponent::Dim3: + return "Dim3"; + case TensorComponent::Dim4: + return "Dim4"; + case TensorComponent::Dim1xDim2: + return "Dim1xDim2"; + case TensorComponent::Dim1xDim2xDim3: + return "Dim1xDim2xDim3"; + default: + 
assert(false); + } +} + +class ITensorArgument +{ +public: + virtual ~ITensorArgument() = default; + /** Method to get the tensor component as a string + * + * @param[in] x tensor component to query + * + * @return the tensor component as a string + */ + virtual std::string component(TensorComponent x) = 0; + /** Method to get the tensor component type declaration as a string + * + * @return the tensor component type declaration as a string + */ + virtual std::string component_type_declaration() const = 0; + /** Method to get the tensor component data type + * + * @return the tensor component data type + */ + virtual DataType component_data_type() const = 0; + /** Method to get the tensor component declarations + * + * @return a vector containing the tensor component declarations + */ + virtual std::vector component_declarations() const = 0; + /** Method to get the name of the tensor argument. + * + * @return the name of the tensor argument + */ + std::string name() const + { + return _basename; + } + /** Method to get the tensor format + * + * @return the format + */ + TensorInfo format() const + { + return _format; + } + +protected: + TensorInfo _format { }; + std::string _basename {}; +}; + +enum class GpuTensorStorage : int32_t +{ + Unknown = 0x0000, + BufferUint8Ptr = 0x0012, + Image2dReadOnly = 0x0020, + Image2dWriteOnly = 0x0021, + Image3dReadOnly = 0x0030, + Image3dWriteOnly = 0x0031 +}; + +class IGpuTensorArgument : public ITensorArgument +{ +public: + virtual ~IGpuTensorArgument() = default; + /** Method to get the tensor storage, which is the underlying storage used to keep the data memory + * + * @param[in] x tensor storage to query + * + * @return the tensor storage as a string + */ + virtual std::string storage(GpuTensorStorage x) = 0; + /** Method to get the tensor storage type declaration as a string + * + * @param[in] x tensor component to query + * + * @return the tensor storage type declaration as a string + */ + virtual std::string storage_type_declaration(GpuTensorStorage x) const = 0; + /** Method to get the tensor storage declarations + * + * @return a vector containing the tensor storage declarations + */ + virtual std::vector storage_declarations() const = 0; +}; + +class ClTensorArgument : public IGpuTensorArgument +{ +public: + ClTensorArgument(const std::string& name, const TensorInfo& x, bool return_by_value_when_possible) + { + _basename = name; + _format = x; + _return_by_value_when_possible = return_by_value_when_possible; + } + + // Methods to override + std::string component(TensorComponent x) override + { + if((static_cast(x) & static_cast(TensorComponentType::Constant))) + { + int32_t idx = static_cast(x) & static_cast(TensorComponentIndex::IndexMask); + return std::to_string(idx - 1); + } + + if(_return_by_value_when_possible) + { + if((static_cast(x) & static_cast(TensorComponentType::Dimension))) + { + int32_t idx = static_cast(x) & static_cast(TensorComponentIndex::IndexMask); + return std::to_string(_format.shape[idx]); + } + + if((static_cast(x) & static_cast(TensorComponentType::FoldedDimension))) + { + switch(x) + { + case TensorComponent::Dim1xDim2: + return std::to_string(_format.shape[1] * _format.shape[2]); + case TensorComponent::Dim1xDim2xDim3: + return std::to_string(_format.shape[1] * _format.shape[2] * _format.shape[2]); + default: + std::cout << "Unsupported folded dimension" << std::endl; + assert(false); + } + } + } + + if(std::find(_components_required.begin(), _components_required.end(), x) == _components_required.end()) + { + 
_components_required.push_back(x); + } + + return build_component_name(x); + } + + std::string component_type_declaration() const override + { + return "int"; + }; + + DataType component_data_type() const override + { + return DataType::Int32; + } + + std::string storage(GpuTensorStorage x) override + { + if(std::find(_storage_required.begin(), _storage_required.end(), x) == _storage_required.end()) + { + _storage_required.push_back(x); + } + + return build_storage_name(x); + } + + std::string storage_type_declaration(GpuTensorStorage x) const override + { + switch(x) + { + case GpuTensorStorage::BufferUint8Ptr: + return "__global uchar*"; + case GpuTensorStorage::Image2dReadOnly: + return "__read_only image2d_t"; + case GpuTensorStorage::Image2dWriteOnly: + return "__write_only image2d_t"; + case GpuTensorStorage::Image3dReadOnly: + return "__read_only image3d_t "; + case GpuTensorStorage::Image3dWriteOnly: + return "__write_only image3d_t "; + default: + std::cout << "Unsupported storage" << std::endl; + assert(false); + return ""; + } + }; + + std::vector storage_declarations() const override + { + return _storage_required; + } + + std::vector component_declarations() const override + { + return _components_required; + } + +private: + std::string build_storage_name(GpuTensorStorage x) const + { + std::string var_name = _basename; + + switch(x) + { + case GpuTensorStorage::BufferUint8Ptr: + return var_name + "_ptr"; + case GpuTensorStorage::Image2dReadOnly: + case GpuTensorStorage::Image2dWriteOnly: + return var_name + "_img2d"; + case GpuTensorStorage::Image3dReadOnly: + case GpuTensorStorage::Image3dWriteOnly: + return var_name + "_img3d"; + default: + std::cout << "Unsupported storage" << std::endl; + assert(false); + } + + return var_name; + } + + std::string build_component_name(TensorComponent x) const + { + std::string var_name = _basename; + + switch(x) + { + case TensorComponent::OffsetFirstElement: + return var_name + "_offset_first_element"; + case TensorComponent::Stride1: + return var_name + "_stride1"; + case TensorComponent::Stride2: + return var_name + "_stride2"; + case TensorComponent::Stride3: + return var_name + "_stride3"; + case TensorComponent::Dim0: + return var_name + "_dim0"; + case TensorComponent::Dim1: + return var_name + "_dim1"; + case TensorComponent::Dim2: + return var_name + "_dim2"; + case TensorComponent::Dim3: + return var_name + "_dim3"; + case TensorComponent::Dim1xDim2: + return var_name + "_dim1xdim2"; + case TensorComponent::Dim1xDim2xDim3: + return var_name + "_dim1xdim2xdim3"; + default: + std::cout << "Unsupported component" << std::endl; + assert(false); + } + + return var_name; + } + + bool _return_by_value_when_possible { false }; + std::vector _storage_required {}; + std::vector _components_required {}; +}; + +/** + * @brief Data structure that contains the declared tiles by the components. + * The registry is a linear data structure that follows the similar principle of the stack. The user can use the @p increment_registry_level() method to + * increase the level of the stack (0 when it starts). When the user uses the @p decrement_registry_level() method, the registry decreases the level of the stack + * and remove (pop) all the tiles from the level above. + * When a tile is declared on the level 0, it is a global tile. A global tile is visible in all parts of the code. 
+ * Since different components may use the same name to define a tile, the registry adopts the IdSpace concept, an @p id to prevent name collisions + * when declaring tiles among different components. + * + */ +class GpuTileRegistry +{ +public: +enum class RegistryTileType +{ + Tile, + Link +}; + +using RegistryIdSpace = int32_t; +using RegistryLevel = int32_t; +using RegistryTileName = std::string; + +struct RegistryTileTableEntry +{ + RegistryLevel registry_level { 0 }; + std::unique_ptr tile_object { nullptr }; +}; + +struct RegistryTileTypeTableEntry +{ + RegistryTileType tile_type { RegistryTileType::Tile }; + RegistryTileName tile_name {}; + RegistryIdSpace registry_idspace { 0 }; + RegistryLevel registry_level { 0 }; +}; + +using RegistryTileTable = std::map>; +using RegistryTileTypeTable = std::map>; + /** + * @brief Construct a new Gpu Tile Registry object + * + */ + GpuTileRegistry() + { + _language = GpuTargetLanguage::Unknown; + } + /** + * @brief Construct a new Gpu Tile Registry object providing the Gpu programming language + * + * @param[in] language Gpu programming language to use + */ + GpuTileRegistry(GpuTargetLanguage language) + { + _language = language; + } + /** + * @brief Default destructor. Destroy the Gpu Tile Registry object + * + */ + ~GpuTileRegistry() = default; + /** + * @brief Set the working IdSpace for the tile registry. IdSpace is used to prevent name collisions when declaring tiles. + * Therefore, the IdSpace should be set before declaring any tiles. + * + * @param[in] id The IdSpace id + */ + void set_IdSpace(int32_t id) + { + _IdSpace = id; + } + /** + * @brief Get the current working IdSpace for the tile registry. IdSpace is used to prevent name collisions when declaring tiles + * + * @return The IdSpace id + */ + int32_t IdSpace() const + { + return _IdSpace; + } + /** + * @brief Gets all the IdSpace declarations defined in the tile registry. + * + * @return all the IdSpace declarations defined in the tile registry as std::vector. It returns an empty vector if there are no IdSpace declarations. + */ + std::vector IdSpace_declarations() const + { + std::vector x; + + auto it = _frags.begin(); + + while (it != _frags.end()) + { + x.push_back(it->first); + + it++; + } + + return x; + } + /** + * @brief Declare a tile from a previously created tile + */ + void insert(const std::string& name, const IVectorTile *frag) + { + assert(_language == GpuTargetLanguage::OpenCL); + const int32_t key_IdSpace = _IdSpace; + const std::string key_var_name = name; + const std::string var_name = frag->name(); + TileInfo format = frag->format(); + + // First check whether a tile with the same name exists + IVectorTile *result = (*this)[key_var_name]; + assert(result == nullptr); + if(result == nullptr) + { + std::unique_ptr tile = std::make_unique(var_name, format); + + _frags[key_IdSpace][key_var_name].tile_object = std::move(tile); + _frags[key_IdSpace][key_var_name].registry_level = _registry_level; + + _frag_types[key_IdSpace][key_var_name].tile_type = RegistryTileType::Link; + _frag_types[key_IdSpace][key_var_name].tile_name = key_var_name; + _frag_types[key_IdSpace][key_var_name].registry_idspace = _IdSpace; + _frag_types[key_IdSpace][key_var_name].registry_level = _registry_level; + } + } + /** + * @brief Declare a tile with TileInfo. The tile will be stored in the IdSpace set with @p set_IdSpace() + * + * @note The reference name used for declaring the tile should not be previously used in the IdSpace + * + * @param[in] name Reference name for the tile. 
The reference name can be used to retrieve the tile stored in the registry. + * @param[in] format Tile format use to use + */ + void insert(const std::string& name, const TileInfo& format) + { + assert(_language == GpuTargetLanguage::OpenCL); + const int32_t key_IdSpace = _IdSpace; + const std::string key_var_name = name; + const std::string var_name = generate_tile_name(name); + + // First check whether a tile with the same name exists + IVectorTile *result = (*this)[key_var_name]; + assert(result == nullptr); + if(result == nullptr) + { + std::unique_ptr tile = std::make_unique(var_name, format); + _frags[key_IdSpace][key_var_name].tile_object = std::move(tile); + _frags[key_IdSpace][key_var_name].registry_level = _registry_level; + + _frag_types[key_IdSpace][key_var_name].tile_type = RegistryTileType::Tile; + _frag_types[key_IdSpace][key_var_name].tile_name = key_var_name; + _frag_types[key_IdSpace][key_var_name].registry_idspace = _IdSpace; + _frag_types[key_IdSpace][key_var_name].registry_level = _registry_level; + } + } + /** + * @brief Declare a constant tile. The content of the tile is passed as a vector of std::string + * + * @note The reference name used for declaring the tile should not be previously used in the IdSpace + * + * @param[in] name Reference name for the tile. The reference name can be used to retrieve the tile stored in the registry. + * @param[in] in A 3D std::vector of std::string. From the 3D std::vector we can know the dimensions for the tile + * @param[in] dt The data type for the elements stored in the 3D std::vector as std::string. It is user's responsibilty to ensure + * that the data type is aligned with the content of the std::string. + */ + void insert(const std::string& name, const std::vector>& in, DataType dt) + { + assert(_language == GpuTargetLanguage::OpenCL); + const int32_t key_IdSpace = _IdSpace; + const std::string key_var_name = name; + + // First check whether a tile with the same name exists + IVectorTile *result = (*this)[key_var_name]; + assert(result == nullptr); + if(result == nullptr) + { + std::unique_ptr tile = std::make_unique(in, dt); + _frags[key_IdSpace][key_var_name].tile_object = std::move(tile); + _frags[key_IdSpace][key_var_name].registry_level = _registry_level; + + _frag_types[key_IdSpace][key_var_name].tile_type = RegistryTileType::Tile; + _frag_types[key_IdSpace][key_var_name].tile_name = key_var_name; + _frag_types[key_IdSpace][key_var_name].registry_idspace = _IdSpace; + _frag_types[key_IdSpace][key_var_name].registry_level = _registry_level; + } + } + /** + * @brief Declare an anonymous constant tile. The content of the tile is passed as a vector of std::string + * + * @note This method can be used to declare temporary tiles that need to be accessed only once. + * + * @param[in] in A 3D std::vector of std::string. From the 3D std::vector we can know the dimensions for the tile + * @param[in] dt The data type for the elements stored in the 3D std::vector as std::string. It is user responsibilty to ensure + * that the data type is aligned with what passed with the std::string. 
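+ * For example (illustrative only, mirroring the call made by OperandUnpacker further down in this file),
+ * insert({{{"1"}}}, DataType::Int32) declares an anonymous constant tile holding the single literal "1".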
+ * + * @return IVectorTile* the anonymous constant tile + */ + IVectorTile* insert(const std::vector>& in, DataType dt) + { + assert(_language == GpuTargetLanguage::OpenCL); + const int32_t key_IdSpace = _IdSpace; + const std::string key_var_name = "_" + std::to_string(_anonymous_frag_count++); + + // First check whether a tile with the same name exists + IVectorTile *result = (*this)[key_var_name]; + assert(result == nullptr); + if(result == nullptr) + { + std::unique_ptr tile = std::make_unique(in, dt); + _frags[key_IdSpace][key_var_name].tile_object = std::move(tile); + _frags[key_IdSpace][key_var_name].registry_level = _registry_level; + + _frag_types[key_IdSpace][key_var_name].tile_type = RegistryTileType::Tile; + _frag_types[key_IdSpace][key_var_name].tile_name = key_var_name; + _frag_types[key_IdSpace][key_var_name].registry_idspace = _IdSpace; + _frag_types[key_IdSpace][key_var_name].registry_level = _registry_level; + } + + return (*this)[key_var_name]; + } + /** + * @brief Get the tile from the registry. This method searches the tile in the IdSpace provided by the user + * + * @param[in] name The name of the tile to retrieve + * @param[in] IdSpace The IdSpace id where to search the tile + * + * @return IVectorTile* The tile + */ + IVectorTile* get(const std::string& name, int32_t IdSpace) + { + const int32_t key_IdSpace = IdSpace; + const std::string key_var_name = name; + + IVectorTile* result = nullptr; + auto search_IdSpace = _frags.find(key_IdSpace); + if(search_IdSpace != _frags.end()) + { + auto search_tile = _frags[key_IdSpace].find(key_var_name); + if(search_tile != _frags[key_IdSpace].end()) + { + result = search_tile->second.tile_object.get(); + assert(result != nullptr); + } + } + + return result; + } + /** + * @brief Get the tile from the registry. This method searches the tile in the IdSpace set with @p set_IdSpace() + * + * @param[in] name The name of the tile to retrieve + * + * @return IVectorTile* The tile + */ + IVectorTile* operator[](const std::string& name) + { + return get(name, _IdSpace); + } + /** + * @brief Check whether the tile in the in the IdSpace provided by the user exists + * + * @param[in] name Name of the tile to search for + * @param[in] IdSpace The IdSpace id where to search the tile + * + * @return true if the tile exists + * @return false if the tile does not exist + */ + bool has_tile(const std::string& name, int32_t IdSpace) const + { + const int32_t key_IdSpace = IdSpace; + const std::string key_var_name = name; + + // IVectorTile* result = nullptr; + auto search_IdSpace = _frags.find(key_IdSpace); + + return search_IdSpace != _frags.end(); + } + /** + * @brief Check whether the tile within the current IdSpace exists + * + * @param[in] name Name of the tile to search for + * + * @return true if the tile exists + * @return false if the tile does not exist + */ + bool has_tile(const std::string& name) const + { + return has_tile(name, _IdSpace); + } + /** + * @brief Get all the tiles declared within the IdSpace provided by the user + * + * @param[in] IdSpace IdSpace where to retrieve all the declared tiles + * + * @return std::vector A vector with all the declared tiles in the IdSpace provided by the user + */ + std::vector tile_declarations(int32_t IdSpace) + { + std::vector tiles; + + std::map::iterator it = _frag_types[IdSpace].begin(); + + while (it != _frag_types[IdSpace].end()) + { + // The following line should be enabled. However, we cannot at this stage + // because it used to retrieve the output tile produced by each component. 
+ // However, this method should NOT be used to retrieve the output tile + //if(it->second.tile_type == RegistryTileType::Tile) + { + tiles.push_back(get(it->second.tile_name, it->second.registry_idspace)); + } + it++; + } + + return tiles; + } + /** + * @brief Increase the level of stack. + * + */ + void increment_registry_level() + { + _registry_level++; + } + /** + * @brief Remove all the tiles declared at the current stack level and decrease the level of the stack. + * + */ + void decrement_registry_level() + { + assert(_registry_level >= 0); + + // Remove all variables in the local scope + std::map::iterator it = _frags[_IdSpace].begin(); + + while (it != _frags[_IdSpace].end()) + { + if (it->second.registry_level == _registry_level) + { + it = _frags[_IdSpace].erase(it); + } + else + { + it++; + } + } + + std::map::iterator it_type = _frag_types[_IdSpace].begin(); + + while (it_type != _frag_types[_IdSpace].end()) + { + if (it_type->second.registry_level == _registry_level) + { + it_type = _frag_types[_IdSpace].erase(it_type); + } + else + { + it_type++; + } + } + + _registry_level--; + } + /** + * @brief Get the level of the stack + * + */ + int32_t level() const + { + return _registry_level; + } + +private: + // This method ensures that the key is unique among different components + std::string generate_tile_name(const std::string& name) + { + assert(_IdSpace >= 0 ); + if(_registry_level == 0) + { + return "_G" + std::to_string(_IdSpace) + "_" + name; + } + else + { + return name; + } + } + RegistryTileTable _frags {}; + RegistryTileTypeTable _frag_types {}; + RegistryLevel _registry_level { 0 }; + RegistryIdSpace _IdSpace { -1 }; + int32_t _anonymous_frag_count { 0 }; // Counter used to create the anonymous tiles + GpuTargetLanguage _language { GpuTargetLanguage::Unknown }; // Gpu programming language +}; + +using TensorEntry = std::unique_ptr; + +/** + * @brief Data structure that contains the tensors consumed by the components. + * Since different components may use the same name as reference for a tensor, the registry adopts the IdSpace concept, an @p id to prevent name collisions + * when declaring tensors among different components. + * + */ +class GpuTensorArgumentRegistry +{ +public: + /** + * @brief Construct a new Gpu Tensor Registry object + * + */ + GpuTensorArgumentRegistry() + { + _language = GpuTargetLanguage::Unknown; + } + /** + * @brief Construct a new Gpu Tensor Registry object + * + * @param[in] language Gpu programming language to use + */ + GpuTensorArgumentRegistry(GpuTargetLanguage language) + { + _language = language; + } + /** + * @brief Default destructor. Destroy the Gpu Tensor Registry object + * + */ + ~GpuTensorArgumentRegistry() = default; + /** + * @brief Set the working IdSpace for the tensor registry. IdSpace is used to prevent name collisions when declaring tensors. + * Therefore, the IdSpace should be set before declaring any tensors. + * + * @param[in] id The IdSpace id + */ + void set_IdSpace(int32_t id) + { + _IdSpace = id; + } + /** + * @brief Get the current working IdSpace for the tensor registry. IdSpace is used to prevent name collisions when declaring tensors + * + * @return The IdSpace id + */ + int32_t IdSpace() const + { + return _IdSpace; + } + /** + * @brief Gets all the IdSpace declarations defined in the tensor registry. + * + * @return all the IdSpace declarations defined in the tensor registry as std::vector. It returns an empty vector if there are no IdSpace declarations. 
+ */ + std::vector IdSpace_declarations() const + { + std::vector x; + + auto it = _refs.begin(); + + while (it != _refs.end()) + { + x.push_back(it->first); + + it++; + } + + return x; + } + /** + * @brief Declare a tensor with TensorInfo. The tensor will be stored in the IdSpace set with @p set_IdSpace() + * + * @note The reference name used for declaring the tensor should not be previously used in the IdSpace + * + * @param[in] name Reference name for the tensor. The reference name can be used to retrieve the tensor stored in the registry. + * @param[in] x Pair of tensor info and tensor id + * @param[in] return_by_value_when_possible True if we want the value stored in the tensor components + */ + void insert(const std::string& name, const TensorInfo& x, bool return_by_value_when_possible) + { + assert(_language == GpuTargetLanguage::OpenCL); + const int32_t key_IdSpace = _IdSpace; + const int32_t tensor_id = x.id; + const std::string key_var_name = name; + const std::string var_name = generate_tensor_name(name, tensor_id); + + // First, check whether the tensor has already a reference. If so, trigger an assert + assert(!has_tensor_argument(name)); + + // Check whether a tensor with that tensorID exists + auto result = _tensor_arguments.find(tensor_id); + if(result == _tensor_arguments.end()) + { + // It means that we haven't added a tensor with that tensor_id yet. Create a IGpuTensorArgument before creating the reference + std::unique_ptr arg = std::make_unique(var_name, x, return_by_value_when_possible); + _tensor_arguments[tensor_id] = std::move(arg); + } + + _refs[key_IdSpace][key_var_name] = tensor_id; + } + /** + * @brief Get the tensor from the registry. This method searches the tensor in the IdSpace set with @p set_IdSpace() + * + * @param[in] name The name of the tensor to retrieve + * + * @return IGpuTensor* The tensor + */ + IGpuTensorArgument* operator[](const std::string& name) + { + const int32_t key_IdSpace = _IdSpace; + const std::string key_var_name = name; + + IGpuTensorArgument* result = nullptr; + auto search_IdSpace = _refs.find(key_IdSpace); + if(search_IdSpace != _refs.end()) + { + auto search_tensor_id = _refs[key_IdSpace].find(key_var_name); + + if(search_tensor_id != _refs[key_IdSpace].end()) + { + const int32_t tensor_id = search_tensor_id->second; + auto search_tensor_argument = _tensor_arguments.find(tensor_id); + if(search_tensor_argument != _tensor_arguments.end()) + { + result = search_tensor_argument->second.get(); + } + assert(result != nullptr); + } + } + + return result; + } + /** + * @brief Get all the tensors declared in the IdSpace provided by the user + * + * @return std::vector A vector with all the declared tensors + */ + std::vector tensor_argument_declarations() + { + std::vector args; + + auto it = _tensor_arguments.begin(); + + while (it != _tensor_arguments.end()) + { + args.push_back(it->second.get()); + it++; + } + + return args; + } + /** + * @brief Check whether the tensor argument in the IdSpace set with @p set_IdSpace() exists + * + * @param[in] name Name of the tensor argument to search for + * + * @return true if the tensor argument exists + * @return false if the tensor argument does not exist + */ + bool has_tensor_argument(const std::string& name) + { + const int32_t key_IdSpace = _IdSpace; + const std::string key_var_name = name; + + auto search_IdSpace = _refs.find(key_IdSpace); + + if(search_IdSpace != _refs.end()) + { + auto search_tensor_id = _refs[key_IdSpace].find(key_var_name); + + return search_tensor_id != 
_refs[key_IdSpace].end(); + } + else + { + return false; + } + } + /** + * @brief Check whether the tensor argument is in the the IdSpace provided by the user + * + * @param[in] name Name of the tensor argument to search for + * @param[in] IdSpace The IdSpace id where to search the tensor argument + * + * @return true if the tile exists + * @return false if the tile does not exist + */ + bool has_tensor_argument(const std::string& name, int32_t IdSpace) + { + const int32_t key_IdSpace = IdSpace; + const std::string key_var_name = name; + + auto search_IdSpace = _refs.find(key_IdSpace); + + if(search_IdSpace != _refs.end()) + { + auto search_tensor_id = _refs[key_IdSpace].find(key_var_name); + + return search_tensor_id != _refs[key_IdSpace].end(); + } + else + { + return false; + } + } +private: + // This method ensures that the key is unique among different components + std::string generate_tensor_name(const std::string& name, int32_t tensor_id) + { + assert(tensor_id >= 0 ); + + return name + std::to_string(tensor_id); + } + + std::map _tensor_arguments {}; + std::map> _refs {}; + int32_t _IdSpace { -1 }; + GpuTargetLanguage _language { GpuTargetLanguage::Unknown }; // Gpu programming language +}; + +enum class OpType : int32_t +{ + Elementwise = 0x0000, + Relational = 0x1000, + Algebra = 0x2000 +}; + +inline std::string to_string(AssignmentOp op) +{ + switch(op) + { + case AssignmentOp::Decrement: + return "-="; + case AssignmentOp::Increment: + return "+="; + default: + assert(false); + return ""; + } +} + +inline std::string to_string(BinaryOp op) +{ + switch(op) + { + case BinaryOp::Add: + return "+"; + case BinaryOp::Sub: + return "-"; + case BinaryOp::Mul: + return "*"; + case BinaryOp::Div: + return "/"; + case BinaryOp::Mod: + return "%"; + case BinaryOp::Equal: + return "=="; + case BinaryOp::Less: + return "<"; + case BinaryOp::LessEqual: + return "<="; + case BinaryOp::Greater: + return ">"; + case BinaryOp::GreaterEqual: + return ">="; + case BinaryOp::LogicalAnd: + return "&&"; + case BinaryOp::LogicalOr: + return "||"; + case BinaryOp::LogicalNot: + return "!"; + default: + assert(false); + return ""; + } +} + +inline std::string binary_op_string(BinaryOp op) +{ + switch(op) + { + case BinaryOp::Add: + return "add"; + case BinaryOp::Sub: + return "sub"; + case BinaryOp::Mul: + return "mul"; + case BinaryOp::Div: + return "div"; + case BinaryOp::Mod: + return "mod"; + case BinaryOp::Equal: + return "eq"; + case BinaryOp::Less: + return "gt"; + case BinaryOp::LessEqual: + return "gteq"; + case BinaryOp::Greater: + return "lt"; + case BinaryOp::GreaterEqual: + return "lte"; + default: + assert(false); + return ""; + } +} + +enum class OperandType : int32_t +{ + Unknown = 0x00000000, + ScalarFp32 = 0x00001011, // Immediate scalar tile + ScalarFp16 = 0x00001012, // Immediate scalar tile + ScalarInt32 = 0x00001021, // Immediate scalar tile + ScalarInt16 = 0x00001022, // Immediate scalar tile + ScalarInt8 = 0x00001024, // Immediate scalar tile + ScalarUInt32 = 0x00001031, // Immediate scalar tile + ScalarUInt16 = 0x00001032, // Immediate scalar tile + ScalarUInt8 = 0x00001034, // Immediate scalar tile + ScalarBool = 0x00001041, // Immediate scalar tile + ScalarTile = 0x00001050, // Scalar from a tile + Tile = 0x00010000, // Tile + TensorStride1 = 0x00100001, // Tensor component + TensorStride2 = 0x00100002, // Tensor component + TensorStride3 = 0x00100003, // Tensor component + TensorStride4 = 0x00100004, // Tensor component + TensorDim0 = 0x00100010, // Tensor component + 
TensorDim1 = 0x00100020, // Tensor component + TensorDim2 = 0x00100030, // Tensor component + TensorDim3 = 0x00100040, // Tensor component + TensorDim4 = 0x00100050, // Tensor component + TensorC = 0x00100010, // Tensor component + TensorW = 0x00100020, // Tensor component + TensorH = 0x00100030, // Tensor component + TensorD = 0x00100040, // Tensor component + TensorN = 0x00100050, // Tensor component + TensorDim1xDim2 = 0x00100100, // Tensor component + TensorDim1xDim2xDim3 = 0x00100200, // Tensor component + TensorWxH = 0x00100300, // Tensor component + TensorWxHxD = 0x00100400, // Tensor component + TensorDataOffset = 0x00100500, // Tensor component +}; + +struct ScalarTileCoord +{ + ScalarTileCoord() {} + ScalarTileCoord(int32_t x0, int32_t y0) : x(x0), y(y0) {} + int32_t x { -1 }; + int32_t y { -1 }; +}; +/** + * @brief Operand class. This object is used to pass the operands to the operations performed by the writer. + * Operand can be of three types: + * -# Scalar immediate: constant expression + * -# Tile: A tile + * -# Tensor component: A component (scalar) of a tensor + * + */ +class Operand +{ +public: + Operand(const std::string &val) + { + _str = val; + _type = OperandType::Tile; + } + + Operand(const std::string &val, const ScalarTileCoord& coord) + { + _str = val; + _type = OperandType::ScalarTile; + _coord = coord; + } + + Operand(const std::string &val, OperandType type) + { + _str = val; + _type = type; + } + + Operand(const Operand& t) + { + _str = t.value(); + _type = t.type(); + } + + Operand& operator=(const Operand& t) + { + _str = t.value(); + _type = t.type(); + _coord = t.scalar_tile_coordinate(); + return *this; + } + + std::string value() const + { + return _str; + } + + OperandType type() const + { + return _type; + } + + ScalarTileCoord scalar_tile_coordinate() const + { + return _coord; + } + +private: + std::string _str {}; + OperandType _type { OperandType::Unknown }; + ScalarTileCoord _coord {}; +}; + +enum class GpuSamplerTensorStorage : int32_t +{ + Unknown = static_cast(GpuTensorStorage::Unknown), + BufferUint8Ptr = static_cast(GpuTensorStorage::BufferUint8Ptr), + Image2dReadOnly = static_cast(GpuTensorStorage::Image2dReadOnly), + Image2dWriteOnly = static_cast(GpuTensorStorage::Image2dWriteOnly), + Image3dReadOnly = static_cast(GpuTensorStorage::Image3dReadOnly), + Image3dWriteOnly = static_cast(GpuTensorStorage::Image2dWriteOnly), +}; + +struct GpuSampler +{ + GpuSampler() = default; + TensorSamplerFormat format { TensorSamplerFormat::Unknown }; + GpuSamplerTensorStorage storage { GpuSamplerTensorStorage::Unknown }; + TensorSamplerAddressModeX address_mode_x { TensorSamplerAddressModeX::Unknown }; + TensorSamplerAddressModeY address_mode_y { TensorSamplerAddressModeY::Unknown }; + TensorSamplerAddressModeZ address_mode_z { TensorSamplerAddressModeZ::Unknown }; +}; + +inline GpuSampler create_simple_sampler(const TensorInfo* tensor_info_id, GpuSampler sampler, int32_t step_x, int32_t step_y, int32_t step_z) +{ + CKW_UNUSED(step_x, step_y, step_z); + + auto tensor = tensor_info_id->shape; + + GpuSampler dst_sampler; + dst_sampler.format = sampler.format; + dst_sampler.storage = GpuSamplerTensorStorage::BufferUint8Ptr; + dst_sampler.address_mode_x = sampler.address_mode_x; + dst_sampler.address_mode_y = sampler.address_mode_y; + dst_sampler.address_mode_z = sampler.address_mode_z; + + int32_t dim_x = 0; + int32_t dim_y = 0; + int32_t dim_z = 0; + + switch(sampler.format) + { + case TensorSamplerFormat::C_W_H: + dim_x = tensor[0]; + dim_y = tensor[1]; + 
dim_z = tensor[2]; + break; + case TensorSamplerFormat::C_WH_1: + dim_x = tensor[0]; + dim_y = tensor[1] * tensor[2]; + dim_z = 1; + break; + default: + std::cout << "Unsupported tensor format" << std::endl; + assert(false); + break; + } + + if(dim_x == 1) + { + assert(step_x == 1); + dst_sampler.address_mode_x = TensorSamplerAddressModeX::None; + } + + if(dim_y == 1) + { + assert(step_y == 1); + dst_sampler.address_mode_y = TensorSamplerAddressModeY::None; + } + + if(dim_z == 1) + { + assert(step_z == 1); + dst_sampler.address_mode_z = TensorSamplerAddressModeZ::None; + } + + return dst_sampler; +} + +class GpuOutputSampler +{ +public: + GpuOutputSampler() = default; + /** + * @brief Method used to initialize the GpuOutputSampler. The GpuOutputSampler can be initialized only once + * by the root component. Once initialized, all simpler components will need to used this sampler + * or a broadcasted version of it + * + * @param[in] sampler GpuSampler + * @param[in] step_x Increment step in the X direction. Not necessarily it is the same of n0 of tile! + * @param[in] step_y Increment step in the Y direction. Not necessarily it is the same of m0 of tile! + * @param[in] step_z Increment step in the Z direction. Not necessarily it is the same of d0 of tile! + */ + void initialize(const TensorInfo *tensor_info_id, GpuSamplerTensorStorage tensor_storage, TensorSamplerFormat tensor_format, int32_t step_x, int32_t step_y, int32_t step_z) + { + assert(_is_initialized == false); + + _step_x = step_x; + _step_y = step_y; + _step_z = step_z; + _tensor_info_id = tensor_info_id; + _sampler = create_sampler(tensor_storage, tensor_format); + _is_initialized = true; + }; + + GpuSampler sampler() const + { + return _sampler; + }; + + int32_t step_x() const + { + return _step_x; + }; + + int32_t step_y() const + { + return _step_y; + }; + + int32_t step_z() const + { + return _step_z; + }; +private: + GpuSampler create_sampler(GpuSamplerTensorStorage tensor_storage, TensorSamplerFormat tensor_format) + { + // Output can only be in output mode + assert(tensor_storage != GpuSamplerTensorStorage::Image2dReadOnly); + assert(tensor_storage != GpuSamplerTensorStorage::Image3dReadOnly); + + auto tensor = _tensor_info_id->shape; + + GpuSampler sampler; + sampler.format = tensor_format; + sampler.storage = tensor_storage; + sampler.address_mode_x = TensorSamplerAddressModeX::None; + sampler.address_mode_y = TensorSamplerAddressModeY::None; + sampler.address_mode_z = TensorSamplerAddressModeZ::None; + + // In the case of texture, we do not need any special checks at the border + if(tensor_storage == GpuSamplerTensorStorage::BufferUint8Ptr) + { + int32_t dim_x = 0; + int32_t dim_y = 0; + int32_t dim_z = 0; + + switch(tensor_format) + { + case TensorSamplerFormat::C_W_H: + dim_x = tensor[0]; + dim_y = tensor[1]; + dim_z = tensor[2]; + break; + case TensorSamplerFormat::C_WH_1: + dim_x = tensor[0]; + dim_y = tensor[1] * tensor[2]; + dim_z = 1; + break; + default: + std::cout << "Unsupported tensor format" << std::endl; + assert(false); + break; + } + + if((dim_x % _step_x) != 0 && dim_x != 1) + { + sampler.address_mode_x = TensorSamplerAddressModeX::OverlappingMin; + } + + if((dim_y % _step_y) != 0 && dim_y != 1) + { + sampler.address_mode_y = TensorSamplerAddressModeY::ClampToMaxEdgeOnly; + } + + if((dim_z % _step_z) != 0 && dim_z != 1) + { + sampler.address_mode_z = TensorSamplerAddressModeZ::ClampToMaxEdgeOnly; + } + } + + return sampler; + } + GpuSampler _sampler { }; // GpuSampler + int32_t _step_x { 1 }; + int32_t 
_step_y { 1 }; + int32_t _step_z { 1 }; + const TensorInfo* _tensor_info_id { nullptr }; + bool _is_initialized { false }; +}; + +/** + * @brief Tensor operand class. This object is used to pass the operands as tensor to the operations performed by the writer. + */ +class TensorOperand +{ +public: + TensorOperand(const std::string &val, GpuSampler sampler) : _str(val), _sampler(sampler) + { + } + + TensorOperand& operator=(const TensorOperand& t) + { + _str = t.value(); + _sampler = t.sampler(); + return *this; + } + + std::string value() const + { + return _str; + } + + GpuSampler sampler() const + { + return _sampler; + } + +private: + std::string _str {}; + GpuSampler _sampler {}; +}; + +/** + * @brief Data structure that contains all the necessary information to write the Gpu kernel with the Gpu kernel Writer + * This data structure must be initialized before being passed to the Gpu Kernel Writer + * + */ +class GpuKernelWriterDataHolder +{ +public: + /** + * @brief Construct a new Gpu Kernel Data object. In this phase, we should also store + * the GPU target and target specific capabilities (extensions). For now, we just initialize the + * programming language + * + * @param[in] language Gpu programming language to use + */ + GpuKernelWriterDataHolder(GpuTargetLanguage language) : tiles(language), arguments(language), code(""), _language(language) + { + } + /** + * @brief Get the Gpu programming language used + * + * @return GpuTargetLanguage the Gpu programming language + */ + GpuTargetLanguage programming_language() const + { + return _language; + } + /** + * @brief @ref GpuTileRegistry + * + */ + GpuTileRegistry tiles{}; + /** + * @brief @ref GpuTensorArgumentRegistry + * + */ + GpuTensorArgumentRegistry arguments{}; + /** + * @brief @ref GpuOutputSampler. + * + */ + GpuOutputSampler output_sampler{}; + /** + * @brief Source code + * + */ + std::string code{}; + + // GpuExtensionRegistry extensions{}; +private: + GpuTargetLanguage _language; +}; + +struct LWS +{ + int32_t x {1}; + int32_t y {1}; + int32_t z {1}; +}; + +/** + * @brief Utility class used to get the tile from the operand. If the operand is not a tile, @ref OperandUnpacker + * declare an anonymous tile in the tile registry. 
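+ * A minimal usage sketch (writer_data is a hypothetical GpuKernelWriterDataHolder; it mirrors how
+ * ClKernelWriter uses this helper below):
+ * @code
+ * OperandUnpacker operands(writer_data.tiles, writer_data.arguments);
+ * IVectorTile *dst = operands.unpack(Operand("dst"));                          // resolves an existing tile
+ * IVectorTile *one = operands.unpack(Operand("1", OperandType::ScalarInt32));  // anonymous constant tile
+ * @endcode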
+ */ +class OperandUnpacker +{ +public: + OperandUnpacker(GpuTileRegistry& tiles, GpuTensorArgumentRegistry& arguments) : _tiles(tiles), _arguments(arguments) + { + // Increase the level of the stack to allocate possible temporary tiles + _tiles.increment_registry_level(); + }; + + ~OperandUnpacker() + { + // Decrease the level of the stack to deallocate any temporary tiles + _tiles.decrement_registry_level(); + } + + IVectorTile* unpack(const Operand& src) + { + // Get the tile + if(src.type() == OperandType::Tile) + { + assert(_tiles.has_tile(src.value())); + return _tiles[src.value()]; + } + // Create an anonymous tile with a constant + else if(static_cast(src.type()) & 0x00001000) + { + if(src.type() == OperandType::ScalarTile) + { + ScalarTileCoord coord = src.scalar_tile_coordinate(); + assert(_tiles.has_tile(src.value())); + assert(coord.x >= 0); + assert(coord.y >= 0); + auto val = _tiles[src.value()]->scalar(coord.x, coord.y); + return _tiles.insert({{{val.str}}}, val.type.dt); + } + else + { + return _tiles.insert({{{src.value()}}}, to_tile_data_type(src.type())); + } + } + // Create an anonymous tile with the tensor component + else + { + assert(_arguments.has_tensor_argument(src.value())); + auto x = _arguments[src.value()]; + const std::string val = x->component(to_tensor_component(src.type())); + const DataType dt = x->component_data_type(); + return _tiles.insert({{{val}}}, dt); + } + } + +private: + DataType to_tile_data_type(OperandType x) + { + return static_cast(static_cast(x) & 0x00ff); + } + + TensorComponent to_tensor_component(OperandType x) + { + switch(x) + { + case OperandType::TensorDim0: + return TensorComponent::Dim0; + case OperandType::TensorDim1: + return TensorComponent::Dim1; + case OperandType::TensorDim2: + return TensorComponent::Dim2; + case OperandType::TensorDim3: + return TensorComponent::Dim3; + case OperandType::TensorDim4: + return TensorComponent::Dim4; + case OperandType::TensorStride1: + return TensorComponent::Stride1; + case OperandType::TensorStride2: + return TensorComponent::Stride2; + case OperandType::TensorStride3: + return TensorComponent::Stride3; + case OperandType::TensorStride4: + return TensorComponent::Stride4; + case OperandType::TensorDim1xDim2: + return TensorComponent::Dim1xDim2; + case OperandType::TensorDim1xDim2xDim3: + return TensorComponent::Dim1xDim2xDim3; + case OperandType::TensorDataOffset: + return TensorComponent::OffsetFirstElement; + default: + assert(false); + return TensorComponent::Unknown; + } + } + + GpuTileRegistry& _tiles; + GpuTensorArgumentRegistry& _arguments; +}; + +/** + * @brief Utility class used to get the tensor argument from the operand. If the operand is not a tile, @ref OperandUnpacker + * declare an anonymous tile in the tile registry. + * Tensor dimension reduction aims for reducing the tensor data dimension while keeping data's tensor structure. 
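+ * In practice this helper only resolves the TensorOperand reference name to its IGpuTensorArgument through
+ * the tensor argument registry (see unpack() below); unlike OperandUnpacker, it does not create any tile.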
+ */ +class TensorOperandUnpacker +{ +public: + TensorOperandUnpacker(GpuTensorArgumentRegistry& arguments) : _arguments(arguments) + { + }; + + IGpuTensorArgument* unpack(const TensorOperand& src) + { + assert(_arguments.has_tensor_argument(src.value())); + return _arguments[src.value()]; + } + +private: + GpuTensorArgumentRegistry& _arguments; +}; + +/** + * @brief The GpuKernel will be used in three occasions (stages): + * #- Compilation stage + * #- Tuning stage + * #- Dispatch stage + */ +struct GpuKernel +{ + // Compilation stage + std::string code {}; // Source code, required for the compilation stage + std::vector list_extensions{}; // Extensions, required for the compilation stage + // Tuning stage + std::string config_id {}; // Unique id, required for the tuning stage + std::vector list_lws{}; // LWS to test, required for the tuning stage + // Dispatch stage + GpuOutputSampler output_sampler{}; // GpuOutputSampler, required for the dispatch stage + std::vector> list_tensor_storages; // List of tensor storages, required for the dispatch stage + std::vector> list_tensor_components;// List of tensor components (width, stride,..), required for the dispatch stage) +}; + +// This function should produce an object with the source +inline std::string generate_code(GpuKernelWriterDataHolder &in, const std::string& name) +{ + std::string code; + code += "__kernel void "; + code += name; + code += "(\n"; + + auto IdSpaces = in.arguments.IdSpace_declarations(); + + std::vector arg_str; + + auto tensor_args = in.arguments.tensor_argument_declarations(); + + for(auto &i : tensor_args) + { + // For each tensor used, get the storage and tensor components + auto storages = i->storage_declarations(); + auto components = i->component_declarations(); + + for(auto &y : storages) + { + std::string str; + str += i->storage_type_declaration(y); + str += " "; + str += i->storage(y); + arg_str.push_back(str); + } + + for(auto &y : components) + { + std::string str; + str += i->component_type_declaration(); + str += " "; + str += i->component(y); + arg_str.push_back(str); + } + } + + for(size_t i = 0; i < arg_str.size(); ++i) + { + code += arg_str[i]; + if(i + 1 < arg_str.size()) + { + code += ",\n"; + } + } + + code += ")\n"; + code += "{\n"; + code += in.code; + code += "}\n"; + + return code; +} + +/** + * @brief This class is responsible to map a N-Tensor to a 3d tensor. 
The mapper needs the GpuSampler to know + * how to reduce the dimensionality of a tensor + * + */ +class GpuTensor3dMapper +{ +public: + GpuTensor3dMapper(IGpuTensorArgument* tensor, GpuSampler sampler) : _sampler(sampler), _tensor(tensor) + { + }; + + std::string tensor_component_x() const + { + const auto format = _sampler.format; + switch(format) + { + case TensorSamplerFormat::C_WH_1: + case TensorSamplerFormat::C_W_H: + return _tensor->component(TensorComponent::C); + default: + std::cout << "Unsupported tensor format" << std::endl; + assert(false); + return ""; + } + } + + std::string tensor_component_y() const + { + const auto format = _sampler.format; + switch(format) + { + case TensorSamplerFormat::C_WH_1: + return _tensor->component(TensorComponent::WxH); + case TensorSamplerFormat::C_W_H: + return _tensor->component(TensorComponent::W); + default: + std::cout << "Unsupported tensor format" << std::endl; + assert(false); + return ""; + } + } + + std::string tensor_component_z() const + { + const auto format = _sampler.format; + switch(format) + { + case TensorSamplerFormat::C_WH_1: + return "1"; + case TensorSamplerFormat::C_W_H: + return _tensor->component(TensorComponent::H); + default: + std::cout << "Unsupported tensor format" << std::endl; + assert(false); + return ""; + } + } + + std::string tensor_component_stride_y() const + { + const auto format = _sampler.format; + switch(format) + { + case TensorSamplerFormat::C_WH_1: + case TensorSamplerFormat::C_W_H: + return _tensor->component(TensorComponent::Stride1); + default: + std::cout << "Unsupported tensor format" << std::endl; + assert(false); + return ""; + } + } + + std::string tensor_component_stride_z() const + { + const auto format = _sampler.format; + switch(format) + { + case TensorSamplerFormat::C_WH_1: + return "0"; + case TensorSamplerFormat::C_W_H: + return _tensor->component(TensorComponent::Stride2); + default: + std::cout << "Unsupported tensor format" << std::endl; + assert(false); + return ""; + } + } + + std::string tensor_component_stride_batch() const + { + const auto format = _sampler.format; + switch(format) + { + case TensorSamplerFormat::C_WH_1: + case TensorSamplerFormat::C_W_H: + return _tensor->component(TensorComponent::Stride3); + default: + std::cout << "Unsupported tensor format" << std::endl; + assert(false); + return ""; + } + } + + bool is_one_component_x() const + { + auto t = _tensor->format(); + const auto format = _sampler.format; + switch(format) + { + case TensorSamplerFormat::C_WH_1: + case TensorSamplerFormat::C_W_H: + return t.shape[0] == 1; + default: + std::cout << "Unsupported tensor format" << std::endl; + assert(false); + return ""; + } + } + + bool is_one_component_y() const + { + auto t = _tensor->format(); + const auto format = _sampler.format; + switch(format) + { + case TensorSamplerFormat::C_WH_1: + return (t.shape[1] * t.shape[2]) == 1; + case TensorSamplerFormat::C_W_H: + return t.shape[1] == 1; + default: + std::cout << "Unsupported tensor format" << std::endl; + assert(false); + return ""; + } + } + + bool is_one_component_z() const + { + auto t = _tensor->format(); + const auto format = _sampler.format; + switch(format) + { + case TensorSamplerFormat::C_WH_1: + return true; + case TensorSamplerFormat::C_W_H: + return t.shape[2] == 1; + default: + std::cout << "Unsupported tensor format" << std::endl; + assert(false); + return ""; + } + } + + bool is_one_component_batch() const + { + auto t = _tensor->format(); + const auto format = _sampler.format; + switch(format) 
+ { + case TensorSamplerFormat::C_WH_1: + case TensorSamplerFormat::C_W_H: + return t.shape[3] == 1; + default: + std::cout << "Unsupported tensor format" << std::endl; + assert(false); + return ""; + } + } + + GpuSampler gpu_sampler() const + { + return _sampler; + } + + IGpuTensorArgument* tensor_argument() const + { + return _tensor; + } + +private: + GpuSampler _sampler; + IGpuTensorArgument* _tensor; +}; + +struct GpuKernelWriterAttribute +{ + bool return_tensor_component_by_value { false }; +}; + +enum class ConvertPolicy +{ + Wrap, /**< Wrap around */ + Saturate /**< Saturate */ +}; + +enum class RoundingMode +{ + None, + Rte, + Rtz, + Rtp, + Rtn +}; + +// https://llvm.org/docs/tutorial/MyFirstLanguageFrontend/LangImpl05.html +class IGpuKernelWriter +{ +public: + virtual ~IGpuKernelWriter() = default; + virtual void set_IdSpace(int32_t id) = 0; + virtual void import_tile(const std::string& dst, const IVectorTile *src) = 0; + virtual void declare_argument(const std::string& name, const TensorInfo& tensor) = 0; + virtual void declare_tile(const std::string& name, const TileInfo& info) = 0; + virtual void declare_const_tile(const std::string& name, const std::vector>& in, DataType dt) = 0; + virtual void write_text(const std::string& x) = 0; + virtual void compound_statement_begin() = 0; + virtual void compound_statement_end() = 0; + + // Operations + virtual void op_get_global_id(const Operand& dst_var, int32_t dim) = 0; + virtual void op_get_global_coord(const Operand& dst, const Operand& step, const TensorOperand& tensor, int32_t dim) = 0; + virtual void op_get_global_batch(const Operand& dst, const TensorOperand& tensor) = 0; + virtual void op_get_global_size(const Operand& dst_var, int32_t dim) = 0; + virtual void op_binary_expression(const Operand& dst, const Operand &lhs, BinaryOp op, const Operand &rhs) = 0; + virtual void op_assign(const Operand& dst_name, const Operand& src_name) = 0; + virtual void op_scalar_function(const Operand& dst_name, const Operand& src_name, ScalarUnaryFunction func) = 0; + virtual void op_if(const Operand& lhs, BinaryOp op, const Operand& rhs) = 0; + virtual void op_for_loop(const Operand& var_name, BinaryOp cond_op, const Operand& cond_value, AssignmentOp update_op, const Operand& update_value) = 0; + virtual void op_load_indirect(const TensorOperand& tensor, const Operand& dst, const Operand& x, const Operand& y_indirect, const Operand& z, const Operand& b = Operand("0", OperandType::ScalarInt32)) = 0; + virtual void op_load_immediate(const TensorOperand& tensor, const Operand& dst, const Operand& x, const Operand& y, const Operand& z, const Operand& b = Operand("0", OperandType::ScalarInt32), const Operand& dilation_y = Operand("1", OperandType::ScalarInt32)) = 0; + virtual void op_store_immediate(const TensorOperand& tensor, const Operand& src, const Operand& x, const Operand& y, const Operand& z, const Operand& b = Operand("0", OperandType::ScalarInt32)) = 0; + virtual void op_cast_expression(const Operand& dst, const Operand &src, ConvertPolicy policy) = 0; + virtual void op_return() = 0; + // virtual void op_else() = 0; + // virtual void op_elseif() = 0; + // Utils + // It is the process of converting + virtual void util_get_indirect_buffer(const Operand& dst, const TensorOperand& tensor, const Operand& x, const Operand& y, const Operand& x_off, const Operand& y_off) = 0; +}; + +enum class GpuLoadStoreType +{ + Load = 1, + Store = 2 +}; + +class IGpuLoadStoreHelperWriter +{ +public: + IGpuLoadStoreHelperWriter(IGpuKernelWriter *x, 
GpuTensor3dMapper mapper, GpuLoadStoreType type) : _writer(x), _mapper(mapper), _type(type) {} + IGpuLoadStoreHelperWriter(const IGpuLoadStoreHelperWriter &) = default; + IGpuLoadStoreHelperWriter &operator=(const IGpuLoadStoreHelperWriter &) = default; + virtual ~IGpuLoadStoreHelperWriter() = default; + virtual void initialize(IVectorTile *dst, IVectorTile *x, IVectorTile *z, IVectorTile *b) = 0; + virtual void write(const std::pair& y) = 0; + virtual void finalize() = 0; +protected: + IGpuKernelWriter* _writer; + GpuTensor3dMapper _mapper; + GpuLoadStoreType _type; +}; + +class ClLoadStoreBufferHelperWriter : public IGpuLoadStoreHelperWriter +{ +public: + ClLoadStoreBufferHelperWriter(IGpuKernelWriter *x, const GpuTensor3dMapper& mapper, GpuLoadStoreType type) : IGpuLoadStoreHelperWriter(x, mapper, type) + { + } + + ClLoadStoreBufferHelperWriter(const ClLoadStoreBufferHelperWriter &) = default; + ClLoadStoreBufferHelperWriter &operator=(const ClLoadStoreBufferHelperWriter &) = default; + + static bool validate(IGpuKernelWriter *x, GpuTensor3dMapper mapper, GpuLoadStoreType type, IVectorTile *dst) + { + CKW_UNUSED(x, type, dst); + + if(mapper.gpu_sampler().storage != GpuSamplerTensorStorage::BufferUint8Ptr) + { + return false; + } + return true; + } + + void initialize(IVectorTile *dst, IVectorTile *x, IVectorTile *z, IVectorTile *b) override + { + assert(validate(_writer, _mapper, _type, dst)); + + _dst = dst; + _ls_width_full = dst->format().w; + + _coord_x = x->scalar(0, 0).str; + _coord_z = z->scalar(0, 0).str; + _coord_b = b->scalar(0, 0).str; + _coord_orig_z = _coord_z; + + out_of_bound_initialize_x(_coord_x); + out_of_bound_initialize_z(_coord_z); + + /* + meaning of else: + - x: partial load/store + - y: no load/store operation + - z: no load/store operation + if(x) + { + if(z) + { + if(y) + { + // full load/store width + } + else + { + // no load/store + } + } + else + { + // no load/store + } + } + else + { + if(z) + { + if(y) + { + // partial load/store width + } + else + { + // no load/store + } + } + else + { + // no load/store + } + } + */ + } + + void write(const std::pair& y) override + { + int32_t idx_y = y.first; + std::string coord_y = y.second; + + // The only check required is on Y. 
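+ // The X and Z out-of-bound guards are opened once in initialize() (out_of_bound_initialize_x/z) and
+ // closed in finalize(), so each row written here only needs its own Y guard.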
+ out_of_bound_initialize_y(coord_y); + + const std::string dst = _dst->vector(idx_y).str; + const std::string address = to_ls_buffer_address(_coord_x, coord_y, _coord_z, _coord_b); + const std::string ls_buf = to_ls_buffer(_type, _ls_width_full, dst, address); + + _writer->write_text(ls_buf); + _writer->write_text(";\n"); + + out_of_bound_finalize_y(dst); + + // The left over load/store will be written in the finalize stage + if(_ls_width_part.size() != 0) + { + int32_t w = 0; + for(auto &p : _ls_width_part) + { + const std::string dst0 = _dst->vector(w, p, idx_y).str; + const std::string coord_x = _coord_x + " + " + std::to_string(w); + const std::string address = to_ls_buffer_address(coord_x, coord_y, _coord_z, _coord_b); + const std::string ls_buf0 = to_ls_buffer(_type, p, dst0, address); + _leftovers_x.push_back(std::make_pair(std::make_pair(dst0, coord_y), ls_buf0)); + + w += p; + } + } + } + + void finalize() override + { + out_of_bound_finalize_z(); + out_of_bound_finalize_x(); + } +private: + IVectorTile* _dst { nullptr }; + int32_t _ls_width_full { 0 }; + std::vector _ls_width_part { }; + std::vector, std::string>> _leftovers_x {}; + std::string _coord_x {}; + std::string _coord_z {}; + std::string _coord_orig_z {}; + std::string _coord_b {}; + + void out_of_bound_initialize_x(std::string& coord) + { + if(_mapper.gpu_sampler().address_mode_x == TensorSamplerAddressModeX::OverlappingMin) + { + auto tensor_format = _mapper.tensor_argument()->format(); + auto shape = tensor_format.shape; + + _ls_width_part = decompose_leftover_ls_vector_width(shape[0] % _ls_width_full); + if(_ls_width_part.size() != 0) + { + _writer->write_text("if(" + coord + " > 0)\n"); + _writer->compound_statement_begin(); + } + } + }; + + void out_of_bound_finalize_x() + { + if(_mapper.gpu_sampler().address_mode_x == TensorSamplerAddressModeX::OverlappingMin) + { + if(_ls_width_part.size() != 0) + { + _writer->compound_statement_end(); + _writer->write_text("else\n"); + _writer->compound_statement_begin(); + + out_of_bound_initialize_z(_coord_orig_z); + for(auto &i : _leftovers_x) + { + out_of_bound_initialize_y(i.first.second); + _writer->write_text(i.second); + _writer->write_text(";\n"); + out_of_bound_finalize_y(i.first.first); + } + out_of_bound_finalize_z(); + _writer->compound_statement_end(); + } + } + }; + + void out_of_bound_initialize_y(std::string& coord) + { + std::string max = ""; + + const auto address_mode_y = _mapper.gpu_sampler().address_mode_y; + + switch(address_mode_y) + { + case TensorSamplerAddressModeY::Skip: + case TensorSamplerAddressModeY::ClampToBorder: + // NOTE: This line should not be moved outside of the switch statement. + // The reason for that is because when we query the component, the component is marked as used + // and added to the list of arguments of the kernel. 
Since, not in all cases this component is required, + // we should request the component only when used + max = _mapper.tensor_component_y(); + _writer->write_text("if((" + coord + " >= 0) && (" + coord + " < " + max + "))\n"); + _writer->compound_statement_begin(); + break; + case TensorSamplerAddressModeY::SkipMinEdgeOnly: + case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly: + _writer->write_text("if(" + coord + " >= 0)\n"); + _writer->compound_statement_begin(); + break; + case TensorSamplerAddressModeY::SkipMaxEdgeOnly: + case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly: + max = _mapper.tensor_component_y(); + _writer->write_text("if(" + coord + " < " + max + ")\n"); + _writer->compound_statement_begin(); + break; + case TensorSamplerAddressModeY::ClampToNearest: + max = _mapper.tensor_component_y(); + coord = "clamp(" + coord + ", 0, " + max + " - 1)"; + break; + case TensorSamplerAddressModeY::ClampToMaxEdgeOnly: + max = _mapper.tensor_component_y(); + coord = "min(" + coord + ", " + max + " - 1)"; + break; + case TensorSamplerAddressModeY::ClampToMinEdgeOnly: + coord = "max(" + coord + ", 0)"; + break; + case TensorSamplerAddressModeY::None: + break; + default: + std::cout << "Unsupported address mode for write_out_of_bound_check_yz" << std::endl; + assert(false); + } + }; + + void out_of_bound_finalize_y(const std::string& dst) + { + const auto address_mode_y = _mapper.gpu_sampler().address_mode_y; + + switch(address_mode_y) + { + case TensorSamplerAddressModeY::ClampToBorder: + case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly: + case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly: + case TensorSamplerAddressModeY::Skip: + case TensorSamplerAddressModeY::SkipMaxEdgeOnly: + case TensorSamplerAddressModeY::SkipMinEdgeOnly: + _writer->compound_statement_end(); + break; + + default: + assert(false); + } + + switch(address_mode_y) + { + case TensorSamplerAddressModeY::ClampToBorder: + case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly: + case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly: + _writer->write_text("else\n"); + _writer->compound_statement_begin(); + _writer->write_text(dst); + _writer->write_text(" = 0.0f;\n"); + _writer->compound_statement_end(); + break; + + default: + assert(false); + } + }; + + void out_of_bound_initialize_z(std::string& coord) + { + std::string max = ""; + + const auto address_mode_z = _mapper.gpu_sampler().address_mode_z; + + switch(address_mode_z) + { + case TensorSamplerAddressModeZ::Skip: + max = _mapper.tensor_component_z(); + _writer->write_text("if((" + coord + " >= 0) && (" + coord + " < " + max + "))\n"); + _writer->compound_statement_begin(); + break; + case TensorSamplerAddressModeZ::SkipMinEdgeOnly: + _writer->write_text("if(" + coord + " >= 0)\n"); + _writer->compound_statement_begin(); + break; + case TensorSamplerAddressModeZ::SkipMaxEdgeOnly: + max = _mapper.tensor_component_z(); + _writer->write_text("if(" + coord + " < " + max + ")\n"); + _writer->compound_statement_begin(); + break; + case TensorSamplerAddressModeZ::ClampToNearest: + max = _mapper.tensor_component_z(); + coord = "clamp(" + coord + ", 0, " + max + " - 1)"; + break; + case TensorSamplerAddressModeZ::ClampToMaxEdgeOnly: + max = _mapper.tensor_component_z(); + coord = "min(" + coord + ", " + max + " - 1)"; + break; + case TensorSamplerAddressModeZ::ClampToMinEdgeOnly: + coord = "max(" + coord + ", 0)"; + break; + case TensorSamplerAddressModeZ::None: + break; + default: + std::cout << "Unsupported address mode for 
write_out_of_bound_check_yz" << std::endl; + assert(false); + } + }; + + void out_of_bound_finalize_z() + { + const auto address_mode_z = _mapper.gpu_sampler().address_mode_z; + + switch(address_mode_z) + { + case TensorSamplerAddressModeZ::Skip: + case TensorSamplerAddressModeZ::SkipMinEdgeOnly: + case TensorSamplerAddressModeZ::SkipMaxEdgeOnly: + _writer->compound_statement_end(); + break; + + default: + assert(false); + } + }; + + std::vector decompose_leftover_ls_vector_width(int32_t ls_leftover_vector_width) const + { + std::vector x; + + switch(ls_leftover_vector_width) + { + case 0: + break; + case 1: + case 2: + case 3: + case 4: + case 8: + case 16: + x.push_back(ls_leftover_vector_width); + break; + case 5: + x.push_back(4); + x.push_back(1); + break; + case 6: + x.push_back(4); + x.push_back(2); + break; + case 7: + x.push_back(4); + x.push_back(3); + break; + case 9: + x.push_back(8); + x.push_back(1); + break; + case 10: + x.push_back(8); + x.push_back(2); + break; + case 11: + x.push_back(8); + x.push_back(3); + break; + case 12: + x.push_back(8); + x.push_back(4); + break; + case 13: + x.push_back(8); + x.push_back(4); + x.push_back(1); + break; + case 14: + x.push_back(8); + x.push_back(4); + x.push_back(2); + break; + case 15: + x.push_back(8); + x.push_back(4); + x.push_back(3); + break; + + default: + assert(false); + } + return x; + } + + std::string to_ls_buffer(GpuLoadStoreType type, int32_t vector_width, const std::string& data, const std::string& address) + { + switch(type) + { + case GpuLoadStoreType::Load: + if(vector_width != 1) + { + return data + " = vload" + std::to_string(vector_width) + "(0, " + address + ")"; + } + else + { + return data + " = *(" + address + ")"; + } + break; + case GpuLoadStoreType::Store: + if(vector_width != 1) + { + return "vstore" + std::to_string(vector_width) + "(" + data + ", 0, " + address + ")"; + } + else + { + return "*(" + address + ") = " + data; + } + break; + default: + std::cout << "Unsupported GpuLoadStoreType" << std::endl; + assert(false); + return ""; + } + } + + std::string to_ls_buffer_address(const std::string& x, const std::string& y, const std::string& z, const std::string& b) const + { + auto tensor_storage = static_cast(_mapper.gpu_sampler().storage); + assert(tensor_storage == GpuTensorStorage::BufferUint8Ptr); + const std::string ptr_buf = _mapper.tensor_argument()->storage(tensor_storage); + const std::string dst_type = get_cl_data_type(_dst->format().dt, 1); + + std::string address; + address += "(__global "; + address += dst_type; + address += "*)("; + address += ptr_buf; + if(x != "0" && (_mapper.is_one_component_x() != true)) + { + address += " + ("; + address += x + ") * sizeof(" + dst_type + ")"; + } + if(y != "0" && (_mapper.is_one_component_y() != true)) + { + const std::string stride_y = _mapper.tensor_component_stride_y(); + address += " + ("; + address += y + ")"; + address += " * "; + address += stride_y; + } + if(z != "0" && (_mapper.is_one_component_z() != true)) + { + const std::string stride_z = _mapper.tensor_component_stride_z(); + address += " + ("; + address += z + ")"; + address += " * "; + address += stride_z; + } + if(b != "0" && (_mapper.is_one_component_batch() != true)) + { + const std::string stride_b = _mapper.tensor_component_stride_batch(); + address += " + ("; + address += b + ")"; + address += " * "; + address += stride_b; + } + address += ")"; + return address; + } +}; + +class ClLoadStoreImage2dHelperWriter : public IGpuLoadStoreHelperWriter +{ +public: + static bool 
validate(IGpuKernelWriter *x, const GpuTensor3dMapper& mapper, GpuLoadStoreType type, IVectorTile *dst) + { + CKW_UNUSED(x); + + if(dst->format().w != 4) + { + return false; + } + if(mapper.gpu_sampler().address_mode_x != TensorSamplerAddressModeX::None) + { + return false; + } + if(mapper.gpu_sampler().address_mode_z != TensorSamplerAddressModeZ::None) + { + return false; + } + if(mapper.gpu_sampler().storage != GpuSamplerTensorStorage::Image2dReadOnly && type == GpuLoadStoreType::Load) + { + return false; + } + if(mapper.gpu_sampler().storage != GpuSamplerTensorStorage::Image2dWriteOnly && type == GpuLoadStoreType::Store) + { + return false; + } + if((dst->format().dt != DataType::Fp32) && (dst->format().dt != DataType::Fp16)) + { + return false; + } + return true; + /* + - x: Only GpuSamplerAddressModeX::None is supported and vector length = 4 + - z: Only GpuSamplerAddressModeZ::None is supported + */ + } + ClLoadStoreImage2dHelperWriter(IGpuKernelWriter *x, const GpuTensor3dMapper& mapper, GpuLoadStoreType type) : IGpuLoadStoreHelperWriter(x, mapper, type) + { + } + + ClLoadStoreImage2dHelperWriter(const ClLoadStoreImage2dHelperWriter &) = default; + ClLoadStoreImage2dHelperWriter &operator=(const ClLoadStoreImage2dHelperWriter &) = default; + + void initialize(IVectorTile *dst, IVectorTile *x, IVectorTile *z, IVectorTile *b) override + { + assert(validate(_writer, _mapper, _type, dst)); + + _dst = dst; + _ls_width_full = dst->format().w; + _coord_x = x->scalar(0, 0).str; + _coord_z = z->scalar(0, 0).str; + _coord_b = b->scalar(0, 0).str; + + /* + if(y) + { + // full load/store width + } + else + { + // no load/store + } + */ + } + + void write(const std::pair& y) override + { + int32_t idx_y = y.first; + std::string coord_y = y.second; + + // The only check required is on Y. 
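+ // validate() restricts this path to TensorSamplerAddressModeX/Z::None, so no X or Z guard is required;
+ // only the Y coordinate is range-checked per row.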
+ out_of_bound_initialize_y(coord_y); + + const std::string dst = _dst->vector(idx_y).str; + const std::string sampler = to_ls_image2d_sampler(); + const std::string coord = to_ls_image2d_coord(_coord_x, coord_y, _coord_z, _coord_b); + const std::string ls_buf = to_ls_image2d(_type, _ls_width_full, dst, sampler, coord); + + _writer->write_text(ls_buf); + _writer->write_text(";\n"); + + out_of_bound_finalize_y(dst); + } + + void finalize() override + { + } +private: + IVectorTile* _dst { nullptr }; + int32_t _ls_width_full { 0 }; + std::string _coord_x {}; + std::string _coord_z {}; + std::string _coord_b {}; + + void out_of_bound_initialize_y(std::string& coord) + { + std::string max = ""; + + const auto address_mode_y = _mapper.gpu_sampler().address_mode_y; + + switch(address_mode_y) + { + case TensorSamplerAddressModeY::Skip: + max = _mapper.tensor_component_y(); + _writer->write_text("if((" + coord + " >= 0) && (" + coord + " < " + max + "))\n"); + _writer->compound_statement_begin(); + break; + case TensorSamplerAddressModeY::SkipMinEdgeOnly: + _writer->write_text("if(" + coord + " >= 0)\n"); + _writer->compound_statement_begin(); + break; + case TensorSamplerAddressModeY::SkipMaxEdgeOnly: + max = _mapper.tensor_component_y(); + _writer->write_text("if(" + coord + " < " + max + ")\n"); + _writer->compound_statement_begin(); + break; + case TensorSamplerAddressModeY::ClampToBorder: + case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly: + case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly: + case TensorSamplerAddressModeY::ClampToNearest: + case TensorSamplerAddressModeY::ClampToMaxEdgeOnly: + case TensorSamplerAddressModeY::ClampToMinEdgeOnly: + case TensorSamplerAddressModeY::None: + break; + default: + std::cout << "Unsupported address mode for write_out_of_bound_check_y" << std::endl; + assert(false); + } + }; + + void out_of_bound_finalize_y(const std::string& dst) + { + CKW_UNUSED(dst); + + const auto address_mode_y = _mapper.gpu_sampler().address_mode_y; + + switch(address_mode_y) + { + case TensorSamplerAddressModeY::Skip: + case TensorSamplerAddressModeY::SkipMinEdgeOnly: + case TensorSamplerAddressModeY::SkipMaxEdgeOnly: + _writer->compound_statement_end(); + break; + + default: + assert(false); + } + }; + + std::string to_ls_image2d(GpuLoadStoreType type, int32_t vector_width, const std::string& data, const std::string& sampler, const std::string& coord) + { + CKW_UNUSED(vector_width); + + auto tensor_storage = static_cast(_mapper.gpu_sampler().storage); + const std::string image2d_obj = _mapper.tensor_argument()->storage(tensor_storage); + // const DataType dt = _dst->format().dt; + const std::string post_fix = _dst->format().dt == DataType::Fp32? 
"f" : "h"; + + switch(type) + { + case GpuLoadStoreType::Load: + return data + " = read_image" + post_fix + "(" + image2d_obj + ", " + sampler + ", " + coord + ")"; + break; + case GpuLoadStoreType::Store: + return "write_image" + post_fix + "(" + image2d_obj + ", " + coord + ", " + data + ")"; + default: + assert(false); + std::cout << "Unsupported GpuLoadStoreType" << std::endl; + assert(false); + return ""; + } + } + + std::string to_ls_image2d_sampler() const + { + const auto address_mode_y = _mapper.gpu_sampler().address_mode_y; + + switch(address_mode_y) + { + case TensorSamplerAddressModeY::None: + return "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST"; + case TensorSamplerAddressModeY::Skip: + case TensorSamplerAddressModeY::SkipMinEdgeOnly: + case TensorSamplerAddressModeY::SkipMaxEdgeOnly: + case TensorSamplerAddressModeY::ClampToBorder: + case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly: + case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly: + return "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST"; + case TensorSamplerAddressModeY::ClampToNearest: + case TensorSamplerAddressModeY::ClampToMaxEdgeOnly: + case TensorSamplerAddressModeY::ClampToMinEdgeOnly: + return "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST"; + default: + std::cout << "Unsupported address_mode_coord" << std::endl; + assert(false); + return ""; + } + } + + std::string to_ls_image2d_coord(const std::string& x, const std::string& y, const std::string& z, const std::string& b) const + { + std::string coord_x = "(" + x + ") >> 2"; + std::string coord_y = "("; + + if(y != "0" && (_mapper.is_one_component_y() != true)) + { + coord_y += y; + } + if(z != "0" && (_mapper.is_one_component_z() != true)) + { + const std::string dim = _mapper.tensor_component_y(); + coord_y += " + ("; + coord_y += z + ")"; + coord_y += " * "; + coord_y += dim; + } + if(b != "0" && (_mapper.is_one_component_batch() != true)) + { + const std::string dim0 = _mapper.tensor_component_y(); + const std::string dim1 = _mapper.tensor_component_z(); + coord_y += " + ("; + coord_y += b + ")"; + coord_y += " * "; + coord_y += dim0; + coord_y += " * "; + coord_y += dim1; + } + coord_y += ")"; + return "(int2)(" + coord_x + ", " + coord_y + ")"; + } +}; + +/** IGpuLoadStoreHelperWriter factory class */ +class ClLoadStoreHelperWriterFactory final +{ +public: + /** Static method to call the IGpuLoadStoreHelperWriter class accordingly with the tensor storage set in the mapper + * + * + * @return IGpuLoadStoreHelperWriter + */ + static std::unique_ptr create(IGpuKernelWriter *x, const GpuTensor3dMapper& mapper, GpuLoadStoreType type) + { + const auto tensor_storage = mapper.gpu_sampler().storage; + switch(tensor_storage) + { + case GpuSamplerTensorStorage::BufferUint8Ptr: + return std::make_unique(x, mapper, type); + case GpuSamplerTensorStorage::Image2dReadOnly: + case GpuSamplerTensorStorage::Image2dWriteOnly: + return std::make_unique(x, mapper, type); + default: + std::cout << "Unsupported Gpu tensor storage" << std::endl; + assert(false); + return nullptr; + } + } +}; + +// This utility method needs to go in utils.h +inline bool is_tile_scalar(IVectorTile* x) +{ + return x->format().w == 1 && x->format().h == 1; +} + +class ClKernelWriter : public IGpuKernelWriter +{ +public: + ClKernelWriter(GpuKernelWriterAttribute *attr, GpuKernelWriterDataHolder *x) + { + _data = x; + _attr = attr; + } + + ClKernelWriter(const ClKernelWriter &) = default; + ClKernelWriter 
&operator=(const ClKernelWriter &) = default; + + // A IdSpaced ID is a term used to describe a fragment that is registered in ICode to ensure + // there are no conflicts or ambiguity in the code + void set_IdSpace(int32_t id) override + { + _data->tiles.set_IdSpace(id); + _data->arguments.set_IdSpace(id); + } + + void import_tile(const std::string& dst_name, const IVectorTile *src) override + { + _data->tiles.insert(dst_name, src); + } + + void declare_argument(const std::string& name, const TensorInfo& tensor) override + { + assert(_data->arguments[name] == nullptr); + _data->arguments.insert(name, tensor, _attr->return_tensor_component_by_value); + } + + void declare_tile(const std::string& name, const TileInfo& format) override + { + assert(_data->tiles[name] == nullptr); + _data->tiles.insert(name, format); + + IVectorTile *x = _data->tiles[name]; + + for(auto &t : x->underlying_source_variables()) + { + _data->code += t.type.str + " " + t.str + ";\n"; + } + } + + void declare_const_tile(const std::string& name, const std::vector>& in, DataType dt) override + { + assert(_data->tiles[name] == nullptr); + _data->tiles.insert(name, in, dt); + // Note: A constant does not need to be declared in the code + } + + void write_text(const std::string& x) override + { + _data->code += x; + } + + void compound_statement_begin() override + { + _data->tiles.increment_registry_level(); + _data->code += "{\n"; + } + + void compound_statement_end() override + { + _data->tiles.decrement_registry_level(); + _data->code += "}\n"; + } + + void op_get_global_id(const Operand& dst_var, int32_t dim) override + { + assert(dst_var.type() == OperandType::Tile); + assert(_data->tiles.has_tile(dst_var.value())); + assert(_data->tiles[dst_var.value()]->format().w == 1 && + _data->tiles[dst_var.value()]->format().h == 1); // It must be a scalar variable + + auto var = _data->tiles[dst_var.value()]; + + _data->code += var->scalar(0, 0).str; + _data->code += " = get_global_id("; + _data->code += std::to_string(dim); + _data->code += ");\n"; + }; + + void op_get_global_coord(const Operand& o_dst, const Operand& o_step, const TensorOperand& o_tensor, int32_t dim) override + { + OperandUnpacker operands(_data->tiles, _data->arguments); + auto dst = operands.unpack(o_dst); + auto step = operands.unpack(o_step); + + // Validation: Check that x, y and z are scalar + + TensorOperandUnpacker tensor_operands(_data->arguments); + auto tensor = tensor_operands.unpack(o_tensor); + auto gpu_sampler = o_tensor.sampler(); + + GpuTensor3dMapper mapper(tensor, gpu_sampler); + + switch (dim) + { + case 0: + if(mapper.is_one_component_x()) + { + _data->code += dst->scalar(0, 0).str; + _data->code += " = 0;\n"; + } + else + { + if(mapper.gpu_sampler().address_mode_x == TensorSamplerAddressModeX::OverlappingMin) + { + // Validation: Check: fixed tensor shape + // TO BE CHANGED + _data->code += dst->scalar(0, 0).str; + _data->code += " = get_global_id(0) * "; + _data->code += step->scalar(0, 0).str; + _data->code += ";\n"; + } + else + { + _data->code += dst->scalar(0, 0).str; + _data->code += " = get_global_id(0) * "; + _data->code += step->scalar(0, 0).str; + _data->code += ";\n"; + } + } + break; + case 1: + if(mapper.is_one_component_y()) + { + _data->code += dst->scalar(0, 0).str; + _data->code += " = 0;\n"; + } + else + { + if(mapper.gpu_sampler().address_mode_y == TensorSamplerAddressModeY::OverlappingMin) + { + + } + else + { + _data->code += dst->scalar(0, 0).str; + _data->code += " = get_global_id(1) * "; + _data->code += 
step->scalar(0, 0).str; + _data->code += ";\n"; + } + } + break; + case 2: + if(mapper.is_one_component_z()) + { + _data->code += dst->scalar(0, 0).str; + _data->code += " = 0;\n"; + } + else + { + _data->code += dst->scalar(0, 0).str; + _data->code += " = get_global_id(2) * "; + _data->code += step->scalar(0, 0).str; + _data->code += ";\n"; + } + break; + default: + break; + } + }; + + void op_get_global_batch(const Operand& o_dst, const TensorOperand& o_tensor) override + { + OperandUnpacker operands(_data->tiles, _data->arguments); + auto dst = operands.unpack(o_dst); + + TensorOperandUnpacker tensor_operands(_data->arguments); + auto tensor = tensor_operands.unpack(o_tensor); + auto gpu_sampler = o_tensor.sampler(); + + GpuTensor3dMapper mapper(tensor, gpu_sampler); + + if(mapper.is_one_component_batch()) + { + _data->code += dst->scalar(0, 0).str; + _data->code += " = 0;\n"; + } + else + { + std::cout << "Unsupported batched computation" << std::endl; + assert(false); + } + }; + + void op_get_global_size(const Operand& dst_var, int32_t dim) override + { + assert(dst_var.type() == OperandType::Tile); + assert(_data->tiles.has_tile(dst_var.value())); + assert(_data->tiles[dst_var.value()]->format().w == 1 && + _data->tiles[dst_var.value()]->format().h == 1); // It must be a scalar variable + + auto var = _data->tiles[dst_var.value()]; + + _data->code += var->scalar(0, 0).str; + _data->code += " = get_global_size("; + _data->code += std::to_string(dim); + _data->code += ");\n"; + } + + void op_binary_expression(const Operand& dst_name, const Operand& lhs_name, BinaryOp op, const Operand& rhs_name) override + { + OperandUnpacker operands(_data->tiles, _data->arguments); + auto lhs = operands.unpack(lhs_name); + auto rhs = operands.unpack(rhs_name); + auto dst = operands.unpack(dst_name); + + const int32_t dst_w = dst->format().w; + const int32_t dst_h = dst->format().h; + assert(lhs != nullptr); + const int32_t lhs_w = lhs->format().w; + const int32_t rhs_w = rhs->format().w; + + if(op == BinaryOp::MatMul_Nt_T) + { + assert((dst->format().dt == DataType::Fp32) || (dst->format().dt == DataType::Fp16)); + for(int32_t y = 0; y < dst_h; ++y) + { + for(int32_t x = 0; x < dst_w; ++x) + { + for(int32_t k = 0; k < lhs_w; ++k) + { + _data->code += dst->scalar(x, y).str; + _data->code += " = fma("; + _data->code += lhs->scalar(k, y).str; + _data->code += ", "; + _data->code += rhs->scalar(k, x).str; + _data->code += ", "; + _data->code += dst->scalar(x, y).str; + _data->code += ");\n"; + } + } + } + + return; + } + + bool broadcast_lhs_x = dst_w != 1 && lhs_w == 1; + bool broadcast_rhs_x = dst_w != 1 && rhs_w == 1; + + std::string lhs_prefix = broadcast_lhs_x? "(" + dst->underlying_source_variables()[0].type.str + ")" : ""; + std::string rhs_prefix = broadcast_rhs_x? 
"(" + dst->underlying_source_variables()[0].type.str + ")" : ""; + std::string op_str = to_string(op); + + // Broadcasting on Y is automatic + for(int32_t y = 0; y < dst_h; ++y) + { + _data->code += dst->vector(y).str; + _data->code += " = "; + _data->code += lhs_prefix + lhs->vector(y).str; + _data->code += " "; + _data->code += op_str; + _data->code += " "; + _data->code += rhs_prefix + rhs->vector(y).str; + _data->code += ";\n"; + } + }; + + void op_cast_expression(const Operand& o_dst, const Operand &o_src, ConvertPolicy policy) override + { + CKW_UNUSED(policy); + + OperandUnpacker operands(_data->tiles, _data->arguments); + auto src = operands.unpack(o_src); + auto dst = operands.unpack(o_dst); + + // const int32_t dst_w = dst->format().w; + const int32_t dst_h = dst->format().h; + const std::string dt = dst->scalar(0, 0).type.str; + + // Broadcasting on Y is automatic + for(int32_t y = 0; y < dst_h; ++y) + { + _data->code += dst->vector(y).str; + _data->code += " = convert_" + dt + "("; + _data->code += src->vector(y).str; + _data->code += ");\n"; + } + }; + + void op_assign(const Operand& dst_name, const Operand& src_name) override + { + OperandUnpacker operands(_data->tiles, _data->arguments); + auto src = operands.unpack(src_name); + auto dst = operands.unpack(dst_name); + + const int32_t dst_w = dst->format().w; + const int32_t dst_h = dst->format().h; + const int32_t src_w = src->format().w; + // const int32_t src_h = src->format().h; + const std::string dt = dst->scalar(0, 0).type.str; + + bool broadcast_src_x = dst_w != 1 && src_w == 1; + + std::string src_prefix = broadcast_src_x? "(" + dt + ")" : ""; + + // Broadcasting on Y is automatic + for(int32_t y = 0; y < dst_h; ++y) + { + _data->code += dst->vector(y).str; + _data->code += " = "; + _data->code += src_prefix + src->vector(y).str; + _data->code += ";\n"; + } + } + + void op_scalar_function(const Operand& dst_name, const Operand& src_name, ScalarUnaryFunction func) override + { + OperandUnpacker operands(_data->tiles, _data->arguments); + auto src = operands.unpack(src_name); + auto dst = operands.unpack(dst_name); + + const int32_t dst_w = dst->format().w; + const int32_t dst_h = dst->format().h; + const int32_t src_w = src->format().w; + // const int32_t src_h = src->format().h; + const std::string dt = dst->scalar(0, 0).type.str; + + bool broadcast_src_x = dst_w != 1 && src_w == 1; + + std::string src_prefix = broadcast_src_x? 
"(" + dt + ")" : ""; + + // Broadcasting on Y is automatic + for(int32_t y = 0; y < dst_h; ++y) + { + _data->code += dst->vector(y).str; + _data->code += " = "; + + switch(func) + { + case ScalarUnaryFunction::Exp: + _data->code += "exp("; + break; + + default: + CKW_ASSERT(false); + } + + _data->code += src_prefix + src->vector(y).str; + _data->code += ");\n"; + } + } + + void op_if(const Operand& o_lhs, BinaryOp op, const Operand& o_rhs) override + { + OperandUnpacker operands(_data->tiles, _data->arguments); + auto lhs = operands.unpack(o_lhs); + auto rhs = operands.unpack(o_rhs); + + assert(is_tile_scalar(lhs)); + assert(is_tile_scalar(rhs)); + + _data->code += "if("; + _data->code += lhs->scalar(0, 0).str; + _data->code += " "; + _data->code += to_string(op); + _data->code += " "; + _data->code += rhs->scalar(0, 0).str; + _data->code += ")\n"; + } + + void op_for_loop(const Operand& var_name, BinaryOp cond_op, const Operand& cond_value_name, AssignmentOp update_op, const Operand& update_value_name) override + { + OperandUnpacker operands(_data->tiles, _data->arguments); + auto var = operands.unpack(var_name); + auto cond_value = operands.unpack(cond_value_name); + auto update_value = operands.unpack(update_value_name); + + const int32_t dst_w = var->format().w; + const int32_t dst_h = var->format().h; + + // It must be a scalar variable + CKW_UNUSED(dst_w, dst_h); + assert(dst_w == 1); + assert(dst_h == 1); + + _data->code += "for(; " ; + _data->code += var->scalar(0, 0).str; + _data->code += " "; + _data->code += to_string(cond_op); + _data->code += " " + cond_value->scalar(0, 0).str + "; "; + _data->code += var->scalar(0, 0).str; + _data->code += " "; + _data->code += to_string(update_op); + _data->code += " " + update_value->scalar(0, 0).str + ")"; + _data->code += "\n"; + } + + void op_load_immediate(const TensorOperand& o_tensor, const Operand& o_dst, const Operand& o_x, const Operand& o_y, const Operand& o_z, const Operand& o_batch_idx, const Operand& dilation_y) override + { + OperandUnpacker operands(_data->tiles, _data->arguments); + auto dst = operands.unpack(o_dst); + auto x = operands.unpack(o_x); + auto y = operands.unpack(o_y); + auto z = operands.unpack(o_z); + auto dil_y = operands.unpack(dilation_y); + auto b = operands.unpack(o_batch_idx); + + TensorOperandUnpacker tensor_operands(_data->arguments); + auto tensor = tensor_operands.unpack(o_tensor); + auto gpu_sampler = o_tensor.sampler(); + + GpuTensor3dMapper mapper(tensor, gpu_sampler); + + auto load_writer = ClLoadStoreHelperWriterFactory::create(this, mapper, GpuLoadStoreType::Load); + + // Initialize the constant part + load_writer->initialize(dst, x, z, b); + + for(int i = 0; i < dst->format().h; ++i) + { + std::string coord_y = y->scalar(0, 0).str + " + " + std::to_string(i); + if(dil_y->scalar(0, 0).str != "1") + { + coord_y += " * " + dil_y->scalar(0, 0).str; + } + load_writer->write(std::make_pair(i, coord_y)); + } + + load_writer->finalize(); + } + + void op_load_indirect(const TensorOperand& o_tensor, const Operand& o_dst, const Operand& o_x, const Operand& o_indirect_h, const Operand& o_z, const Operand& o_batch_idx) override + { + OperandUnpacker operands(_data->tiles, _data->arguments); + auto dst = operands.unpack(o_dst); + auto x = operands.unpack(o_x); + auto y_ind = operands.unpack(o_indirect_h); + auto z = operands.unpack(o_z); + auto b = operands.unpack(o_batch_idx); + + TensorOperandUnpacker tensor_operands(_data->arguments); + auto tensor = tensor_operands.unpack(o_tensor); + auto gpu_sampler = 
o_tensor.sampler(); + + GpuTensor3dMapper mapper(tensor, gpu_sampler); + + auto load_writer = ClLoadStoreHelperWriterFactory::create(this, mapper, GpuLoadStoreType::Load); + + // Initialize the constant part + load_writer->initialize(dst, x, z, b); + + for(int i = 0; i < dst->format().h; ++i) + { + load_writer->write(std::make_pair(i, y_ind->scalar(0, i).str)); + } + + load_writer->finalize(); + } + + void op_store_immediate(const TensorOperand& tensor_name, const Operand& src_name, const Operand& x_name, const Operand& y_name, const Operand& z_name, const Operand& batch_index_name) override + { + OperandUnpacker operands(_data->tiles, _data->arguments); + auto src = operands.unpack(src_name); + auto x = operands.unpack(x_name); + auto y = operands.unpack(y_name); + auto z = operands.unpack(z_name); + auto b = operands.unpack(batch_index_name); + + TensorOperandUnpacker tensor_operands(_data->arguments); + auto tensor = tensor_operands.unpack(tensor_name); + auto gpu_sampler = tensor_name.sampler(); + + GpuTensor3dMapper mapper(tensor, gpu_sampler); + + auto store_writer = ClLoadStoreHelperWriterFactory::create(this, mapper, GpuLoadStoreType::Store); + + // Initialize the constant part + store_writer->initialize(src, x, z, b); + + int32_t tile_h = src->format().h; + + for(int m0 = tile_h - 1; m0 >= 0; m0--) + { + store_writer->write(std::make_pair(m0, y->scalar(0, 0).str + " + " + std::to_string(m0))); + } + + store_writer->finalize(); + } + + void op_return() override + { + _data->code += "return;\n"; + } + + void util_get_indirect_buffer(const Operand& o_dst, const TensorOperand& o_tensor, const Operand& o_x, const Operand& o_y, const Operand& o_x_off, const Operand& o_y_off) override + { + OperandUnpacker operands(_data->tiles, _data->arguments); + auto dst = operands.unpack(o_dst); + auto x = operands.unpack(o_x); + auto y = operands.unpack(o_y); + auto x_off = operands.unpack(o_x_off); + auto y_off = operands.unpack(o_y_off); + + TensorOperandUnpacker tensor_operands(_data->arguments); + auto tensor = tensor_operands.unpack(o_tensor); + + assert(dst->format().w == 1); + assert(x->format().w == 1); + assert(y->format().w == 1); + assert(x_off->format().w == 1); + assert(y_off->format().w == 1); + assert(dst->format().dt == DataType::Int32); + assert(x->format().dt == DataType::Int32); + assert(y->format().dt == DataType::Int32); + assert(x_off->format().dt == DataType::Int32); + assert(y_off->format().dt == DataType::Int32); + + const std::string width = tensor->component(TensorComponent::W); + const std::string height = tensor->component(TensorComponent::H); + const std::string wxh = tensor->component(TensorComponent::WxH); + /* + int x_s; + int y_s; + x_s = (xi_0 + x_k); + y_s = (yi_0 + y_k); + mi_0 = x_s + y_s * width + b * widthxheight; + mi_0 = select(-1, mi_0, x_s >= 0); + mi_0 = select(-1, mi_0, y_s >= 0); + mi_0 = select(-1, mi_0, x_s < 128); + mi_0 = select(-1, mi_0, y_s < 128); + */ + compound_statement_begin(); + declare_tile("_x_s", TileInfo(DataType::Int32)); + declare_tile("_y_s", TileInfo(DataType::Int32)); + auto x_s = operands.unpack(Operand("_x_s")); + auto y_s = operands.unpack(Operand("_y_s")); + for(int i = 0; i < dst->format().h; ++i) + { + // x_s = (xi_0 + x_k); + // y_s = (yi_0 + y_k); + _data->code += x_s->scalar(0, i).str; + _data->code += " = ("; + _data->code += x->scalar(0, i).str; + _data->code += " + "; + _data->code += x_off->scalar(0, i).str; + _data->code += ");\n"; + _data->code += y_s->scalar(0, i).str; + _data->code += " = ("; + _data->code += 
y->scalar(0, i).str; + _data->code += " + "; + _data->code += y_off->scalar(0, i).str; + _data->code += ");\n"; + // mi_0 = x_s + y_s * width; + _data->code += dst->scalar(0, i).str; + _data->code += " = "; + _data->code += x_s->scalar(0, i).str; + _data->code += " + "; + _data->code += y_s->scalar(0, i).str; + _data->code += " * " + width + ";\n"; + // mi_0 = select(wxh, mi_0, x_s >= 0); + _data->code += dst->scalar(0, i).str; + _data->code += " = select(-1, "; + _data->code += dst->scalar(0, i).str; + _data->code += ", "; + _data->code += x_s->scalar(0, i).str; + _data->code += " >= 0);\n"; + // mi_0 = select(wxh, mi_0, y_s >= 0); + _data->code += dst->scalar(0, i).str; + _data->code += " = select(-1, "; + _data->code += dst->scalar(0, i).str; + _data->code += ", "; + _data->code += y_s->scalar(0, i).str; + _data->code += " >= 0);\n"; + // mi_0 = select(wxh, mi_0, x_s < width); + _data->code += dst->scalar(0, i).str; + _data->code += " = select(-1, "; + _data->code += dst->scalar(0, i).str; + _data->code += ", "; + _data->code += x_s->scalar(0, i).str; + _data->code += " < "; + _data->code += width + ");\n"; + // mi_0 = select(wxh, mi_0, y_s < height); + _data->code += dst->scalar(0, i).str; + _data->code += " = select(-1, "; + _data->code += dst->scalar(0, i).str; + _data->code += ", "; + _data->code += y_s->scalar(0, i).str; + _data->code += " < "; + _data->code += height + ");\n"; + } + compound_statement_end(); + } + +private: + GpuKernelWriterDataHolder* _data { nullptr }; + GpuKernelWriterAttribute * _attr { nullptr }; +}; + +/** IGpuKernelWriter factory class */ +class GpuKernelWriterFactory final +{ +public: + /** Static method to call the IGpuKernelWriter class accordingly with the Gpu programming language + * + * @param[in] gpu GPU target + * + * @return IGpuKernelWriter + */ + static std::unique_ptr create(GpuKernelWriterAttribute *attr, GpuKernelWriterDataHolder *x) + { + switch(x->programming_language()) + { + case GpuTargetLanguage::OpenCL: + return std::make_unique(attr, x); + default: + std::cout << "Unsupported Gpu programming language" << std::endl; + assert(false); + return nullptr; + } + } +}; + +inline int32_t adjust_step(TensorSamplerFormat tensor_format, int32_t step, const TensorInfo *tensor_info_id, int32_t idx) +{ + auto tensor = tensor_info_id->shape; + + int32_t dim[3] = {0}; + + switch(tensor_format) + { + case TensorSamplerFormat::C_W_H: + dim[0] = tensor[0]; + dim[1] = tensor[1]; + dim[2] = tensor[2]; + break; + case TensorSamplerFormat::C_WH_1: + dim[0] = tensor[0]; + dim[1] = tensor[1] * tensor[2]; + dim[2] = 1; + break; + default: + std::cout << "Unsupported tensor format" << std::endl; + assert(false); + break; + } + + return std::min(step, dim[idx]); +} + +} // namespace prototype +} // namespace ckw + +#endif // CKW_PROTOTYPE_SRC_PROTOTYPE_H diff --git a/compute_kernel_writer/prototype/src/TensorInfo.cpp b/compute_kernel_writer/prototype/src/TensorInfo.cpp new file mode 100644 index 0000000000..561c126469 --- /dev/null +++ b/compute_kernel_writer/prototype/src/TensorInfo.cpp @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ckw/TensorInfo.h" + +namespace ckw +{ +TensorInfo::TensorInfo(DataType dt, const TensorShape &shape, TensorDataLayout dl, int32_t id) + : _shape(shape), _dt(dt), _dl(dl), _id(id) +{ +} + +TensorInfo &TensorInfo::shape(const TensorShape &shape) +{ + _shape = shape; + return *this; +} + +TensorShape TensorInfo::shape() const +{ + return _shape; +} + +TensorInfo &TensorInfo::data_type(DataType dt) +{ + _dt = dt; + return *this; +} + +DataType TensorInfo::data_type() const +{ + return _dt; +} + +TensorInfo &TensorInfo::data_layout(TensorDataLayout dl) +{ + _dl = dl; + return *this; +} + +TensorDataLayout TensorInfo::data_layout() const +{ + return _dl; +} + +TensorInfo &TensorInfo::id(int32_t id) +{ + _id = id; + return *this; +} + +int32_t TensorInfo::id() const +{ + return _id; +} +} // namespace ckw diff --git a/compute_kernel_writer/prototype/src/TensorOperand.cpp b/compute_kernel_writer/prototype/src/TensorOperand.cpp new file mode 100644 index 0000000000..00ecc3824e --- /dev/null +++ b/compute_kernel_writer/prototype/src/TensorOperand.cpp @@ -0,0 +1,247 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "ckw/TensorOperand.h" +#include "ckw/Error.h" +#include "ckw/Kernel.h" +#include "ckw/TileOperand.h" +#include "src/Prototype.h" + +namespace ckw +{ + +namespace +{ + +inline TensorComponentOperand &get_or_create_component(std::unique_ptr &ptr, const ::std::string &name, TensorComponent component) +{ + if(ptr == nullptr) + { + ptr = std::make_unique(name, component); + } + + return *ptr; +} + +} // namespace + +// ================================================================================================= +// TensorOperand +// ================================================================================================= + +TensorOperand::TensorOperand(const std::string &name, const TensorInfo &info) + : OperandBase(name), _info(info) +{ +} + +prototype::Operand TensorOperand::create_impl_operand(prototype::IGpuKernelWriter *writer) const +{ + CKW_UNUSED(writer); + return { name() }; +} + +const TensorInfo &TensorOperand::info() const +{ + return _info; +} + +TensorInfo &TensorOperand::info() +{ + return _info; +} + +DataType TensorOperand::data_type() const +{ + return _info.data_type(); +} + +bool TensorOperand::is_constant() const +{ + return false; +} + +const TileOperand &TensorOperand::tile() const +{ + return *_tile; +} + +TileOperand &TensorOperand::tile() +{ + return *_tile; +} + +TensorOperand &TensorOperand::tile(TileOperand &tile) +{ + _tile = &tile; + return *this; +} + +const TensorTileSampler &TensorOperand::tile_sampler() const +{ + return _tile_sampler; +} + +TensorTileSampler &TensorOperand::tile_sampler() +{ + return _tile_sampler; +} + +TensorOperand &TensorOperand::tile_sampler(const TensorTileSampler &value) +{ + _tile_sampler = value; + return *this; +} + +TileOperand &TensorOperand::stride1() +{ + return get_or_create_component(_stride1, name(), TensorComponent::Stride1); +} + +TileOperand &TensorOperand::stride2() +{ + return get_or_create_component(_stride2, name(), TensorComponent::Stride2); +} + +TileOperand &TensorOperand::stride3() +{ + return get_or_create_component(_stride3, name(), TensorComponent::Stride3); +} + +TileOperand &TensorOperand::stride4() +{ + return get_or_create_component(_stride4, name(), TensorComponent::Stride4); +} + +TileOperand &TensorOperand::dim0() +{ + return get_or_create_component(_dim0, name(), TensorComponent::Dim0); +} + +TileOperand &TensorOperand::dim1() +{ + return get_or_create_component(_dim1, name(), TensorComponent::Dim1); +} + +TileOperand &TensorOperand::dim2() +{ + return get_or_create_component(_dim2, name(), TensorComponent::Dim2); +} + +TileOperand &TensorOperand::dim3() +{ + return get_or_create_component(_dim3, name(), TensorComponent::Dim3); +} + +TileOperand &TensorOperand::dim4() +{ + return get_or_create_component(_dim4, name(), TensorComponent::Dim4); +} + +TileOperand &TensorOperand::dim1_dim2() +{ + return get_or_create_component(_dim1_dim2, name(), TensorComponent::Dim1xDim2); +} + +TileOperand &TensorOperand::dim1_dim2_dim3() +{ + return get_or_create_component(_dim1_dim2_dim3, name(), TensorComponent::Dim1xDim2xDim3); +} + +TileOperand &TensorOperand::offset_first_element_in_bytes() +{ + return get_or_create_component(_offset_first_element_in_bytes, name(), TensorComponent::OffsetFirstElement); +} + +// ================================================================================================= +// TensorComponentOperand +// ================================================================================================= + +TensorComponentOperand::TensorComponentOperand(const 
::std::string &name, TensorComponent component) + : TileOperand(name, DataType::Int32), _component(component) +{ +} + +prototype::Operand TensorComponentOperand::create_impl_operand(prototype::IGpuKernelWriter *writer) const +{ + CKW_UNUSED(writer); + prototype::OperandType type{ prototype::OperandType::Unknown }; + + switch(_component) + { + case TensorComponent::OffsetFirstElement: + type = prototype::OperandType::TensorDataOffset; + break; + + case TensorComponent::Stride1: + type = prototype::OperandType::TensorStride1; + break; + + case TensorComponent::Stride2: + type = prototype::OperandType::TensorStride2; + break; + + case TensorComponent::Stride3: + type = prototype::OperandType::TensorStride3; + break; + + case TensorComponent::Stride4: + type = prototype::OperandType::TensorStride4; + break; + + case TensorComponent::Dim0: + type = prototype::OperandType::TensorDim0; + break; + + case TensorComponent::Dim1: + type = prototype::OperandType::TensorDim1; + break; + + case TensorComponent::Dim2: + type = prototype::OperandType::TensorDim2; + break; + + case TensorComponent::Dim3: + type = prototype::OperandType::TensorDim3; + break; + + case TensorComponent::Dim4: + type = prototype::OperandType::TensorDim4; + break; + + case TensorComponent::Dim1xDim2: + type = prototype::OperandType::TensorDim1xDim2; + break; + + case TensorComponent::Dim1xDim2xDim3: + type = prototype::OperandType::TensorDim1xDim2xDim3; + break; + + default: + CKW_ASSERT(false); + } + + return prototype::Operand(name(), type); +} + +} // namespace ckw diff --git a/compute_kernel_writer/prototype/src/TensorTileSampler.cpp b/compute_kernel_writer/prototype/src/TensorTileSampler.cpp new file mode 100644 index 0000000000..143d550dec --- /dev/null +++ b/compute_kernel_writer/prototype/src/TensorTileSampler.cpp @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "ckw/TensorTileSampler.h" +#include "ckw/TileOperand.h" +#include "ckw/Types.h" + +namespace ckw +{ + +TensorTileSampler::TensorTileSampler() +{ +} + +TensorTileSampler::TensorTileSampler( + TileOperand &x, TileOperand &y, TileOperand &z, TileOperand &b, + TensorSamplerFormat format, + TensorSamplerAddressModeX address_mode_x, + TensorSamplerAddressModeY address_mode_y, + TensorSamplerAddressModeZ address_mode_z) + : _x(&x), _y(&y), _z(&z), _b(&b), _height(0), _width(0), _format(format), _address_mode_x(address_mode_x), _address_mode_y(address_mode_y), _address_mode_z(address_mode_z) +{ +} + +TensorTileSampler::TensorTileSampler( + TileOperand &x, TileOperand &y, TileOperand &z, TileOperand &b, + int32_t height, int32_t width, + TensorSamplerFormat format, + TensorSamplerAddressModeX address_mode_x, + TensorSamplerAddressModeY address_mode_y, + TensorSamplerAddressModeZ address_mode_z) + : _x(&x), _y(&y), _z(&z), _b(&b), _height(height), _width(width), _format(format), _address_mode_x(address_mode_x), _address_mode_y(address_mode_y), _address_mode_z(address_mode_z) +{ +} + +const TileOperand &TensorTileSampler::x() const +{ + return *_x; +} + +TensorTileSampler &TensorTileSampler::x(TileOperand &x) +{ + _x = &x; + return *this; +} + +const TileOperand &TensorTileSampler::y() const +{ + return *_y; +} + +TensorTileSampler &TensorTileSampler::y(TileOperand &y) +{ + _y = &y; + return *this; +} + +const TileOperand &TensorTileSampler::z() const +{ + return *_z; +} + +TensorTileSampler &TensorTileSampler::z(TileOperand &z) +{ + _z = &z; + return *this; +} + +const TileOperand &TensorTileSampler::b() const +{ + return *_b; +} + +TensorTileSampler &TensorTileSampler::b(TileOperand &b) +{ + _b = &b; + return *this; +} + +int32_t TensorTileSampler::width() const +{ + return _width; +} + +TensorTileSampler &TensorTileSampler::width(int32_t width) +{ + _width = width; + return *this; +} + +int32_t TensorTileSampler::height() const +{ + return _height; +} + +TensorTileSampler &TensorTileSampler::height(int32_t height) +{ + _height = height; + return *this; +} + +TensorSamplerFormat TensorTileSampler::format() const +{ + return _format; +} + +TensorTileSampler &TensorTileSampler::format(TensorSamplerFormat format) +{ + _format = format; + return *this; +} + +TensorSamplerAddressModeX TensorTileSampler::address_mode_x() const +{ + return _address_mode_x; +} + +TensorTileSampler &TensorTileSampler::address_mode_x(TensorSamplerAddressModeX address_mode_x) +{ + _address_mode_x = address_mode_x; + return *this; +} + +TensorSamplerAddressModeY TensorTileSampler::address_mode_y() const +{ + return _address_mode_y; +} + +TensorTileSampler &TensorTileSampler::address_mode_y(TensorSamplerAddressModeY address_mode_y) +{ + _address_mode_y = address_mode_y; + return *this; +} + +TensorSamplerAddressModeZ TensorTileSampler::address_mode_z() const +{ + return _address_mode_z; +} + +TensorTileSampler &TensorTileSampler::address_mode_z(TensorSamplerAddressModeZ address_mode_z) +{ + _address_mode_z = address_mode_z; + return *this; +} + +} // namespace ckw diff --git a/compute_kernel_writer/prototype/src/TileInfo.cpp b/compute_kernel_writer/prototype/src/TileInfo.cpp new file mode 100644 index 0000000000..7d8b2654ef --- /dev/null +++ b/compute_kernel_writer/prototype/src/TileInfo.cpp @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ckw/TileInfo.h" + +namespace ckw +{ +TileInfo::TileInfo(DataType dt) + : _dt(dt), _shape({{1, 1}}) +{ +} + +TileInfo::TileInfo(DataType dt, int32_t w) + : _dt(dt), _shape({{w, 1}}) +{ +} + +TileInfo::TileInfo(DataType dt, int32_t h, int32_t w) + : _dt(dt), _shape({{w, h}}) +{ +} + +TileInfo &TileInfo::width(int32_t w) +{ + _shape[kTileWidthIdx] = w; + return *this; +} + +int32_t TileInfo::width() const +{ + return _shape[kTileWidthIdx]; +} + +TileInfo &TileInfo::height(int32_t h) +{ + _shape[kTileHeightIdx] = h; + return *this; +} + +int32_t TileInfo::height() const +{ + return _shape[kTileHeightIdx]; +} + +TileInfo &TileInfo::data_type(DataType dt) +{ + _dt = dt; + return *this; +} + +DataType TileInfo::data_type() const +{ + return _dt; +} +} // namespace ckw diff --git a/compute_kernel_writer/prototype/src/TileOperand.cpp b/compute_kernel_writer/prototype/src/TileOperand.cpp new file mode 100644 index 0000000000..091947628d --- /dev/null +++ b/compute_kernel_writer/prototype/src/TileOperand.cpp @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "ckw/TileOperand.h" +#include "ckw/Error.h" +#include "src/Prototype.h" + +namespace ckw +{ + +TileOperand::TileOperand(const std::string &name, const TileInfo &info) + : OperandBase(name), _info(info), _value{ 0 }, _constant(false) +{ +} + +TileOperand::TileOperand(const std::string &name, DataType data_type) + : OperandBase(name), _info(TileInfo{ data_type }), _value(0), _constant(false) +{ +} + +TileOperand::TileOperand(const std::string &name, int32_t value) + : OperandBase(name), _info(TileInfo{ DataType::Int32 }), _value(value), _constant(true) +{ +} + +TileOperand::TileOperand(const std::string &name, float value) + : OperandBase(name), _info(TileInfo{ DataType::Fp32 }), _value(value), _constant(true) +{ +} + +prototype::Operand TileOperand::create_impl_operand(prototype::IGpuKernelWriter *writer) const +{ + CKW_UNUSED(writer); + + if(_constant) + { + switch(_info.data_type()) + { + case DataType::Int32: + return prototype::Operand(std::to_string(_value.get()), prototype::OperandType::ScalarInt32); + + case DataType::Fp32: + return prototype::Operand(std::to_string(_value.get()), prototype::OperandType::ScalarFp32); + + default: + CKW_ASSERT(false); + } + } + else + { + return prototype::Operand(name(), prototype::OperandType::Tile); + } +} + +const TileInfo &TileOperand::tile_info() const +{ + return _info; +} + +DataType TileOperand::data_type() const +{ + return _info.data_type(); +} + +bool TileOperand::is_constant() const +{ + return _constant; +} + +bool TileOperand::is_scalar() const +{ + return _info.width() == 1 && _info.height() == 1; +} + +ScalarValue TileOperand::scalar_value() const +{ + CKW_ASSERT(is_scalar()); + CKW_ASSERT(is_constant()); + + return _value; +} + +} // namespace ckw diff --git a/compute_kernel_writer/src/Kernel.cpp b/compute_kernel_writer/src/Kernel.cpp deleted file mode 100644 index bbf5c440a7..0000000000 --- a/compute_kernel_writer/src/Kernel.cpp +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (c) 2023 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "ckw/Kernel.h" -#include "ckw/Types.h" -#include "src/Prototype.h" - -namespace ckw -{ - -Kernel::Kernel(const char *name, GpuTargetLanguage language) - : _name(name), _kernel(std::make_unique(language)), _operands{} -{ -} - -Kernel::~Kernel() -{ -} - -const std::string &Kernel::name() const -{ - return _name; -} - -const std::map> &Kernel::operands() const -{ - return _operands; -} - -std::map> &Kernel::operands() -{ - return _operands; -} - -prototype::GpuKernelWriterDataHolder *Kernel::impl() -{ - return _kernel.get(); -} - -} // namespace ckw diff --git a/compute_kernel_writer/src/KernelWriter.cpp b/compute_kernel_writer/src/KernelWriter.cpp deleted file mode 100644 index 5d79985e87..0000000000 --- a/compute_kernel_writer/src/KernelWriter.cpp +++ /dev/null @@ -1,227 +0,0 @@ -/* - * Copyright (c) 2023 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "ckw/KernelWriter.h" -#include "ckw/Error.h" -#include "ckw/TensorOperand.h" -#include "src/Prototype.h" - -#include - -namespace ckw -{ - -namespace -{ - -inline prototype::TensorInfo create_impl_tensor_info(const TensorInfo &info) -{ - return prototype::TensorInfo{ info.shape(), info.data_type(), info.data_layout(), info.id() }; -} - -} // namespace - -// ================================================================================================= -// Constructors and destructor -// ================================================================================================= - -KernelWriter::KernelWriter(Kernel &kernel) - : _kernel(&kernel), - _impl_attr(std::make_unique()), - _impl(prototype::GpuKernelWriterFactory::create(_impl_attr.get(), kernel.impl())) -{ - _impl->set_IdSpace(1); -} - -KernelWriter::~KernelWriter() -{ -} - -// ================================================================================================= -// Scope management -// ================================================================================================= - -int32_t KernelWriter::id_space() const -{ - return _id_space; -} - -KernelWriter &KernelWriter::id_space(int32_t id_space) -{ - CKW_ASSERT(id_space <= _max_id_space); - - _id_space = id_space; - return *this; -} - -int32_t KernelWriter::next_id_space() -{ - id_space(++_max_id_space); - return _id_space; -} - -// ================================================================================================= -// Tensor and tile declaration -// ================================================================================================= - -TensorOperand &KernelWriter::create_tensor_argument(const char *name, const TensorInfo &info) -{ - const auto var_name = generate_variable_name(name); - - _impl->declare_argument(var_name, create_impl_tensor_info(info)); - - auto operand = new TensorOperand(var_name, info); - register_operand(operand, false); - - return *operand; -} - -TileOperand &KernelWriter::create_tile_argument(const char *name, int32_t value) -{ - const auto var_name = generate_variable_name(name); - - auto operand = new TileOperand(var_name, value); - register_operand(operand, false); - - return *operand; -} - -std::string KernelWriter::generate_variable_name(const char *name) const -{ - std::stringstream var_name; - - var_name << "_" << _id_space << "_" << name; - - return var_name.str(); -} - -void KernelWriter::register_operand(OperandBase *operand, bool declaring) -{ - const auto &name = operand->name(); - auto &operands = _kernel->operands(); - - CKW_ASSERT(operands.find(name) == operands.end()); - operands[name] = std::unique_ptr(operand); - - if(declaring && !operand->is_constant()) - { - const auto tile = reinterpret_cast(operand); - - const auto &info = tile->tile_info(); - _impl->declare_tile(tile->name(), prototype::TileInfo(info.data_type(), info.width(), info.height())); - } -} - -// ================================================================================================= -// Load and store -// ================================================================================================= - -void KernelWriter::op_load(TileOperand &tile, TensorOperand &tensor, const TensorTileSampler &sampler) -{ - prototype::TensorOperand impl_tensor( - tensor.name(), - prototype::GpuSampler{ - sampler.format(), - prototype::GpuSamplerTensorStorage::BufferUint8Ptr, - sampler.address_mode_x(), - sampler.address_mode_y(), - sampler.address_mode_z() }); - - auto impl_x = 
sampler.x().create_impl_operand(_impl.get()); - auto impl_y = sampler.y().create_impl_operand(_impl.get()); - auto impl_z = sampler.z().create_impl_operand(_impl.get()); - auto impl_b = sampler.b().create_impl_operand(_impl.get()); - - auto impl_dst = tile.create_impl_operand(_impl.get()); - - _impl->op_load_immediate(impl_tensor, impl_dst, impl_x, impl_y, impl_z, impl_b); -} - -void KernelWriter::op_store(TensorOperand &tensor, const TileOperand &tile, const TensorTileSampler &sampler) -{ - prototype::TensorOperand impl_tensor( - tensor.name(), - prototype::GpuSampler{ - sampler.format(), - prototype::GpuSamplerTensorStorage::BufferUint8Ptr, - sampler.address_mode_x(), - sampler.address_mode_y(), - sampler.address_mode_z() }); - auto impl_src = tile.create_impl_operand(_impl.get()); - auto impl_x = sampler.x().create_impl_operand(_impl.get()); - auto impl_y = sampler.y().create_impl_operand(_impl.get()); - auto impl_z = sampler.z().create_impl_operand(_impl.get()); - auto impl_b = sampler.b().create_impl_operand(_impl.get()); - - _impl->op_store_immediate(impl_tensor, impl_src, impl_x, impl_y, impl_z, impl_b); -} - -// ================================================================================================= -// Data processing -// ================================================================================================= - -void KernelWriter::op_assign(TileOperand &dst, const TileOperand &src) -{ - auto impl_dst = dst.create_impl_operand(_impl.get()); - auto impl_src = src.create_impl_operand(_impl.get()); - - _impl->op_assign(impl_dst, impl_src); -} - -void KernelWriter::op_binary_expression(TileOperand &dst, const TileOperand &lhs, const TileOperand &rhs, BinaryOp op) -{ - auto impl_lhs = lhs.create_impl_operand(_impl.get()); - auto impl_rhs = rhs.create_impl_operand(_impl.get()); - auto impl_dst = dst.create_impl_operand(_impl.get()); - - _impl->op_binary_expression(impl_dst, impl_lhs, op, impl_rhs); -} - -void KernelWriter::op_scalar_function(TileOperand &dst, const TileOperand &src, ScalarUnaryFunction opcode) -{ - auto impl_dst = dst.create_impl_operand(_impl.get()); - auto impl_src = src.create_impl_operand(_impl.get()); - - _impl->op_scalar_function(impl_dst, impl_src, opcode); -} - -// ================================================================================================= -// Misc -// ================================================================================================= - -void KernelWriter::op_get_global_id(TileOperand &dst, int32_t dim) -{ - _impl->op_get_global_id(prototype::Operand(dst.name()), dim); -} - -// ================================================================================================= -// Code generation -// ================================================================================================= - -std::string KernelWriter::generate_code() -{ - return prototype::generate_code(*_kernel->impl(), _kernel->name()); -} - -} // namespace ckw diff --git a/compute_kernel_writer/src/OperandBase.cpp b/compute_kernel_writer/src/OperandBase.cpp deleted file mode 100644 index 59cf846cc7..0000000000 --- a/compute_kernel_writer/src/OperandBase.cpp +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright (c) 2023 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "ckw/OperandBase.h" - -namespace ckw -{ - -OperandBase::OperandBase(const std::string &name) - : _name(name) -{ -} - -OperandBase::~OperandBase() -{ -} - -const std::string &OperandBase::name() const -{ - return _name; -} - -OperandBase &OperandBase::name(const std::string &name) -{ - _name = name; - return *this; -} - -} // namespace ckw diff --git a/compute_kernel_writer/src/Prototype.h b/compute_kernel_writer/src/Prototype.h deleted file mode 100644 index 45f1b3d464..0000000000 --- a/compute_kernel_writer/src/Prototype.h +++ /dev/null @@ -1,3767 +0,0 @@ -/* - * Copyright (c) 2023 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#ifndef CKW_SRC_PROTOTYPE_H -#define CKW_SRC_PROTOTYPE_H - -#include -#include -#include -#include // int32_t -#include // cout (to be removed) -#include // assert (to be removed) -#include -#include -#include -#include -#include -#include -#include - -#include "ckw/Types.h" -#include "ckw/TensorInfo.h" -#include "ckw/Error.h" - -namespace ckw -{ -namespace prototype { - -// Dummy data structure for Size2D -using Size2D = std::vector; - -// Dummy Status -using Status = void; - -enum class ComponentType : int32_t -{ - Complex = 0, - Simple = 1, - Unfusable = 2 -}; - -enum class GpuCompilationSpeed -{ - Fast = 0x00, // fast compilation may increase the latency of the network - Slow = 0x01 // slow compilation may decrease the latency of the network -}; - -enum class GpuExtensions -{ - Fp16, - Dot8, - Mmul, - FastMath -}; - -struct TensorInfo -{ - TensorShape shape { {0} }; - DataType data_type { DataType::Unknown }; - TensorDataLayout data_layout { TensorDataLayout::Nhwc }; - int32_t id { -1 }; -}; - -struct ComponentAttribute -{ - GpuCompilationSpeed compilation_speed {GpuCompilationSpeed::Fast}; - bool overwrite_tile { true }; -}; - -inline std::string data_type_to_cl_type(DataType dt) -{ - switch(dt) - { - case DataType::Fp32: - return "float"; - case DataType::Fp16: - return "half"; - case DataType::Int8: - return "char"; - case DataType::Uint8: - return "uchar"; - case DataType::Uint16: - return "ushort"; - case DataType::Int16: - return "short"; - case DataType::Uint32: - return "uint"; - case DataType::Int32: - return "int"; - case DataType::Bool: - return "bool"; - default: - assert(false); - return ""; - } -} - -inline int32_t width_to_cl_vector_size(int32_t width) -{ - switch(width) - { - case 1: - return 1; - case 2: - return 2; - case 3: - return 3; - case 4: - return 4; - case 5: - case 6: - case 7: - case 8: - return 8; - case 9: - case 10: - case 11: - case 12: - case 13: - case 14: - case 15: - case 16: - return 16; - default: - assert(false); - return 0; - } -} - -inline std::string get_cl_data_type(DataType dt, int32_t width) -{ - std::string data_type; - int32_t w = width_to_cl_vector_size(width); - data_type += data_type_to_cl_type(dt); - if(w != 1) - { - data_type += std::to_string(w); - } - return data_type; -} - -inline std::string to_opencl_store(int32_t vector_length) -{ - if(vector_length != 1) - { - return "vstore" + std::to_string(vector_length) + "("; - } - else - { - return "*("; - } -} - -struct TileInfo -{ - TileInfo() {} - TileInfo(DataType dt) : dt(dt), w(1), h(1) {} - TileInfo(DataType dt, int32_t width) : dt(dt), w(width), h(1) {} - TileInfo(DataType dt, int32_t width, int32_t height) : dt(dt), w(width), h(height) {} - DataType dt{ DataType::Unknown }; // Data type of the tile - int32_t w{ 0 }; // Width (i.e. c0 - portion of the channels) - int32_t h{ 0 }; // Height (i.e. s0 - portion of the spatial dimensions) -}; - -inline std::ostream& operator << (std::ostream& o, const TileInfo& a) -{ - o << a.w << " x " << a.h; - return o; -} - -struct DataTypeAsString -{ - std::string str { "" }; - DataType dt { DataType::Unknown }; - int32_t size { 1 }; -}; - -struct ValueAsString -{ - std::string str { "" }; - DataTypeAsString type { }; -}; - -// https://stackoverflow.com/questions/51515378/storing-and-accessing-tile-properties-in-c -// A Tile is a collection of variables used to express a 2D data. 
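A minimal usage sketch of the CL type helpers above, assuming the prototype header is reachable as "src/Prototype.h": data_type_to_cl_type() maps the DataType to an OpenCL base type and width_to_cl_vector_size() rounds the tile width up to the nearest legal vector size, so the expected strings are as commented.

#include <cassert>
#include <string>
#include "src/Prototype.h" // assumed include path for the header shown in this hunk

int main()
{
    using namespace ckw::prototype;

    // Scalars carry no width suffix; widths 5..8 round up to vector size 8.
    assert(get_cl_data_type(ckw::DataType::Fp16, 1) == "half");
    assert(get_cl_data_type(ckw::DataType::Fp32, 4) == "float4");
    assert(get_cl_data_type(ckw::DataType::Int32, 5) == "int8");
    return 0;
}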
-class IScalarTile -{ -public: - virtual ~IScalarTile() = default; - /** Method to get the scalar variable from a tile - * @param[in] x X coordinate on the width of the tile. If out-of-bound, the coordinate is clamped to the nearest valid edge - * @param[in] y Y coordinate on the height of the tile. If out-of-bound, the coordinate is clamped to the nearest valid edge - * - * @return the scalar variable as a string - */ - virtual ValueAsString scalar(int32_t x, int32_t y) const = 0; - /** Method to get the list of underlying variable names used by the tile - * - * @return the list of variable names - */ - virtual std::vector underlying_source_variables() const = 0; - /** Method to get the name of the tile. - * - * @return the name of the tile - */ - std::string name() const - { - return _basename; - } - /** Method to get the tile format - * - * @return the format - */ - TileInfo format() const - { - return _format; - } - /** Method to know whether the tile is assignable or not (constant) - * - * @return true if the tile is assignable - */ - virtual bool is_assignable() const = 0; - /** Method to know whether the tile needs to be declared - * - * @return true if the tile needs to be declared in the code before being used - */ - virtual bool need_declaration() const = 0; -protected: - TileInfo _format { }; // Tile format - std::string _basename { "" }; // Tile name -}; - -// A tile is a collection of variables used to express a 2D data. The variables are vectors in the GPU context. -// The vector size is given by the width of the tile. The number of vectors height by depth defines the number of vectors -class IVectorTile : public IScalarTile -{ -public: - virtual ~IVectorTile() = default; - /** Method to get the vector variable from a tile. A vector is an ordered homogeneous collection of two or more scalars. - * The user can query the list of supported width for the vectors through preferred_vector_sizes(). - * - * @param[in] y Y coordinate on the height of the tile. If out-of-bound, the coordinate is clamped to the nearest valid edge - * - * @return the vector variable as a string - */ - virtual ValueAsString vector(int32_t y) const = 0; - /** Method to get a vector variable from a tile. A vector is an ordered homogeneous collection of two or more scalars. - * - * @return the vector variable as a string - */ - virtual ValueAsString vector(int32_t x_start, int32_t width, int32_t y) const = 0; - /** Method to get the preferred vector sizes. 
- * - * @return a vector with the preferred vector sizes - */ - //virtual std::vector preferred_vector_sizes() const = 0; -}; - -class ClTile : public IVectorTile -{ -public: - ClTile(const std::string& name, TileInfo format) - { - _format = format; - _basename = name; - } - - ValueAsString scalar(int32_t x, int32_t y) const override - { - x = std::max(std::min(x, _format.w - 1), static_cast(0)); - y = std::max(std::min(y, _format.h - 1), static_cast(0)); - - ValueAsString t; - t.str = build_variable_name(y); - t.type.str = get_cl_data_type(_format.dt, 1); - t.type.dt = _format.dt; - t.type.size = 1; - - // Check required because if the width has only one element, we cannot use .s0 - if(_format.w != 1) - { - // Automatic broadcasting - t.str += ".s" + std::to_string(x); - } - - return t; - } - - ValueAsString vector(int32_t y) const override - { - y = std::max(std::min(y, _format.h - 1), static_cast(0)); - - ValueAsString t; - t.str = build_variable_name(y); - t.type.str = get_cl_data_type(_format.dt, _format.w); - t.type.dt = _format.dt; - t.type.size = _format.w; - return t; - } - - ValueAsString vector(int32_t x_start, int32_t width, int32_t y) const override - { - y = std::max(std::min(y, _format.h - 1), static_cast(0)); - - ValueAsString t; - t.str = build_variable_name(y); - t.type.str = get_cl_data_type(_format.dt, width); - t.type.dt = _format.dt; - t.type.size = width; - - if(_format.w != 1) - { - t.str += ".s"; - for(int i = 0; i < width; ++i) - { - t.str += to_scalar_hex(x_start + i); - } - } - return t; - } - - std::vector underlying_source_variables() const override - { - std::vector vars; - for(int32_t y = 0; y < _format.h; ++y) - { - ValueAsString t; - t.str = build_variable_name(y); - t.type.str = get_cl_data_type(_format.dt, _format.w); - t.type.dt = _format.dt; - t.type.size = _format.w; - vars.push_back(t); - } - return vars; - } - - bool is_assignable() const override - { - return true; - } - - bool need_declaration() const override - { - return true; - } - -private: - std::string build_variable_name(int32_t y) const - { - std::string var_name = _basename; - - if(_format.h == 1) - { - return var_name; - - } - else - { - var_name += "_"; - var_name += std::to_string(y); - } - - return var_name; - } - - std::string to_scalar_hex(int32_t x) const - { - switch(x) - { - case 0: - case 1: - case 2: - case 3: - case 4: - case 5: - case 6: - case 7: - case 8: - case 9: - return std::to_string(x); - case 10: - return "A"; - case 11: - return "B"; - case 12: - return "C"; - case 13: - return "D"; - case 14: - return "E"; - case 15: - return "F"; - default: - std::cout << "Unsupported hexadecimal value" << std::endl; - assert(false); - return ""; - } - } -}; - -// Unique features: It contains values in the form of string. The name used for this object is misleading since the variables can change the value over time. 
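The ClConstantTile that follows keeps its values as strings and renders them either as a single scalar or as an OpenCL compound literal; a minimal sketch of that behaviour, assuming the header is reachable as "src/Prototype.h":

#include <iostream>
#include "src/Prototype.h" // assumed include path for the header shown in this hunk

int main()
{
    using namespace ckw::prototype;

    // A 1x2 tile of Int32 constants, stored as strings.
    ClConstantTile constants({ { "0", "1" } }, ckw::DataType::Int32);

    std::cout << constants.scalar(0, 0).str << "\n"; // 0
    std::cout << constants.vector(0).str << "\n";    // ((int2)(0, 1))
    return 0;
}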
-class ClConstantTile : public IVectorTile -{ -public: - ClConstantTile(const std::vector> &in, DataType dt) - { - _format.w = in[0].size(); - _format.h = in.size(); - _format.dt = dt; - - _data = std::vector>(_format.h, std::vector(_format.w)); - - for(int32_t y = 0; y < _format.h; ++y) - { - for(int32_t x = 0; x < _format.w; ++x) - { - _data[y][x] = in[y][x]; - } - } - } - - ValueAsString scalar(int32_t x, int32_t y) const override - { - x = std::max(std::min(x, _format.w - 1), static_cast(0)); - y = std::max(std::min(y, _format.h - 1), static_cast(0)); - - ValueAsString t; - t.str = _data[y][x]; - t.type.str = get_cl_data_type(_format.dt, 1); - t.type.dt = _format.dt; - t.type.size = 1; - - return t; - } - - ValueAsString vector(int32_t y) const override - { - y = std::max(std::min(y, _format.h - 1), static_cast(0)); - - return vector(0, _format.w, y); - } - - ValueAsString vector(int32_t x_start, int32_t width, int32_t y) const override - { - y = std::max(std::min(y, _format.h - 1), static_cast(0)); - - ValueAsString t; - t.str = ""; - t.type.str = get_cl_data_type(_format.dt, width); - t.type.dt = _format.dt; - t.type.size = width; - - if(width > 1) - { - t.str += "((" + get_cl_data_type(_format.dt, width) + ")("; - } - - int32_t x = x_start; - for(; x < width - 1; ++x) - { - t.str += scalar(x, y).str; - t.str += ", "; - } - t.str += scalar(x, y).str; - - if(width > 1) - { - t.str += "))"; - } - - return t; - } - - std::vector underlying_source_variables() const override - { - std::vector vars; - - for(int32_t y = 0; y < _format.h; ++y) - { - for(int32_t x = 0; x < _format.w; ++x) - { - ValueAsString t; - t.str = _data[y][x]; - t.type.str = get_cl_data_type(_format.dt, 1); - t.type.dt = _format.dt; - t.type.size = 1; - vars.push_back(t); - } - } - - return vars; - } - - bool is_assignable() const override - { - return false; - } - - bool need_declaration() const override - { - return false; - } - -private: - std::vector> _data{}; -}; - -enum class TensorComponentIndex : int32_t -{ - IndexMask = 0x0000000f, -}; - -enum class TensorComponentType : int32_t -{ - OffsetFirstElement = 0x00000100, - Stride = 0x00001000, - Dimension = 0x00010000, - FoldedDimension = 0x00100000, - Constant = 0x01000000 -}; - -enum class TensorComponent : int32_t -{ - Unknown = 0x00000000, - OffsetFirstElement = 0x00000100, - Stride1 = 0x00001001, - Stride2 = 0x00001002, - Stride3 = 0x00001003, - Stride4 = 0x00001004, - Dim0 = 0x00010000, - Dim1 = 0x00010001, - Dim2 = 0x00010002, - Dim3 = 0x00010003, - Dim4 = 0x00010004, - C = 0x00010000, // Dim0 - W = 0x00010001, // Dim1 - H = 0x00010002, // Dim2 - D = 0x00010003, - N = 0x00010004, - Dim1xDim2 = 0x00100021, - Dim1xDim2xDim3 = 0x00100321, - WxH = 0x00100021, - WxHxD = 0x00100321 -}; - -inline std::string to_string(TensorComponent x) -{ - switch(x) - { - case TensorComponent::Unknown: - return "Unknown"; - case TensorComponent::OffsetFirstElement: - return "OffsetFirstElement"; - case TensorComponent::Stride1: - return "Stride1"; - case TensorComponent::Stride2: - return "Stride2"; - case TensorComponent::Stride3: - return "Stride3"; - case TensorComponent::Stride4: - return "Stride4"; - case TensorComponent::Dim0: - return "Dim0"; - case TensorComponent::Dim1: - return "Dim1"; - case TensorComponent::Dim2: - return "Dim2"; - case TensorComponent::Dim3: - return "Dim3"; - case TensorComponent::Dim4: - return "Dim4"; - case TensorComponent::Dim1xDim2: - return "Dim1xDim2"; - case TensorComponent::Dim1xDim2xDim3: - return "Dim1xDim2xDim3"; - default: - 
assert(false); - } -} - -class ITensorArgument -{ -public: - virtual ~ITensorArgument() = default; - /** Method to get the tensor component as a string - * - * @param[in] x tensor component to query - * - * @return the tensor component as a string - */ - virtual std::string component(TensorComponent x) = 0; - /** Method to get the tensor component type declaration as a string - * - * @return the tensor component type declaration as a string - */ - virtual std::string component_type_declaration() const = 0; - /** Method to get the tensor component data type - * - * @return the tensor component data type - */ - virtual DataType component_data_type() const = 0; - /** Method to get the tensor component declarations - * - * @return a vector containing the tensor component declarations - */ - virtual std::vector component_declarations() const = 0; - /** Method to get the name of the tensor argument. - * - * @return the name of the tensor argument - */ - std::string name() const - { - return _basename; - } - /** Method to get the tensor format - * - * @return the format - */ - TensorInfo format() const - { - return _format; - } - -protected: - TensorInfo _format { }; - std::string _basename {}; -}; - -enum class GpuTensorStorage : int32_t -{ - Unknown = 0x0000, - BufferUint8Ptr = 0x0012, - Image2dReadOnly = 0x0020, - Image2dWriteOnly = 0x0021, - Image3dReadOnly = 0x0030, - Image3dWriteOnly = 0x0031 -}; - -class IGpuTensorArgument : public ITensorArgument -{ -public: - virtual ~IGpuTensorArgument() = default; - /** Method to get the tensor storage, which is the underlying storage used to keep the data memory - * - * @param[in] x tensor storage to query - * - * @return the tensor storage as a string - */ - virtual std::string storage(GpuTensorStorage x) = 0; - /** Method to get the tensor storage type declaration as a string - * - * @param[in] x tensor component to query - * - * @return the tensor storage type declaration as a string - */ - virtual std::string storage_type_declaration(GpuTensorStorage x) const = 0; - /** Method to get the tensor storage declarations - * - * @return a vector containing the tensor storage declarations - */ - virtual std::vector storage_declarations() const = 0; -}; - -class ClTensorArgument : public IGpuTensorArgument -{ -public: - ClTensorArgument(const std::string& name, const TensorInfo& x, bool return_by_value_when_possible) - { - _basename = name; - _format = x; - _return_by_value_when_possible = return_by_value_when_possible; - } - - // Methods to override - std::string component(TensorComponent x) override - { - if((static_cast(x) & static_cast(TensorComponentType::Constant))) - { - int32_t idx = static_cast(x) & static_cast(TensorComponentIndex::IndexMask); - return std::to_string(idx - 1); - } - - if(_return_by_value_when_possible) - { - if((static_cast(x) & static_cast(TensorComponentType::Dimension))) - { - int32_t idx = static_cast(x) & static_cast(TensorComponentIndex::IndexMask); - return std::to_string(_format.shape[idx]); - } - - if((static_cast(x) & static_cast(TensorComponentType::FoldedDimension))) - { - switch(x) - { - case TensorComponent::Dim1xDim2: - return std::to_string(_format.shape[1] * _format.shape[2]); - case TensorComponent::Dim1xDim2xDim3: - return std::to_string(_format.shape[1] * _format.shape[2] * _format.shape[2]); - default: - std::cout << "Unsupported folded dimension" << std::endl; - assert(false); - } - } - } - - if(std::find(_components_required.begin(), _components_required.end(), x) == _components_required.end()) - { - 
_components_required.push_back(x); - } - - return build_component_name(x); - } - - std::string component_type_declaration() const override - { - return "int"; - }; - - DataType component_data_type() const override - { - return DataType::Int32; - } - - std::string storage(GpuTensorStorage x) override - { - if(std::find(_storage_required.begin(), _storage_required.end(), x) == _storage_required.end()) - { - _storage_required.push_back(x); - } - - return build_storage_name(x); - } - - std::string storage_type_declaration(GpuTensorStorage x) const override - { - switch(x) - { - case GpuTensorStorage::BufferUint8Ptr: - return "__global uchar*"; - case GpuTensorStorage::Image2dReadOnly: - return "__read_only image2d_t"; - case GpuTensorStorage::Image2dWriteOnly: - return "__write_only image2d_t"; - case GpuTensorStorage::Image3dReadOnly: - return "__read_only image3d_t "; - case GpuTensorStorage::Image3dWriteOnly: - return "__write_only image3d_t "; - default: - std::cout << "Unsupported storage" << std::endl; - assert(false); - return ""; - } - }; - - std::vector storage_declarations() const override - { - return _storage_required; - } - - std::vector component_declarations() const override - { - return _components_required; - } - -private: - std::string build_storage_name(GpuTensorStorage x) const - { - std::string var_name = _basename; - - switch(x) - { - case GpuTensorStorage::BufferUint8Ptr: - return var_name + "_ptr"; - case GpuTensorStorage::Image2dReadOnly: - case GpuTensorStorage::Image2dWriteOnly: - return var_name + "_img2d"; - case GpuTensorStorage::Image3dReadOnly: - case GpuTensorStorage::Image3dWriteOnly: - return var_name + "_img3d"; - default: - std::cout << "Unsupported storage" << std::endl; - assert(false); - } - - return var_name; - } - - std::string build_component_name(TensorComponent x) const - { - std::string var_name = _basename; - - switch(x) - { - case TensorComponent::OffsetFirstElement: - return var_name + "_offset_first_element"; - case TensorComponent::Stride1: - return var_name + "_stride1"; - case TensorComponent::Stride2: - return var_name + "_stride2"; - case TensorComponent::Stride3: - return var_name + "_stride3"; - case TensorComponent::Dim0: - return var_name + "_dim0"; - case TensorComponent::Dim1: - return var_name + "_dim1"; - case TensorComponent::Dim2: - return var_name + "_dim2"; - case TensorComponent::Dim3: - return var_name + "_dim3"; - case TensorComponent::Dim1xDim2: - return var_name + "_dim1xdim2"; - case TensorComponent::Dim1xDim2xDim3: - return var_name + "_dim1xdim2xdim3"; - default: - std::cout << "Unsupported component" << std::endl; - assert(false); - } - - return var_name; - } - - bool _return_by_value_when_possible { false }; - std::vector _storage_required {}; - std::vector _components_required {}; -}; - -/** - * @brief Data structure that contains the declared tiles by the components. - * The registry is a linear data structure that follows the similar principle of the stack. The user can use the @p increment_registry_level() method to - * increase the level of the stack (0 when it starts). When the user uses the @p decrement_registry_level() method, the registry decreases the level of the stack - * and remove (pop) all the tiles from the level above. - * When a tile is declared on the level 0, it is a global tile. A global tile is visible in all parts of the code. 
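For reference, a minimal sketch of the TensorComponent bit-encoding used by ClTensorArgument::component() above: the low nibble carries the index and the upper bits carry the component type, so Stride1 is the Stride flag plus index 1 and Dim2 is the Dimension flag plus index 2; when a component cannot be folded into a literal it becomes an int kernel argument named after the tensor. Enumerator values are copied from the enums above; the helper itself is hypothetical.

    #include <cstdint>
    #include <iostream>
    #include <string>

    // Values copied from TensorComponentIndex / TensorComponentType / TensorComponent above.
    constexpr int32_t kIndexMask     = 0x0000000f;
    constexpr int32_t kStrideFlag    = 0x00001000;
    constexpr int32_t kDimensionFlag = 0x00010000;
    constexpr int32_t kStride1       = 0x00001001; // Stride    | index 1
    constexpr int32_t kDim2          = 0x00010002; // Dimension | index 2

    // Hypothetical helper mirroring ClTensorArgument::build_component_name().
    std::string component_argument_name(const std::string &tensor, int32_t component)
    {
        const int32_t idx = component & kIndexMask;
        if((component & kStrideFlag) != 0)
        {
            return tensor + "_stride" + std::to_string(idx);
        }
        if((component & kDimensionFlag) != 0)
        {
            return tensor + "_dim" + std::to_string(idx);
        }
        return tensor + "_offset_first_element";
    }

    int main()
    {
        std::cout << component_argument_name("src", kStride1) << "\n"; // src_stride1
        std::cout << component_argument_name("src", kDim2) << "\n";    // src_dim2
    }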
- * Since different components may use the same name to define a tile, the registry adopts the IdSpace concept, an @p id to prevent name collisions - * when declaring tiles among different components. - * - */ -class GpuTileRegistry -{ -public: -enum class RegistryTileType -{ - Tile, - Link -}; - -using RegistryIdSpace = int32_t; -using RegistryLevel = int32_t; -using RegistryTileName = std::string; - -struct RegistryTileTableEntry -{ - RegistryLevel registry_level { 0 }; - std::unique_ptr tile_object { nullptr }; -}; - -struct RegistryTileTypeTableEntry -{ - RegistryTileType tile_type { RegistryTileType::Tile }; - RegistryTileName tile_name {}; - RegistryIdSpace registry_idspace { 0 }; - RegistryLevel registry_level { 0 }; -}; - -using RegistryTileTable = std::map>; -using RegistryTileTypeTable = std::map>; - /** - * @brief Construct a new Gpu Tile Registry object - * - */ - GpuTileRegistry() - { - _language = GpuTargetLanguage::Unknown; - } - /** - * @brief Construct a new Gpu Tile Registry object providing the Gpu programming language - * - * @param[in] language Gpu programming language to use - */ - GpuTileRegistry(GpuTargetLanguage language) - { - _language = language; - } - /** - * @brief Default destructor. Destroy the Gpu Tile Registry object - * - */ - ~GpuTileRegistry() = default; - /** - * @brief Set the working IdSpace for the tile registry. IdSpace is used to prevent name collisions when declaring tiles. - * Therefore, the IdSpace should be set before declaring any tiles. - * - * @param[in] id The IdSpace id - */ - void set_IdSpace(int32_t id) - { - _IdSpace = id; - } - /** - * @brief Get the current working IdSpace for the tile registry. IdSpace is used to prevent name collisions when declaring tiles - * - * @return The IdSpace id - */ - int32_t IdSpace() const - { - return _IdSpace; - } - /** - * @brief Gets all the IdSpace declarations defined in the tile registry. - * - * @return all the IdSpace declarations defined in the tile registry as std::vector. It returns an empty vector if there are no IdSpace declarations. - */ - std::vector IdSpace_declarations() const - { - std::vector x; - - auto it = _frags.begin(); - - while (it != _frags.end()) - { - x.push_back(it->first); - - it++; - } - - return x; - } - /** - * @brief Declare a tile from a previously created tile - */ - void insert(const std::string& name, const IVectorTile *frag) - { - assert(_language == GpuTargetLanguage::OpenCL); - const int32_t key_IdSpace = _IdSpace; - const std::string key_var_name = name; - const std::string var_name = frag->name(); - TileInfo format = frag->format(); - - // First check whether a tile with the same name exists - IVectorTile *result = (*this)[key_var_name]; - assert(result == nullptr); - if(result == nullptr) - { - std::unique_ptr tile = std::make_unique(var_name, format); - - _frags[key_IdSpace][key_var_name].tile_object = std::move(tile); - _frags[key_IdSpace][key_var_name].registry_level = _registry_level; - - _frag_types[key_IdSpace][key_var_name].tile_type = RegistryTileType::Link; - _frag_types[key_IdSpace][key_var_name].tile_name = key_var_name; - _frag_types[key_IdSpace][key_var_name].registry_idspace = _IdSpace; - _frag_types[key_IdSpace][key_var_name].registry_level = _registry_level; - } - } - /** - * @brief Declare a tile with TileInfo. The tile will be stored in the IdSpace set with @p set_IdSpace() - * - * @note The reference name used for declaring the tile should not be previously used in the IdSpace - * - * @param[in] name Reference name for the tile. 
The reference name can be used to retrieve the tile stored in the registry. - * @param[in] format Tile format use to use - */ - void insert(const std::string& name, const TileInfo& format) - { - assert(_language == GpuTargetLanguage::OpenCL); - const int32_t key_IdSpace = _IdSpace; - const std::string key_var_name = name; - const std::string var_name = generate_tile_name(name); - - // First check whether a tile with the same name exists - IVectorTile *result = (*this)[key_var_name]; - assert(result == nullptr); - if(result == nullptr) - { - std::unique_ptr tile = std::make_unique(var_name, format); - _frags[key_IdSpace][key_var_name].tile_object = std::move(tile); - _frags[key_IdSpace][key_var_name].registry_level = _registry_level; - - _frag_types[key_IdSpace][key_var_name].tile_type = RegistryTileType::Tile; - _frag_types[key_IdSpace][key_var_name].tile_name = key_var_name; - _frag_types[key_IdSpace][key_var_name].registry_idspace = _IdSpace; - _frag_types[key_IdSpace][key_var_name].registry_level = _registry_level; - } - } - /** - * @brief Declare a constant tile. The content of the tile is passed as a vector of std::string - * - * @note The reference name used for declaring the tile should not be previously used in the IdSpace - * - * @param[in] name Reference name for the tile. The reference name can be used to retrieve the tile stored in the registry. - * @param[in] in A 3D std::vector of std::string. From the 3D std::vector we can know the dimensions for the tile - * @param[in] dt The data type for the elements stored in the 3D std::vector as std::string. It is user's responsibilty to ensure - * that the data type is aligned with the content of the std::string. - */ - void insert(const std::string& name, const std::vector>& in, DataType dt) - { - assert(_language == GpuTargetLanguage::OpenCL); - const int32_t key_IdSpace = _IdSpace; - const std::string key_var_name = name; - - // First check whether a tile with the same name exists - IVectorTile *result = (*this)[key_var_name]; - assert(result == nullptr); - if(result == nullptr) - { - std::unique_ptr tile = std::make_unique(in, dt); - _frags[key_IdSpace][key_var_name].tile_object = std::move(tile); - _frags[key_IdSpace][key_var_name].registry_level = _registry_level; - - _frag_types[key_IdSpace][key_var_name].tile_type = RegistryTileType::Tile; - _frag_types[key_IdSpace][key_var_name].tile_name = key_var_name; - _frag_types[key_IdSpace][key_var_name].registry_idspace = _IdSpace; - _frag_types[key_IdSpace][key_var_name].registry_level = _registry_level; - } - } - /** - * @brief Declare an anonymous constant tile. The content of the tile is passed as a vector of std::string - * - * @note This method can be used to declare temporary tiles that need to be accessed only once. - * - * @param[in] in A 3D std::vector of std::string. From the 3D std::vector we can know the dimensions for the tile - * @param[in] dt The data type for the elements stored in the 3D std::vector as std::string. It is user responsibilty to ensure - * that the data type is aligned with what passed with the std::string. 
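As an illustration of what these constant-tile overloads declare, a ClConstantTile keeps its values as strings and renders a row as an OpenCL vector literal rather than a named variable, which is why it needs no declaration and is not assignable. A hypothetical sketch of that rendering, assuming the OpenCL type name is formed as <type><width>:

    #include <cstdint>
    #include <iostream>
    #include <string>
    #include <vector>

    // Hypothetical helper mirroring ClConstantTile::vector(): a row of string values becomes
    // either the single literal itself or a vector literal such as ((float4)(a, b, c, d)).
    std::string constant_row_literal(const std::vector<std::string> &row, const std::string &cl_type)
    {
        if(row.size() == 1)
        {
            return row[0];
        }

        std::string out = "((" + cl_type + std::to_string(row.size()) + ")(";
        for(size_t i = 0; i < row.size(); ++i)
        {
            out += row[i];
            if(i + 1 < row.size())
            {
                out += ", ";
            }
        }
        out += "))";
        return out;
    }

    int main()
    {
        // Prints ((float4)(1.0f, 2.0f, 3.0f, 4.0f))
        std::cout << constant_row_literal({"1.0f", "2.0f", "3.0f", "4.0f"}, "float") << "\n";
    }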
- * - * @return IVectorTile* the anonymous constant tile - */ - IVectorTile* insert(const std::vector>& in, DataType dt) - { - assert(_language == GpuTargetLanguage::OpenCL); - const int32_t key_IdSpace = _IdSpace; - const std::string key_var_name = "_" + std::to_string(_anonymous_frag_count++); - - // First check whether a tile with the same name exists - IVectorTile *result = (*this)[key_var_name]; - assert(result == nullptr); - if(result == nullptr) - { - std::unique_ptr tile = std::make_unique(in, dt); - _frags[key_IdSpace][key_var_name].tile_object = std::move(tile); - _frags[key_IdSpace][key_var_name].registry_level = _registry_level; - - _frag_types[key_IdSpace][key_var_name].tile_type = RegistryTileType::Tile; - _frag_types[key_IdSpace][key_var_name].tile_name = key_var_name; - _frag_types[key_IdSpace][key_var_name].registry_idspace = _IdSpace; - _frag_types[key_IdSpace][key_var_name].registry_level = _registry_level; - } - - return (*this)[key_var_name]; - } - /** - * @brief Get the tile from the registry. This method searches the tile in the IdSpace provided by the user - * - * @param[in] name The name of the tile to retrieve - * @param[in] IdSpace The IdSpace id where to search the tile - * - * @return IVectorTile* The tile - */ - IVectorTile* get(const std::string& name, int32_t IdSpace) - { - const int32_t key_IdSpace = IdSpace; - const std::string key_var_name = name; - - IVectorTile* result = nullptr; - auto search_IdSpace = _frags.find(key_IdSpace); - if(search_IdSpace != _frags.end()) - { - auto search_tile = _frags[key_IdSpace].find(key_var_name); - if(search_tile != _frags[key_IdSpace].end()) - { - result = search_tile->second.tile_object.get(); - assert(result != nullptr); - } - } - - return result; - } - /** - * @brief Get the tile from the registry. This method searches the tile in the IdSpace set with @p set_IdSpace() - * - * @param[in] name The name of the tile to retrieve - * - * @return IVectorTile* The tile - */ - IVectorTile* operator[](const std::string& name) - { - return get(name, _IdSpace); - } - /** - * @brief Check whether the tile in the in the IdSpace provided by the user exists - * - * @param[in] name Name of the tile to search for - * @param[in] IdSpace The IdSpace id where to search the tile - * - * @return true if the tile exists - * @return false if the tile does not exist - */ - bool has_tile(const std::string& name, int32_t IdSpace) const - { - const int32_t key_IdSpace = IdSpace; - const std::string key_var_name = name; - - // IVectorTile* result = nullptr; - auto search_IdSpace = _frags.find(key_IdSpace); - - return search_IdSpace != _frags.end(); - } - /** - * @brief Check whether the tile within the current IdSpace exists - * - * @param[in] name Name of the tile to search for - * - * @return true if the tile exists - * @return false if the tile does not exist - */ - bool has_tile(const std::string& name) const - { - return has_tile(name, _IdSpace); - } - /** - * @brief Get all the tiles declared within the IdSpace provided by the user - * - * @param[in] IdSpace IdSpace where to retrieve all the declared tiles - * - * @return std::vector A vector with all the declared tiles in the IdSpace provided by the user - */ - std::vector tile_declarations(int32_t IdSpace) - { - std::vector tiles; - - std::map::iterator it = _frag_types[IdSpace].begin(); - - while (it != _frag_types[IdSpace].end()) - { - // The following line should be enabled. However, we cannot at this stage - // because it used to retrieve the output tile produced by each component. 
- // However, this method should NOT be used to retrieve the output tile - //if(it->second.tile_type == RegistryTileType::Tile) - { - tiles.push_back(get(it->second.tile_name, it->second.registry_idspace)); - } - it++; - } - - return tiles; - } - /** - * @brief Increase the level of stack. - * - */ - void increment_registry_level() - { - _registry_level++; - } - /** - * @brief Remove all the tiles declared at the current stack level and decrease the level of the stack. - * - */ - void decrement_registry_level() - { - assert(_registry_level >= 0); - - // Remove all variables in the local scope - std::map::iterator it = _frags[_IdSpace].begin(); - - while (it != _frags[_IdSpace].end()) - { - if (it->second.registry_level == _registry_level) - { - it = _frags[_IdSpace].erase(it); - } - else - { - it++; - } - } - - std::map::iterator it_type = _frag_types[_IdSpace].begin(); - - while (it_type != _frag_types[_IdSpace].end()) - { - if (it_type->second.registry_level == _registry_level) - { - it_type = _frag_types[_IdSpace].erase(it_type); - } - else - { - it_type++; - } - } - - _registry_level--; - } - /** - * @brief Get the level of the stack - * - */ - int32_t level() const - { - return _registry_level; - } - -private: - // This method ensures that the key is unique among different components - std::string generate_tile_name(const std::string& name) - { - assert(_IdSpace >= 0 ); - if(_registry_level == 0) - { - return "_G" + std::to_string(_IdSpace) + "_" + name; - } - else - { - return name; - } - } - RegistryTileTable _frags {}; - RegistryTileTypeTable _frag_types {}; - RegistryLevel _registry_level { 0 }; - RegistryIdSpace _IdSpace { -1 }; - int32_t _anonymous_frag_count { 0 }; // Counter used to create the anonymous tiles - GpuTargetLanguage _language { GpuTargetLanguage::Unknown }; // Gpu programming language -}; - -using TensorEntry = std::unique_ptr; - -/** - * @brief Data structure that contains the tensors consumed by the components. - * Since different components may use the same name as reference for a tensor, the registry adopts the IdSpace concept, an @p id to prevent name collisions - * when declaring tensors among different components. - * - */ -class GpuTensorArgumentRegistry -{ -public: - /** - * @brief Construct a new Gpu Tensor Registry object - * - */ - GpuTensorArgumentRegistry() - { - _language = GpuTargetLanguage::Unknown; - } - /** - * @brief Construct a new Gpu Tensor Registry object - * - * @param[in] language Gpu programming language to use - */ - GpuTensorArgumentRegistry(GpuTargetLanguage language) - { - _language = language; - } - /** - * @brief Default destructor. Destroy the Gpu Tensor Registry object - * - */ - ~GpuTensorArgumentRegistry() = default; - /** - * @brief Set the working IdSpace for the tensor registry. IdSpace is used to prevent name collisions when declaring tensors. - * Therefore, the IdSpace should be set before declaring any tensors. - * - * @param[in] id The IdSpace id - */ - void set_IdSpace(int32_t id) - { - _IdSpace = id; - } - /** - * @brief Get the current working IdSpace for the tensor registry. IdSpace is used to prevent name collisions when declaring tensors - * - * @return The IdSpace id - */ - int32_t IdSpace() const - { - return _IdSpace; - } - /** - * @brief Gets all the IdSpace declarations defined in the tensor registry. - * - * @return all the IdSpace declarations defined in the tensor registry as std::vector. It returns an empty vector if there are no IdSpace declarations. 
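A short usage sketch of the registry above (it assumes the declarations in this file are in scope and that TileInfo is the aggregate with public dt/w/h members used throughout Prototype.h): tiles declared at registry level 0 receive the global "_G<IdSpace>_" prefix from generate_tile_name(), while tiles declared after increment_registry_level() are temporaries that decrement_registry_level() removes again.

    #include <cassert>

    void tile_registry_usage_sketch()
    {
        GpuTileRegistry tiles(GpuTargetLanguage::OpenCL);
        tiles.set_IdSpace(3); // must be set before declaring tiles: entries are keyed per IdSpace

        TileInfo acc_info{};  // assumed aggregate; fields as used by ClTile above
        acc_info.dt = DataType::Fp32;
        acc_info.w  = 4;
        acc_info.h  = 2;

        // Declared at level 0, so the underlying variables are named _G3_acc_0 and _G3_acc_1.
        tiles.insert("acc", acc_info);
        assert(tiles.has_tile("acc"));

        tiles.increment_registry_level();                              // open a scope for temporaries
        IVectorTile *half = tiles.insert({{"0.5f"}}, DataType::Fp32);  // anonymous constant tile
        (void)half;
        tiles.decrement_registry_level();                              // pops every tile declared at this level
    }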
- */ - std::vector IdSpace_declarations() const - { - std::vector x; - - auto it = _refs.begin(); - - while (it != _refs.end()) - { - x.push_back(it->first); - - it++; - } - - return x; - } - /** - * @brief Declare a tensor with TensorInfo. The tensor will be stored in the IdSpace set with @p set_IdSpace() - * - * @note The reference name used for declaring the tensor should not be previously used in the IdSpace - * - * @param[in] name Reference name for the tensor. The reference name can be used to retrieve the tensor stored in the registry. - * @param[in] x Pair of tensor info and tensor id - * @param[in] return_by_value_when_possible True if we want the value stored in the tensor components - */ - void insert(const std::string& name, const TensorInfo& x, bool return_by_value_when_possible) - { - assert(_language == GpuTargetLanguage::OpenCL); - const int32_t key_IdSpace = _IdSpace; - const int32_t tensor_id = x.id; - const std::string key_var_name = name; - const std::string var_name = generate_tensor_name(name, tensor_id); - - // First, check whether the tensor has already a reference. If so, trigger an assert - assert(!has_tensor_argument(name)); - - // Check whether a tensor with that tensorID exists - auto result = _tensor_arguments.find(tensor_id); - if(result == _tensor_arguments.end()) - { - // It means that we haven't added a tensor with that tensor_id yet. Create a IGpuTensorArgument before creating the reference - std::unique_ptr arg = std::make_unique(var_name, x, return_by_value_when_possible); - _tensor_arguments[tensor_id] = std::move(arg); - } - - _refs[key_IdSpace][key_var_name] = tensor_id; - } - /** - * @brief Get the tensor from the registry. This method searches the tensor in the IdSpace set with @p set_IdSpace() - * - * @param[in] name The name of the tensor to retrieve - * - * @return IGpuTensor* The tensor - */ - IGpuTensorArgument* operator[](const std::string& name) - { - const int32_t key_IdSpace = _IdSpace; - const std::string key_var_name = name; - - IGpuTensorArgument* result = nullptr; - auto search_IdSpace = _refs.find(key_IdSpace); - if(search_IdSpace != _refs.end()) - { - auto search_tensor_id = _refs[key_IdSpace].find(key_var_name); - - if(search_tensor_id != _refs[key_IdSpace].end()) - { - const int32_t tensor_id = search_tensor_id->second; - auto search_tensor_argument = _tensor_arguments.find(tensor_id); - if(search_tensor_argument != _tensor_arguments.end()) - { - result = search_tensor_argument->second.get(); - } - assert(result != nullptr); - } - } - - return result; - } - /** - * @brief Get all the tensors declared in the IdSpace provided by the user - * - * @return std::vector A vector with all the declared tensors - */ - std::vector tensor_argument_declarations() - { - std::vector args; - - auto it = _tensor_arguments.begin(); - - while (it != _tensor_arguments.end()) - { - args.push_back(it->second.get()); - it++; - } - - return args; - } - /** - * @brief Check whether the tensor argument in the IdSpace set with @p set_IdSpace() exists - * - * @param[in] name Name of the tensor argument to search for - * - * @return true if the tensor argument exists - * @return false if the tensor argument does not exist - */ - bool has_tensor_argument(const std::string& name) - { - const int32_t key_IdSpace = _IdSpace; - const std::string key_var_name = name; - - auto search_IdSpace = _refs.find(key_IdSpace); - - if(search_IdSpace != _refs.end()) - { - auto search_tensor_id = _refs[key_IdSpace].find(key_var_name); - - return search_tensor_id != 
_refs[key_IdSpace].end(); - } - else - { - return false; - } - } - /** - * @brief Check whether the tensor argument is in the the IdSpace provided by the user - * - * @param[in] name Name of the tensor argument to search for - * @param[in] IdSpace The IdSpace id where to search the tensor argument - * - * @return true if the tile exists - * @return false if the tile does not exist - */ - bool has_tensor_argument(const std::string& name, int32_t IdSpace) - { - const int32_t key_IdSpace = IdSpace; - const std::string key_var_name = name; - - auto search_IdSpace = _refs.find(key_IdSpace); - - if(search_IdSpace != _refs.end()) - { - auto search_tensor_id = _refs[key_IdSpace].find(key_var_name); - - return search_tensor_id != _refs[key_IdSpace].end(); - } - else - { - return false; - } - } -private: - // This method ensures that the key is unique among different components - std::string generate_tensor_name(const std::string& name, int32_t tensor_id) - { - assert(tensor_id >= 0 ); - - return name + std::to_string(tensor_id); - } - - std::map _tensor_arguments {}; - std::map> _refs {}; - int32_t _IdSpace { -1 }; - GpuTargetLanguage _language { GpuTargetLanguage::Unknown }; // Gpu programming language -}; - -enum class OpType : int32_t -{ - Elementwise = 0x0000, - Relational = 0x1000, - Algebra = 0x2000 -}; - -inline std::string to_string(AssignmentOp op) -{ - switch(op) - { - case AssignmentOp::Decrement: - return "-="; - case AssignmentOp::Increment: - return "+="; - default: - assert(false); - return ""; - } -} - -inline std::string to_string(BinaryOp op) -{ - switch(op) - { - case BinaryOp::Add: - return "+"; - case BinaryOp::Sub: - return "-"; - case BinaryOp::Mul: - return "*"; - case BinaryOp::Div: - return "/"; - case BinaryOp::Mod: - return "%"; - case BinaryOp::Equal: - return "=="; - case BinaryOp::Less: - return "<"; - case BinaryOp::LessEqual: - return "<="; - case BinaryOp::Greater: - return ">"; - case BinaryOp::GreaterEqual: - return ">="; - case BinaryOp::LogicalAnd: - return "&&"; - case BinaryOp::LogicalOr: - return "||"; - case BinaryOp::LogicalNot: - return "!"; - default: - assert(false); - return ""; - } -} - -inline std::string binary_op_string(BinaryOp op) -{ - switch(op) - { - case BinaryOp::Add: - return "add"; - case BinaryOp::Sub: - return "sub"; - case BinaryOp::Mul: - return "mul"; - case BinaryOp::Div: - return "div"; - case BinaryOp::Mod: - return "mod"; - case BinaryOp::Equal: - return "eq"; - case BinaryOp::Less: - return "gt"; - case BinaryOp::LessEqual: - return "gteq"; - case BinaryOp::Greater: - return "lt"; - case BinaryOp::GreaterEqual: - return "lte"; - default: - assert(false); - return ""; - } -} - -enum class OperandType : int32_t -{ - Unknown = 0x00000000, - ScalarFp32 = 0x00001011, // Immediate scalar tile - ScalarFp16 = 0x00001012, // Immediate scalar tile - ScalarInt32 = 0x00001021, // Immediate scalar tile - ScalarInt16 = 0x00001022, // Immediate scalar tile - ScalarInt8 = 0x00001024, // Immediate scalar tile - ScalarUInt32 = 0x00001031, // Immediate scalar tile - ScalarUInt16 = 0x00001032, // Immediate scalar tile - ScalarUInt8 = 0x00001034, // Immediate scalar tile - ScalarBool = 0x00001041, // Immediate scalar tile - ScalarTile = 0x00001050, // Scalar from a tile - Tile = 0x00010000, // Tile - TensorStride1 = 0x00100001, // Tensor component - TensorStride2 = 0x00100002, // Tensor component - TensorStride3 = 0x00100003, // Tensor component - TensorStride4 = 0x00100004, // Tensor component - TensorDim0 = 0x00100010, // Tensor component - 
TensorDim1 = 0x00100020, // Tensor component - TensorDim2 = 0x00100030, // Tensor component - TensorDim3 = 0x00100040, // Tensor component - TensorDim4 = 0x00100050, // Tensor component - TensorC = 0x00100010, // Tensor component - TensorW = 0x00100020, // Tensor component - TensorH = 0x00100030, // Tensor component - TensorD = 0x00100040, // Tensor component - TensorN = 0x00100050, // Tensor component - TensorDim1xDim2 = 0x00100100, // Tensor component - TensorDim1xDim2xDim3 = 0x00100200, // Tensor component - TensorWxH = 0x00100300, // Tensor component - TensorWxHxD = 0x00100400, // Tensor component - TensorDataOffset = 0x00100500, // Tensor component -}; - -struct ScalarTileCoord -{ - ScalarTileCoord() {} - ScalarTileCoord(int32_t x0, int32_t y0) : x(x0), y(y0) {} - int32_t x { -1 }; - int32_t y { -1 }; -}; -/** - * @brief Operand class. This object is used to pass the operands to the operations performed by the writer. - * Operand can be of three types: - * -# Scalar immediate: constant expression - * -# Tile: A tile - * -# Tensor component: A component (scalar) of a tensor - * - */ -class Operand -{ -public: - Operand(const std::string &val) - { - _str = val; - _type = OperandType::Tile; - } - - Operand(const std::string &val, const ScalarTileCoord& coord) - { - _str = val; - _type = OperandType::ScalarTile; - _coord = coord; - } - - Operand(const std::string &val, OperandType type) - { - _str = val; - _type = type; - } - - Operand(const Operand& t) - { - _str = t.value(); - _type = t.type(); - } - - Operand& operator=(const Operand& t) - { - _str = t.value(); - _type = t.type(); - _coord = t.scalar_tile_coordinate(); - return *this; - } - - std::string value() const - { - return _str; - } - - OperandType type() const - { - return _type; - } - - ScalarTileCoord scalar_tile_coordinate() const - { - return _coord; - } - -private: - std::string _str {}; - OperandType _type { OperandType::Unknown }; - ScalarTileCoord _coord {}; -}; - -enum class GpuSamplerTensorStorage : int32_t -{ - Unknown = static_cast(GpuTensorStorage::Unknown), - BufferUint8Ptr = static_cast(GpuTensorStorage::BufferUint8Ptr), - Image2dReadOnly = static_cast(GpuTensorStorage::Image2dReadOnly), - Image2dWriteOnly = static_cast(GpuTensorStorage::Image2dWriteOnly), - Image3dReadOnly = static_cast(GpuTensorStorage::Image3dReadOnly), - Image3dWriteOnly = static_cast(GpuTensorStorage::Image2dWriteOnly), -}; - -struct GpuSampler -{ - GpuSampler() = default; - TensorSamplerFormat format { TensorSamplerFormat::Unknown }; - GpuSamplerTensorStorage storage { GpuSamplerTensorStorage::Unknown }; - TensorSamplerAddressModeX address_mode_x { TensorSamplerAddressModeX::Unknown }; - TensorSamplerAddressModeY address_mode_y { TensorSamplerAddressModeY::Unknown }; - TensorSamplerAddressModeZ address_mode_z { TensorSamplerAddressModeZ::Unknown }; -}; - -inline GpuSampler create_simple_sampler(const TensorInfo* tensor_info_id, GpuSampler sampler, int32_t step_x, int32_t step_y, int32_t step_z) -{ - CKW_UNUSED(step_x, step_y, step_z); - - auto tensor = tensor_info_id->shape; - - GpuSampler dst_sampler; - dst_sampler.format = sampler.format; - dst_sampler.storage = GpuSamplerTensorStorage::BufferUint8Ptr; - dst_sampler.address_mode_x = sampler.address_mode_x; - dst_sampler.address_mode_y = sampler.address_mode_y; - dst_sampler.address_mode_z = sampler.address_mode_z; - - int32_t dim_x = 0; - int32_t dim_y = 0; - int32_t dim_z = 0; - - switch(sampler.format) - { - case TensorSamplerFormat::C_W_H: - dim_x = tensor[0]; - dim_y = tensor[1]; - 
dim_z = tensor[2]; - break; - case TensorSamplerFormat::C_WH_1: - dim_x = tensor[0]; - dim_y = tensor[1] * tensor[2]; - dim_z = 1; - break; - default: - std::cout << "Unsupported tensor format" << std::endl; - assert(false); - break; - } - - if(dim_x == 1) - { - assert(step_x == 1); - dst_sampler.address_mode_x = TensorSamplerAddressModeX::None; - } - - if(dim_y == 1) - { - assert(step_y == 1); - dst_sampler.address_mode_y = TensorSamplerAddressModeY::None; - } - - if(dim_z == 1) - { - assert(step_z == 1); - dst_sampler.address_mode_z = TensorSamplerAddressModeZ::None; - } - - return dst_sampler; -} - -class GpuOutputSampler -{ -public: - GpuOutputSampler() = default; - /** - * @brief Method used to initialize the GpuOutputSampler. The GpuOutputSampler can be initialized only once - * by the root component. Once initialized, all simpler components will need to used this sampler - * or a broadcasted version of it - * - * @param[in] sampler GpuSampler - * @param[in] step_x Increment step in the X direction. Not necessarily it is the same of n0 of tile! - * @param[in] step_y Increment step in the Y direction. Not necessarily it is the same of m0 of tile! - * @param[in] step_z Increment step in the Z direction. Not necessarily it is the same of d0 of tile! - */ - void initialize(const TensorInfo *tensor_info_id, GpuSamplerTensorStorage tensor_storage, TensorSamplerFormat tensor_format, int32_t step_x, int32_t step_y, int32_t step_z) - { - assert(_is_initialized == false); - - _step_x = step_x; - _step_y = step_y; - _step_z = step_z; - _tensor_info_id = tensor_info_id; - _sampler = create_sampler(tensor_storage, tensor_format); - _is_initialized = true; - }; - - GpuSampler sampler() const - { - return _sampler; - }; - - int32_t step_x() const - { - return _step_x; - }; - - int32_t step_y() const - { - return _step_y; - }; - - int32_t step_z() const - { - return _step_z; - }; -private: - GpuSampler create_sampler(GpuSamplerTensorStorage tensor_storage, TensorSamplerFormat tensor_format) - { - // Output can only be in output mode - assert(tensor_storage != GpuSamplerTensorStorage::Image2dReadOnly); - assert(tensor_storage != GpuSamplerTensorStorage::Image3dReadOnly); - - auto tensor = _tensor_info_id->shape; - - GpuSampler sampler; - sampler.format = tensor_format; - sampler.storage = tensor_storage; - sampler.address_mode_x = TensorSamplerAddressModeX::None; - sampler.address_mode_y = TensorSamplerAddressModeY::None; - sampler.address_mode_z = TensorSamplerAddressModeZ::None; - - // In the case of texture, we do not need any special checks at the border - if(tensor_storage == GpuSamplerTensorStorage::BufferUint8Ptr) - { - int32_t dim_x = 0; - int32_t dim_y = 0; - int32_t dim_z = 0; - - switch(tensor_format) - { - case TensorSamplerFormat::C_W_H: - dim_x = tensor[0]; - dim_y = tensor[1]; - dim_z = tensor[2]; - break; - case TensorSamplerFormat::C_WH_1: - dim_x = tensor[0]; - dim_y = tensor[1] * tensor[2]; - dim_z = 1; - break; - default: - std::cout << "Unsupported tensor format" << std::endl; - assert(false); - break; - } - - if((dim_x % _step_x) != 0 && dim_x != 1) - { - sampler.address_mode_x = TensorSamplerAddressModeX::OverlappingMin; - } - - if((dim_y % _step_y) != 0 && dim_y != 1) - { - sampler.address_mode_y = TensorSamplerAddressModeY::ClampToMaxEdgeOnly; - } - - if((dim_z % _step_z) != 0 && dim_z != 1) - { - sampler.address_mode_z = TensorSamplerAddressModeZ::ClampToMaxEdgeOnly; - } - } - - return sampler; - } - GpuSampler _sampler { }; // GpuSampler - int32_t _step_x { 1 }; - int32_t 
_step_y { 1 }; - int32_t _step_z { 1 }; - const TensorInfo* _tensor_info_id { nullptr }; - bool _is_initialized { false }; -}; - -/** - * @brief Tensor operand class. This object is used to pass the operands as tensor to the operations performed by the writer. - */ -class TensorOperand -{ -public: - TensorOperand(const std::string &val, GpuSampler sampler) : _str(val), _sampler(sampler) - { - } - - TensorOperand& operator=(const TensorOperand& t) - { - _str = t.value(); - _sampler = t.sampler(); - return *this; - } - - std::string value() const - { - return _str; - } - - GpuSampler sampler() const - { - return _sampler; - } - -private: - std::string _str {}; - GpuSampler _sampler {}; -}; - -/** - * @brief Data structure that contains all the necessary information to write the Gpu kernel with the Gpu kernel Writer - * This data structure must be initialized before being passed to the Gpu Kernel Writer - * - */ -class GpuKernelWriterDataHolder -{ -public: - /** - * @brief Construct a new Gpu Kernel Data object. In this phase, we should also store - * the GPU target and target specific capabilities (extensions). For now, we just initialize the - * programming language - * - * @param[in] language Gpu programming language to use - */ - GpuKernelWriterDataHolder(GpuTargetLanguage language) : tiles(language), arguments(language), code(""), _language(language) - { - } - /** - * @brief Get the Gpu programming language used - * - * @return GpuTargetLanguage the Gpu programming language - */ - GpuTargetLanguage programming_language() const - { - return _language; - } - /** - * @brief @ref GpuTileRegistry - * - */ - GpuTileRegistry tiles{}; - /** - * @brief @ref GpuTensorArgumentRegistry - * - */ - GpuTensorArgumentRegistry arguments{}; - /** - * @brief @ref GpuOutputSampler. - * - */ - GpuOutputSampler output_sampler{}; - /** - * @brief Source code - * - */ - std::string code{}; - - // GpuExtensionRegistry extensions{}; -private: - GpuTargetLanguage _language; -}; - -struct LWS -{ - int32_t x {1}; - int32_t y {1}; - int32_t z {1}; -}; - -/** - * @brief Utility class used to get the tile from the operand. If the operand is not a tile, @ref OperandUnpacker - * declare an anonymous tile in the tile registry. 
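A standalone sketch of the address-mode rule applied by GpuOutputSampler::create_sampler() above for buffer storage: a dimension only needs out-of-bound handling when the iteration step does not divide it exactly, and a dimension of size 1 needs none; X then overlaps towards the start while Y and Z clamp to the max edge. The enum and function below are hypothetical stand-ins for the decision only.

    #include <cstdint>
    #include <iostream>

    enum class AddressMode { None, OverlappingMin, ClampToMaxEdgeOnly };

    // Mirrors the checks in GpuOutputSampler::create_sampler().
    AddressMode pick_address_mode(int32_t dim, int32_t step, bool is_x)
    {
        if(dim == 1 || (dim % step) == 0)
        {
            return AddressMode::None; // exact fit or broadcast dimension: no boundary check
        }
        return is_x ? AddressMode::OverlappingMin : AddressMode::ClampToMaxEdgeOnly;
    }

    int main()
    {
        // A 17-element X dimension written with step_x = 4 needs overlapping accesses...
        std::cout << (pick_address_mode(17, 4, true) == AddressMode::OverlappingMin) << "\n"; // 1
        // ...while an 8-element Y dimension with step_y = 4 divides exactly and needs no check.
        std::cout << (pick_address_mode(8, 4, false) == AddressMode::None) << "\n";           // 1
    }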
- */ -class OperandUnpacker -{ -public: - OperandUnpacker(GpuTileRegistry& tiles, GpuTensorArgumentRegistry& arguments) : _tiles(tiles), _arguments(arguments) - { - // Increase the level of the stack to allocate possible temporary tiles - _tiles.increment_registry_level(); - }; - - ~OperandUnpacker() - { - // Decrease the level of the stack to deallocate any temporary tiles - _tiles.decrement_registry_level(); - } - - IVectorTile* unpack(const Operand& src) - { - // Get the tile - if(src.type() == OperandType::Tile) - { - assert(_tiles.has_tile(src.value())); - return _tiles[src.value()]; - } - // Create an anonymous tile with a constant - else if(static_cast(src.type()) & 0x00001000) - { - if(src.type() == OperandType::ScalarTile) - { - ScalarTileCoord coord = src.scalar_tile_coordinate(); - assert(_tiles.has_tile(src.value())); - assert(coord.x >= 0); - assert(coord.y >= 0); - auto val = _tiles[src.value()]->scalar(coord.x, coord.y); - return _tiles.insert({{{val.str}}}, val.type.dt); - } - else - { - return _tiles.insert({{{src.value()}}}, to_tile_data_type(src.type())); - } - } - // Create an anonymous tile with the tensor component - else - { - assert(_arguments.has_tensor_argument(src.value())); - auto x = _arguments[src.value()]; - const std::string val = x->component(to_tensor_component(src.type())); - const DataType dt = x->component_data_type(); - return _tiles.insert({{{val}}}, dt); - } - } - -private: - DataType to_tile_data_type(OperandType x) - { - return static_cast(static_cast(x) & 0x00ff); - } - - TensorComponent to_tensor_component(OperandType x) - { - switch(x) - { - case OperandType::TensorDim0: - return TensorComponent::Dim0; - case OperandType::TensorDim1: - return TensorComponent::Dim1; - case OperandType::TensorDim2: - return TensorComponent::Dim2; - case OperandType::TensorDim3: - return TensorComponent::Dim3; - case OperandType::TensorDim4: - return TensorComponent::Dim4; - case OperandType::TensorStride1: - return TensorComponent::Stride1; - case OperandType::TensorStride2: - return TensorComponent::Stride2; - case OperandType::TensorStride3: - return TensorComponent::Stride3; - case OperandType::TensorStride4: - return TensorComponent::Stride4; - case OperandType::TensorDim1xDim2: - return TensorComponent::Dim1xDim2; - case OperandType::TensorDim1xDim2xDim3: - return TensorComponent::Dim1xDim2xDim3; - case OperandType::TensorDataOffset: - return TensorComponent::OffsetFirstElement; - default: - assert(false); - return TensorComponent::Unknown; - } - } - - GpuTileRegistry& _tiles; - GpuTensorArgumentRegistry& _arguments; -}; - -/** - * @brief Utility class used to get the tensor argument from the operand. If the operand is not a tile, @ref OperandUnpacker - * declare an anonymous tile in the tile registry. - * Tensor dimension reduction aims for reducing the tensor data dimension while keeping data's tensor structure. 
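A short usage sketch of the operand kinds defined above and how OperandUnpacker resolves each of them (usage only; it assumes the declarations in this file are in scope and that a tile "acc" and a tensor "src" were previously declared in the two registries; note that for immediates the low byte of OperandType doubles as the DataType, which is what to_tile_data_type() extracts).

    void operand_unpacker_sketch(GpuTileRegistry &tiles, GpuTensorArgumentRegistry &args)
    {
        OperandUnpacker unpacker(tiles, args); // opens a temporary registry level for anonymous tiles

        Operand tile_op("acc");                          // an already-declared tile
        Operand imm_op("0.5f", OperandType::ScalarFp32); // immediate -> anonymous constant tile
        Operand coord_op("acc", ScalarTileCoord(0, 1));  // one scalar lane taken from a tile
        Operand dim_op("src", OperandType::TensorDim1);  // tensor component -> src_dim1 argument (or its literal value)

        IVectorTile *t0 = unpacker.unpack(tile_op);
        IVectorTile *t1 = unpacker.unpack(imm_op);
        IVectorTile *t2 = unpacker.unpack(coord_op);
        IVectorTile *t3 = unpacker.unpack(dim_op);
        (void)t0; (void)t1; (void)t2; (void)t3;

        // The anonymous tiles created for imm_op, coord_op and dim_op are popped again
        // when the OperandUnpacker goes out of scope.
    }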
- */ -class TensorOperandUnpacker -{ -public: - TensorOperandUnpacker(GpuTensorArgumentRegistry& arguments) : _arguments(arguments) - { - }; - - IGpuTensorArgument* unpack(const TensorOperand& src) - { - assert(_arguments.has_tensor_argument(src.value())); - return _arguments[src.value()]; - } - -private: - GpuTensorArgumentRegistry& _arguments; -}; - -/** - * @brief The GpuKernel will be used in three occasions (stages): - * #- Compilation stage - * #- Tuning stage - * #- Dispatch stage - */ -struct GpuKernel -{ - // Compilation stage - std::string code {}; // Source code, required for the compilation stage - std::vector list_extensions{}; // Extensions, required for the compilation stage - // Tuning stage - std::string config_id {}; // Unique id, required for the tuning stage - std::vector list_lws{}; // LWS to test, required for the tuning stage - // Dispatch stage - GpuOutputSampler output_sampler{}; // GpuOutputSampler, required for the dispatch stage - std::vector> list_tensor_storages; // List of tensor storages, required for the dispatch stage - std::vector> list_tensor_components;// List of tensor components (width, stride,..), required for the dispatch stage) -}; - -// This function should produce an object with the source -inline std::string generate_code(GpuKernelWriterDataHolder &in, const std::string& name) -{ - std::string code; - code += "__kernel void "; - code += name; - code += "(\n"; - - auto IdSpaces = in.arguments.IdSpace_declarations(); - - std::vector arg_str; - - auto tensor_args = in.arguments.tensor_argument_declarations(); - - for(auto &i : tensor_args) - { - // For each tensor used, get the storage and tensor components - auto storages = i->storage_declarations(); - auto components = i->component_declarations(); - - for(auto &y : storages) - { - std::string str; - str += i->storage_type_declaration(y); - str += " "; - str += i->storage(y); - arg_str.push_back(str); - } - - for(auto &y : components) - { - std::string str; - str += i->component_type_declaration(); - str += " "; - str += i->component(y); - arg_str.push_back(str); - } - } - - for(size_t i = 0; i < arg_str.size(); ++i) - { - code += arg_str[i]; - if(i + 1 < arg_str.size()) - { - code += ",\n"; - } - } - - code += ")\n"; - code += "{\n"; - code += in.code; - code += "}\n"; - - return code; -} - -/** - * @brief This class is responsible to map a N-Tensor to a 3d tensor. 
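For orientation, generate_code() above emits roughly the following OpenCL source for a kernel that consumed a single buffer-backed tensor src through its pointer, stride1 and dim0 (illustrative output only; the real argument list contains exactly the storages and components that were queried while the body was written):

    __kernel void my_fused_kernel(
    __global uchar* src_ptr,
    int src_stride1,
    int src_dim0)
    {
    /* body accumulated in GpuKernelWriterDataHolder::code */
    }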
The mapper needs the GpuSampler to know - * how to reduce the dimensionality of a tensor - * - */ -class GpuTensor3dMapper -{ -public: - GpuTensor3dMapper(IGpuTensorArgument* tensor, GpuSampler sampler) : _sampler(sampler), _tensor(tensor) - { - }; - - std::string tensor_component_x() const - { - const auto format = _sampler.format; - switch(format) - { - case TensorSamplerFormat::C_WH_1: - case TensorSamplerFormat::C_W_H: - return _tensor->component(TensorComponent::C); - default: - std::cout << "Unsupported tensor format" << std::endl; - assert(false); - return ""; - } - } - - std::string tensor_component_y() const - { - const auto format = _sampler.format; - switch(format) - { - case TensorSamplerFormat::C_WH_1: - return _tensor->component(TensorComponent::WxH); - case TensorSamplerFormat::C_W_H: - return _tensor->component(TensorComponent::W); - default: - std::cout << "Unsupported tensor format" << std::endl; - assert(false); - return ""; - } - } - - std::string tensor_component_z() const - { - const auto format = _sampler.format; - switch(format) - { - case TensorSamplerFormat::C_WH_1: - return "1"; - case TensorSamplerFormat::C_W_H: - return _tensor->component(TensorComponent::H); - default: - std::cout << "Unsupported tensor format" << std::endl; - assert(false); - return ""; - } - } - - std::string tensor_component_stride_y() const - { - const auto format = _sampler.format; - switch(format) - { - case TensorSamplerFormat::C_WH_1: - case TensorSamplerFormat::C_W_H: - return _tensor->component(TensorComponent::Stride1); - default: - std::cout << "Unsupported tensor format" << std::endl; - assert(false); - return ""; - } - } - - std::string tensor_component_stride_z() const - { - const auto format = _sampler.format; - switch(format) - { - case TensorSamplerFormat::C_WH_1: - return "0"; - case TensorSamplerFormat::C_W_H: - return _tensor->component(TensorComponent::Stride2); - default: - std::cout << "Unsupported tensor format" << std::endl; - assert(false); - return ""; - } - } - - std::string tensor_component_stride_batch() const - { - const auto format = _sampler.format; - switch(format) - { - case TensorSamplerFormat::C_WH_1: - case TensorSamplerFormat::C_W_H: - return _tensor->component(TensorComponent::Stride3); - default: - std::cout << "Unsupported tensor format" << std::endl; - assert(false); - return ""; - } - } - - bool is_one_component_x() const - { - auto t = _tensor->format(); - const auto format = _sampler.format; - switch(format) - { - case TensorSamplerFormat::C_WH_1: - case TensorSamplerFormat::C_W_H: - return t.shape[0] == 1; - default: - std::cout << "Unsupported tensor format" << std::endl; - assert(false); - return ""; - } - } - - bool is_one_component_y() const - { - auto t = _tensor->format(); - const auto format = _sampler.format; - switch(format) - { - case TensorSamplerFormat::C_WH_1: - return (t.shape[1] * t.shape[2]) == 1; - case TensorSamplerFormat::C_W_H: - return t.shape[1] == 1; - default: - std::cout << "Unsupported tensor format" << std::endl; - assert(false); - return ""; - } - } - - bool is_one_component_z() const - { - auto t = _tensor->format(); - const auto format = _sampler.format; - switch(format) - { - case TensorSamplerFormat::C_WH_1: - return true; - case TensorSamplerFormat::C_W_H: - return t.shape[2] == 1; - default: - std::cout << "Unsupported tensor format" << std::endl; - assert(false); - return ""; - } - } - - bool is_one_component_batch() const - { - auto t = _tensor->format(); - const auto format = _sampler.format; - switch(format) 
- { - case TensorSamplerFormat::C_WH_1: - case TensorSamplerFormat::C_W_H: - return t.shape[3] == 1; - default: - std::cout << "Unsupported tensor format" << std::endl; - assert(false); - return ""; - } - } - - GpuSampler gpu_sampler() const - { - return _sampler; - } - - IGpuTensorArgument* tensor_argument() const - { - return _tensor; - } - -private: - GpuSampler _sampler; - IGpuTensorArgument* _tensor; -}; - -struct GpuKernelWriterAttribute -{ - bool return_tensor_component_by_value { false }; -}; - -enum class ConvertPolicy -{ - Wrap, /**< Wrap around */ - Saturate /**< Saturate */ -}; - -enum class RoundingMode -{ - None, - Rte, - Rtz, - Rtp, - Rtn -}; - -// https://llvm.org/docs/tutorial/MyFirstLanguageFrontend/LangImpl05.html -class IGpuKernelWriter -{ -public: - virtual ~IGpuKernelWriter() = default; - virtual void set_IdSpace(int32_t id) = 0; - virtual void import_tile(const std::string& dst, const IVectorTile *src) = 0; - virtual void declare_argument(const std::string& name, const TensorInfo& tensor) = 0; - virtual void declare_tile(const std::string& name, const TileInfo& info) = 0; - virtual void declare_const_tile(const std::string& name, const std::vector>& in, DataType dt) = 0; - virtual void write_text(const std::string& x) = 0; - virtual void compound_statement_begin() = 0; - virtual void compound_statement_end() = 0; - - // Operations - virtual void op_get_global_id(const Operand& dst_var, int32_t dim) = 0; - virtual void op_get_global_coord(const Operand& dst, const Operand& step, const TensorOperand& tensor, int32_t dim) = 0; - virtual void op_get_global_batch(const Operand& dst, const TensorOperand& tensor) = 0; - virtual void op_get_global_size(const Operand& dst_var, int32_t dim) = 0; - virtual void op_binary_expression(const Operand& dst, const Operand &lhs, BinaryOp op, const Operand &rhs) = 0; - virtual void op_assign(const Operand& dst_name, const Operand& src_name) = 0; - virtual void op_scalar_function(const Operand& dst_name, const Operand& src_name, ScalarUnaryFunction func) = 0; - virtual void op_if(const Operand& lhs, BinaryOp op, const Operand& rhs) = 0; - virtual void op_for_loop(const Operand& var_name, BinaryOp cond_op, const Operand& cond_value, AssignmentOp update_op, const Operand& update_value) = 0; - virtual void op_load_indirect(const TensorOperand& tensor, const Operand& dst, const Operand& x, const Operand& y_indirect, const Operand& z, const Operand& b = Operand("0", OperandType::ScalarInt32)) = 0; - virtual void op_load_immediate(const TensorOperand& tensor, const Operand& dst, const Operand& x, const Operand& y, const Operand& z, const Operand& b = Operand("0", OperandType::ScalarInt32), const Operand& dilation_y = Operand("1", OperandType::ScalarInt32)) = 0; - virtual void op_store_immediate(const TensorOperand& tensor, const Operand& src, const Operand& x, const Operand& y, const Operand& z, const Operand& b = Operand("0", OperandType::ScalarInt32)) = 0; - virtual void op_cast_expression(const Operand& dst, const Operand &src, ConvertPolicy policy) = 0; - virtual void op_return() = 0; - // virtual void op_else() = 0; - // virtual void op_elseif() = 0; - // Utils - // It is the process of converting - virtual void util_get_indirect_buffer(const Operand& dst, const TensorOperand& tensor, const Operand& x, const Operand& y, const Operand& x_off, const Operand& y_off) = 0; -}; - -enum class GpuLoadStoreType -{ - Load = 1, - Store = 2 -}; - -class IGpuLoadStoreHelperWriter -{ -public: - IGpuLoadStoreHelperWriter(IGpuKernelWriter *x, 
GpuTensor3dMapper mapper, GpuLoadStoreType type) : _writer(x), _mapper(mapper), _type(type) {} - IGpuLoadStoreHelperWriter(const IGpuLoadStoreHelperWriter &) = default; - IGpuLoadStoreHelperWriter &operator=(const IGpuLoadStoreHelperWriter &) = default; - virtual ~IGpuLoadStoreHelperWriter() = default; - virtual void initialize(IVectorTile *dst, IVectorTile *x, IVectorTile *z, IVectorTile *b) = 0; - virtual void write(const std::pair& y) = 0; - virtual void finalize() = 0; -protected: - IGpuKernelWriter* _writer; - GpuTensor3dMapper _mapper; - GpuLoadStoreType _type; -}; - -class ClLoadStoreBufferHelperWriter : public IGpuLoadStoreHelperWriter -{ -public: - ClLoadStoreBufferHelperWriter(IGpuKernelWriter *x, const GpuTensor3dMapper& mapper, GpuLoadStoreType type) : IGpuLoadStoreHelperWriter(x, mapper, type) - { - } - - ClLoadStoreBufferHelperWriter(const ClLoadStoreBufferHelperWriter &) = default; - ClLoadStoreBufferHelperWriter &operator=(const ClLoadStoreBufferHelperWriter &) = default; - - static bool validate(IGpuKernelWriter *x, GpuTensor3dMapper mapper, GpuLoadStoreType type, IVectorTile *dst) - { - CKW_UNUSED(x, type, dst); - - if(mapper.gpu_sampler().storage != GpuSamplerTensorStorage::BufferUint8Ptr) - { - return false; - } - return true; - } - - void initialize(IVectorTile *dst, IVectorTile *x, IVectorTile *z, IVectorTile *b) override - { - assert(validate(_writer, _mapper, _type, dst)); - - _dst = dst; - _ls_width_full = dst->format().w; - - _coord_x = x->scalar(0, 0).str; - _coord_z = z->scalar(0, 0).str; - _coord_b = b->scalar(0, 0).str; - _coord_orig_z = _coord_z; - - out_of_bound_initialize_x(_coord_x); - out_of_bound_initialize_z(_coord_z); - - /* - meaning of else: - - x: partial load/store - - y: no load/store operation - - z: no load/store operation - if(x) - { - if(z) - { - if(y) - { - // full load/store width - } - else - { - // no load/store - } - } - else - { - // no load/store - } - } - else - { - if(z) - { - if(y) - { - // partial load/store width - } - else - { - // no load/store - } - } - else - { - // no load/store - } - } - */ - } - - void write(const std::pair& y) override - { - int32_t idx_y = y.first; - std::string coord_y = y.second; - - // The only check required is on Y. 
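To make the emitted control flow concrete: for a buffer-backed float tensor src with TensorSamplerAddressModeY::ClampToBorder, the guard opened by out_of_bound_initialize_y(), the vload produced by to_ls_buffer()/to_ls_buffer_address() and the zero-fill branch added by out_of_bound_finalize_y() combine to roughly the following OpenCL (the destination row acc_0 and the coordinate names are assumed):

    if((y >= 0) && (y < src_dim1))
    {
    acc_0 = vload4(0, (__global float*)(src_ptr + (x) * sizeof(float) + (y) * src_stride1 + (z) * src_stride2));
    }
    else
    {
    acc_0 = 0.0f;
    }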
- out_of_bound_initialize_y(coord_y); - - const std::string dst = _dst->vector(idx_y).str; - const std::string address = to_ls_buffer_address(_coord_x, coord_y, _coord_z, _coord_b); - const std::string ls_buf = to_ls_buffer(_type, _ls_width_full, dst, address); - - _writer->write_text(ls_buf); - _writer->write_text(";\n"); - - out_of_bound_finalize_y(dst); - - // The left over load/store will be written in the finalize stage - if(_ls_width_part.size() != 0) - { - int32_t w = 0; - for(auto &p : _ls_width_part) - { - const std::string dst0 = _dst->vector(w, p, idx_y).str; - const std::string coord_x = _coord_x + " + " + std::to_string(w); - const std::string address = to_ls_buffer_address(coord_x, coord_y, _coord_z, _coord_b); - const std::string ls_buf0 = to_ls_buffer(_type, p, dst0, address); - _leftovers_x.push_back(std::make_pair(std::make_pair(dst0, coord_y), ls_buf0)); - - w += p; - } - } - } - - void finalize() override - { - out_of_bound_finalize_z(); - out_of_bound_finalize_x(); - } -private: - IVectorTile* _dst { nullptr }; - int32_t _ls_width_full { 0 }; - std::vector _ls_width_part { }; - std::vector, std::string>> _leftovers_x {}; - std::string _coord_x {}; - std::string _coord_z {}; - std::string _coord_orig_z {}; - std::string _coord_b {}; - - void out_of_bound_initialize_x(std::string& coord) - { - if(_mapper.gpu_sampler().address_mode_x == TensorSamplerAddressModeX::OverlappingMin) - { - auto tensor_format = _mapper.tensor_argument()->format(); - auto shape = tensor_format.shape; - - _ls_width_part = decompose_leftover_ls_vector_width(shape[0] % _ls_width_full); - if(_ls_width_part.size() != 0) - { - _writer->write_text("if(" + coord + " > 0)\n"); - _writer->compound_statement_begin(); - } - } - }; - - void out_of_bound_finalize_x() - { - if(_mapper.gpu_sampler().address_mode_x == TensorSamplerAddressModeX::OverlappingMin) - { - if(_ls_width_part.size() != 0) - { - _writer->compound_statement_end(); - _writer->write_text("else\n"); - _writer->compound_statement_begin(); - - out_of_bound_initialize_z(_coord_orig_z); - for(auto &i : _leftovers_x) - { - out_of_bound_initialize_y(i.first.second); - _writer->write_text(i.second); - _writer->write_text(";\n"); - out_of_bound_finalize_y(i.first.first); - } - out_of_bound_finalize_z(); - _writer->compound_statement_end(); - } - } - }; - - void out_of_bound_initialize_y(std::string& coord) - { - std::string max = ""; - - const auto address_mode_y = _mapper.gpu_sampler().address_mode_y; - - switch(address_mode_y) - { - case TensorSamplerAddressModeY::Skip: - case TensorSamplerAddressModeY::ClampToBorder: - // NOTE: This line should not be moved outside of the switch statement. - // The reason for that is because when we query the component, the component is marked as used - // and added to the list of arguments of the kernel. 
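When OverlappingMin is active on X, the leftover columns are re-issued with the widest directly supported vector widths. A standalone sketch of that splitting rule (equivalent to the table in decompose_leftover_ls_vector_width() further down; the function name here is hypothetical):

    #include <cstdint>
    #include <iostream>
    #include <vector>

    // Split a partial width into widths OpenCL can vload/vstore directly:
    // widest first (16, 8, 4), then the remaining 1..3 elements as one narrow access.
    std::vector<int32_t> split_leftover_width(int32_t leftover)
    {
        std::vector<int32_t> parts;
        for(int32_t w : {16, 8, 4})
        {
            while(leftover >= w)
            {
                parts.push_back(w);
                leftover -= w;
            }
        }
        if(leftover > 0)
        {
            parts.push_back(leftover);
        }
        return parts;
    }

    int main()
    {
        for(int32_t p : split_leftover_width(13))
        {
            std::cout << p << ' '; // prints: 8 4 1
        }
        std::cout << '\n';
    }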
Since, not in all cases this component is required, - // we should request the component only when used - max = _mapper.tensor_component_y(); - _writer->write_text("if((" + coord + " >= 0) && (" + coord + " < " + max + "))\n"); - _writer->compound_statement_begin(); - break; - case TensorSamplerAddressModeY::SkipMinEdgeOnly: - case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly: - _writer->write_text("if(" + coord + " >= 0)\n"); - _writer->compound_statement_begin(); - break; - case TensorSamplerAddressModeY::SkipMaxEdgeOnly: - case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly: - max = _mapper.tensor_component_y(); - _writer->write_text("if(" + coord + " < " + max + ")\n"); - _writer->compound_statement_begin(); - break; - case TensorSamplerAddressModeY::ClampToNearest: - max = _mapper.tensor_component_y(); - coord = "clamp(" + coord + ", 0, " + max + " - 1)"; - break; - case TensorSamplerAddressModeY::ClampToMaxEdgeOnly: - max = _mapper.tensor_component_y(); - coord = "min(" + coord + ", " + max + " - 1)"; - break; - case TensorSamplerAddressModeY::ClampToMinEdgeOnly: - coord = "max(" + coord + ", 0)"; - break; - case TensorSamplerAddressModeY::None: - break; - default: - std::cout << "Unsupported address mode for write_out_of_bound_check_yz" << std::endl; - assert(false); - } - }; - - void out_of_bound_finalize_y(const std::string& dst) - { - const auto address_mode_y = _mapper.gpu_sampler().address_mode_y; - - switch(address_mode_y) - { - case TensorSamplerAddressModeY::ClampToBorder: - case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly: - case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly: - case TensorSamplerAddressModeY::Skip: - case TensorSamplerAddressModeY::SkipMaxEdgeOnly: - case TensorSamplerAddressModeY::SkipMinEdgeOnly: - _writer->compound_statement_end(); - break; - - default: - assert(false); - } - - switch(address_mode_y) - { - case TensorSamplerAddressModeY::ClampToBorder: - case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly: - case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly: - _writer->write_text("else\n"); - _writer->compound_statement_begin(); - _writer->write_text(dst); - _writer->write_text(" = 0.0f;\n"); - _writer->compound_statement_end(); - break; - - default: - assert(false); - } - }; - - void out_of_bound_initialize_z(std::string& coord) - { - std::string max = ""; - - const auto address_mode_z = _mapper.gpu_sampler().address_mode_z; - - switch(address_mode_z) - { - case TensorSamplerAddressModeZ::Skip: - max = _mapper.tensor_component_z(); - _writer->write_text("if((" + coord + " >= 0) && (" + coord + " < " + max + "))\n"); - _writer->compound_statement_begin(); - break; - case TensorSamplerAddressModeZ::SkipMinEdgeOnly: - _writer->write_text("if(" + coord + " >= 0)\n"); - _writer->compound_statement_begin(); - break; - case TensorSamplerAddressModeZ::SkipMaxEdgeOnly: - max = _mapper.tensor_component_z(); - _writer->write_text("if(" + coord + " < " + max + ")\n"); - _writer->compound_statement_begin(); - break; - case TensorSamplerAddressModeZ::ClampToNearest: - max = _mapper.tensor_component_z(); - coord = "clamp(" + coord + ", 0, " + max + " - 1)"; - break; - case TensorSamplerAddressModeZ::ClampToMaxEdgeOnly: - max = _mapper.tensor_component_z(); - coord = "min(" + coord + ", " + max + " - 1)"; - break; - case TensorSamplerAddressModeZ::ClampToMinEdgeOnly: - coord = "max(" + coord + ", 0)"; - break; - case TensorSamplerAddressModeZ::None: - break; - default: - std::cout << "Unsupported address mode for 
write_out_of_bound_check_yz" << std::endl; - assert(false); - } - }; - - void out_of_bound_finalize_z() - { - const auto address_mode_z = _mapper.gpu_sampler().address_mode_z; - - switch(address_mode_z) - { - case TensorSamplerAddressModeZ::Skip: - case TensorSamplerAddressModeZ::SkipMinEdgeOnly: - case TensorSamplerAddressModeZ::SkipMaxEdgeOnly: - _writer->compound_statement_end(); - break; - - default: - assert(false); - } - }; - - std::vector decompose_leftover_ls_vector_width(int32_t ls_leftover_vector_width) const - { - std::vector x; - - switch(ls_leftover_vector_width) - { - case 0: - break; - case 1: - case 2: - case 3: - case 4: - case 8: - case 16: - x.push_back(ls_leftover_vector_width); - break; - case 5: - x.push_back(4); - x.push_back(1); - break; - case 6: - x.push_back(4); - x.push_back(2); - break; - case 7: - x.push_back(4); - x.push_back(3); - break; - case 9: - x.push_back(8); - x.push_back(1); - break; - case 10: - x.push_back(8); - x.push_back(2); - break; - case 11: - x.push_back(8); - x.push_back(3); - break; - case 12: - x.push_back(8); - x.push_back(4); - break; - case 13: - x.push_back(8); - x.push_back(4); - x.push_back(1); - break; - case 14: - x.push_back(8); - x.push_back(4); - x.push_back(2); - break; - case 15: - x.push_back(8); - x.push_back(4); - x.push_back(3); - break; - - default: - assert(false); - } - return x; - } - - std::string to_ls_buffer(GpuLoadStoreType type, int32_t vector_width, const std::string& data, const std::string& address) - { - switch(type) - { - case GpuLoadStoreType::Load: - if(vector_width != 1) - { - return data + " = vload" + std::to_string(vector_width) + "(0, " + address + ")"; - } - else - { - return data + " = *(" + address + ")"; - } - break; - case GpuLoadStoreType::Store: - if(vector_width != 1) - { - return "vstore" + std::to_string(vector_width) + "(" + data + ", 0, " + address + ")"; - } - else - { - return "*(" + address + ") = " + data; - } - break; - default: - std::cout << "Unsupported GpuLoadStoreType" << std::endl; - assert(false); - return ""; - } - } - - std::string to_ls_buffer_address(const std::string& x, const std::string& y, const std::string& z, const std::string& b) const - { - auto tensor_storage = static_cast(_mapper.gpu_sampler().storage); - assert(tensor_storage == GpuTensorStorage::BufferUint8Ptr); - const std::string ptr_buf = _mapper.tensor_argument()->storage(tensor_storage); - const std::string dst_type = get_cl_data_type(_dst->format().dt, 1); - - std::string address; - address += "(__global "; - address += dst_type; - address += "*)("; - address += ptr_buf; - if(x != "0" && (_mapper.is_one_component_x() != true)) - { - address += " + ("; - address += x + ") * sizeof(" + dst_type + ")"; - } - if(y != "0" && (_mapper.is_one_component_y() != true)) - { - const std::string stride_y = _mapper.tensor_component_stride_y(); - address += " + ("; - address += y + ")"; - address += " * "; - address += stride_y; - } - if(z != "0" && (_mapper.is_one_component_z() != true)) - { - const std::string stride_z = _mapper.tensor_component_stride_z(); - address += " + ("; - address += z + ")"; - address += " * "; - address += stride_z; - } - if(b != "0" && (_mapper.is_one_component_batch() != true)) - { - const std::string stride_b = _mapper.tensor_component_stride_batch(); - address += " + ("; - address += b + ")"; - address += " * "; - address += stride_b; - } - address += ")"; - return address; - } -}; - -class ClLoadStoreImage2dHelperWriter : public IGpuLoadStoreHelperWriter -{ -public: - static bool 
validate(IGpuKernelWriter *x, const GpuTensor3dMapper& mapper, GpuLoadStoreType type, IVectorTile *dst) - { - CKW_UNUSED(x); - - if(dst->format().w != 4) - { - return false; - } - if(mapper.gpu_sampler().address_mode_x != TensorSamplerAddressModeX::None) - { - return false; - } - if(mapper.gpu_sampler().address_mode_z != TensorSamplerAddressModeZ::None) - { - return false; - } - if(mapper.gpu_sampler().storage != GpuSamplerTensorStorage::Image2dReadOnly && type == GpuLoadStoreType::Load) - { - return false; - } - if(mapper.gpu_sampler().storage != GpuSamplerTensorStorage::Image2dWriteOnly && type == GpuLoadStoreType::Store) - { - return false; - } - if((dst->format().dt != DataType::Fp32) && (dst->format().dt != DataType::Fp16)) - { - return false; - } - return true; - /* - - x: Only GpuSamplerAddressModeX::None is supported and vector length = 4 - - z: Only GpuSamplerAddressModeZ::None is supported - */ - } - ClLoadStoreImage2dHelperWriter(IGpuKernelWriter *x, const GpuTensor3dMapper& mapper, GpuLoadStoreType type) : IGpuLoadStoreHelperWriter(x, mapper, type) - { - } - - ClLoadStoreImage2dHelperWriter(const ClLoadStoreImage2dHelperWriter &) = default; - ClLoadStoreImage2dHelperWriter &operator=(const ClLoadStoreImage2dHelperWriter &) = default; - - void initialize(IVectorTile *dst, IVectorTile *x, IVectorTile *z, IVectorTile *b) override - { - assert(validate(_writer, _mapper, _type, dst)); - - _dst = dst; - _ls_width_full = dst->format().w; - _coord_x = x->scalar(0, 0).str; - _coord_z = z->scalar(0, 0).str; - _coord_b = b->scalar(0, 0).str; - - /* - if(y) - { - // full load/store width - } - else - { - // no load/store - } - */ - } - - void write(const std::pair& y) override - { - int32_t idx_y = y.first; - std::string coord_y = y.second; - - // The only check required is on Y. 
- out_of_bound_initialize_y(coord_y); - - const std::string dst = _dst->vector(idx_y).str; - const std::string sampler = to_ls_image2d_sampler(); - const std::string coord = to_ls_image2d_coord(_coord_x, coord_y, _coord_z, _coord_b); - const std::string ls_buf = to_ls_image2d(_type, _ls_width_full, dst, sampler, coord); - - _writer->write_text(ls_buf); - _writer->write_text(";\n"); - - out_of_bound_finalize_y(dst); - } - - void finalize() override - { - } -private: - IVectorTile* _dst { nullptr }; - int32_t _ls_width_full { 0 }; - std::string _coord_x {}; - std::string _coord_z {}; - std::string _coord_b {}; - - void out_of_bound_initialize_y(std::string& coord) - { - std::string max = ""; - - const auto address_mode_y = _mapper.gpu_sampler().address_mode_y; - - switch(address_mode_y) - { - case TensorSamplerAddressModeY::Skip: - max = _mapper.tensor_component_y(); - _writer->write_text("if((" + coord + " >= 0) && (" + coord + " < " + max + "))\n"); - _writer->compound_statement_begin(); - break; - case TensorSamplerAddressModeY::SkipMinEdgeOnly: - _writer->write_text("if(" + coord + " >= 0)\n"); - _writer->compound_statement_begin(); - break; - case TensorSamplerAddressModeY::SkipMaxEdgeOnly: - max = _mapper.tensor_component_y(); - _writer->write_text("if(" + coord + " < " + max + ")\n"); - _writer->compound_statement_begin(); - break; - case TensorSamplerAddressModeY::ClampToBorder: - case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly: - case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly: - case TensorSamplerAddressModeY::ClampToNearest: - case TensorSamplerAddressModeY::ClampToMaxEdgeOnly: - case TensorSamplerAddressModeY::ClampToMinEdgeOnly: - case TensorSamplerAddressModeY::None: - break; - default: - std::cout << "Unsupported address mode for write_out_of_bound_check_y" << std::endl; - assert(false); - } - }; - - void out_of_bound_finalize_y(const std::string& dst) - { - CKW_UNUSED(dst); - - const auto address_mode_y = _mapper.gpu_sampler().address_mode_y; - - switch(address_mode_y) - { - case TensorSamplerAddressModeY::Skip: - case TensorSamplerAddressModeY::SkipMinEdgeOnly: - case TensorSamplerAddressModeY::SkipMaxEdgeOnly: - _writer->compound_statement_end(); - break; - - default: - assert(false); - } - }; - - std::string to_ls_image2d(GpuLoadStoreType type, int32_t vector_width, const std::string& data, const std::string& sampler, const std::string& coord) - { - CKW_UNUSED(vector_width); - - auto tensor_storage = static_cast(_mapper.gpu_sampler().storage); - const std::string image2d_obj = _mapper.tensor_argument()->storage(tensor_storage); - // const DataType dt = _dst->format().dt; - const std::string post_fix = _dst->format().dt == DataType::Fp32? 
"f" : "h"; - - switch(type) - { - case GpuLoadStoreType::Load: - return data + " = read_image" + post_fix + "(" + image2d_obj + ", " + sampler + ", " + coord + ")"; - break; - case GpuLoadStoreType::Store: - return "write_image" + post_fix + "(" + image2d_obj + ", " + coord + ", " + data + ")"; - default: - assert(false); - std::cout << "Unsupported GpuLoadStoreType" << std::endl; - assert(false); - return ""; - } - } - - std::string to_ls_image2d_sampler() const - { - const auto address_mode_y = _mapper.gpu_sampler().address_mode_y; - - switch(address_mode_y) - { - case TensorSamplerAddressModeY::None: - return "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST"; - case TensorSamplerAddressModeY::Skip: - case TensorSamplerAddressModeY::SkipMinEdgeOnly: - case TensorSamplerAddressModeY::SkipMaxEdgeOnly: - case TensorSamplerAddressModeY::ClampToBorder: - case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly: - case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly: - return "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST"; - case TensorSamplerAddressModeY::ClampToNearest: - case TensorSamplerAddressModeY::ClampToMaxEdgeOnly: - case TensorSamplerAddressModeY::ClampToMinEdgeOnly: - return "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST"; - default: - std::cout << "Unsupported address_mode_coord" << std::endl; - assert(false); - return ""; - } - } - - std::string to_ls_image2d_coord(const std::string& x, const std::string& y, const std::string& z, const std::string& b) const - { - std::string coord_x = "(" + x + ") >> 2"; - std::string coord_y = "("; - - if(y != "0" && (_mapper.is_one_component_y() != true)) - { - coord_y += y; - } - if(z != "0" && (_mapper.is_one_component_z() != true)) - { - const std::string dim = _mapper.tensor_component_y(); - coord_y += " + ("; - coord_y += z + ")"; - coord_y += " * "; - coord_y += dim; - } - if(b != "0" && (_mapper.is_one_component_batch() != true)) - { - const std::string dim0 = _mapper.tensor_component_y(); - const std::string dim1 = _mapper.tensor_component_z(); - coord_y += " + ("; - coord_y += b + ")"; - coord_y += " * "; - coord_y += dim0; - coord_y += " * "; - coord_y += dim1; - } - coord_y += ")"; - return "(int2)(" + coord_x + ", " + coord_y + ")"; - } -}; - -/** IGpuLoadStoreHelperWriter factory class */ -class ClLoadStoreHelperWriterFactory final -{ -public: - /** Static method to call the IGpuLoadStoreHelperWriter class accordingly with the tensor storage set in the mapper - * - * - * @return IGpuLoadStoreHelperWriter - */ - static std::unique_ptr create(IGpuKernelWriter *x, const GpuTensor3dMapper& mapper, GpuLoadStoreType type) - { - const auto tensor_storage = mapper.gpu_sampler().storage; - switch(tensor_storage) - { - case GpuSamplerTensorStorage::BufferUint8Ptr: - return std::make_unique(x, mapper, type); - case GpuSamplerTensorStorage::Image2dReadOnly: - case GpuSamplerTensorStorage::Image2dWriteOnly: - return std::make_unique(x, mapper, type); - default: - std::cout << "Unsupported Gpu tensor storage" << std::endl; - assert(false); - return nullptr; - } - } -}; - -// This utility method needs to go in utils.h -inline bool is_tile_scalar(IVectorTile* x) -{ - return x->format().w == 1 && x->format().h == 1; -} - -class ClKernelWriter : public IGpuKernelWriter -{ -public: - ClKernelWriter(GpuKernelWriterAttribute *attr, GpuKernelWriterDataHolder *x) - { - _data = x; - _attr = attr; - } - - ClKernelWriter(const ClKernelWriter &) = default; - ClKernelWriter 
&operator=(const ClKernelWriter &) = default; - - // A IdSpaced ID is a term used to describe a fragment that is registered in ICode to ensure - // there are no conflicts or ambiguity in the code - void set_IdSpace(int32_t id) override - { - _data->tiles.set_IdSpace(id); - _data->arguments.set_IdSpace(id); - } - - void import_tile(const std::string& dst_name, const IVectorTile *src) override - { - _data->tiles.insert(dst_name, src); - } - - void declare_argument(const std::string& name, const TensorInfo& tensor) override - { - assert(_data->arguments[name] == nullptr); - _data->arguments.insert(name, tensor, _attr->return_tensor_component_by_value); - } - - void declare_tile(const std::string& name, const TileInfo& format) override - { - assert(_data->tiles[name] == nullptr); - _data->tiles.insert(name, format); - - IVectorTile *x = _data->tiles[name]; - - for(auto &t : x->underlying_source_variables()) - { - _data->code += t.type.str + " " + t.str + ";\n"; - } - } - - void declare_const_tile(const std::string& name, const std::vector>& in, DataType dt) override - { - assert(_data->tiles[name] == nullptr); - _data->tiles.insert(name, in, dt); - // Note: A constant does not need to be declared in the code - } - - void write_text(const std::string& x) override - { - _data->code += x; - } - - void compound_statement_begin() override - { - _data->tiles.increment_registry_level(); - _data->code += "{\n"; - } - - void compound_statement_end() override - { - _data->tiles.decrement_registry_level(); - _data->code += "}\n"; - } - - void op_get_global_id(const Operand& dst_var, int32_t dim) override - { - assert(dst_var.type() == OperandType::Tile); - assert(_data->tiles.has_tile(dst_var.value())); - assert(_data->tiles[dst_var.value()]->format().w == 1 && - _data->tiles[dst_var.value()]->format().h == 1); // It must be a scalar variable - - auto var = _data->tiles[dst_var.value()]; - - _data->code += var->scalar(0, 0).str; - _data->code += " = get_global_id("; - _data->code += std::to_string(dim); - _data->code += ");\n"; - }; - - void op_get_global_coord(const Operand& o_dst, const Operand& o_step, const TensorOperand& o_tensor, int32_t dim) override - { - OperandUnpacker operands(_data->tiles, _data->arguments); - auto dst = operands.unpack(o_dst); - auto step = operands.unpack(o_step); - - // Validation: Check that x, y and z are scalar - - TensorOperandUnpacker tensor_operands(_data->arguments); - auto tensor = tensor_operands.unpack(o_tensor); - auto gpu_sampler = o_tensor.sampler(); - - GpuTensor3dMapper mapper(tensor, gpu_sampler); - - switch (dim) - { - case 0: - if(mapper.is_one_component_x()) - { - _data->code += dst->scalar(0, 0).str; - _data->code += " = 0;\n"; - } - else - { - if(mapper.gpu_sampler().address_mode_x == TensorSamplerAddressModeX::OverlappingMin) - { - // Validation: Check: fixed tensor shape - // TO BE CHANGED - _data->code += dst->scalar(0, 0).str; - _data->code += " = get_global_id(0) * "; - _data->code += step->scalar(0, 0).str; - _data->code += ";\n"; - } - else - { - _data->code += dst->scalar(0, 0).str; - _data->code += " = get_global_id(0) * "; - _data->code += step->scalar(0, 0).str; - _data->code += ";\n"; - } - } - break; - case 1: - if(mapper.is_one_component_y()) - { - _data->code += dst->scalar(0, 0).str; - _data->code += " = 0;\n"; - } - else - { - if(mapper.gpu_sampler().address_mode_y == TensorSamplerAddressModeY::OverlappingMin) - { - - } - else - { - _data->code += dst->scalar(0, 0).str; - _data->code += " = get_global_id(1) * "; - _data->code += 
step->scalar(0, 0).str; - _data->code += ";\n"; - } - } - break; - case 2: - if(mapper.is_one_component_z()) - { - _data->code += dst->scalar(0, 0).str; - _data->code += " = 0;\n"; - } - else - { - _data->code += dst->scalar(0, 0).str; - _data->code += " = get_global_id(2) * "; - _data->code += step->scalar(0, 0).str; - _data->code += ";\n"; - } - break; - default: - break; - } - }; - - void op_get_global_batch(const Operand& o_dst, const TensorOperand& o_tensor) override - { - OperandUnpacker operands(_data->tiles, _data->arguments); - auto dst = operands.unpack(o_dst); - - TensorOperandUnpacker tensor_operands(_data->arguments); - auto tensor = tensor_operands.unpack(o_tensor); - auto gpu_sampler = o_tensor.sampler(); - - GpuTensor3dMapper mapper(tensor, gpu_sampler); - - if(mapper.is_one_component_batch()) - { - _data->code += dst->scalar(0, 0).str; - _data->code += " = 0;\n"; - } - else - { - std::cout << "Unsupported batched computation" << std::endl; - assert(false); - } - }; - - void op_get_global_size(const Operand& dst_var, int32_t dim) override - { - assert(dst_var.type() == OperandType::Tile); - assert(_data->tiles.has_tile(dst_var.value())); - assert(_data->tiles[dst_var.value()]->format().w == 1 && - _data->tiles[dst_var.value()]->format().h == 1); // It must be a scalar variable - - auto var = _data->tiles[dst_var.value()]; - - _data->code += var->scalar(0, 0).str; - _data->code += " = get_global_size("; - _data->code += std::to_string(dim); - _data->code += ");\n"; - } - - void op_binary_expression(const Operand& dst_name, const Operand& lhs_name, BinaryOp op, const Operand& rhs_name) override - { - OperandUnpacker operands(_data->tiles, _data->arguments); - auto lhs = operands.unpack(lhs_name); - auto rhs = operands.unpack(rhs_name); - auto dst = operands.unpack(dst_name); - - const int32_t dst_w = dst->format().w; - const int32_t dst_h = dst->format().h; - assert(lhs != nullptr); - const int32_t lhs_w = lhs->format().w; - const int32_t rhs_w = rhs->format().w; - - if(op == BinaryOp::MatMul_Nt_T) - { - assert((dst->format().dt == DataType::Fp32) || (dst->format().dt == DataType::Fp16)); - for(int32_t y = 0; y < dst_h; ++y) - { - for(int32_t x = 0; x < dst_w; ++x) - { - for(int32_t k = 0; k < lhs_w; ++k) - { - _data->code += dst->scalar(x, y).str; - _data->code += " = fma("; - _data->code += lhs->scalar(k, y).str; - _data->code += ", "; - _data->code += rhs->scalar(k, x).str; - _data->code += ", "; - _data->code += dst->scalar(x, y).str; - _data->code += ");\n"; - } - } - } - - return; - } - - bool broadcast_lhs_x = dst_w != 1 && lhs_w == 1; - bool broadcast_rhs_x = dst_w != 1 && rhs_w == 1; - - std::string lhs_prefix = broadcast_lhs_x? "(" + dst->underlying_source_variables()[0].type.str + ")" : ""; - std::string rhs_prefix = broadcast_rhs_x? 
"(" + dst->underlying_source_variables()[0].type.str + ")" : ""; - std::string op_str = to_string(op); - - // Broadcasting on Y is automatic - for(int32_t y = 0; y < dst_h; ++y) - { - _data->code += dst->vector(y).str; - _data->code += " = "; - _data->code += lhs_prefix + lhs->vector(y).str; - _data->code += " "; - _data->code += op_str; - _data->code += " "; - _data->code += rhs_prefix + rhs->vector(y).str; - _data->code += ";\n"; - } - }; - - void op_cast_expression(const Operand& o_dst, const Operand &o_src, ConvertPolicy policy) override - { - CKW_UNUSED(policy); - - OperandUnpacker operands(_data->tiles, _data->arguments); - auto src = operands.unpack(o_src); - auto dst = operands.unpack(o_dst); - - // const int32_t dst_w = dst->format().w; - const int32_t dst_h = dst->format().h; - const std::string dt = dst->scalar(0, 0).type.str; - - // Broadcasting on Y is automatic - for(int32_t y = 0; y < dst_h; ++y) - { - _data->code += dst->vector(y).str; - _data->code += " = convert_" + dt + "("; - _data->code += src->vector(y).str; - _data->code += ");\n"; - } - }; - - void op_assign(const Operand& dst_name, const Operand& src_name) override - { - OperandUnpacker operands(_data->tiles, _data->arguments); - auto src = operands.unpack(src_name); - auto dst = operands.unpack(dst_name); - - const int32_t dst_w = dst->format().w; - const int32_t dst_h = dst->format().h; - const int32_t src_w = src->format().w; - // const int32_t src_h = src->format().h; - const std::string dt = dst->scalar(0, 0).type.str; - - bool broadcast_src_x = dst_w != 1 && src_w == 1; - - std::string src_prefix = broadcast_src_x? "(" + dt + ")" : ""; - - // Broadcasting on Y is automatic - for(int32_t y = 0; y < dst_h; ++y) - { - _data->code += dst->vector(y).str; - _data->code += " = "; - _data->code += src_prefix + src->vector(y).str; - _data->code += ";\n"; - } - } - - void op_scalar_function(const Operand& dst_name, const Operand& src_name, ScalarUnaryFunction func) override - { - OperandUnpacker operands(_data->tiles, _data->arguments); - auto src = operands.unpack(src_name); - auto dst = operands.unpack(dst_name); - - const int32_t dst_w = dst->format().w; - const int32_t dst_h = dst->format().h; - const int32_t src_w = src->format().w; - // const int32_t src_h = src->format().h; - const std::string dt = dst->scalar(0, 0).type.str; - - bool broadcast_src_x = dst_w != 1 && src_w == 1; - - std::string src_prefix = broadcast_src_x? 
"(" + dt + ")" : ""; - - // Broadcasting on Y is automatic - for(int32_t y = 0; y < dst_h; ++y) - { - _data->code += dst->vector(y).str; - _data->code += " = "; - - switch(func) - { - case ScalarUnaryFunction::Exp: - _data->code += "exp("; - break; - - default: - CKW_ASSERT(false); - } - - _data->code += src_prefix + src->vector(y).str; - _data->code += ");\n"; - } - } - - void op_if(const Operand& o_lhs, BinaryOp op, const Operand& o_rhs) override - { - OperandUnpacker operands(_data->tiles, _data->arguments); - auto lhs = operands.unpack(o_lhs); - auto rhs = operands.unpack(o_rhs); - - assert(is_tile_scalar(lhs)); - assert(is_tile_scalar(rhs)); - - _data->code += "if("; - _data->code += lhs->scalar(0, 0).str; - _data->code += " "; - _data->code += to_string(op); - _data->code += " "; - _data->code += rhs->scalar(0, 0).str; - _data->code += ")\n"; - } - - void op_for_loop(const Operand& var_name, BinaryOp cond_op, const Operand& cond_value_name, AssignmentOp update_op, const Operand& update_value_name) override - { - OperandUnpacker operands(_data->tiles, _data->arguments); - auto var = operands.unpack(var_name); - auto cond_value = operands.unpack(cond_value_name); - auto update_value = operands.unpack(update_value_name); - - const int32_t dst_w = var->format().w; - const int32_t dst_h = var->format().h; - - // It must be a scalar variable - CKW_UNUSED(dst_w, dst_h); - assert(dst_w == 1); - assert(dst_h == 1); - - _data->code += "for(; " ; - _data->code += var->scalar(0, 0).str; - _data->code += " "; - _data->code += to_string(cond_op); - _data->code += " " + cond_value->scalar(0, 0).str + "; "; - _data->code += var->scalar(0, 0).str; - _data->code += " "; - _data->code += to_string(update_op); - _data->code += " " + update_value->scalar(0, 0).str + ")"; - _data->code += "\n"; - } - - void op_load_immediate(const TensorOperand& o_tensor, const Operand& o_dst, const Operand& o_x, const Operand& o_y, const Operand& o_z, const Operand& o_batch_idx, const Operand& dilation_y) override - { - OperandUnpacker operands(_data->tiles, _data->arguments); - auto dst = operands.unpack(o_dst); - auto x = operands.unpack(o_x); - auto y = operands.unpack(o_y); - auto z = operands.unpack(o_z); - auto dil_y = operands.unpack(dilation_y); - auto b = operands.unpack(o_batch_idx); - - TensorOperandUnpacker tensor_operands(_data->arguments); - auto tensor = tensor_operands.unpack(o_tensor); - auto gpu_sampler = o_tensor.sampler(); - - GpuTensor3dMapper mapper(tensor, gpu_sampler); - - auto load_writer = ClLoadStoreHelperWriterFactory::create(this, mapper, GpuLoadStoreType::Load); - - // Initialize the constant part - load_writer->initialize(dst, x, z, b); - - for(int i = 0; i < dst->format().h; ++i) - { - std::string coord_y = y->scalar(0, 0).str + " + " + std::to_string(i); - if(dil_y->scalar(0, 0).str != "1") - { - coord_y += " * " + dil_y->scalar(0, 0).str; - } - load_writer->write(std::make_pair(i, coord_y)); - } - - load_writer->finalize(); - } - - void op_load_indirect(const TensorOperand& o_tensor, const Operand& o_dst, const Operand& o_x, const Operand& o_indirect_h, const Operand& o_z, const Operand& o_batch_idx) override - { - OperandUnpacker operands(_data->tiles, _data->arguments); - auto dst = operands.unpack(o_dst); - auto x = operands.unpack(o_x); - auto y_ind = operands.unpack(o_indirect_h); - auto z = operands.unpack(o_z); - auto b = operands.unpack(o_batch_idx); - - TensorOperandUnpacker tensor_operands(_data->arguments); - auto tensor = tensor_operands.unpack(o_tensor); - auto gpu_sampler = 
o_tensor.sampler(); - - GpuTensor3dMapper mapper(tensor, gpu_sampler); - - auto load_writer = ClLoadStoreHelperWriterFactory::create(this, mapper, GpuLoadStoreType::Load); - - // Initialize the constant part - load_writer->initialize(dst, x, z, b); - - for(int i = 0; i < dst->format().h; ++i) - { - load_writer->write(std::make_pair(i, y_ind->scalar(0, i).str)); - } - - load_writer->finalize(); - } - - void op_store_immediate(const TensorOperand& tensor_name, const Operand& src_name, const Operand& x_name, const Operand& y_name, const Operand& z_name, const Operand& batch_index_name) override - { - OperandUnpacker operands(_data->tiles, _data->arguments); - auto src = operands.unpack(src_name); - auto x = operands.unpack(x_name); - auto y = operands.unpack(y_name); - auto z = operands.unpack(z_name); - auto b = operands.unpack(batch_index_name); - - TensorOperandUnpacker tensor_operands(_data->arguments); - auto tensor = tensor_operands.unpack(tensor_name); - auto gpu_sampler = tensor_name.sampler(); - - GpuTensor3dMapper mapper(tensor, gpu_sampler); - - auto store_writer = ClLoadStoreHelperWriterFactory::create(this, mapper, GpuLoadStoreType::Store); - - // Initialize the constant part - store_writer->initialize(src, x, z, b); - - int32_t tile_h = src->format().h; - - for(int m0 = tile_h - 1; m0 >= 0; m0--) - { - store_writer->write(std::make_pair(m0, y->scalar(0, 0).str + " + " + std::to_string(m0))); - } - - store_writer->finalize(); - } - - void op_return() override - { - _data->code += "return;\n"; - } - - void util_get_indirect_buffer(const Operand& o_dst, const TensorOperand& o_tensor, const Operand& o_x, const Operand& o_y, const Operand& o_x_off, const Operand& o_y_off) override - { - OperandUnpacker operands(_data->tiles, _data->arguments); - auto dst = operands.unpack(o_dst); - auto x = operands.unpack(o_x); - auto y = operands.unpack(o_y); - auto x_off = operands.unpack(o_x_off); - auto y_off = operands.unpack(o_y_off); - - TensorOperandUnpacker tensor_operands(_data->arguments); - auto tensor = tensor_operands.unpack(o_tensor); - - assert(dst->format().w == 1); - assert(x->format().w == 1); - assert(y->format().w == 1); - assert(x_off->format().w == 1); - assert(y_off->format().w == 1); - assert(dst->format().dt == DataType::Int32); - assert(x->format().dt == DataType::Int32); - assert(y->format().dt == DataType::Int32); - assert(x_off->format().dt == DataType::Int32); - assert(y_off->format().dt == DataType::Int32); - - const std::string width = tensor->component(TensorComponent::W); - const std::string height = tensor->component(TensorComponent::H); - const std::string wxh = tensor->component(TensorComponent::WxH); - /* - int x_s; - int y_s; - x_s = (xi_0 + x_k); - y_s = (yi_0 + y_k); - mi_0 = x_s + y_s * width + b * widthxheight; - mi_0 = select(-1, mi_0, x_s >= 0); - mi_0 = select(-1, mi_0, y_s >= 0); - mi_0 = select(-1, mi_0, x_s < 128); - mi_0 = select(-1, mi_0, y_s < 128); - */ - compound_statement_begin(); - declare_tile("_x_s", TileInfo(DataType::Int32)); - declare_tile("_y_s", TileInfo(DataType::Int32)); - auto x_s = operands.unpack(Operand("_x_s")); - auto y_s = operands.unpack(Operand("_y_s")); - for(int i = 0; i < dst->format().h; ++i) - { - // x_s = (xi_0 + x_k); - // y_s = (yi_0 + y_k); - _data->code += x_s->scalar(0, i).str; - _data->code += " = ("; - _data->code += x->scalar(0, i).str; - _data->code += " + "; - _data->code += x_off->scalar(0, i).str; - _data->code += ");\n"; - _data->code += y_s->scalar(0, i).str; - _data->code += " = ("; - _data->code += 
y->scalar(0, i).str; - _data->code += " + "; - _data->code += y_off->scalar(0, i).str; - _data->code += ");\n"; - // mi_0 = x_s + y_s * width; - _data->code += dst->scalar(0, i).str; - _data->code += " = "; - _data->code += x_s->scalar(0, i).str; - _data->code += " + "; - _data->code += y_s->scalar(0, i).str; - _data->code += " * " + width + ";\n"; - // mi_0 = select(wxh, mi_0, x_s >= 0); - _data->code += dst->scalar(0, i).str; - _data->code += " = select(-1, "; - _data->code += dst->scalar(0, i).str; - _data->code += ", "; - _data->code += x_s->scalar(0, i).str; - _data->code += " >= 0);\n"; - // mi_0 = select(wxh, mi_0, y_s >= 0); - _data->code += dst->scalar(0, i).str; - _data->code += " = select(-1, "; - _data->code += dst->scalar(0, i).str; - _data->code += ", "; - _data->code += y_s->scalar(0, i).str; - _data->code += " >= 0);\n"; - // mi_0 = select(wxh, mi_0, x_s < width); - _data->code += dst->scalar(0, i).str; - _data->code += " = select(-1, "; - _data->code += dst->scalar(0, i).str; - _data->code += ", "; - _data->code += x_s->scalar(0, i).str; - _data->code += " < "; - _data->code += width + ");\n"; - // mi_0 = select(wxh, mi_0, y_s < height); - _data->code += dst->scalar(0, i).str; - _data->code += " = select(-1, "; - _data->code += dst->scalar(0, i).str; - _data->code += ", "; - _data->code += y_s->scalar(0, i).str; - _data->code += " < "; - _data->code += height + ");\n"; - } - compound_statement_end(); - } - -private: - GpuKernelWriterDataHolder* _data { nullptr }; - GpuKernelWriterAttribute * _attr { nullptr }; -}; - -/** IGpuKernelWriter factory class */ -class GpuKernelWriterFactory final -{ -public: - /** Static method to call the IGpuKernelWriter class accordingly with the Gpu programming language - * - * @param[in] gpu GPU target - * - * @return IGpuKernelWriter - */ - static std::unique_ptr create(GpuKernelWriterAttribute *attr, GpuKernelWriterDataHolder *x) - { - switch(x->programming_language()) - { - case GpuTargetLanguage::OpenCL: - return std::make_unique(attr, x); - default: - std::cout << "Unsupported Gpu programming language" << std::endl; - assert(false); - return nullptr; - } - } -}; - -inline int32_t adjust_step(TensorSamplerFormat tensor_format, int32_t step, const TensorInfo *tensor_info_id, int32_t idx) -{ - auto tensor = tensor_info_id->shape; - - int32_t dim[3] = {0}; - - switch(tensor_format) - { - case TensorSamplerFormat::C_W_H: - dim[0] = tensor[0]; - dim[1] = tensor[1]; - dim[2] = tensor[2]; - break; - case TensorSamplerFormat::C_WH_1: - dim[0] = tensor[0]; - dim[1] = tensor[1] * tensor[2]; - dim[2] = 1; - break; - default: - std::cout << "Unsupported tensor format" << std::endl; - assert(false); - break; - } - - return std::min(step, dim[idx]); -} - -} // namespace prototype -} // namespace ckw - -#endif // CKW_SRC_PROTOTYPE_H diff --git a/compute_kernel_writer/src/TensorOperand.cpp b/compute_kernel_writer/src/TensorOperand.cpp deleted file mode 100644 index 00ecc3824e..0000000000 --- a/compute_kernel_writer/src/TensorOperand.cpp +++ /dev/null @@ -1,247 +0,0 @@ -/* - * Copyright (c) 2023 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "ckw/TensorOperand.h" -#include "ckw/Error.h" -#include "ckw/Kernel.h" -#include "ckw/TileOperand.h" -#include "src/Prototype.h" - -namespace ckw -{ - -namespace -{ - -inline TensorComponentOperand &get_or_create_component(std::unique_ptr &ptr, const ::std::string &name, TensorComponent component) -{ - if(ptr == nullptr) - { - ptr = std::make_unique(name, component); - } - - return *ptr; -} - -} // namespace - -// ================================================================================================= -// TensorOperand -// ================================================================================================= - -TensorOperand::TensorOperand(const std::string &name, const TensorInfo &info) - : OperandBase(name), _info(info) -{ -} - -prototype::Operand TensorOperand::create_impl_operand(prototype::IGpuKernelWriter *writer) const -{ - CKW_UNUSED(writer); - return { name() }; -} - -const TensorInfo &TensorOperand::info() const -{ - return _info; -} - -TensorInfo &TensorOperand::info() -{ - return _info; -} - -DataType TensorOperand::data_type() const -{ - return _info.data_type(); -} - -bool TensorOperand::is_constant() const -{ - return false; -} - -const TileOperand &TensorOperand::tile() const -{ - return *_tile; -} - -TileOperand &TensorOperand::tile() -{ - return *_tile; -} - -TensorOperand &TensorOperand::tile(TileOperand &tile) -{ - _tile = &tile; - return *this; -} - -const TensorTileSampler &TensorOperand::tile_sampler() const -{ - return _tile_sampler; -} - -TensorTileSampler &TensorOperand::tile_sampler() -{ - return _tile_sampler; -} - -TensorOperand &TensorOperand::tile_sampler(const TensorTileSampler &value) -{ - _tile_sampler = value; - return *this; -} - -TileOperand &TensorOperand::stride1() -{ - return get_or_create_component(_stride1, name(), TensorComponent::Stride1); -} - -TileOperand &TensorOperand::stride2() -{ - return get_or_create_component(_stride2, name(), TensorComponent::Stride2); -} - -TileOperand &TensorOperand::stride3() -{ - return get_or_create_component(_stride3, name(), TensorComponent::Stride3); -} - -TileOperand &TensorOperand::stride4() -{ - return get_or_create_component(_stride4, name(), TensorComponent::Stride4); -} - -TileOperand &TensorOperand::dim0() -{ - return get_or_create_component(_dim0, name(), TensorComponent::Dim0); -} - -TileOperand &TensorOperand::dim1() -{ - return 
get_or_create_component(_dim1, name(), TensorComponent::Dim1); -} - -TileOperand &TensorOperand::dim2() -{ - return get_or_create_component(_dim2, name(), TensorComponent::Dim2); -} - -TileOperand &TensorOperand::dim3() -{ - return get_or_create_component(_dim3, name(), TensorComponent::Dim3); -} - -TileOperand &TensorOperand::dim4() -{ - return get_or_create_component(_dim4, name(), TensorComponent::Dim4); -} - -TileOperand &TensorOperand::dim1_dim2() -{ - return get_or_create_component(_dim1_dim2, name(), TensorComponent::Dim1xDim2); -} - -TileOperand &TensorOperand::dim1_dim2_dim3() -{ - return get_or_create_component(_dim1_dim2_dim3, name(), TensorComponent::Dim1xDim2xDim3); -} - -TileOperand &TensorOperand::offset_first_element_in_bytes() -{ - return get_or_create_component(_offset_first_element_in_bytes, name(), TensorComponent::OffsetFirstElement); -} - -// ================================================================================================= -// TensorComponentOperand -// ================================================================================================= - -TensorComponentOperand::TensorComponentOperand(const ::std::string &name, TensorComponent component) - : TileOperand(name, DataType::Int32), _component(component) -{ -} - -prototype::Operand TensorComponentOperand::create_impl_operand(prototype::IGpuKernelWriter *writer) const -{ - CKW_UNUSED(writer); - prototype::OperandType type{ prototype::OperandType::Unknown }; - - switch(_component) - { - case TensorComponent::OffsetFirstElement: - type = prototype::OperandType::TensorDataOffset; - break; - - case TensorComponent::Stride1: - type = prototype::OperandType::TensorStride1; - break; - - case TensorComponent::Stride2: - type = prototype::OperandType::TensorStride2; - break; - - case TensorComponent::Stride3: - type = prototype::OperandType::TensorStride3; - break; - - case TensorComponent::Stride4: - type = prototype::OperandType::TensorStride4; - break; - - case TensorComponent::Dim0: - type = prototype::OperandType::TensorDim0; - break; - - case TensorComponent::Dim1: - type = prototype::OperandType::TensorDim1; - break; - - case TensorComponent::Dim2: - type = prototype::OperandType::TensorDim2; - break; - - case TensorComponent::Dim3: - type = prototype::OperandType::TensorDim3; - break; - - case TensorComponent::Dim4: - type = prototype::OperandType::TensorDim4; - break; - - case TensorComponent::Dim1xDim2: - type = prototype::OperandType::TensorDim1xDim2; - break; - - case TensorComponent::Dim1xDim2xDim3: - type = prototype::OperandType::TensorDim1xDim2xDim3; - break; - - default: - CKW_ASSERT(false); - } - - return prototype::Operand(name(), type); -} - -} // namespace ckw diff --git a/compute_kernel_writer/src/TensorTileSampler.cpp b/compute_kernel_writer/src/TensorTileSampler.cpp deleted file mode 100644 index 143d550dec..0000000000 --- a/compute_kernel_writer/src/TensorTileSampler.cpp +++ /dev/null @@ -1,167 +0,0 @@ -/* - * Copyright (c) 2023 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "ckw/TensorTileSampler.h" -#include "ckw/TileOperand.h" -#include "ckw/Types.h" - -namespace ckw -{ - -TensorTileSampler::TensorTileSampler() -{ -} - -TensorTileSampler::TensorTileSampler( - TileOperand &x, TileOperand &y, TileOperand &z, TileOperand &b, - TensorSamplerFormat format, - TensorSamplerAddressModeX address_mode_x, - TensorSamplerAddressModeY address_mode_y, - TensorSamplerAddressModeZ address_mode_z) - : _x(&x), _y(&y), _z(&z), _b(&b), _height(0), _width(0), _format(format), _address_mode_x(address_mode_x), _address_mode_y(address_mode_y), _address_mode_z(address_mode_z) -{ -} - -TensorTileSampler::TensorTileSampler( - TileOperand &x, TileOperand &y, TileOperand &z, TileOperand &b, - int32_t height, int32_t width, - TensorSamplerFormat format, - TensorSamplerAddressModeX address_mode_x, - TensorSamplerAddressModeY address_mode_y, - TensorSamplerAddressModeZ address_mode_z) - : _x(&x), _y(&y), _z(&z), _b(&b), _height(height), _width(width), _format(format), _address_mode_x(address_mode_x), _address_mode_y(address_mode_y), _address_mode_z(address_mode_z) -{ -} - -const TileOperand &TensorTileSampler::x() const -{ - return *_x; -} - -TensorTileSampler &TensorTileSampler::x(TileOperand &x) -{ - _x = &x; - return *this; -} - -const TileOperand &TensorTileSampler::y() const -{ - return *_y; -} - -TensorTileSampler &TensorTileSampler::y(TileOperand &y) -{ - _y = &y; - return *this; -} - -const TileOperand &TensorTileSampler::z() const -{ - return *_z; -} - -TensorTileSampler &TensorTileSampler::z(TileOperand &z) -{ - _z = &z; - return *this; -} - -const TileOperand &TensorTileSampler::b() const -{ - return *_b; -} - -TensorTileSampler &TensorTileSampler::b(TileOperand &b) -{ - _b = &b; - return *this; -} - -int32_t TensorTileSampler::width() const -{ - return _width; -} - -TensorTileSampler &TensorTileSampler::width(int32_t width) -{ - _width = width; - return *this; -} - -int32_t TensorTileSampler::height() const -{ - return _height; -} - -TensorTileSampler &TensorTileSampler::height(int32_t height) -{ - _height = height; - return *this; -} - -TensorSamplerFormat TensorTileSampler::format() const -{ - return _format; -} - -TensorTileSampler &TensorTileSampler::format(TensorSamplerFormat format) -{ - _format = format; - return *this; -} - -TensorSamplerAddressModeX TensorTileSampler::address_mode_x() const -{ - return _address_mode_x; -} - -TensorTileSampler 
&TensorTileSampler::address_mode_x(TensorSamplerAddressModeX address_mode_x) -{ - _address_mode_x = address_mode_x; - return *this; -} - -TensorSamplerAddressModeY TensorTileSampler::address_mode_y() const -{ - return _address_mode_y; -} - -TensorTileSampler &TensorTileSampler::address_mode_y(TensorSamplerAddressModeY address_mode_y) -{ - _address_mode_y = address_mode_y; - return *this; -} - -TensorSamplerAddressModeZ TensorTileSampler::address_mode_z() const -{ - return _address_mode_z; -} - -TensorTileSampler &TensorTileSampler::address_mode_z(TensorSamplerAddressModeZ address_mode_z) -{ - _address_mode_z = address_mode_z; - return *this; -} - -} // namespace ckw diff --git a/compute_kernel_writer/src/TileOperand.cpp b/compute_kernel_writer/src/TileOperand.cpp deleted file mode 100644 index 091947628d..0000000000 --- a/compute_kernel_writer/src/TileOperand.cpp +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Copyright (c) 2023 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "ckw/TileOperand.h" -#include "ckw/Error.h" -#include "src/Prototype.h" - -namespace ckw -{ - -TileOperand::TileOperand(const std::string &name, const TileInfo &info) - : OperandBase(name), _info(info), _value{ 0 }, _constant(false) -{ -} - -TileOperand::TileOperand(const std::string &name, DataType data_type) - : OperandBase(name), _info(TileInfo{ data_type }), _value(0), _constant(false) -{ -} - -TileOperand::TileOperand(const std::string &name, int32_t value) - : OperandBase(name), _info(TileInfo{ DataType::Int32 }), _value(value), _constant(true) -{ -} - -TileOperand::TileOperand(const std::string &name, float value) - : OperandBase(name), _info(TileInfo{ DataType::Fp32 }), _value(value), _constant(true) -{ -} - -prototype::Operand TileOperand::create_impl_operand(prototype::IGpuKernelWriter *writer) const -{ - CKW_UNUSED(writer); - - if(_constant) - { - switch(_info.data_type()) - { - case DataType::Int32: - return prototype::Operand(std::to_string(_value.get()), prototype::OperandType::ScalarInt32); - - case DataType::Fp32: - return prototype::Operand(std::to_string(_value.get()), prototype::OperandType::ScalarFp32); - - default: - CKW_ASSERT(false); - } - } - else - { - return prototype::Operand(name(), prototype::OperandType::Tile); - } -} - -const TileInfo &TileOperand::tile_info() const -{ - return _info; -} - -DataType TileOperand::data_type() const -{ - return _info.data_type(); -} - -bool TileOperand::is_constant() const -{ - return _constant; -} - -bool TileOperand::is_scalar() const -{ - return _info.width() == 1 && _info.height() == 1; -} - -ScalarValue TileOperand::scalar_value() const -{ - CKW_ASSERT(is_scalar()); - CKW_ASSERT(is_constant()); - - return _value; -} - -} // namespace ckw diff --git a/compute_kernel_writer/src/acl/AclComponentArgument.cpp b/compute_kernel_writer/src/acl/AclComponentArgument.cpp deleted file mode 100644 index 5cb909021e..0000000000 --- a/compute_kernel_writer/src/acl/AclComponentArgument.cpp +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright (c) 2023 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "acl/AclComponentArgument.h" -#include "ckw/Error.h" - -AclComponentArgument::AclComponentArgument() -{ -} - -AclComponentArgument::AclComponentArgument(ckw::TensorOperand &tensor) - : _tensor(&tensor) -{ -} - -AclComponentArgument &AclComponentArgument::init_virtual_tensor(ckw::TileOperand &tile, const ckw::TensorTileSampler &tile_sampler) -{ - CKW_ASSERT(_tile == nullptr); - - _tile = &tile; - _tile_sampler = tile_sampler; - - return *this; -} - -bool AclComponentArgument::has_tensor() const -{ - return _tensor != nullptr; -} - -ckw::TensorOperand &AclComponentArgument::tensor() -{ - CKW_ASSERT(_tensor != nullptr); - - return *_tensor; -} - -const ckw::TensorOperand &AclComponentArgument::tensor() const -{ - CKW_ASSERT(_tensor != nullptr); - - return *_tensor; -} - -bool AclComponentArgument::has_tile() const -{ - return _tile != nullptr; -} - -ckw::TileOperand &AclComponentArgument::tile() -{ - CKW_ASSERT(_tile != nullptr); - - return *_tile; -} - -const ckw::TileOperand &AclComponentArgument::tile() const -{ - CKW_ASSERT(_tile != nullptr); - - return *_tile; -} - -ckw::TensorTileSampler &AclComponentArgument::tile_sampler() -{ - CKW_ASSERT(_tile != nullptr); - - return _tile_sampler; -} - -const ckw::TensorTileSampler &AclComponentArgument::tile_sampler() const -{ - CKW_ASSERT(_tile != nullptr); - - return _tile_sampler; -} diff --git a/compute_kernel_writer/src/acl/AclKernelWriter.cpp b/compute_kernel_writer/src/acl/AclKernelWriter.cpp deleted file mode 100644 index a44e798c61..0000000000 --- a/compute_kernel_writer/src/acl/AclKernelWriter.cpp +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright (c) 2023 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "acl/AclKernelWriter.h" -#include "acl/AclComponentArgument.h" -#include "ckw/Error.h" -#include "ckw/TileInfo.h" - -AclKernelWriter::AclKernelWriter(ckw::Kernel &kernel) - : KernelWriter(kernel) -{ -} - -void AclKernelWriter::op_load_once(AclComponentArgument *tensor_or_tile, const ckw::TensorTileSampler &sampler) -{ - if(!tensor_or_tile->has_tile()) - { - CKW_ASSERT(tensor_or_tile->has_tensor()); - - auto &tensor = tensor_or_tile->tensor(); - - const auto tile_name = tensor.name() + "_tile"; - auto &tile = declare_tile(tile_name.c_str(), ckw::TileInfo(tensor.data_type(), sampler.height(), sampler.width())); - - op_load(tile, tensor, sampler); - - tensor_or_tile->init_virtual_tensor(tile, sampler); - } -} diff --git a/compute_kernel_writer/src/acl/AclScopedKernelWriter.cpp b/compute_kernel_writer/src/acl/AclScopedKernelWriter.cpp deleted file mode 100644 index 2a73d47592..0000000000 --- a/compute_kernel_writer/src/acl/AclScopedKernelWriter.cpp +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2023 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "acl/AclScopedKernelWriter.h" -#include "acl/AclKernelWriter.h" - -AclScopedKernelWriter::AclScopedKernelWriter(AclKernelWriter *writer) - : _writer(writer), _parent_id_space(writer->id_space()) -{ - _writer->next_id_space(); -} - -AclScopedKernelWriter::AclScopedKernelWriter(const AclScopedKernelWriter &other) - : _writer(other._writer), _parent_id_space(other._writer->id_space()) -{ - _writer->next_id_space(); -} - -AclKernelWriter *AclScopedKernelWriter::operator->() -{ - return _writer; -} - -const AclKernelWriter *AclScopedKernelWriter::operator->() const -{ - return _writer; -} - -AclKernelWriter *AclScopedKernelWriter::writer() -{ - return _writer; -} - -const AclKernelWriter *AclScopedKernelWriter::writer() const -{ - return _writer; -} diff --git a/filelist.json b/filelist.json index a8f6ffc19c..2214e47459 100644 --- a/filelist.json +++ b/filelist.json @@ -2338,9 +2338,12 @@ "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp", "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp", "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp", + "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp", "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp", "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp", "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp", + "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.cpp", + "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp", "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp", "src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.cpp" ] diff --git a/scripts/clang_tidy_rules.py b/scripts/clang_tidy_rules.py index 56b33f3922..11ca6c45f5 100755 --- a/scripts/clang_tidy_rules.py +++ b/scripts/clang_tidy_rules.py @@ -28,7 +28,7 @@ import re import sys def get_list_includes(): - return "compute_kernel_writer/include " \ + return "compute_kernel_writer/prototype/include " \ "src/cpu/kernels/assembly " \ "src/core/NEON/kernels/assembly " \ "src/core/NEON/kernels/convolution/winograd " \ diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp new file mode 100644 index 0000000000..4b4c22fa1d --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h" +#include "ckw/Error.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ + +GpuCkwComponentArgument::GpuCkwComponentArgument() +{ +} + +GpuCkwComponentArgument::GpuCkwComponentArgument(ckw::TensorOperand &tensor) + : _tensor(&tensor) +{ +} + +GpuCkwComponentArgument &GpuCkwComponentArgument::init_virtual_tensor(ckw::TileOperand &tile, const ckw::TensorTileSampler &tile_sampler) +{ + CKW_ASSERT(_tile == nullptr); + + _tile = &tile; + _tile_sampler = tile_sampler; + + return *this; +} + +bool GpuCkwComponentArgument::has_tensor() const +{ + return _tensor != nullptr; +} + +ckw::TensorOperand &GpuCkwComponentArgument::tensor() +{ + CKW_ASSERT(_tensor != nullptr); + + return *_tensor; +} + +const ckw::TensorOperand &GpuCkwComponentArgument::tensor() const +{ + CKW_ASSERT(_tensor != nullptr); + + return *_tensor; +} + +bool GpuCkwComponentArgument::has_tile() const +{ + return _tile != nullptr; +} + +ckw::TileOperand &GpuCkwComponentArgument::tile() +{ + CKW_ASSERT(_tile != nullptr); + + return *_tile; +} + +const ckw::TileOperand &GpuCkwComponentArgument::tile() const +{ + CKW_ASSERT(_tile != nullptr); + + return *_tile; +} + +ckw::TensorTileSampler &GpuCkwComponentArgument::tile_sampler() +{ + CKW_ASSERT(_tile != nullptr); + + return _tile_sampler; +} + +const ckw::TensorTileSampler &GpuCkwComponentArgument::tile_sampler() const +{ + CKW_ASSERT(_tile != nullptr); + + return _tile_sampler; +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h new file mode 100644 index 0000000000..80f91389a0 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWCOMPONENTARGUMENT_H +#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWCOMPONENTARGUMENT_H + +#include "ckw/TensorTileSampler.h" + +namespace ckw +{ +class TensorOperand; +class TileOperand; +} // namespace ckw + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ + +/** The argument of a dynamic fusion component which can be either user tensor or virtual tensor. */ +class GpuCkwComponentArgument +{ +public: + /** Initialize a new instance of @ref GpuCkwComponentArgument class for empty virtual tensor. */ + GpuCkwComponentArgument(); + + /** Initialize a new instance of @ref GpuCkwComponentArgument class for user tensor. + * + * @param[in] tensor The user tensor. + */ + explicit GpuCkwComponentArgument(ckw::TensorOperand &tensor); + + /** Set virtual tensor information (tile, sampler) for the argument. + * + * If the component is a user tensor, it can be treated as virtual tensor as well + * and won't be loaded again using @ref GpuCkwKernelWriter::op_load_once method. + * + * @param[in] tile The tile that has been loaded. + * @param[in] sampler The tensor sampling information that has been used to load the tile. + */ + GpuCkwComponentArgument &init_virtual_tensor(ckw::TileOperand &tile, const ckw::TensorTileSampler &sampler); + + /** Get whether the argument is a user tensor. */ + bool has_tensor() const; + + /** Get the tensor operand. + * + * If the tensor is not available, throw an error. + */ + ckw::TensorOperand &tensor(); + + /** Get the tensor operand. + * + * If the tensor is not available, throw an error. + */ + const ckw::TensorOperand &tensor() const; + + /** Get whether the argument contains a tile. + * + * The argument can be either a user tensor that has been loaded, + * or a virtual tensor (i.e. a tile with tensor sampling information). + */ + bool has_tile() const; + + /** Get the tile operand. + * + * If the tile is not available, throw an error. + */ + ckw::TileOperand &tile(); + + /** Get the tile operand. + * + * If the tile is not available, throw an error. + */ + const ckw::TileOperand &tile() const; + + /** Get the tensor sampling information for the tile. + * + * If the tile is not available, throw an error. + */ + ckw::TensorTileSampler &tile_sampler(); + + /** Get the tensor sampling information for the tile. + * + * If the tile is not available, throw an error. 
+ */ + const ckw::TensorTileSampler &tile_sampler() const; + +private: + ckw::TensorOperand *_tensor{ nullptr }; + ckw::TileOperand *_tile{ nullptr }; + ckw::TensorTileSampler _tile_sampler{}; +}; + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute + +#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWCOMPONENTARGUMENT_H diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp index 7c8ec8777d..d5c03c60c5 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp @@ -31,8 +31,8 @@ #include "src/common/utils/Log.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" -#include "acl/AclKernelWriter.h" -#include "acl/AclScopedKernelWriter.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" using namespace ckw; namespace arm_compute @@ -55,10 +55,10 @@ std::string GpuCkwDriver::get_name() std::string GpuCkwDriver::get_code() { ARM_COMPUTE_LOG_PARAMS(std::string("[V1] TODO")); - ckw::Kernel kernel(get_name().c_str(), GpuTargetLanguage::OpenCL); - AclKernelWriter root_writer(kernel); - AclScopedKernelWriter writer(&root_writer); - GpuCkwVariableTable vtable{}; + ckw::Kernel kernel(get_name().c_str(), GpuTargetLanguage::OpenCL); + GpuCkwKernelWriter root_writer(kernel); + GpuCkwScopedKernelWriter writer(&root_writer); + GpuCkwVariableTable vtable{}; // Global Kernel Writer Driver code diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.cpp new file mode 100644 index 0000000000..ca4f121566 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.cpp @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h" +#include "ckw/Error.h" +#include "ckw/TileInfo.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ + +GpuCkwKernelWriter::GpuCkwKernelWriter(ckw::Kernel &kernel) + : KernelWriter(kernel) +{ +} + +void GpuCkwKernelWriter::op_load_once(GpuCkwComponentArgument *tensor_or_tile, const ckw::TensorTileSampler &sampler) +{ + if(!tensor_or_tile->has_tile()) + { + CKW_ASSERT(tensor_or_tile->has_tensor()); + + auto &tensor = tensor_or_tile->tensor(); + + const auto tile_name = tensor.name() + "_tile"; + auto &tile = declare_tile(tile_name.c_str(), ckw::TileInfo(tensor.data_type(), sampler.height(), sampler.width())); + + op_load(tile, tensor, sampler); + + tensor_or_tile->init_virtual_tensor(tile, sampler); + } +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h new file mode 100644 index 0000000000..b916e6b28b --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWKERNELWRITER_H +#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWKERNELWRITER_H + +#include "ckw/KernelWriter.h" +#include "ckw/TensorTileSampler.h" + +namespace ckw +{ +class Kernel; +} // namespace ckw + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ + +class GpuCkwComponentArgument; + +/** Extended implementation of kernel writer for dynamic fusion. */ +class GpuCkwKernelWriter : public ckw::KernelWriter +{ +public: + /** Initialize a new instance of @ref GpuCkwKernelWriter class. + * + * @param[in] kernel The kernel to be generated. + */ + explicit GpuCkwKernelWriter(ckw::Kernel &kernel); + + /** Load the user tensor to the tile in the same component argument if it hasn't been loaded. + * + * @param[in] tensor_or_tile The component argument that is either a user tensor or a virtual tensor. + * @param[in] sampler The tensor sampling information to load the tile. 
+ */ + void op_load_once(GpuCkwComponentArgument *tensor_or_tile, const ckw::TensorTileSampler &sampler); +}; + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute + +#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWKERNELWRITER_H diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp new file mode 100644 index 0000000000..043fda9e6f --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ + +GpuCkwScopedKernelWriter::GpuCkwScopedKernelWriter(GpuCkwKernelWriter *writer) + : _writer(writer), _parent_id_space(writer->id_space()) +{ + _writer->next_id_space(); +} + +GpuCkwScopedKernelWriter::GpuCkwScopedKernelWriter(const GpuCkwScopedKernelWriter &other) + : _writer(other._writer), _parent_id_space(other._writer->id_space()) +{ + _writer->next_id_space(); +} + +GpuCkwKernelWriter *GpuCkwScopedKernelWriter::operator->() +{ + return _writer; +} + +const GpuCkwKernelWriter *GpuCkwScopedKernelWriter::operator->() const +{ + return _writer; +} + +GpuCkwKernelWriter *GpuCkwScopedKernelWriter::writer() +{ + return _writer; +} + +const GpuCkwKernelWriter *GpuCkwScopedKernelWriter::writer() const +{ + return _writer; +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h new file mode 100644 index 0000000000..4d11b5e3e4 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWSCOPEDKERNELWRITER_H +#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWSCOPEDKERNELWRITER_H + +#include + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ + +class GpuCkwKernelWriter; + +/** Helper to automatically manage kernel writer ID space. */ +class GpuCkwScopedKernelWriter +{ +public: + /** Initialize a new instance of @ref GpuCkwScopedKernelWriter class. */ + explicit GpuCkwScopedKernelWriter(GpuCkwKernelWriter *writer); + + /** Create a new scope from the specified scoped kernel writer. */ + GpuCkwScopedKernelWriter(const GpuCkwScopedKernelWriter &other); + + /** Assignment is disallowed. */ + GpuCkwScopedKernelWriter &operator=(const GpuCkwScopedKernelWriter &) = delete; + + /** Access the underlying kernel writer. */ + GpuCkwKernelWriter *operator->(); + + /** Access the underlying kernel writer. */ + const GpuCkwKernelWriter *operator->() const; + + /** Get the kernel writer. */ + GpuCkwKernelWriter *writer(); + + /** Get the kernel writer. 
*/ + const GpuCkwKernelWriter *writer() const; + +private: + GpuCkwKernelWriter *_writer; + int32_t _parent_id_space; +}; + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute + +#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWSCOPEDKERNELWRITER_H diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp index 85aed282d1..4475586db8 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp @@ -23,8 +23,8 @@ */ #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" -#include "acl/AclKernelWriter.h" -#include "acl/AclScopedKernelWriter.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/TypeConverter.h" #include @@ -35,7 +35,7 @@ namespace experimental { namespace dynamic_fusion { -AclComponentArgument *GpuCkwVariableTable::declare_variable(const GpuKernelComponentGroup &comp_group, AclScopedKernelWriter &writer, const ITensorInfo *tensor, const std::string &alias) +GpuCkwComponentArgument *GpuCkwVariableTable::declare_variable(const GpuKernelComponentGroup &comp_group, GpuCkwScopedKernelWriter &writer, const ITensorInfo *tensor, const std::string &alias) { ARM_COMPUTE_ERROR_ON_MSG(!tensor->has_valid_id(), "Tensor info with valid id expected"); @@ -49,8 +49,8 @@ AclComponentArgument *GpuCkwVariableTable::declare_variable(const GpuKernelCompo if(comp_group.is_intermediate_tensor(tensor)) { // Create a virtual tensor variable - AclComponentArgument var; - auto &&inserted = _vars.emplace(tensor->id(), var); + GpuCkwComponentArgument var; + auto &&inserted = _vars.emplace(tensor->id(), var); return &(inserted.first->second); } else @@ -58,9 +58,9 @@ AclComponentArgument *GpuCkwVariableTable::declare_variable(const GpuKernelCompo // Create a user tensor variable std::stringstream ss; ss << alias << "_t" << abs(tensor->id()); - const auto uniq_name = ss.str(); - AclComponentArgument var{ writer->create_tensor_argument(uniq_name.c_str(), to_ckw(*tensor)) }; - auto &&inserted = _vars.emplace(tensor->id(), var); + const auto uniq_name = ss.str(); + GpuCkwComponentArgument var{ writer->create_tensor_argument(uniq_name.c_str(), to_ckw(*tensor)) }; + auto &&inserted = _vars.emplace(tensor->id(), var); return &(inserted.first->second); } } diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h index 170fda451b..1c9cb083ea 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h @@ -24,13 +24,11 @@ #ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWVARIABLETABLE #define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWVARIABLETABLE -#include "acl/AclComponentArgument.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h" #include "arm_compute/core/ITensorInfo.h" #include -class AclScopedKernelWriter; - namespace arm_compute { namespace experimental @@ -38,6 +36,7 @@ namespace experimental namespace dynamic_fusion { class GpuKernelComponentGroup; +class GpuCkwScopedKernelWriter; /** A table of all the variables used in the 
kernel. * @@ -55,12 +54,12 @@ public: * @param[in] tensor Tensor info with which the new variable is associated * @param[in] alias Alias for the variable. Will be used as part of the variable name * - * @return AclComponentArgument* + * @return GpuCkwComponentArgument* */ - AclComponentArgument *declare_variable(const GpuKernelComponentGroup &comp_group, AclScopedKernelWriter &writer, const ITensorInfo *tensor, const std::string &alias = "unnamed"); + GpuCkwComponentArgument *declare_variable(const GpuKernelComponentGroup &comp_group, GpuCkwScopedKernelWriter &writer, const ITensorInfo *tensor, const std::string &alias = "unnamed"); private: - std::map _vars{}; + std::map _vars{}; }; } // namespace dynamic_fusion diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h index 15402bc330..62255f1cf6 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h @@ -28,8 +28,6 @@ #include "src/dynamic_fusion/sketch/ArgumentPack.h" #include "src/dynamic_fusion/sketch/gpu/components/Types.h" -class AclScopedKernelWriter; - namespace arm_compute { class ITensorInfo; @@ -40,6 +38,7 @@ namespace dynamic_fusion /** Forward declaration */ class GpuKernelComponentGroup; class GpuCkwVariableTable; +class GpuCkwScopedKernelWriter; /** An interface used by @ref GpuCkwDriver to write source code for a kernel component * @@ -90,7 +89,7 @@ public: * * @note @p writer can only be passed via value since the new scope is created in the copy constructor */ - virtual void write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, AclScopedKernelWriter writer) const = 0; + virtual void write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const = 0; /** Get tensor arguments */ ArgumentPack tensors() const { diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp index 9895bbeb77..cba1cfbe40 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp @@ -23,8 +23,8 @@ */ #include "GpuCkwElementwiseBinary.h" -#include "acl/AclKernelWriter.h" -#include "acl/AclScopedKernelWriter.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Validate.h" #include "ckw/TensorTileSampler.h" @@ -40,11 +40,13 @@ namespace arm_compute { namespace experimental { +namespace dynamic_fusion +{ namespace { /** Create a simple sampler from tile of dimension [m0, n0] */ -inline TensorTileSampler create_simple_sampler(AclScopedKernelWriter &writer, int32_t m0, int32_t n0) +inline TensorTileSampler create_simple_sampler(GpuCkwScopedKernelWriter &writer, int32_t m0, int32_t n0) { TensorTileSampler sampler; @@ -75,32 +77,31 @@ inline TensorTileSampler create_simple_sampler(AclScopedKernelWriter &writer, in } } // namespace -namespace dynamic_fusion -{ GpuCkwElementwiseBinary::GpuCkwElementwiseBinary(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes) : IGpuCkwComponentDriver{ id, tensors }, _lhs{}, _rhs{}, - _dst{}, - _attributes{ attributes } + _dst{} { + 
ARM_COMPUTE_UNUSED(attributes); + _lhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _rhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_1); _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); ARM_COMPUTE_ERROR_ON_NULLPTR(_lhs, _rhs, _dst); } -void GpuCkwElementwiseBinary::write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, AclScopedKernelWriter writer) const +void GpuCkwElementwiseBinary::write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const { const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window(); const unsigned int n0 = root_window.x().step(); const unsigned int m0 = root_window.y().step(); - AclComponentArgument *lhs = vtable.declare_variable(comp_group, writer, _lhs, "lhs"); - AclComponentArgument *rhs = vtable.declare_variable(comp_group, writer, _rhs, "rhs"); - AclComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst"); + GpuCkwComponentArgument *lhs = vtable.declare_variable(comp_group, writer, _lhs, "lhs"); + GpuCkwComponentArgument *rhs = vtable.declare_variable(comp_group, writer, _rhs, "rhs"); + GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst"); // Load the LHS and RHS tiles and prepare the tensor sampler. load_lhs_rhs_tiles_and_prepare_sampler(writer, lhs, rhs, m0, n0, create_simple_sampler); diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h index 1a79754d1d..963b92baf9 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h @@ -53,14 +53,13 @@ public: /** Destructor */ ~GpuCkwElementwiseBinary() override = default; // Inherited methods overriden: - virtual void write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, AclScopedKernelWriter writer) const override; + virtual void write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const override; Window get_window() const override; private: const ITensorInfo *_lhs; const ITensorInfo *_rhs; const ITensorInfo *_dst; - Attributes _attributes; }; } // namespace dynamic_fusion } // namespace experimental diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp index 1a1dfc135a..63555e6064 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp @@ -24,8 +24,8 @@ #include "GpuCkwStore.h" #include "arm_compute/core/Error.h" -#include "compute_kernel_writer/include/acl/AclKernelWriter.h" -#include "compute_kernel_writer/include/acl/AclScopedKernelWriter.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" #include @@ -41,7 +41,7 @@ GpuCkwStore::GpuCkwStore(ComponentId id, const ArgumentPack &tensor _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); } -void GpuCkwStore::write_component_code(const ComponentGroup &comp_group, 
GpuCkwVariableTable &vtable, AclScopedKernelWriter writer) const +void GpuCkwStore::write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const { auto src = vtable.declare_variable(comp_group, writer, _src, "src"); auto dst = vtable.declare_variable(comp_group, writer, _dst, "dst"); diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h index 45cc43fe62..5728ff9f49 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h @@ -48,7 +48,7 @@ public: /** Destructor */ ~GpuCkwStore() override = default; // Inherited methods overriden: - virtual void write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, AclScopedKernelWriter writer) const override; + virtual void write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const override; private: const ITensorInfo *_src; diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h index d94ebd5ce9..ca13329335 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h @@ -24,8 +24,8 @@ #ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_WRITERHELPER #define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_WRITERHELPER -#include "acl/AclComponentArgument.h" -#include "acl/AclScopedKernelWriter.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" #include "ckw/TensorTileSampler.h" #include @@ -37,11 +37,11 @@ namespace experimental { namespace dynamic_fusion { -using SamplerCreator = std::function; +using SamplerCreator = std::function; /** Load lhs and rhs tiles of dimension [m0, n0] only when not loaded and prepare the sampler */ -inline void load_lhs_rhs_tiles_and_prepare_sampler(AclScopedKernelWriter &writer, AclComponentArgument *lhs, AclComponentArgument *rhs, int32_t m0, int32_t n0, SamplerCreator create_sampler) +inline void load_lhs_rhs_tiles_and_prepare_sampler(GpuCkwScopedKernelWriter &writer, GpuCkwComponentArgument *lhs, GpuCkwComponentArgument *rhs, int32_t m0, int32_t n0, SamplerCreator create_sampler) { if(!lhs->has_tile() && !rhs->has_tile()) { -- cgit v1.2.1
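Illustrative sketch only, not part of the patch: it shows how the renamed ckw_driver classes introduced above are intended to be used together by a component driver, mirroring the GpuCkwStore and GpuCkwElementwiseBinary hunks. The free function, its parameters and the default-constructed sampler are placeholders of my own; only the GpuCkw* names and the member calls visible in this patch come from the source tree, and the snippet builds only inside the Compute Library source tree.

/*
 * Minimal usage sketch for the relocated ckw_driver classes (assumptions noted
 * in the comments; not a definitive implementation).
 */
#include "arm_compute/core/Error.h"
#include "arm_compute/core/ITensorInfo.h"
#include "ckw/TensorTileSampler.h"
#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"

namespace arm_compute
{
namespace experimental
{
namespace dynamic_fusion
{
// The scoped writer is taken by value: copying it opens a fresh id space, so the
// names declared by this component cannot clash with those of other components.
void write_example_component_code(const GpuKernelComponentGroup &comp_group,
                                  GpuCkwVariableTable           &vtable,
                                  GpuCkwScopedKernelWriter       writer,
                                  const ITensorInfo             *src_info,
                                  const ITensorInfo             *dst_info)
{
    // Every tensor is registered once in the variable table and comes back as a
    // GpuCkwComponentArgument (either a user tensor or a virtual tensor).
    GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, src_info, "src");
    GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, dst_info, "dst");

    // Load the source tile only if a previous component has not already done so;
    // afterwards the argument carries both the tile and the sampler used to load it.
    ckw::TensorTileSampler sampler; // would be configured like create_simple_sampler() above
    writer->op_load_once(src, sampler);

    // The component would now emit its own op_* instructions on src->tile(),
    // typically producing a tile that is associated with dst.
    ckw::TileOperand &src_tile = src->tile();
    ARM_COMPUTE_UNUSED(src_tile, dst);
}
} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute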