author     Gunes Bayir <gunes.bayir@arm.com>    2024-02-07 15:34:45 +0000
committer  Gunes Bayir <gunes.bayir@arm.com>    2024-02-09 15:59:45 +0000
commit     0ee13afc4429411de9a05ba4c2ff8a580784b568 (patch)
tree       c9ee1acf684d52b92ffb7500b0b65eee8377ce45
parent     a3e1b50588b89a2c0c67da2679728a422fc16402 (diff)
download   ComputeLibrary-0ee13afc4429411de9a05ba4c2ff8a580784b568.tar.gz
Remove CKW prototype and Template Writer
GPU code in dynamic fusion is now written by the stable CKW. We no longer need the CKW prototype or the older writer implementation, i.e. TemplateWriter. This change also removes the need for the -DACL_INTERNAL_TEST_CKW_IN_DF flag to compile and test dynamic fusion operators.

Resolves: COMPMID-6715
Signed-off-by: Gunes Bayir <gunes.bayir@arm.com>
Change-Id: I9f9453311e79d9be612bd4754240d832f98503e8
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/11116
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Jakub Sujak <jakub.sujak@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
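The net effect on the build is that enabling dynamic fusion is now sufficient on its own. A minimal sketch of the resulting SConscript logic (the real hunk appears in the diff below; this is not the full file):

    # Dynamic fusion: the stable CKW driver is now always built alongside the
    # common file list, with no ACL_INTERNAL_TEST_CKW_IN_DF define involved.
    if env['experimental_dynamic_fusion']:
        lib_files += filelist['experimental']['dynamic_fusion']['common']
        lib_files += filelist['experimental']['dynamic_fusion']['ckw_driver']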
-rw-r--r--  Android.bp | 17
-rw-r--r--  SConscript | 6
-rw-r--r--  SConstruct | 3
-rw-r--r--  compute_kernel_writer/prototype/CMakeLists.txt | 78
-rw-r--r--  compute_kernel_writer/prototype/examples/add_exp_store.cpp | 206
-rw-r--r--  compute_kernel_writer/prototype/examples/common/ExampleComponentArgument.cpp | 98
-rw-r--r--  compute_kernel_writer/prototype/examples/common/ExampleComponentArgument.h | 112
-rw-r--r--  compute_kernel_writer/prototype/examples/common/ExampleKernelWriter.cpp | 52
-rw-r--r--  compute_kernel_writer/prototype/examples/common/ExampleKernelWriter.h | 56
-rw-r--r--  compute_kernel_writer/prototype/examples/common/ExampleScopedKernelWriter.cpp | 59
-rw-r--r--  compute_kernel_writer/prototype/examples/common/ExampleScopedKernelWriter.h | 62
-rw-r--r--  compute_kernel_writer/prototype/examples/writer_helper.cpp | 113
-rw-r--r--  compute_kernel_writer/prototype/include/ckw/Error.h | 78
-rw-r--r--  compute_kernel_writer/prototype/include/ckw/Kernel.h | 102
-rw-r--r--  compute_kernel_writer/prototype/include/ckw/KernelArgument.h | 107
-rw-r--r--  compute_kernel_writer/prototype/include/ckw/KernelWriter.h | 338
-rw-r--r--  compute_kernel_writer/prototype/include/ckw/KernelWriterHelper.h | 1286
-rw-r--r--  compute_kernel_writer/prototype/include/ckw/OperandBase.h | 78
-rw-r--r--  compute_kernel_writer/prototype/include/ckw/ScalarValue.h | 137
-rw-r--r--  compute_kernel_writer/prototype/include/ckw/TensorInfo.h | 153
-rw-r--r--  compute_kernel_writer/prototype/include/ckw/TensorOperand.h | 196
-rw-r--r--  compute_kernel_writer/prototype/include/ckw/TensorTileSampler.h | 169
-rw-r--r--  compute_kernel_writer/prototype/include/ckw/TileInfo.h | 92
-rw-r--r--  compute_kernel_writer/prototype/include/ckw/TileOperand.h | 127
-rw-r--r--  compute_kernel_writer/prototype/include/ckw/types/ConvertPolicy.h | 41
-rw-r--r--  compute_kernel_writer/prototype/include/ckw/types/DataType.h | 50
-rw-r--r--  compute_kernel_writer/prototype/include/ckw/types/Functions.h | 62
-rw-r--r--  compute_kernel_writer/prototype/include/ckw/types/GpuTargetLanguage.h | 41
-rw-r--r--  compute_kernel_writer/prototype/include/ckw/types/Operators.h | 78
-rw-r--r--  compute_kernel_writer/prototype/include/ckw/types/TensorSamplerTypes.h | 82
-rw-r--r--  compute_kernel_writer/prototype/src/Kernel.cpp | 163
-rw-r--r--  compute_kernel_writer/prototype/src/KernelArgument.cpp | 66
-rw-r--r--  compute_kernel_writer/prototype/src/KernelWriter.cpp | 371
-rw-r--r--  compute_kernel_writer/prototype/src/OperandBase.cpp | 49
-rw-r--r--  compute_kernel_writer/prototype/src/Prototype.h | 4189
-rw-r--r--  compute_kernel_writer/prototype/src/TensorInfo.cpp | 77
-rw-r--r--  compute_kernel_writer/prototype/src/TensorOperand.cpp | 272
-rw-r--r--  compute_kernel_writer/prototype/src/TensorTileSampler.cpp | 191
-rw-r--r--  compute_kernel_writer/prototype/src/TileInfo.cpp | 73
-rw-r--r--  compute_kernel_writer/prototype/src/TileOperand.cpp | 135
-rw-r--r--  filelist.json | 18
-rwxr-xr-x  scripts/clang_tidy_rules.py | 6
-rwxr-xr-x  scripts/generate_android_bp.py | 1
-rw-r--r--  src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp | 139
-rw-r--r--  src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h | 24
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuKernelArgument.cpp | 37
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h | 99
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h | 16
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp | 22
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.h | 10
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h | 71
-rw-r--r--  src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h | 17
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h | 3
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h | 10
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h | 12
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.cpp | 14
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h | 22
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.cpp | 16
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h | 22
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp | 14
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h | 19
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp | 15
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h | 22
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp | 14
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h | 23
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.cpp | 93
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.h | 130
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.cpp | 95
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.h | 127
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.cpp | 5
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp | 12
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h | 16
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.cpp | 10
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h | 13
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.cpp | 19
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h | 20
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp | 17
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h | 22
-rw-r--r--  src/dynamic_fusion/sketch/gpu/operators/GpuClamp.cpp | 3
-rw-r--r--  src/dynamic_fusion/sketch/gpu/operators/GpuMatMul.cpp | 4
-rw-r--r--  src/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.cpp | 11
-rw-r--r--  src/dynamic_fusion/sketch/gpu/operators/GpuTanh.cpp | 3
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp | 114
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h | 135
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h | 140
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp | 181
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.h | 120
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp | 212
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.h | 103
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp | 364
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.h | 112
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp | 393
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h | 116
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp | 274
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.h | 115
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.cpp | 267
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.h | 107
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.cpp | 171
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.h | 106
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp | 470
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.h | 132
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.cpp | 161
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.h | 107
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp | 279
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.h | 120
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp | 89
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h | 86
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp | 325
-rw-r--r--  src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.h | 92
-rw-r--r--  tests/validation/dynamic_fusion/gpu/Integration.cpp | 10
-rw-r--r--  tests/validation/dynamic_fusion/gpu/cl/DepthwiseConv2d.cpp | 10
-rw-r--r--  tests/validation/dynamic_fusion/gpu/cl/MatMul.cpp | 3
-rw-r--r--  tests/validation/dynamic_fusion/gpu/cl/Pool2d.cpp | 3
-rw-r--r--  tests/validation/dynamic_fusion/gpu/cl/Reshape.cpp | 14
-rw-r--r--  tests/validation/dynamic_fusion/gpu/cl/Softmax.cpp | 184
115 files changed, 230 insertions, 15816 deletions
diff --git a/Android.bp b/Android.bp
index 2983e2e21d..670138b209 100644
--- a/Android.bp
+++ b/Android.bp
@@ -172,6 +172,7 @@ cc_library_static {
proprietary: true,
local_include_dirs: ["build/android-arm64v8a/src/core",
"build/android-arm64v8a/src/core/CL",
+ "compute_kernel_writer/include",
"src/core/common",
"src/core/helpers",
"src/core/NEON/kernels/arm_gemm",
@@ -621,7 +622,6 @@ cc_library_static {
"src/dynamic_fusion/sketch/attributes/ReshapeAttributes.cpp",
"src/dynamic_fusion/sketch/attributes/ResizeAttributes.cpp",
"src/dynamic_fusion/sketch/attributes/SoftmaxAttributes.cpp",
- "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.cpp",
"src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp",
"src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp",
"src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp",
@@ -634,8 +634,6 @@ cc_library_static {
"src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp",
"src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp",
"src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp",
- "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.cpp",
- "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.cpp",
"src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.cpp",
"src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp",
"src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.cpp",
@@ -657,19 +655,6 @@ cc_library_static {
"src/dynamic_fusion/sketch/gpu/operators/GpuSub.cpp",
"src/dynamic_fusion/sketch/gpu/operators/GpuTanh.cpp",
"src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.cpp",
- "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp",
- "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp",
- "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp",
- "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp",
- "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp",
- "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp",
- "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.cpp",
- "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.cpp",
- "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp",
- "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.cpp",
- "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp",
- "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp",
- "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp",
"src/gpu/cl/ClContext.cpp",
"src/gpu/cl/ClKernelLibrary.cpp",
"src/gpu/cl/ClQueue.cpp",
diff --git a/SConscript b/SConscript
index f0c42979ce..a9986feb32 100644
--- a/SConscript
+++ b/SConscript
@@ -564,12 +564,6 @@ if env['fixed_format_kernels']:
# Dynamic fusion
if env['experimental_dynamic_fusion']:
lib_files += filelist['experimental']['dynamic_fusion']['common']
- lib_files += filelist['experimental']['dynamic_fusion']['template_writer']
-
-if "ACL_INTERNAL_TEST_CKW_IN_DF" in env["extra_cxx_flags"]:
- if not env["experimental_dynamic_fusion"]:
- print("To use ACL_INTERNAL_TEST_CKW_IN_DF experimental_dynamic_fusion must be set to 1")
- Exit(1)
lib_files += filelist['experimental']['dynamic_fusion']['ckw_driver']
# Logging files
diff --git a/SConstruct b/SConstruct
index 6f498b51c8..bad85e503d 100644
--- a/SConstruct
+++ b/SConstruct
@@ -227,9 +227,6 @@ if env['experimental_dynamic_fusion']:
# Dynamic Fusion on GPU has a direct dependency on OpenCL and Compute Kernel Writer
env['opencl'] = 1
- # Build CKW by default
- env["extra_cxx_flags"] += ' -DACL_INTERNAL_TEST_CKW_IN_DF'
-
if env['opencl'] and env['embed_kernels'] and env['compress_kernels'] and env['os'] not in ['android']:
print("Compressed kernels are supported only for android builds")
Exit(1)
diff --git a/compute_kernel_writer/prototype/CMakeLists.txt b/compute_kernel_writer/prototype/CMakeLists.txt
deleted file mode 100644
index 439dcd3b7e..0000000000
--- a/compute_kernel_writer/prototype/CMakeLists.txt
+++ /dev/null
@@ -1,78 +0,0 @@
-# Copyright (c) 2023 Arm Limited.
-#
-# SPDX-License-Identifier: MIT
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to
-# deal in the Software without restriction, including without limitation the
-# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
-# sell copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-
-cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
-
-#---------------------------------------------------------------------
-# Prototype
-
-add_library(ckw_prototype
- src/TileInfo.cpp
- src/TensorInfo.cpp
- src/Kernel.cpp
- src/KernelWriter.cpp
- src/OperandBase.cpp
- src/TileOperand.cpp
- src/TensorOperand.cpp
- src/TensorTileSampler.cpp
- src/KernelArgument.cpp
-)
-
-target_compile_options(ckw_prototype
- PUBLIC
- ${CKW_CXX_FLAGS}
- "$<$<CXX_COMPILER_ID:GNU>:${GNU_WARNINGS}>"
- "$<$<CONFIG:Debug>:${CKW_ASSERTS_OPTS}>"
- "$<$<BOOL:${CKW_ENABLE_ASSERTS}>:${CKW_ASSERTS_OPTS}>"
- ${CMAKE_CXX_FLAGS}
- PRIVATE
- $<$<CONFIG:Release>:-Os>
-)
-
-target_compile_definitions(ckw_prototype PUBLIC
- $<$<CONFIG:Debug>:COMPUTE_KERNEL_WRITER_DEBUG_ENABLED>
- $<$<CONFIG:Debug>:COMPUTE_KERNEL_WRITER_ASSERTS_ENABLED>
- $<$<BOOL:${CKW_ENABLE_ASSERTS}>:COMPUTE_KERNEL_WRITER_ASSERTS_ENABLED>
- $<$<BOOL:${CKW_ENABLE_OPENCL}>:COMPUTE_KERNEL_WRITER_OPENCL_ENABLED>
-)
-
-target_include_directories(ckw_prototype
- PUBLIC ${CMAKE_CURRENT_LIST_DIR}/include
- PRIVATE ${CMAKE_CURRENT_LIST_DIR}
-)
-
-#---------------------------------------------------------------------
-# Examples
-
-add_library(ckw_prototype_examples_common
- examples/common/ExampleKernelWriter.cpp
- examples/common/ExampleScopedKernelWriter.cpp
- examples/common/ExampleComponentArgument.cpp
-)
-
-target_link_libraries(ckw_prototype_examples_common PUBLIC ckw_prototype)
-
-add_executable(ckw_prototype_examples_add_exp_store examples/add_exp_store.cpp)
-target_link_libraries(ckw_prototype_examples_add_exp_store PUBLIC ckw_prototype_examples_common)
-
-add_executable(writer_helper examples/writer_helper.cpp)
-target_link_libraries(writer_helper PUBLIC ckw_prototype)
diff --git a/compute_kernel_writer/prototype/examples/add_exp_store.cpp b/compute_kernel_writer/prototype/examples/add_exp_store.cpp
deleted file mode 100644
index 2b640ca01b..0000000000
--- a/compute_kernel_writer/prototype/examples/add_exp_store.cpp
+++ /dev/null
@@ -1,206 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "ckw/Error.h"
-#include "ckw/KernelArgument.h"
-#include "ckw/KernelWriter.h"
-#include "ckw/TensorOperand.h"
-#include "ckw/TensorTileSampler.h"
-#include "ckw/TileOperand.h"
-
-#include "common/ExampleComponentArgument.h"
-#include "common/ExampleKernelWriter.h"
-#include "common/ExampleScopedKernelWriter.h"
-#include <iostream>
-#include <vector>
-
-using namespace ckw;
-
-TensorTileSampler create_simple_sampler(ExampleScopedKernelWriter writer)
-{
- TensorTileSampler sampler;
-
- constexpr int32_t m0 = 4;
- constexpr int32_t n0 = 4;
-
- auto &gid_0 = writer->declare_tile("gid_0", DataType::Int32);
- auto &gid_1 = writer->declare_tile("gid_1", DataType::Int32);
- auto &gid_2 = writer->declare_tile("gid_2", DataType::Int32);
-
- auto &const_0 = writer->declare_tile("0", 0);
-
- writer->op_get_global_id(gid_0, 0);
- writer->op_get_global_id(gid_1, 1);
- writer->op_get_global_id(gid_2, 2);
-
- sampler.x(gid_0);
- sampler.y(gid_1);
- sampler.z(const_0);
- sampler.b(gid_2);
-
- sampler.width(n0);
- sampler.height(m0);
-
- sampler.format(TensorSamplerFormat::C_WH_1);
- sampler.address_mode_x(TensorSamplerAddressModeX::None);
- sampler.address_mode_y(TensorSamplerAddressModeY::ClampToBorder);
- sampler.address_mode_z(TensorSamplerAddressModeZ::Skip);
-
- return sampler;
-}
-
-void op_binary_elementwise(ExampleScopedKernelWriter writer, std::vector<ExampleComponentArgument *> operands)
-{
- auto lhs = operands.at(0);
- auto rhs = operands.at(1);
- auto dst = operands.at(2);
-
- // Load the LHS and RHS tile and prepare the tensor sampler.
- if (!lhs->has_tile() && !rhs->has_tile())
- {
- const auto sampler = create_simple_sampler(writer);
-
- writer->op_load_once(lhs, sampler);
- writer->op_load_once(rhs, sampler);
- }
- else if (lhs->has_tile())
- {
- const auto &sampler = lhs->tile_sampler();
- writer->op_load_once(rhs, sampler);
- }
- else
- {
- const auto &sampler = rhs->tile_sampler();
- writer->op_load_once(lhs, sampler);
- }
-
- auto &lhs_tile = lhs->tile();
- auto &rhs_tile = rhs->tile();
- const auto &sampler = lhs->tile_sampler();
-
- // Prepare the output tile.
- if (!dst->has_tile())
- {
- auto &tile = writer->declare_tile("dst_tile", lhs_tile.tile_info());
- dst->init_virtual_tensor(tile, sampler);
- }
-
- auto &dst_tile = dst->tile();
-
- // Perform the operation.
- writer->op_binary_expression(dst_tile, lhs_tile, BinaryOp::Add, rhs_tile);
-}
-
-void op_exp(ExampleScopedKernelWriter writer, std::vector<ExampleComponentArgument *> operands)
-{
- auto src = operands.at(0);
- auto dst = operands.at(1);
-
- // Load the source tile and prepare the sampler.
- if (!src->has_tile())
- {
- const auto sampler = create_simple_sampler(writer);
- writer->op_load_once(src, sampler);
- }
-
- auto &src_tile = src->tile();
- const auto &sampler = src->tile_sampler();
-
- // Prepare the output tile.
- if (!dst->has_tile())
- {
- auto &tile = writer->declare_tile("dst_tile", src_tile.tile_info());
- dst->init_virtual_tensor(tile, sampler);
- }
-
- auto &dst_tile = dst->tile();
-
- // Perform the operation.
- writer->op_unary_elementwise_function(dst_tile, UnaryFunction::Exp, src_tile);
-}
-
-void op_store(ExampleScopedKernelWriter writer, std::vector<ExampleComponentArgument *> operands)
-{
- auto src = operands.at(0);
- auto dst = operands.at(1);
-
- auto &src_tile = src->tile();
- const auto &sampler = src->tile_sampler();
- auto &dst_tensor = dst->tensor();
-
- writer->op_store(dst_tensor, src_tile, sampler);
-}
-
-int main()
-{
- Kernel kernel("example", GpuTargetLanguage::OpenCL);
- ExampleKernelWriter root_writer(kernel);
-
- ExampleScopedKernelWriter writer(&root_writer);
-
- const TensorInfo src0_info(DataType::Fp32, TensorShape({3, 10, 20, 1, 1}), TensorDataLayout::Nhwc, 0);
- const TensorInfo src1_info(DataType::Fp32, TensorShape({3, 10, 20, 1, 1}), TensorDataLayout::Nhwc, 1);
- const TensorInfo dst_info(DataType::Fp32, TensorShape({3, 10, 20, 1, 1}), TensorDataLayout::Nhwc, 2);
-
- ExampleComponentArgument src0(
- writer->declare_tensor_argument("src0", src0_info, TensorStorageType::BufferUint8Ptr));
- ExampleComponentArgument src1(
- writer->declare_tensor_argument("src1", src1_info, TensorStorageType::BufferUint8Ptr));
- ExampleComponentArgument dst(writer->declare_tensor_argument("dst", dst_info, TensorStorageType::BufferUint8Ptr));
-
- ExampleComponentArgument ans;
-
- op_binary_elementwise(writer, {&src0, &src1, &ans});
- op_exp(writer, {&ans, &ans});
- op_store(writer, {&ans, &dst});
-
- const auto arguments = kernel.arguments();
-
- std::cout << "\n====================\nArguments:\n====================\n";
-
- for (auto &arg : arguments)
- {
- switch (arg.type())
- {
- case ckw::KernelArgument::Type::TensorStorage:
- std::cout << "* Tensor storage: ID = " << arg.id() << ", type = " << std::hex << "0x"
- << static_cast<uint32_t>(arg.tensor_storage_type()) << std::dec << "\n";
- break;
-
- case ckw::KernelArgument::Type::TensorComponent:
- std::cout << "* Tensor component: ID = " << arg.id() << ", type = " << std::hex << "0x"
- << static_cast<uint32_t>(arg.tensor_component_type()) << std::dec << "\n";
- break;
-
- default:
- CKW_ASSERT(false);
- }
- }
-
- std::cout << "\n====================\nCode:\n====================\n";
- const auto code = root_writer.generate_code();
- std::cout << code;
-
- return 0;
-}
diff --git a/compute_kernel_writer/prototype/examples/common/ExampleComponentArgument.cpp b/compute_kernel_writer/prototype/examples/common/ExampleComponentArgument.cpp
deleted file mode 100644
index 55223dae0e..0000000000
--- a/compute_kernel_writer/prototype/examples/common/ExampleComponentArgument.cpp
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "ExampleComponentArgument.h"
-
-#include "ckw/Error.h"
-
-ExampleComponentArgument::ExampleComponentArgument()
-{
-}
-
-ExampleComponentArgument::ExampleComponentArgument(ckw::TensorOperand &tensor) : _tensor(&tensor)
-{
-}
-
-ExampleComponentArgument &ExampleComponentArgument::init_virtual_tensor(ckw::TileOperand &tile,
- const ckw::TensorTileSampler &tile_sampler)
-{
- CKW_ASSERT(_tile == nullptr);
-
- _tile = &tile;
- _tile_sampler = tile_sampler;
-
- return *this;
-}
-
-bool ExampleComponentArgument::has_tensor() const
-{
- return _tensor != nullptr;
-}
-
-ckw::TensorOperand &ExampleComponentArgument::tensor()
-{
- CKW_ASSERT(_tensor != nullptr);
-
- return *_tensor;
-}
-
-const ckw::TensorOperand &ExampleComponentArgument::tensor() const
-{
- CKW_ASSERT(_tensor != nullptr);
-
- return *_tensor;
-}
-
-bool ExampleComponentArgument::has_tile() const
-{
- return _tile != nullptr;
-}
-
-ckw::TileOperand &ExampleComponentArgument::tile()
-{
- CKW_ASSERT(_tile != nullptr);
-
- return *_tile;
-}
-
-const ckw::TileOperand &ExampleComponentArgument::tile() const
-{
- CKW_ASSERT(_tile != nullptr);
-
- return *_tile;
-}
-
-ckw::TensorTileSampler &ExampleComponentArgument::tile_sampler()
-{
- CKW_ASSERT(_tile != nullptr);
-
- return _tile_sampler;
-}
-
-const ckw::TensorTileSampler &ExampleComponentArgument::tile_sampler() const
-{
- CKW_ASSERT(_tile != nullptr);
-
- return _tile_sampler;
-}
diff --git a/compute_kernel_writer/prototype/examples/common/ExampleComponentArgument.h b/compute_kernel_writer/prototype/examples/common/ExampleComponentArgument.h
deleted file mode 100644
index 0e029b1157..0000000000
--- a/compute_kernel_writer/prototype/examples/common/ExampleComponentArgument.h
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef CKW_PROTOTYPE_EXAMPLES_COMMON_EXAMPLECOMPONENTARGUMENT_H
-#define CKW_PROTOTYPE_EXAMPLES_COMMON_EXAMPLECOMPONENTARGUMENT_H
-
-#include "ckw/TensorTileSampler.h"
-
-namespace ckw
-{
-class TensorOperand;
-
-class TileOperand;
-} // namespace ckw
-
-/** The argument of a dynamic fusion component which can be either user tensor or virtual tensor. */
-class ExampleComponentArgument
-{
-public:
- /** Initialize a new instance of @ref ExampleComponentArgument class for empty virtual tensor. */
- ExampleComponentArgument();
-
- /** Initialize a new instance of @ref ExampleComponentArgument class for user tensor.
- *
- * @param[in] tensor The user tensor.
- */
- explicit ExampleComponentArgument(ckw::TensorOperand &tensor);
-
- /** Set virtual tensor information (tile, sampler) for the argument.
- *
- * If the component is a user tensor, it can be treated as virtual tensor as well
- * and won't be loaded again using @ref ExampleKernelWriter::op_load_once method.
- *
- * @param[in] tile The tile that has been loaded.
- * @param[in] sampler The tensor sampling information that has been used to load the tile.
- */
- ExampleComponentArgument &init_virtual_tensor(ckw::TileOperand &tile, const ckw::TensorTileSampler &sampler);
-
- /** Get whether the argument is a user tensor. */
- bool has_tensor() const;
-
- /** Get the tensor operand.
- *
- * If the tensor is not available, throw an error.
- */
- ckw::TensorOperand &tensor();
-
- /** Get the tensor operand.
- *
- * If the tensor is not available, throw an error.
- */
- const ckw::TensorOperand &tensor() const;
-
- /** Get whether the argument contains a tile.
- *
- * The argument can be either a user tensor that has been loaded,
- * or a virtual tensor (i.e. a tile with tensor sampling information).
- */
- bool has_tile() const;
-
- /** Get the tile operand.
- *
- * If the tile is not available, throw an error.
- */
- ckw::TileOperand &tile();
-
- /** Get the tile operand.
- *
- * If the tile is not available, throw an error.
- */
- const ckw::TileOperand &tile() const;
-
- /** Get the tensor sampling information for the tile.
- *
- * If the tile is not available, throw an error.
- */
- ckw::TensorTileSampler &tile_sampler();
-
- /** Get the tensor sampling information for the tile.
- *
- * If the tile is not available, throw an error.
- */
- const ckw::TensorTileSampler &tile_sampler() const;
-
-private:
- ckw::TensorOperand *_tensor{nullptr};
- ckw::TileOperand *_tile{nullptr};
- ckw::TensorTileSampler _tile_sampler{};
-};
-
-#endif // CKW_PROTOTYPE_EXAMPLES_COMMON_EXAMPLECOMPONENTARGUMENT_H
diff --git a/compute_kernel_writer/prototype/examples/common/ExampleKernelWriter.cpp b/compute_kernel_writer/prototype/examples/common/ExampleKernelWriter.cpp
deleted file mode 100644
index 1734ce8823..0000000000
--- a/compute_kernel_writer/prototype/examples/common/ExampleKernelWriter.cpp
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "ExampleKernelWriter.h"
-
-#include "ckw/Error.h"
-#include "ckw/TileInfo.h"
-
-#include "ExampleComponentArgument.h"
-
-ExampleKernelWriter::ExampleKernelWriter(ckw::Kernel &kernel) : KernelWriter(kernel)
-{
-}
-
-void ExampleKernelWriter::op_load_once(ExampleComponentArgument *tensor_or_tile, const ckw::TensorTileSampler &sampler)
-{
- if (!tensor_or_tile->has_tile())
- {
- CKW_ASSERT(tensor_or_tile->has_tensor());
-
- auto &tensor = tensor_or_tile->tensor();
-
- const auto tile_name = tensor.name() + "_tile";
- auto &tile =
- declare_tile(tile_name.c_str(), ckw::TileInfo(tensor.data_type(), sampler.height(), sampler.width()));
-
- op_load(tile, tensor, sampler);
-
- tensor_or_tile->init_virtual_tensor(tile, sampler);
- }
-}
diff --git a/compute_kernel_writer/prototype/examples/common/ExampleKernelWriter.h b/compute_kernel_writer/prototype/examples/common/ExampleKernelWriter.h
deleted file mode 100644
index 1528c3d933..0000000000
--- a/compute_kernel_writer/prototype/examples/common/ExampleKernelWriter.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef CKW_PROTOTYPE_EXAMPLES_COMMON_EXAMPLEKERNELWRITER_H
-#define CKW_PROTOTYPE_EXAMPLES_COMMON_EXAMPLEKERNELWRITER_H
-
-#include "ckw/KernelWriter.h"
-#include "ckw/TensorTileSampler.h"
-
-class ExampleComponentArgument;
-
-namespace ckw
-{
-class Kernel;
-} // namespace ckw
-
-/** Extended implementation of kernel writer for dynamic fusion. */
-class ExampleKernelWriter : public ckw::KernelWriter
-{
-public:
- /** Initialize a new instance of @ref ExampleKernelWriter class.
- *
- * @param[in] kernel The kernel to be generated.
- */
- explicit ExampleKernelWriter(ckw::Kernel &kernel);
-
- /** Load the user tensor to the tile in the same component argument if it hasn't been loaded.
- *
- * @param[in] tensor_or_tile The component argument that is either a user tensor or a virtual tensor.
- * @param[in] sampler The tensor sampling information to load the tile.
- */
- void op_load_once(ExampleComponentArgument *tensor_or_tile, const ckw::TensorTileSampler &sampler);
-};
-
-#endif // CKW_PROTOTYPE_EXAMPLES_COMMON_EXAMPLEKERNELWRITER_H
diff --git a/compute_kernel_writer/prototype/examples/common/ExampleScopedKernelWriter.cpp b/compute_kernel_writer/prototype/examples/common/ExampleScopedKernelWriter.cpp
deleted file mode 100644
index 784d5ffb96..0000000000
--- a/compute_kernel_writer/prototype/examples/common/ExampleScopedKernelWriter.cpp
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "ExampleScopedKernelWriter.h"
-
-#include "ExampleKernelWriter.h"
-
-ExampleScopedKernelWriter::ExampleScopedKernelWriter(ExampleKernelWriter *writer)
- : _writer(writer), _parent_id_space(writer->id_space())
-{
- _writer->next_id_space();
-}
-
-ExampleScopedKernelWriter::ExampleScopedKernelWriter(const ExampleScopedKernelWriter &other)
- : _writer(other._writer), _parent_id_space(other._writer->id_space())
-{
- _writer->next_id_space();
-}
-
-ExampleKernelWriter *ExampleScopedKernelWriter::operator->()
-{
- return _writer;
-}
-
-const ExampleKernelWriter *ExampleScopedKernelWriter::operator->() const
-{
- return _writer;
-}
-
-ExampleKernelWriter *ExampleScopedKernelWriter::writer()
-{
- return _writer;
-}
-
-const ExampleKernelWriter *ExampleScopedKernelWriter::writer() const
-{
- return _writer;
-}
diff --git a/compute_kernel_writer/prototype/examples/common/ExampleScopedKernelWriter.h b/compute_kernel_writer/prototype/examples/common/ExampleScopedKernelWriter.h
deleted file mode 100644
index 4655b1897e..0000000000
--- a/compute_kernel_writer/prototype/examples/common/ExampleScopedKernelWriter.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef CKW_PROTOTYPE_EXAMPLES_COMMON_EXAMPLESCOPEDKERNELWRITER_H
-#define CKW_PROTOTYPE_EXAMPLES_COMMON_EXAMPLESCOPEDKERNELWRITER_H
-
-#include <cstdint>
-
-class ExampleKernelWriter;
-
-/** Helper to automatically manage kernel writer ID space. */
-class ExampleScopedKernelWriter
-{
-public:
- /** Initialize a new instance of @ref ExampleScopedKernelWriter class. */
- explicit ExampleScopedKernelWriter(ExampleKernelWriter *writer);
-
- /** Create a new scope from the specified scoped kernel writer. */
- ExampleScopedKernelWriter(const ExampleScopedKernelWriter &other);
-
- /** Assignment is disallowed. */
- ExampleScopedKernelWriter &operator=(const ExampleScopedKernelWriter &) = delete;
-
- /** Access the underlying kernel writer. */
- ExampleKernelWriter *operator->();
-
- /** Access the underlying kernel writer. */
- const ExampleKernelWriter *operator->() const;
-
- /** Get the kernel writer. */
- ExampleKernelWriter *writer();
-
- /** Get the kernel writer. */
- const ExampleKernelWriter *writer() const;
-
-private:
- ExampleKernelWriter *_writer;
- int32_t _parent_id_space;
-};
-
-#endif // CKW_PROTOTYPE_EXAMPLES_COMMON_EXAMPLESCOPEDKERNELWRITER_H
diff --git a/compute_kernel_writer/prototype/examples/writer_helper.cpp b/compute_kernel_writer/prototype/examples/writer_helper.cpp
deleted file mode 100644
index 8623afbf50..0000000000
--- a/compute_kernel_writer/prototype/examples/writer_helper.cpp
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
-* Copyright (c) 2023 Arm Limited.
-*
-* SPDX-License-Identifier: MIT
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to
-* deal in the Software without restriction, including without limitation the
-* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
-* sell copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in all
-* copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-
-#include "ckw/KernelWriter.h"
-#include "ckw/TensorTileSampler.h"
-
-#include "../include/ckw/KernelWriterHelper.h"
-#include <iostream>
-
-using namespace ckw;
-
-TensorTileSampler create_simple_sampler(KernelWriter &writer)
-{
- TensorTileSampler sampler;
-
- constexpr int32_t m0 = 1;
- constexpr int32_t n0 = 1;
-
- auto &gid_0 = writer.declare_tile("gid_0", DataType::Int32);
- auto &gid_1 = writer.declare_tile("gid_1", DataType::Int32);
- auto &gid_2 = writer.declare_tile("gid_2", DataType::Int32);
-
- auto &const_0 = writer.declare_tile("0", 0);
-
- writer.op_get_global_id(gid_0, 0);
- writer.op_get_global_id(gid_1, 1);
- writer.op_get_global_id(gid_2, 2);
-
- sampler.x(gid_0);
- sampler.y(gid_1);
- sampler.z(gid_2);
- sampler.b(const_0);
-
- sampler.width(n0);
- sampler.height(m0);
-
- sampler.format(TensorSamplerFormat::C_WH_1);
- sampler.address_mode_x(TensorSamplerAddressModeX::None);
- sampler.address_mode_y(TensorSamplerAddressModeY::ClampToBorder);
- sampler.address_mode_z(TensorSamplerAddressModeZ::Skip);
-
- return sampler;
-}
-
-int main()
-{
- Kernel kernel("test", GpuTargetLanguage::OpenCL);
- KernelWriterHelper<KernelWriter> writer(kernel);
-
- const TensorInfo src_info(DataType::Fp32, TensorShape({1, 1, 1, 1, 1}), TensorDataLayout::Nhwc, 0);
- const TensorInfo dst_info(DataType::Fp32, TensorShape({1, 1, 1, 1, 1}), TensorDataLayout::Nhwc, 1);
-
- auto &src_tensor = writer.declare_tensor_argument("src", src_info);
- auto &dst_tensor = writer.declare_tensor_argument("dst", dst_info);
-
- const auto sampler = create_simple_sampler(writer);
-
- auto &src = writer.declare_tile("src_tile", TileInfo(src_tensor.data_type(), sampler.height(), sampler.width()));
- auto &other =
- writer.declare_tile("other_tile", TileInfo(src_tensor.data_type(), sampler.height(), sampler.width()));
- auto &dst = writer.declare_tile("dst_tile", TileInfo(src_tensor.data_type(), sampler.height(), sampler.width()));
-
- writer.op_load(src, src_tensor, sampler);
- writer.op_load(other, src_tensor, sampler);
- writer.op_load(dst, dst_tensor, sampler);
-
- auto test = dst ^ src ^ other;
- auto other_test = logical_and(dst, src, other);
- writer.op_assign(dst, logical_and(dst, src, other));
- writer.op_assign(dst, test);
- writer.op_assign(dst, other_test);
- writer.op_assign(dst, operator^(operator^(dst, src), other));
-
- writer.op_if(exp(src) == dst, [&] { writer.op_binary_expression(dst, src, BinaryOp::Add, src); })
- .op_else_if(exp(src) > dst, [&] { writer.op_binary_expression(dst, src, BinaryOp::Add, src); })
- .op_else([&] { writer.op_assign(dst, src); });
-
- writer.op_assign(dst, src + src * src);
- writer.op_assign(dst, src * max(src, dst) + src);
- writer.op_assign(dst, src * select(src, dst, src) + src);
-
- writer.op_assign(dst, src ^ dst);
- writer.op_assign(dst, ~src);
-
- writer.op_for_loop(dst < src, dst += src, [&] { writer.op_assign(dst, src + dst); });
-
- writer.op_assign(dst += src);
- writer.op_assign(dst += exp(src));
-
- std::cout << "======== KERNEL ========" << std::endl;
- std::cout << writer.generate_code() << std::endl;
-}
diff --git a/compute_kernel_writer/prototype/include/ckw/Error.h b/compute_kernel_writer/prototype/include/ckw/Error.h
deleted file mode 100644
index aab713c817..0000000000
--- a/compute_kernel_writer/prototype/include/ckw/Error.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef CKW_PROTOTYPE_INCLUDE_CKW_ERROR_H
-#define CKW_PROTOTYPE_INCLUDE_CKW_ERROR_H
-
-#include <stdexcept>
-#include <string>
-
-namespace ckw
-{
-
-/** If the condition is not met, throw an std::runtime_error with the specified message.
- *
- * @param[in] cond The condition that is expected to be true.
- * @param[in] msg The error message when the condition is not met.
- */
-#define CKW_ASSERT_MSG(cond, msg) \
- do \
- { \
- if (!(cond)) \
- { \
- throw ::std::runtime_error(msg); \
- } \
- } while (false)
-
-/** If the condition is not met, throw an std::runtime_error.
- *
- * @param[in] cond The condition that is expected to be true.
- */
-#define CKW_ASSERT(cond) CKW_ASSERT_MSG(cond, #cond)
-
-/** If the precondition is met but the consequence is not met, throw an std::runtime_error.
- *
- * @param[in] precond The condition if is met requires the consequence must also be met.
- * @param[in] cond The condition that is expected to be true if the precondition is true.
- */
-#define CKW_ASSERT_IF(precond, cond) CKW_ASSERT_MSG(!(precond) || ((precond) && (cond)), #precond " |-> " #cond)
-
-/** Mark the variables as unused.
- *
- * @param[in] ... Variables which are unused.
- */
-#define CKW_UNUSED(...) ::ckw::ignore_unused(__VA_ARGS__) // NOLINT
-
-/** Mark the variables as unused.
- *
- * @param[in] ... Variables which are unused.
- */
-template <typename... T>
-inline void ignore_unused(T &&...)
-{
-}
-
-} // namespace ckw
-
-#endif // CKW_INCLUDE_CKW_ERROR_H
diff --git a/compute_kernel_writer/prototype/include/ckw/Kernel.h b/compute_kernel_writer/prototype/include/ckw/Kernel.h
deleted file mode 100644
index ba31a29ba7..0000000000
--- a/compute_kernel_writer/prototype/include/ckw/Kernel.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef CKW_PROTOTYPE_INCLUDE_CKW_KERNEL_H
-#define CKW_PROTOTYPE_INCLUDE_CKW_KERNEL_H
-
-#include "ckw/KernelArgument.h"
-#include "ckw/OperandBase.h"
-#include "ckw/types/GpuTargetLanguage.h"
-
-#include <map>
-#include <memory>
-#include <string>
-#include <vector>
-
-namespace ckw
-{
-
-class TileOperand;
-
-namespace prototype
-{
-class GpuKernelWriterDataHolder;
-} // namespace prototype
-
-/** The target for kernel writer to write into. */
-class Kernel
-{
-public:
- /** Constructor
- *
- * @param[in] language The programming language to write the kernel.
- */
- Kernel(GpuTargetLanguage language);
- /** Constructor
- *
- * @param[in] name The name of the kernel function.
- * @param[in] language The programming language to write the kernel.
- */
- Kernel(const char *name, GpuTargetLanguage language);
-
- /** Destructor */
- ~Kernel();
-
- /** Get the name of the kernel function. */
- const std::string &name() const;
-
- /** Set the name of the kernel function.
- *
- * @param[in] name The name of the kernel function.
- */
- void name(const std::string &name);
-
- /** Get the list of kernel arguments. */
- ::std::vector<KernelArgument> arguments() const;
-
- /** (Internal use only) Register the tile operand.
- *
- * @param operand The tile operand to be registered.
- */
- TileOperand &register_operand(::std::unique_ptr<TileOperand> operand);
-
- /** (Internal use only) Register the tensor operand.
- *
- * @param operand The tensor operand to be registered.
- */
- TensorOperand &register_operand(::std::unique_ptr<TensorOperand> operand);
-
- /** (Internal use only) Get the implementation data. */
- prototype::GpuKernelWriterDataHolder *impl();
-
-private:
- ::std::string _name;
- ::std::unique_ptr<prototype::GpuKernelWriterDataHolder> _kernel;
- ::std::map<::std::string, ::std::unique_ptr<OperandBase>> _operands;
- ::std::map<int32_t, TensorOperand *> _tensor_id_operands;
-};
-
-} // namespace ckw
-
-#endif // CKW_PROTOTYPE_INCLUDE_CKW_KERNEL_H
diff --git a/compute_kernel_writer/prototype/include/ckw/KernelArgument.h b/compute_kernel_writer/prototype/include/ckw/KernelArgument.h
deleted file mode 100644
index 3384a20aef..0000000000
--- a/compute_kernel_writer/prototype/include/ckw/KernelArgument.h
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef CKW_PROTOTYPE_INCLUDE_CKW_KERNELARGUMENT_H
-#define CKW_PROTOTYPE_INCLUDE_CKW_KERNELARGUMENT_H
-
-#include "ckw/TensorInfo.h"
-
-#include <cstdint>
-
-namespace ckw
-{
-
-class TensorOperand;
-class TensorComponentOperand;
-
-/** A kernel argument which can be either a tensor storage or a tensor component. */
-class KernelArgument
-{
-public:
- /** The type of kernel argument. */
- enum class Type : int32_t
- {
- /** The argument that provides the read and/or write access to the tensor data.
- *
- * See @ref ckw::TensorStorage to see the list of supported storage type.
- */
- TensorStorage,
-
- /** The argument that provides extra information about the tensor.
- *
- * See @ref ckw::TensorComponent to see the list of supported component.
- */
- TensorComponent,
- };
-
- /** Initialize a new instance of kernel argument class for a tensor storage argument.
- *
- * @param[in] tensor The tensor whose storage is exposed to kernel arguments.
- */
- KernelArgument(TensorOperand &tensor);
-
- /** Initialize a new instance of kernel argument class for a tensor component argument.
- *
- * @param[in] tensor_component The tensor component to be exposed to kernel arguments.
- */
- KernelArgument(TensorComponentOperand &tensor_component);
-
- /** Get the type of kernel argument. */
- Type type() const;
-
- /** Get the argument ID.
- *
- * This method can be used to get the tensor info ID of both tensor storage and tensor component arguments.
- */
- int32_t id() const;
-
- /** Get the type of tensor storage.
- *
- * This method can only be used for tensor storage argument.
- */
- TensorStorageType tensor_storage_type() const;
-
- /** Get the tensor component type.
- *
- * This method can only be used for tensor component argument.
- */
- TensorComponentType tensor_component_type() const;
-
-private:
- Type _type;
- int32_t _id;
-
- union SubId
- {
- int32_t unknown;
- TensorStorageType tensor_storage_type;
- TensorComponentType tensor_component_type;
- };
-
- SubId _sub_id{0};
-};
-
-} // namespace ckw
-
-#endif // CKW_PROTOTYPE_INCLUDE_CKW_KERNELARGUMENT_H
diff --git a/compute_kernel_writer/prototype/include/ckw/KernelWriter.h b/compute_kernel_writer/prototype/include/ckw/KernelWriter.h
deleted file mode 100644
index f9e0066f91..0000000000
--- a/compute_kernel_writer/prototype/include/ckw/KernelWriter.h
+++ /dev/null
@@ -1,338 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef CKW_PROTOTYPE_INCLUDE_CKW_KERNELWRITER_H
-#define CKW_PROTOTYPE_INCLUDE_CKW_KERNELWRITER_H
-
-#include "ckw/Kernel.h"
-#include "ckw/TensorInfo.h"
-#include "ckw/TensorOperand.h"
-#include "ckw/TileInfo.h"
-#include "ckw/TileOperand.h"
-#include "ckw/types/ConvertPolicy.h"
-#include "ckw/types/Functions.h"
-#include "ckw/types/Operators.h"
-
-#include <memory>
-
-namespace ckw
-{
-
-namespace prototype
-{
-struct GpuKernelWriterAttribute;
-
-class IGpuKernelWriter;
-} // namespace prototype
-
-/** Kernel writer. */
-class KernelWriter
-{
-public:
- // =============================================================================================
- // Constructors and destructor
- // =============================================================================================
-
- /** Initialize a new instance of kernel writer.
- *
- * @param[in] kernel The kernel to be written to.
- */
- explicit KernelWriter(Kernel &kernel);
-
- /** Destructor */
- ~KernelWriter();
-
- /** No copy constructor. */
- KernelWriter(const KernelWriter &) = delete;
-
- /** No copy assignment. */
- KernelWriter &operator=(const KernelWriter &) = delete;
-
- // =============================================================================================
- // Scope management
- // =============================================================================================
-
- /** Get the current ID space. */
- int32_t id_space() const;
-
- /** Set the current ID space. */
- KernelWriter &id_space(int32_t id_space);
-
- /** Switch to and return a new ID space. */
- int32_t next_id_space();
-
- // =============================================================================================
- // Tensor and tile declaration
- // =============================================================================================
-
- /** Declare a tensor argument.
- *
- * @param[in] name The name of the tensor.
- * @param[in] info The tensor info.
- * @param[in] storage_type The tensor storage type.
- *
- * @return The @ref TensorOperand object.
- */
- TensorOperand &declare_tensor_argument(const std::string &name,
- const TensorInfo &info,
- TensorStorageType storage_type = TensorStorageType::BufferUint8Ptr);
-
- /** Declare a compile-time constant scalar argument.
- *
- * @param[in] name The name of the tile.
- * @param[in] value The value of the tile.
- *
- * @return The @ref TileOperand object.
- */
- TileOperand &declare_tile_argument(const std::string &name, int32_t value);
-
- /** Declare a new tile.
- *
- * The name of the tile must be unique in the current ID space.
- *
- * @param[in] name The name of the tile.
- * @param[in] args The necessary arguments used to create a new @ref TileOperand.
- *
- * @return The @ref TileOperand object.
- */
- template <typename... TArgs>
- TileOperand &declare_tile(const std::string &name, TArgs &&...args)
- {
- const auto var_name = generate_variable_name(name);
- auto operand = std::make_unique<TileOperand>(var_name, ::std::forward<TArgs>(args)...);
-
- return declare_tile_operand(std::move(operand));
- }
-
- // =============================================================================================
- // Load and store
- // =============================================================================================
-
- /** Load the data from the tensor memory to the tile using the sampling information.
- *
- * @param[out] tile The tile to be loaded.
- * @param[in] tensor The tensor to be read.
- * @param[in] sampler The tensor sampling information.
- * @param[in] dilation_y Dilation in the Y dimension.
- */
- void op_load(TileOperand &tile,
- const TensorOperand &tensor,
- const TensorTileSampler &sampler,
- const TileOperand &dilation_y = TileOperand("dil_y", 1));
-
- /** Load the data from the tensor memory to the tile using the indirect buffer approach and the given sampling information.
- *
- * @param[out] tile The tile to be loaded.
- * @param[in] tensor The tensor to be read.
- * @param[in] sampler The tensor sampling information.
- */
- void op_load_indirect(TileOperand &tile, const TensorOperand &tensor, const TensorTileSampler &sampler);
-
- /** Construct an indirection buffer in @p tile containing the precalculated addresses of elements in the source tensor.
- *
- * @param[out] tile The tile to be filled with the precalculated addresses.
- * @param[in] tensor The tensor to be read.
- * @param[in] sampler The tensor sampling information.
- * @param[in] x The X coordinate.
- * @param[in] y The Y coordinate.
- * @param[in] x_off Offset in the X dimension.
- * @param[in] y_off Offset in the Y dimension.
- */
- void util_get_indirect_buffer(TileOperand &tile,
- const TensorOperand &tensor,
- const TensorTileSampler &sampler,
- const TileOperand &x,
- const TileOperand &y,
- const TileOperand &x_off,
- const TileOperand &y_off);
-
- /** Store the tile to the tensor using the specified sampling information.
- *
- * @param[out] tensor The tensor that the tile is written to.
- * @param[in] tile The tile to be stored.
- * @param[in] sampler The tensor sampling information.
- */
- void op_store(TensorOperand &tensor, const TileOperand &tile, const TensorTileSampler &sampler);
-
- // =============================================================================================
- // Data processing
- // =============================================================================================
-
- /** Write assignment: `<dst> = <src>;`.
- *
- * @param[out] dst The destination tile.
- * @param[in] src The source tile.
- */
- void op_assign(const TileOperand &dst, const TileOperand &src);
-
- /** Write the cast: `<dst> = convert_<dst.type><_sat>(<src>);`.
- *
- * @param[out] dst The destination tile.
- * @param[in] src The source tile.
- * @param[in] policy The policy governing the behavior of the cast.
- */
- void op_cast_expression(const TileOperand &dst, const TileOperand &src, ConvertPolicy policy);
-
- /** Write the unary expression: `<dst> = <op> <src>`.
- *
- * @param[out] dst The destination tile.
- * @param[in] op The unary operator.
- * @param[in] src The source tile.
- */
- void op_unary_expression(const TileOperand &dst, UnaryOp op, const TileOperand &src);
-
- /** Write binary expression: `<dst> = <lhs> <op> <rhs>;`.
- *
- * @param[out] dst The destination tile.
- * @param[in] lhs The LHS tile.
- * @param[in] op The binary operator.
- * @param[in] rhs The RHS tile.
- */
- void op_binary_expression(const TileOperand &dst, const TileOperand &lhs, BinaryOp op, const TileOperand &rhs);
-
- /** Write the unary elementwise function: `<dst> = <func>(<src>);`.
- *
- * @param[out] dst The destination tile.
- * @param[in] func The function to be applied to the source tile.
- * @param[in] src The source tile.
- */
- void op_unary_elementwise_function(const TileOperand &dst, UnaryFunction func, const TileOperand &src);
-
- /** Write the binary elementwise function: `<dst> = <func>(<first>, <second>);`.
- *
- * @param[out] dst The destination tile.
- * @param[in] func The function to be applied to the source tiles.
- * @param[in] first The first argument tile.
- * @param[in] second The second argument tile.
- */
- void op_binary_elementwise_function(const TileOperand &dst,
- BinaryFunction func,
- const TileOperand &first,
- const TileOperand &second);
-
- /** Write the ternary elementwise function: `<dst> = <func>(<first>, <second>, <third>);`.
- *
- * @param[out] dst The destination tile.
- * @param[in] func The function to be applied to the source tiles.
- * @param[in] first The first argument tile.
- * @param[in] second The second argument tile.
- * @param[in] third The third argument tile.
- */
- void op_ternary_elementwise_function(const TileOperand &dst,
- TernaryFunction func,
- const TileOperand &first,
- const TileOperand &second,
- const TileOperand &third);
-
- /** Write if-statement: `if(<lhs> <op> <rhs>) { <body> }`.
- *
- * @param[in] lhs The LHS tile of the condition.
- * @param[in] op The relational binary operator.
- * @param[in] rhs The RHS tile of the condition.
- * @param[in] body The body of the if-statement.
- */
- void op_if(const TileOperand &lhs, BinaryOp op, const TileOperand &rhs, const std::function<void()> &body);
-
- /** Write else-if-statement: `else if(<lhs> <op> <rhs>) { <body> }`.
- *
- * @param[in] lhs The LHS tile of the condition.
- * @param[in] op The relational binary operator.
- * @param[in] rhs The RHS tile of the condition.
- * @param[in] body The body of the else-if-statement.
- */
- void op_else_if(const TileOperand &lhs, BinaryOp op, const TileOperand &rhs, const std::function<void()> &body);
-
- /** Write an else-statement: `else { <body> }`.
- *
- * @param[in] body The body of the else-statement.
- */
- void op_else(const std::function<void()> &body);
-
- /** Write for-loops: `for(; <var> <cond_op> <cond_value>; <var> <update_op> <update_value>) { body }`.
- *
- * @param[in] var_name The name of the variable used in the condition.
- * @param[in] cond_op The relational binary operator used in the condition.
- * @param[in] cond_value_name The value which the variable is compared against.
- * @param[in] update_var_name The name of the variable which is updated at every iteration.
- * @param[in] update_op The assignment operator used to update the variable.
- * @param[in, out] update_value_name The value by which the variable is updated at every iteration.
- * @param[in] body The body of the for-loop.
- */
- void op_for_loop(const TileOperand &var_name,
- BinaryOp cond_op,
- const TileOperand &cond_value_name,
- const TileOperand &update_var_name,
- AssignmentOp update_op,
- const TileOperand &update_value_name,
- const std::function<void()> &body);
-
- /** Write the return statement: `return;`
- */
- void op_return();
-
- // =============================================================================================
- // Misc
- // =============================================================================================
-
- /** Set `dst` to the global ID of dimension `dim`.
- *
- * @param[out] dst The tile to be written to.
- * @param[in] dim The global ID dimension.
- */
- void op_get_global_id(const TileOperand &dst, int32_t dim);
-
- // =============================================================================================
- // Code generation
- // =============================================================================================
-
- /** Generate the source code of the kernel. */
- ::std::string generate_code();
-
-private:
- /** Generate the full variable name based on the original name and the ID space.
- *
- * @param[in] name The name of the variable.
- *
- * @return The full variable name.
- */
- ::std::string generate_variable_name(const std::string &name) const;
-
- /** Declare the tile operand.
- *
- * @param[in] operand The tile operand to be declared.
- */
- TileOperand &declare_tile_operand(std::unique_ptr<TileOperand> operand);
-
-private:
- Kernel *_kernel;
- ::std::unique_ptr<prototype::GpuKernelWriterAttribute> _impl_attr;
- ::std::unique_ptr<prototype::IGpuKernelWriter> _impl;
-
- int32_t _id_space{0};
- int32_t _max_id_space{0};
-};
-
-} // namespace ckw
-
-#endif // CKW_PROTOTYPE_INCLUDE_CKW_KERNELWRITER_H
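Read top to bottom, the class above is the full writing flow of the prototype: declare the kernel arguments and tiles, emit loads, expressions and stores, then call generate_code(). Below is a minimal sketch of that flow; it assumes the caller constructs the Kernel, TensorInfo and TensorTileSampler objects (their constructors live in other deleted headers and are not shown here), and it assumes DataType::Fp32 is a valid enumerator of the deleted DataType enum.

    #include "ckw/Kernel.h"
    #include "ckw/KernelWriter.h"
    #include "ckw/TensorInfo.h"
    #include "ckw/TensorTileSampler.h"
    #include "ckw/TileInfo.h"

    #include <string>

    // Hypothetical helper: emits dst = lhs + rhs over one 4x4 tile of each tensor.
    std::string write_add_kernel(ckw::Kernel                  &kernel,
                                 const ckw::TensorInfo        &lhs_info,
                                 const ckw::TensorInfo        &rhs_info,
                                 const ckw::TensorInfo        &dst_info,
                                 const ckw::TensorTileSampler &sampler)
    {
        ckw::KernelWriter writer(kernel);

        // Expose the three tensors as kernel arguments (default buffer storage).
        ckw::TensorOperand &lhs = writer.declare_tensor_argument("lhs", lhs_info);
        ckw::TensorOperand &rhs = writer.declare_tensor_argument("rhs", rhs_info);
        ckw::TensorOperand &dst = writer.declare_tensor_argument("dst", dst_info);

        // Tiles holding the data in flight (DataType::Fp32 assumed).
        ckw::TileOperand &lhs_tile = writer.declare_tile("lhs_tile", ckw::TileInfo(ckw::DataType::Fp32, 4, 4));
        ckw::TileOperand &rhs_tile = writer.declare_tile("rhs_tile", ckw::TileInfo(ckw::DataType::Fp32, 4, 4));
        ckw::TileOperand &dst_tile = writer.declare_tile("dst_tile", ckw::TileInfo(ckw::DataType::Fp32, 4, 4));

        // Load, compute, store, and emit the final source code.
        writer.op_load(lhs_tile, lhs, sampler);
        writer.op_load(rhs_tile, rhs, sampler);
        writer.op_binary_expression(dst_tile, lhs_tile, ckw::BinaryOp::Add, rhs_tile);
        writer.op_store(dst, dst_tile, sampler);
        writer.op_return();

        return writer.generate_code();
    }

The add_exp_store.cpp example removed earlier in this patch exercised a similar sequence.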
diff --git a/compute_kernel_writer/prototype/include/ckw/KernelWriterHelper.h b/compute_kernel_writer/prototype/include/ckw/KernelWriterHelper.h
deleted file mode 100644
index 3ba079bbc2..0000000000
--- a/compute_kernel_writer/prototype/include/ckw/KernelWriterHelper.h
+++ /dev/null
@@ -1,1286 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef CKW_INCLUDE_CKW_KERNELWRITERHELPER_H
-#define CKW_INCLUDE_CKW_KERNELWRITERHELPER_H
-
-#include "ckw/KernelWriter.h"
-#include "ckw/TensorOperand.h"
-#include "ckw/TileOperand.h"
-
-#include <iostream>
-#include <type_traits>
-
-/*
- * By including this header file you will be able to supplement the default
- * Compute Kernel Writer API with additional syntax to help ease the use of CKW.
- *
- * To use the KernelWriterHelper you need to wrap your instance of KernelWriter
- * (or any class deriving from KernelWriter):
- * KernelWriterHelper<KernelWriter> writer;
- * The resulting writer object comprises the original KernelWriter
- * functionality (drop-in replacement), but extends the syntax as follows.
- *
- * Common functions/operators have natural syntax:
- * 1. Unary expressions:
- * writer.op_assign(dst, !src); // Logical NOT
- * writer.op_assign(dst, ~src); // Bitwise NOT
- *
- * 2. Binary expressions:
- * writer.op_assign(dst, lhs + rhs); // Addition
- * writer.op_assign(dst, lhs - rhs); // Subtraction
- * writer.op_assign(dst, lhs * rhs); // Multiplication
- * writer.op_assign(dst, lhs / rhs); // Division
- * writer.op_assign(dst, lhs % rhs); // Modulo
- * writer.op_assign(dst, lhs == rhs); // Equality
- * writer.op_assign(dst, lhs < rhs); // Less-than
- * writer.op_assign(dst, lhs <= rhs); // Less-than-or-equal
- * writer.op_assign(dst, lhs > rhs); // Greater-than
- * writer.op_assign(dst, lhs >= rhs); // Greater-than-or-equal
- * writer.op_assign(dst, lhs ^ rhs); // Bitwise XOR
- * writer.op_assign(dst, logical_and(lhs, rhs)); // Logical AND
- * writer.op_assign(dst, logical_or(lhs, rhs)); // Logical OR
- *
- * 3. Unary elementwise functions:
- * writer.op_assign(dst, exp(src)); // Exponent
- * writer.op_assign(dst, tanh(src)); // Hyperbolic tangent
- * writer.op_assign(dst, sqrt(src)); // Square root
- * writer.op_assign(dst, erf(src)); // Error function
- * writer.op_assign(dst, fabs(src)); // Absolute of floating-point number
- * writer.op_assign(dst, log(src)); // Natural logarithm
- * writer.op_assign(dst, round(src)); // Round
- * writer.op_assign(dst, sizeOf(src)); // sizeof
- *
- * 4. Binary elementwise functions:
- * writer.op_assign(dst, max(first, second)); // Max
- * writer.op_assign(dst, min(first, second)); // Min
- *
- * 5. Ternary elementwise functions:
- * writer.op_assign(dst, select(first, second, third)); // Select
- *
- * NOTE: All the above examples support nesting, so you could write
- * something like: writer.op_assign(dst, src * (log(arg) + sqrt(fabs(arg))));
- *
- *
- * 6. If-statements. The preceding syntax also allows easier writing of if-statements:
- * writer.op_if(<cond>, <body>);
- *
- * For example:
- * writer.op_if(exp(first_arg) == dst, [&]{
- * //...
- * }).op_else_if(exp(first_arg) > dst, [&]{
- * //...
- * }).op_else([&] {
- * //...
- * });
- *
- * 7. For-loops. A similar syntax exists for for-loops:
- * writer.op_for_loop(<cond>, <updater>, <body>);
- *
- * For example:
- * writer.op_for_loop(index < limit, index += step, [&]{
- * //...
- * });
- *
- * NOTE: There are limitations on the for-loop <cond> and <updater> parameters.
- * Nesting is allowed in neither the <cond> (binary expression) nor the
- * <updater> (increment/decrement). For example, `(index + other) < limit` and
- * `index < round(limit)` are invalid <cond> parameters. This is because the
- * semantics of for-loops rely on the condition being evaluated at every iteration,
- * whereas the temporary variables created for nested expressions would be
- * assigned only once, before the loop, so the semantics cannot be guaranteed.
- */
-
-namespace ckw
-{
-
-// ==================================================
-// Type traits
-// ==================================================
-
-/** Specifies if the type can be used as an operand for functions (e.g. max), operations (e.g. *), or assignments. */
-template <typename T>
-struct can_be_operand : ::std::false_type
-{
-};
-
-/** Specifies if the type can be assigned/written to. */
-template <typename T>
-struct can_be_assigned : ::std::false_type
-{
-};
-
-template <>
-struct can_be_operand<TileOperand &> : ::std::true_type
-{
-};
-
-template <>
-struct can_be_assigned<TileOperand &> : ::std::true_type
-{
-};
-
-// ==================================================
-// Assignment
-// ==================================================
-
-/** AST node for assignments.
- *
- * Note that \p TRight must be an operand, and \p TLeft must be assignable.
- *
- * @tparam TLeft The type of the destination of the assignment.
- * @tparam TRight The type of the source assigned to the destination.
- */
-template <typename TLeft,
- typename TRight,
- typename = ::std::enable_if<can_be_operand<TRight>::value && can_be_assigned<TLeft>::value>>
-struct Assignment
-{
- TLeft lhs;
- TRight rhs;
- AssignmentOp opcode;
-};
-
-/** Represents the expression: `\p lhs += \p rhs`.
- *
- * @tparam TLeft The type of the LHS of the assignment.
- * @tparam TRight The type of the RHS of the assignment.
- * @param[in] lhs The LHS of the assignment.
- * @param[in] rhs The RHS of the assignment.
- * @return The resulting AST node.
- */
-template <typename TLeft, typename TRight>
-inline Assignment<TLeft, TRight> operator+=(TLeft &&lhs, TRight &&rhs)
-{
- return Assignment<TLeft, TRight>{std::forward<TLeft>(lhs), std::forward<TRight>(rhs), AssignmentOp::Increment};
-}
-
-/** Represents the expression: `\p lhs -= \p rhs`.
- *
- * @tparam TLeft The type of the LHS of the assignment.
- * @tparam TRight The type of the RHS of the assignment.
- * @param[in] lhs The LHS of the assignment.
- * @param[in] rhs The RHS of the assignment.
- * @return The resulting AST node.
- */
-template <typename TLeft, typename TRight>
-inline Assignment<TLeft, TRight> operator-=(TLeft &&lhs, TRight &&rhs)
-{
- return Assignment<TLeft, TRight>{std::forward<TLeft>(lhs), std::forward<TRight>(rhs), AssignmentOp::Decrement};
-}
-
-// ==================================================
-// Unary expression
-// ==================================================
-
-/** AST node for unary expressions.
- *
- * Note that \p TSrc must be an operand.
- *
- * @tparam TSrc The type of the argument to the expression.
- */
-template <typename TSrc, typename = ::std::enable_if<can_be_operand<TSrc>::value>>
-struct UnaryExpression
-{
- TSrc src;
- UnaryOp opcode;
-};
-
-template <typename TLeft>
-struct can_be_operand<UnaryExpression<TLeft>> : ::std::true_type
-{
-};
-
-/** Represents the expression: `!\p src`.
- *
- * @tparam TSrc The type of the argument.
- * @param[in] src The argument.
- * @return The resulting AST node.
- */
-template <typename TSrc>
-inline UnaryExpression<TSrc> operator!(TSrc &&src)
-{
- return UnaryExpression<TSrc>{std::forward<TSrc>(src), UnaryOp::LogicalNot};
-}
-
-/** Represents the expression: `~\p src`.
- *
- * @tparam TSrc The type of the argument.
- * @param[in] src The argument.
- * @return The resulting AST node.
- */
-template <typename TSrc>
-inline UnaryExpression<TSrc> operator~(TSrc &&src)
-{
- return UnaryExpression<TSrc>{std::forward<TSrc>(src), UnaryOp::BitwiseNot};
-}
-
-// ==================================================
-// Binary expressions
-// ==================================================
-
-/** AST node for binary expressions.
- *
- * Note that both \p TLeft and \p TRight must be operands.
- *
- * @tparam TLeft The type of the left argument of the expression.
- * @tparam TRight The type of the right argument of the expression.
- */
-template <typename TLeft,
- typename TRight,
- typename = ::std::enable_if_t<can_be_operand<TLeft>::value && can_be_operand<TRight>::value>>
-struct BinaryExpression
-{
- TLeft lhs;
- TRight rhs;
- BinaryOp opcode;
-};
-
-template <typename TLeft, typename TRight>
-struct can_be_operand<BinaryExpression<TLeft, TRight>> : ::std::true_type
-{
-};
-
-/** Represents the expression: `\p lhs + \p rhs`.
- *
- * @tparam TLeft The type of the LHS of the expression.
- * @tparam TRight The type of the RHS of the expression.
- * @param[in] lhs The LHS of the expression.
- * @param[in] rhs The RHS of the expression.
- * @return The resulting AST node.
- */
-template <typename TLeft, typename TRight>
-inline BinaryExpression<TLeft, TRight> operator+(TLeft &&lhs, TRight &&rhs)
-{
- return BinaryExpression<TLeft, TRight>{std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::Add};
-}
-
-/** Represents the expression: `\p lhs - \p rhs`.
- *
- * @tparam TLeft The type of the LHS of the expression.
- * @tparam TRight The type of the RHS of the expression.
- * @param[in] lhs The LHS of the expression.
- * @param[in] rhs The RHS of the expression.
- * @return The resulting AST node.
- */
-template <typename TLeft, typename TRight>
-inline BinaryExpression<TLeft, TRight> operator-(TLeft &&lhs, TRight &&rhs)
-{
- return BinaryExpression<TLeft, TRight>{std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::Sub};
-}
-
-/** Represents the expression: `\p lhs * \p rhs`.
- *
- * @tparam TLeft The type of the LHS of the expression.
- * @tparam TRight The type of the RHS of the expression.
- * @param[in] lhs The LHS of the expression.
- * @param[in] rhs The RHS of the expression.
- * @return The resulting AST node.
- */
-template <typename TLeft, typename TRight>
-inline BinaryExpression<TLeft, TRight> operator*(TLeft &&lhs, TRight &&rhs)
-{
- return BinaryExpression<TLeft, TRight>{std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::Mul};
-}
-
-/** Represents the expression: `\p lhs / \p rhs`.
- *
- * @tparam TLeft The type of the LHS of the expression.
- * @tparam TRight The type of the RHS of the expression.
- * @param[in] lhs The LHS of the expression.
- * @param[in] rhs The RHS of the expression.
- * @return The resulting AST node.
- */
-template <typename TLeft, typename TRight>
-inline BinaryExpression<TLeft, TRight> operator/(TLeft &&lhs, TRight &&rhs)
-{
- return BinaryExpression<TLeft, TRight>{std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::Div};
-}
-
-/** Represents the expression: `\p lhs % \p rhs`.
- *
- * @tparam TLeft The type of the LHS of the expression.
- * @tparam TRight The type of the RHS of the expression.
- * @param[in] lhs The LHS of the expression.
- * @param[in] rhs The RHS of the expression.
- * @return The resulting AST node.
- */
-template <typename TLeft, typename TRight>
-inline BinaryExpression<TLeft, TRight> operator%(TLeft &&lhs, TRight &&rhs)
-{
- return BinaryExpression<TLeft, TRight>{std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::Mod};
-}
-
-/** Represents the expression: `\p lhs == \p rhs`.
- *
- * @tparam TLeft The type of the LHS of the expression.
- * @tparam TRight The type of the RHS of the expression.
- * @param[in] lhs The LHS of the expression.
- * @param[in] rhs The RHS of the expression.
- * @return The resulting AST node.
- */
-template <typename TLeft, typename TRight>
-inline BinaryExpression<TLeft, TRight> operator==(TLeft &&lhs, TRight &&rhs)
-{
- return BinaryExpression<TLeft, TRight>{std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::Equal};
-}
-
-/** Represents the expression: `\p lhs < \p rhs`.
- *
- * @tparam TLeft The type of the LHS of the expression.
- * @tparam TRight The type of the RHS of the expression.
- * @param[in] lhs The LHS of the expression.
- * @param[in] rhs The RHS of the expression.
- * @return The resulting AST node.
- */
-template <typename TLeft, typename TRight>
-inline BinaryExpression<TLeft, TRight> operator<(TLeft &&lhs, TRight &&rhs)
-{
- return BinaryExpression<TLeft, TRight>{std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::Less};
-}
-
-/** Represents the expression: `\p lhs <= \p rhs`.
- *
- * @tparam TLeft The type of the LHS of the expression.
- * @tparam TRight The type of the RHS of the expression.
- * @param[in] lhs The LHS of the expression.
- * @param[in] rhs The RHS of the expression.
- * @return The resulting AST node.
- */
-template <typename TLeft, typename TRight>
-inline BinaryExpression<TLeft, TRight> operator<=(TLeft &&lhs, TRight &&rhs)
-{
- return BinaryExpression<TLeft, TRight>{std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::LessEqual};
-}
-
-/** Represents the expression: `\p lhs > \p rhs`.
- *
- * @tparam TLeft The type of the LHS of the expression.
- * @tparam TRight The type of the RHS of the expression.
- * @param[in] lhs The LHS of the expression.
- * @param[in] rhs The RHS of the expression.
- * @return The resulting AST node.
- */
-template <typename TLeft, typename TRight>
-inline BinaryExpression<TLeft, TRight> operator>(TLeft &&lhs, TRight &&rhs)
-{
- return BinaryExpression<TLeft, TRight>{std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::Greater};
-}
-
-/** Represents the expression: `\p lhs >= \p rhs`.
- *
- * @tparam TLeft The type of the LHS of the expression.
- * @tparam TRight The type of the RHS of the expression.
- * @param[in] lhs The LHS of the expression.
- * @param[in] rhs The RHS of the expression.
- * @return The resulting AST node.
- */
-template <typename TLeft, typename TRight>
-inline BinaryExpression<TLeft, TRight> operator>=(TLeft &&lhs, TRight &&rhs)
-{
- return BinaryExpression<TLeft, TRight>{std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::GreaterEqual};
-}
-
-/** Represents the expression: `\p lhs ^ \p rhs`.
- *
- * @tparam TLeft The type of the LHS of the expression.
- * @tparam TRight The type of the RHS of the expression.
- * @param[in] lhs The LHS of the expression.
- * @param[in] rhs The RHS of the expression.
- * @return The resulting AST node.
- */
-template <typename TLeft, typename TRight>
-inline BinaryExpression<TLeft, TRight> operator^(TLeft &&lhs, TRight &&rhs)
-{
- return BinaryExpression<TLeft, TRight>{std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::BitwiseXOR};
-}
-
-/** Represents the expression: `\p lhs && \p rhs`.
- *
- * @tparam TLeft The type of the LHS of the expression.
- * @tparam TRight The type of the RHS of the expression.
- * @param[in] lhs The LHS of the expression.
- * @param[in] rhs The RHS of the expression.
- * @return The resulting AST node.
- */
-template <typename TLeft, typename TRight>
-inline BinaryExpression<TLeft, TRight> logical_and(TLeft &&lhs, TRight &&rhs)
-{
- return BinaryExpression<TLeft, TRight>{std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::LogicalAnd};
-}
-
-/** Represents the chained expression: `\p lhs && \p rhs && ...` (variadic overload).
- *
- * @tparam TLeft The type of the LHS of the expression.
- * @tparam TRight The type of the RHS of the expression.
- * @param[in] lhs The LHS of the expression.
- * @param[in] rhs The RHS of the expression.
- * @return The resulting AST node.
- */
-template <typename TLeft, typename TRight, typename... TOps>
-inline BinaryExpression<BinaryExpression<TLeft, TRight>, TOps...> logical_and(TLeft &&lhs, TRight &&rhs, TOps &&...ops)
-{
- return logical_and(
- BinaryExpression<TLeft, TRight>{std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::LogicalAnd},
- std::forward<TOps>(ops)...);
-}
-
-/** Represents the expression: `\p lhs || \p rhs`.
- *
- * @tparam TLeft The type of the LHS of the expression.
- * @tparam TRight The type of the RHS of the expression.
- * @param[in] lhs The LHS of the expression.
- * @param[in] rhs The RHS of the expression.
- * @return The resulting AST node.
- */
-template <typename TLeft, typename TRight>
-inline BinaryExpression<TLeft, TRight> logical_or(TLeft &&lhs, TRight &&rhs)
-{
- return BinaryExpression<TLeft, TRight>{std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::LogicalOr};
-}
-
-/** Represents the chained expression: `\p lhs || \p rhs || ...` (variadic overload).
- *
- * @tparam TLeft The type of the LHS of the expression.
- * @tparam TRight The type of the RHS of the expression.
- * @param[in] lhs The LHS of the expression.
- * @param[in] rhs The RHS of the expression.
- * @return The resulting AST node.
- */
-template <typename TLeft, typename TRight, typename... TOps>
-inline BinaryExpression<BinaryExpression<TLeft, TRight>, TOps...> logical_or(TLeft &&lhs, TRight &&rhs, TOps &&...ops)
-{
- return logical_or(
- BinaryExpression<TLeft, TRight>{std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::LogicalOr},
- std::forward<TOps>(ops)...);
-}
-
-// ==================================================
-// Unary elementwise functions
-// ==================================================
-
-/** AST node for unary elementwise functions.
- *
- * Note that \p TSrc must be an operand.
- *
- * @tparam TSrc The type of the argument to the function.
- */
-template <typename TSrc, typename = ::std::enable_if<can_be_operand<TSrc>::value>>
-struct UnaryElementwiseFunction
-{
- TSrc src;
- UnaryFunction opcode;
-};
-
-template <typename TLeft>
-struct can_be_operand<UnaryElementwiseFunction<TLeft>> : ::std::true_type
-{
-};
-
-/** Represents the expression: `exp(\p src)`.
- *
- * @tparam TSrc The type of the argument.
- * @param[in] src The argument.
- * @return The resulting AST node.
- */
-template <typename TSrc>
-UnaryElementwiseFunction<TSrc> exp(TSrc &&src)
-{
- return UnaryElementwiseFunction<TSrc>{std::forward<TSrc>(src), UnaryFunction::Exp};
-}
-
-/** Represents the expression: `tanh(\p src)`.
- *
- * @tparam TSrc The type of the argument.
- * @param[in] src The argument.
- * @return The resulting AST node.
- */
-template <typename TSrc>
-UnaryElementwiseFunction<TSrc> tanh(TSrc &&src)
-{
- return UnaryElementwiseFunction<TSrc>{std::forward<TSrc>(src), UnaryFunction::Tanh};
-}
-
-/** Represents the expression: `sqrt(\p src)`.
- *
- * @tparam TSrc The type of the argument.
- * @param[in] src The argument.
- * @return The resulting AST node.
- */
-template <typename TSrc>
-UnaryElementwiseFunction<TSrc> sqrt(TSrc &&src)
-{
- return UnaryElementwiseFunction<TSrc>{std::forward<TSrc>(src), UnaryFunction::Sqrt};
-}
-
-/** Represents the expression: `erf(\p src)`.
- *
- * @tparam TSrc The type of the argument.
- * @param[in] src The argument.
- * @return The resulting AST node.
- */
-template <typename TSrc>
-UnaryElementwiseFunction<TSrc> erf(TSrc &&src)
-{
- return UnaryElementwiseFunction<TSrc>{std::forward<TSrc>(src), UnaryFunction::Erf};
-}
-
-/** Represents the expression: `fabs(\p src)`.
- *
- * @tparam TSrc The type of the argument.
- * @param[in] src The argument.
- * @return The resulting AST node.
- */
-template <typename TSrc>
-UnaryElementwiseFunction<TSrc> fabs(TSrc &&src)
-{
- return UnaryElementwiseFunction<TSrc>{std::forward<TSrc>(src), UnaryFunction::Fabs};
-}
-
-/** Represents the expression: `log(\p src)`.
- *
- * @tparam TSrc The type of the argument.
- * @param[in] src The argument.
- * @return The resulting AST node.
- */
-template <typename TSrc>
-UnaryElementwiseFunction<TSrc> log(TSrc &&src)
-{
- return UnaryElementwiseFunction<TSrc>{std::forward<TSrc>(src), UnaryFunction::Log};
-}
-
-/** Represents the expression: `round(\p src)`.
- *
- * @tparam TSrc The type of the argument.
- * @param[in] src The argument.
- * @return The resulting AST node.
- */
-template <typename TSrc>
-UnaryElementwiseFunction<TSrc> round(TSrc &&src)
-{
- return UnaryElementwiseFunction<TSrc>{std::forward<TSrc>(src), UnaryFunction::Round};
-}
-
-/** Represents the expression: `sizeof(\p src)`.
- *
- * @tparam TSrc The type of the argument.
- * @param[in] src The argument.
- * @return The resulting AST node.
- */
-template <typename TSrc>
-UnaryElementwiseFunction<TSrc> sizeOf(TSrc &&src)
-{
- return UnaryElementwiseFunction<TSrc>{std::forward<TSrc>(src), UnaryFunction::SizeOf};
-}
-
-// ==================================================
-// Binary elementwise functions
-// ==================================================
-
-/** AST node for binary elementwise functions.
- *
- * Note that both \p TFirst and \p TSecond must be operands.
- *
- * @tparam TFirst The type of the left argument of the function.
- * @tparam TSecond The type of the right argument of the function.
- */
-template <typename TFirst,
- typename TSecond,
- typename = ::std::enable_if<can_be_operand<TFirst>::value && can_be_operand<TSecond>::value>>
-struct BinaryElementwiseFunction
-{
- TFirst first;
- TSecond second;
- BinaryFunction opcode;
-};
-
-template <typename TFirst, typename TSecond>
-struct can_be_operand<BinaryElementwiseFunction<TFirst, TSecond>> : ::std::true_type
-{
-};
-
-/** Represents the function call: `max(\p first, \p second)`.
- *
- * @tparam TFirst The type of the first argument.
- * @tparam TSecond The type of the second argument.
- * @param[in] first The first argument.
- * @param[in] second The second argument.
- * @return The resulting AST node.
- */
-template <typename TFirst, typename TSecond>
-BinaryElementwiseFunction<TFirst, TSecond> max(TFirst &&first, TSecond &&second)
-{
- return BinaryElementwiseFunction<TFirst, TSecond>{std::forward<TFirst>(first), std::forward<TSecond>(second),
- BinaryFunction::Max};
-}
-
-/** Represents the function call: `min(\p first, \p second)`.
- *
- * @tparam TFirst The type of the first argument.
- * @tparam TSecond The type of the second argument.
- * @param[in] first The first argument.
- * @param[in] second The second argument.
- * @return The resulting AST node.
- */
-template <typename TFirst, typename TSecond>
-BinaryElementwiseFunction<TFirst, TSecond> min(TFirst &&first, TSecond &&second)
-{
- return BinaryElementwiseFunction<TFirst, TSecond>{std::forward<TFirst>(first), std::forward<TSecond>(second),
- BinaryFunction::Min};
-}
-
-// ==================================================
-// Ternary elementwise functions
-// ==================================================
-
-/** AST node for ternary elementwise functions.
- *
- * Note that \p TFirst, \p TSecond, and \p TThird all must be operands.
- *
- * @tparam TFirst The type of the first argument to the function.
- * @tparam TSecond The type of the second argument to the function.
- * @tparam TThird The type of the third argument to the function.
- */
-template <typename TFirst,
- typename TSecond,
- typename TThird,
- typename = ::std::enable_if<can_be_operand<TFirst>::value && can_be_operand<TSecond>::value &&
- can_be_operand<TThird>::value>>
-struct TernaryElementwiseFunction
-{
- TFirst first;
- TSecond second;
- TThird third;
- TernaryFunction opcode;
-};
-
-template <typename TFirst, typename TSecond, typename TThird>
-struct can_be_operand<TernaryElementwiseFunction<TFirst, TSecond, TThird>> : ::std::true_type
-{
-};
-
-/** Represents the function call: `select(\p first, \p second, \p third)`.
- *
- * @tparam TFirst The type of the first argument.
- * @tparam TSecond The type of the second argument.
- * @tparam TThird The type of the third argument.
- * @param[in] first The first argument.
- * @param[in] second The second argument.
- * @param[in] third The third argument.
- * @return The resulting AST node.
- */
-template <typename TFirst, typename TSecond, typename TThird>
-TernaryElementwiseFunction<TFirst, TSecond, TThird> select(TFirst &&first, TSecond &&second, TThird &&third)
-{
- return TernaryElementwiseFunction<TFirst, TSecond, TThird>{std::forward<TFirst>(first),
- std::forward<TSecond>(second),
- std::forward<TThird>(third), TernaryFunction::Select};
-}
-
-/** Helper class used to extend a KernelWriter with additional functionality
- * in order to make writing easier.
- *
- * This extension automatically handles creation of temporary variables, and
- * allows nested function calls and operations.
- *
- * @tparam TWriter The type of KernelWriter to be overloaded. This must inherit from KernelWriter.
- */
-template <class TWriter, typename = std::enable_if<std::is_base_of<KernelWriter, TWriter>::value>>
-class KernelWriterHelper : public TWriter
-{
-public:
- using TWriter::TWriter;
-
- // ==================================================
- // If-statements
- // ==================================================
-
- // Un-hide original implementation, in case the original implementation is required.
- using TWriter::op_if;
-
- /** Represents the if-statement: `if(\p cond) { \p body }`.
- *
- * The BinaryExpression is unpacked and its components are forwarded to
- * the underlying KernelWriter's implementation.
- *
- * @param[in] cond The BinaryExpression representing the condition.
- * @param[in] body The body of the if-statement.
- */
- KernelWriterHelper<TWriter> &op_if(const BinaryExpression<TileOperand &, TileOperand &> &cond,
- const std::function<void()> &body)
- {
- TWriter::op_if(cond.lhs, cond.opcode, cond.rhs, body);
- return *this;
- }
-
- /** Represents the if-statement: `if(\p cond) { \p body }`.
- *
- * The BinaryExpression is unpacked and its components are forwarded to
- * the underlying KernelWriter's implementation.
- *
- * @param[in] cond The BinaryExpression representing the condition.
- * @param[in] body The body of the if-statement.
- */
- template <typename TRight>
- KernelWriterHelper<TWriter> &op_if(const BinaryExpression<TileOperand &, TRight> &cond,
- const std::function<void()> &body)
- {
- auto &tmp1 = declare_temp_tile(cond.lhs.tile_info());
- op_assign(tmp1, cond.rhs);
- TWriter::op_if(cond.lhs, cond.opcode, tmp1, body);
- return *this;
- }
-
- /** Represents the if-statement: `if(\p cond) { \p body }`.
- *
- * The BinaryExpression is unpacked and its components are forwarded to
- * the underlying KernelWriter's implementation.
- *
- * @param[in] cond The BinaryExpression representing the condition.
- * @param[in] body The body of the if-statement.
- */
- template <typename TLeft>
- KernelWriterHelper<TWriter> &op_if(const BinaryExpression<TLeft, TileOperand &> &cond,
- const std::function<void()> &body)
- {
- auto &tmp1 = declare_temp_tile(cond.rhs.tile_info());
- op_assign(tmp1, cond.lhs);
- TWriter::op_if(tmp1, cond.opcode, cond.rhs, body);
- return *this;
- }
-
- // Un-hide original implementation, in case the original implementation is required.
- using TWriter::op_else_if;
-
- /** Represents the else-if-statement: `else if(\p cond) { \p body }`.
- *
- * The BinaryExpression is unpacked and its components are forwarded to
- * the underlying KernelWriter's implementation.
- *
- * @param[in] cond The BinaryExpression representing the condition.
- * @param[in] body The body of the else-if-statement.
- */
- KernelWriterHelper<TWriter> &op_else_if(const BinaryExpression<TileOperand &, TileOperand &> &cond,
- const std::function<void()> &body)
- {
- TWriter::op_else_if(cond.lhs, cond.opcode, cond.rhs, body);
- return *this;
- }
-
- /** Represents the else-if-statement: `else if(\p cond) { \p body }`.
- *
- * The BinaryExpression is unpacked and its components are forwarded to
- * the underlying KernelWriter's implementation.
- *
- * @param[in] cond The BinaryExpression representing the condition.
- * @param[in] body The body of the else-if-statement.
- */
- template <typename TRight>
- KernelWriterHelper<TWriter> &op_else_if(const BinaryExpression<TileOperand &, TRight> &cond,
- const std::function<void()> &body)
- {
- auto &tmp1 = declare_temp_tile(cond.lhs.tile_info());
- op_assign(tmp1, cond.rhs);
- TWriter::op_else_if(cond.lhs, cond.opcode, tmp1, body);
- return *this;
- }
-
- /** Represents the else-if-statement: `else if(\p cond) { \p body }`.
- *
- * The BinaryExpression is unpacked and its components are forwarded to
- * the underlying KernelWriter's implementation.
- *
- * @param[in] cond The BinaryExpression representing the condition.
- * @param[in] body The body of the else-if-statement.
- */
- template <typename TLeft>
- KernelWriterHelper<TWriter> &op_else_if(const BinaryExpression<TLeft, TileOperand &> &cond,
- const std::function<void()> &body)
- {
- auto &tmp1 = declare_temp_tile(cond.rhs.tile_info());
- op_assign(tmp1, cond.lhs);
- TWriter::op_else_if(tmp1, cond.opcode, cond.rhs, body);
- return *this;
- }
-
- // ==================================================
- // For-loops
- // ==================================================
-
- // Un-hide original implementation, in case the original implementation is required.
- using TWriter::op_for_loop;
-
- /** Represents the for-loop: `for(;\p cond; \p updater) { \p body }`.
- *
- * The BinaryExpression for the condition and the Assignment
- * for the updater are unpacked and their components are forwarded to
- * the underlying KernelWriter's implementation.
- *
- * @param[in] cond The BinaryExpression representing the condition.
- * @param[in] updater The Assignment representing the updater.
- * @param[in] body The body of the for-loop.
- */
- void op_for_loop(const BinaryExpression<TileOperand &, TileOperand &> &cond,
- const Assignment<TileOperand &, TileOperand &> &updater,
- const std::function<void()> &body)
- {
- TWriter::op_for_loop(cond.lhs, cond.opcode, cond.rhs, updater.lhs, updater.opcode, updater.rhs, body);
- }
-
- // ==================================================
- // Unary expressions
- // ==================================================
-
- // Un-hide original implementation, in case the original implementation is required.
- using TWriter::op_assign;
-
- /** Represents the assignment: `\p dst = \p exp`.
- *
- * The UnaryExpression is unpacked and its components are forwarded to
- * the underlying KernelWriter's implementation.
- *
- * @param[in] dst The tile which is assigned to.
- * @param[in] exp The UnaryExpression representing the expression to be evaluated and assigned.
- */
- void op_assign(const TileOperand &dst, const UnaryExpression<TileOperand &> &exp)
- {
- TWriter::op_unary_expression(dst, exp.opcode, exp.src);
- }
-
- // ==================================================
- // Binary expressions
- // ==================================================
-
- /** Represents the assignment: `\p dst = \p exp`.
- *
- * The BinaryExpression is unpacked and its components are forwarded to
- * the underlying KernelWriter's implementation.
- *
- * @param[in] dst The tile which is assigned to.
- * @param[in] exp The BinaryExpression representing the expression to be evaluated and assigned.
- */
- void op_assign(const TileOperand &dst, const BinaryExpression<TileOperand &, TileOperand &> &exp)
- {
- TWriter::op_binary_expression(dst, exp.lhs, exp.opcode, exp.rhs);
- }
-
- /** Represents the assignment: `\p dst = \p exp`.
- *
- * The BinaryExpression is unpacked and its components are forwarded to
- * the underlying KernelWriter's implementation.
- *
- * @param[in] dst The tile which is assigned to.
- * @param[in] exp The BinaryExpression representing the expression to be evaluated and assigned.
- */
- template <typename TRight>
- void op_assign(const TileOperand &dst, const BinaryExpression<TileOperand &, TRight> &exp)
- {
- auto &tmp1 = declare_temp_tile(dst.tile_info());
- op_assign(tmp1, exp.rhs);
- TWriter::op_binary_expression(dst, exp.lhs, exp.opcode, tmp1);
- }
-
- /** Represents the assignment: `\p dst = \p exp`.
- *
- * The BinaryExpression is unpacked and its components are forwarded to
- * the underlying KernelWriter's implementation.
- *
- * @param[in] dst The tile which is assigned to.
- * @param[in] exp The BinaryExpression representing the expression to be evaluated and assigned.
- */
- template <typename TLeft>
- void op_assign(const TileOperand &dst, const BinaryExpression<TLeft, TileOperand &> &exp)
- {
- auto &tmp1 = declare_temp_tile(dst.tile_info());
- op_assign(tmp1, exp.lhs);
- TWriter::op_binary_expression(dst, tmp1, exp.opcode, exp.rhs);
- }
-
- /** Represents the assignment: `\p dst = \p exp`.
- *
- * The BinaryExpression is unpacked and its components are forwarded to
- * the underlying KernelWriter's implementation.
- *
- * @param[in] dst The tile which is assigned to.
- * @param[in] exp The BinaryExpression representing the expression to be evaluated and assigned.
- */
- template <typename TLeft, typename TRight>
- void op_assign(const TileOperand &dst, const BinaryExpression<TLeft, TRight> &exp)
- {
- auto &tmp1 = declare_temp_tile(dst.tile_info());
- auto &tmp2 = declare_temp_tile(dst.tile_info());
- op_assign(tmp1, exp.lhs);
- op_assign(tmp2, exp.rhs);
- TWriter::op_binary_expression(dst, tmp1, exp.opcode, tmp2);
- }
-
- // ==================================================
- // Unary elementwise functions
- // ==================================================
-
- /** Represents the assignment: `\p dst = \p exp`.
- *
- * The UnaryElementwiseFunction is unpacked and its components are forwarded to
- * the underlying KernelWriter's implementation.
- *
- * @param[in] dst The tile which is assigned to.
- * @param[in] exp The UnaryElementwiseFunction representing the expression to be evaluated and assigned.
- */
- void op_assign(const TileOperand &dst, const UnaryElementwiseFunction<TileOperand &> &exp)
- {
- TWriter::op_unary_elementwise_function(dst, exp.opcode, exp.src);
- }
-
- /** Represents the assignment: `\p dst = \p exp`.
- *
- * The UnaryElementwiseFunction is unpacked and its components are forwarded to
- * the underlying KernelWriter's implementation.
- *
- * @param[in] dst The tile which is assigned to.
- * @param[in] exp The UnaryElementwiseFunction representing the expression to be evaluated and assigned.
- */
- template <typename TArg>
- void op_assign(const TileOperand &dst, const UnaryElementwiseFunction<TArg> &exp)
- {
- auto &tmp1 = declare_temp_tile(dst.tile_info());
- op_assign(tmp1, exp.src);
- TWriter::op_unary_elementwise_function(dst, exp.opcode, tmp1);
- }
-
- // ==================================================
- // Binary elementwise functions
- // ==================================================
-
- /** Represents the assignment: `\p dst = \p exp`.
- *
- * The BinaryElementwiseFunction is unpacked and its components are forwarded to
- * the underlying KernelWriter's implementation.
- *
- * @param[in] dst The tile which is assigned to.
- * @param[in] exp The BinaryElementwiseFunction representing the expression to be evaluated and assigned.
- */
- void op_assign(const TileOperand &dst, const BinaryElementwiseFunction<TileOperand &, TileOperand &> &exp)
- {
- TWriter::op_binary_elementwise_function(dst, exp.opcode, exp.first, exp.second);
- }
-
- /** Represents the assignment: `\p dst = \p exp`.
- *
- * The BinaryElementwiseFunction is unpacked and its components are forwarded to
- * the underlying KernelWriter's implementation.
- *
- * @param[in] dst The tile which is assigned to.
- * @param[in] exp The BinaryElementwiseFunction representing the expression to be evaluated and assigned.
- */
- template <typename TRight>
- void op_assign(const TileOperand &dst, const BinaryElementwiseFunction<TileOperand &, TRight> &exp)
- {
- auto &tmp1 = declare_temp_tile(dst.tile_info());
- op_assign(tmp1, exp.second);
- TWriter::op_binary_elementwise_function(dst, exp.opcode, exp.first, tmp1);
- }
-
- /** Represents the assignment: `\p dst = \p exp`.
- *
- * The BinaryElementwiseFunction is unpacked and its components are forwarded to
- * the underlying KernelWriter's implementation.
- *
- * @param[in] dst The tile which is assigned to.
- * @param[in] exp The BinaryElementwiseFunction representing the expression to be evaluated and assigned.
- */
- template <typename TLeft>
- void op_assign(const TileOperand &dst, const BinaryElementwiseFunction<TLeft, TileOperand &> &exp)
- {
- auto &tmp1 = declare_temp_tile(dst.tile_info());
- op_assign(tmp1, exp.first);
- TWriter::op_binary_elementwise_function(dst, exp.opcode, tmp1, exp.second);
- }
-
- /** Represents the assignment: `\p dst = \p exp`.
- *
- * The BinaryElementwiseFunction is unpacked and its components are forwarded to
- * the underlying KernelWriter's implementation.
- *
- * @param[in] dst The tile which is assigned to.
- * @param[in] exp The BinaryElementwiseFunction representing the expression to be evaluated and assigned.
- */
- template <typename TLeft, typename TRight>
- void op_assign(const TileOperand &dst, const BinaryElementwiseFunction<TLeft, TRight> &exp)
- {
- auto &tmp1 = declare_temp_tile(dst.tile_info());
- auto &tmp2 = declare_temp_tile(dst.tile_info());
- op_assign(tmp1, exp.first);
- op_assign(tmp2, exp.second);
- TWriter::op_binary_elementwise_function(dst, exp.opcode, tmp1, tmp2);
- }
-
- // ==================================================
- // Ternary elementwise functions
- // ==================================================
-
- /** Represents the assignment: `\p dst = \p exp`.
- *
- * The TernaryElementwiseFunction is unpacked and its components are forwarded to
- * the underlying KernelWriter's implementation.
- *
- * @param[in] dst The tile which is assigned to.
- * @param[in] exp The TernaryElementwiseFunction representing the expression to be evaluated and assigned.
- */
- void op_assign(const TileOperand &dst,
- const TernaryElementwiseFunction<TileOperand &, TileOperand &, TileOperand &> &exp)
- {
- TWriter::op_ternary_elementwise_function(dst, exp.opcode, exp.first, exp.second, exp.third);
- }
-
- /** Represents the assignment: `\p dst = \p exp`.
- *
- * The TernaryElementwiseFunction is unpacked and its components are forwarded to
- * the underlying KernelWriter's implementation.
- *
- * @param[in] dst The tile which is assigned to.
- * @param[in] exp The TernaryElementwiseFunction representing the expression to be evaluated and assigned.
- */
- template <typename TFirst>
- void op_assign(const TileOperand &dst, const TernaryElementwiseFunction<TFirst, TileOperand &, TileOperand &> &exp)
- {
- auto &tmp1 = declare_temp_tile(dst.tile_info());
- op_assign(tmp1, exp.first);
- TWriter::op_ternary_elementwise_function(dst, exp.opcode, tmp1, exp.second, exp.third);
- }
-
- /** Represents the assignment: `\p dst = \p exp`.
- *
- * The TernaryElementwiseFunction is unpacked and its components are forwarded to
- * the underlying KernelWriter's implementation.
- *
- * @param[in] dst The tile which is assigned to.
- * @param[in] exp The TernaryElementwiseFunction representing the expression to be evaluated and assigned.
- */
- template <typename TSecond>
- void op_assign(const TileOperand &dst, const TernaryElementwiseFunction<TileOperand &, TSecond, TileOperand &> &exp)
- {
- auto &tmp1 = declare_temp_tile(dst.tile_info());
- op_assign(tmp1, exp.second);
- TWriter::op_ternary_elementwise_function(dst, exp.opcode, exp.first, tmp1, exp.third);
- }
-
- /** Represents the assignment: `\p dst = \p exp`.
- *
- * The TernaryElementwiseFunction is unpacked and its components are forwarded to
- * the underlying KernelWriter's implementation.
- *
- * @param[in] dst The tile which is assigned to.
- * @param[in] exp The TernaryElementwiseFunction representing the expression to be evaluated and assigned.
- */
- template <typename TThird>
- void op_assign(const TileOperand &dst, const TernaryElementwiseFunction<TileOperand &, TileOperand &, TThird> &exp)
- {
- auto &tmp1 = declare_temp_tile(dst.tile_info());
- op_assign(tmp1, exp.third);
- TWriter::op_ternary_elementwise_function(dst, exp.opcode, exp.first, exp.second, tmp1);
- }
-
- /** Represents the assignment: `\p dst = \p exp`.
- *
- * The TernaryElementwiseFunction is unpacked and its components are forwarded to
- * the underlying KernelWriter's implementation.
- *
- * @param[in] dst The tile which is assigned to.
- * @param[in] exp The TernaryElementwiseFunction representing the expression to be evaluated and assigned.
- */
- template <typename TFirst, typename TSecond>
- void op_assign(const TileOperand &dst, const TernaryElementwiseFunction<TFirst, TSecond, TileOperand &> &exp)
- {
- auto &tmp1 = declare_temp_tile(dst.tile_info());
- auto &tmp2 = declare_temp_tile(dst.tile_info());
- op_assign(tmp1, exp.first);
- op_assign(tmp2, exp.second);
- TWriter::op_ternary_elementwise_function(dst, exp.opcode, tmp1, tmp2, exp.third);
- }
-
- /** Represents the assignment: `\p dst = \p exp`.
- *
- * The TernaryElementwiseFunction is unpacked and its components are forwarded to
- * the underlying KernelWriter's implementation.
- *
- * @param[in] dst The tile which is assigned to.
- * @param[in] exp The TernaryElementwiseFunction representing the expression to be evaluated and assigned.
- */
- template <typename TFirst, typename TThird>
- void op_assign(const TileOperand &dst, const TernaryElementwiseFunction<TFirst, TileOperand &, TThird> &exp)
- {
- auto &tmp1 = declare_temp_tile(dst.tile_info());
- auto &tmp2 = declare_temp_tile(dst.tile_info());
- op_assign(tmp1, exp.first);
- op_assign(tmp2, exp.third);
- TWriter::op_ternary_elementwise_function(dst, exp.opcode, tmp1, exp.second, tmp2);
- }
-
- /** Represents the assignment: `\p dst = \p exp`.
- *
- * The TernaryElementwiseFunction is unpacked and its components are forwarded to
- * the underlying KernelWriter's implementation.
- *
- * @param[in] dst The tile which is assigned to.
- * @param[in] exp The TernaryElementwiseFunction representing the expression to be evaluated and assigned.
- */
- template <typename TSecond, typename TThird>
- void op_assign(const TileOperand &dst, const TernaryElementwiseFunction<TileOperand &, TSecond, TThird> &exp)
- {
- auto &tmp1 = declare_temp_tile(dst.tile_info());
- auto &tmp2 = declare_temp_tile(dst.tile_info());
- op_assign(tmp1, exp.second);
- op_assign(tmp2, exp.third);
- TWriter::op_ternary_elementwise_function(dst, exp.opcode, exp.first, tmp1, tmp2);
- }
-
- /** Represents the assignment: `\p dst = \p exp`.
- *
- * The TernaryElementwiseFunction is unpacked and its components are forwarded to
- * the underlying KernelWriter's implementation.
- *
- * @param[in] dst The tile which is assigned to.
- * @param[in] exp The TernaryElementwiseFunction representing the expression to be evaluated and assigned.
- */
- template <typename TFirst, typename TSecond, typename TThird>
- void op_assign(const TileOperand &dst, const TernaryElementwiseFunction<TFirst, TSecond, TThird> &exp)
- {
- auto &tmp1 = declare_temp_tile(dst.tile_info());
- auto &tmp2 = declare_temp_tile(dst.tile_info());
- auto &tmp3 = declare_temp_tile(dst.tile_info());
- op_assign(tmp1, exp.first);
- op_assign(tmp2, exp.second);
- op_assign(tmp3, exp.third);
- TWriter::op_ternary_elementwise_function(dst, exp.opcode, tmp1, tmp2, tmp3);
- }
-
- // ==================================================
- // Assignments
- // ==================================================
-
- /** Represents the assignment: `\p lhs += \p rhs` or `\p lhs -= \p rhs`.
- *
- * The Assignment is unpacked and its components are forwarded to
- * the underlying KernelWriter's implementation.
- *
- * @param[in] exp The Assignment representing the expression to be evaluated.
- */
- void op_assign(const Assignment<TileOperand &, TileOperand &> &exp)
- {
- if (exp.opcode == AssignmentOp::Increment)
- {
- TWriter::op_binary_expression(exp.lhs, exp.lhs, BinaryOp::Add, exp.rhs);
- }
- else if (exp.opcode == AssignmentOp::Decrement)
- {
- TWriter::op_binary_expression(exp.lhs, exp.lhs, BinaryOp::Sub, exp.rhs);
- }
- }
-
- /** Represents the assignment: `\p lhs += \p rhs` or `\p lhs -= \p rhs`.
- *
- * The Assignment is unpacked and its components are forwarded to
- * the underlying KernelWriter's implementation.
- *
- * @tparam TRight The type of the RHS of the assignment.
- * @param[in] exp The Assignment representing the expression to be evaluated.
- */
- template <typename TRight>
- void op_assign(const Assignment<TileOperand &, TRight> &exp)
- {
- auto &tmp1 = declare_temp_tile(exp.lhs.tile_info());
- op_assign(tmp1, exp.rhs);
- op_assign(Assignment<TileOperand &, TileOperand &>{exp.lhs, tmp1, exp.opcode});
- }
-
-private:
- unsigned int temp_var_counter = 0;
-
- /** Return the current counter value, then increment it.
- *
- * @return The current counter value.
- */
- int next_ctr()
- {
- return temp_var_counter++;
- }
-
- /** Gets the next temporary variable counter value,
- * and returns a suitable temporary variable name.
- *
- * @return A temporary variable name.
- */
- std::string next_tmp_var_name()
- {
- return "tmp_" + std::to_string(next_ctr());
- }
-
- /** Returns the argument.
- *
- * Used for recursion with the variadic function version of this function.
- *
- * @param[in] arg The TileInfo to return.
- * @return The \p arg.
- */
- TileInfo get_largest_size(const TileInfo &arg)
- {
- return arg;
- }
-
- /** Returns a TileInfo object where the size in each dimension (width, height) is the largest
- * of any of the TileInfo arguments in the corresponding dimension.
- *
- * @tparam TOps Must be of TileInfo type.
- * @param[in] first A TileInfo object.
- * @param[in] second A TileInfo object.
- * @param[in] ops A number of TileInfo objects.
- * @return A TileInfo object which represents the largest shape in each dimension across the arguments.
- */
- template <typename... TOps, typename = ::std::enable_if_t<std::is_same<TOps..., TileInfo>::value>>
- TileInfo get_largest_size(const TileInfo &first, const TileInfo &second, const TOps &...ops)
- {
- TileInfo largest = {first.data_type(), std::max(first.width(), second.width()),
- std::max(first.height(), second.height())};
- return get_largest_size(largest, ops...);
- }
-
- /** Helper function to define a suitable TileOperand with appropriate TileInfo
- * such that broadcasting is taken into account, based on the arguments provided.
- *
- * @tparam TArgs Must be of TileInfo type.
- * @param[in] args A number of TileInfo which determine the shape of the TileOperand to declare.
- * @return A newly created TileOperand.
- */
- template <typename... TArgs, typename = ::std::enable_if_t<std::is_same<TArgs..., TileInfo>::value>>
- TileOperand &declare_temp_tile(const TArgs &...args)
- {
- return TWriter::declare_tile(next_tmp_var_name().c_str(), get_largest_size(args...));
- }
-};
-
-} // namespace ckw
-
-#endif // CKW_INCLUDE_CKW_KERNELWRITERHELPER_H
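Illustrative sketch (not part of the removed header) of how the op_assign overloads above lowered a compound assignment; the tile names and the helper instance `writer` are hypothetical, and `TWriter` stands for any writer providing op_binary_expression:

    // Hedged sketch: `acc` and `addend` are previously declared TileOperand instances.
    ckw::Assignment<ckw::TileOperand &, ckw::TileOperand &> exp{acc, addend, ckw::AssignmentOp::Increment};
    writer.op_assign(exp);
    // ...which the helper forwards to the underlying writer as:
    //     TWriter::op_binary_expression(acc, acc, BinaryOp::Add, addend);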
diff --git a/compute_kernel_writer/prototype/include/ckw/OperandBase.h b/compute_kernel_writer/prototype/include/ckw/OperandBase.h
deleted file mode 100644
index 9842127339..0000000000
--- a/compute_kernel_writer/prototype/include/ckw/OperandBase.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef CKW_PROTOTYPE_INCLUDE_CKW_OPERANDBASE_H
-#define CKW_PROTOTYPE_INCLUDE_CKW_OPERANDBASE_H
-
-#include "ckw/types/DataType.h"
-
-#include <string>
-
-namespace ckw
-{
-namespace prototype
-{
-class IGpuKernelWriter;
-
-class Operand;
-} // namespace prototype
-
-/** The base class for all operands. */
-class OperandBase
-{
-public:
- /** Constructor
- *
- * @param[in] name The name of the operand.
- */
- explicit OperandBase(const ::std::string &name);
-
- /** Destructor */
- virtual ~OperandBase();
-
- /** (Internal use only) Create the implementation operand.
- *
- * @param[in] writer The implementation kernel writer.
- */
- virtual prototype::Operand create_impl_operand(prototype::IGpuKernelWriter *writer) const = 0;
-
- /** Get the name of the operand. */
- const ::std::string &name() const;
-
- /** Set the name of the operand. */
- OperandBase &name(const ::std::string &name);
-
- /** Get the data type of the operand. */
- virtual DataType data_type() const = 0;
-
- /** Get whether the operand is compile-time constant. */
- virtual bool is_constant() const = 0;
-
-private:
- ::std::string _name;
-};
-
-} // namespace ckw
-
-#endif // CKW_PROTOTYPE_INCLUDE_CKW_OPERANDBASE_H
diff --git a/compute_kernel_writer/prototype/include/ckw/ScalarValue.h b/compute_kernel_writer/prototype/include/ckw/ScalarValue.h
deleted file mode 100644
index 2a9c42acc8..0000000000
--- a/compute_kernel_writer/prototype/include/ckw/ScalarValue.h
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef CKW_PROTOTYPE_INCLUDE_CKW_SCALARVALUE_H
-#define CKW_PROTOTYPE_INCLUDE_CKW_SCALARVALUE_H
-
-#include "ckw/Error.h"
-
-#include <cstdint>
-
-namespace ckw
-{
-
-/** The scalar value known at compile-time. */
-class ScalarValue
-{
-public:
- /** Initialize a new instance of @ref ScalarValue class with integer value 0. */
- ScalarValue()
- {
- _type = Type::INT;
- _value.i64 = 0;
- }
-
- /** Initialize a new instance of @ref ScalarValue class with the specified value. */
- template <typename T>
- ScalarValue(T value)
- {
- set(value);
- }
-
- /** Set the value. */
- template <typename T>
- void set(T value)
- {
- CKW_ASSERT(::std::is_integral<T>::value || ::std::is_floating_point<T>::value);
- CKW_ASSERT(sizeof(T) <= 8);
-
- _size = sizeof(T);
-
- if (::std::is_integral<T>::value)
- {
- if (::std::is_signed<T>::value)
- {
- _type = Type::INT;
- _value.i64 = value;
- }
- else
- {
- _type = Type::UINT;
- _value.u64 = value;
- }
- }
- else
- {
- _type = Type::FLOAT;
- _value.f64 = value;
- }
- }
-
- /** Get the value.
- *
- * The caller must make sure that the value stored in the object fits
- * the output data type without data corruption or loss of accuracy.
- */
- template <typename T>
- T get() const
- {
- CKW_ASSERT(::std::is_integral<T>::value || ::std::is_floating_point<T>::value);
- CKW_ASSERT(sizeof(T) >= _size);
-
- if (::std::is_integral<T>::value)
- {
- if (::std::is_signed<T>::value)
- {
- CKW_ASSERT(_type == Type::INT || _type == Type::UINT);
- CKW_ASSERT_IF(_type == Type::UINT, sizeof(T) > _size);
-
- return _value.i64;
- }
- else
- {
- CKW_ASSERT(_type == Type::INT);
-
- return _value.u64;
- }
- }
- else
- {
- return _value.f64;
- }
- }
-
-private:
- union Value
- {
- int64_t i64;
- uint64_t u64;
- double f64;
- };
-
- enum class Type : int32_t
- {
- UINT,
- INT,
- FLOAT,
- };
-
- Value _value{};
- Type _type{};
- uint32_t _size{};
-};
-
-} // namespace ckw
-
-#endif // CKW_PROTOTYPE_INCLUDE_CKW_SCALARVALUE_H
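A minimal usage sketch of the removed ScalarValue class (illustrative only; the function name is hypothetical). The union stores the constant widened to 64 bits and get<T>() asserts that the requested type can hold it:

    #include "ckw/ScalarValue.h"

    void scalar_value_sketch()
    {
        ckw::ScalarValue f(3.5f);            // tagged as FLOAT, _size = sizeof(float)
        const double d = f.get<double>();    // OK: double is at least as wide as float

        ckw::ScalarValue i(int32_t{-7});     // tagged as INT
        const int64_t w = i.get<int64_t>();  // OK: widening signed read
        (void)d; (void)w;
    }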
diff --git a/compute_kernel_writer/prototype/include/ckw/TensorInfo.h b/compute_kernel_writer/prototype/include/ckw/TensorInfo.h
deleted file mode 100644
index 24da7dc8ab..0000000000
--- a/compute_kernel_writer/prototype/include/ckw/TensorInfo.h
+++ /dev/null
@@ -1,153 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef CKW_PROTOTYPE_INCLUDE_CKW_TENSORINFO_H
-#define CKW_PROTOTYPE_INCLUDE_CKW_TENSORINFO_H
-
-#include "ckw/types/DataType.h"
-
-#include <array>
-#include <cstdint>
-
-namespace ckw
-{
-/** Compute Kernel Writer tensor data layout (or memory format) */
-enum class TensorDataLayout
-{
- Unknown,
- Nhwc,
- Ndhwc
-};
-
-/** Compute Kernel Writer tensor data layout component */
-enum class TensorDataLayoutComponent
-{
- Unknown,
- N,
- D,
- H,
- W,
- C,
-};
-
-/** Compute Kernel Writer tensor component bitmask. The bitmask can be used to retrieve
- * the info from @ref TensorComponentType.
- */
-enum class TensorComponentBitmask : uint32_t
-{
- OffsetFirstElement = 0x01000000, // For example, OffsetFirstElement in @ref TensorComponentType
- Stride = 0x02000000, // For example, Stride0 in @ref TensorComponentType
- Dimension = 0x04000000, // For example, Dim0 in @ref TensorComponentType
- FoldedDimensions = 0x08000000, // For example, Dim1xDim2 in @ref TensorComponentType
-};
-
-/** Compute Kernel Writer tensor component. The tensor components are used to access specific backend-agnostic tensor arguments,
- * such as the tensor dimensions and tensor strides.
- * Each component is encoded as an integer; the category bits of its value
- * can be queried through the @ref TensorComponentBitmask.
- */
-enum class TensorComponentType : uint32_t
-{
- Unknown = 0x00000000,
- OffsetFirstElement = 0x01000000,
- Stride0 = 0x02000001,
- Stride1 = 0x02000010,
- Stride2 = 0x02000100,
- Stride3 = 0x02001000,
- Stride4 = 0x02010000,
- Dim0 = 0x04000001,
- Dim1 = 0x04000010,
- Dim2 = 0x04000100,
- Dim3 = 0x04001000,
- Dim4 = 0x04010000,
- Dim1xDim2 = 0x08000110,
- Dim2xDim3 = 0x08001100,
- Dim1xDim2xDim3 = 0x08001110
-};
-
-/** Compute Kernel Writer tensor storage. The tensor storage represents the type of tensor memory object.
- */
-enum class TensorStorageType : uint32_t
-{
- Unknown = 0x00000000,
- BufferUint8Ptr = 0x01000000,
- Texture2dReadOnly = 0x02000001,
- Texture2dWriteOnly = 0x02000010,
-};
-
-/** Compute Kernel Writer tensor shape
- * Negative dimensions can be interpreted as dynamic dimensions by the Compute Kernel Writer
- */
-using TensorShape = std::array<int32_t, 5>;
-
-/** Compute Kernel Writer tensor info */
-class TensorInfo
-{
-public:
- /** Constructor
- *
- * @param[in] dt Tensor data type
- * @param[in] shape Tensor shape
- * @param[in] dl Tensor data layout
- * @param[in] id Tensor id. The id is used to keep track of the bound user tensor. Through the id,
- * the user can know what tensor has been used by the Compute Kernel Writer.
- * Possible id values:
- * - greater than or equal to 0: bind a user-specific tensor
- * - less than 0: bind a virtual tensor (tile)
- */
- TensorInfo(DataType dt, const TensorShape &shape, TensorDataLayout dl, int32_t id);
-
- /** Set shape */
- TensorInfo &shape(const TensorShape &shape);
-
- /** Get shape */
- TensorShape shape() const;
-
- /** Set data type */
- TensorInfo &data_type(DataType dt);
-
- /** Get data type */
- DataType data_type() const;
-
- /** Set data layout */
- TensorInfo &data_layout(TensorDataLayout dl);
-
- /** Get data layout */
- TensorDataLayout data_layout() const;
-
- /** Set id */
- TensorInfo &id(int32_t id);
-
- /** Get id */
- int32_t id() const;
-
-private:
- TensorShape _shape{{0}};
- DataType _dt{DataType::Unknown};
- TensorDataLayout _dl{TensorDataLayout::Unknown};
- int32_t _id{-1};
-};
-} // namespace ckw
-
-#endif /* CKW_PROTOTYPE_INCLUDE_CKW_TENSORINFO_H */
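For reference, a sketch of how the removed TensorInfo was typically constructed; the dimension ordering in the shape literal is an assumption made for illustration only:

    #include "ckw/TensorInfo.h"

    ckw::TensorInfo make_example_tensor_info()
    {
        const ckw::TensorShape shape{{64, 32, 32, 1, 1}};   // ordering assumed for illustration
        ckw::TensorInfo info(ckw::DataType::Fp16, shape, ckw::TensorDataLayout::Nhwc, /* id */ 0);
        info.id(3);   // setters return *this, so calls can be chained
        return info;
    }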
diff --git a/compute_kernel_writer/prototype/include/ckw/TensorOperand.h b/compute_kernel_writer/prototype/include/ckw/TensorOperand.h
deleted file mode 100644
index c221b449fa..0000000000
--- a/compute_kernel_writer/prototype/include/ckw/TensorOperand.h
+++ /dev/null
@@ -1,196 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef CKW_PROTOTYPE_INCLUDE_CKW_TENSOROPERAND_H
-#define CKW_PROTOTYPE_INCLUDE_CKW_TENSOROPERAND_H
-
-#include "ckw/OperandBase.h"
-#include "ckw/TensorInfo.h"
-#include "ckw/TensorTileSampler.h"
-#include "ckw/TileOperand.h"
-#include "ckw/types/DataType.h"
-
-#include <memory>
-
-namespace ckw
-{
-
-class TensorComponentOperand;
-
-// =================================================================================================
-// TensorOperand
-// =================================================================================================
-
-/** Tensor operand */
-class TensorOperand : public OperandBase
-{
-public:
- /** Initialize a new instance of @ref TensorOperand class.
- *
- * @param[in] name The name of the tensor.
- * @param[in] info The tensor info.
- * @param[in] storage_type The tensor storage type.
- */
- TensorOperand(const ::std::string &name, const TensorInfo &info, TensorStorageType storage_type);
-
- /** No copy constructor. */
- TensorOperand(const TensorOperand &other) = delete;
-
- /** No copy assignment. */
- TensorOperand &operator=(const TensorOperand &other) = delete;
-
- /** (Internal use only) Create the implementation operand.
- *
- * @param[in] writer The implementation kernel writer.
- */
- virtual prototype::Operand create_impl_operand(prototype::IGpuKernelWriter *writer) const override;
-
- /** Get the tensor info. */
- const TensorInfo &info() const;
-
- /** Get the tensor info. */
- TensorInfo &info();
-
- /** Get the tensor storage type. */
- TensorStorageType storage_type() const;
-
- /** Get the data type. */
- virtual DataType data_type() const override;
-
- /** Get whether the tensor is compile-time constant. */
- virtual bool is_constant() const override;
-
- /** Get the default tile attached to the tensor. */
- const TileOperand &tile() const;
-
- /** Get the default tile attached to the tensor. */
- TileOperand &tile();
-
- /** Set the default tile attached to the tensor. */
- TensorOperand &tile(TileOperand &tile);
-
- /** Get the tensor sampler of the default tile. */
- const TensorTileSampler &tile_sampler() const;
-
- /** Get the tensor sampler of the default tile. */
- TensorTileSampler &tile_sampler();
-
- /** Set the tensor sampler of the default tile. */
- TensorOperand &tile_sampler(const TensorTileSampler &value);
-
- /** Get the operand that contains the stride in y dimension of the tensor. */
- TensorComponentOperand &stride1();
-
- /** Get the operand that contains the stride in z dimension of the tensor. */
- TensorComponentOperand &stride2();
-
- /** Get the operand that contains the stride in w dimension of the tensor. */
- TensorComponentOperand &stride3();
-
- /** Get the operand that contains the stride in dimension 4 of the tensor. */
- TensorComponentOperand &stride4();
-
- /** Get the operand that contains the size of dimension 0 of the tensor. */
- TensorComponentOperand &dim0();
-
- /** Get the operand that contains the size of dimension 1 of the tensor. */
- TensorComponentOperand &dim1();
-
- /** Get the operand that contains the size of dimension 2 of the tensor. */
- TensorComponentOperand &dim2();
-
- /** Get the operand that contains the size of dimension 3 of the tensor. */
- TensorComponentOperand &dim3();
-
- /** Get the operand that contains the size of dimension 4 of the tensor. */
- TensorComponentOperand &dim4();
-
- /** Get the operand that contains the size of dimensions 1 and 2 collapsed. */
- TensorComponentOperand &dim1_dim2();
-
- /** Get the operand that contains the size of dimensions 1, 2 and 3 collapsed. */
- TensorComponentOperand &dim1_dim2_dim3();
-
- /** Get the operand that contains the offset in bytes to the first element. */
- TensorComponentOperand &offset_first_element_in_bytes();
-
-private:
- TensorInfo _info;
- TensorStorageType _storage_type;
-
- TileOperand *_tile{nullptr};
- TensorTileSampler _tile_sampler{};
-
- ::std::unique_ptr<TensorComponentOperand> _stride1{nullptr};
- ::std::unique_ptr<TensorComponentOperand> _stride2{nullptr};
- ::std::unique_ptr<TensorComponentOperand> _stride3{nullptr};
- ::std::unique_ptr<TensorComponentOperand> _stride4{nullptr};
- ::std::unique_ptr<TensorComponentOperand> _dim0{nullptr};
- ::std::unique_ptr<TensorComponentOperand> _dim1{nullptr};
- ::std::unique_ptr<TensorComponentOperand> _dim2{nullptr};
- ::std::unique_ptr<TensorComponentOperand> _dim3{nullptr};
- ::std::unique_ptr<TensorComponentOperand> _dim4{nullptr};
- ::std::unique_ptr<TensorComponentOperand> _dim1_dim2{nullptr};
- ::std::unique_ptr<TensorComponentOperand> _dim1_dim2_dim3{nullptr};
- ::std::unique_ptr<TensorComponentOperand> _offset_first_element_in_bytes{nullptr};
-};
-
-// =================================================================================================
-// TensorComponentOperand
-// =================================================================================================
-
-/** Tile operand that contains tensor information. */
-class TensorComponentOperand : public TileOperand
-{
-public:
- /** Initialize a new instance of @ref TensorComponentOperand class.
- *
- * @param[in] tensor The tensor operand.
- * @param[in] component The tensor info component.
- */
- TensorComponentOperand(TensorOperand &tensor, TensorComponentType component);
-
- /** Get the tensor operand. */
- TensorOperand &tensor();
-
- /** Get the tensor operand. */
- const TensorOperand &tensor() const;
-
- /** Get the tensor component. */
- TensorComponentType component_type() const;
-
- /** (Internal use only) Create the implementation operand.
- *
- * @param[in] writer The implementation kernel writer.
- */
- virtual prototype::Operand create_impl_operand(prototype::IGpuKernelWriter *writer) const override;
-
-private:
- TensorOperand &_tensor;
- TensorComponentType _component;
-};
-
-} // namespace ckw
-
-#endif // CKW_PROTOTYPE_INCLUDE_CKW_TENSOROPERAND_H
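A sketch of how the removed TensorOperand exposed tensor metadata as tile operands (illustrative only; in practice the operand was created via KernelWriter::declare_tensor_argument rather than directly, and the names below are hypothetical):

    #include "ckw/TensorOperand.h"

    void tensor_operand_sketch()
    {
        const ckw::TensorShape shape{{8, 4, 1, 1, 1}};
        const ckw::TensorInfo info(ckw::DataType::Fp32, shape, ckw::TensorDataLayout::Nhwc, /* id */ 0);
        ckw::TensorOperand src("src", info, ckw::TensorStorageType::BufferUint8Ptr);

        // Strides and dimensions are exposed as tile operands, which
        // Kernel::arguments() later turns into kernel arguments.
        ckw::TensorComponentOperand &stride_y = src.stride1();
        ckw::TensorComponentOperand &dim0     = src.dim0();
        (void)stride_y; (void)dim0;
    }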
diff --git a/compute_kernel_writer/prototype/include/ckw/TensorTileSampler.h b/compute_kernel_writer/prototype/include/ckw/TensorTileSampler.h
deleted file mode 100644
index 606dec3535..0000000000
--- a/compute_kernel_writer/prototype/include/ckw/TensorTileSampler.h
+++ /dev/null
@@ -1,169 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef CKW_PROTOTYPE_INCLUDE_CKW_TENSORTILESAMPLER_H
-#define CKW_PROTOTYPE_INCLUDE_CKW_TENSORTILESAMPLER_H
-
-#include "ckw/types/TensorSamplerTypes.h"
-
-#include <functional>
-
-namespace ckw
-{
-
-class TileOperand;
-
-/** Tensor sampler
- *
- * It contains information about how the result tile should be stored to tensor memory.
- * It can also be used to dictate how the subsequent operators fetch the input tensor.
- */
-class TensorTileSampler
-{
-public:
- /** Initialize a new instance of @ref TensorTileSampler class. */
- TensorTileSampler();
-
- /** Initialize a new instance of @ref TensorTileSampler class.
- *
- * @param[in] x The coordinate in the x dimension.
- * @param[in] y The coordinate in the y dimension.
- * @param[in] z The coordinate in the z dimension.
- * @param[in] b The coordinate in the batch dimension.
- * @param[in] format The tensor data format.
- * @param[in] address_mode_x The address mode of the x dimension.
- * @param[in] address_mode_y The address mode of the y dimension.
- * @param[in] address_mode_z The address mode of the z dimension.
- */
- TensorTileSampler(TileOperand &x,
- TileOperand &y,
- TileOperand &z,
- TileOperand &b,
- TensorSamplerFormat format,
- TensorSamplerAddressModeX address_mode_x,
- TensorSamplerAddressModeY address_mode_y,
- TensorSamplerAddressModeZ address_mode_z);
-
- /** Initialize a new instance of @ref TensorTileSampler class.
- *
- * @param[in] x The coordinate in the x dimension.
- * @param[in] y The coordinate in the y dimension.
- * @param[in] z The coordinate in the z dimension.
- * @param[in] b The coordinate in the batch dimension.
- * @param[in] height The height of the tile.
- * @param[in] width The width of the tile.
- * @param[in] format The tensor data format.
- * @param[in] address_mode_x The address mode of the x dimension.
- * @param[in] address_mode_y The address mode of the y dimension.
- * @param[in] address_mode_z The address mode of the z dimension.
- */
- TensorTileSampler(TileOperand &x,
- TileOperand &y,
- TileOperand &z,
- TileOperand &b,
- int32_t height,
- int32_t width,
- TensorSamplerFormat format,
- TensorSamplerAddressModeX address_mode_x,
- TensorSamplerAddressModeY address_mode_y,
- TensorSamplerAddressModeZ address_mode_z);
-
- /** Get the coordinate in the x dimension. */
- const TileOperand &x() const;
-
- /** Set the coordinate in the x dimension. */
- TensorTileSampler &x(TileOperand &x);
-
- /** Get the coordinate in the y dimension. */
- const TileOperand &y() const;
-
- /** Set the coordinate in the y dimension. */
- TensorTileSampler &y(TileOperand &y);
-
- /** Get the coordinate in the z dimension. */
- const TileOperand &z() const;
-
- /** Set the coordinate in the z dimension. */
- TensorTileSampler &z(TileOperand &z);
-
- /** Get the coordinate in the batch dimension. */
- const TileOperand &b() const;
-
- /** Set the coordinate in the batch dimension. */
- TensorTileSampler &b(TileOperand &b);
-
- /** Get the width of the tile. */
- int32_t width() const;
-
- /** Set the width of the tile. */
- TensorTileSampler &width(int32_t width);
-
- /** Get the height of the tile. */
- int32_t height() const;
-
- /** Set the height of the tile. */
- TensorTileSampler &height(int32_t height);
-
- /** Get the format of the tensor. */
- TensorSamplerFormat format() const;
-
- /** Set the format of the tensor. */
- TensorTileSampler &format(TensorSamplerFormat format);
-
- /** Get the address mode of the x dimension. */
- TensorSamplerAddressModeX address_mode_x() const;
-
- /** Set the address mode of the x-dimension. */
- TensorTileSampler &address_mode_x(TensorSamplerAddressModeX address_mode_x);
-
- /** Get the address mode of the y dimension. */
- TensorSamplerAddressModeY address_mode_y() const;
-
- /** Set the address mode of the y dimension. */
- TensorTileSampler &address_mode_y(TensorSamplerAddressModeY address_mode_y);
-
- /** Get the address mode of the z dimension. */
- TensorSamplerAddressModeZ address_mode_z() const;
-
- /** Set the address mode of the z dimension. */
- TensorTileSampler &address_mode_z(TensorSamplerAddressModeZ address_mode_z);
-
-private:
- TileOperand *_x{nullptr};
- TileOperand *_y{nullptr};
- TileOperand *_z{nullptr};
- TileOperand *_b{nullptr};
-
- int32_t _height{0};
- int32_t _width{0};
-
- TensorSamplerFormat _format{TensorSamplerFormat::Unknown};
- TensorSamplerAddressModeX _address_mode_x{TensorSamplerAddressModeX::Unknown};
- TensorSamplerAddressModeY _address_mode_y{TensorSamplerAddressModeY::Unknown};
- TensorSamplerAddressModeZ _address_mode_z{TensorSamplerAddressModeZ::Unknown};
-};
-
-} // namespace ckw
-
-#endif // CKW_PROTOTYPE_INCLUDE_CKW_TENSORTILESAMPLER_H
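A sketch showing how the removed sampler was typically populated before an op_load/op_store call; the coordinate tile names and address modes chosen here are hypothetical:

    #include "ckw/TensorTileSampler.h"
    #include "ckw/TileOperand.h"

    void sampler_sketch()
    {
        ckw::TileOperand x("x", ckw::DataType::Int32);
        ckw::TileOperand y("y", ckw::DataType::Int32);
        ckw::TileOperand z("z", ckw::DataType::Int32);
        ckw::TileOperand b("b", ckw::DataType::Int32);

        ckw::TensorTileSampler sampler(x, y, z, b, /* height */ 4, /* width */ 4,
                                       ckw::TensorSamplerFormat::C_WH_1,
                                       ckw::TensorSamplerAddressModeX::None,
                                       ckw::TensorSamplerAddressModeY::ClampToBorder,
                                       ckw::TensorSamplerAddressModeZ::None);
        (void)sampler;
    }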
diff --git a/compute_kernel_writer/prototype/include/ckw/TileInfo.h b/compute_kernel_writer/prototype/include/ckw/TileInfo.h
deleted file mode 100644
index e0d064169e..0000000000
--- a/compute_kernel_writer/prototype/include/ckw/TileInfo.h
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef CKW_PROTOTYPE_INCLUDE_CKW_TILEINFO_H
-#define CKW_PROTOTYPE_INCLUDE_CKW_TILEINFO_H
-
-#include "ckw/types/DataType.h"
-
-#include <array>
-#include <cstdint>
-
-namespace ckw
-{
-// Constants to access the tile width and height in the TileShape
-constexpr int32_t kTileWidthIdx = 0;
-constexpr int32_t kTileHeightIdx = 1;
-
-/** Compute Kernel Writer tile shape. It is used to define the shape of the tile */
-using TileShape = std::array<int32_t, 2>;
-
-/** Compute Kernel Writer tile info */
-class TileInfo
-{
-public:
- /** Constructor used to initialize a scalar variable with a given data type
- *
- * @param[in] dt Tile data type
- */
- TileInfo(DataType dt);
-
- /** Constructor used to initialize a vector with a given data type and vector length.
- *
- * @param[in] dt Tile data type
- * @param[in] w Tile width (or vector length)
- */
- TileInfo(DataType dt, int32_t w);
-
- /** Constructor used to initialize a tile with a given data type and tile sizes.
- *
- * @param[in] dt Tile data type
- * @param[in] h Tile height
- * @param[in] w Tile width
- */
- TileInfo(DataType dt, int32_t h, int32_t w);
-
- /** Set width */
- TileInfo &width(int32_t w);
-
- /** Get width */
- int32_t width() const;
-
- /** Set height */
- TileInfo &height(int32_t h);
-
- /** Get height */
- int32_t height() const;
-
- /** Set data type */
- TileInfo &data_type(DataType dt);
-
- /** Get data type */
- DataType data_type() const;
-
-private:
- DataType _dt{DataType::Unknown};
- TileShape _shape{};
-};
-
-} // namespace ckw
-
-#endif /* CKW_PROTOTYPE_INCLUDE_CKW_TILEINFO_H */
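The three constructors above map onto scalars, vectors and 2D tiles; a brief sketch (function name is hypothetical):

    #include "ckw/TileInfo.h"

    void tile_info_sketch()
    {
        ckw::TileInfo scalar(ckw::DataType::Int32);     // 1x1 scalar
        ckw::TileInfo vector(ckw::DataType::Fp16, 8);   // vector of length 8 (width = 8)
        ckw::TileInfo tile(ckw::DataType::Fp32, 4, 8);  // 4 (height) x 8 (width) tile
        (void)scalar; (void)vector; (void)tile;
    }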
diff --git a/compute_kernel_writer/prototype/include/ckw/TileOperand.h b/compute_kernel_writer/prototype/include/ckw/TileOperand.h
deleted file mode 100644
index 24ee373a24..0000000000
--- a/compute_kernel_writer/prototype/include/ckw/TileOperand.h
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef CKW_PROTOTYPE_INCLUDE_CKW_TILEOPERAND_H
-#define CKW_PROTOTYPE_INCLUDE_CKW_TILEOPERAND_H
-
-#include "ckw/Error.h"
-#include "ckw/OperandBase.h"
-#include "ckw/ScalarValue.h"
-#include "ckw/TileInfo.h"
-
-#include <vector>
-
-namespace ckw
-{
-
-class Kernel;
-
-using TileContainer = std::vector<std::vector<std::string>>;
-
-/** Tile operand which can be either scalar, vector or 2D tile. */
-class TileOperand : public OperandBase
-{
-public:
- /** Initialize a new instance of @ref TileOperand class with the tile information.
- *
- * @param[in] name The name of the tile.
- * @param[in] tile_info The tile info.
- */
- TileOperand(const ::std::string &name, const TileInfo &tile_info);
-
- /** Initialize a new instance of @ref TileOperand for scalar variable.
- *
- * @param[in] name The name of the tile.
- * @param[in] data_type The data type of the tile.
- */
- TileOperand(const ::std::string &name, DataType data_type);
-
- /** Initialize a new instance of @ref TileOperand for compile-time constant scalar variable.
- *
- * @param[in] name The name of the tile.
- * @param[in] value The value of the tile.
- */
- TileOperand(const ::std::string &name, int32_t value);
-
- /** Initialize a new instance of @ref TileOperand for compile-time constant scalar variable.
- *
- * @param[in] name The name of the tile.
- * @param[in] value The value of the tile.
- */
- TileOperand(const ::std::string &name, float value);
-
- /** Initialize a new instance of @ref TileOperand for compile-time constant variable.
- *
- * @param[in] name The name of the tile.
- * @param[in] value The value of the tile.
- * @param[in] dt The data type of the tile.
- */
- TileOperand(const ::std::string &name, const ::std::vector<std::vector<std::string>> &value, DataType dt);
-
- /** Prohibit copy of tile operand. */
- TileOperand(const TileOperand &) = delete;
-
- /** Prohibit copy of tile operand. */
- TileOperand &operator=(const TileOperand &) = delete;
-
- /** (Internal use only) Create the implementation operand.
- *
- * @param[in] writer The implementation kernel writer.
- */
- virtual prototype::Operand create_impl_operand(prototype::IGpuKernelWriter *writer) const override;
-
- /** Get the tile info. */
- const TileInfo &tile_info() const;
-
- /** Get the data type of the tile. */
- virtual DataType data_type() const override;
-
- /** Get whether the tile is compile-time constant. */
- virtual bool is_constant() const override;
-
- /** Get whether the tile is a scalar value. */
- bool is_scalar() const;
-
- /** Get the scalar value of the tile.
- *
- * The tile must have the shape of 1, 1 (i.e. scalar).
- *
- * @return Scalar value as a string.
- */
- std::string scalar_value() const;
-
- /** Get the values of the tile.
- *
- * @return 2D container of values.
- */
- const TileContainer &value() const;
-
-private:
- TileInfo _info;
- TileContainer _value{};
- bool _constant;
-};
-
-} // namespace ckw
-
-#endif // CKW_PROTOTYPE_INCLUDE_CKW_TILEOPERAND_H
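A sketch contrasting variable and compile-time constant tiles as declared above (names are hypothetical; real code went through KernelWriter::declare_tile_operand / declare_tile_argument):

    #include "ckw/TileOperand.h"

    void tile_operand_sketch()
    {
        ckw::TileOperand acc("acc", ckw::TileInfo(ckw::DataType::Fp32, 4, 4)); // 4x4 variable tile
        ckw::TileOperand four("four", int32_t{4});                             // compile-time constant scalar

        const bool is_const = four.is_constant();   // true
        const std::string v = four.scalar_value();  // textual value, tile must be 1x1
        (void)acc; (void)is_const; (void)v;
    }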
diff --git a/compute_kernel_writer/prototype/include/ckw/types/ConvertPolicy.h b/compute_kernel_writer/prototype/include/ckw/types/ConvertPolicy.h
deleted file mode 100644
index 2a198507eb..0000000000
--- a/compute_kernel_writer/prototype/include/ckw/types/ConvertPolicy.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef CKW_INCLUDE_CKW_CONVERTPOLICY_H
-#define CKW_INCLUDE_CKW_CONVERTPOLICY_H
-
-#include <cstdint>
-
-namespace ckw
-{
-
-enum class ConvertPolicy : int32_t
-{
- None = 0, // No policy specified.
- Saturate = 1, // Saturated.
-};
-
-} // namespace ckw
-
-#endif //CKW_INCLUDE_CKW_CONVERTPOLICY_H
diff --git a/compute_kernel_writer/prototype/include/ckw/types/DataType.h b/compute_kernel_writer/prototype/include/ckw/types/DataType.h
deleted file mode 100644
index 3447dd61d6..0000000000
--- a/compute_kernel_writer/prototype/include/ckw/types/DataType.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
-* Copyright (c) 2023 Arm Limited.
-*
-* SPDX-License-Identifier: MIT
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to
-* deal in the Software without restriction, including without limitation the
-* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
-* sell copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in all
-* copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-
-#ifndef CKW_INCLUDE_CKW_DATATYPE_H
-#define CKW_INCLUDE_CKW_DATATYPE_H
-
-#include <cstdint>
-
-namespace ckw
-{
-
-/** Compute Kernel Writer data types. This data type is used by the code variables and tensor arguments. */
-enum class DataType : int32_t
-{
- Unknown = 0x00,
- Fp32 = 0x11,
- Fp16 = 0x12,
- Int32 = 0x21,
- Int16 = 0x22,
- Int8 = 0x24,
- Uint32 = 0x31,
- Uint16 = 0x32,
- Uint8 = 0x34,
- Bool = 0x41
-};
-
-} // namespace ckw
-
-#endif //CKW_INCLUDE_CKW_DATATYPE_H
diff --git a/compute_kernel_writer/prototype/include/ckw/types/Functions.h b/compute_kernel_writer/prototype/include/ckw/types/Functions.h
deleted file mode 100644
index c6afaa0ac8..0000000000
--- a/compute_kernel_writer/prototype/include/ckw/types/Functions.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
-* Copyright (c) 2023 Arm Limited.
-*
-* SPDX-License-Identifier: MIT
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to
-* deal in the Software without restriction, including without limitation the
-* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
-* sell copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in all
-* copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-
-#ifndef CKW_INCLUDE_CKW_FUNCTIONS_H
-#define CKW_INCLUDE_CKW_FUNCTIONS_H
-
-#include <cstdint>
-
-namespace ckw
-{
-
-enum class UnaryFunction : int32_t
-{
- Exp = 0x0000,
- Tanh = 0x0001,
- Sqrt = 0x0002,
- Erf = 0x0003,
- Fabs = 0x0004,
- Log = 0x0006,
- Round = 0x0007,
- Floor = 0x0008,
-
- // Misc
- SizeOf = 0x0009,
-};
-
-enum class BinaryFunction : int32_t
-{
- Min = 0x0000,
- Max = 0x0001,
-};
-
-enum class TernaryFunction : int32_t
-{
- Select = 0x0000,
- Clamp = 0x0001,
-};
-
-} // namespace ckw
-
-#endif //CKW_INCLUDE_CKW_FUNCTIONS_H
diff --git a/compute_kernel_writer/prototype/include/ckw/types/GpuTargetLanguage.h b/compute_kernel_writer/prototype/include/ckw/types/GpuTargetLanguage.h
deleted file mode 100644
index 6c08617949..0000000000
--- a/compute_kernel_writer/prototype/include/ckw/types/GpuTargetLanguage.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef CKW_INCLUDE_CKW_GPUTARGETLANGUAGE_H
-#define CKW_INCLUDE_CKW_GPUTARGETLANGUAGE_H
-
-#include <cstdint>
-
-namespace ckw
-{
-
-enum class GpuTargetLanguage : int32_t
-{
- Unknown,
- OpenCL
-};
-
-} // namespace ckw
-
-#endif //CKW_INCLUDE_CKW_GPUTARGETLANGUAGE_H
diff --git a/compute_kernel_writer/prototype/include/ckw/types/Operators.h b/compute_kernel_writer/prototype/include/ckw/types/Operators.h
deleted file mode 100644
index b560996837..0000000000
--- a/compute_kernel_writer/prototype/include/ckw/types/Operators.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
-* Copyright (c) 2023 Arm Limited.
-*
-* SPDX-License-Identifier: MIT
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to
-* deal in the Software without restriction, including without limitation the
-* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
-* sell copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in all
-* copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-
-#ifndef CKW_INCLUDE_CKW_OPERATORS_H
-#define CKW_INCLUDE_CKW_OPERATORS_H
-
-#include <cstdint>
-
-namespace ckw
-{
-
-enum class UnaryOp : int32_t
-{
- LogicalNot = 0x0000, // !
- BitwiseNot = 0x0001, // ~
- Negate = 0x0002, // -
-};
-
-/* Binary operations */
-enum class BinaryOp : int32_t
-{
- // Elementwise
- Add = 0x0000, // +
- Sub = 0x0001, // -
- Mul = 0x0002, // *
- Div = 0x0003, // /
- Mod = 0x0004, // %
- // Relational
- Equal = 0x1000, // ==
- Less = 0x1001, // <
- LessEqual = 0x1002, // <=
- Greater = 0x1003, // >
- GreaterEqual = 0x1004, // >=
- // Algebra
- MatMul_Nt_Nt = 0x2000, // X
- MatMul_Nt_T = 0x2001, // X
- MatMul_T_Nt = 0x2002, // X
- MatMul_T_T = 0x2003, // X
- Dot = 0x2004, // .
- // Logical
- LogicalAnd = 0x3000, // &&
- LogicalOr = 0x3001, // ||
- // Bitwise
- BitwiseXOR = 0x4000, // ^
-};
-
-enum class AssignmentOp : int32_t
-{
- // Unary
- Increment = 0x0000, // +=
- Decrement = 0x0001, // -=
-};
-
-} // namespace ckw
-
-#endif //CKW_INCLUDE_CKW_OPERATORS_H
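These enums are the ones the KernelWriterHelper removed earlier in this patch maps between; for instance, a compound assignment op reduces to a binary op roughly as follows (sketch only, the helper function is hypothetical):

    #include "ckw/types/Operators.h"

    // Sketch of the Increment/Decrement -> Add/Sub mapping used by the removed helper.
    inline ckw::BinaryOp to_binary_op(ckw::AssignmentOp op)
    {
        return op == ckw::AssignmentOp::Increment ? ckw::BinaryOp::Add : ckw::BinaryOp::Sub;
    }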
diff --git a/compute_kernel_writer/prototype/include/ckw/types/TensorSamplerTypes.h b/compute_kernel_writer/prototype/include/ckw/types/TensorSamplerTypes.h
deleted file mode 100644
index 63405a0764..0000000000
--- a/compute_kernel_writer/prototype/include/ckw/types/TensorSamplerTypes.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef CKW_INCLUDE_CKW_TENSORSAMPLERTYPES_H
-#define CKW_INCLUDE_CKW_TENSORSAMPLERTYPES_H
-
-#include <cstdint>
-
-namespace ckw
-{
-
-enum class TensorSamplerFormat : int32_t
-{
- Unknown = 0,
- C_WH_1 = 1,
- C_W_H = 2
-};
-
-enum class TensorSamplerAddressModeX : int32_t
-{
- Unknown = 0,
- None = 1, // The user guarantees that the X coordinate is always in-bound
- OverlappingMin =
- 2 // (FIXED shapes only) Reduce the load/store length when x == 0 (MIN). The load length will be width % original length
- // Leftover elements can be handled using overlapping. This involves processing some of the elements in the array twice.
-};
-
-enum class TensorSamplerAddressModeY : int32_t
-{
- Unknown = 0,
- None = 1, // The user guarantees that the Y coordinate is always in-bound
- OverlappingMin =
- 2, // (FIXED shapes only) Reduce the load/store length when y == 0 (MIN). The load length will be width % original length
- Skip = 3, // Skip the read/write
- SkipMinEdgeOnly =
- 4, // Skip greater than or equal to max only. The user guarantees that the Y coordinate is always >= 0
- SkipMaxEdgeOnly = 5, // Skip less than 0 only
- ClampToNearest = 6, // Clamp the coordinate to nearest edge (0 or max value allowed on Y)
- ClampToMinEdgeOnly = 7, // Clamp the negative coordinate to 0 only. Therefore, we expect Y to be always < MAX
- ClampToMaxEdgeOnly = 8, // Clamp the coordinate to the max value allowed on Y only. We expect Y to be always >= 0
- ClampToBorder = 9, // Clamp to border which always has 0 value
- ClampToBorderMinEdgeOnly = 10,
- ClampToBorderMaxEdgeOnly = 11
-};
-
-enum class TensorSamplerAddressModeZ : int32_t
-{
- Unknown = 0,
- None = 1, // The user guarantees that the Z coordinate is always in-bound
- Skip = 3, // Skip the read/write
- SkipMinEdgeOnly =
- 4, // Skip greater than or equal to max only. The user guarantees that the Z coordinate is always >= 0
- SkipMaxEdgeOnly = 5, // Skip less than 0 only
- ClampToNearest = 6, // Clamp the coordinate to nearest edge (0 or max value allowed on Z)
- ClampToMinEdgeOnly = 7, // Clamp the negative coordinate to 0 only. Therefore, we expect Z to be always < MAX
- ClampToMaxEdgeOnly = 8, // Clamp the coordinate to the max value allowed on Z only. We expect Z to be always >= 0
-};
-
-} // namespace ckw
-
-#endif //CKW_INCLUDE_CKW_TENSORSAMPLERTYPES_H
diff --git a/compute_kernel_writer/prototype/src/Kernel.cpp b/compute_kernel_writer/prototype/src/Kernel.cpp
deleted file mode 100644
index 6228ed17d0..0000000000
--- a/compute_kernel_writer/prototype/src/Kernel.cpp
+++ /dev/null
@@ -1,163 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "ckw/Kernel.h"
-
-#include "ckw/TensorOperand.h"
-#include "ckw/types/GpuTargetLanguage.h"
-
-#include "src/Prototype.h"
-
-namespace ckw
-{
-
-Kernel::Kernel(GpuTargetLanguage language) : Kernel{"unnamed", language}
-{
-}
-
-Kernel::Kernel(const char *name, GpuTargetLanguage language)
- : _name(name),
- _kernel(std::make_unique<prototype::GpuKernelWriterDataHolder>(language)),
- _operands{},
- _tensor_id_operands{}
-{
-}
-
-Kernel::~Kernel()
-{
-}
-
-const std::string &Kernel::name() const
-{
- return _name;
-}
-
-void Kernel::name(const std::string &name)
-{
- _name = name;
-}
-std::vector<KernelArgument> Kernel::arguments() const
-{
- std::vector<KernelArgument> arguments;
-
- const auto impl_args = _kernel->arguments.tensor_argument_declarations();
-
- for (auto tensor_arg : impl_args)
- {
- auto tensor = _tensor_id_operands.at(tensor_arg->format().id);
- arguments.push_back(*tensor);
-
- for (auto component_arg : tensor_arg->component_declarations())
- {
- switch (component_arg)
- {
- case TensorComponentType::OffsetFirstElement:
- arguments.push_back(tensor->offset_first_element_in_bytes());
- break;
-
- case TensorComponentType::Stride1:
- arguments.push_back(tensor->stride1());
- break;
-
- case TensorComponentType::Stride2:
- arguments.push_back(tensor->stride2());
- break;
-
- case TensorComponentType::Stride3:
- arguments.push_back(tensor->stride3());
- break;
-
- case TensorComponentType::Stride4:
- arguments.push_back(tensor->stride4());
- break;
-
- case TensorComponentType::Dim0:
- arguments.push_back(tensor->dim0());
- break;
-
- case TensorComponentType::Dim1:
- arguments.push_back(tensor->dim1());
- break;
-
- case TensorComponentType::Dim2:
- arguments.push_back(tensor->dim2());
- break;
-
- case TensorComponentType::Dim3:
- arguments.push_back(tensor->dim3());
- break;
-
- case TensorComponentType::Dim4:
- arguments.push_back(tensor->dim4());
- break;
-
- case TensorComponentType::Dim1xDim2:
- arguments.push_back(tensor->dim1_dim2());
- break;
-
- case TensorComponentType::Dim1xDim2xDim3:
- arguments.push_back(tensor->dim1_dim2_dim3());
- break;
-
- default:
- CKW_ASSERT(false);
- }
- }
- }
-
- return arguments;
-}
-
-TileOperand &Kernel::register_operand(std::unique_ptr<TileOperand> operand)
-{
- const auto &name = operand->name();
- auto ptr = operand.get();
-
- CKW_ASSERT(_operands.find(name) == _operands.end());
- _operands[name] = std::move(operand);
-
- return *ptr;
-}
-
-TensorOperand &Kernel::register_operand(std::unique_ptr<TensorOperand> operand)
-{
- const auto id = operand->info().id();
- const auto &name = operand->name();
- auto ptr = operand.get();
-
- CKW_ASSERT(_tensor_id_operands.find(id) == _tensor_id_operands.end());
- CKW_ASSERT(_operands.find(name) == _operands.end());
-
- _tensor_id_operands[id] = operand.get();
- _operands[name] = std::move(operand);
-
- return *ptr;
-}
-
-prototype::GpuKernelWriterDataHolder *Kernel::impl()
-{
- return _kernel.get();
-}
-
-} // namespace ckw
diff --git a/compute_kernel_writer/prototype/src/KernelArgument.cpp b/compute_kernel_writer/prototype/src/KernelArgument.cpp
deleted file mode 100644
index 24ace28eb3..0000000000
--- a/compute_kernel_writer/prototype/src/KernelArgument.cpp
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "ckw/KernelArgument.h"
-
-#include "ckw/Error.h"
-#include "ckw/TensorOperand.h"
-
-namespace ckw
-{
-
-KernelArgument::KernelArgument(TensorOperand &tensor) : _type(Type::TensorStorage), _id(tensor.info().id())
-{
- _sub_id.tensor_storage_type = tensor.storage_type();
-}
-
-KernelArgument::KernelArgument(TensorComponentOperand &tensor_component)
- : _type(Type::TensorComponent), _id(tensor_component.tensor().info().id())
-{
- _sub_id.tensor_component_type = tensor_component.component_type();
-}
-
-KernelArgument::Type KernelArgument::type() const
-{
- return _type;
-}
-
-int32_t KernelArgument::id() const
-{
- return _id;
-}
-
-TensorStorageType KernelArgument::tensor_storage_type() const
-{
- CKW_ASSERT(_type == Type::TensorStorage);
- return _sub_id.tensor_storage_type;
-}
-
-TensorComponentType KernelArgument::tensor_component_type() const
-{
- CKW_ASSERT(_type == Type::TensorComponent);
- return _sub_id.tensor_component_type;
-}
-
-} // namespace ckw
diff --git a/compute_kernel_writer/prototype/src/KernelWriter.cpp b/compute_kernel_writer/prototype/src/KernelWriter.cpp
deleted file mode 100644
index 9f58d9fefa..0000000000
--- a/compute_kernel_writer/prototype/src/KernelWriter.cpp
+++ /dev/null
@@ -1,371 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "ckw/KernelWriter.h"
-
-#include "ckw/Error.h"
-#include "ckw/TensorInfo.h"
-#include "ckw/TensorOperand.h"
-
-#include "src/Prototype.h"
-
-#include <sstream>
-
-namespace ckw
-{
-
-namespace
-{
-
-inline prototype::TensorInfo create_impl_tensor_info(const TensorInfo &info)
-{
- return prototype::TensorInfo{info.shape(), info.data_type(), info.data_layout(), info.id()};
-}
-
-} // namespace
-
-// =================================================================================================
-// Constructors and destructor
-// =================================================================================================
-
-KernelWriter::KernelWriter(Kernel &kernel)
- : _kernel(&kernel),
- _impl_attr(std::make_unique<prototype::GpuKernelWriterAttribute>()),
- _impl(prototype::GpuKernelWriterFactory::create(_impl_attr.get(), kernel.impl()))
-{
- _impl->set_IdSpace(1);
-}
-
-KernelWriter::~KernelWriter()
-{
-}
-
-// =================================================================================================
-// Scope management
-// =================================================================================================
-
-int32_t KernelWriter::id_space() const
-{
- return _id_space;
-}
-
-KernelWriter &KernelWriter::id_space(int32_t id_space)
-{
- CKW_ASSERT(id_space <= _max_id_space);
-
- _id_space = id_space;
- return *this;
-}
-
-int32_t KernelWriter::next_id_space()
-{
- id_space(++_max_id_space);
- return _id_space;
-}
-
-// =================================================================================================
-// Tensor and tile declaration
-// =================================================================================================
-
-TensorOperand &
-KernelWriter::declare_tensor_argument(const std::string &name, const TensorInfo &info, TensorStorageType storage_type)
-{
- const auto var_name = generate_variable_name(name);
-
- _impl->declare_argument(var_name, create_impl_tensor_info(info));
-
- auto &operand = _kernel->register_operand(std::make_unique<TensorOperand>(var_name, info, storage_type));
-
- return operand;
-}
-
-TileOperand &KernelWriter::declare_tile_argument(const std::string &name, int32_t value)
-{
- const auto var_name = generate_variable_name(name);
-
- auto &operand = _kernel->register_operand(std::make_unique<TileOperand>(var_name, value));
-
- return operand;
-}
-
-std::string KernelWriter::generate_variable_name(const std::string &name) const
-{
- std::stringstream var_name;
-
- var_name << "_" << _id_space << "_" << name;
-
- return var_name.str();
-}
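For illustration, the generated name is simply the base name prefixed with the current ID space (a hypothetical sketch, not taken from the original sources):

    // With _id_space == 2:
    //   generate_variable_name("src") -> "_2_src"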
-
-TileOperand &KernelWriter::declare_tile_operand(std::unique_ptr<TileOperand> operand_ptr)
-{
- auto &operand = _kernel->register_operand(std::move(operand_ptr));
- const auto &name = operand.name();
-
- if (!operand.is_constant())
- {
- const auto &info = operand.tile_info();
-
- _impl->declare_tile(name, prototype::TileInfo(info.data_type(), info.width(), info.height()));
- }
- else
- {
- _impl->declare_const_tile(name, operand.value(), operand.data_type());
- }
-
- return operand;
-}
-
-// =================================================================================================
-// Load and store
-// =================================================================================================
-
-void KernelWriter::op_load(TileOperand &tile,
- const TensorOperand &tensor,
- const TensorTileSampler &sampler,
- const TileOperand &dilation_y)
-{
- prototype::TensorOperand impl_tensor(
- tensor.name(),
- prototype::GpuSampler{sampler.format(), prototype::to_gpu_tensor_storage(tensor.storage_type()),
- sampler.address_mode_x(), sampler.address_mode_y(), sampler.address_mode_z()});
-
- auto impl_x = sampler.x().create_impl_operand(_impl.get());
- auto impl_y = sampler.y().create_impl_operand(_impl.get());
- auto impl_z = sampler.z().create_impl_operand(_impl.get());
- auto impl_b = sampler.b().create_impl_operand(_impl.get());
-
- auto impl_dilation_y = dilation_y.create_impl_operand(_impl.get());
-
- auto impl_dst = tile.create_impl_operand(_impl.get());
-
- _impl->op_load_immediate(impl_tensor, impl_dst, impl_x, impl_y, impl_z, impl_b, impl_dilation_y);
-}
-
-void KernelWriter::op_load_indirect(TileOperand &tile, const TensorOperand &tensor, const TensorTileSampler &sampler)
-{
- prototype::TensorOperand impl_tensor(
- tensor.name(),
- prototype::GpuSampler{sampler.format(), prototype::to_gpu_tensor_storage(tensor.storage_type()),
- sampler.address_mode_x(), sampler.address_mode_y(), sampler.address_mode_z()});
-
- auto impl_x = sampler.x().create_impl_operand(_impl.get());
- auto impl_y = sampler.y().create_impl_operand(_impl.get());
- auto impl_z = sampler.z().create_impl_operand(_impl.get());
- auto impl_b = sampler.b().create_impl_operand(_impl.get());
-
- auto impl_dst = tile.create_impl_operand(_impl.get());
-
- _impl->op_load_indirect(impl_tensor, impl_dst, impl_x, impl_y, impl_z, impl_b);
-}
-
-void KernelWriter::util_get_indirect_buffer(TileOperand &tile,
- const TensorOperand &tensor,
- const TensorTileSampler &sampler,
- const TileOperand &x,
- const TileOperand &y,
- const TileOperand &x_off,
- const TileOperand &y_off)
-{
- prototype::TensorOperand impl_tensor(
- tensor.name(),
- prototype::GpuSampler{sampler.format(), prototype::to_gpu_tensor_storage(tensor.storage_type()),
- sampler.address_mode_x(), sampler.address_mode_y(), sampler.address_mode_z()});
-
- auto impl_x = x.create_impl_operand(_impl.get());
- auto impl_y = y.create_impl_operand(_impl.get());
- auto impl_x_off = x_off.create_impl_operand(_impl.get());
- auto impl_y_off = y_off.create_impl_operand(_impl.get());
-
- auto impl_dst = tile.create_impl_operand(_impl.get());
-
- _impl->util_get_indirect_buffer(impl_dst, impl_tensor, impl_x, impl_y, impl_x_off, impl_y_off);
-}
-
-void KernelWriter::op_store(TensorOperand &tensor, const TileOperand &tile, const TensorTileSampler &sampler)
-{
- prototype::TensorOperand impl_tensor(
- tensor.name(),
- prototype::GpuSampler{sampler.format(), prototype::to_gpu_tensor_storage(tensor.storage_type()),
- sampler.address_mode_x(), sampler.address_mode_y(), sampler.address_mode_z()});
- auto impl_src = tile.create_impl_operand(_impl.get());
- auto impl_x = sampler.x().create_impl_operand(_impl.get());
- auto impl_y = sampler.y().create_impl_operand(_impl.get());
- auto impl_z = sampler.z().create_impl_operand(_impl.get());
- auto impl_b = sampler.b().create_impl_operand(_impl.get());
-
- _impl->op_store_immediate(impl_tensor, impl_src, impl_x, impl_y, impl_z, impl_b);
-}
-
-// =================================================================================================
-// Data processing
-// =================================================================================================
-
-void KernelWriter::op_assign(const TileOperand &dst, const TileOperand &src)
-{
- auto impl_dst = dst.create_impl_operand(_impl.get());
- auto impl_src = src.create_impl_operand(_impl.get());
-
- _impl->op_assign(impl_dst, impl_src);
-}
-
-void KernelWriter::op_cast_expression(const TileOperand &dst, const TileOperand &src, const ConvertPolicy policy)
-{
- auto impl_dst = dst.create_impl_operand(_impl.get());
- auto impl_src = src.create_impl_operand(_impl.get());
-
- _impl->op_cast_expression(impl_dst, impl_src, policy);
-}
-
-void KernelWriter::op_binary_expression(const TileOperand &dst,
- const TileOperand &lhs,
- BinaryOp op,
- const TileOperand &rhs)
-{
- auto impl_lhs = lhs.create_impl_operand(_impl.get());
- auto impl_rhs = rhs.create_impl_operand(_impl.get());
- auto impl_dst = dst.create_impl_operand(_impl.get());
-
- _impl->op_binary_expression(impl_dst, impl_lhs, op, impl_rhs);
-}
-
-void KernelWriter::op_unary_expression(const TileOperand &dst, UnaryOp op, const TileOperand &src)
-{
- auto impl_dst = dst.create_impl_operand(_impl.get());
- auto impl_src = src.create_impl_operand(_impl.get());
-
- _impl->op_unary_expression(impl_dst, op, impl_src);
-}
-
-void KernelWriter::op_unary_elementwise_function(const TileOperand &dst, UnaryFunction opcode, const TileOperand &src)
-{
- auto impl_dst = dst.create_impl_operand(_impl.get());
- auto impl_src = src.create_impl_operand(_impl.get());
-
- _impl->op_unary_elementwise_function(impl_dst, opcode, impl_src);
-}
-
-void KernelWriter::op_binary_elementwise_function(const TileOperand &dst,
- BinaryFunction opcode,
- const TileOperand &first,
- const TileOperand &second)
-{
- auto impl_dst = dst.create_impl_operand(_impl.get());
- auto impl_first = first.create_impl_operand(_impl.get());
- auto impl_second = second.create_impl_operand(_impl.get());
-
- _impl->op_binary_elementwise_function(impl_dst, opcode, impl_first, impl_second);
-}
-
-void KernelWriter::op_ternary_elementwise_function(const TileOperand &dst,
- TernaryFunction opcode,
- const TileOperand &first,
- const TileOperand &second,
- const TileOperand &third)
-{
- auto impl_dst = dst.create_impl_operand(_impl.get());
- auto impl_first = first.create_impl_operand(_impl.get());
- auto impl_second = second.create_impl_operand(_impl.get());
- auto impl_third = third.create_impl_operand(_impl.get());
-
- _impl->op_ternary_elementwise_function(impl_dst, opcode, impl_first, impl_second, impl_third);
-}
-
-void KernelWriter::op_if(const TileOperand &lhs, BinaryOp op, const TileOperand &rhs, const std::function<void()> &body)
-{
- auto impl_lhs = lhs.create_impl_operand(_impl.get());
- auto impl_rhs = rhs.create_impl_operand(_impl.get());
-
- _impl->op_if_header(impl_lhs, op, impl_rhs);
- _impl->compound_statement_begin();
- body();
- _impl->compound_statement_end();
-}
-
-void KernelWriter::op_else_if(const TileOperand &lhs,
- BinaryOp op,
- const TileOperand &rhs,
- const std::function<void()> &body)
-{
- auto impl_lhs = lhs.create_impl_operand(_impl.get());
- auto impl_rhs = rhs.create_impl_operand(_impl.get());
-
- _impl->op_else_if_header(impl_lhs, op, impl_rhs);
- _impl->compound_statement_begin();
- body();
- _impl->compound_statement_end();
-}
-
-void KernelWriter::op_else(const std::function<void()> &body)
-{
- _impl->op_else_header();
- _impl->compound_statement_begin();
- body();
- _impl->compound_statement_end();
-}
-
-void KernelWriter::op_for_loop(const TileOperand &var_name,
- BinaryOp cond_op,
- const TileOperand &cond_value_name,
- const TileOperand &update_var_name,
- AssignmentOp update_op,
- const TileOperand &update_value_name,
- const std::function<void()> &body)
-{
- auto impl_var_name = var_name.create_impl_operand(_impl.get());
- auto impl_cond_value_name = cond_value_name.create_impl_operand(_impl.get());
- auto impl_update_var_name = update_var_name.create_impl_operand(_impl.get());
- auto impl_update_value_name = update_value_name.create_impl_operand(_impl.get());
-
- _impl->op_for_loop_header(impl_var_name, cond_op, impl_cond_value_name, impl_update_var_name, update_op,
- impl_update_value_name);
- _impl->compound_statement_begin();
- body();
- _impl->compound_statement_end();
-}
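A hypothetical caller supplies the loop body as a callable, which is emitted between the braces of the generated for-loop; writer, i, len, step, acc and x below are made-up KernelWriter/TileOperand names (illustrative sketch only):

    writer.op_for_loop(i, BinaryOp::Less, len, i, AssignmentOp::Increment, step, [&]()
    {
        writer.op_binary_expression(acc, acc, BinaryOp::Add, x);
    });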
-
-// =================================================================================================
-// Misc
-// =================================================================================================
-
-void KernelWriter::op_get_global_id(const TileOperand &dst, int32_t dim)
-{
- _impl->op_get_global_id(prototype::Operand(dst.name()), dim);
-}
-
-void KernelWriter::op_return()
-{
- _impl->op_return();
-}
-
-// =================================================================================================
-// Code generation
-// =================================================================================================
-
-std::string KernelWriter::generate_code()
-{
- return prototype::generate_code(*_kernel->impl(), _kernel->name());
-}
-
-} // namespace ckw
diff --git a/compute_kernel_writer/prototype/src/OperandBase.cpp b/compute_kernel_writer/prototype/src/OperandBase.cpp
deleted file mode 100644
index e0617fdc06..0000000000
--- a/compute_kernel_writer/prototype/src/OperandBase.cpp
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "ckw/OperandBase.h"
-
-namespace ckw
-{
-
-OperandBase::OperandBase(const std::string &name) : _name(name)
-{
-}
-
-OperandBase::~OperandBase()
-{
-}
-
-const std::string &OperandBase::name() const
-{
- return _name;
-}
-
-OperandBase &OperandBase::name(const std::string &name)
-{
- _name = name;
- return *this;
-}
-
-} // namespace ckw
diff --git a/compute_kernel_writer/prototype/src/Prototype.h b/compute_kernel_writer/prototype/src/Prototype.h
deleted file mode 100644
index b392fe2651..0000000000
--- a/compute_kernel_writer/prototype/src/Prototype.h
+++ /dev/null
@@ -1,4189 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef CKW_PROTOTYPE_SRC_PROTOTYPE_H
-#define CKW_PROTOTYPE_SRC_PROTOTYPE_H
-
-#include "ckw/Error.h"
-#include "ckw/TensorInfo.h"
-#include "ckw/types/ConvertPolicy.h"
-#include "ckw/types/DataType.h"
-#include "ckw/types/Functions.h"
-#include "ckw/types/GpuTargetLanguage.h"
-#include "ckw/types/Operators.h"
-#include "ckw/types/TensorSamplerTypes.h"
-
-#include <algorithm>
-#include <array>
-#include <cassert> // assert (to be removed)
-#include <chrono>
-#include <cmath>
-#include <cstdint> // int32_t
-#include <functional>
-#include <iostream> // cout (to be removed)
-#include <map>
-#include <memory>
-#include <stdexcept>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-namespace ckw
-{
-namespace prototype
-{
-
-// Dummy data structure for Size2D
-using Size2D = std::vector<int32_t>;
-
-// Dummy Status
-using Status = void;
-
-enum class ComponentType : int32_t
-{
- Complex = 0,
- Simple = 1,
- Unfusable = 2
-};
-
-enum class GpuCompilationSpeed
-{
- Fast = 0x00, // fast compilation may increase the latency of the network
- Slow = 0x01 // slow compilation may decrease the latency of the network
-};
-
-enum class GpuExtensions
-{
- Fp16,
- Dot8,
- Mmul,
- FastMath
-};
-
-struct TensorInfo
-{
- TensorShape shape{{0}};
- DataType data_type{DataType::Unknown};
- TensorDataLayout data_layout{TensorDataLayout::Nhwc};
- int32_t id{-1};
-};
-
-struct ComponentAttribute
-{
- GpuCompilationSpeed compilation_speed{GpuCompilationSpeed::Fast};
- bool overwrite_tile{true};
-};
-
-inline std::string data_type_to_cl_type(DataType dt)
-{
- switch (dt)
- {
- case DataType::Fp32:
- return "float";
- case DataType::Fp16:
- return "half";
- case DataType::Int8:
- return "char";
- case DataType::Uint8:
- return "uchar";
- case DataType::Uint16:
- return "ushort";
- case DataType::Int16:
- return "short";
- case DataType::Uint32:
- return "uint";
- case DataType::Int32:
- return "int";
- case DataType::Bool:
- return "bool";
- default:
- assert(false);
- return "";
- }
-}
-
-inline int32_t width_to_cl_vector_size(int32_t width)
-{
- switch (width)
- {
- case 1:
- return 1;
- case 2:
- return 2;
- case 3:
- return 3;
- case 4:
- return 4;
- case 5:
- case 6:
- case 7:
- case 8:
- return 8;
- case 9:
- case 10:
- case 11:
- case 12:
- case 13:
- case 14:
- case 15:
- case 16:
- return 16;
- default:
- assert(false);
- return 0;
- }
-}
-
-inline std::string get_cl_data_type(DataType dt, int32_t width)
-{
- std::string data_type;
- int32_t w = width_to_cl_vector_size(width);
- data_type += data_type_to_cl_type(dt);
- if (w != 1)
- {
- data_type += std::to_string(w);
- }
- return data_type;
-}
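A minimal sketch of what these CL type helpers resolve to, derived from the switch statements above (illustrative only):

    assert(width_to_cl_vector_size(5) == 8);                   // widths 5..8 round up to 8
    assert(get_cl_data_type(DataType::Fp32, 1) == "float");    // scalar types carry no suffix
    assert(get_cl_data_type(DataType::Fp16, 3) == "half3");    // vector widths are appended
    assert(get_cl_data_type(DataType::Int32, 12) == "int16");  // 12 rounds up to 16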
-
-inline std::string to_opencl_store(int32_t vector_length)
-{
- if (vector_length != 1)
- {
- return "vstore" + std::to_string(vector_length) + "(";
- }
- else
- {
- return "*(";
- }
-}
-
-struct TileInfo
-{
- TileInfo()
- {
- }
-
- TileInfo(DataType dt) : dt(dt), w(1), h(1)
- {
- }
-
- TileInfo(DataType dt, int32_t width) : dt(dt), w(width), h(1)
- {
- }
-
- TileInfo(DataType dt, int32_t width, int32_t height) : dt(dt), w(width), h(height)
- {
- }
-
- DataType dt{DataType::Unknown}; // Data type of the tile
- int32_t w{0}; // Width (i.e. c0 - portion of the channels)
- int32_t h{0}; // Height (i.e. s0 - portion of the spatial dimensions)
-};
-
-inline std::ostream &operator<<(std::ostream &o, const TileInfo &a)
-{
- o << a.w << " x " << a.h;
- return o;
-}
-
-struct DataTypeAsString
-{
- std::string str{""};
- DataType dt{DataType::Unknown};
- int32_t size{1};
-};
-
-struct ValueAsString
-{
- std::string str{""};
- DataTypeAsString type{};
-};
-
-// https://stackoverflow.com/questions/51515378/storing-and-accessing-tile-properties-in-c
-// A Tile is a collection of variables used to express a 2D block of data.
-class IScalarTile
-{
-public:
- virtual ~IScalarTile() = default;
-
- /** Method to get the scalar variable from a tile
- * @param[in] x X coordinate on the width of the tile. If out-of-bound, the coordinate is clamped to the nearest valid edge
- * @param[in] y Y coordinate on the height of the tile. If out-of-bound, the coordinate is clamped to the nearest valid edge
- *
- * @return the scalar variable as a string
- */
- virtual ValueAsString scalar(int32_t x, int32_t y) const = 0;
-
- /** Method to get the list of underlying variable names used by the tile
- *
- * @return the list of variable names
- */
- virtual std::vector<ValueAsString> underlying_source_variables() const = 0;
-
- /** Method to get the name of the tile.
- *
- * @return the name of the tile
- */
- std::string name() const
- {
- return _basename;
- }
-
- /** Method to get the tile format
- *
- * @return the format
- */
- TileInfo format() const
- {
- return _format;
- }
-
- /** Method to know whether the tile is assignable or not (constant)
- *
- * @return true if the tile is assignable
- */
- virtual bool is_assignable() const = 0;
-
- /** Method to know whether the tile needs to be declared
- *
- * @return true if the tile needs to be declared in the code before being used
- */
- virtual bool need_declaration() const = 0;
-
-protected:
- TileInfo _format{}; // Tile format
- std::string _basename{""}; // Tile name
-};
-
-// A tile is a collection of variables used to express a 2D block of data. The variables are vectors in the GPU context.
-// The vector size is given by the width of the tile, and the height of the tile gives the number of vectors.
-class IVectorTile : public IScalarTile
-{
-public:
- virtual ~IVectorTile() = default;
-
- /** Method to get the vector variable from a tile. A vector is an ordered homogeneous collection of two or more scalars.
- * The user can query the list of supported width for the vectors through preferred_vector_sizes().
- *
- * @param[in] y Y coordinate on the height of the tile. If out-of-bound, the coordinate is clamped to the nearest valid edge
- *
- * @return the vector variable as a string
- */
- virtual ValueAsString vector(int32_t y) const = 0;
-
- /** Method to get a vector variable from a tile. A vector is an ordered homogeneous collection of two or more scalars.
- *
- * @return the vector variable as a string
- */
- virtual ValueAsString vector(int32_t x_start, int32_t width, int32_t y) const = 0;
- /** Method to get the preferred vector sizes.
- *
- * @return a vector with the preferred vector sizes
- */
- //virtual std::vector<int32_t> preferred_vector_sizes() const = 0;
-};
-
-class ClTile : public IVectorTile
-{
-public:
- ClTile(const std::string &name, TileInfo format)
- {
- _format = format;
- _basename = name;
- }
-
- ValueAsString scalar(int32_t x, int32_t y) const override
- {
- x = std::max(std::min(x, _format.w - 1), static_cast<int32_t>(0));
- y = std::max(std::min(y, _format.h - 1), static_cast<int32_t>(0));
-
- ValueAsString t;
- t.str = build_variable_name(y);
- t.type.str = get_cl_data_type(_format.dt, 1);
- t.type.dt = _format.dt;
- t.type.size = 1;
-
- // Check required because if the width has only one element, we cannot use .s0
- if (_format.w != 1)
- {
- // Automatic broadcasting
- t.str += ".s" + std::to_string(x);
- }
-
- return t;
- }
-
- ValueAsString vector(int32_t y) const override
- {
- y = std::max(std::min(y, _format.h - 1), static_cast<int32_t>(0));
-
- ValueAsString t;
- t.str = build_variable_name(y);
- t.type.str = get_cl_data_type(_format.dt, _format.w);
- t.type.dt = _format.dt;
- t.type.size = _format.w;
- return t;
- }
-
- ValueAsString vector(int32_t x_start, int32_t width, int32_t y) const override
- {
- y = std::max(std::min(y, _format.h - 1), static_cast<int32_t>(0));
-
- ValueAsString t;
- t.str = build_variable_name(y);
- t.type.str = get_cl_data_type(_format.dt, width);
- t.type.dt = _format.dt;
- t.type.size = width;
-
- if (_format.w != 1)
- {
- t.str += ".s";
- for (int i = 0; i < width; ++i)
- {
- t.str += to_scalar_hex(x_start + i);
- }
- }
- return t;
- }
-
- std::vector<ValueAsString> underlying_source_variables() const override
- {
- std::vector<ValueAsString> vars;
- for (int32_t y = 0; y < _format.h; ++y)
- {
- ValueAsString t;
- t.str = build_variable_name(y);
- t.type.str = get_cl_data_type(_format.dt, _format.w);
- t.type.dt = _format.dt;
- t.type.size = _format.w;
- vars.push_back(t);
- }
- return vars;
- }
-
- bool is_assignable() const override
- {
- return true;
- }
-
- bool need_declaration() const override
- {
- return true;
- }
-
-private:
- std::string build_variable_name(int32_t y) const
- {
- std::string var_name = _basename;
-
- if (_format.h == 1)
- {
- return var_name;
- }
- else
- {
- var_name += "_";
- var_name += std::to_string(y);
- }
-
- return var_name;
- }
-
- std::string to_scalar_hex(int32_t x) const
- {
- switch (x)
- {
- case 0:
- case 1:
- case 2:
- case 3:
- case 4:
- case 5:
- case 6:
- case 7:
- case 8:
- case 9:
- return std::to_string(x);
- case 10:
- return "A";
- case 11:
- return "B";
- case 12:
- return "C";
- case 13:
- return "D";
- case 14:
- return "E";
- case 15:
- return "F";
- default:
- std::cout << "Unsupported hexadecimal value" << std::endl;
- assert(false);
- return "";
- }
- }
-};
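A short sketch of the variable names a ClTile resolves to, assuming a 4 (width) x 2 (height) fp32 tile called "acc" (illustrative only):

    ClTile acc("acc", TileInfo(DataType::Fp32, 4, 2));
    assert(acc.vector(0).str == "acc_0");        // one float4 vector per row; h > 1 appends "_<row>"
    assert(acc.scalar(2, 1).str == "acc_1.s2");  // scalar access uses the OpenCL .s<idx> swizzle
    assert(acc.scalar(9, 9).str == "acc_1.s3");  // out-of-bound coordinates are clamped to the edge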
-
-// Unique feature: it stores its values directly as strings. The class name is misleading, since those string values can change over time.
-class ClConstantTile : public IVectorTile
-{
-public:
- ClConstantTile(const std::vector<std::vector<std::string>> &in, DataType dt)
- {
- _format.w = in[0].size();
- _format.h = in.size();
- _format.dt = dt;
-
- _data = std::vector<std::vector<std::string>>(_format.h, std::vector<std::string>(_format.w));
-
- for (int32_t y = 0; y < _format.h; ++y)
- {
- for (int32_t x = 0; x < _format.w; ++x)
- {
- _data[y][x] = in[y][x];
- }
- }
- }
-
- ValueAsString scalar(int32_t x, int32_t y) const override
- {
- x = std::max(std::min(x, _format.w - 1), static_cast<int32_t>(0));
- y = std::max(std::min(y, _format.h - 1), static_cast<int32_t>(0));
-
- ValueAsString t;
- t.str = _data[y][x];
- t.type.str = get_cl_data_type(_format.dt, 1);
- t.type.dt = _format.dt;
- t.type.size = 1;
-
- return t;
- }
-
- ValueAsString vector(int32_t y) const override
- {
- y = std::max(std::min(y, _format.h - 1), static_cast<int32_t>(0));
-
- return vector(0, _format.w, y);
- }
-
- ValueAsString vector(int32_t x_start, int32_t width, int32_t y) const override
- {
- y = std::max(std::min(y, _format.h - 1), static_cast<int32_t>(0));
-
- ValueAsString t;
- t.str = "";
- t.type.str = get_cl_data_type(_format.dt, width);
- t.type.dt = _format.dt;
- t.type.size = width;
-
- if (width > 1)
- {
- t.str += "((" + get_cl_data_type(_format.dt, width) + ")(";
- }
-
- int32_t x = x_start;
-        for (; x < x_start + width - 1; ++x)
- {
- t.str += scalar(x, y).str;
- t.str += ", ";
- }
- t.str += scalar(x, y).str;
-
- if (width > 1)
- {
- t.str += "))";
- }
-
- return t;
- }
-
- std::vector<ValueAsString> underlying_source_variables() const override
- {
- std::vector<ValueAsString> vars;
-
- for (int32_t y = 0; y < _format.h; ++y)
- {
- for (int32_t x = 0; x < _format.w; ++x)
- {
- ValueAsString t;
- t.str = _data[y][x];
- t.type.str = get_cl_data_type(_format.dt, 1);
- t.type.dt = _format.dt;
- t.type.size = 1;
- vars.push_back(t);
- }
- }
-
- return vars;
- }
-
- bool is_assignable() const override
- {
- return false;
- }
-
- bool need_declaration() const override
- {
- return false;
- }
-
-private:
- std::vector<std::vector<std::string>> _data{};
-};
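By contrast, a constant tile materialises as an OpenCL vector literal built from its string values, as in this illustrative sketch:

    ClConstantTile half_two({{"0.5f", "2.0f"}}, DataType::Fp32);    // 2 (width) x 1 (height)
    assert(half_two.scalar(1, 0).str == "2.0f");
    assert(half_two.vector(0).str == "((float2)(0.5f, 2.0f))");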
-
-enum class TensorComponentIndex : int32_t
-{
- IndexMask = 0x0000000f,
-};
-
-enum class TensorComponentGroup : int32_t
-{
- OffsetFirstElement = 0x00000100,
- Stride = 0x00001000,
- Dimension = 0x00010000,
- FoldedDimension = 0x00100000,
- Constant = 0x01000000
-};
-
-inline std::string to_string(TensorComponentType x)
-{
- switch (x)
- {
- case TensorComponentType::Unknown:
- return "Unknown";
- case TensorComponentType::OffsetFirstElement:
- return "OffsetFirstElement";
- case TensorComponentType::Stride1:
- return "Stride1";
- case TensorComponentType::Stride2:
- return "Stride2";
- case TensorComponentType::Stride3:
- return "Stride3";
- case TensorComponentType::Stride4:
- return "Stride4";
- case TensorComponentType::Dim0:
- return "Dim0";
- case TensorComponentType::Dim1:
- return "Dim1";
- case TensorComponentType::Dim2:
- return "Dim2";
- case TensorComponentType::Dim3:
- return "Dim3";
- case TensorComponentType::Dim4:
- return "Dim4";
- case TensorComponentType::Dim1xDim2:
- return "Dim1xDim2";
- case TensorComponentType::Dim1xDim2xDim3:
- return "Dim1xDim2xDim3";
- default:
- assert(false);
- return "";
- }
-}
-
-class ITensorArgument
-{
-public:
- virtual ~ITensorArgument() = default;
-
- /** Method to get the tensor component as a string
- *
- * @param[in] x tensor component to query
- *
- * @return the tensor component as a string
- */
- virtual std::string component(TensorComponentType x) = 0;
-
- /** Method to get the tensor component type declaration as a string
- *
- * @return the tensor component type declaration as a string
- */
- virtual std::string component_type_declaration() const = 0;
-
- /** Method to get the tensor component data type
- *
- * @return the tensor component data type
- */
- virtual DataType component_data_type() const = 0;
-
- /** Method to get the tensor component declarations
- *
- * @return a vector containing the tensor component declarations
- */
- virtual std::vector<TensorComponentType> component_declarations() const = 0;
-
- /** Method to get the name of the tensor argument.
- *
- * @return the name of the tensor argument
- */
- std::string name() const
- {
- return _basename;
- }
-
- /** Method to get the tensor format
- *
- * @return the format
- */
- TensorInfo format() const
- {
- return _format;
- }
-
-protected:
- TensorInfo _format{};
- std::string _basename{};
-};
-
-enum class GpuTensorStorage : int32_t
-{
- Unknown = 0x0000,
- BufferUint8Ptr = 0x0012,
- Image2dReadOnly = 0x0020,
- Image2dWriteOnly = 0x0021,
- Image3dReadOnly = 0x0030,
- Image3dWriteOnly = 0x0031
-};
-
-inline GpuTensorStorage to_gpu_tensor_storage(TensorStorageType s)
-{
- switch (s)
- {
- case TensorStorageType::Unknown:
- return GpuTensorStorage::Unknown;
-
- case TensorStorageType::BufferUint8Ptr:
- return GpuTensorStorage::BufferUint8Ptr;
-
- case TensorStorageType::Texture2dReadOnly:
- return GpuTensorStorage::Image2dReadOnly;
-
- case TensorStorageType::Texture2dWriteOnly:
- return GpuTensorStorage::Image2dWriteOnly;
-
- default:
- assert(false);
- return GpuTensorStorage::Unknown;
- }
-}
-
-inline TensorStorageType to_tensor_storage(GpuTensorStorage s)
-{
- switch (s)
- {
- case GpuTensorStorage::Unknown:
- return TensorStorageType::Unknown;
-
- case GpuTensorStorage::BufferUint8Ptr:
- return TensorStorageType::BufferUint8Ptr;
-
- case GpuTensorStorage::Image2dReadOnly:
- return TensorStorageType::Texture2dReadOnly;
-
- case GpuTensorStorage::Image2dWriteOnly:
- return TensorStorageType::Texture2dWriteOnly;
-
- default:
- assert(false);
- return TensorStorageType::Unknown;
- }
-}
-
-class IGpuTensorArgument : public ITensorArgument
-{
-public:
- virtual ~IGpuTensorArgument() = default;
-
-    /** Method to get the tensor storage, which is the underlying storage used to hold the tensor data in memory
- *
- * @param[in] x tensor storage to query
- *
- * @return the tensor storage as a string
- */
- virtual std::string storage(GpuTensorStorage x) = 0;
-
- /** Method to get the tensor storage type declaration as a string
- *
- * @param[in] x tensor component to query
- *
- * @return the tensor storage type declaration as a string
- */
- virtual std::string storage_type_declaration(GpuTensorStorage x) const = 0;
-
- /** Method to get the tensor storage declarations
- *
- * @return a vector containing the tensor storage declarations
- */
- virtual std::vector<GpuTensorStorage> storage_declarations() const = 0;
-};
-
-class ClTensorArgument : public IGpuTensorArgument
-{
-public:
- ClTensorArgument(const std::string &name, const TensorInfo &x, bool return_by_value_when_possible)
- {
- _basename = name;
- _format = x;
- _return_by_value_when_possible = return_by_value_when_possible;
- }
-
- // Methods to override
- std::string component(TensorComponentType x) override
- {
- if ((static_cast<int32_t>(x) & static_cast<int32_t>(TensorComponentGroup::Constant)))
- {
- int32_t idx = static_cast<int32_t>(x) & static_cast<int32_t>(TensorComponentIndex::IndexMask);
- return std::to_string(idx - 1);
- }
-
- if (_return_by_value_when_possible)
- {
- if ((static_cast<int32_t>(x) & static_cast<int32_t>(TensorComponentGroup::Dimension)))
- {
- int32_t idx = static_cast<int32_t>(x) & static_cast<int32_t>(TensorComponentIndex::IndexMask);
- return std::to_string(_format.shape[idx]);
- }
-
- if ((static_cast<int32_t>(x) & static_cast<int32_t>(TensorComponentGroup::FoldedDimension)))
- {
- switch (x)
- {
- case TensorComponentType::Dim1xDim2:
- return std::to_string(_format.shape[1] * _format.shape[2]);
- case TensorComponentType::Dim1xDim2xDim3:
-                    return std::to_string(_format.shape[1] * _format.shape[2] * _format.shape[3]);
- default:
- std::cout << "Unsupported folded dimension" << std::endl;
- assert(false);
- }
- }
- }
-
- if (std::find(_components_required.begin(), _components_required.end(), x) == _components_required.end())
- {
- _components_required.push_back(x);
- }
-
- return build_component_name(x);
- }
-
- std::string component_type_declaration() const override
- {
- return "int";
- };
-
- DataType component_data_type() const override
- {
- return DataType::Int32;
- }
-
- std::string storage(GpuTensorStorage x) override
- {
- if (std::find(_storage_required.begin(), _storage_required.end(), x) == _storage_required.end())
- {
- _storage_required.push_back(x);
- }
-
- return build_storage_name(x);
- }
-
- std::string storage_type_declaration(GpuTensorStorage x) const override
- {
- switch (x)
- {
- case GpuTensorStorage::BufferUint8Ptr:
- return "__global uchar*";
- case GpuTensorStorage::Image2dReadOnly:
- return "__read_only image2d_t";
- case GpuTensorStorage::Image2dWriteOnly:
- return "__write_only image2d_t";
- case GpuTensorStorage::Image3dReadOnly:
- return "__read_only image3d_t ";
- case GpuTensorStorage::Image3dWriteOnly:
- return "__write_only image3d_t ";
- default:
- std::cout << "Unsupported storage" << std::endl;
- assert(false);
- return "";
- }
- };
-
- std::vector<GpuTensorStorage> storage_declarations() const override
- {
- return _storage_required;
- }
-
- std::vector<TensorComponentType> component_declarations() const override
- {
- return _components_required;
- }
-
-private:
- std::string build_storage_name(GpuTensorStorage x) const
- {
- std::string var_name = _basename;
-
- switch (x)
- {
- case GpuTensorStorage::BufferUint8Ptr:
- return var_name + "_ptr";
- case GpuTensorStorage::Image2dReadOnly:
- case GpuTensorStorage::Image2dWriteOnly:
- return var_name + "_img2d";
- case GpuTensorStorage::Image3dReadOnly:
- case GpuTensorStorage::Image3dWriteOnly:
- return var_name + "_img3d";
- default:
- std::cout << "Unsupported storage" << std::endl;
- assert(false);
- }
-
- return var_name;
- }
-
- std::string build_component_name(TensorComponentType x) const
- {
- std::string var_name = _basename;
-
- switch (x)
- {
- case TensorComponentType::OffsetFirstElement:
- return var_name + "_offset_first_element";
- case TensorComponentType::Stride1:
- return var_name + "_stride1";
- case TensorComponentType::Stride2:
- return var_name + "_stride2";
- case TensorComponentType::Stride3:
- return var_name + "_stride3";
- case TensorComponentType::Dim0:
- return var_name + "_dim0";
- case TensorComponentType::Dim1:
- return var_name + "_dim1";
- case TensorComponentType::Dim2:
- return var_name + "_dim2";
- case TensorComponentType::Dim3:
- return var_name + "_dim3";
- case TensorComponentType::Dim1xDim2:
- return var_name + "_dim1xdim2";
- case TensorComponentType::Dim1xDim2xDim3:
- return var_name + "_dim1xdim2xdim3";
- default:
- std::cout << "Unsupported component" << std::endl;
- assert(false);
- }
-
- return var_name;
- }
-
- bool _return_by_value_when_possible{false};
- std::vector<GpuTensorStorage> _storage_required{};
- std::vector<TensorComponentType> _components_required{};
-};
-
-/**
- * @brief Data structure that contains the declared tiles by the components.
- * The registry is a linear data structure that follows a principle similar to a stack. The user can call the @p increment_registry_level() method to
- * increase the level of the stack (0 when it starts). When the user calls the @p decrement_registry_level() method, the registry decreases the level of the stack
- * and removes (pops) all the tiles declared at the level being left.
- * When a tile is declared on the level 0, it is a global tile. A global tile is visible in all parts of the code.
- * Since different components may use the same name to define a tile, the registry adopts the IdSpace concept, an @p id to prevent name collisions
- * when declaring tiles among different components.
- *
- */
-class GpuTileRegistry
-{
-public:
- enum class RegistryTileType
- {
- Tile,
- Link
- };
-
- using RegistryIdSpace = int32_t;
- using RegistryLevel = int32_t;
- using RegistryTileName = std::string;
-
- struct RegistryTileTableEntry
- {
- RegistryLevel registry_level{0};
- std::unique_ptr<IVectorTile> tile_object{nullptr};
- };
-
- struct RegistryTileTypeTableEntry
- {
- RegistryTileType tile_type{RegistryTileType::Tile};
- RegistryTileName tile_name{};
- RegistryIdSpace registry_idspace{0};
- RegistryLevel registry_level{0};
- };
-
- using RegistryTileTable = std::map<RegistryIdSpace, std::map<RegistryTileName, RegistryTileTableEntry>>;
- using RegistryTileTypeTable = std::map<RegistryIdSpace, std::map<RegistryTileName, RegistryTileTypeTableEntry>>;
-
- /**
- * @brief Construct a new Gpu Tile Registry object
- *
- */
- GpuTileRegistry()
- {
- _language = GpuTargetLanguage::Unknown;
- }
-
- /**
- * @brief Construct a new Gpu Tile Registry object providing the Gpu programming language
- *
- * @param[in] language Gpu programming language to use
- */
- GpuTileRegistry(GpuTargetLanguage language)
- {
- _language = language;
- }
-
- /**
- * @brief Default destructor. Destroy the Gpu Tile Registry object
- *
- */
- ~GpuTileRegistry() = default;
-
- /**
- * @brief Set the working IdSpace for the tile registry. IdSpace is used to prevent name collisions when declaring tiles.
- * Therefore, the IdSpace should be set before declaring any tiles.
- *
- * @param[in] id The IdSpace id
- */
- void set_IdSpace(int32_t id)
- {
- _IdSpace = id;
- }
-
- /**
- * @brief Get the current working IdSpace for the tile registry. IdSpace is used to prevent name collisions when declaring tiles
- *
- * @return The IdSpace id
- */
- int32_t IdSpace() const
- {
- return _IdSpace;
- }
-
- /**
- * @brief Gets all the IdSpace declarations defined in the tile registry.
- *
- * @return all the IdSpace declarations defined in the tile registry as std::vector<int32_t>. It returns an empty vector if there are no IdSpace declarations.
- */
- std::vector<int32_t> IdSpace_declarations() const
- {
- std::vector<int32_t> x;
-
- auto it = _frags.begin();
-
- while (it != _frags.end())
- {
- x.push_back(it->first);
-
- it++;
- }
-
- return x;
- }
-
- /**
- * @brief Declare a tile from a previously created tile
- */
- void insert(const std::string &name, const IVectorTile *frag)
- {
- assert(_language == GpuTargetLanguage::OpenCL);
- const int32_t key_IdSpace = _IdSpace;
- const std::string key_var_name = name;
- const std::string var_name = frag->name();
- TileInfo format = frag->format();
-
- // First check whether a tile with the same name exists
- IVectorTile *result = (*this)[key_var_name];
- assert(result == nullptr);
- if (result == nullptr)
- {
- std::unique_ptr<ClTile> tile = std::make_unique<ClTile>(var_name, format);
-
- _frags[key_IdSpace][key_var_name].tile_object = std::move(tile);
- _frags[key_IdSpace][key_var_name].registry_level = _registry_level;
-
- _frag_types[key_IdSpace][key_var_name].tile_type = RegistryTileType::Link;
- _frag_types[key_IdSpace][key_var_name].tile_name = key_var_name;
- _frag_types[key_IdSpace][key_var_name].registry_idspace = _IdSpace;
- _frag_types[key_IdSpace][key_var_name].registry_level = _registry_level;
- }
- }
-
- /**
- * @brief Declare a tile with TileInfo. The tile will be stored in the IdSpace set with @p set_IdSpace()
- *
- * @note The reference name used for declaring the tile should not be previously used in the IdSpace
- *
- * @param[in] name Reference name for the tile. The reference name can be used to retrieve the tile stored in the registry.
-     * @param[in] format Tile format to use
- */
- void insert(const std::string &name, const TileInfo &format)
- {
- assert(_language == GpuTargetLanguage::OpenCL);
- const int32_t key_IdSpace = _IdSpace;
- const std::string key_var_name = name;
- const std::string var_name = generate_tile_name(name);
-
- // First check whether a tile with the same name exists
- IVectorTile *result = (*this)[key_var_name];
- assert(result == nullptr);
- if (result == nullptr)
- {
- std::unique_ptr<ClTile> tile = std::make_unique<ClTile>(var_name, format);
- _frags[key_IdSpace][key_var_name].tile_object = std::move(tile);
- _frags[key_IdSpace][key_var_name].registry_level = _registry_level;
-
- _frag_types[key_IdSpace][key_var_name].tile_type = RegistryTileType::Tile;
- _frag_types[key_IdSpace][key_var_name].tile_name = key_var_name;
- _frag_types[key_IdSpace][key_var_name].registry_idspace = _IdSpace;
- _frag_types[key_IdSpace][key_var_name].registry_level = _registry_level;
- }
- }
-
- /**
- * @brief Declare a constant tile. The content of the tile is passed as a vector of std::string
- *
- * @note The reference name used for declaring the tile should not be previously used in the IdSpace
- *
- * @param[in] name Reference name for the tile. The reference name can be used to retrieve the tile stored in the registry.
-     * @param[in] in     A 2D std::vector of std::string. The dimensions of the tile are inferred from the 2D std::vector
-     * @param[in] dt     The data type for the elements stored in the 2D std::vector as std::string. It is the user's responsibility to ensure
-     *                   that the data type is consistent with the content of the std::string.
- */
- void insert(const std::string &name, const std::vector<std::vector<std::string>> &in, DataType dt)
- {
- assert(_language == GpuTargetLanguage::OpenCL);
- const int32_t key_IdSpace = _IdSpace;
- const std::string key_var_name = name;
-
- // First check whether a tile with the same name exists
- IVectorTile *result = (*this)[key_var_name];
- assert(result == nullptr);
- if (result == nullptr)
- {
- std::unique_ptr<ClConstantTile> tile = std::make_unique<ClConstantTile>(in, dt);
- _frags[key_IdSpace][key_var_name].tile_object = std::move(tile);
- _frags[key_IdSpace][key_var_name].registry_level = _registry_level;
-
- _frag_types[key_IdSpace][key_var_name].tile_type = RegistryTileType::Tile;
- _frag_types[key_IdSpace][key_var_name].tile_name = key_var_name;
- _frag_types[key_IdSpace][key_var_name].registry_idspace = _IdSpace;
- _frag_types[key_IdSpace][key_var_name].registry_level = _registry_level;
- }
- }
-
- /**
- * @brief Declare an anonymous constant tile. The content of the tile is passed as a vector of std::string
- *
- * @note This method can be used to declare temporary tiles that need to be accessed only once.
- *
-     * @param[in] in A 2D std::vector of std::string. The dimensions of the tile are inferred from the 2D std::vector
-     * @param[in] dt The data type for the elements stored in the 2D std::vector as std::string. It is the user's responsibility to ensure
-     *               that the data type is consistent with the content of the std::string.
- *
- * @return IVectorTile* the anonymous constant tile
- */
- IVectorTile *insert(const std::vector<std::vector<std::string>> &in, DataType dt)
- {
- assert(_language == GpuTargetLanguage::OpenCL);
- const int32_t key_IdSpace = _IdSpace;
- const std::string key_var_name = "_" + std::to_string(_anonymous_frag_count++);
-
- // First check whether a tile with the same name exists
- IVectorTile *result = (*this)[key_var_name];
- assert(result == nullptr);
- if (result == nullptr)
- {
- std::unique_ptr<ClConstantTile> tile = std::make_unique<ClConstantTile>(in, dt);
- _frags[key_IdSpace][key_var_name].tile_object = std::move(tile);
- _frags[key_IdSpace][key_var_name].registry_level = _registry_level;
-
- _frag_types[key_IdSpace][key_var_name].tile_type = RegistryTileType::Tile;
- _frag_types[key_IdSpace][key_var_name].tile_name = key_var_name;
- _frag_types[key_IdSpace][key_var_name].registry_idspace = _IdSpace;
- _frag_types[key_IdSpace][key_var_name].registry_level = _registry_level;
- }
-
- return (*this)[key_var_name];
- }
-
- /**
- * @brief Get the tile from the registry. This method searches the tile in the IdSpace provided by the user
- *
- * @param[in] name The name of the tile to retrieve
- * @param[in] IdSpace The IdSpace id where to search the tile
- *
- * @return IVectorTile* The tile
- */
- IVectorTile *get(const std::string &name, int32_t IdSpace)
- {
- const int32_t key_IdSpace = IdSpace;
- const std::string key_var_name = name;
-
- IVectorTile *result = nullptr;
- auto search_IdSpace = _frags.find(key_IdSpace);
- if (search_IdSpace != _frags.end())
- {
- auto search_tile = _frags[key_IdSpace].find(key_var_name);
- if (search_tile != _frags[key_IdSpace].end())
- {
- result = search_tile->second.tile_object.get();
- assert(result != nullptr);
- }
- }
-
- return result;
- }
-
- /**
- * @brief Get the tile from the registry. This method searches the tile in the IdSpace set with @p set_IdSpace()
- *
- * @param[in] name The name of the tile to retrieve
- *
- * @return IVectorTile* The tile
- */
- IVectorTile *operator[](const std::string &name)
- {
- return get(name, _IdSpace);
- }
-
- /**
-     * @brief Check whether the tile exists in the IdSpace provided by the user
- *
- * @param[in] name Name of the tile to search for
- * @param[in] IdSpace The IdSpace id where to search the tile
- *
- * @return true if the tile exists
- * @return false if the tile does not exist
- */
- bool has_tile(const std::string &name, int32_t IdSpace) const
- {
- const int32_t key_IdSpace = IdSpace;
- const std::string key_var_name = name;
-
-        auto search_IdSpace = _frags.find(key_IdSpace);
-        if (search_IdSpace == _frags.end())
-        {
-            return false;
-        }
-
-        return search_IdSpace->second.find(key_var_name) != search_IdSpace->second.end();
- }
-
- /**
- * @brief Check whether the tile within the current IdSpace exists
- *
- * @param[in] name Name of the tile to search for
- *
- * @return true if the tile exists
- * @return false if the tile does not exist
- */
- bool has_tile(const std::string &name) const
- {
- return has_tile(name, _IdSpace);
- }
-
- /**
- * @brief Get all the tiles declared within the IdSpace provided by the user
- *
- * @param[in] IdSpace IdSpace where to retrieve all the declared tiles
- *
- * @return std::vector<IVectorTile*> A vector with all the declared tiles in the IdSpace provided by the user
- */
- std::vector<IVectorTile *> tile_declarations(int32_t IdSpace)
- {
- std::vector<IVectorTile *> tiles;
-
- std::map<RegistryTileName, RegistryTileTypeTableEntry>::iterator it = _frag_types[IdSpace].begin();
-
- while (it != _frag_types[IdSpace].end())
- {
-            // The following check should be enabled. However, we cannot enable it at this stage
-            // because this method is currently used to retrieve the output tile produced by each component,
-            // even though it should NOT be used for that purpose.
- //if(it->second.tile_type == RegistryTileType::Tile)
- {
- tiles.push_back(get(it->second.tile_name, it->second.registry_idspace));
- }
- it++;
- }
-
- return tiles;
- }
-
- /**
- * @brief Increase the level of stack.
- *
- */
- void increment_registry_level()
- {
- _registry_level++;
- }
-
- /**
- * @brief Remove all the tiles declared at the current stack level and decrease the level of the stack.
- *
- */
- void decrement_registry_level()
- {
- assert(_registry_level >= 0);
-
- // Remove all variables in the local scope
- std::map<RegistryTileName, RegistryTileTableEntry>::iterator it = _frags[_IdSpace].begin();
-
- while (it != _frags[_IdSpace].end())
- {
- if (it->second.registry_level == _registry_level)
- {
- it = _frags[_IdSpace].erase(it);
- }
- else
- {
- it++;
- }
- }
-
- std::map<RegistryTileName, RegistryTileTypeTableEntry>::iterator it_type = _frag_types[_IdSpace].begin();
-
- while (it_type != _frag_types[_IdSpace].end())
- {
- if (it_type->second.registry_level == _registry_level)
- {
- it_type = _frag_types[_IdSpace].erase(it_type);
- }
- else
- {
- it_type++;
- }
- }
-
- _registry_level--;
- }
-
- /**
- * @brief Get the level of the stack
- *
- */
- int32_t level() const
- {
- return _registry_level;
- }
-
-private:
- // This method ensures that the key is unique among different components
- std::string generate_tile_name(const std::string &name)
- {
- assert(_IdSpace >= 0);
- if (_registry_level == 0)
- {
- return "_G" + std::to_string(_IdSpace) + "_" + name;
- }
- else
- {
- return name;
- }
- }
-
- RegistryTileTable _frags{};
- RegistryTileTypeTable _frag_types{};
- RegistryLevel _registry_level{0};
- RegistryIdSpace _IdSpace{-1};
- int32_t _anonymous_frag_count{0}; // Counter used to create the anonymous tiles
- GpuTargetLanguage _language{GpuTargetLanguage::Unknown}; // Gpu programming language
-};
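A minimal sketch of the intended flow, assuming an OpenCL target (illustrative only; the tile names are made up):

    GpuTileRegistry registry(GpuTargetLanguage::OpenCL);
    registry.set_IdSpace(1);                                 // component 1 owns this IdSpace
    registry.insert("dst", TileInfo(DataType::Fp32, 4, 4));  // level 0: global tile named "_G1_dst"
    registry.increment_registry_level();                     // enter a nested scope
    registry.insert("tmp", TileInfo(DataType::Fp32, 4, 1));  // local tile, removed when the scope ends
    IVectorTile *dst = registry["dst"];                      // lookup within the current IdSpace
    registry.decrement_registry_level();                     // pops "tmp", keeps "dst"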
-
-using TensorEntry = std::unique_ptr<IGpuTensorArgument>;
-
-/**
- * @brief Data structure that contains the tensors consumed by the components.
- * Since different components may use the same name as reference for a tensor, the registry adopts the IdSpace concept, an @p id to prevent name collisions
- * when declaring tensors among different components.
- *
- */
-class GpuTensorArgumentRegistry
-{
-public:
- /**
- * @brief Construct a new Gpu Tensor Registry object
- *
- */
- GpuTensorArgumentRegistry()
- {
- _language = GpuTargetLanguage::Unknown;
- }
-
- /**
- * @brief Construct a new Gpu Tensor Registry object
- *
- * @param[in] language Gpu programming language to use
- */
- GpuTensorArgumentRegistry(GpuTargetLanguage language)
- {
- _language = language;
- }
-
- /**
- * @brief Default destructor. Destroy the Gpu Tensor Registry object
- *
- */
- ~GpuTensorArgumentRegistry() = default;
-
- /**
- * @brief Set the working IdSpace for the tensor registry. IdSpace is used to prevent name collisions when declaring tensors.
- * Therefore, the IdSpace should be set before declaring any tensors.
- *
- * @param[in] id The IdSpace id
- */
- void set_IdSpace(int32_t id)
- {
- _IdSpace = id;
- }
-
- /**
- * @brief Get the current working IdSpace for the tensor registry. IdSpace is used to prevent name collisions when declaring tensors
- *
- * @return The IdSpace id
- */
- int32_t IdSpace() const
- {
- return _IdSpace;
- }
-
- /**
- * @brief Gets all the IdSpace declarations defined in the tensor registry.
- *
- * @return all the IdSpace declarations defined in the tensor registry as std::vector<int32_t>. It returns an empty vector if there are no IdSpace declarations.
- */
- std::vector<int32_t> IdSpace_declarations() const
- {
- std::vector<int32_t> x;
-
- auto it = _refs.begin();
-
- while (it != _refs.end())
- {
- x.push_back(it->first);
-
- it++;
- }
-
- return x;
- }
-
- /**
- * @brief Declare a tensor with TensorInfo. The tensor will be stored in the IdSpace set with @p set_IdSpace()
- *
- * @note The reference name used for declaring the tensor should not be previously used in the IdSpace
- *
- * @param[in] name Reference name for the tensor. The reference name can be used to retrieve the tensor stored in the registry.
-     * @param[in] x Tensor info of the tensor; its tensor id is used as the key in the registry
- * @param[in] return_by_value_when_possible True if we want the value stored in the tensor components
- */
- void insert(const std::string &name, const TensorInfo &x, bool return_by_value_when_possible)
- {
- assert(_language == GpuTargetLanguage::OpenCL);
- const int32_t key_IdSpace = _IdSpace;
- const int32_t tensor_id = x.id;
- const std::string key_var_name = name;
- const std::string var_name = generate_tensor_name(name, tensor_id);
-
- // First, check whether the tensor has already a reference. If so, trigger an assert
- assert(!has_tensor_argument(name));
-
- // Check whether a tensor with that tensorID exists
- auto result = _tensor_arguments.find(tensor_id);
- if (result == _tensor_arguments.end())
- {
- // It means that we haven't added a tensor with that tensor_id yet. Create a IGpuTensorArgument before creating the reference
- std::unique_ptr<ClTensorArgument> arg =
- std::make_unique<ClTensorArgument>(var_name, x, return_by_value_when_possible);
- _tensor_arguments[tensor_id] = std::move(arg);
- }
-
- _refs[key_IdSpace][key_var_name] = tensor_id;
- }
-
- /**
- * @brief Get the tensor from the registry. This method searches the tensor in the IdSpace set with @p set_IdSpace()
- *
- * @param[in] name The name of the tensor to retrieve
- *
- * @return IGpuTensor* The tensor
- */
- IGpuTensorArgument *operator[](const std::string &name)
- {
- const int32_t key_IdSpace = _IdSpace;
- const std::string key_var_name = name;
-
- IGpuTensorArgument *result = nullptr;
- auto search_IdSpace = _refs.find(key_IdSpace);
- if (search_IdSpace != _refs.end())
- {
- auto search_tensor_id = _refs[key_IdSpace].find(key_var_name);
-
- if (search_tensor_id != _refs[key_IdSpace].end())
- {
- const int32_t tensor_id = search_tensor_id->second;
- auto search_tensor_argument = _tensor_arguments.find(tensor_id);
- if (search_tensor_argument != _tensor_arguments.end())
- {
- result = search_tensor_argument->second.get();
- }
- assert(result != nullptr);
- }
- }
-
- return result;
- }
-
- /**
-     * @brief Get all the tensor arguments declared in the registry
- *
- * @return std::vector<IGpuTensorArgument*> A vector with all the declared tensors
- */
- std::vector<IGpuTensorArgument *> tensor_argument_declarations()
- {
- std::vector<IGpuTensorArgument *> args;
-
- auto it = _tensor_arguments.begin();
-
- while (it != _tensor_arguments.end())
- {
- args.push_back(it->second.get());
- it++;
- }
-
- return args;
- }
-
- /**
- * @brief Check whether the tensor argument in the IdSpace set with @p set_IdSpace() exists
- *
- * @param[in] name Name of the tensor argument to search for
- *
- * @return true if the tensor argument exists
- * @return false if the tensor argument does not exist
- */
- bool has_tensor_argument(const std::string &name)
- {
- const int32_t key_IdSpace = _IdSpace;
- const std::string key_var_name = name;
-
- auto search_IdSpace = _refs.find(key_IdSpace);
-
- if (search_IdSpace != _refs.end())
- {
- auto search_tensor_id = _refs[key_IdSpace].find(key_var_name);
-
- return search_tensor_id != _refs[key_IdSpace].end();
- }
- else
- {
- return false;
- }
- }
-
- /**
-     * @brief Check whether the tensor argument exists in the IdSpace provided by the user
- *
- * @param[in] name Name of the tensor argument to search for
- * @param[in] IdSpace The IdSpace id where to search the tensor argument
- *
-     * @return true if the tensor argument exists
-     * @return false if the tensor argument does not exist
- */
- bool has_tensor_argument(const std::string &name, int32_t IdSpace)
- {
- const int32_t key_IdSpace = IdSpace;
- const std::string key_var_name = name;
-
- auto search_IdSpace = _refs.find(key_IdSpace);
-
- if (search_IdSpace != _refs.end())
- {
- auto search_tensor_id = _refs[key_IdSpace].find(key_var_name);
-
- return search_tensor_id != _refs[key_IdSpace].end();
- }
- else
- {
- return false;
- }
- }
-
-private:
- // This method ensures that the key is unique among different components
- std::string generate_tensor_name(const std::string &name, int32_t tensor_id)
- {
- assert(tensor_id >= 0);
-
- return name + std::to_string(tensor_id);
- }
-
- std::map<int32_t, TensorEntry> _tensor_arguments{};
- std::map<int32_t, std::map<std::string, int32_t>> _refs{};
- int32_t _IdSpace{-1};
- GpuTargetLanguage _language{GpuTargetLanguage::Unknown}; // Gpu programming language
-};
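A matching sketch for the tensor registry, assuming an OpenCL target and a tensor id of 0 (illustrative only; the names are made up):

    GpuTensorArgumentRegistry tensors(GpuTargetLanguage::OpenCL);
    tensors.set_IdSpace(1);
    TensorInfo src_info{};                         // shape, data type and layout left at their defaults
    src_info.id = 0;                               // the tensor id keys the underlying argument
    tensors.insert("src", src_info, false);        // reference "src" resolves to argument "src0"
    assert(tensors.has_tensor_argument("src"));    // visible in the current IdSpace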
-
-enum class OpType : int32_t
-{
- Elementwise = 0x0000,
- Relational = 0x1000,
- Algebra = 0x2000
-};
-
-inline std::string to_string(AssignmentOp op)
-{
- switch (op)
- {
- case AssignmentOp::Decrement:
- return "-=";
- case AssignmentOp::Increment:
- return "+=";
- default:
- assert(false);
- return "";
- }
-}
-
-inline std::string to_string(UnaryOp op)
-{
- switch (op)
- {
- case UnaryOp::LogicalNot:
- return "!";
- case UnaryOp::BitwiseNot:
- return "~";
- case UnaryOp::Negate:
- return "-";
- default:
- assert(false);
- return "";
- }
-}
-
-inline std::string to_string(BinaryOp op)
-{
- switch (op)
- {
- case BinaryOp::Add:
- return "+";
- case BinaryOp::Sub:
- return "-";
- case BinaryOp::Mul:
- return "*";
- case BinaryOp::Div:
- return "/";
- case BinaryOp::Mod:
- return "%";
- case BinaryOp::Equal:
- return "==";
- case BinaryOp::Less:
- return "<";
- case BinaryOp::LessEqual:
- return "<=";
- case BinaryOp::Greater:
- return ">";
- case BinaryOp::GreaterEqual:
- return ">=";
- case BinaryOp::LogicalAnd:
- return "&&";
- case BinaryOp::LogicalOr:
- return "||";
- case BinaryOp::BitwiseXOR:
- return "^";
- default:
- assert(false);
- return "";
- }
-}
-
-inline std::string binary_op_string(BinaryOp op)
-{
- switch (op)
- {
- case BinaryOp::Add:
- return "add";
- case BinaryOp::Sub:
- return "sub";
- case BinaryOp::Mul:
- return "mul";
- case BinaryOp::Div:
- return "div";
- case BinaryOp::Mod:
- return "mod";
- case BinaryOp::Equal:
- return "eq";
- case BinaryOp::Less:
- return "gt";
- case BinaryOp::LessEqual:
- return "gteq";
- case BinaryOp::Greater:
- return "lt";
- case BinaryOp::GreaterEqual:
- return "lte";
- default:
- assert(false);
- return "";
- }
-}
-
-enum class OperandType : int32_t
-{
- Unknown = 0x00000000,
- ScalarFp32 = 0x00001011, // Immediate scalar tile
- ScalarFp16 = 0x00001012, // Immediate scalar tile
- ScalarInt32 = 0x00001021, // Immediate scalar tile
- ScalarInt16 = 0x00001022, // Immediate scalar tile
- ScalarInt8 = 0x00001024, // Immediate scalar tile
- ScalarUInt32 = 0x00001031, // Immediate scalar tile
- ScalarUInt16 = 0x00001032, // Immediate scalar tile
- ScalarUInt8 = 0x00001034, // Immediate scalar tile
- ScalarBool = 0x00001041, // Immediate scalar tile
- ScalarTile = 0x00001050, // Scalar from a tile
- Tile = 0x00010000, // Tile
- TensorStride1 = 0x00100001, // Tensor component
- TensorStride2 = 0x00100002, // Tensor component
- TensorStride3 = 0x00100003, // Tensor component
- TensorStride4 = 0x00100004, // Tensor component
- TensorDim0 = 0x00100010, // Tensor component
- TensorDim1 = 0x00100020, // Tensor component
- TensorDim2 = 0x00100030, // Tensor component
- TensorDim3 = 0x00100040, // Tensor component
- TensorDim4 = 0x00100050, // Tensor component
- TensorC = 0x00100010, // Tensor component
- TensorW = 0x00100020, // Tensor component
- TensorH = 0x00100030, // Tensor component
- TensorD = 0x00100040, // Tensor component
- TensorN = 0x00100050, // Tensor component
- TensorDim1xDim2 = 0x00100100, // Tensor component
- TensorDim1xDim2xDim3 = 0x00100200, // Tensor component
- TensorWxH = 0x00100300, // Tensor component
- TensorWxHxD = 0x00100400, // Tensor component
- TensorDataOffset = 0x00100500, // Tensor component
-};
-
-struct ScalarTileCoord
-{
- ScalarTileCoord()
- {
- }
-
- ScalarTileCoord(int32_t x0, int32_t y0) : x(x0), y(y0)
- {
- }
-
- int32_t x{-1};
- int32_t y{-1};
-};
-
-/**
- * @brief Operand class. This object is used to pass the operands to the operations performed by the writer.
- * Operand can be of three types:
- * -# Scalar immediate: constant expression
- * -# Tile: A tile
- * -# Tensor component: A component (scalar) of a tensor
- *
- */
-class Operand
-{
-public:
- Operand(const std::string &val)
- {
- _str = val;
- _type = OperandType::Tile;
- }
-
- Operand(const std::string &val, const ScalarTileCoord &coord)
- {
- _str = val;
- _type = OperandType::ScalarTile;
- _coord = coord;
- }
-
- Operand(const std::string &val, OperandType type)
- {
- _str = val;
- _type = type;
- }
-
-    Operand(const Operand &t)
-    {
-        _str   = t.value();
-        _type  = t.type();
-        _coord = t.scalar_tile_coordinate();
-    }
-
- Operand &operator=(const Operand &t)
- {
- _str = t.value();
- _type = t.type();
- _coord = t.scalar_tile_coordinate();
- return *this;
- }
-
- std::string value() const
- {
- return _str;
- }
-
- OperandType type() const
- {
- return _type;
- }
-
- ScalarTileCoord scalar_tile_coordinate() const
- {
- return _coord;
- }
-
-private:
- std::string _str{};
- OperandType _type{OperandType::Unknown};
- ScalarTileCoord _coord{};
-};
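-// A minimal usage sketch of the operand kinds listed above (tile and tensor names are
-// illustrative only, not taken from a real kernel):
-//
-//   Operand tile_op("dst_tile");                             // Tile already declared in the registry
-//   Operand imm_op("1", OperandType::ScalarInt32);           // Scalar immediate (constant expression)
-//   Operand dim_op("src", OperandType::TensorDim0);          // Tensor component (dim0 of tensor "src")
-//   Operand elem_op("lhs_tile", ScalarTileCoord(0, 0));      // Scalar element (0, 0) of a tile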
-
-using GpuSamplerTensorStorage = GpuTensorStorage;
-
-struct GpuSampler
-{
- GpuSampler() = default;
-
- TensorSamplerFormat format{TensorSamplerFormat::Unknown};
- GpuSamplerTensorStorage storage{GpuSamplerTensorStorage::Unknown};
- TensorSamplerAddressModeX address_mode_x{TensorSamplerAddressModeX::Unknown};
- TensorSamplerAddressModeY address_mode_y{TensorSamplerAddressModeY::Unknown};
- TensorSamplerAddressModeZ address_mode_z{TensorSamplerAddressModeZ::Unknown};
-};
-
-inline GpuSampler create_simple_sampler(
- const TensorInfo *tensor_info_id, GpuSampler sampler, int32_t step_x, int32_t step_y, int32_t step_z)
-{
- CKW_UNUSED(step_x, step_y, step_z);
-
- auto tensor = tensor_info_id->shape;
-
- GpuSampler dst_sampler;
- dst_sampler.format = sampler.format;
- dst_sampler.storage = GpuSamplerTensorStorage::BufferUint8Ptr;
- dst_sampler.address_mode_x = sampler.address_mode_x;
- dst_sampler.address_mode_y = sampler.address_mode_y;
- dst_sampler.address_mode_z = sampler.address_mode_z;
-
- int32_t dim_x = 0;
- int32_t dim_y = 0;
- int32_t dim_z = 0;
-
- switch (sampler.format)
- {
- case TensorSamplerFormat::C_W_H:
- dim_x = tensor[0];
- dim_y = tensor[1];
- dim_z = tensor[2];
- break;
- case TensorSamplerFormat::C_WH_1:
- dim_x = tensor[0];
- dim_y = tensor[1] * tensor[2];
- dim_z = 1;
- break;
- default:
- std::cout << "Unsupported tensor format" << std::endl;
- assert(false);
- break;
- }
-
- if (dim_x == 1)
- {
- assert(step_x == 1);
- dst_sampler.address_mode_x = TensorSamplerAddressModeX::None;
- }
-
- if (dim_y == 1)
- {
- assert(step_y == 1);
- dst_sampler.address_mode_y = TensorSamplerAddressModeY::None;
- }
-
- if (dim_z == 1)
- {
- assert(step_z == 1);
- dst_sampler.address_mode_z = TensorSamplerAddressModeZ::None;
- }
-
- return dst_sampler;
-}
-
-class GpuOutputSampler
-{
-public:
- GpuOutputSampler() = default;
-
-    /**
-     * @brief Method used to initialize the GpuOutputSampler. The GpuOutputSampler can be initialized only once
-     *        by the root component. Once initialized, all subsequent components must use this sampler
-     *        or a broadcasted version of it.
-     *
-     * @param[in] tensor_info_id Pointer to the @ref TensorInfo of the output tensor
-     * @param[in] tensor_storage Tensor storage to use for the output
-     * @param[in] tensor_format  Tensor sampler format to use for the output
-     * @param[in] step_x         Increment step in the X direction. It is not necessarily the same as the tile's n0!
-     * @param[in] step_y         Increment step in the Y direction. It is not necessarily the same as the tile's m0!
-     * @param[in] step_z         Increment step in the Z direction. It is not necessarily the same as the tile's d0!
-     */
- void initialize(const TensorInfo *tensor_info_id,
- GpuSamplerTensorStorage tensor_storage,
- TensorSamplerFormat tensor_format,
- int32_t step_x,
- int32_t step_y,
- int32_t step_z)
- {
- assert(_is_initialized == false);
-
- _step_x = step_x;
- _step_y = step_y;
- _step_z = step_z;
- _tensor_info_id = tensor_info_id;
- _sampler = create_sampler(tensor_storage, tensor_format);
- _is_initialized = true;
- };
-
- GpuSampler sampler() const
- {
- return _sampler;
- };
-
- int32_t step_x() const
- {
- return _step_x;
- };
-
- int32_t step_y() const
- {
- return _step_y;
- };
-
- int32_t step_z() const
- {
- return _step_z;
- };
-
-private:
- GpuSampler create_sampler(GpuSamplerTensorStorage tensor_storage, TensorSamplerFormat tensor_format)
- {
- // Output can only be in output mode
- assert(tensor_storage != GpuSamplerTensorStorage::Image2dReadOnly);
- assert(tensor_storage != GpuSamplerTensorStorage::Image3dReadOnly);
-
- auto tensor = _tensor_info_id->shape;
-
- GpuSampler sampler;
- sampler.format = tensor_format;
- sampler.storage = tensor_storage;
- sampler.address_mode_x = TensorSamplerAddressModeX::None;
- sampler.address_mode_y = TensorSamplerAddressModeY::None;
- sampler.address_mode_z = TensorSamplerAddressModeZ::None;
-
- // In the case of texture, we do not need any special checks at the border
- if (tensor_storage == GpuSamplerTensorStorage::BufferUint8Ptr)
- {
- int32_t dim_x = 0;
- int32_t dim_y = 0;
- int32_t dim_z = 0;
-
- switch (tensor_format)
- {
- case TensorSamplerFormat::C_W_H:
- dim_x = tensor[0];
- dim_y = tensor[1];
- dim_z = tensor[2];
- break;
- case TensorSamplerFormat::C_WH_1:
- dim_x = tensor[0];
- dim_y = tensor[1] * tensor[2];
- dim_z = 1;
- break;
- default:
- std::cout << "Unsupported tensor format" << std::endl;
- assert(false);
- break;
- }
-
- if ((dim_x % _step_x) != 0 && dim_x != 1)
- {
- sampler.address_mode_x = TensorSamplerAddressModeX::OverlappingMin;
- }
-
- if ((dim_y % _step_y) != 0 && dim_y != 1)
- {
- sampler.address_mode_y = TensorSamplerAddressModeY::ClampToMaxEdgeOnly;
- }
-
- if ((dim_z % _step_z) != 0 && dim_z != 1)
- {
- sampler.address_mode_z = TensorSamplerAddressModeZ::ClampToMaxEdgeOnly;
- }
- }
-
- return sampler;
- }
-
- GpuSampler _sampler{}; // GpuSampler
- int32_t _step_x{1};
- int32_t _step_y{1};
- int32_t _step_z{1};
- const TensorInfo *_tensor_info_id{nullptr};
- bool _is_initialized{false};
-};
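-// Sketch of how a root component might initialize the output sampler (all values and
-// names are illustrative only):
-//
-//   GpuOutputSampler out_sampler;
-//   out_sampler.initialize(&dst_info, GpuSamplerTensorStorage::BufferUint8Ptr,
-//                          TensorSamplerFormat::C_WH_1, /* step_x */ 4, /* step_y */ 4, /* step_z */ 1);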
-
-/**
- * @brief Tensor operand class. This object is used to pass tensor operands to the operations performed by the writer.
- */
-class TensorOperand
-{
-public:
- TensorOperand(const std::string &val, GpuSampler sampler) : _str(val), _sampler(sampler)
- {
- }
-
- TensorOperand &operator=(const TensorOperand &t)
- {
- _str = t.value();
- _sampler = t.sampler();
- return *this;
- }
-
- std::string value() const
- {
- return _str;
- }
-
- GpuSampler sampler() const
- {
- return _sampler;
- }
-
-private:
- std::string _str{};
- GpuSampler _sampler{};
-};
-
-/**
- * @brief Data structure that contains all the necessary information to write the Gpu kernel with the Gpu kernel Writer
- * This data structure must be initialized before being passed to the Gpu Kernel Writer
- *
- */
-class GpuKernelWriterDataHolder
-{
-public:
- /**
- * @brief Construct a new Gpu Kernel Data object. In this phase, we should also store
- * the GPU target and target specific capabilities (extensions). For now, we just initialize the
- * programming language
- *
- * @param[in] language Gpu programming language to use
- */
- GpuKernelWriterDataHolder(GpuTargetLanguage language)
- : tiles(language), arguments(language), code(""), _language(language)
- {
- }
-
- /**
- * @brief Get the Gpu programming language used
- *
- * @return GpuTargetLanguage the Gpu programming language
- */
- GpuTargetLanguage programming_language() const
- {
- return _language;
- }
-
- /**
- * @brief @ref GpuTileRegistry
- *
- */
- GpuTileRegistry tiles{};
- /**
- * @brief @ref GpuTensorArgumentRegistry
- *
- */
- GpuTensorArgumentRegistry arguments{};
- /**
- * @brief @ref GpuOutputSampler.
- *
- */
- GpuOutputSampler output_sampler{};
- /**
- * @brief Source code
- *
- */
- std::string code{};
-
- // GpuExtensionRegistry extensions{};
-private:
- GpuTargetLanguage _language;
-};
-
-struct LWS
-{
- int32_t x{1};
- int32_t y{1};
- int32_t z{1};
-};
-
-/**
- * @brief Utility class used to get the tile from an operand. If the operand is not a tile, @ref OperandUnpacker
- *        declares an anonymous tile in the tile registry.
- */
-class OperandUnpacker
-{
-public:
- OperandUnpacker(GpuTileRegistry &tiles, GpuTensorArgumentRegistry &arguments) : _tiles(tiles), _arguments(arguments)
- {
- // Increase the level of the stack to allocate possible temporary tiles
- _tiles.increment_registry_level();
- };
-
- ~OperandUnpacker()
- {
- // Decrease the level of the stack to deallocate any temporary tiles
- _tiles.decrement_registry_level();
- }
-
- IVectorTile *unpack(const Operand &src)
- {
- // Get the tile
- if (src.type() == OperandType::Tile)
- {
- assert(_tiles.has_tile(src.value()));
- return _tiles[src.value()];
- }
- // Create an anonymous tile with a constant
- else if (static_cast<int32_t>(src.type()) & 0x00001000)
- {
- if (src.type() == OperandType::ScalarTile)
- {
- ScalarTileCoord coord = src.scalar_tile_coordinate();
- assert(_tiles.has_tile(src.value()));
- assert(coord.x >= 0);
- assert(coord.y >= 0);
- auto val = _tiles[src.value()]->scalar(coord.x, coord.y);
- return _tiles.insert({{{val.str}}}, val.type.dt);
- }
- else
- {
- return _tiles.insert({{{src.value()}}}, to_tile_data_type(src.type()));
- }
- }
- // Create an anonymous tile with the tensor component
- else
- {
- assert(_arguments.has_tensor_argument(src.value()));
- auto x = _arguments[src.value()];
- const std::string val = x->component(to_tensor_component(src.type()));
- const DataType dt = x->component_data_type();
- return _tiles.insert({{{val}}}, dt);
- }
- }
-
-private:
- DataType to_tile_data_type(OperandType x)
- {
- return static_cast<DataType>(static_cast<int32_t>(x) & 0x00ff);
- }
-
- TensorComponentType to_tensor_component(OperandType x)
- {
- switch (x)
- {
- case OperandType::TensorDim0:
- return TensorComponentType::Dim0;
- case OperandType::TensorDim1:
- return TensorComponentType::Dim1;
- case OperandType::TensorDim2:
- return TensorComponentType::Dim2;
- case OperandType::TensorDim3:
- return TensorComponentType::Dim3;
- case OperandType::TensorDim4:
- return TensorComponentType::Dim4;
- case OperandType::TensorStride1:
- return TensorComponentType::Stride1;
- case OperandType::TensorStride2:
- return TensorComponentType::Stride2;
- case OperandType::TensorStride3:
- return TensorComponentType::Stride3;
- case OperandType::TensorStride4:
- return TensorComponentType::Stride4;
- case OperandType::TensorDim1xDim2:
- return TensorComponentType::Dim1xDim2;
- case OperandType::TensorDim1xDim2xDim3:
- return TensorComponentType::Dim1xDim2xDim3;
- case OperandType::TensorDataOffset:
- return TensorComponentType::OffsetFirstElement;
- default:
- assert(false);
- return TensorComponentType::Unknown;
- }
- }
-
- GpuTileRegistry &_tiles;
- GpuTensorArgumentRegistry &_arguments;
-};
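-// Sketch of the unpacking behaviour (names are illustrative): an existing tile is returned
-// as-is, while immediates and tensor components become anonymous tiles in the registry.
-//
-//   OperandUnpacker unpacker(data.tiles, data.arguments);
-//   IVectorTile *t0 = unpacker.unpack(Operand("acc"));                          // existing tile "acc"
-//   IVectorTile *t1 = unpacker.unpack(Operand("2", OperandType::ScalarInt32));  // anonymous constant tile
-//   IVectorTile *t2 = unpacker.unpack(Operand("src", OperandType::TensorDim0)); // anonymous tile holding src dim0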
-
-/**
- * @brief Utility class used to get the tensor argument from a tensor operand. Unlike @ref OperandUnpacker,
- *        it does not create anonymous tiles: the tensor argument must already be registered.
- */
-class TensorOperandUnpacker
-{
-public:
- TensorOperandUnpacker(GpuTensorArgumentRegistry &arguments) : _arguments(arguments){};
-
- IGpuTensorArgument *unpack(const TensorOperand &src)
- {
- assert(_arguments.has_tensor_argument(src.value()));
- return _arguments[src.value()];
- }
-
-private:
- GpuTensorArgumentRegistry &_arguments;
-};
-
-/**
- * @brief The GpuKernel is used in three stages:
- *        -# Compilation stage
- *        -# Tuning stage
- *        -# Dispatch stage
- */
-struct GpuKernel
-{
- // Compilation stage
- std::string code{}; // Source code, required for the compilation stage
- std::vector<GpuExtensions> list_extensions{}; // Extensions, required for the compilation stage
- // Tuning stage
- std::string config_id{}; // Unique id, required for the tuning stage
- std::vector<LWS> list_lws{}; // LWS to test, required for the tuning stage
- // Dispatch stage
- GpuOutputSampler output_sampler{}; // GpuOutputSampler, required for the dispatch stage
- std::vector<std::pair<int32_t, GpuTensorStorage>>
- list_tensor_storages; // List of tensor storages, required for the dispatch stage
- std::vector<std::pair<int32_t, TensorComponentType>>
- list_tensor_components; // List of tensor components (width, stride,..), required for the dispatch stage)
-};
-
-// Generate all extension pragmas (hardcoded for now)
-inline std::string generate_extensions()
-{
- std::string ext = R"(
-#if defined(cl_khr_fp16)
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-#endif // defined(cl_khr_fp16)
-
-#if defined(cl_arm_integer_dot_product_int8)
-#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
-#endif // defined(cl_arm_integer_dot_product_int8)
-
-#if defined(cl_arm_integer_dot_product_accumulate_int8)
-#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
-#endif // defined(cl_arm_integer_dot_product_accumulate_int8)
-
-#if defined(cl_arm_printf)
-#pragma OPENCL EXTENSION cl_arm_printf : enable
-#endif // defined(cl_arm_printf)
-)";
- return ext;
-}
-
-// Ideally this function should produce an object holding the source; for now it returns the generated source string
-inline std::string generate_code(GpuKernelWriterDataHolder &in, const std::string &name)
-{
- std::string code;
- code += generate_extensions();
- code += "__kernel void ";
- code += name;
- code += "(\n";
-
- auto IdSpaces = in.arguments.IdSpace_declarations();
-
- std::vector<std::string> arg_str;
-
- auto tensor_args = in.arguments.tensor_argument_declarations();
-
- for (auto &i : tensor_args)
- {
- // For each tensor used, get the storage and tensor components
- auto storages = i->storage_declarations();
- auto components = i->component_declarations();
-
- for (auto &y : storages)
- {
- std::string str;
- str += i->storage_type_declaration(y);
- str += " ";
- str += i->storage(y);
- arg_str.push_back(str);
- }
-
- for (auto &y : components)
- {
- std::string str;
- str += i->component_type_declaration();
- str += " ";
- str += i->component(y);
- arg_str.push_back(str);
- }
- }
-
- for (size_t i = 0; i < arg_str.size(); ++i)
- {
- code += arg_str[i];
- if (i + 1 < arg_str.size())
- {
- code += ",\n";
- }
- }
-
- code += ")\n";
- code += "{\n";
- code += in.code;
- code += "}\n";
-
- return code;
-}
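-// Rough shape of the string returned by generate_code() (kernel and argument names depend on the
-// registered tensor arguments; this sketch is illustrative only):
-//
-//   #pragma OPENCL EXTENSION cl_khr_fp16 : enable
-//   ...
-//   __kernel void my_kernel(
-//   __global uchar *src_ptr,
-//   int src_dim0,
-//   ...)
-//   {
-//   <in.code>
-//   }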
-
-/**
- * @brief This class is responsible for mapping an N-dimensional tensor to a 3D tensor. The mapper needs the
- *        GpuSampler to know how to reduce the dimensionality of the tensor.
- *
- */
-class GpuTensor3dMapper
-{
-public:
- GpuTensor3dMapper(IGpuTensorArgument *tensor, GpuSampler sampler) : _sampler(sampler), _tensor(tensor){};
-
- std::string tensor_component_x() const
- {
- const auto format = _sampler.format;
- switch (format)
- {
- case TensorSamplerFormat::C_WH_1:
- case TensorSamplerFormat::C_W_H:
- return _tensor->component(TensorComponentType::Dim0);
- default:
- std::cout << "Unsupported tensor format" << std::endl;
- assert(false);
- return "";
- }
- }
-
- std::string tensor_component_y() const
- {
- const auto format = _sampler.format;
- switch (format)
- {
- case TensorSamplerFormat::C_WH_1:
- return _tensor->component(TensorComponentType::Dim1xDim2);
- case TensorSamplerFormat::C_W_H:
- return _tensor->component(TensorComponentType::Dim1);
- default:
- std::cout << "Unsupported tensor format" << std::endl;
- assert(false);
- return "";
- }
- }
-
- std::string tensor_component_z() const
- {
- const auto format = _sampler.format;
- switch (format)
- {
- case TensorSamplerFormat::C_WH_1:
- return "1";
- case TensorSamplerFormat::C_W_H:
- return _tensor->component(TensorComponentType::Dim2);
- default:
- std::cout << "Unsupported tensor format" << std::endl;
- assert(false);
- return "";
- }
- }
-
- std::string tensor_component_stride_y() const
- {
- const auto format = _sampler.format;
- switch (format)
- {
- case TensorSamplerFormat::C_WH_1:
- case TensorSamplerFormat::C_W_H:
- return _tensor->component(TensorComponentType::Stride1);
- default:
- std::cout << "Unsupported tensor format" << std::endl;
- assert(false);
- return "";
- }
- }
-
- std::string tensor_component_stride_z() const
- {
- const auto format = _sampler.format;
- switch (format)
- {
- case TensorSamplerFormat::C_WH_1:
- return "0";
- case TensorSamplerFormat::C_W_H:
- return _tensor->component(TensorComponentType::Stride2);
- default:
- std::cout << "Unsupported tensor format" << std::endl;
- assert(false);
- return "";
- }
- }
-
- std::string tensor_component_stride_batch() const
- {
- const auto format = _sampler.format;
- switch (format)
- {
- case TensorSamplerFormat::C_WH_1:
- case TensorSamplerFormat::C_W_H:
- return _tensor->component(TensorComponentType::Stride3);
- default:
- std::cout << "Unsupported tensor format" << std::endl;
- assert(false);
- return "";
- }
- }
-
- bool is_one_component_x() const
- {
- auto t = _tensor->format();
- const auto format = _sampler.format;
- switch (format)
- {
- case TensorSamplerFormat::C_WH_1:
- case TensorSamplerFormat::C_W_H:
- return t.shape[0] == 1;
- default:
- std::cout << "Unsupported tensor format" << std::endl;
- assert(false);
-                return false;
- }
- }
-
- bool is_one_component_y() const
- {
- auto t = _tensor->format();
- const auto format = _sampler.format;
- switch (format)
- {
- case TensorSamplerFormat::C_WH_1:
- return (t.shape[1] * t.shape[2]) == 1;
- case TensorSamplerFormat::C_W_H:
- return t.shape[1] == 1;
- default:
- std::cout << "Unsupported tensor format" << std::endl;
- assert(false);
-                return false;
- }
- }
-
- bool is_one_component_z() const
- {
- auto t = _tensor->format();
- const auto format = _sampler.format;
- switch (format)
- {
- case TensorSamplerFormat::C_WH_1:
- return true;
- case TensorSamplerFormat::C_W_H:
- return t.shape[2] == 1;
- default:
- std::cout << "Unsupported tensor format" << std::endl;
- assert(false);
-                return false;
- }
- }
-
- bool is_one_component_batch() const
- {
- auto t = _tensor->format();
- const auto format = _sampler.format;
- switch (format)
- {
- case TensorSamplerFormat::C_WH_1:
- case TensorSamplerFormat::C_W_H:
- return t.shape[3] == 1;
- default:
- std::cout << "Unsupported tensor format" << std::endl;
- assert(false);
-                return false;
- }
- }
-
- GpuSampler gpu_sampler() const
- {
- return _sampler;
- }
-
- IGpuTensorArgument *tensor_argument() const
- {
- return _tensor;
- }
-
-private:
- GpuSampler _sampler;
- IGpuTensorArgument *_tensor;
-};
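-// Dimensionality reduction performed by the mapper, as implied by the component getters above:
-//
-//   C_W_H : x = dim0, y = dim1,        z = dim2
-//   C_WH_1: x = dim0, y = dim1 * dim2, z = 1 (stride_z collapses to 0)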
-
-struct GpuKernelWriterAttribute
-{
- bool return_tensor_component_by_value{false};
-};
-
-enum class RoundingMode
-{
- None,
- Rte,
- Rtz,
- Rtp,
- Rtn
-};
-
-// https://llvm.org/docs/tutorial/MyFirstLanguageFrontend/LangImpl05.html
-class IGpuKernelWriter
-{
-public:
- virtual ~IGpuKernelWriter() = default;
-
- virtual void set_IdSpace(int32_t id) = 0;
-
- virtual void import_tile(const std::string &dst, const IVectorTile *src) = 0;
-
- virtual void declare_argument(const std::string &name, const TensorInfo &tensor) = 0;
-
- virtual void declare_tile(const std::string &name, const TileInfo &info) = 0;
-
- virtual void
- declare_const_tile(const std::string &name, const std::vector<std::vector<std::string>> &in, DataType dt) = 0;
-
- virtual void write_text(const std::string &x) = 0;
-
- virtual void compound_statement_begin() = 0;
-
- virtual void compound_statement_end() = 0;
-
- // Operations
- virtual void op_get_global_id(const Operand &dst_var, int32_t dim) = 0;
-
- virtual void
- op_get_global_coord(const Operand &dst, const Operand &step, const TensorOperand &tensor, int32_t dim) = 0;
-
- virtual void op_get_global_batch(const Operand &dst, const TensorOperand &tensor) = 0;
-
- virtual void op_get_global_size(const Operand &dst_var, int32_t dim) = 0;
-
- virtual void op_unary_expression(const Operand &dst, UnaryOp op, const Operand &src) = 0;
-
- virtual void op_binary_expression(const Operand &dst, const Operand &lhs, BinaryOp op, const Operand &rhs) = 0;
-
- virtual void op_assign(const Operand &dst_name, const Operand &src_name) = 0;
-
- virtual void
- op_unary_elementwise_function(const Operand &dst_name, UnaryFunction func, const Operand &src_name) = 0;
-
- virtual void op_binary_elementwise_function(const Operand &dst_name,
- BinaryFunction func,
- const Operand &first_name,
- const Operand &second_name) = 0;
-
- virtual void op_ternary_elementwise_function(const Operand &dst_name,
- TernaryFunction func,
- const Operand &first_name,
- const Operand &second_name,
- const Operand &third_name) = 0;
-
- virtual void op_if_header(const Operand &lhs, BinaryOp op, const Operand &rhs) = 0;
-
- virtual void op_else_if_header(const Operand &lhs, BinaryOp op, const Operand &rhs) = 0;
-
- virtual void op_else_header() = 0;
-
- virtual void op_for_loop_header(const Operand &var_name,
- BinaryOp cond_op,
- const Operand &cond_value,
- const Operand &update_var,
- AssignmentOp update_op,
- const Operand &update_value) = 0;
-
- virtual void op_load_indirect(const TensorOperand &tensor,
- const Operand &dst,
- const Operand &x,
- const Operand &y_indirect,
- const Operand &z,
- const Operand &b = Operand("0", OperandType::ScalarInt32)) = 0;
-
- virtual void op_load_immediate(const TensorOperand &tensor,
- const Operand &dst,
- const Operand &x,
- const Operand &y,
- const Operand &z,
- const Operand &b = Operand("0", OperandType::ScalarInt32),
- const Operand &dilation_y = Operand("1", OperandType::ScalarInt32)) = 0;
-
- virtual void op_store_immediate(const TensorOperand &tensor,
- const Operand &src,
- const Operand &x,
- const Operand &y,
- const Operand &z,
- const Operand &b = Operand("0", OperandType::ScalarInt32)) = 0;
-
- virtual void op_cast_expression(const Operand &dst, const Operand &src, ConvertPolicy policy) = 0;
-
- virtual void op_return() = 0;
-
- // Utils
-    // Converts the (x, y) coordinates plus offsets into the indirect addressing buffer consumed by op_load_indirect
- virtual void util_get_indirect_buffer(const Operand &dst,
- const TensorOperand &tensor,
- const Operand &x,
- const Operand &y,
- const Operand &x_off,
- const Operand &y_off) = 0;
-};
-
-enum class GpuLoadStoreType
-{
- Load = 1,
- Store = 2
-};
-
-class IGpuLoadStoreHelperWriter
-{
-public:
- IGpuLoadStoreHelperWriter(IGpuKernelWriter *x, GpuTensor3dMapper mapper, GpuLoadStoreType type)
- : _writer(x), _mapper(mapper), _type(type)
- {
- }
-
- IGpuLoadStoreHelperWriter(const IGpuLoadStoreHelperWriter &) = default;
-
- IGpuLoadStoreHelperWriter &operator=(const IGpuLoadStoreHelperWriter &) = default;
-
- virtual ~IGpuLoadStoreHelperWriter() = default;
-
- virtual void initialize(IVectorTile *dst, IVectorTile *x, IVectorTile *z, IVectorTile *b) = 0;
-
- virtual void write(const std::pair<int32_t, std::string> &y) = 0;
-
- virtual void finalize() = 0;
-
-protected:
- IGpuKernelWriter *_writer;
- GpuTensor3dMapper _mapper;
- GpuLoadStoreType _type;
-};
-
-class ClLoadStoreBufferHelperWriter : public IGpuLoadStoreHelperWriter
-{
-public:
- ClLoadStoreBufferHelperWriter(IGpuKernelWriter *x, const GpuTensor3dMapper &mapper, GpuLoadStoreType type)
- : IGpuLoadStoreHelperWriter(x, mapper, type)
- {
- }
-
- ClLoadStoreBufferHelperWriter(const ClLoadStoreBufferHelperWriter &) = default;
-
- ClLoadStoreBufferHelperWriter &operator=(const ClLoadStoreBufferHelperWriter &) = default;
-
- static bool validate(IGpuKernelWriter *x, GpuTensor3dMapper mapper, GpuLoadStoreType type, IVectorTile *dst)
- {
- CKW_UNUSED(x, type, dst);
-
- if (mapper.gpu_sampler().storage != GpuSamplerTensorStorage::BufferUint8Ptr)
- {
- return false;
- }
- return true;
- }
-
- void initialize(IVectorTile *dst, IVectorTile *x, IVectorTile *z, IVectorTile *b) override
- {
- assert(validate(_writer, _mapper, _type, dst));
-
- _dst = dst;
- _ls_width_full = dst->format().w;
-
- _coord_x = x->scalar(0, 0).str;
- _coord_z = z->scalar(0, 0).str;
- _coord_b = b->scalar(0, 0).str;
- _coord_orig_z = _coord_z;
-
- out_of_bound_initialize_x(_coord_x);
- out_of_bound_initialize_z(_coord_z);
-
- /*
- meaning of else:
- - x: partial load/store
- - y: no load/store operation
- - z: no load/store operation
- if(x)
- {
- if(z)
- {
- if(y)
- {
- // full load/store width
- }
- else
- {
- // no load/store
- }
- }
- else
- {
- // no load/store
- }
- }
- else
- {
- if(z)
- {
- if(y)
- {
- // partial load/store width
- }
- else
- {
- // no load/store
- }
- }
- else
- {
- // no load/store
- }
- }
- */
- }
-
- void write(const std::pair<int32_t, std::string> &y) override
- {
- int32_t idx_y = y.first;
- std::string coord_y = y.second;
-
- // The only check required is on Y.
- out_of_bound_initialize_y(coord_y);
-
- const std::string dst = _dst->vector(idx_y).str;
- const std::string address = to_ls_buffer_address(_coord_x, coord_y, _coord_z, _coord_b);
- const std::string ls_buf = to_ls_buffer(_type, _ls_width_full, dst, address);
-
- _writer->write_text(ls_buf);
- _writer->write_text(";\n");
-
- out_of_bound_finalize_y(dst);
-
- // The left over load/store will be written in the finalize stage
- if (_ls_width_part.size() != 0)
- {
- int32_t w = 0;
- for (auto &p : _ls_width_part)
- {
- const std::string dst0 = _dst->vector(w, p, idx_y).str;
- const std::string coord_x = _coord_x + " + " + std::to_string(w);
- const std::string address = to_ls_buffer_address(coord_x, coord_y, _coord_z, _coord_b);
- const std::string ls_buf0 = to_ls_buffer(_type, p, dst0, address);
- _leftovers_x.push_back(std::make_pair(std::make_pair(dst0, coord_y), ls_buf0));
-
- w += p;
- }
- }
- }
-
- void finalize() override
- {
- out_of_bound_finalize_z();
- out_of_bound_finalize_x();
- }
-
-private:
- IVectorTile *_dst{nullptr};
- int32_t _ls_width_full{0};
- std::vector<int32_t> _ls_width_part{};
- std::vector<std::pair<std::pair<std::string, std::string>, std::string>> _leftovers_x{};
- std::string _coord_x{};
- std::string _coord_z{};
- std::string _coord_orig_z{};
- std::string _coord_b{};
-
- void out_of_bound_initialize_x(std::string &coord)
- {
- if (_mapper.gpu_sampler().address_mode_x == TensorSamplerAddressModeX::OverlappingMin)
- {
- auto tensor_format = _mapper.tensor_argument()->format();
- auto shape = tensor_format.shape;
-
- _ls_width_part = decompose_leftover_ls_vector_width(shape[0] % _ls_width_full);
- if (_ls_width_part.size() != 0)
- {
- _writer->write_text("if(" + coord + " > 0)\n");
- _writer->compound_statement_begin();
- }
- }
- };
-
- void out_of_bound_finalize_x()
- {
- if (_mapper.gpu_sampler().address_mode_x == TensorSamplerAddressModeX::OverlappingMin)
- {
- if (_ls_width_part.size() != 0)
- {
- _writer->compound_statement_end();
- _writer->write_text("else\n");
- _writer->compound_statement_begin();
-
- out_of_bound_initialize_z(_coord_orig_z);
- for (auto &i : _leftovers_x)
- {
- out_of_bound_initialize_y(i.first.second);
- _writer->write_text(i.second);
- _writer->write_text(";\n");
- out_of_bound_finalize_y(i.first.first);
- }
- out_of_bound_finalize_z();
- _writer->compound_statement_end();
- }
- }
- };
-
- void out_of_bound_initialize_y(std::string &coord)
- {
- std::string max = "";
-
- const auto address_mode_y = _mapper.gpu_sampler().address_mode_y;
-
- switch (address_mode_y)
- {
- case TensorSamplerAddressModeY::Skip:
- case TensorSamplerAddressModeY::ClampToBorder:
-                // NOTE: This line should not be moved outside of the switch statement.
-                // The reason is that querying the component marks it as used and adds it to the kernel's
-                // argument list. Since the component is not required in all cases, we should request it
-                // only when it is actually used.
- max = _mapper.tensor_component_y();
- _writer->write_text("if((" + coord + " >= 0) && (" + coord + " < " + max + "))\n");
- _writer->compound_statement_begin();
- break;
- case TensorSamplerAddressModeY::SkipMinEdgeOnly:
- case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly:
- _writer->write_text("if(" + coord + " >= 0)\n");
- _writer->compound_statement_begin();
- break;
- case TensorSamplerAddressModeY::SkipMaxEdgeOnly:
- case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly:
- max = _mapper.tensor_component_y();
- _writer->write_text("if(" + coord + " < " + max + ")\n");
- _writer->compound_statement_begin();
- break;
- case TensorSamplerAddressModeY::ClampToNearest:
- max = _mapper.tensor_component_y();
- coord = "clamp(" + coord + ", 0, " + max + " - 1)";
- break;
- case TensorSamplerAddressModeY::ClampToMaxEdgeOnly:
- max = _mapper.tensor_component_y();
- coord = "min(" + coord + ", " + max + " - 1)";
- break;
- case TensorSamplerAddressModeY::ClampToMinEdgeOnly:
- coord = "max(" + coord + ", 0)";
- break;
- case TensorSamplerAddressModeY::None:
- break;
- default:
- std::cout << "Unsupported address mode for write_out_of_bound_check_yz" << std::endl;
- assert(false);
- }
- };
-
- void out_of_bound_finalize_y(const std::string &dst)
- {
- const auto address_mode_y = _mapper.gpu_sampler().address_mode_y;
-
- switch (address_mode_y)
- {
- case TensorSamplerAddressModeY::ClampToBorder:
- case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly:
- case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly:
- case TensorSamplerAddressModeY::Skip:
- case TensorSamplerAddressModeY::SkipMaxEdgeOnly:
- case TensorSamplerAddressModeY::SkipMinEdgeOnly:
- _writer->compound_statement_end();
- break;
-            case TensorSamplerAddressModeY::ClampToNearest:
-            case TensorSamplerAddressModeY::ClampToMaxEdgeOnly:
-            case TensorSamplerAddressModeY::ClampToMinEdgeOnly:
-            case TensorSamplerAddressModeY::None:
-                // Clamp modes only rewrite the coordinate; there is no compound statement to close
-                break;
-
- default:
- assert(false);
- }
-
- switch (address_mode_y)
- {
- case TensorSamplerAddressModeY::ClampToBorder:
- case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly:
- case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly:
- _writer->write_text("else\n");
- _writer->compound_statement_begin();
- _writer->write_text(dst);
- _writer->write_text(" = 0.0f;\n");
- _writer->compound_statement_end();
- break;
-            case TensorSamplerAddressModeY::ClampToNearest:
-            case TensorSamplerAddressModeY::ClampToMaxEdgeOnly:
-            case TensorSamplerAddressModeY::ClampToMinEdgeOnly:
-            case TensorSamplerAddressModeY::None:
-                break;
-
- default:
- assert(false);
- }
- };
-
- void out_of_bound_initialize_z(std::string &coord)
- {
- std::string max = "";
-
- const auto address_mode_z = _mapper.gpu_sampler().address_mode_z;
-
- switch (address_mode_z)
- {
- case TensorSamplerAddressModeZ::Skip:
- max = _mapper.tensor_component_z();
- _writer->write_text("if((" + coord + " >= 0) && (" + coord + " < " + max + "))\n");
- _writer->compound_statement_begin();
- break;
- case TensorSamplerAddressModeZ::SkipMinEdgeOnly:
- _writer->write_text("if(" + coord + " >= 0)\n");
- _writer->compound_statement_begin();
- break;
- case TensorSamplerAddressModeZ::SkipMaxEdgeOnly:
- max = _mapper.tensor_component_z();
- _writer->write_text("if(" + coord + " < " + max + ")\n");
- _writer->compound_statement_begin();
- break;
- case TensorSamplerAddressModeZ::ClampToNearest:
- max = _mapper.tensor_component_z();
- coord = "clamp(" + coord + ", 0, " + max + " - 1)";
- break;
- case TensorSamplerAddressModeZ::ClampToMaxEdgeOnly:
- max = _mapper.tensor_component_z();
- coord = "min(" + coord + ", " + max + " - 1)";
- break;
- case TensorSamplerAddressModeZ::ClampToMinEdgeOnly:
- coord = "max(" + coord + ", 0)";
- break;
- case TensorSamplerAddressModeZ::None:
- break;
- default:
- std::cout << "Unsupported address mode for write_out_of_bound_check_yz" << std::endl;
- assert(false);
- }
- };
-
- void out_of_bound_finalize_z()
- {
- const auto address_mode_z = _mapper.gpu_sampler().address_mode_z;
-
- switch (address_mode_z)
- {
- case TensorSamplerAddressModeZ::Skip:
- case TensorSamplerAddressModeZ::SkipMinEdgeOnly:
- case TensorSamplerAddressModeZ::SkipMaxEdgeOnly:
- _writer->compound_statement_end();
- break;
-            case TensorSamplerAddressModeZ::ClampToNearest:
-            case TensorSamplerAddressModeZ::ClampToMaxEdgeOnly:
-            case TensorSamplerAddressModeZ::ClampToMinEdgeOnly:
-            case TensorSamplerAddressModeZ::None:
-                break;
-
- default:
- assert(false);
- }
- };
-
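-    // Decomposes a leftover width into OpenCL-supported vload/vstore widths,
-    // e.g. 13 -> {8, 4, 1} and 6 -> {4, 2}.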
- std::vector<int32_t> decompose_leftover_ls_vector_width(int32_t ls_leftover_vector_width) const
- {
- std::vector<int32_t> x;
-
- switch (ls_leftover_vector_width)
- {
- case 0:
- break;
- case 1:
- case 2:
- case 3:
- case 4:
- case 8:
- case 16:
- x.push_back(ls_leftover_vector_width);
- break;
- case 5:
- x.push_back(4);
- x.push_back(1);
- break;
- case 6:
- x.push_back(4);
- x.push_back(2);
- break;
- case 7:
- x.push_back(4);
- x.push_back(3);
- break;
- case 9:
- x.push_back(8);
- x.push_back(1);
- break;
- case 10:
- x.push_back(8);
- x.push_back(2);
- break;
- case 11:
- x.push_back(8);
- x.push_back(3);
- break;
- case 12:
- x.push_back(8);
- x.push_back(4);
- break;
- case 13:
- x.push_back(8);
- x.push_back(4);
- x.push_back(1);
- break;
- case 14:
- x.push_back(8);
- x.push_back(4);
- x.push_back(2);
- break;
- case 15:
- x.push_back(8);
- x.push_back(4);
- x.push_back(3);
- break;
-
- default:
- assert(false);
- }
- return x;
- }
-
- std::string
- to_ls_buffer(GpuLoadStoreType type, int32_t vector_width, const std::string &data, const std::string &address)
- {
- switch (type)
- {
- case GpuLoadStoreType::Load:
- if (vector_width != 1)
- {
- return data + " = vload" + std::to_string(vector_width) + "(0, " + address + ")";
- }
- else
- {
- return data + " = *(" + address + ")";
- }
- break;
- case GpuLoadStoreType::Store:
- if (vector_width != 1)
- {
- return "vstore" + std::to_string(vector_width) + "(" + data + ", 0, " + address + ")";
- }
- else
- {
- return "*(" + address + ") = " + data;
- }
- break;
- default:
- std::cout << "Unsupported GpuLoadStoreType" << std::endl;
- assert(false);
- return "";
- }
- }
-
- std::string
- to_ls_buffer_address(const std::string &x, const std::string &y, const std::string &z, const std::string &b) const
- {
- auto tensor_storage = static_cast<GpuTensorStorage>(_mapper.gpu_sampler().storage);
- assert(tensor_storage == GpuTensorStorage::BufferUint8Ptr);
- const std::string ptr_buf = _mapper.tensor_argument()->storage(tensor_storage);
- const std::string dst_type = get_cl_data_type(_dst->format().dt, 1);
-
- std::string address;
- address += "(__global ";
- address += dst_type;
- address += "*)(";
- address += ptr_buf;
- if (x != "0" && (_mapper.is_one_component_x() != true))
- {
- address += " + (";
- address += x + ") * sizeof(" + dst_type + ")";
- }
- if (y != "0")
- {
- const std::string stride_y = _mapper.tensor_component_stride_y();
- address += " + (";
- address += y + ")";
- address += " * ";
- address += stride_y;
- }
- if (z != "0")
- {
- const std::string stride_z = _mapper.tensor_component_stride_z();
- address += " + (";
- address += z + ")";
- address += " * ";
- address += stride_z;
- }
- if (b != "0" && (_mapper.is_one_component_batch() != true))
- {
- const std::string stride_b = _mapper.tensor_component_stride_batch();
- address += " + (";
- address += b + ")";
- address += " * ";
- address += stride_b;
- }
- address += ")";
- return address;
- }
-};
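-// Illustrative example of what ClLoadStoreBufferHelperWriter emits for a 4-wide fp32 load
-// (tensor, tile and coordinate names are made up):
-//
-//   dst0 = vload4(0, (__global float*)(src_ptr + (x) * sizeof(float) + (y) * src_stride_y));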
-
-class ClLoadStoreImage2dHelperWriter : public IGpuLoadStoreHelperWriter
-{
-public:
- static bool validate(IGpuKernelWriter *x, const GpuTensor3dMapper &mapper, GpuLoadStoreType type, IVectorTile *dst)
- {
- CKW_UNUSED(x);
-
- if (dst->format().w != 4)
- {
- return false;
- }
- if (mapper.gpu_sampler().address_mode_x != TensorSamplerAddressModeX::None)
- {
- return false;
- }
- if (mapper.gpu_sampler().address_mode_z != TensorSamplerAddressModeZ::None)
- {
- return false;
- }
- if (mapper.gpu_sampler().storage != GpuSamplerTensorStorage::Image2dReadOnly && type == GpuLoadStoreType::Load)
- {
- return false;
- }
- if (mapper.gpu_sampler().storage != GpuSamplerTensorStorage::Image2dWriteOnly &&
- type == GpuLoadStoreType::Store)
- {
- return false;
- }
- if ((dst->format().dt != DataType::Fp32) && (dst->format().dt != DataType::Fp16))
- {
- return false;
- }
- return true;
- /*
- - x: Only GpuSamplerAddressModeX::None is supported and vector length = 4
- - z: Only GpuSamplerAddressModeZ::None is supported
- */
- }
-
- ClLoadStoreImage2dHelperWriter(IGpuKernelWriter *x, const GpuTensor3dMapper &mapper, GpuLoadStoreType type)
- : IGpuLoadStoreHelperWriter(x, mapper, type)
- {
- }
-
- ClLoadStoreImage2dHelperWriter(const ClLoadStoreImage2dHelperWriter &) = default;
-
- ClLoadStoreImage2dHelperWriter &operator=(const ClLoadStoreImage2dHelperWriter &) = default;
-
- void initialize(IVectorTile *dst, IVectorTile *x, IVectorTile *z, IVectorTile *b) override
- {
- assert(validate(_writer, _mapper, _type, dst));
-
- _dst = dst;
- _ls_width_full = dst->format().w;
- _coord_x = x->scalar(0, 0).str;
- _coord_z = z->scalar(0, 0).str;
- _coord_b = b->scalar(0, 0).str;
-
- /*
- if(y)
- {
- // full load/store width
- }
- else
- {
- // no load/store
- }
- */
- }
-
- void write(const std::pair<int32_t, std::string> &y) override
- {
- int32_t idx_y = y.first;
- std::string coord_y = y.second;
-
- // The only check required is on Y.
- out_of_bound_initialize_y(coord_y);
-
- const std::string dst = _dst->vector(idx_y).str;
- const std::string sampler = to_ls_image2d_sampler();
- const std::string coord = to_ls_image2d_coord(_coord_x, coord_y, _coord_z, _coord_b);
- const std::string ls_buf = to_ls_image2d(_type, _ls_width_full, dst, sampler, coord);
-
- _writer->write_text(ls_buf);
- _writer->write_text(";\n");
-
- out_of_bound_finalize_y(dst);
- }
-
- void finalize() override
- {
- }
-
-private:
- IVectorTile *_dst{nullptr};
- int32_t _ls_width_full{0};
- std::string _coord_x{};
- std::string _coord_z{};
- std::string _coord_b{};
-
- void out_of_bound_initialize_y(std::string &coord)
- {
- std::string max = "";
-
- const auto address_mode_y = _mapper.gpu_sampler().address_mode_y;
-
- switch (address_mode_y)
- {
- case TensorSamplerAddressModeY::Skip:
- max = _mapper.tensor_component_y();
- _writer->write_text("if((" + coord + " >= 0) && (" + coord + " < " + max + "))\n");
- _writer->compound_statement_begin();
- break;
- case TensorSamplerAddressModeY::SkipMinEdgeOnly:
- _writer->write_text("if(" + coord + " >= 0)\n");
- _writer->compound_statement_begin();
- break;
- case TensorSamplerAddressModeY::SkipMaxEdgeOnly:
- max = _mapper.tensor_component_y();
- _writer->write_text("if(" + coord + " < " + max + ")\n");
- _writer->compound_statement_begin();
- break;
- case TensorSamplerAddressModeY::ClampToBorder:
- case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly:
- case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly:
- case TensorSamplerAddressModeY::ClampToNearest:
- case TensorSamplerAddressModeY::ClampToMaxEdgeOnly:
- case TensorSamplerAddressModeY::ClampToMinEdgeOnly:
- case TensorSamplerAddressModeY::None:
- break;
- default:
- std::cout << "Unsupported address mode for write_out_of_bound_check_y" << std::endl;
- assert(false);
- }
- };
-
- void out_of_bound_finalize_y(const std::string &dst)
- {
- CKW_UNUSED(dst);
-
- const auto address_mode_y = _mapper.gpu_sampler().address_mode_y;
-
- switch (address_mode_y)
- {
- case TensorSamplerAddressModeY::Skip:
- case TensorSamplerAddressModeY::SkipMinEdgeOnly:
- case TensorSamplerAddressModeY::SkipMaxEdgeOnly:
- _writer->compound_statement_end();
- break;
-            case TensorSamplerAddressModeY::ClampToBorder:
-            case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly:
-            case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly:
-            case TensorSamplerAddressModeY::ClampToNearest:
-            case TensorSamplerAddressModeY::ClampToMaxEdgeOnly:
-            case TensorSamplerAddressModeY::ClampToMinEdgeOnly:
-            case TensorSamplerAddressModeY::None:
-                // These modes open no compound statement in the initialize stage, so nothing to close
-                break;
-
- default:
- assert(false);
- }
- };
-
- std::string to_ls_image2d(GpuLoadStoreType type,
- int32_t vector_width,
- const std::string &data,
- const std::string &sampler,
- const std::string &coord)
- {
- CKW_UNUSED(vector_width);
-
- auto tensor_storage = static_cast<GpuTensorStorage>(_mapper.gpu_sampler().storage);
- const std::string image2d_obj = _mapper.tensor_argument()->storage(tensor_storage);
- const std::string post_fix = _dst->format().dt == DataType::Fp32 ? "f" : "h";
-
- switch (type)
- {
- case GpuLoadStoreType::Load:
- return data + " = read_image" + post_fix + "(" + image2d_obj + ", " + sampler + ", " + coord + ")";
- break;
- case GpuLoadStoreType::Store:
- return "write_image" + post_fix + "(" + image2d_obj + ", " + coord + ", " + data + ")";
- default:
-                std::cout << "Unsupported GpuLoadStoreType" << std::endl;
-                assert(false);
- return "";
- }
- }
-
- std::string to_ls_image2d_sampler() const
- {
- const auto address_mode_y = _mapper.gpu_sampler().address_mode_y;
-
- switch (address_mode_y)
- {
- case TensorSamplerAddressModeY::None:
- return "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST";
- case TensorSamplerAddressModeY::Skip:
- case TensorSamplerAddressModeY::SkipMinEdgeOnly:
- case TensorSamplerAddressModeY::SkipMaxEdgeOnly:
- case TensorSamplerAddressModeY::ClampToBorder:
- case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly:
- case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly:
- return "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST";
- case TensorSamplerAddressModeY::ClampToNearest:
- case TensorSamplerAddressModeY::ClampToMaxEdgeOnly:
- case TensorSamplerAddressModeY::ClampToMinEdgeOnly:
- return "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST";
- default:
- std::cout << "Unsupported address_mode_coord" << std::endl;
- assert(false);
- return "";
- }
- }
-
- std::string
- to_ls_image2d_coord(const std::string &x, const std::string &y, const std::string &z, const std::string &b) const
- {
- std::string coord_x = "(" + x + ") >> 2";
- std::string coord_y = "(";
-
- if (y != "0")
- {
- coord_y += y;
- }
- if (z != "0" && (_mapper.is_one_component_z() != true))
- {
- const std::string dim = _mapper.tensor_component_y();
- coord_y += " + (";
- coord_y += z + ")";
- coord_y += " * ";
- coord_y += dim;
- }
- if (b != "0" && (_mapper.is_one_component_batch() != true))
- {
- const std::string dim0 = _mapper.tensor_component_y();
- const std::string dim1 = _mapper.tensor_component_z();
- coord_y += " + (";
- coord_y += b + ")";
- coord_y += " * ";
- coord_y += dim0;
- coord_y += " * ";
- coord_y += dim1;
- }
- coord_y += ")";
- return "(int2)(" + coord_x + ", " + coord_y + ")";
- }
-};
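-// Illustrative example of what ClLoadStoreImage2dHelperWriter emits for an fp32 load
-// (object and coordinate names are made up):
-//
-//   dst0 = read_imagef(src_img, CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST,
-//                      (int2)((x) >> 2, (y + (z) * src_dim1)));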
-
-/** IGpuLoadStoreHelperWriter factory class */
-class ClLoadStoreHelperWriterFactory final
-{
-public:
-    /** Static method to create the appropriate IGpuLoadStoreHelperWriter for the tensor storage set in the mapper
-     *
-     * @return IGpuLoadStoreHelperWriter
- */
- static std::unique_ptr<IGpuLoadStoreHelperWriter>
- create(IGpuKernelWriter *x, const GpuTensor3dMapper &mapper, GpuLoadStoreType type)
- {
- const auto tensor_storage = mapper.gpu_sampler().storage;
- switch (tensor_storage)
- {
- case GpuSamplerTensorStorage::BufferUint8Ptr:
- return std::make_unique<ClLoadStoreBufferHelperWriter>(x, mapper, type);
- case GpuSamplerTensorStorage::Image2dReadOnly:
- case GpuSamplerTensorStorage::Image2dWriteOnly:
- return std::make_unique<ClLoadStoreImage2dHelperWriter>(x, mapper, type);
- default:
- std::cout << "Unsupported Gpu tensor storage" << std::endl;
- assert(false);
- return nullptr;
- }
- }
-};
-
-// This utility method needs to go in utils.h
-inline bool is_tile_scalar(const IVectorTile *x)
-{
- return x->format().w == 1 && x->format().h == 1;
-}
-
-class ClKernelWriter : public IGpuKernelWriter
-{
-public:
- ClKernelWriter(GpuKernelWriterAttribute *attr, GpuKernelWriterDataHolder *x)
- {
- _data = x;
- _attr = attr;
- }
-
- ClKernelWriter(const ClKernelWriter &) = default;
-
- ClKernelWriter &operator=(const ClKernelWriter &) = default;
-
-    // An IdSpace ID is a term used to describe a fragment that is registered in ICode to ensure
-    // there are no conflicts or ambiguity in the code
- void set_IdSpace(int32_t id) override
- {
- _data->tiles.set_IdSpace(id);
- _data->arguments.set_IdSpace(id);
- }
-
- void import_tile(const std::string &dst_name, const IVectorTile *src) override
- {
- _data->tiles.insert(dst_name, src);
- }
-
- void declare_argument(const std::string &name, const TensorInfo &tensor) override
- {
- assert(_data->arguments[name] == nullptr);
- _data->arguments.insert(name, tensor, _attr->return_tensor_component_by_value);
- }
-
- void declare_tile(const std::string &name, const TileInfo &format) override
- {
- assert(_data->tiles[name] == nullptr);
- _data->tiles.insert(name, format);
-
- IVectorTile *x = _data->tiles[name];
-
- for (auto &t : x->underlying_source_variables())
- {
- _data->code += t.type.str + " " + t.str + ";\n";
- }
- }
-
- void
- declare_const_tile(const std::string &name, const std::vector<std::vector<std::string>> &in, DataType dt) override
- {
- assert(_data->tiles[name] == nullptr);
- _data->tiles.insert(name, in, dt);
- // Note: A constant does not need to be declared in the code
- }
-
- void write_text(const std::string &x) override
- {
- _data->code += x;
- }
-
- void compound_statement_begin() override
- {
- _data->tiles.increment_registry_level();
- _data->code += "{\n";
- }
-
- void compound_statement_end() override
- {
- _data->tiles.decrement_registry_level();
- _data->code += "}\n";
- }
-
- void op_get_global_id(const Operand &dst_var, int32_t dim) override
- {
- assert(dst_var.type() == OperandType::Tile);
- assert(_data->tiles.has_tile(dst_var.value()));
- assert(_data->tiles[dst_var.value()]->format().w == 1 &&
- _data->tiles[dst_var.value()]->format().h == 1); // It must be a scalar variable
-
- auto var = _data->tiles[dst_var.value()];
-
- _data->code += var->scalar(0, 0).str;
- _data->code += " = get_global_id(";
- _data->code += std::to_string(dim);
- _data->code += ");\n";
- };
-
- void op_get_global_coord(const Operand &o_dst,
- const Operand &o_step,
- const TensorOperand &o_tensor,
- int32_t dim) override
- {
- OperandUnpacker operands(_data->tiles, _data->arguments);
- auto dst = operands.unpack(o_dst);
- auto step = operands.unpack(o_step);
-
- // Validation: Check that x, y and z are scalar
-
- TensorOperandUnpacker tensor_operands(_data->arguments);
- auto tensor = tensor_operands.unpack(o_tensor);
- auto gpu_sampler = o_tensor.sampler();
-
- GpuTensor3dMapper mapper(tensor, gpu_sampler);
-
- switch (dim)
- {
- case 0:
- if (mapper.is_one_component_x())
- {
- _data->code += dst->scalar(0, 0).str;
- _data->code += " = 0;\n";
- }
- else
- {
- if (mapper.gpu_sampler().address_mode_x == TensorSamplerAddressModeX::OverlappingMin)
- {
- // Validation: Check: fixed tensor shape
- // TO BE CHANGED
- _data->code += dst->scalar(0, 0).str;
- _data->code += " = get_global_id(0) * ";
- _data->code += step->scalar(0, 0).str;
- _data->code += ";\n";
- }
- else
- {
- _data->code += dst->scalar(0, 0).str;
- _data->code += " = get_global_id(0) * ";
- _data->code += step->scalar(0, 0).str;
- _data->code += ";\n";
- }
- }
- break;
- case 1:
- if (mapper.is_one_component_y())
- {
- _data->code += dst->scalar(0, 0).str;
- _data->code += " = 0;\n";
- }
- else
- {
- if (mapper.gpu_sampler().address_mode_y == TensorSamplerAddressModeY::OverlappingMin)
- {
- }
- else
- {
- _data->code += dst->scalar(0, 0).str;
- _data->code += " = get_global_id(1) * ";
- _data->code += step->scalar(0, 0).str;
- _data->code += ";\n";
- }
- }
- break;
- case 2:
- if (mapper.is_one_component_z())
- {
- _data->code += dst->scalar(0, 0).str;
- _data->code += " = 0;\n";
- }
- else
- {
- _data->code += dst->scalar(0, 0).str;
- _data->code += " = get_global_id(2) * ";
- _data->code += step->scalar(0, 0).str;
- _data->code += ";\n";
- }
- break;
- default:
- break;
- }
- };
-
- void op_get_global_batch(const Operand &o_dst, const TensorOperand &o_tensor) override
- {
- OperandUnpacker operands(_data->tiles, _data->arguments);
- const IVectorTile *dst = operands.unpack(o_dst);
-
- TensorOperandUnpacker tensor_operands(_data->arguments);
- IGpuTensorArgument *tensor = tensor_operands.unpack(o_tensor);
- auto gpu_sampler = o_tensor.sampler();
-
- GpuTensor3dMapper mapper(tensor, gpu_sampler);
-
- if (mapper.is_one_component_batch())
- {
- _data->code += dst->scalar(0, 0).str;
- _data->code += " = 0;\n";
- }
- else
- {
- std::cout << "Unsupported batched computation" << std::endl;
- assert(false);
- }
- };
-
- void op_get_global_size(const Operand &dst_var, int32_t dim) override
- {
- assert(dst_var.type() == OperandType::Tile);
- assert(_data->tiles.has_tile(dst_var.value()));
- assert(_data->tiles[dst_var.value()]->format().w == 1 &&
- _data->tiles[dst_var.value()]->format().h == 1); // It must be a scalar variable
-
- auto var = _data->tiles[dst_var.value()];
-
- _data->code += var->scalar(0, 0).str;
- _data->code += " = get_global_size(";
- _data->code += std::to_string(dim);
- _data->code += ");\n";
- }
-
- void op_unary_expression(const Operand &dst_name, UnaryOp op, const Operand &src_name) override
- {
- OperandUnpacker operands(_data->tiles, _data->arguments);
- const IVectorTile *src = operands.unpack(src_name);
- const IVectorTile *dst = operands.unpack(dst_name);
-
- const int32_t dst_w = dst->format().w;
- const int32_t dst_h = dst->format().h;
- const int32_t src_w = src->format().w;
- const std::string dt = dst->underlying_source_variables()[0].type.str;
-
- const bool broadcast_src_x = dst_w != 1 && src_w == 1;
-
- const std::string src_prefix = broadcast_src_x ? "(" + dt + ")" : "";
-
- // Broadcasting on Y is automatic
- for (int32_t y = 0; y < dst_h; ++y)
- {
- _data->code += dst->vector(y).str;
- _data->code += " = ";
- _data->code += to_string(op);
- _data->code += src_prefix + src->vector(y).str;
- _data->code += ";\n";
- }
- }
-
- void op_binary_expression(const Operand &dst_name,
- const Operand &lhs_name,
- BinaryOp op,
- const Operand &rhs_name) override
- {
- OperandUnpacker operands(_data->tiles, _data->arguments);
- const IVectorTile *lhs = operands.unpack(lhs_name);
- const IVectorTile *rhs = operands.unpack(rhs_name);
- const IVectorTile *dst = operands.unpack(dst_name);
-
- const int32_t dst_w = dst->format().w;
- const int32_t dst_h = dst->format().h;
- assert(lhs != nullptr);
- const int32_t lhs_w = lhs->format().w;
- const int32_t rhs_w = rhs->format().w;
-
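-        // MatMul_Nt_T: dst(x, y) += lhs(k, y) * rhs(k, x) accumulated over k with fma,
-        // i.e. LHS non-transposed, RHS transposed.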
- if (op == BinaryOp::MatMul_Nt_T)
- {
- assert((dst->format().dt == DataType::Fp32) || (dst->format().dt == DataType::Fp16));
- for (int32_t y = 0; y < dst_h; ++y)
- {
- for (int32_t x = 0; x < dst_w; ++x)
- {
- for (int32_t k = 0; k < lhs_w; ++k)
- {
- _data->code += dst->scalar(x, y).str;
- _data->code += " = fma(";
- _data->code += lhs->scalar(k, y).str;
- _data->code += ", ";
- _data->code += rhs->scalar(k, x).str;
- _data->code += ", ";
- _data->code += dst->scalar(x, y).str;
- _data->code += ");\n";
- }
- }
- }
-
- return;
- }
-
- const bool broadcast_lhs_x = dst_w != 1 && lhs_w == 1;
- const bool broadcast_rhs_x = dst_w != 1 && rhs_w == 1;
-
- const std::string lhs_prefix =
- broadcast_lhs_x ? "(" + dst->underlying_source_variables()[0].type.str + ")" : "";
- const std::string rhs_prefix =
- broadcast_rhs_x ? "(" + dst->underlying_source_variables()[0].type.str + ")" : "";
- const std::string op_str = to_string(op);
-
- // Broadcasting on Y is automatic
- for (int32_t y = 0; y < dst_h; ++y)
- {
- _data->code += dst->vector(y).str;
- _data->code += " = ";
- _data->code += lhs_prefix + lhs->vector(y).str;
- _data->code += " ";
- _data->code += op_str;
- _data->code += " ";
- _data->code += rhs_prefix + rhs->vector(y).str;
- _data->code += ";\n";
- }
- };
-
- void op_cast_expression(const Operand &o_dst, const Operand &o_src, ConvertPolicy policy) override
- {
- OperandUnpacker operands(_data->tiles, _data->arguments);
- const IVectorTile *src = operands.unpack(o_src);
- const IVectorTile *dst = operands.unpack(o_dst);
- // const int32_t dst_w = dst->format().w;
- const int32_t dst_h = dst->format().h;
- const std::string dt = dst->underlying_source_variables()[0].type.str;
- const bool is_float = (dst->format().dt == DataType::Fp32) || (dst->format().dt == DataType::Fp16);
- const std::string sat = ((policy == ConvertPolicy::Saturate && !is_float) ? "_sat" : "");
-
- // Broadcasting on Y is automatic
- for (int32_t y = 0; y < dst_h; ++y)
- {
- _data->code += dst->vector(y).str;
- _data->code += " = convert_" + dt + sat + "(";
- _data->code += src->vector(y).str;
- _data->code += ");\n";
- }
- };
-
- void op_assign(const Operand &dst_name, const Operand &src_name) override
- {
- OperandUnpacker operands(_data->tiles, _data->arguments);
- const IVectorTile *src = operands.unpack(src_name);
- const IVectorTile *dst = operands.unpack(dst_name);
-
- const int32_t dst_w = dst->format().w;
- const int32_t dst_h = dst->format().h;
- const int32_t src_w = src->format().w;
- const std::string dt = dst->underlying_source_variables()[0].type.str;
-
- const bool broadcast_src_x = dst_w != 1 && src_w == 1;
-
- const std::string src_prefix = broadcast_src_x ? "(" + dt + ")" : "";
-
- // Broadcasting on Y is automatic
- for (int32_t y = 0; y < dst_h; ++y)
- {
- _data->code += dst->vector(y).str;
- _data->code += " = ";
- _data->code += src_prefix + src->vector(y).str;
- _data->code += ";\n";
- }
- }
-
- void op_unary_elementwise_function(const Operand &dst_name, UnaryFunction func, const Operand &src_name) override
- {
- OperandUnpacker operands(_data->tiles, _data->arguments);
- const IVectorTile *src = operands.unpack(src_name);
- const IVectorTile *dst = operands.unpack(dst_name);
-
- const int32_t dst_h = dst->format().h;
- const std::string dt = dst->underlying_source_variables()[0].type.str;
-
- // Always perform an explicit cast. This automatically covers at least the 2 scenarios:
- // 1. Widen a scalar into a vector type. This enables scalar-vector broadcasting
- // 2. Ensure non-ambiguity over function overloads.
- // E.g. a constant tile may be accidentally initialized with a double literal. By casting it to single float,
- // it avoids ambiguous function calls
- const std::string src_prefix = "(" + dt + ")";
-
- // Broadcasting on Y is automatic
- for (int32_t y = 0; y < dst_h; ++y)
- {
- _data->code += dst->vector(y).str;
- _data->code += " = ";
-
- switch (func)
- {
- case UnaryFunction::Exp:
- _data->code += "exp(";
- break;
- case UnaryFunction::Tanh:
- _data->code += "tanh(";
- break;
- case UnaryFunction::Sqrt:
- _data->code += "sqrt(";
- break;
- case UnaryFunction::Erf:
- _data->code += "erf(";
- break;
- case UnaryFunction::Fabs:
- _data->code += "fabs(";
- break;
- case UnaryFunction::Log:
- _data->code += "log(";
- break;
- case UnaryFunction::SizeOf:
- _data->code += "sizeof(";
- break;
- case UnaryFunction::Round:
- _data->code += "round(";
- break;
- case UnaryFunction::Floor:
- _data->code += "floor(";
- break;
- default:
- CKW_ASSERT_MSG(false, "Unexpected UnaryFunction used.");
- }
-
- _data->code += src_prefix + src->vector(y).str;
- _data->code += ");\n";
- }
- }
-
- void op_binary_elementwise_function(const Operand &dst_name,
- BinaryFunction func,
- const Operand &first_name,
- const Operand &second_name) override
- {
- OperandUnpacker operands(_data->tiles, _data->arguments);
- const IVectorTile *first = operands.unpack(first_name);
- const IVectorTile *second = operands.unpack(second_name);
- const IVectorTile *dst = operands.unpack(dst_name);
-
- const int32_t dst_h = dst->format().h;
- const auto datatype = dst->underlying_source_variables()[0].type;
- const std::string datatype_str = datatype.str;
-
- // Always perform an explicit cast. See similar comments in op_unary_elementwise_function
- const std::string first_prefix = "(" + datatype_str + ")";
- const std::string second_prefix = "(" + datatype_str + ")";
-
- const bool is_float = (datatype.dt == DataType::Fp32 || datatype.dt == DataType::Fp16);
-
- // Broadcasting on Y is automatic
- for (int32_t y = 0; y < dst_h; ++y)
- {
- _data->code += dst->vector(y).str;
- _data->code += " = ";
-
- switch (func)
- {
- case BinaryFunction::Min:
- _data->code += is_float ? "fmin(" : "min(";
- break;
- case BinaryFunction::Max:
- _data->code += is_float ? "fmax(" : "max(";
- break;
- default:
- CKW_ASSERT_MSG(false, "Unexpected BinaryFunction used.");
- }
-
- _data->code += first_prefix + first->vector(y).str;
- _data->code += ", ";
- _data->code += second_prefix + second->vector(y).str;
- _data->code += ");\n";
- }
- }
-
- void op_ternary_elementwise_function(const Operand &dst_name,
- TernaryFunction func,
- const Operand &first_name,
- const Operand &second_name,
- const Operand &third_name) override
- {
- OperandUnpacker operands(_data->tiles, _data->arguments);
- const IVectorTile *first = operands.unpack(first_name);
- const IVectorTile *second = operands.unpack(second_name);
- const IVectorTile *third = operands.unpack(third_name);
- const IVectorTile *dst = operands.unpack(dst_name);
-
- const int32_t dst_h = dst->format().h;
- const std::string dt = dst->underlying_source_variables()[0].type.str;
-
- // Always perform an explicit cast. See similar comments in op_unary_elementwise_function
- const std::string first_prefix = "(" + dt + ")";
- const std::string second_prefix = "(" + dt + ")";
- const std::string third_prefix = "(" + dt + ")";
-
- // Broadcasting on Y is automatic
- for (int32_t y = 0; y < dst_h; ++y)
- {
- _data->code += dst->vector(y).str;
- _data->code += " = ";
-
- switch (func)
- {
- case TernaryFunction::Select:
- _data->code += "select(";
- break;
- case TernaryFunction::Clamp:
- _data->code += "clamp(";
- break;
- default:
- CKW_ASSERT_MSG(false, "Unexpected TernaryFunction used.");
- }
-
- _data->code += first_prefix + first->vector(y).str;
- _data->code += ", ";
- _data->code += second_prefix + second->vector(y).str;
- _data->code += ", ";
- _data->code += third_prefix + third->vector(y).str;
- _data->code += ");\n";
- }
- }
-
- void op_if_header(const Operand &o_lhs, BinaryOp op, const Operand &o_rhs) override
- {
- OperandUnpacker operands(_data->tiles, _data->arguments);
- const IVectorTile *lhs = operands.unpack(o_lhs);
- const IVectorTile *rhs = operands.unpack(o_rhs);
-
- assert(is_tile_scalar(lhs));
- assert(is_tile_scalar(rhs));
-
- _data->code += "if(";
- _data->code += lhs->scalar(0, 0).str;
- _data->code += " ";
- _data->code += to_string(op);
- _data->code += " ";
- _data->code += rhs->scalar(0, 0).str;
- _data->code += ")\n";
- }
-
- void op_else_if_header(const Operand &o_lhs, BinaryOp op, const Operand &o_rhs) override
- {
- _data->code += "else ";
- op_if_header(o_lhs, op, o_rhs);
- }
-
- void op_else_header() override
- {
- _data->code += "else\n";
- }
-
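-    // Emits only the loop header, e.g. "for(; i < 16; i += 4)"; the caller is expected to open and
-    // close the loop body with compound_statement_begin() / compound_statement_end().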
- void op_for_loop_header(const Operand &var_name,
- BinaryOp cond_op,
- const Operand &cond_value_name,
- const Operand &update_var_name,
- AssignmentOp update_op,
- const Operand &update_value_name) override
- {
- OperandUnpacker operands(_data->tiles, _data->arguments);
- const IVectorTile *var = operands.unpack(var_name);
- const IVectorTile *cond_value = operands.unpack(cond_value_name);
- const IVectorTile *update_var = operands.unpack(update_var_name);
- const IVectorTile *update_value = operands.unpack(update_value_name);
-
- const int32_t dst_w = var->format().w;
- const int32_t dst_h = var->format().h;
-
- // It must be a scalar variable
- CKW_UNUSED(dst_w, dst_h);
- assert(dst_w == 1);
- assert(dst_h == 1);
-
- _data->code += "for(; ";
- _data->code += var->scalar(0, 0).str;
- _data->code += " ";
- _data->code += to_string(cond_op);
- _data->code += " " + cond_value->scalar(0, 0).str + "; ";
- _data->code += update_var->scalar(0, 0).str;
- _data->code += " ";
- _data->code += to_string(update_op);
- _data->code += " " + update_value->scalar(0, 0).str + ")";
- _data->code += "\n";
- }
-
- void op_load_immediate(const TensorOperand &o_tensor,
- const Operand &o_dst,
- const Operand &o_x,
- const Operand &o_y,
- const Operand &o_z,
- const Operand &o_batch_idx,
- const Operand &dilation_y) override
- {
- OperandUnpacker operands(_data->tiles, _data->arguments);
-
- // Not const as it requires changes to 'load_writer'.
- IVectorTile *dst = operands.unpack(o_dst);
- IVectorTile *x = operands.unpack(o_x);
- IVectorTile *y = operands.unpack(o_y);
- IVectorTile *z = operands.unpack(o_z);
- IVectorTile *dil_y = operands.unpack(dilation_y);
- IVectorTile *b = operands.unpack(o_batch_idx);
-
- TensorOperandUnpacker tensor_operands(_data->arguments);
- IGpuTensorArgument *tensor = tensor_operands.unpack(o_tensor);
- auto gpu_sampler = o_tensor.sampler();
-
- GpuTensor3dMapper mapper(tensor, gpu_sampler);
-
- auto load_writer = ClLoadStoreHelperWriterFactory::create(this, mapper, GpuLoadStoreType::Load);
-
- // Initialize the constant part
- load_writer->initialize(dst, x, z, b);
-
- for (int i = 0; i < dst->format().h; ++i)
- {
- std::string coord_y = y->scalar(0, 0).str + " + " + std::to_string(i);
- if (dil_y->scalar(0, 0).str != "1")
- {
- coord_y += " * " + dil_y->scalar(0, 0).str;
- }
- load_writer->write(std::make_pair(i, coord_y));
- }
-
- load_writer->finalize();
- }
-
- void op_load_indirect(const TensorOperand &o_tensor,
- const Operand &o_dst,
- const Operand &o_x,
- const Operand &o_indirect_h,
- const Operand &o_z,
- const Operand &o_batch_idx) override
- {
- OperandUnpacker operands(_data->tiles, _data->arguments);
-
- // Not const as it requires changes to 'load_writer'.
- IVectorTile *dst = operands.unpack(o_dst);
- IVectorTile *x = operands.unpack(o_x);
- IVectorTile *y_ind = operands.unpack(o_indirect_h);
- IVectorTile *z = operands.unpack(o_z);
- IVectorTile *b = operands.unpack(o_batch_idx);
-
- TensorOperandUnpacker tensor_operands(_data->arguments);
- IGpuTensorArgument *tensor = tensor_operands.unpack(o_tensor);
- auto gpu_sampler = o_tensor.sampler();
-
- GpuTensor3dMapper mapper(tensor, gpu_sampler);
-
- auto load_writer = ClLoadStoreHelperWriterFactory::create(this, mapper, GpuLoadStoreType::Load);
-
- // Initialize the constant part
- load_writer->initialize(dst, x, z, b);
-
- for (int i = 0; i < dst->format().h; ++i)
- {
- load_writer->write(std::make_pair(i, y_ind->scalar(0, i).str));
- }
-
- load_writer->finalize();
- }
-
- void op_store_immediate(const TensorOperand &tensor_name,
- const Operand &src_name,
- const Operand &x_name,
- const Operand &y_name,
- const Operand &z_name,
- const Operand &batch_index_name) override
- {
- OperandUnpacker operands(_data->tiles, _data->arguments);
-
- // Not const as it requires changes to 'load_writer'.
- IVectorTile *src = operands.unpack(src_name);
- IVectorTile *x = operands.unpack(x_name);
- IVectorTile *y = operands.unpack(y_name);
- IVectorTile *z = operands.unpack(z_name);
- IVectorTile *b = operands.unpack(batch_index_name);
-
- TensorOperandUnpacker tensor_operands(_data->arguments);
- IGpuTensorArgument *tensor = tensor_operands.unpack(tensor_name);
- auto gpu_sampler = tensor_name.sampler();
-
- GpuTensor3dMapper mapper(tensor, gpu_sampler);
-
- auto store_writer = ClLoadStoreHelperWriterFactory::create(this, mapper, GpuLoadStoreType::Store);
-
- // Initialize the constant part
- store_writer->initialize(src, x, z, b);
-
- int32_t tile_h = src->format().h;
-
- for (int m0 = tile_h - 1; m0 >= 0; m0--)
- {
- store_writer->write(std::make_pair(m0, y->scalar(0, 0).str + " + " + std::to_string(m0)));
- }
-
- store_writer->finalize();
- }
-
- void op_return() override
- {
- _data->code += "return;\n";
- }
-
- void util_get_indirect_buffer(const Operand &o_dst,
- const TensorOperand &o_tensor,
- const Operand &o_x,
- const Operand &o_y,
- const Operand &o_x_off,
- const Operand &o_y_off) override
- {
- OperandUnpacker operands(_data->tiles, _data->arguments);
- const IVectorTile *dst = operands.unpack(o_dst);
- const IVectorTile *x = operands.unpack(o_x);
- const IVectorTile *y = operands.unpack(o_y);
- const IVectorTile *x_off = operands.unpack(o_x_off);
- const IVectorTile *y_off = operands.unpack(o_y_off);
-
- TensorOperandUnpacker tensor_operands(_data->arguments);
- IGpuTensorArgument *tensor = tensor_operands.unpack(o_tensor);
-
- assert(dst->format().w == 1);
- assert(x->format().w == 1);
- assert(y->format().w == 1);
- assert(x_off->format().w == 1);
- assert(y_off->format().w == 1);
- assert(dst->format().dt == DataType::Int32);
- assert(x->format().dt == DataType::Int32);
- assert(y->format().dt == DataType::Int32);
- assert(x_off->format().dt == DataType::Int32);
- assert(y_off->format().dt == DataType::Int32);
-
- const std::string width = tensor->component(TensorComponentType::Dim1);
- const std::string height = tensor->component(TensorComponentType::Dim2);
- const std::string wxh = tensor->component(TensorComponentType::Dim1xDim2);
- /*
- int x_s;
- int y_s;
- x_s = (xi_0 + x_k);
- y_s = (yi_0 + y_k);
- mi_0 = x_s + y_s * width + b * widthxheight;
- mi_0 = select(-1, mi_0, x_s >= 0);
- mi_0 = select(-1, mi_0, y_s >= 0);
- mi_0 = select(-1, mi_0, x_s < 128);
- mi_0 = select(-1, mi_0, y_s < 128);
- */
- compound_statement_begin();
- declare_tile("_x_s", TileInfo(DataType::Int32));
- declare_tile("_y_s", TileInfo(DataType::Int32));
- auto x_s = operands.unpack(Operand("_x_s"));
- auto y_s = operands.unpack(Operand("_y_s"));
- for (int i = 0; i < dst->format().h; ++i)
- {
- // x_s = (xi_0 + x_k);
- // y_s = (yi_0 + y_k);
- _data->code += x_s->scalar(0, i).str;
- _data->code += " = (";
- _data->code += x->scalar(0, i).str;
- _data->code += " + ";
- _data->code += x_off->scalar(0, i).str;
- _data->code += ");\n";
- _data->code += y_s->scalar(0, i).str;
- _data->code += " = (";
- _data->code += y->scalar(0, i).str;
- _data->code += " + ";
- _data->code += y_off->scalar(0, i).str;
- _data->code += ");\n";
- // mi_0 = x_s + y_s * width;
- _data->code += dst->scalar(0, i).str;
- _data->code += " = ";
- _data->code += x_s->scalar(0, i).str;
- _data->code += " + ";
- _data->code += y_s->scalar(0, i).str;
- _data->code += " * " + width + ";\n";
- // mi_0 = select(wxh, mi_0, x_s >= 0);
- _data->code += dst->scalar(0, i).str;
- _data->code += " = select(-1, ";
- _data->code += dst->scalar(0, i).str;
- _data->code += ", ";
- _data->code += x_s->scalar(0, i).str;
- _data->code += " >= 0);\n";
- // mi_0 = select(wxh, mi_0, x_s < width);
- _data->code += dst->scalar(0, i).str;
- _data->code += " = select(-1, ";
- _data->code += dst->scalar(0, i).str;
- _data->code += ", ";
- _data->code += x_s->scalar(0, i).str;
- _data->code += " < ";
- _data->code += width + ");\n";
- // mi_0 = select(wxh, mi_0, y_s >= 0);
- _data->code += dst->scalar(0, i).str;
- _data->code += " = select(-1, ";
- _data->code += dst->scalar(0, i).str;
- _data->code += ", ";
- _data->code += y_s->scalar(0, i).str;
- _data->code += " >= 0);\n";
- // mi_0 = select(wxh, mi_0, y_s < height);
- _data->code += dst->scalar(0, i).str;
- _data->code += " = select(-1, ";
- _data->code += dst->scalar(0, i).str;
- _data->code += ", ";
- _data->code += y_s->scalar(0, i).str;
- _data->code += " < ";
- _data->code += height + ");\n";
- }
- compound_statement_end();
- }
-
-private:
- GpuKernelWriterDataHolder *_data{nullptr};
- GpuKernelWriterAttribute *_attr{nullptr};
-};
-
-/** IGpuKernelWriter factory class */
-class GpuKernelWriterFactory final
-{
-public:
- /** Static method to call the IGpuKernelWriter class accordingly with the Gpu programming language
- *
- * @param[in] gpu GPU target
- *
- * @return IGpuKernelWriter
- */
- static std::unique_ptr<IGpuKernelWriter> create(GpuKernelWriterAttribute *attr, GpuKernelWriterDataHolder *x)
- {
- switch (x->programming_language())
- {
- case GpuTargetLanguage::OpenCL:
- return std::make_unique<ClKernelWriter>(attr, x);
- default:
- std::cout << "Unsupported Gpu programming language" << std::endl;
- assert(false);
- return nullptr;
- }
- }
-};
-
-inline int32_t
-adjust_step(TensorSamplerFormat tensor_format, int32_t step, const TensorInfo *tensor_info_id, int32_t idx)
-{
- auto tensor = tensor_info_id->shape;
-
- int32_t dim[3] = {0};
-
- switch (tensor_format)
- {
- case TensorSamplerFormat::C_W_H:
- dim[0] = tensor[0];
- dim[1] = tensor[1];
- dim[2] = tensor[2];
- break;
- case TensorSamplerFormat::C_WH_1:
- dim[0] = tensor[0];
- dim[1] = tensor[1] * tensor[2];
- dim[2] = 1;
- break;
- default:
- std::cout << "Unsupported tensor format" << std::endl;
- assert(false);
- break;
- }
-
- return std::min(step, dim[idx]);
-}
-
-} // namespace prototype
-} // namespace ckw
-
-#endif // CKW_PROTOTYPE_SRC_PROTOTYPE_H
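
Note: the removed ClKernelWriter emitted OpenCL C by concatenating strings one tile row at a time; util_get_indirect_buffer() above, for example, produced a select()-masked linear index per row. A minimal standalone C++ sketch of that emission pattern follows (illustrative only, not part of the library; the helper name is made up):

#include <string>

// Builds the same masked-index snippet that util_get_indirect_buffer() wrote for one
// tile row: out-of-bounds x/y coordinates force the computed index to -1 via select().
std::string emit_masked_index(const std::string &dst, const std::string &x_s,
                              const std::string &y_s, const std::string &width,
                              const std::string &height)
{
    std::string code;
    code += dst + " = " + x_s + " + " + y_s + " * " + width + ";\n";
    code += dst + " = select(-1, " + dst + ", " + x_s + " >= 0);\n";
    code += dst + " = select(-1, " + dst + ", " + x_s + " < " + width + ");\n";
    code += dst + " = select(-1, " + dst + ", " + y_s + " >= 0);\n";
    code += dst + " = select(-1, " + dst + ", " + y_s + " < " + height + ");\n";
    return code;
}
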
diff --git a/compute_kernel_writer/prototype/src/TensorInfo.cpp b/compute_kernel_writer/prototype/src/TensorInfo.cpp
deleted file mode 100644
index 561c126469..0000000000
--- a/compute_kernel_writer/prototype/src/TensorInfo.cpp
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "ckw/TensorInfo.h"
-
-namespace ckw
-{
-TensorInfo::TensorInfo(DataType dt, const TensorShape &shape, TensorDataLayout dl, int32_t id)
- : _shape(shape), _dt(dt), _dl(dl), _id(id)
-{
-}
-
-TensorInfo &TensorInfo::shape(const TensorShape &shape)
-{
- _shape = shape;
- return *this;
-}
-
-TensorShape TensorInfo::shape() const
-{
- return _shape;
-}
-
-TensorInfo &TensorInfo::data_type(DataType dt)
-{
- _dt = dt;
- return *this;
-}
-
-DataType TensorInfo::data_type() const
-{
- return _dt;
-}
-
-TensorInfo &TensorInfo::data_layout(TensorDataLayout dl)
-{
- _dl = dl;
- return *this;
-}
-
-TensorDataLayout TensorInfo::data_layout() const
-{
- return _dl;
-}
-
-TensorInfo &TensorInfo::id(int32_t id)
-{
- _id = id;
- return *this;
-}
-
-int32_t TensorInfo::id() const
-{
- return _id;
-}
-} // namespace ckw
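
Note: the deleted prototype TensorInfo is a plain value type whose setters return *this, so updates chain. A minimal usage sketch limited to members shown in the deleted source (assumes the now-removed prototype header; illustrative only):

#include "ckw/TensorInfo.h"

// Chained setters: data_type() and id() both hand back the same object.
void retag_tensor(ckw::TensorInfo &info)
{
    info.data_type(ckw::DataType::Fp16).id(3);
}
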
diff --git a/compute_kernel_writer/prototype/src/TensorOperand.cpp b/compute_kernel_writer/prototype/src/TensorOperand.cpp
deleted file mode 100644
index d1aefbbb71..0000000000
--- a/compute_kernel_writer/prototype/src/TensorOperand.cpp
+++ /dev/null
@@ -1,272 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "ckw/TensorOperand.h"
-
-#include "ckw/Error.h"
-#include "ckw/Kernel.h"
-#include "ckw/TensorInfo.h"
-#include "ckw/TileOperand.h"
-
-#include "src/Prototype.h"
-
-namespace ckw
-{
-
-namespace
-{
-
-TensorComponentOperand &get_or_create_component(TensorOperand &tensor,
- std::unique_ptr<TensorComponentOperand> &ptr,
- TensorComponentType component)
-{
- if (ptr == nullptr)
- {
- ptr = std::make_unique<TensorComponentOperand>(tensor, component);
- }
-
- return *ptr;
-}
-
-} // namespace
-
-// =================================================================================================
-// TensorOperand
-// =================================================================================================
-
-TensorOperand::TensorOperand(const std::string &name, const TensorInfo &info, TensorStorageType storage_type)
- : OperandBase(name), _info(info), _storage_type(storage_type)
-{
-}
-
-prototype::Operand TensorOperand::create_impl_operand(prototype::IGpuKernelWriter *writer) const
-{
- CKW_UNUSED(writer);
- return {name()};
-}
-
-const TensorInfo &TensorOperand::info() const
-{
- return _info;
-}
-
-TensorInfo &TensorOperand::info()
-{
- return _info;
-}
-
-TensorStorageType TensorOperand::storage_type() const
-{
- return _storage_type;
-}
-
-DataType TensorOperand::data_type() const
-{
- return _info.data_type();
-}
-
-bool TensorOperand::is_constant() const
-{
- return false;
-}
-
-const TileOperand &TensorOperand::tile() const
-{
- return *_tile;
-}
-
-TileOperand &TensorOperand::tile()
-{
- return *_tile;
-}
-
-TensorOperand &TensorOperand::tile(TileOperand &tile)
-{
- _tile = &tile;
- return *this;
-}
-
-const TensorTileSampler &TensorOperand::tile_sampler() const
-{
- return _tile_sampler;
-}
-
-TensorTileSampler &TensorOperand::tile_sampler()
-{
- return _tile_sampler;
-}
-
-TensorOperand &TensorOperand::tile_sampler(const TensorTileSampler &value)
-{
- _tile_sampler = value;
- return *this;
-}
-
-TensorComponentOperand &TensorOperand::stride1()
-{
- return get_or_create_component(*this, _stride1, TensorComponentType::Stride1);
-}
-
-TensorComponentOperand &TensorOperand::stride2()
-{
- return get_or_create_component(*this, _stride2, TensorComponentType::Stride2);
-}
-
-TensorComponentOperand &TensorOperand::stride3()
-{
- return get_or_create_component(*this, _stride3, TensorComponentType::Stride3);
-}
-
-TensorComponentOperand &TensorOperand::stride4()
-{
- return get_or_create_component(*this, _stride4, TensorComponentType::Stride4);
-}
-
-TensorComponentOperand &TensorOperand::dim0()
-{
- return get_or_create_component(*this, _dim0, TensorComponentType::Dim0);
-}
-
-TensorComponentOperand &TensorOperand::dim1()
-{
- return get_or_create_component(*this, _dim1, TensorComponentType::Dim1);
-}
-
-TensorComponentOperand &TensorOperand::dim2()
-{
- return get_or_create_component(*this, _dim2, TensorComponentType::Dim2);
-}
-
-TensorComponentOperand &TensorOperand::dim3()
-{
- return get_or_create_component(*this, _dim3, TensorComponentType::Dim3);
-}
-
-TensorComponentOperand &TensorOperand::dim4()
-{
- return get_or_create_component(*this, _dim4, TensorComponentType::Dim4);
-}
-
-TensorComponentOperand &TensorOperand::dim1_dim2()
-{
- return get_or_create_component(*this, _dim1_dim2, TensorComponentType::Dim1xDim2);
-}
-
-TensorComponentOperand &TensorOperand::dim1_dim2_dim3()
-{
- return get_or_create_component(*this, _dim1_dim2_dim3, TensorComponentType::Dim1xDim2xDim3);
-}
-
-TensorComponentOperand &TensorOperand::offset_first_element_in_bytes()
-{
- return get_or_create_component(*this, _offset_first_element_in_bytes, TensorComponentType::OffsetFirstElement);
-}
-
-// =================================================================================================
-// TensorComponentOperand
-// =================================================================================================
-
-TensorComponentOperand::TensorComponentOperand(TensorOperand &tensor, TensorComponentType component)
- : TileOperand(tensor.name(), DataType::Int32), _tensor(tensor), _component(component)
-{
-}
-
-TensorOperand &TensorComponentOperand::tensor()
-{
- return _tensor;
-}
-
-const TensorOperand &TensorComponentOperand::tensor() const
-{
- return _tensor;
-}
-
-TensorComponentType TensorComponentOperand::component_type() const
-{
- return _component;
-}
-
-prototype::Operand TensorComponentOperand::create_impl_operand(prototype::IGpuKernelWriter *writer) const
-{
- CKW_UNUSED(writer);
- prototype::OperandType type{prototype::OperandType::Unknown};
-
- switch (_component)
- {
- case TensorComponentType::OffsetFirstElement:
- type = prototype::OperandType::TensorDataOffset;
- break;
-
- case TensorComponentType::Stride1:
- type = prototype::OperandType::TensorStride1;
- break;
-
- case TensorComponentType::Stride2:
- type = prototype::OperandType::TensorStride2;
- break;
-
- case TensorComponentType::Stride3:
- type = prototype::OperandType::TensorStride3;
- break;
-
- case TensorComponentType::Stride4:
- type = prototype::OperandType::TensorStride4;
- break;
-
- case TensorComponentType::Dim0:
- type = prototype::OperandType::TensorDim0;
- break;
-
- case TensorComponentType::Dim1:
- type = prototype::OperandType::TensorDim1;
- break;
-
- case TensorComponentType::Dim2:
- type = prototype::OperandType::TensorDim2;
- break;
-
- case TensorComponentType::Dim3:
- type = prototype::OperandType::TensorDim3;
- break;
-
- case TensorComponentType::Dim4:
- type = prototype::OperandType::TensorDim4;
- break;
-
- case TensorComponentType::Dim1xDim2:
- type = prototype::OperandType::TensorDim1xDim2;
- break;
-
- case TensorComponentType::Dim1xDim2xDim3:
- type = prototype::OperandType::TensorDim1xDim2xDim3;
- break;
-
- default:
- CKW_ASSERT(false);
- }
-
- return prototype::Operand(name(), type);
-}
-
-} // namespace ckw
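
Note: the component accessors deleted above (stride1() through offset_first_element_in_bytes()) lazily create and cache one Int32 TensorComponentOperand per descriptor field, so repeated calls return the same operand. A small usage sketch, assuming the removed prototype headers (illustrative only; Dim1 and Dim2 were used as tensor width and height by the prototype writer):

#include "ckw/TensorOperand.h"

void bind_size_components(ckw::TensorOperand &tensor)
{
    ckw::TensorComponentOperand &w   = tensor.dim1();      // tensor width component
    ckw::TensorComponentOperand &h   = tensor.dim2();      // tensor height component
    ckw::TensorComponentOperand &wxh = tensor.dim1_dim2(); // collapsed width * height
    (void)w;
    (void)h;
    (void)wxh;
}
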
diff --git a/compute_kernel_writer/prototype/src/TensorTileSampler.cpp b/compute_kernel_writer/prototype/src/TensorTileSampler.cpp
deleted file mode 100644
index bf9f946ce8..0000000000
--- a/compute_kernel_writer/prototype/src/TensorTileSampler.cpp
+++ /dev/null
@@ -1,191 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "ckw/TensorTileSampler.h"
-
-#include "ckw/TileOperand.h"
-#include "ckw/types/TensorSamplerTypes.h"
-
-namespace ckw
-{
-
-TensorTileSampler::TensorTileSampler()
-{
-}
-
-TensorTileSampler::TensorTileSampler(TileOperand &x,
- TileOperand &y,
- TileOperand &z,
- TileOperand &b,
- TensorSamplerFormat format,
- TensorSamplerAddressModeX address_mode_x,
- TensorSamplerAddressModeY address_mode_y,
- TensorSamplerAddressModeZ address_mode_z)
- : _x(&x),
- _y(&y),
- _z(&z),
- _b(&b),
- _height(0),
- _width(0),
- _format(format),
- _address_mode_x(address_mode_x),
- _address_mode_y(address_mode_y),
- _address_mode_z(address_mode_z)
-{
-}
-
-TensorTileSampler::TensorTileSampler(TileOperand &x,
- TileOperand &y,
- TileOperand &z,
- TileOperand &b,
- int32_t height,
- int32_t width,
- TensorSamplerFormat format,
- TensorSamplerAddressModeX address_mode_x,
- TensorSamplerAddressModeY address_mode_y,
- TensorSamplerAddressModeZ address_mode_z)
- : _x(&x),
- _y(&y),
- _z(&z),
- _b(&b),
- _height(height),
- _width(width),
- _format(format),
- _address_mode_x(address_mode_x),
- _address_mode_y(address_mode_y),
- _address_mode_z(address_mode_z)
-{
-}
-
-const TileOperand &TensorTileSampler::x() const
-{
- return *_x;
-}
-
-TensorTileSampler &TensorTileSampler::x(TileOperand &x)
-{
- _x = &x;
- return *this;
-}
-
-const TileOperand &TensorTileSampler::y() const
-{
- return *_y;
-}
-
-TensorTileSampler &TensorTileSampler::y(TileOperand &y)
-{
- _y = &y;
- return *this;
-}
-
-const TileOperand &TensorTileSampler::z() const
-{
- return *_z;
-}
-
-TensorTileSampler &TensorTileSampler::z(TileOperand &z)
-{
- _z = &z;
- return *this;
-}
-
-const TileOperand &TensorTileSampler::b() const
-{
- return *_b;
-}
-
-TensorTileSampler &TensorTileSampler::b(TileOperand &b)
-{
- _b = &b;
- return *this;
-}
-
-int32_t TensorTileSampler::width() const
-{
- return _width;
-}
-
-TensorTileSampler &TensorTileSampler::width(int32_t width)
-{
- _width = width;
- return *this;
-}
-
-int32_t TensorTileSampler::height() const
-{
- return _height;
-}
-
-TensorTileSampler &TensorTileSampler::height(int32_t height)
-{
- _height = height;
- return *this;
-}
-
-TensorSamplerFormat TensorTileSampler::format() const
-{
- return _format;
-}
-
-TensorTileSampler &TensorTileSampler::format(TensorSamplerFormat format)
-{
- _format = format;
- return *this;
-}
-
-TensorSamplerAddressModeX TensorTileSampler::address_mode_x() const
-{
- return _address_mode_x;
-}
-
-TensorTileSampler &TensorTileSampler::address_mode_x(TensorSamplerAddressModeX address_mode_x)
-{
- _address_mode_x = address_mode_x;
- return *this;
-}
-
-TensorSamplerAddressModeY TensorTileSampler::address_mode_y() const
-{
- return _address_mode_y;
-}
-
-TensorTileSampler &TensorTileSampler::address_mode_y(TensorSamplerAddressModeY address_mode_y)
-{
- _address_mode_y = address_mode_y;
- return *this;
-}
-
-TensorSamplerAddressModeZ TensorTileSampler::address_mode_z() const
-{
- return _address_mode_z;
-}
-
-TensorTileSampler &TensorTileSampler::address_mode_z(TensorSamplerAddressModeZ address_mode_z)
-{
- _address_mode_z = address_mode_z;
- return *this;
-}
-
-} // namespace ckw
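
Note: TensorTileSampler, removed above, is a small aggregate of tile coordinate operands plus addressing metadata, with chainable setters. A usage sketch restricted to the setters shown in the deleted source (illustrative only; assumes the removed prototype headers):

#include "ckw/TensorTileSampler.h"
#include "ckw/TileOperand.h"

// Fill a default-constructed sampler with coordinate tiles and a 4x4 tile extent.
void configure_sampler(ckw::TensorTileSampler &sampler,
                       ckw::TileOperand &x,
                       ckw::TileOperand &y,
                       ckw::TileOperand &z,
                       ckw::TileOperand &b)
{
    sampler.x(x).y(y).z(z).b(b).width(4).height(4);
}
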
diff --git a/compute_kernel_writer/prototype/src/TileInfo.cpp b/compute_kernel_writer/prototype/src/TileInfo.cpp
deleted file mode 100644
index 273266eedc..0000000000
--- a/compute_kernel_writer/prototype/src/TileInfo.cpp
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "ckw/TileInfo.h"
-
-namespace ckw
-{
-TileInfo::TileInfo(DataType dt) : _dt(dt), _shape({{1, 1}})
-{
-}
-
-TileInfo::TileInfo(DataType dt, int32_t w) : _dt(dt), _shape({{w, 1}})
-{
-}
-
-TileInfo::TileInfo(DataType dt, int32_t h, int32_t w) : _dt(dt), _shape({{w, h}})
-{
-}
-
-TileInfo &TileInfo::width(int32_t w)
-{
- _shape[kTileWidthIdx] = w;
- return *this;
-}
-
-int32_t TileInfo::width() const
-{
- return _shape[kTileWidthIdx];
-}
-
-TileInfo &TileInfo::height(int32_t h)
-{
- _shape[kTileHeightIdx] = h;
- return *this;
-}
-
-int32_t TileInfo::height() const
-{
- return _shape[kTileHeightIdx];
-}
-
-TileInfo &TileInfo::data_type(DataType dt)
-{
- _dt = dt;
- return *this;
-}
-
-DataType TileInfo::data_type() const
-{
- return _dt;
-}
-} // namespace ckw
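
Note: one detail worth keeping in mind from the deleted TileInfo is that the two-extent constructor takes (data type, height, width) while storing the shape as {width, height}. A short sketch (illustrative only; assumes the removed prototype header):

#include "ckw/TileInfo.h"

void tile_info_example()
{
    ckw::TileInfo scalar_info(ckw::DataType::Fp32);       // 1x1 tile
    ckw::TileInfo row_info(ckw::DataType::Fp16, 4);       // width 4, height 1
    ckw::TileInfo block_info(ckw::DataType::Int32, 8, 4); // height 8, width 4
    // block_info.height() == 8 && block_info.width() == 4
    (void)scalar_info;
    (void)row_info;
    (void)block_info;
}
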
diff --git a/compute_kernel_writer/prototype/src/TileOperand.cpp b/compute_kernel_writer/prototype/src/TileOperand.cpp
deleted file mode 100644
index e09c833d96..0000000000
--- a/compute_kernel_writer/prototype/src/TileOperand.cpp
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "ckw/TileOperand.h"
-
-#include "ckw/Error.h"
-
-#include "src/Prototype.h"
-
-namespace ckw
-{
-
-TileOperand::TileOperand(const std::string &name, const TileInfo &info)
- : OperandBase(name), _info(info), _value{std::vector<std::string>{"0"}}, _constant(false)
-{
-}
-
-TileOperand::TileOperand(const std::string &name, DataType data_type)
- : OperandBase(name), _info(TileInfo{data_type}), _value{std::vector<std::string>{"0"}}, _constant(false)
-{
-}
-
-TileOperand::TileOperand(const std::string &name, int32_t value)
- : OperandBase(name),
- _info(TileInfo{DataType::Int32}),
- _value{std::vector<std::string>{std::to_string(value)}},
- _constant(true)
-{
-}
-
-TileOperand::TileOperand(const std::string &name, float value)
- : OperandBase(name),
- _info(TileInfo{DataType::Fp32}),
- _value{std::vector<std::string>{std::to_string(value)}},
- _constant(true)
-{
-}
-
-TileOperand::TileOperand(const std::string &name, const TileContainer &vals, DataType dt)
- : OperandBase(name),
- _info(TileInfo{dt, static_cast<int32_t>(vals.size()), static_cast<int32_t>(vals[0].size())}),
- _value(vals),
- _constant(true)
-{
-}
-
-prototype::Operand TileOperand::create_impl_operand(prototype::IGpuKernelWriter *writer) const
-{
- CKW_UNUSED(writer);
-
- if (_constant)
- {
- if (is_scalar())
- {
- switch (_info.data_type())
- {
- case DataType::Int32:
- return prototype::Operand(_value[0][0], prototype::OperandType::ScalarInt32);
-
- case DataType::Fp32:
- return prototype::Operand(_value[0][0], prototype::OperandType::ScalarFp32);
-
- case DataType::Fp16:
- return prototype::Operand(_value[0][0], prototype::OperandType::ScalarFp16);
-
- default:
- CKW_ASSERT(false);
- }
- }
- else
- {
- return prototype::Operand(name());
- }
- }
- else
- {
- return prototype::Operand(name(), prototype::OperandType::Tile);
- }
-}
-
-const TileInfo &TileOperand::tile_info() const
-{
- return _info;
-}
-
-DataType TileOperand::data_type() const
-{
- return _info.data_type();
-}
-
-bool TileOperand::is_constant() const
-{
- return _constant;
-}
-
-bool TileOperand::is_scalar() const
-{
- return _info.width() == 1 && _info.height() == 1;
-}
-
-std::string TileOperand::scalar_value() const
-{
- CKW_ASSERT(is_scalar());
- CKW_ASSERT(is_constant());
-
- return _value[0][0];
-}
-
-const TileContainer &TileOperand::value() const
-{
- return _value;
-}
-
-} // namespace ckw
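
Note: the deleted TileOperand keeps constant scalars as text: the int32_t and float constructors mark the operand constant and store std::to_string(value) in a 1x1 tile. A usage sketch (illustrative only; assumes the removed prototype header):

#include "ckw/TileOperand.h"

void tile_operand_example()
{
    ckw::TileOperand beta("beta", 0.5f); // constant Fp32 scalar, value kept as "0.500000"
    ckw::TileOperand k("k", 4);          // constant Int32 scalar
    // beta.is_constant() && beta.is_scalar() && beta.scalar_value() == "0.500000"
    (void)beta;
    (void)k;
}
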
diff --git a/filelist.json b/filelist.json
index 2f33b5cd5e..dcf3204ecd 100644
--- a/filelist.json
+++ b/filelist.json
@@ -2324,7 +2324,6 @@
"src/dynamic_fusion/sketch/attributes/ResizeAttributes.cpp",
"src/dynamic_fusion/sketch/attributes/SoftmaxAttributes.cpp",
"src/dynamic_fusion/sketch/attributes/ReshapeAttributes.cpp",
- "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.cpp",
"src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp",
"src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp",
"src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp",
@@ -2339,8 +2338,6 @@
"src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.cpp",
"src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp",
"src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp",
- "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.cpp",
- "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.cpp",
"src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.cpp",
"src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.cpp",
"src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp",
@@ -2361,21 +2358,6 @@
"src/dynamic_fusion/sketch/gpu/operators/GpuTanh.cpp",
"src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.cpp"
],
- "template_writer": [
- "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp",
- "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp",
- "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp",
- "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp",
- "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp",
- "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp",
- "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.cpp",
- "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.cpp",
- "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.cpp",
- "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp",
- "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp",
- "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp",
- "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp"
- ],
"ckw_driver": [
"src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp",
"src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp",
diff --git a/scripts/clang_tidy_rules.py b/scripts/clang_tidy_rules.py
index 1e1ab7f545..f244017dbd 100755
--- a/scripts/clang_tidy_rules.py
+++ b/scripts/clang_tidy_rules.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
-# Copyright (c) 2017-2023 Arm Limited.
+# Copyright (c) 2017-2024 Arm Limited.
#
# SPDX-License-Identifier: MIT
#
@@ -28,7 +28,7 @@ import re
import sys
def get_list_includes():
- return "compute_kernel_writer/prototype/include " \
+ return "compute_kernel_writer/include " \
"src/cpu/kernels/assembly " \
"src/core/NEON/kernels/assembly " \
"src/core/NEON/kernels/convolution/winograd " \
@@ -43,8 +43,6 @@ def get_list_flags( filename, arch):
flags.append("-DARM_COMPUTE_OPENCL_ENABLED")
if arch == "aarch64":
flags.append("-DARM_COMPUTE_AARCH64_V8_2")
- if "ckw_driver" in filename:
- flags.append("-DACL_INTERNAL_TEST_CKW_IN_DF")
return flags
diff --git a/scripts/generate_android_bp.py b/scripts/generate_android_bp.py
index f7ecbc468b..6efd072acd 100755
--- a/scripts/generate_android_bp.py
+++ b/scripts/generate_android_bp.py
@@ -108,6 +108,7 @@ cc_library_static {
proprietary: true,
local_include_dirs: ["build/android-arm64v8a/src/core",
"build/android-arm64v8a/src/core/CL",
+ "compute_kernel_writer/include",
"src/core/common",
"src/core/helpers",
"src/core/NEON/kernels/arm_gemm",
diff --git a/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp
index 9ca20fa152..eab5cddd07 100644
--- a/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp
+++ b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,12 +26,11 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "src/core/CL/CLUtils.h"
-#ifdef ACL_INTERNAL_TEST_CKW_IN_DF
#include "src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h"
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
#include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h"
#include "src/gpu/cl/ClKernelLibrary.h"
#include "support/Cast.h"
+
namespace arm_compute
{
namespace experimental
@@ -61,128 +60,6 @@ void ClKernelRuntime::configure(const ClCompileContext &compile_ctx, const GpuKe
_arguments = code.arguments();
}
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-
-inline void ClKernelRuntime::add_tensor_argument(unsigned int &idx,
- const GpuKernelArgumentInfo &arg,
- const ICLTensor *tensor,
- const Window &arg_slice,
- std::vector<cl::Image2D> &cl_images)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
-
- switch (arg.type)
- {
- case GpuKernelArgumentInfo::Type::Scalar:
- {
- ARM_COMPUTE_ERROR("Unsupported yet");
- break;
- }
-
- case GpuKernelArgumentInfo::Type::Vector:
- {
- add_1D_tensor_argument(idx, tensor, arg_slice);
- break;
- }
-
- case GpuKernelArgumentInfo::Type::Image:
- {
- add_2D_tensor_argument(idx, tensor, arg_slice);
- break;
- }
- case GpuKernelArgumentInfo::Type::Image_Reinterpret_As_3D:
- {
- add_2D_tensor_argument(idx, tensor, arg_slice);
- const unsigned int total_cross_plane_pad = tensor->info()->padding().top + tensor->info()->padding().bottom;
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(total_cross_plane_pad));
- break;
- }
- case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D:
- {
- const TensorShape shape2d(tensor->info()->dimension(0) / 4, tensor->info()->dimension(1) *
- tensor->info()->dimension(2) *
- tensor->info()->dimension(3));
- const size_t image_row_pitch = tensor->info()->strides_in_bytes()[1];
- cl::Image2D tensor_image2d =
- create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), shape2d,
- tensor->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
- cl_images.push_back(tensor_image2d);
- _kernel.setArg(idx++, tensor_image2d);
- break;
- }
-
- case GpuKernelArgumentInfo::Type::Image_3D:
- {
- add_2D_tensor_argument(idx, tensor, arg_slice);
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(tensor->info()->strides_in_bytes()[2]));
- break;
- }
- case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D:
- {
- const TensorShape shape2d(tensor->info()->dimension(0) / 4, tensor->info()->dimension(1) *
- tensor->info()->dimension(2) *
- tensor->info()->dimension(3));
- const size_t image_row_pitch = tensor->info()->strides_in_bytes()[1];
- cl::Image2D tensor_image2d =
- create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), shape2d,
- tensor->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
- cl_images.push_back(tensor_image2d);
- _kernel.setArg(idx++, tensor_image2d);
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(tensor->info()->strides_in_bytes()[2]));
- break;
- }
-
- case GpuKernelArgumentInfo::Type::Tensor_3D:
- {
- add_3D_tensor_argument(idx, tensor, arg_slice);
- break;
- }
-
- case GpuKernelArgumentInfo::Type::Tensor_4D:
- {
- add_4D_tensor_argument(idx, tensor, arg_slice);
- break;
- }
- case GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer:
- {
- add_4d_tensor_nhwc_argument(idx, tensor);
- break;
- }
- case GpuKernelArgumentInfo::Type::Tensor_4D_t_Image:
- {
- const size_t image_w = tensor->info()->dimension(0) / 4;
- const size_t image_h = tensor->info()->tensor_shape().total_size_upper(1);
- const size_t image_stride_y = tensor->info()->strides_in_bytes()[1];
-
- cl::Image2D tensor_image2d = create_image2d_from_buffer(
- CLKernelLibrary::get().context(), tensor->cl_buffer(), TensorShape(image_w, image_h),
- tensor->info()->data_type(), image_stride_y, CLImage2DType::ReadOnly);
- cl_images.push_back(tensor_image2d);
-
- _kernel.setArg(idx++, tensor_image2d);
- add_4d_tensor_nhwc_argument(idx, tensor);
- break;
- }
- case GpuKernelArgumentInfo::Type::Tensor_Special_0:
- {
- const ITensorInfo *info = tensor->info();
- const Strides &strides = info->strides_in_bytes();
-
- _kernel.setArg(idx++, tensor->cl_buffer());
- const size_t dim1xdim2 = info->tensor_shape()[1] * info->tensor_shape()[2];
- _kernel.setArg<cl_int>(idx++, static_cast<int32_t>(dim1xdim2));
- const size_t stride1 = strides[1];
- _kernel.setArg<cl_int>(idx++, static_cast<int32_t>(stride1));
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Unsupported");
- }
- }
-}
-
-#else // ACL_INTERNAL_TEST_CKW_IN_DF
inline void ClKernelRuntime::add_kernel_argument(unsigned int &idx,
const GpuKernelArgumentBinding &arg,
const ICLTensor *tensor,
@@ -234,7 +111,6 @@ inline void ClKernelRuntime::add_kernel_argument(unsigned int
}
}
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
void ClKernelRuntime::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
@@ -253,17 +129,7 @@ void ClKernelRuntime::run_op(ITensorPack &tensors, const Window &window, cl::Com
// Set kernel arguments
// CLImages created from tensor arguments. Need to be retained until enqueue
std::vector<cl::Image2D> cl_images;
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
- for (auto id_arg : _arguments)
- {
- const auto arg = id_arg.second;
- auto tensor = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(id_arg.first));
- ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
- ARM_COMPUTE_ERROR_ON_NULLPTR(tensor->info());
- add_tensor_argument(idx, *arg.kernel_argument_info(), tensor, slice, cl_images);
- }
-#else // ACL_INTERNAL_TEST_CKW_IN_DF
for (const auto &arg : _arguments)
{
auto tensor = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(arg.id()));
@@ -271,7 +137,6 @@ void ClKernelRuntime::run_op(ITensorPack &tensors, const Window &window, cl::Com
ARM_COMPUTE_ERROR_ON_NULLPTR(tensor->info());
add_kernel_argument(idx, arg, tensor, cl_images);
}
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
// Dispatch kernel
enqueue(queue, *this, slice, lws_hint(), use_dummy_work_items);
diff --git a/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h
index e78567eb9d..148e4db581 100644
--- a/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h
+++ b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef SRC_DYNAMIC_FUSION_RUNTIME_GPU_CL_CLKERNELRUNTIME
-#define SRC_DYNAMIC_FUSION_RUNTIME_GPU_CL_CLKERNELRUNTIME
+#ifndef ACL_SRC_DYNAMIC_FUSION_RUNTIME_GPU_CL_CLKERNELRUNTIME_H
+#define ACL_SRC_DYNAMIC_FUSION_RUNTIME_GPU_CL_CLKERNELRUNTIME_H
#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
#include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h"
@@ -59,21 +59,6 @@ public:
virtual void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
private:
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
- /** Set a kernel tensor argument
- *
- * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
- * @param[in] arg Kernel argument descriptor accompanying @p tensor
- * @param[in] tensor Tensor to set as an argument of the object's kernel
- * @param[in] arg_slice Window the kernel will be run on
- * @param[out] cl_images Extra cl images created from the tensor (will need to be retained until the kernel is enqueued)
- */
- inline void add_tensor_argument(unsigned int &idx,
- const GpuKernelArgumentInfo &arg,
- const ICLTensor *tensor,
- const Window &arg_slice,
- std::vector<cl::Image2D> &cl_images);
-#else // ACL_INTERNAL_TEST_CKW_IN_DF
/** Set a kernel argument as part of a tensor
*
* @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
@@ -85,7 +70,6 @@ private:
const GpuKernelArgumentBinding &arg,
const ICLTensor *tensor,
std::vector<cl::Image2D> &cl_images);
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
private:
GpuKernelArgumentList _arguments{};
@@ -94,4 +78,4 @@ private:
} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
-#endif /* SRC_DYNAMIC_FUSION_RUNTIME_GPU_CL_CLKERNELRUNTIME */
+#endif // ACL_SRC_DYNAMIC_FUSION_RUNTIME_GPU_CL_CLKERNELRUNTIME_H
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.cpp
deleted file mode 100644
index 9cecfc2ffd..0000000000
--- a/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.cpp
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-bool operator==(const GpuKernelArgumentInfo &info0, const GpuKernelArgumentInfo &info1)
-{
- return info0.type == info1.type;
-}
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h b/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h
index 03817173f4..c923bf9c16 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELARGUMENT
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELARGUMENT
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELARGUMENT_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELARGUMENT_H
#include "arm_compute/core/TensorInfo.h"
@@ -32,96 +32,6 @@ namespace experimental
{
namespace dynamic_fusion
{
-/** Contain information required to set up a kernel argument at run time
- * @deprecated To be removed along with ClTemplateWriter
- */
-struct GpuKernelArgumentInfo
-{
- /** Enumerate all the tensor arguments variants used by all kernel implementations. */
- enum class Type : int
- {
- Scalar,
-
- Vector,
-
- Image,
- Image_Reinterpret_As_3D,
- Image_Export_To_ClImage2D,
-
- Image_3D, // 3D Tensor represented as a 2D Image + stride_z
- Image_3D_Export_To_ClImage2D,
-
- Tensor_3D,
- Tensor_4D,
- Tensor_4D_t_Buffer,
- Tensor_4D_t_Image,
-
- Tensor_Special_0,
- };
- /** Default constructor */
- GpuKernelArgumentInfo() = default;
- /** Constructor */
- GpuKernelArgumentInfo(Type type) : type{type}
- {
- }
- Type type{Type::Tensor_4D_t_Buffer};
-};
-bool operator==(const GpuKernelArgumentInfo &info0, const GpuKernelArgumentInfo &info1);
-/** Kernel argument information linked with its corresponding @ref ITensorInfo
- * @deprecated To be removed along with ClTemplateWriter
- */
-class GpuKernelArgument
-{
-public:
- /** Constructor
- *
- * @param[in] tensor_info Associated @ref ITensorInfo
- * @param[in] kernel_arg_info Associated @ref GpuKernelArgumentInfo
- */
- GpuKernelArgument(const ITensorInfo &tensor_info, const GpuKernelArgumentInfo &kernel_arg_info)
- : _tensor_info{tensor_info}, _kernel_arg_info{kernel_arg_info}
- {
- }
- /** Get workload tensor id */
- ITensorInfo::Id id() const
- {
- return _tensor_info.id();
- }
- /** Get associated @ref ITensorInfo */
- ITensorInfo *tensor_info()
- {
- return &_tensor_info;
- }
- /** Get associated @ref ITensorInfo */
- const ITensorInfo *tensor_info() const
- {
- return &_tensor_info;
- }
- /** Get associated @ref GpuKernelArgumentInfo */
- GpuKernelArgumentInfo *kernel_argument_info()
- {
- return &_kernel_arg_info;
- }
- /** Get associated @ref GpuKernelArgumentInfo */
- const GpuKernelArgumentInfo *kernel_argument_info() const
- {
- return &_kernel_arg_info;
- }
- /** Check if the associated workload tensor has valid id
- *
- * @return true if has valid id
- * @return false otherwise
- */
- bool has_valid_id() const
- {
- return _tensor_info.has_valid_id();
- }
-
-private:
- TensorInfo _tensor_info{};
- GpuKernelArgumentInfo _kernel_arg_info{};
-};
-#ifdef ACL_INTERNAL_TEST_CKW_IN_DF
/** Describe how the tensor runtime memory can be accessed
*
* Please see documentation under @ref GpuKernelArgumentBinding
@@ -243,9 +153,8 @@ private:
};
Value _value;
};
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELARGUMENT */
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELARGUMENT_H
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h b/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h
index 24812cd8a7..11d916eec9 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,19 +21,15 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELSOURCECODE
-#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELSOURCECODE
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELSOURCECODE_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELSOURCECODE_H
#include "arm_compute/core/CL/CLCompileContext.h"
#include "arm_compute/core/Window.h"
#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-#include <map>
-#else // ACL_INTERNAL_TEST_CKW_IN_DF
#include <deque>
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
#include <string>
namespace arm_compute
@@ -43,11 +39,7 @@ namespace experimental
namespace dynamic_fusion
{
/** The argument list of a @ref GpuKernelSourceCode */
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-using GpuKernelArgumentList = std::map<ITensorInfo::Id, GpuKernelArgument>;
-#else // ACL_INTERNAL_TEST_CKW_IN_DF
using GpuKernelArgumentList = std::deque<GpuKernelArgumentBinding>;
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
/** Container of kernel code to be compiled and run in a @ref GpuUnitWorkload
*/
@@ -132,4 +124,4 @@ private:
} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
-#endif /* ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELSOURCECODE */
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELSOURCECODE_H
diff --git a/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp b/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp
index 502ceab807..725a46e91c 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp
+++ b/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,14 +26,10 @@
#include "arm_compute/core/experimental/Types.h"
#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h"
#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h"
#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
#include "src/dynamic_fusion/sketch/gpu/GpuComponentServices.h"
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.h"
-#else // ACL_INTERNAL_TEST_CKW_IN_DF
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h"
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
namespace arm_compute
{
@@ -41,8 +37,8 @@ namespace experimental
{
namespace dynamic_fusion
{
-GpuLogicalKernel::GpuLogicalKernel(GpuComponentServices *services, const GpuKernelComponentGroup &components)
- : _comp_group{components}, _store_components{}
+GpuLogicalKernel::GpuLogicalKernel(GpuComponentServices *services, GpuKernelComponentGroup components) // NOLINT
+ : _comp_group{std::move(components)}, _store_components{}
{
ARM_COMPUTE_UNUSED(services);
}
@@ -50,19 +46,11 @@ GpuLogicalKernel::GpuLogicalKernel(GpuComponentServices *services, const GpuKern
GpuKernelSourceCode GpuLogicalKernel::write_kernel_code()
{
GpuKernelSourceCode code;
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
- ClTemplateWriter writer{_comp_group};
-#else // ACL_INTERNAL_TEST_CKW_IN_DF
- GpuCkwDriver writer{_comp_group};
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
+ GpuCkwDriver writer{_comp_group};
code.name(writer.get_name());
code.code(writer.get_code());
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
- code.arguments(writer.get_tensors());
-#else // ACL_INTERNAL_TEST_CKW_IN_DF
code.arguments(writer.get_kernel_arguments());
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
code.build_options(writer.get_build_options());
code.config_id(writer.get_config_id());
code.window(writer.get_window());
diff --git a/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.h b/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.h
index 1fd40f0acd..e2bc83b286 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.h
+++ b/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022, 2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPULOGICALKERNEL
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPULOGICALKERNEL
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPULOGICALKERNEL_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPULOGICALKERNEL_H
#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
#include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h"
@@ -52,7 +52,7 @@ public:
* @param[in] services @ref GpuComponentServices to be used
* @param[in] components Component group from which this logical kernel is initialized
*/
- explicit GpuLogicalKernel(GpuComponentServices *services, const GpuKernelComponentGroup &components);
+ explicit GpuLogicalKernel(GpuComponentServices *services, GpuKernelComponentGroup components); // NOLINT
/** Allow instances of this class to be copy constructed */
GpuLogicalKernel(const GpuLogicalKernel &) = default;
/** Allow instances of this class to be copied */
@@ -71,4 +71,4 @@ private:
} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPULOGICALKERNEL */
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPULOGICALKERNEL_H
diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h
index 43bcc47fa0..5d75bcaaa0 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h
+++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSOURCECODE
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSOURCECODE
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSOURCECODE_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSOURCECODE_H
#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h"
@@ -36,7 +36,6 @@ namespace experimental
{
namespace dynamic_fusion
{
-#ifdef ACL_INTERNAL_TEST_CKW_IN_DF
namespace
{
/** Extract kernel arguments of one tensor from a flat list of kernel arguments.
@@ -70,7 +69,6 @@ GpuKernelArgumentList extract_kernel_args_for_one_tensor(GpuKernelArgumentList &
return tensor_kargs;
}
} // namespace
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
/** Uniquely identifies a @ref GpuUnitWorkload within a @ref GpuWorkloadSourceCode */
using UnitWorkloadId = int32_t;
@@ -83,25 +81,11 @@ class GpuWorkloadArgument
public:
/** Default constructor */
GpuWorkloadArgument() = default;
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
/** Constructor
*
- * @param[in] tensor_info @ref ITensorInfo of the workload argument
- * @param[in] mem_desc @ref MemoryDescriptor of the workload argument
- * @param[in] kernel_arg_info @ref GpuKernelArgumentInfo of the workload argument
- */
- GpuWorkloadArgument(const ITensorInfo &tensor_info,
- const MemoryDescriptor &mem_desc,
- const GpuKernelArgumentInfo &kernel_arg_info)
- : _tensor_info{tensor_info}, _mem_desc{mem_desc}, _kernel_arg_info{kernel_arg_info}
- {
- }
-#else // ACL_INTERNAL_TEST_CKW_IN_DF
- /** Constructor
- *
- * @param[in] tensor_info @ref ITensorInfo of the workload argument
- * @param[in] mem_desc @ref MemoryDescriptor of the workload argument
- * @param[in] kernel_arg_list @ref GpuKernelArgumentList of the workload argument
+ * @param[in] tensor_info @ref ITensorInfo of the workload argument
+ * @param[in] mem_desc @ref MemoryDescriptor of the workload argument
+ * @param[in] kernel_args @ref GpuKernelArgumentList of the workload argument
*/
GpuWorkloadArgument(const ITensorInfo &tensor_info,
const MemoryDescriptor &mem_desc,
@@ -109,7 +93,6 @@ public:
: _tensor_info{tensor_info}, _mem_desc{mem_desc}, _kernel_args{kernel_args}
{
}
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
/** Get tensor id within workload */
ITensorInfo::Id id() const
{
@@ -135,18 +118,6 @@ public:
{
return &_mem_desc;
}
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
- /** Get @ref GpuKernelArgumentInfo of the argument */
- GpuKernelArgumentInfo *kernel_argument_info()
- {
- return &_kernel_arg_info;
- }
- /** Get @ref GpuKernelArgumentInfo of the argument */
- const GpuKernelArgumentInfo *kernel_argument_info() const
- {
- return &_kernel_arg_info;
- }
-#else // ACL_INTERNAL_TEST_CKW_IN_DF
/** Get @ref GpuKernelArgumentList of the workload tensor */
GpuKernelArgumentList *kernel_argument_list()
{
@@ -157,7 +128,6 @@ public:
{
return &_kernel_args;
}
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
/** Check if the workload argument has valid id
*
* @return true If has valid id
@@ -169,13 +139,9 @@ public:
}
private:
- TensorInfo _tensor_info{};
- MemoryDescriptor _mem_desc{};
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
- GpuKernelArgumentInfo _kernel_arg_info{};
-#else // ACL_INTERNAL_TEST_CKW_IN_DF
+ TensorInfo _tensor_info{};
+ MemoryDescriptor _mem_desc{};
GpuKernelArgumentList _kernel_args{};
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
};
/** Describes when a unit workload is run.
@@ -259,22 +225,7 @@ public:
const auto uwk_id = static_cast<UnitWorkloadId>(_unit_workloads.size());
const auto unit_work = GpuUnitWorkload(uwk_id, kernel_code, stage);
_unit_workloads.push_back(unit_work);
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
- ARM_COMPUTE_UNUSED(context);
- // Assemble kernel argument with memory descriptor to form workload argument
- for (const auto &id_arg : kernel_code.arguments())
- {
- const auto arg_id = id_arg.first;
- const auto arg = id_arg.second;
- _workload_arguments[arg_id] =
- GpuWorkloadArgument{*arg.tensor_info(), mem_map.at(arg_id), *arg.kernel_argument_info()};
- if (_tensor_uwork_map.find(arg_id) == _tensor_uwork_map.end())
- {
- _tensor_uwork_map[arg_id] = std::set<UnitWorkloadId>();
- }
- _tensor_uwork_map[arg_id].insert(uwk_id);
- }
-#else // ACL_INTERNAL_TEST_CKW_IN_DF
+
GpuKernelArgumentList flat_kernel_args = kernel_code.arguments();
GpuKernelArgumentList tensor_kargs{};
while (true)
@@ -296,7 +247,7 @@ public:
_tensor_uwork_map[tensor_id].insert(uwk_id);
}
}
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
+
return uwk_id;
}
/** Get a unit workload from its id */
@@ -346,4 +297,4 @@ private:
} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSOURCECODE */
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSOURCECODE_H
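For reference, the grouping pattern that the hunks above make unconditional — add_unit_workload repeatedly peeling all kernel arguments belonging to one tensor off a flat GpuKernelArgumentList — can be sketched in isolation. The sketch below is illustrative only: FlatArg, extract_args_for_one_tensor and the plain std::vector/std::map bookkeeping are simplified stand-ins, not the library's types or its real helper.

// Minimal, self-contained sketch of the per-tensor grouping pattern (assumed types).
#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <utility>
#include <vector>

struct FlatArg
{
    int32_t     tensor_id; // which tensor this kernel argument describes
    std::string payload;   // e.g. "ptr", "stride", "offset"
};

// Remove and return every argument that belongs to the same tensor as the
// front element, leaving the remaining flat list in its original order.
std::vector<FlatArg> extract_args_for_one_tensor(std::vector<FlatArg> &flat_args)
{
    std::vector<FlatArg> tensor_args;
    if (flat_args.empty())
    {
        return tensor_args;
    }
    const int32_t id = flat_args.front().tensor_id;

    std::vector<FlatArg> remaining;
    for (auto &arg : flat_args)
    {
        if (arg.tensor_id == id)
        {
            tensor_args.push_back(std::move(arg));
        }
        else
        {
            remaining.push_back(std::move(arg));
        }
    }
    flat_args = std::move(remaining);
    return tensor_args;
}

int main()
{
    std::vector<FlatArg> flat{{0, "ptr"}, {0, "stride"}, {1, "ptr"}, {1, "stride"}, {1, "offset"}};

    // Drain the flat list, grouping arguments per tensor id.
    std::map<int32_t, std::vector<FlatArg>> per_tensor;
    while (!flat.empty())
    {
        auto          group = extract_args_for_one_tensor(flat);
        const int32_t id    = group.front().tensor_id;
        per_tensor[id]      = std::move(group);
    }

    for (const auto &kv : per_tensor)
    {
        std::cout << "tensor " << kv.first << ": " << kv.second.size() << " kernel arguments\n";
    }
    return 0;
}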
diff --git a/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h b/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h
index ad474674f9..84972501de 100644
--- a/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h
+++ b/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_IGPUKERNELWRITER
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_IGPUKERNELWRITER
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_IGPUKERNELWRITER_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_IGPUKERNELWRITER_H
#include "arm_compute/core/CL/CLCompileContext.h"
#include "arm_compute/core/Window.h"
@@ -62,23 +62,14 @@ public:
virtual std::string get_config_id() = 0;
/** Generate execution window */
virtual Window get_window() const = 0;
- /** Get the kernel argument lists of the kernel
- * @deprecated To be removed along with ClTemplateWriter
- */
- virtual std::map<ITensorInfo::Id, GpuKernelArgument> get_tensors()
- {
- return {};
- }
-#ifdef ACL_INTERNAL_TEST_CKW_IN_DF
/** Get the flat list of arguments of the kernel*/
virtual GpuKernelArgumentList get_kernel_arguments()
{
return {};
}
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
};
} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_IGPUKERNELWRITER */
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_IGPUKERNELWRITER_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h
index b80ce0d816..f8770920b7 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h
@@ -24,15 +24,12 @@
#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWDRIVER_H
#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWDRIVER_H
-#include "ckw/Kernel.h"
-
#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
#include "src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h"
#include "compute_kernel_writer/include/ckw/Kernel.h"
#include "compute_kernel_writer/include/ckw/KernelArgument.h"
-#include <map>
#include <string>
namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h
index f1f0e6747b..c9ce7eb269 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWSTORE
-#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWSTORE
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWSTORE_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWSTORE_H
#include "src/core/common/Macros.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h"
@@ -33,8 +33,6 @@ namespace experimental
{
namespace dynamic_fusion
{
-/** An interface used by @ref ClTemplateWriter to write source code for a kernel component
- */
class GpuCkwStore : public IGpuCkwComponentDriver
{
public:
@@ -61,4 +59,4 @@ private:
} // namespace experimental
} // namespace arm_compute
-#endif /* ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWSTORE */
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWSTORE_H
diff --git a/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h b/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h
index 4b8eea2f57..6678c929e9 100644
--- a/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h
+++ b/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_IGPUKERNELCOMPONENT
-#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_IGPUKERNELCOMPONENT
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_IGPUKERNELCOMPONENT_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_IGPUKERNELCOMPONENT_H
#include "src/dynamic_fusion/sketch/ArgumentPack.h"
#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h"
@@ -100,10 +100,6 @@ public:
return _properties;
}
/** Get writer for the component */
- virtual const IGpuTemplateComponentWriter *template_writer() const
- {
- return nullptr;
- }
virtual const IGpuCkwComponentDriver *ckw_component_driver() const
{
return nullptr;
@@ -119,4 +115,4 @@ private:
} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
-#endif /* ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_IGPUKERNELCOMPONENT */
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_IGPUKERNELCOMPONENT_H
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.cpp
index fdf528a65d..e316bdf46d 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,11 +24,7 @@
#include "ClComponentActivation.h"
#include "src/core/CL/CLValidate.h"
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.h"
-#else //ACL_INTERNAL_TEST_CKW_IN_DF
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.h"
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
namespace arm_compute
{
@@ -69,11 +65,7 @@ ClComponentActivation::ClComponentActivation(ComponentId
const ArgumentPack<ITensorInfo> &tensors,
const Attributes &attributes)
: IGpuKernelComponent{id, properties, tensors},
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
- _component_writer{std::make_unique<ClTemplateActivation>(id, tensors, attributes)}
-#else //ACL_INTERNAL_TEST_CKW_IN_DF
_component_writer{std::make_unique<GpuCkwActivation>(id, tensors, attributes)}
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
{
}
@@ -81,11 +73,7 @@ ClComponentActivation::~ClComponentActivation()
{
}
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-const IGpuTemplateComponentWriter *ClComponentActivation::template_writer() const
-#else //ACL_INTERNAL_TEST_CKW_IN_DF
const IGpuCkwComponentDriver *ClComponentActivation::ckw_component_driver() const
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
{
return _component_writer.get();
}
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h
index 02c854356a..b8185158f3 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTACTIVATION
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTACTIVATION
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTACTIVATION_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTACTIVATION_H
#include "arm_compute/function_info/ActivationLayerInfo.h"
@@ -41,11 +41,7 @@ template <typename T>
class ArgumentPack;
/** Forward declaration */
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-class ClTemplateActivation;
-#else //ACL_INTERNAL_TEST_CKW_IN_DF
class GpuCkwActivation;
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
class ClComponentActivation final : public IGpuKernelComponent
{
@@ -106,11 +102,7 @@ public:
ClComponentActivation &operator=(ClComponentActivation &&component) = default;
/** Get writer for the component */
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
- const IGpuTemplateComponentWriter *template_writer() const override;
-#else //ACL_INTERNAL_TEST_CKW_IN_DF
- const IGpuCkwComponentDriver *ckw_component_driver() const override;
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
+ const IGpuCkwComponentDriver *ckw_component_driver() const override;
/** Get component type */
GpuComponentType type() const override
@@ -119,13 +111,9 @@ public:
}
private:
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
- std::unique_ptr<ClTemplateActivation> _component_writer;
-#else //ACL_INTERNAL_TEST_CKW_IN_DF
std::unique_ptr<GpuCkwActivation> _component_writer;
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
};
} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTACTIVATION */
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTACTIVATION_H
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.cpp
index b1636795a3..e1850d78c4 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,11 +27,7 @@
#include "src/core/CL/CLValidate.h"
#include "src/dynamic_fusion/sketch/ArgumentPack.h"
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.h"
-#else //ACL_INTERNAL_TEST_CKW_IN_DF
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.h"
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
namespace arm_compute
{
@@ -72,22 +68,16 @@ ClComponentCast::ClComponentCast(ComponentId id,
const Attributes &attributes,
const Settings &settings)
: IGpuKernelComponent{id, properties, tensors},
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
- _component_writer{std::make_unique<ClTemplateCast>(id, tensors, attributes)}
-#else //ACL_INTERNAL_TEST_CKW_IN_DF
_component_writer{std::make_unique<GpuCkwCast>(id, tensors, attributes)}
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
{
ARM_COMPUTE_UNUSED(attributes, settings);
}
+
ClComponentCast::~ClComponentCast()
{
}
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-const IGpuTemplateComponentWriter *ClComponentCast::template_writer() const
-#else //ACL_INTERNAL_TEST_CKW_IN_DF
+
const IGpuCkwComponentDriver *ClComponentCast::ckw_component_driver() const
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
{
return _component_writer.get();
}
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h
index ed77b1203b..201dacc288 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTCAST
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTCAST
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTCAST_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTCAST_H
#include "arm_compute/dynamic_fusion/sketch/attributes/CastAttributes.h"
@@ -49,11 +49,7 @@ private:
};
/** Forward declaration */
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-class ClTemplateCast;
-#else //ACL_INTERNAL_TEST_CKW_IN_DF
class GpuCkwCast;
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
class ClComponentCast final : public IGpuKernelComponent
{
@@ -120,11 +116,7 @@ public:
/** Allow instances of this class to be moved */
ClComponentCast &operator=(ClComponentCast &&component) = default;
/** Get writer for the component */
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
- const IGpuTemplateComponentWriter *template_writer() const override;
-#else //ACL_INTERNAL_TEST_CKW_IN_DF
const IGpuCkwComponentDriver *ckw_component_driver() const override;
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
/** Get component type */
GpuComponentType type() const override
{
@@ -132,14 +124,10 @@ public:
}
private:
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
- std::unique_ptr<ClTemplateCast> _component_writer;
-#else //ACL_INTERNAL_TEST_CKW_IN_DF
- std::unique_ptr<GpuCkwCast> _component_writer;
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
+ std::unique_ptr<GpuCkwCast> _component_writer;
};
} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTCAST */
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTCAST_H
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp
index ca8037c393..7cd23d6115 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,11 +28,7 @@
#include "arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h"
#include "src/core/CL/CLValidate.h"
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.h"
-#else //ACL_INTERNAL_TEST_CKW_IN_DF
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.h"
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
namespace arm_compute
{
@@ -212,22 +208,14 @@ ClComponentDepthwiseConv2d::ClComponentDepthwiseConv2d(ComponentId
const Attributes &attributes,
const Settings &settings)
: IGpuKernelComponent{id, properties, tensors},
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
- _component_writer{std::make_unique<ClTemplateDepthwiseConv2d>(id, tensors, attributes, settings)}
-#else //ACL_INTERNAL_TEST_CKW_IN_DF
_component_writer{std::make_unique<GpuCkwDepthwiseConv2d>(id, tensors, attributes, settings)}
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
{
ARM_COMPUTE_UNUSED(attributes, settings);
}
ClComponentDepthwiseConv2d::~ClComponentDepthwiseConv2d()
{
}
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-const IGpuTemplateComponentWriter *ClComponentDepthwiseConv2d::template_writer() const
-#else //ACL_INTERNAL_TEST_CKW_IN_DF
const IGpuCkwComponentDriver *ClComponentDepthwiseConv2d::ckw_component_driver() const
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
{
return _component_writer.get();
}
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h
index 01168e9ded..7526361f1c 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -44,11 +44,7 @@ class ArgumentPack;
class DepthwiseConv2dAttributes;
/** Forward declaration */
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-class ClTemplateDepthwiseConv2d;
-#else //ACL_INTERNAL_TEST_CKW_IN_DF
class GpuCkwDepthwiseConv2d;
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
/** Component specific settings
*/
@@ -161,13 +157,8 @@ public:
ClComponentDepthwiseConv2d(ClComponentDepthwiseConv2d &&component) = default;
/** Allow instances of this class to be moved */
ClComponentDepthwiseConv2d &operator=(ClComponentDepthwiseConv2d &&component) = default;
- /** Get template writer for the component */
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
- const IGpuTemplateComponentWriter *template_writer() const override;
-#else //ACL_INTERNAL_TEST_CKW_IN_DF
- const IGpuCkwComponentDriver *ckw_component_driver() const override;
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
-
+ /** Get writer for the component */
+ const IGpuCkwComponentDriver *ckw_component_driver() const override;
/** Get component type */
GpuComponentType type() const override
{
@@ -175,11 +166,7 @@ public:
}
private:
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
- std::unique_ptr<ClTemplateDepthwiseConv2d> _component_writer;
-#else //ACL_INTERNAL_TEST_CKW_IN_DF
std::unique_ptr<GpuCkwDepthwiseConv2d> _component_writer;
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
};
} // namespace dynamic_fusion
} // namespace experimental
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp
index 98f3d6a882..783a17df30 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,12 +28,7 @@
#include "arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h"
#include "src/core/CL/CLValidate.h"
-
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h"
-#else // ACL_INTERNAL_TEST_CKW_IN_DF
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h"
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
namespace arm_compute
{
@@ -153,11 +148,7 @@ ClComponentDirectConv2d::ClComponentDirectConv2d(ComponentId
const Attributes &attributes,
const Settings &settings)
: IGpuKernelComponent{id, properties, tensors},
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
- _component_writer{std::make_unique<ClTemplateDirectConv2d>(id, tensors, attributes, settings)}
-#else // ACL_INTERNAL_TEST_CKW_IN_DF
_component_writer{std::make_unique<GpuCkwDirectConv2d>(id, tensors, attributes, settings)}
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
{
}
@@ -165,11 +156,7 @@ ClComponentDirectConv2d::~ClComponentDirectConv2d()
{
}
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-const IGpuTemplateComponentWriter *ClComponentDirectConv2d::template_writer() const
-#else // ACL_INTERNAL_TEST_CKW_IN_DF
const IGpuCkwComponentDriver *ClComponentDirectConv2d::ckw_component_driver() const
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
{
return _component_writer.get();
}
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h
index d6d9705d3c..c50b0fa0ce 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDIRECTCONV2D
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDIRECTCONV2D
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDIRECTCONV2D_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDIRECTCONV2D_H
#include "arm_compute/core/Error.h"
#include "arm_compute/core/KernelDescriptors.h"
@@ -68,11 +68,7 @@ private:
};
/** Forward declaration */
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-class ClTemplateDirectConv2d;
-#else // ACL_INTERNAL_TEST_CKW_IN_DF
class GpuCkwDirectConv2d;
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
class ClComponentDirectConv2d final : public IGpuKernelComponent
{
@@ -139,11 +135,7 @@ public:
/** Allow instances of this class to be moved */
ClComponentDirectConv2d &operator=(ClComponentDirectConv2d &&component) = default;
/** Get writer for the component */
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
- const IGpuTemplateComponentWriter *template_writer() const override;
-#else // ACL_INTERNAL_TEST_CKW_IN_DF
- const IGpuCkwComponentDriver *ckw_component_driver() const override;
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
+ const IGpuCkwComponentDriver *ckw_component_driver() const override;
/** Get component type */
GpuComponentType type() const override
{
@@ -151,13 +143,9 @@ public:
}
private:
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
- std::unique_ptr<ClTemplateDirectConv2d> _component_writer;
-#else // ACL_INTERNAL_TEST_CKW_IN_DF
std::unique_ptr<GpuCkwDirectConv2d> _component_writer;
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
};
} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDIRECTCONV2D */
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDIRECTCONV2D_H
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp
index 5b136427e4..209c73dbee 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,11 +26,7 @@
#include "arm_compute/core/Validate.h"
#include "src/core/CL/CLValidate.h"
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.h"
-#else //ACL_INTERNAL_TEST_CKW_IN_DF
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h"
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
namespace arm_compute
{
@@ -117,19 +113,11 @@ ClComponentElementwiseBinary::ClComponentElementwiseBinary(ComponentId
const ArgumentPack<ITensorInfo> &tensors,
const Attributes &attributes)
: IGpuKernelComponent{id, properties, tensors},
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
- _component_writer{std::make_unique<ClTemplateElementwiseBinary>(id, tensors, attributes)}
-#else //ACL_INTERNAL_TEST_CKW_IN_DF
_component_writer{std::make_unique<GpuCkwElementwiseBinary>(id, tensors, attributes)}
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
{
}
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-const IGpuTemplateComponentWriter *ClComponentElementwiseBinary::template_writer() const
-#else //ACL_INTERNAL_TEST_CKW_IN_DF
const IGpuCkwComponentDriver *ClComponentElementwiseBinary::ckw_component_driver() const
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
{
return _component_writer.get();
}
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h
index 7589b9732c..a4395a6219 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTELEMENTWISEBINARY
-#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTELEMENTWISEBINARY
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTELEMENTWISEBINARY_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTELEMENTWISEBINARY_H
#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
#include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h"
@@ -40,11 +40,7 @@ template <typename T>
class ArgumentPack;
/** Forward declaration */
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-class ClTemplateElementwiseBinary;
-#else //ACL_INTERNAL_TEST_CKW_IN_DF
class GpuCkwElementwiseBinary;
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
class ClComponentElementwiseBinary final : public IGpuKernelComponent
{
@@ -105,12 +101,7 @@ public:
/** Allow instances of this class to be moved */
ClComponentElementwiseBinary &operator=(ClComponentElementwiseBinary &&component) = default;
/** Get writer for the component */
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
- const IGpuTemplateComponentWriter *template_writer() const override;
-#else //ACL_INTERNAL_TEST_CKW_IN_DF
- const IGpuCkwComponentDriver *ckw_component_driver() const override;
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
-
+ const IGpuCkwComponentDriver *ckw_component_driver() const override;
/** Get component type */
GpuComponentType type() const override
{
@@ -118,13 +109,9 @@ public:
}
private:
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
- std::unique_ptr<ClTemplateElementwiseBinary> _component_writer;
-#else //ACL_INTERNAL_TEST_CKW_IN_DF
std::unique_ptr<GpuCkwElementwiseBinary> _component_writer;
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
};
} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
-#endif /* ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTELEMENTWISEBINARY */
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTELEMENTWISEBINARY_H
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.cpp
deleted file mode 100644
index 27c13bd654..0000000000
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.cpp
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright (c) 2022-2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/dynamic_fusion/sketch/attributes/SoftmaxAttributes.h"
-
-#include "src/core/CL/CLValidate.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-Status ClComponentLogits1DMaxShiftExpSum::validate(const Properties &properties,
- const ArgumentPack<ITensorInfo> &tensors,
- const Attributes &attributes)
-{
- ARM_COMPUTE_UNUSED(properties, attributes);
-
- const ITensorInfo *src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
- const ITensorInfo *sum = tensors.get_const_tensor(TensorType::ACL_DST_0);
- const ITensorInfo *dst = tensors.get_const_tensor(TensorType::ACL_DST_1);
-
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(sum);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst);
-
- // 1. Check validity
- // All tensor infos are initialized
- ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() == 0);
- ARM_COMPUTE_RETURN_ERROR_ON(sum->tensor_shape().total_size() == 0);
- ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
-
- // Check for mismatches in shapes and data types
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst, sum);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
-
- // Device requirements are met
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
-
- // 2. Check support level
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
-
- return Status{};
-}
-
-ClComponentLogits1DMaxShiftExpSum::ClComponentLogits1DMaxShiftExpSum(ComponentId id,
- const Properties &properties,
- const ArgumentPack<ITensorInfo> &tensors,
- const Attributes &attributes)
- : IGpuKernelComponent{id, properties, tensors},
- _component_writer{std::make_unique<ClTemplateLogits1DMaxShiftExpSum>(id, tensors, attributes)}
-{
-}
-
-ClComponentLogits1DMaxShiftExpSum::~ClComponentLogits1DMaxShiftExpSum()
-{
-}
-
-const IGpuTemplateComponentWriter *ClComponentLogits1DMaxShiftExpSum::template_writer() const
-{
- return _component_writer.get();
-}
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.h
deleted file mode 100644
index 91ab5de3b5..0000000000
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.h
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Copyright (c) 2022-2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTLOGITS1DMAXSHIFTEXPSUM
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTLOGITS1DMAXSHIFTEXPSUM
-
-#include "arm_compute/dynamic_fusion/sketch/attributes/SoftmaxAttributes.h"
-
-#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
-
-namespace arm_compute
-{
-/** Forward declaration */
-class ITensorInfo;
-namespace experimental
-{
-namespace dynamic_fusion
-{
-/** Forward declaration */
-template <typename T>
-class ArgumentPack;
-
-/** Forward declaration */
-class ClTemplateLogits1DMaxShiftExpSum;
-
-/** Component to calculate max-shifted exponentials and their sum
- *
- * 1D example:
- * input: [x1, x2, ... , xn], shape: (1 x d)
- *
- * Let max(x1...xn) = m
- *
- * (output) sum: [exp(x1-m) + ... + exp(xn-m)], shape: (1 x 1)
- * (output) dst: [exp(x1-m) ... exp(xn-m)], shape: (1 x d)
- *
- * This component is used by the softmax operator. The subsequent
- * operation normalizes dst with sum, so the max shift is harmless:
- * exp(m) cancels between the numerator and the denominator.
-*/
-class ClComponentLogits1DMaxShiftExpSum final : public IGpuKernelComponent
-{
-public:
- /** Attributes are a set of backend-agnostic parameters that define what a component does */
- using Attributes = SoftmaxAttributes;
-
- /** Validate the component
- *
- * @param[in] properties Component properties @ref Properties
- * @param[in] tensors Tensor arguments to the component
- * @param[in] attributes Component attributes @ref Attributes
- *
- * @return Status Validation results
- *
- * Tensor argument names:
- * - ACL_SRC_0: Input
- * - ACL_DST_0: Output
- * - ACL_DST_1: Output
- *
- * Tensor argument constness:
- * - ACL_SRC_0: Const
- * - ACL_DST_0: Const
- * - ACL_DST_1: Const
- *
- * Valid data layouts:
- * - All
- *
- * Valid data type configurations:
- * |ACL_SRC_0 |ACL_DST_0 |ACL_DST_1 |
- * |:----------|:----------|:----------|
- * |F16 | F16 | F16 |
- * |F32 | F32 | F32 |
- */
- static Status
- validate(const Properties &properties, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes);
-
- /** Constructor
- *
- * Similar to @ref ClComponentLogits1DMaxShiftExpSum::validate()
- */
- ClComponentLogits1DMaxShiftExpSum(ComponentId id,
- const Properties &properties,
- const ArgumentPack<ITensorInfo> &tensors,
- const Attributes &attributes);
-
- /** Destructor */
- ~ClComponentLogits1DMaxShiftExpSum() override;
- /** Prevent instances of this class from being copy constructed */
- ClComponentLogits1DMaxShiftExpSum(const ClComponentLogits1DMaxShiftExpSum &component) = delete;
- /** Prevent instances of this class from being copied */
- ClComponentLogits1DMaxShiftExpSum &operator=(const ClComponentLogits1DMaxShiftExpSum &component) = delete;
- /** Allow instances of this class to be move constructed */
- ClComponentLogits1DMaxShiftExpSum(ClComponentLogits1DMaxShiftExpSum &&component) = default;
- /** Allow instances of this class to be moved */
- ClComponentLogits1DMaxShiftExpSum &operator=(ClComponentLogits1DMaxShiftExpSum &&component) = default;
- /** Get template writer for the component */
- const IGpuTemplateComponentWriter *template_writer() const override;
- /** Get component type */
- GpuComponentType type() const override
- {
- return GpuComponentType::Unfusable;
- }
-
-private:
- std::unique_ptr<ClTemplateLogits1DMaxShiftExpSum> _component_writer;
-};
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTLOGITS1DMAXSHIFTEXPSUM */
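The header removed above documents the computation its kernel performed: given logits x1..xn with m = max(xi), produce dst_i = exp(xi - m) and sum = Σ exp(xi - m), i.e. the numerically stable first pass of softmax. A plain scalar sketch of that formula (standard C++ only, illustrative, not the removed GPU component) follows:

// Scalar sketch of the max-shift + exp + sum pass; not the deleted kernel writer.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <iostream>
#include <vector>

// dst_i = exp(src_i - max(src)); sum = sum of all dst_i.
// Shifting by the maximum keeps exp() from overflowing; exp(max) cancels later
// when dst is divided by sum, so the final softmax values are unchanged.
void max_shift_exp_sum(const std::vector<float> &src, std::vector<float> &dst, float &sum)
{
    dst.assign(src.size(), 0.0f);
    sum = 0.0f;
    if (src.empty())
    {
        return;
    }
    const float m = *std::max_element(src.begin(), src.end());
    for (std::size_t i = 0; i < src.size(); ++i)
    {
        dst[i] = std::exp(src[i] - m);
        sum += dst[i];
    }
}

int main()
{
    const std::vector<float> logits{1.0f, 2.0f, 3.0f};
    std::vector<float>       shifted_exp;
    float                    sum = 0.0f;

    max_shift_exp_sum(logits, shifted_exp, sum);
    std::cout << "sum of shifted exponentials: " << sum << '\n'; // ~1.503
    return 0;
}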
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.cpp
deleted file mode 100644
index fb2544385c..0000000000
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.cpp
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/dynamic_fusion/sketch/attributes/SoftmaxAttributes.h"
-
-#include "src/core/CL/CLValidate.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-Status ClComponentLogits1DNorm::validate(const Properties &properties,
- const ArgumentPack<ITensorInfo> &tensors,
- const Attributes &attributes)
-{
- ARM_COMPUTE_UNUSED(properties, attributes);
-
- const ITensorInfo *src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
- const ITensorInfo *sum = tensors.get_const_tensor(TensorType::ACL_SRC_1);
- const ITensorInfo *dst = tensors.get_const_tensor(TensorType::ACL_DST_0);
-
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(sum);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst);
-
- // 1. Check validity
- // All tensor infos are initialized
- ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() == 0);
- ARM_COMPUTE_RETURN_ERROR_ON(sum->tensor_shape().total_size() == 0);
- ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
-
- // Check for mismatches in shapes and data types
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst, sum);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
-
- ARM_COMPUTE_RETURN_ERROR_ON(attributes.is_log_softmax() && !is_data_type_float(src->data_type()));
-
- // Device requirements are met
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
-
- // 2. Check support level
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
-
- return Status{};
-}
-
-ClComponentLogits1DNorm::ClComponentLogits1DNorm(ComponentId id,
- const Properties &properties,
- const ArgumentPack<ITensorInfo> &tensors,
- const Attributes &attributes)
- : IGpuKernelComponent{id, properties, tensors},
- _component_writer{std::make_unique<ClTemplateLogits1DNorm>(id, tensors, attributes)}
-{
-}
-
-ClComponentLogits1DNorm::~ClComponentLogits1DNorm()
-{
-}
-
-const IGpuTemplateComponentWriter *ClComponentLogits1DNorm::template_writer() const
-{
- return _component_writer.get();
-}
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.h
deleted file mode 100644
index 74c0273604..0000000000
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.h
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTLOGITS1DNORM
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTLOGITS1DNORM
-
-#include "arm_compute/dynamic_fusion/sketch/attributes/SoftmaxAttributes.h"
-
-#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
-
-namespace arm_compute
-{
-/** Forward declaration */
-class ITensorInfo;
-namespace experimental
-{
-namespace dynamic_fusion
-{
-/** Forward declaration */
-template <typename T>
-class ArgumentPack;
-
-/** Forward declaration */
-class ClTemplateLogits1DNorm;
-
-/** Component to calculate the final step of the Softmax Layer
- * where each logit value is multiplied by the inverse of the sum of the logits.
- *
- * 1D example:
- *
- * (input) src: [x1 x2 ... xn], shape: (1 x d)
- * (input) sum: [x1 + x2 + ... + xn], shape: (1 x 1)
- * (output) dst: [x1/sum x2/sum ... xn/sum], shape: (1 x d)
- *
- * This component is used by the softmax operator to get the final result.
-*/
-class ClComponentLogits1DNorm final : public IGpuKernelComponent
-{
-public:
- /** Attributes are a set of backend-agnostic parameters that define what a component does */
- using Attributes = SoftmaxAttributes;
-
- /** Validate the component
- *
- * @param[in] properties Component properties @ref Properties
- * @param[in] tensors Tensor arguments to the component
- * @param[in] attributes Component attributes @ref Attributes
- *
- * @return Status Validation results
- *
- * Tensor argument names:
- * - ACL_SRC_0: Input
- * - ACL_SRC_1: Input
- * - ACL_DST_0: Output
- *
- * Tensor argument constness:
- * - ACL_SRC_0: Const
- * - ACL_SRC_1: Const
- * - ACL_DST_0: Const
- *
- * Valid data layouts:
- * - All
- *
- * Valid data type configurations:
- * |ACL_SRC_0 |ACL_SRC_1 |ACL_DST_0 |
- * |:----------|:----------|:----------|
- * |F16 | F16 | F16 |
- * |F32 | F32 | F32 |
- */
- static Status
- validate(const Properties &properties, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes);
-
- /** Constructor
- *
- * Similar to @ref ClComponentLogits1DNorm::validate()
- */
- ClComponentLogits1DNorm(ComponentId id,
- const Properties &properties,
- const ArgumentPack<ITensorInfo> &tensors,
- const Attributes &attributes);
-
- /** Destructor */
- ~ClComponentLogits1DNorm() override;
- /** Prevent instances of this class from being copy constructed */
- ClComponentLogits1DNorm(const ClComponentLogits1DNorm &component) = delete;
- /** Prevent instances of this class from being copied */
- ClComponentLogits1DNorm &operator=(const ClComponentLogits1DNorm &component) = delete;
- /** Allow instances of this class to be move constructed */
- ClComponentLogits1DNorm(ClComponentLogits1DNorm &&component) = default;
- /** Allow instances of this class to be moved */
- ClComponentLogits1DNorm &operator=(ClComponentLogits1DNorm &&component) = default;
- /** Get template writer for the component */
- const IGpuTemplateComponentWriter *template_writer() const override;
- /** Get component type */
- GpuComponentType type() const override
- {
- return GpuComponentType::Unfusable;
- }
-
-private:
- std::unique_ptr<ClTemplateLogits1DNorm> _component_writer;
-};
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTLOGITS1DNORM */
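The companion header removed above describes the final softmax step, dst_i = exp(xi - m) / sum. Continuing the scalar sketch from the previous aside (again illustrative only, not the deleted component writer):

// Scalar sketch of the normalisation pass that completes the softmax.
#include <cstddef>
#include <iostream>
#include <vector>

// dst_i = shifted_exp_i / sum; after this the elements of dst add up to 1.
void logits_1d_norm(const std::vector<float> &shifted_exp, float sum, std::vector<float> &dst)
{
    dst.resize(shifted_exp.size());
    for (std::size_t i = 0; i < shifted_exp.size(); ++i)
    {
        dst[i] = shifted_exp[i] / sum;
    }
}

int main()
{
    // Values as produced by the max-shift pass for logits {1, 2, 3}.
    const std::vector<float> shifted_exp{0.13534f, 0.36788f, 1.0f};
    float                    sum = 0.0f;
    for (float v : shifted_exp)
    {
        sum += v;
    }

    std::vector<float> softmax;
    logits_1d_norm(shifted_exp, sum, softmax);
    for (float v : softmax)
    {
        std::cout << v << ' '; // ~0.090 0.245 0.665
    }
    std::cout << '\n';
    return 0;
}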
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.cpp
index f238d42d98..53ac8da41f 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,7 +21,6 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifdef ACL_INTERNAL_TEST_CKW_IN_DF
#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.h"
@@ -147,5 +146,3 @@ const IGpuCkwComponentDriver *ClComponentMatMul::ckw_component_driver() const
} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
-
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp
index 5544963b3f..6e7243dc04 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp
@@ -30,7 +30,6 @@
#include "src/core/CL/CLValidate.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.h"
#include "src/dynamic_fusion/utils/Utils.h"
#include <memory>
@@ -93,27 +92,16 @@ ClComponentPool2d::ClComponentPool2d(ComponentId id,
const Attributes &attributes,
const Settings &settings)
: IGpuKernelComponent{id, properties, tensors},
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
- _component_writer{std::make_unique<ClTemplatePool2d>(id, tensors, attributes, settings)}
-#else //ACL_INTERNAL_TEST_CKW_IN_DF
_component_writer{std::make_unique<GpuCkwPool2d>(id, tensors, attributes, settings)}
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
{
}
ClComponentPool2d::~ClComponentPool2d()
{
}
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-const IGpuTemplateComponentWriter *ClComponentPool2d::template_writer() const
-{
- return _component_writer.get();
-}
-#else //ACL_INTERNAL_TEST_CKW_IN_DF
const IGpuCkwComponentDriver *ClComponentPool2d::ckw_component_driver() const
{
return _component_writer.get();
}
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h
index 98fed65004..d33e601f18 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -42,11 +42,7 @@ class ArgumentPack;
class Pool2dAttributes;
/** Forward declaration */
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-class ClTemplatePool2d;
-#else // ACL_INTERNAL_TEST_CKW_IN_DF
class GpuCkwPool2d;
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
class ClComponentPool2d final : public IGpuKernelComponent
{
@@ -116,13 +112,9 @@ public:
/** Allow instances of this class to be moved */
ClComponentPool2d &operator=(ClComponentPool2d &&component) = default;
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
- /** Get template writer for the component */
- const IGpuTemplateComponentWriter *template_writer() const override;
-#else // ACL_INTERNAL_TEST_CKW_IN_DF
+
/** Get GPU kernel writer for the component */
const IGpuCkwComponentDriver *ckw_component_driver() const override;
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
/** Get component type */
GpuComponentType type() const override
@@ -131,11 +123,7 @@ public:
}
private:
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
- std::unique_ptr<ClTemplatePool2d> _component_writer;
-#else // ACL_INTERNAL_TEST_CKW_IN_DF
std::unique_ptr<GpuCkwPool2d> _component_writer;
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
};
} // namespace dynamic_fusion
} // namespace experimental
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.cpp
index 0ece9de970..dce85c424e 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,7 +27,6 @@
#include "arm_compute/core/Validate.h"
#include "src/core/CL/CLValidate.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.h"
namespace arm_compute
{
@@ -54,15 +53,16 @@ Status ClComponentReshape::validate(const ArgumentPack<ITensorInfo> &tensors)
ClComponentReshape::ClComponentReshape(ComponentId id,
const Properties &properties,
const ArgumentPack<ITensorInfo> &tensors)
- : IGpuKernelComponent{id, properties, tensors}, _component_writer{std::make_unique<ClTemplateReshape>(id, tensors)}
+ : IGpuKernelComponent{id, properties, tensors}
{
}
ClComponentReshape::~ClComponentReshape()
{
}
-const IGpuTemplateComponentWriter *ClComponentReshape::template_writer() const
+const IGpuCkwComponentDriver *ClComponentReshape::ckw_component_driver() const
{
- return _component_writer.get();
+ /* NOT IMPLEMENTED */
+ return nullptr;
}
} // namespace dynamic_fusion
} // namespace experimental
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h
index 78163d6603..fd0f966da1 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTRESHAPE
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTRESHAPE
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTRESHAPE_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTRESHAPE_H
#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
@@ -85,8 +85,8 @@ public:
ClComponentReshape(ClComponentReshape &&component) = default;
/** Allow instances of this class to be moved */
ClComponentReshape &operator=(ClComponentReshape &&component) = default;
- /** Get template writer for the component */
- const IGpuTemplateComponentWriter *template_writer() const override;
+ /** Get writer for the component */
+ const IGpuCkwComponentDriver *ckw_component_driver() const override;
/** Get component type */
GpuComponentType type() const override
{
@@ -94,10 +94,9 @@ public:
}
private:
- std::unique_ptr<ClTemplateReshape> _component_writer;
};
} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTRESHAPE */
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTRESHAPE_H
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.cpp
index b05eb04698..411eeca802 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,12 +29,7 @@
#include "src/core/CL/CLValidate.h"
#include "src/core/utils/ScaleUtils.h"
#include "src/dynamic_fusion/sketch/ArgumentPack.h"
-
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.h"
-#else // ACL_INTERNAL_TEST_CKW_IN_DF
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.h"
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
namespace arm_compute
{
@@ -43,11 +38,7 @@ namespace experimental
namespace dynamic_fusion
{
/** Forward declaration */
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-class ClTemplateResize;
-#else // ACL_INTERNAL_TEST_CKW_IN_DF
class GpuCkwResize;
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
Status ClComponentResize::validate(const IGpuKernelComponent::Properties &properties,
const ArgumentPack<ITensorInfo> &tensors,
@@ -82,11 +73,7 @@ ClComponentResize::ClComponentResize(ComponentId id,
const ArgumentPack<ITensorInfo> &tensors,
const ClComponentResize::Attributes &attributes)
: IGpuKernelComponent{id, properties, tensors},
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
- _component_writer{std::make_unique<ClTemplateResize>(id, tensors, attributes)}
-#else // ACL_INTERNAL_TEST_CKW_IN_DF
_component_writer{std::make_unique<GpuCkwResize>(id, tensors, attributes)}
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
{
}
@@ -94,11 +81,7 @@ ClComponentResize::~ClComponentResize()
{
}
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-const IGpuTemplateComponentWriter *ClComponentResize::template_writer() const
-#else // ACL_INTERNAL_TEST_CKW_IN_DF
const IGpuCkwComponentDriver *ClComponentResize::ckw_component_driver() const
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
{
return _component_writer.get();
}
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h
index 29276c3257..9a1169c45f 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,8 +22,8 @@
* SOFTWARE.
*/
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTRESIZE
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTRESIZE
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTRESIZE_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTRESIZE_H
#include "arm_compute/dynamic_fusion/sketch/attributes/ResizeAttributes.h"
@@ -42,11 +42,7 @@ template <typename T>
class ArgumentPack;
/** Forward declaration */
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-class ClTemplateResize;
-#else // ACL_INTERNAL_TEST_CKW_IN_DF
class GpuCkwResize;
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
class ClComponentResize final : public IGpuKernelComponent
{
@@ -111,11 +107,7 @@ public:
ClComponentResize &operator=(ClComponentResize &&component) = default;
/** Get writer for the component */
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
- const IGpuTemplateComponentWriter *template_writer() const override;
-#else // ACL_INTERNAL_TEST_CKW_IN_DF
const IGpuCkwComponentDriver *ckw_component_driver() const override;
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
/** Get component type */
GpuComponentType type() const override
@@ -124,15 +116,11 @@ public:
}
private:
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
- std::unique_ptr<ClTemplateResize> _component_writer;
-#else // ACL_INTERNAL_TEST_CKW_IN_DF
std::unique_ptr<GpuCkwResize> _component_writer;
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
};
} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTRESIZE */
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTRESIZE_H
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp
index dcbecaff35..3db6c5cd2d 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,11 +24,7 @@
#include "ClComponentStore.h"
#include "src/dynamic_fusion/sketch/ArgumentPack.h"
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h"
-#else //ACL_INTERNAL_TEST_CKW_IN_DF
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h"
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
#include <memory>
@@ -46,22 +42,13 @@ Status ClComponentStore::validate(const Properties &properties, const ArgumentPa
ClComponentStore::ClComponentStore(ComponentId id,
const Properties &properties,
const ArgumentPack<ITensorInfo> &tensors)
- : IGpuKernelComponent{id, properties, tensors},
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
- _component_writer{std::make_unique<ClTemplateStore>(id, tensors)}
-#else //ACL_INTERNAL_TEST_CKW_IN_DF
- _component_writer{std::make_unique<GpuCkwStore>(id, tensors)}
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
+ : IGpuKernelComponent{id, properties, tensors}, _component_writer{std::make_unique<GpuCkwStore>(id, tensors)}
{
}
ClComponentStore::~ClComponentStore()
{
}
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-const IGpuTemplateComponentWriter *ClComponentStore::template_writer() const
-#else //ACL_INTERNAL_TEST_CKW_IN_DF
const IGpuCkwComponentDriver *ClComponentStore::ckw_component_driver() const
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
{
return _component_writer.get();
}
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h
index 948785c480..2c1dd0f6fc 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTSTORE
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTSTORE
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTSTORE_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTSTORE_H
#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
@@ -39,11 +39,7 @@ namespace dynamic_fusion
/** Forward declaration */
template <typename T>
class ArgumentPack;
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-class ClTemplateStore;
-#else //ACL_INTERNAL_TEST_CKW_IN_DF
class GpuCkwStore;
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
class ClComponentStore final : public IGpuKernelComponent
{
@@ -88,11 +84,7 @@ public:
/** Allow instances of this class to be moved */
ClComponentStore &operator=(ClComponentStore &&component) = default;
/** Get writer for the component */
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
- const IGpuTemplateComponentWriter *template_writer() const override;
-#else //ACL_INTERNAL_TEST_CKW_IN_DF
const IGpuCkwComponentDriver *ckw_component_driver() const override;
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
/** Get component type */
GpuComponentType type() const override
{
@@ -100,13 +92,9 @@ public:
}
private:
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
- std::unique_ptr<ClTemplateStore> _component_writer;
-#else //ACL_INTERNAL_TEST_CKW_IN_DF
- std::unique_ptr<GpuCkwStore> _component_writer;
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
+ std::unique_ptr<GpuCkwStore> _component_writer;
};
} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTSTORE */
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTSTORE_H
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuClamp.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuClamp.cpp
index 697b7d4e1f..4d6e7f81bb 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/GpuClamp.cpp
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuClamp.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,7 +30,6 @@
#include "src/dynamic_fusion/sketch/ArgumentPack.h"
#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h"
#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.h"
namespace arm_compute
{
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuMatMul.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuMatMul.cpp
index e24629a036..2997b28ec1 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/GpuMatMul.cpp
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuMatMul.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,7 +21,6 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifdef ACL_INTERNAL_TEST_CKW_IN_DF
#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMatMul.h"
@@ -244,4 +243,3 @@ ITensorInfo *GpuMatMul::create_op(GpuWorkloadSketch &sketch,
} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.cpp
index 431c9110fc..d385752201 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.cpp
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.cpp
@@ -28,8 +28,6 @@
#include "src/common/utils/Log.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/dynamic_fusion/sketch/ArgumentPack.h"
-#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.h"
-#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.h"
#include "src/dynamic_fusion/sketch/gpu/GpuOperatorProperties.h"
#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
@@ -88,9 +86,8 @@ Status GpuSoftmax::is_supported_op(const GpuWorkloadContext &context,
arguments_norm.add_const_tensor(ACL_SRC_1, &sum);
arguments_norm.add_const_tensor(ACL_DST_0, &dst_info_to_validate);
- ARM_COMPUTE_RETURN_ON_ERROR(
- ClComponentLogits1DMaxShiftExpSum::validate(properties, arguments_exp_sum, attributes));
- ARM_COMPUTE_RETURN_ON_ERROR(ClComponentLogits1DNorm::validate(properties, arguments_norm, attributes));
+ ARM_COMPUTE_UNUSED(properties, attributes);
+ return Status(ErrorCode::RUNTIME_ERROR, "GpuSoftmax is not implemented");
}
else
{
@@ -177,8 +174,8 @@ void GpuSoftmax::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, ITensorI
arguments_norm.add_const_tensor(ACL_SRC_1, sum);
arguments_norm.add_const_tensor(ACL_DST_0, dst);
- comp_graph.add_new_component<ClComponentLogits1DMaxShiftExpSum>(properties, arguments_exp_sum, attributes);
- comp_graph.add_new_component<ClComponentLogits1DNorm>(properties, arguments_norm, attributes);
+ // Add to component graph -- NOT IMPLEMENTED
+ ARM_COMPUTE_UNUSED(comp_graph, attributes);
}
}
else
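With the two Logits1D components removed and no CKW replacement added here, softmax support queries now fail explicitly instead of dispatching to the deleted validators. A hedged caller-side sketch (the full is_supported_op() parameter list is assumed from the public dynamic fusion API rather than shown in this hunk):

// Illustrative only: after this patch a softmax support query is expected to report failure.
const Status status = GpuSoftmax::is_supported_op(context, &src_info, &dst_info, attributes);
if (status.error_code() != ErrorCode::OK)
{
    std::cerr << status.error_description() << std::endl; // "GpuSoftmax is not implemented"
}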
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuTanh.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuTanh.cpp
index bf0f274c5c..b9d01966b3 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/GpuTanh.cpp
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuTanh.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -31,7 +31,6 @@
#include "src/dynamic_fusion/sketch/ArgumentPack.h"
#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h"
#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.h"
namespace arm_compute
{
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp
deleted file mode 100644
index 775b0a0c8c..0000000000
--- a/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Copyright (c) 2022-2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "GpuKernelVariableTable.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/ITensorInfo.h"
-
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-void GpuKernelVariableTable::declare_variable(const GpuKernelComponentGroup &comp_group,
- const ITensorInfo *tensor,
- GpuKernelArgumentInfo argument_info,
- const std::string &alias)
-{
- ARM_COMPUTE_ERROR_ON_MSG(!tensor->has_valid_id(), "Tensor info with valid id expected");
-
- // Do not re-declare if the variable associated with the tensor has already been declared
- auto it = _vars.find(tensor->id());
-
- if (it != _vars.end())
- {
- ARM_COMPUTE_ERROR_ON(!(it->second.kernel_argument_info == argument_info));
- return;
- }
-
- const auto target = comp_group.get_tile_for_tensor(tensor);
-
- if (target != tensor)
- {
- // If the tensor uses a shared tile, don't declare another variable.
- it = _vars.find(target->id());
-
- ARM_COMPUTE_ERROR_ON_MSG(it == _vars.end(), "The variable used for this tensor must have been declared.");
-
- _vars[tensor->id()] = it->second;
- }
- else
- {
- // Declare variable associated with the tensor
- std::stringstream ss;
- ss << alias << "_t" << abs(tensor->id());
- const auto uniq_name = ss.str();
- TensorVariable var{tensor->id(), uniq_name, argument_info};
-
- _vars.emplace(tensor->id(), var);
- }
-}
-
-GpuKernelVariableTable::TensorVariable GpuKernelVariableTable::get_variable(const ITensorInfo *tensor) const
-{
- const auto var = _vars.at(tensor->id());
- return var;
-}
-
-GpuKernelVariableTable::VariableList
-GpuKernelVariableTable::get_variable_list(const std::vector<const ITensorInfo *> &tensors) const
-{
- VariableList vars{};
- for (const auto &tensor : tensors)
- {
- if (!tensor->has_valid_id())
- {
- continue;
- }
- vars.push_back(get_variable(tensor));
- }
- return vars;
-}
-
-TagVal::TagVal(const GpuKernelVariableTable::TensorVariable &var) : value{var.uniq_name}
-{
-}
-
-TagVal::TagVal(const std::string &val) : value{val}
-{
-}
-
-TagVal::TagVal(const char *val) : value{std::string(val)}
-{
-}
-
-TagVal::TagVal(const DataType &data_type) : value{get_cl_type_from_data_type(data_type)}
-{
-}
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
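The deleted table had one non-obvious behaviour worth recording: a tensor whose tile is shared with another tensor (comp_group.get_tile_for_tensor() returns a different tensor) reuses that tensor's already-declared variable, while an unshared tensor gets a fresh name of the form <alias>_t<abs(id)>. A small standalone restatement of that de-duplication logic, with the component group reduced to a plain shared_with id (a simplified model, not the library code):

#include <cstdlib>
#include <map>
#include <string>

// Simplified stand-in for the removed GpuKernelVariableTable.
struct Variable
{
    int         id{-1};
    std::string uniq_name{"empty"};
};

class VariableTable
{
public:
    // 'shared_with' plays the role of comp_group.get_tile_for_tensor(): a negative value
    // means the tensor owns its own tile and therefore gets its own variable.
    void declare_variable(int tensor_id, const std::string &alias, int shared_with = -1)
    {
        if (_vars.count(tensor_id) != 0)
        {
            return; // do not re-declare
        }
        if (shared_with >= 0)
        {
            _vars[tensor_id] = _vars.at(shared_with); // reuse the variable of the shared tile
            return;
        }
        _vars[tensor_id] = Variable{tensor_id, alias + "_t" + std::to_string(std::abs(tensor_id))};
    }

    Variable get_variable(int tensor_id) const
    {
        return _vars.at(tensor_id);
    }

private:
    std::map<int, Variable> _vars{};
};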
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h b/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h
deleted file mode 100644
index c17f131ada..0000000000
--- a/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * Copyright (c) 2022-2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_GPUKERNELVARIABLETABLE
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_GPUKERNELVARIABLETABLE
-
-#include "arm_compute/core/ITensorInfo.h"
-
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
-#include "support/AclRequires.h"
-#include "support/StringSupport.h"
-
-#include <set>
-#include <string>
-#include <type_traits>
-#include <unordered_map>
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-class GpuKernelComponentGroup;
-
-/** A table of all the variables used in the kernel.
- * Each kernel has exactly one variable table.
- */
-class GpuKernelVariableTable
-{
-public:
- /** A tensor variable whose main purposes are:
- * - Hold the newly assigned @ref GpuKernelArgumentInfo for the associated tensor info
- * - Hold the generated variable name for the associated tensor info
- */
- struct TensorVariable
- {
- public:
- TensorVariable() = default;
- TensorVariable(const TensorVariable &) = default;
- TensorVariable &operator=(const TensorVariable &) = default;
- ITensorInfo::Id id{ITensorInfo::invalid_tensor_id};
- std::string uniq_name{"empty"}; // Unique name, also the final variable name used in the built code
- GpuKernelArgumentInfo kernel_argument_info{};
- bool has_valid_id() const
- {
- return id != ITensorInfo::invalid_tensor_id;
- }
- };
- using VariableList = std::vector<TensorVariable>;
-
-public:
- /** Declare a @ref TensorVariable for a corresponding tensor info.
- *
- * @param[in] comp_group Component group the tensor belongs to
- * @param[in] tensor Tensor info with which the new variable is associated
- * @param[in] argument_info Kernel argument information
- * @param[in] alias Alias for the variable. Will be used as part of the variable name
- */
- void declare_variable(const GpuKernelComponentGroup &comp_group,
- const ITensorInfo *tensor,
- GpuKernelArgumentInfo argument_info,
- const std::string &alias = "unnamed");
- /** Get the @ref TensorVariable associated with @p tensor
- *
- * @param[in] tensor Tensor info to be queried
- *
- * @return TensorVariable
- */
- TensorVariable get_variable(const ITensorInfo *tensor) const;
- /** Get the @ref TensorVariable list associated with @p tensors
- * @note Empty tensors are skipped
- *
- * @param[in] tensors List of tensor infos to be queried
- *
- * @return VariableList
- */
- VariableList get_variable_list(const std::vector<const ITensorInfo *> &tensors) const;
-
-private:
- std::map<ITensorInfo::Id, TensorVariable> _vars{};
-};
-
-/** A tag value will substitute a tag in a string template during its instantiation */
-struct TagVal
-{
- /** Default constructor */
- TagVal() = default;
- /** Construct a @ref TagVal from a @ref GpuKernelVariableTable::TensorVariable */
- TagVal(const GpuKernelVariableTable::TensorVariable &var);
- /** Construct a @ref TagVal from an integral type */
- template <typename T, ARM_COMPUTE_REQUIRES_TA(std::is_integral<T>::value)>
- TagVal(T val) : value{support::cpp11::to_string(val)}
- {
- }
- /** Construct a @ref TagVal from a string */
- TagVal(const std::string &val);
- /** Construct a @ref TagVal from a c-style string */
- TagVal(const char *val);
- /** Construct a @ref TagVal from a @ref DataType */
- TagVal(const DataType &data_type);
- /** Get the value of the TagVal as a converted string */
- std::string value{};
-};
-
-/** A tag used in a string template is a placeholder string to be substituted by real values during template instantiation */
-using Tag = std::string;
-
-/** Tag lookup table. It is used to instantiate a string template */
-using TagLUT = std::unordered_map<Tag, TagVal>;
-
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_GPUKERNELVARIABLETABLE */
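The Tag/TagVal/TagLUT trio existed purely to drive string-template instantiation: whatever its source type, every tag value ends up as a string that replaces the matching "{{tag}}" placeholder. The substitution routine itself lived in the (also removed) ClTemplateWriter and is not part of this diff, so the following is only a self-contained sketch of the idea:

#include <iostream>
#include <string>
#include <unordered_map>

// Minimal model of the removed machinery: every tag value is kept as a plain string.
using Tag    = std::string;
using TagLUT = std::unordered_map<Tag, std::string>;

// Replace each "{{tag}}" occurrence in the code template with its value from the lookup table.
std::string instantiate(std::string code, const TagLUT &lut)
{
    for (const auto &kv : lut)
    {
        const std::string pattern = "{{" + kv.first + "}}";
        for (std::string::size_type pos = code.find(pattern); pos != std::string::npos; pos = code.find(pattern, pos))
        {
            code.replace(pos, pattern.size(), kv.second);
            pos += kv.second.size();
        }
    }
    return code;
}

int main()
{
    const TagLUT lut{{"src", "src_t0"}, {"dst", "dst_t1"}, {"DATA_TYPE", "half"}};
    std::cout << instantiate("T_ACTIVATION({{DATA_TYPE}}, M0, N0, relu, 0, 0, {{src}}, {{dst}});\n", lut);
    return 0;
}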
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h b/src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h
deleted file mode 100644
index 9d0b4f592a..0000000000
--- a/src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_IGPUTEMPLATECOMPONENTWRITER
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_IGPUTEMPLATECOMPONENTWRITER
-
-#include "arm_compute/core/CL/CLCompileContext.h"
-#include "arm_compute/core/ITensorInfo.h"
-#include "arm_compute/core/Window.h"
-
-#include "src/dynamic_fusion/sketch/ArgumentPack.h"
-#include "src/dynamic_fusion/sketch/gpu/components/Types.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-/** Forward declaration */
-class GpuKernelComponentGroup;
-class GpuKernelVariableTable;
-
-/** An interface used by @ref ClTemplateWriter to write source code for a kernel component
- */
-class IGpuTemplateComponentWriter
-{
-public:
- using ComponentGroup = GpuKernelComponentGroup;
-
-    /** For now all kernel intermediate/destination tensors are expected to be of type Tensor_4D_t_Buffer */
- static constexpr GpuKernelArgumentInfo::Type common_tensor_type = GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer;
-
-public:
- /** Constructor
- *
- * @param[in] id Component id
- * @param[in] tensors Tensor arguments to the components
- */
- IGpuTemplateComponentWriter(ComponentId id, const ArgumentPack<ITensorInfo> &tensors) : _id{id}, _tensors{tensors}
- {
- }
- /** Destructor */
- virtual ~IGpuTemplateComponentWriter()
- {
- }
- /** Generate kernel component name */
- virtual std::string get_name() const = 0;
- /** Generate kernel component code template
- *
- * @param[in] comp_group Component group of which the component is a part of
- *
- * @return std::string Component code
- */
- virtual std::string get_component_code(const ComponentGroup &comp_group) const = 0;
- /** Declare all variables used by the component in the @p vtable
- *
- * @param[out] vtable Variable table
- * @param[in] comp_group Component group of which the component is a part of
- */
- virtual void declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const = 0;
- /** Generate the tag look-up table used to instantiate the component code.
- *
- * @param[in] vtable Variable table
- * @param[in] comp_group Component group of which the component is a part of
- *
- * @return TagLUT Tag lookup table
- */
- virtual TagLUT get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const = 0;
- /** Generate additional macros used in the component */
- virtual std::string get_additional_macros() const
- {
- return "";
- }
- /** Generate the build options used in the component
- *
- * @param[in] comp_group Component group of which the component is a part of
- *
- * @return CLBuildOptions Build options
- */
- virtual CLBuildOptions get_build_options(const ComponentGroup &comp_group) const
- {
- ARM_COMPUTE_UNUSED(comp_group);
- return CLBuildOptions{};
- }
- /** Generate the component config id string used for tuning */
- virtual std::string get_config_id() const
- {
- return "";
- }
- /** Generate the header list used in the component */
- virtual std::set<std::string> get_headers_list() const
- {
- return std::set<std::string>{};
- }
- /** Generate the execution window for the component */
- virtual Window get_window() const
- {
- return Window{};
- }
- /** Get tensor arguments */
- ArgumentPack<ITensorInfo> tensors() const
- {
- return _tensors;
- }
- /** Get component id */
- ComponentId id() const
- {
- return _id;
- }
-
-private:
- ComponentId _id{-1};
- ArgumentPack<ITensorInfo> _tensors{};
-};
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_IGPUTEMPLATECOMPONENTWRITER */
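For orientation, this is roughly how the kernel-level writer consumed the interface above: declare every component's variables into one shared table, then instantiate each component's code template with its tag LUT and concatenate the results. The loop below is a hedged sketch, not the removed ClTemplateWriter (which is outside this diff); the member functions it calls are the ones declared in this interface, and instantiate() stands for the tag-substitution step sketched earlier:

#include <string>
#include <vector>

// Hedged sketch of the kernel-level driving loop over the removed interface.
std::string write_kernel_code(const std::vector<const IGpuTemplateComponentWriter *> &writers,
                              const GpuKernelComponentGroup                          &comp_group)
{
    GpuKernelVariableTable vtable{};
    std::string            code{};

    for (const auto *writer : writers)
    {
        writer->declare_variables(vtable, comp_group); // fill the shared variable table first
    }
    for (const auto *writer : writers)
    {
        const TagLUT lut = writer->get_tag_lut(vtable, comp_group);
        code += instantiate(writer->get_component_code(comp_group), lut); // tag substitution
    }
    return code;
}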
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp
deleted file mode 100644
index c165fb5f33..0000000000
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp
+++ /dev/null
@@ -1,181 +0,0 @@
-/*
- * Copyright (c) 2022-2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "ClTemplateActivation.h"
-
-#include "arm_compute/core/utils/ActivationFunctionUtils.h"
-#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
-#include "arm_compute/core/utils/StringUtils.h"
-
-#include "src/core/helpers/WindowHelpers.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-ClTemplateActivation::ClTemplateActivation(ComponentId id,
- const ArgumentPack<ITensorInfo> &tensors,
- const Attributes &attributes)
- : IGpuTemplateComponentWriter{id, tensors}, _src{}, _dst{}, _attributes{attributes}
-{
- _src = this->tensors().get_const_tensor(TensorType::ACL_SRC);
- _dst = this->tensors().get_const_tensor(TensorType::ACL_DST);
- ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
-}
-
-std::string ClTemplateActivation::get_name() const
-{
- return "activation";
-}
-
-std::string ClTemplateActivation::get_component_code(const ComponentGroup &comp_group) const
-{
- std::string code;
- const bool is_root = (comp_group.get_root_component()->id() == this->id());
-
- code = R"_(
-//------------------ START KERNEL {{meta_kernel_id}} ---------------------
-)_";
- if (is_root)
- {
- code += R"_(
-// IN(src) {{src}}
-// OUT(dst, accum) {{dst}}
-
-TILE({{DATA_TYPE}}, M0, N0, {{src}});
-TILE(uint, M0, 1, g_dst_indirect_y);
-{
- {{src}}_offset_first_element_in_bytes += g_ind_2 * {{src}}_stride_z;
-
- T_LOAD({{DATA_TYPE}}, M0, N0, {{TENSOR_TYPE}}, {{src}}, g_ind_0, g_ind_1, 1, {{src}}_stride_y, {{src}});
-
- T_ACTIVATION({{DATA_TYPE}}, M0, N0, {{ACT}}, {{A_VAL}}, {{B_VAL}}, {{src}}, {{dst}});
-}
-
-LOOP_UNROLLING(int, i, 0, 1, M0,
-{
- g_dst_indirect_y[i].v = (uint)min((int)(g_ind_1 + i), (int)({{arg_dst}}_w) - 1);
- g_dst_indirect_y[i].v += (int)(g_ind_2 % {{arg_dst}}_h) * (int)({{arg_dst}}_w);
- g_dst_indirect_y[i].v += (int)(g_ind_2 / {{arg_dst}}_h) * (int)({{arg_dst}}_w * {{arg_dst}}_h);
-})
-)_";
- }
- else
- {
- code += R"_(
-// IN/OUT(src, accum) {{src}}
-
-{
- T_ACTIVATION({{DATA_TYPE}}, M0, N0, {{ACT}}, {{A_VAL}}, {{B_VAL}}, {{src}}, {{dst}});
-}
-)_";
- }
- code += R"_(
-//------------------ END KERNEL {{meta_kernel_id}} ---------------------
-)_";
- return code;
-}
-
-void ClTemplateActivation::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
-{
- vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- "src");
-
- vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- "dst");
-}
-
-TagLUT ClTemplateActivation::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
-{
- ARM_COMPUTE_UNUSED(comp_group);
-
- TagLUT lut{};
- // Arguments and global shared variables
- lut["src"] = vtable.get_variable(_src);
- lut["dst"] = vtable.get_variable(_dst);
-
- const auto dst_argument = vtable.get_variable(comp_group.get_any_dst_tensor());
- lut["arg_dst"] = dst_argument.uniq_name;
-
- // Local build options
- lut["meta_kernel_id"] = id();
- lut["DATA_TYPE"] = get_cl_type_from_data_type(_src->data_type());
- lut["TENSOR_TYPE"] = "BUFFER";
-
- const auto f_act = lower_string(string_from_activation_func(_attributes.activation()));
-
- lut["ACT"] = f_act;
- lut["A_VAL"] = float_to_string_with_full_precision(_attributes.a());
- lut["B_VAL"] = float_to_string_with_full_precision(_attributes.b());
-
- return lut;
-}
-
-CLBuildOptions ClTemplateActivation::get_build_options(const ComponentGroup &comp_group) const
-{
- /// NOTE: For now tile sizes (n0, m0) are set by the execution window. This may change in the future
- const auto root_window = comp_group.get_root_component()->template_writer()->get_window();
- const unsigned int n0 = root_window.x().step();
- const unsigned int m0 = root_window.y().step();
- const unsigned int partial_store_n0 = _dst->dimension(0) % n0;
-
- CLBuildOptions build_opts;
- build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
- build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
- build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));
-
- return build_opts;
-}
-
-std::string ClTemplateActivation::get_config_id() const
-{
- std::string config_id{};
- config_id += "activation_";
- config_id += lower_string(string_from_data_type(_src->data_type()));
- config_id += "_";
- config_id += support::cpp11::to_string(_src->dimension(0));
- config_id += "_";
- config_id += support::cpp11::to_string(_src->dimension(1));
- return config_id;
-}
-
-std::set<std::string> ClTemplateActivation::get_headers_list() const
-{
- return std::set<std::string>{"helpers.h", "tile_helpers.h", "activation_float_helpers.h"};
-}
-
-Window ClTemplateActivation::get_window() const
-{
- ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
- const unsigned int n0 = adjust_vec_size(16 / _dst->element_size(), _dst->dimension(0));
- Window win = calculate_max_window(*_dst, Steps(n0));
- return win.collapse(win, Window::DimZ);
-}
-
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
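Two numeric details of the writer just removed are easy to miss among the template text: get_window() fixes the vector length at n0 = adjust_vec_size(16 / element_size(dst), dst_width), and get_build_options() then passes the leftover columns as PARTIAL_N0 = dst_width % n0. A standalone restatement with made-up tensor sizes (the halving loop mirrors the common-case behaviour of adjust_vec_size(); the real helper also special-cases a width of 3):

#include <iostream>

// Simplified adjust_vec_size(): halve the requested vector size until it fits the tensor width.
unsigned int adjust_vec_size(unsigned int vec_size, unsigned int dim0)
{
    while (vec_size > dim0)
    {
        vec_size /= 2;
    }
    return vec_size;
}

int main()
{
    const unsigned int element_size = 2;  // assume an FP16 destination
    const unsigned int dst_width    = 17; // assume 17 elements along dimension 0

    const unsigned int n0         = adjust_vec_size(16 / element_size, dst_width); // 8
    const unsigned int partial_n0 = dst_width % n0;                                // 17 % 8 = 1

    std::cout << "-DN0=" << n0 << " -DPARTIAL_N0=" << partial_n0 << "\n";
    return 0;
}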
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.h
deleted file mode 100644
index 88ee370342..0000000000
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.h
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright (c) 2022-2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEACTIVATION
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEACTIVATION
-
-#include "arm_compute/core/experimental/Types.h"
-#include "arm_compute/function_info/ActivationLayerInfo.h"
-
-#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-class ClTemplateActivation final : public IGpuTemplateComponentWriter
-{
-public:
- using Attributes = ClComponentActivation::Attributes;
-
- /** Constructor
- *
- * @param[in] id Component id
- * @param[in] tensors Tensor arguments to the components
- * @param[in] attributes Component attributes
- */
- ClTemplateActivation(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes);
-
- /** Destructor */
- ~ClTemplateActivation() override = default;
-
- /** Prevent instances of this class from being copy constructed */
- ClTemplateActivation(const ClTemplateActivation &activation) = delete;
-
- /** Prevent instances of this class from being copied */
- ClTemplateActivation &operator=(const ClTemplateActivation &activation) = delete;
-
- /** Allow instances of this class to be move constructed */
- ClTemplateActivation(ClTemplateActivation &&activation) = default;
-
- /** Allow instances of this class to be moved */
- ClTemplateActivation &operator=(ClTemplateActivation &&activation) = default;
-
- /** Generate kernel component name */
- std::string get_name() const override;
-
- /** Generate kernel component code template
- *
- * @param[in] comp_group Component group of which the component is a part of
- *
- * @return std::string Component code
- */
- std::string get_component_code(const ComponentGroup &comp_group) const override;
-
- /** Declare all variables used by the component in the @p vtable
- *
- * @param[out] vtable Variable table
- * @param[in] comp_group Component group of which the component is a part of
- */
- void declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
-
- /** Generate the tag look-up table used to instantiate the component code.
- *
- * @param[in] vtable Variable table
- * @param[in] comp_group Component group of which the component is a part of
- *
- * @return TagLUT Tag lookup table
- */
- TagLUT get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
-
- /** Generate the build options used in the component
- *
- * @param[in] comp_group Component group of which the component is a part of
- *
- * @return CLBuildOptions Build options
- */
- CLBuildOptions get_build_options(const ComponentGroup &comp_group) const override;
-
- /** Generate the component config id string used for tuning */
- std::string get_config_id() const override;
-
- /** Generate the header list used in the component */
- std::set<std::string> get_headers_list() const override;
-
- /** Generate the execution window for the component */
- Window get_window() const override;
-
-private:
- const ITensorInfo *_src;
- const ITensorInfo *_dst;
- Attributes _attributes;
-};
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEACTIVATION */
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp
deleted file mode 100644
index 0da3a73801..0000000000
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp
+++ /dev/null
@@ -1,212 +0,0 @@
-/*
- * Copyright (c) 2022-2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "ClTemplateCast.h"
-
-#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
-#include "arm_compute/core/utils/StringUtils.h"
-
-#include "src/core/helpers/WindowHelpers.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-ClTemplateCast::ClTemplateCast(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes)
- : IGpuTemplateComponentWriter{id, tensors}, _src{}, _dst{}, _attributes{attributes}
-{
- _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
- _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
-
- ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
-}
-
-std::string ClTemplateCast::get_name() const
-{
- const size_t src_size = data_size_from_type(_src->data_type());
- const size_t dst_size = data_size_from_type(_dst->data_type());
-
- return (src_size >= dst_size) ? "cast_down" : "cast_up";
-}
-
-std::string ClTemplateCast::get_component_code(const ComponentGroup &comp_group) const
-{
- ARM_COMPUTE_UNUSED(comp_group);
-
- const std::string kernel_name = get_name();
- const auto is_root = (comp_group.get_root_component()->id() == this->id());
-
- std::string code = R"_(
-//------------------ START KERNEL {{meta_kernel_id}} CAST ---------------------
-)_";
-
- if (is_root)
- {
- code += R"_(
-// IN_0(src) {{src}}
-// OUT(dst, accum) {{dst}}
-
-TILE(uint, M0, 1, g_dst_indirect_y);
-{
- {{src}}_offset_first_element_in_bytes += get_global_id(2) * {{src}}_stride_z;
-
- TILE({{DATA_TYPE_IN}}, M0, N0, {{tmp}});
- T_LOAD({{DATA_TYPE_IN}}, M0, N0, BUFFER, {{src}}, g_ind_0, g_ind_1, 1, {{src}}_stride_y, {{tmp}});
-)_";
- }
-
- code += R"_(
- LOOP_UNROLLING(int, m0, 0, 1, M0,
- {
-)_";
-
- if (kernel_name == "cast_down" && is_data_type_quantized(_src->data_type()))
- {
- code += R"_(
- {{tmp}}[m0].v ^= (VEC_DATA_TYPE({{DATA_TYPE_IN}}, N0))0x80;
-)_";
- }
-
- if (kernel_name == "cast_down" &&
- (is_data_type_float(_src->data_type()) || _attributes.convert_policy() == ConvertPolicy::SATURATE))
- {
- code += R"_(
- {{dst}}[m0].v = CONVERT_SAT({{tmp}}[m0].v, VEC_DATA_TYPE({{DATA_TYPE_OUT}}, N0));
-)_";
- }
- else
- {
- code += R"_(
- {{dst}}[m0].v = CONVERT({{tmp}}[m0].v, VEC_DATA_TYPE({{DATA_TYPE_OUT}}, N0));
-)_";
- }
-
- code += R"_(
- })
-)_";
-
- if (is_root)
- {
- code += R"_(
- LOOP_UNROLLING(int, i, 0, 1, M0,
- {
- g_dst_indirect_y[i].v = (uint)min((int)(g_ind_1 + i), (int)({{arg_dst}}_w) - 1);
- g_dst_indirect_y[i].v += (int)(g_ind_2 % {{arg_dst}}_h) * (int)({{arg_dst}}_w);
- g_dst_indirect_y[i].v += (int)(g_ind_2 / {{arg_dst}}_h) * (int)({{arg_dst}}_w * {{arg_dst}}_h);
- })
-}
-)_";
- }
-
- code += R"_(
-//------------------ END KERNEL {{meta_kernel_id}} CAST ---------------------
-)_";
-
- return code;
-}
-
-void ClTemplateCast::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
-{
- vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- "src");
-
- vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- "dst");
-}
-
-TagLUT ClTemplateCast::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
-{
- const auto is_root = (comp_group.get_root_component()->id() == this->id());
-
- TagLUT lut{};
-
- // Arguments and global shared variables
- lut["src"] = vtable.get_variable(_src);
- lut["dst"] = vtable.get_variable(_dst);
- lut["tmp"] = (is_root) ? lut["src"].value + "_in_data" : lut["src"];
-
- const auto dst_argument = vtable.get_variable(comp_group.get_any_dst_tensor());
- lut["arg_dst"] = dst_argument.uniq_name;
-
- // Local build options
- lut["meta_kernel_id"] = id();
-
- lut["DATA_TYPE_IN"] = get_cl_type_from_data_type(_src->data_type());
- lut["DATA_TYPE_OUT"] = get_cl_type_from_data_type(_dst->data_type());
-
- return lut;
-}
-
-CLBuildOptions ClTemplateCast::get_build_options(const ComponentGroup &comp_group) const
-{
- ARM_COMPUTE_UNUSED(comp_group);
-
- const auto root_window = comp_group.get_root_component()->template_writer()->get_window();
- const unsigned int n0 = root_window.x().step();
- const unsigned int m0 = root_window.y().step();
-
- // Set build options
- CLBuildOptions build_opts{};
- build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
- build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(_src->dimension(0) % n0));
- build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
-
- return build_opts;
-}
-
-std::string ClTemplateCast::get_config_id() const
-{
- std::string config_id{};
-
- config_id += "_";
- config_id += lower_string(string_from_data_type(_src->data_type()));
- config_id += "_";
- config_id += lower_string(string_from_data_type(_dst->data_type()));
- config_id += "_";
- config_id += support::cpp11::to_string(_src->dimension(0));
- config_id += "_";
- config_id += support::cpp11::to_string(_src->dimension(1));
-
- return config_id;
-}
-
-std::set<std::string> ClTemplateCast::get_headers_list() const
-{
- return std::set<std::string>{"helpers.h", "tile_helpers.h"};
-}
-
-Window ClTemplateCast::get_window() const
-{
- ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
-
- const unsigned int n0 = adjust_vec_size(16 / _dst->element_size(), _dst->dimension(0));
- Window win = calculate_max_window(*_dst, Steps(n0));
- return win.collapse(win, Window::DimZ);
-}
-
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
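The removed cast writer made two small decisions worth keeping visible: the kernel is named "cast_down" when the source element is at least as wide as the destination ("cast_up" otherwise), and a saturating convert is emitted only when casting down from a float source or under ConvertPolicy::SATURATE, with an additional sign-bit flip for quantized cast-down inputs. A standalone restatement of those predicates (function names are mine; the logic mirrors the code above):

#include <cstddef>

// Mirrors ClTemplateCast::get_name(): a wider-or-equal source element means we are casting down.
bool is_cast_down(std::size_t src_element_size, std::size_t dst_element_size)
{
    return src_element_size >= dst_element_size;
}

// Mirrors the template's CONVERT_SAT vs CONVERT choice.
bool use_saturating_convert(bool cast_down, bool src_is_float, bool policy_is_saturate)
{
    return cast_down && (src_is_float || policy_is_saturate);
}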
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.h
deleted file mode 100644
index 3adca4edc9..0000000000
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATECAST
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATECAST
-
-#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-class ClTemplateCast final : public IGpuTemplateComponentWriter
-{
-public:
- using Attributes = ClComponentCast::Attributes;
-
- /** Constructor
- *
- * @param[in] id Component id
- * @param[in] tensors Tensor arguments to the components
- * @param[in] attributes Component attributes
- */
- ClTemplateCast(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes);
- /** Prevent instances of this class from being copy constructed */
- ClTemplateCast(const ClTemplateCast &cast) = delete;
- /** Prevent instances of this class from being copied */
- ClTemplateCast &operator=(const ClTemplateCast &cast) = delete;
- /** Allow instances of this class to be move constructed */
- ClTemplateCast(ClTemplateCast &&cast) = default;
- /** Allow instances of this class to be moved */
- ClTemplateCast &operator=(ClTemplateCast &&cast) = default;
- /** Generate kernel component name */
- std::string get_name() const override;
- /** Generate kernel component code template
- *
- * @param[in] comp_group Component group of which the component is a part of
- *
- * @return std::string Component code
- */
- std::string get_component_code(const ComponentGroup &comp_group) const override;
- /** Declare all variables used by the component in the @p vtable
- *
- * @param[out] vtable Variable table
- * @param[in] comp_group Component group of which the component is a part of
- */
- void declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
- /** Generate the tag look-up table used to instantiate the component code.
- *
- * @param[in] vtable Variable table
- * @param[in] comp_group Component group of which the component is a part of
- *
- * @return TagLUT Tag lookup table
- */
- TagLUT get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
- /** Generate the build options used in the component
- *
- * @param[in] comp_group Component group of which the component is a part of
- *
- * @return CLBuildOptions Build options
- */
- CLBuildOptions get_build_options(const ComponentGroup &comp_group) const override;
- /** Generate the component config id string used for tuning */
- std::string get_config_id() const override;
- /** Generate the header list used in the component */
- std::set<std::string> get_headers_list() const override;
- /** Generate the execution window for the component */
- Window get_window() const override;
-
-private:
- const ITensorInfo *_src;
- const ITensorInfo *_dst;
- Attributes _attributes;
-};
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATECAST */
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp
deleted file mode 100644
index 8380620ab2..0000000000
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp
+++ /dev/null
@@ -1,364 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "ClTemplateDepthwiseConv2d.h"
-
-#include "src/core/helpers/WindowHelpers.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-ClTemplateDepthwiseConv2d::ClTemplateDepthwiseConv2d(ComponentId id,
- const ArgumentPack<ITensorInfo> &tensors,
- const Attributes &attributes,
- const Settings &settings)
- : IGpuTemplateComponentWriter{id, tensors},
- _src{},
- _weight{},
- _bias{},
- _dst{},
- _attributes{attributes},
- _settings{settings}
-{
- _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
- _weight = this->tensors().get_const_tensor(TensorType::ACL_SRC_1);
- if (this->tensors().get_const_tensor(TensorType::ACL_SRC_2))
- {
- _bias = this->tensors().get_const_tensor(TensorType::ACL_SRC_2);
- }
- _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
- ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _weight, _dst);
-}
-
-std::string ClTemplateDepthwiseConv2d::get_name() const
-{
- return "depthwise_conv2d";
-}
-
-std::string ClTemplateDepthwiseConv2d::get_component_code(const ComponentGroup &comp_group) const
-{
- ARM_COMPUTE_UNUSED(comp_group);
-
- constexpr int height_idx = 2; // Data Layout is NHWC
-
- std::string code = R"_(
-//------------------ START KERNEL {{meta_kernel_id}} ---------------------
-// IN_0(src) {{src}}
-// IN_1(wei) {{weight}}
-)_";
-
- if (_bias != nullptr && _bias->has_valid_id())
- {
- code += R"_(
-// IN_1(bia) {{bias}}
-)_";
- }
-
- code += R"_(
-// OUT(dst, accum) {{dst}}
-
-TILE(uint, M0, 1, g_dst_indirect_y);
-
-{
-#define _IWEI_WIDTH {{WEI_WIDTH}}
-#define _IWEI_HEIGHT {{WEI_HEIGHT}}
-#define _IDST_WIDTH {{arg_dst}}_w
-#define _IDST_HEIGHT {{arg_dst}}_h
-#define _IM0_A M0_A
-#define _IN0_A N0_A
-#define _IM0_B _IWEI_WIDTH
-#define _IN0_B N0
-#define _IBOUNDARY_CHECK (!((_IWEI_WIDTH == 1 && _IWEI_HEIGHT == 1 && {{PAD_LEFT}} == 0 && {{PAD_TOP}} == 0 && M0 == 1)))
-)_";
-
- code += R"_(
- const int yo = g_ind_2 % {{arg_dst}}_h;
- const int bout = g_ind_2 / {{arg_dst}}_h;
-)_";
-
- code += R"_(
-
- int xi = g_ind_1 * {{STRIDE_X}};
- int yi = yo * {{STRIDE_Y}};
- xi -= {{PAD_LEFT}};
- yi -= {{PAD_TOP}};
-
- LOOP_UNROLLING(int, i, 0, 1, M0,
- {
- {{dst}}[i].v = 0;
- })
-)_";
-
- if (_weight->dimension(height_idx) < 5)
- {
- code += R"_(
- LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT,
-)_";
- }
- else
- {
- code += R"_(
- for(int yk = 0; yk < _IWEI_HEIGHT; ++yk)
-)_";
- }
-
- code += R"_(
- {
- TILE({{SRC_DATA_TYPE}}, _IM0_A, _IN0_A, a);
-
- LOOP_UNROLLING(int, i, 0, 1, _IM0_A,
- {
- a[i].v = 0;
- })
-
- T_LOAD_NHWC_WITH_DILATION({{SRC_DATA_TYPE}}, 1, _IM0_A, _IN0_A, {{SRC_TENSOR_TYPE}}, {{src}}, bout, yi + yk * {{DILATION_Y}}, xi, (g_ind_0 / {{DEPTH_MULTIPLIER}}), {{src}}_w, {{src}}_h, {{DILATION_X}}, 1, _IBOUNDARY_CHECK, a);
-
- TILE({{WEI_DATA_TYPE}}, _IM0_B, _IN0_B, b);
-
- T_LOAD({{WEI_DATA_TYPE}}, _IM0_B, _IN0_B, {{WEI_TENSOR_TYPE}}, {{weight}}, g_ind_0, yk * _IM0_B, 1, {{weight}}_stride_y, b);
-
- LOOP_UNROLLING(int, m0, 0, 1, M0,
- {
- LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH,
- {
-)_";
-
- if (!_settings.is_fma_available())
- {
- code += R"_(
- {{dst}}[m0].v += a[xk + m0].v * b[xk].v;
-)_";
- }
- else
- {
- code += R"_(
- {{dst}}[m0].v = fma(a[xk + m0].v, b[xk].v, {{dst}}[m0].v);
-)_";
- }
-
- code += R"_(
- })
- })
- }
-)_";
-
- if (_weight->dimension(height_idx) < 5)
- {
- code += R"_(
- )
-)_";
- }
-
- if (_bias && _bias->has_valid_id())
- {
- code += R"_(
- TILE({{BIA_DATA_TYPE}}, 1, N0, {{bias}});
-
- T_LOAD({{BIA_DATA_TYPE}}, 1, N0, BUFFER, {{bias}}, g_ind_0, 0, 0, 0, {{bias}});
-
- T_ELTWISE_BROADCAST_ADD_X({{ACC_DATA_TYPE}}, M0, N0, {{dst}}, {{bias}}, {{dst}});
-)_";
- }
-
- code += R"_(
- LOOP_UNROLLING(int, i, 0, 1, M0,
- {
- g_dst_indirect_y[i].v = (uint)min((int)(g_ind_1 + i), (int)({{arg_dst}}_w) - 1);
- g_dst_indirect_y[i].v += (int)(g_ind_2 % {{arg_dst}}_h) * (int)({{arg_dst}}_w);
- g_dst_indirect_y[i].v += (int)(g_ind_2 / {{arg_dst}}_h) * (int)({{arg_dst}}_w * {{arg_dst}}_h);
- })
-}
-//------------------ END KERNEL {{meta_kernel_id}} ---------------------
-)_";
-
- return code;
-}
-
-void ClTemplateDepthwiseConv2d::declare_variables(GpuKernelVariableTable &vtable,
- const ComponentGroup &comp_group) const
-{
- const GpuKernelArgumentInfo::Type input_type = _settings.export_input_to_cl_image()
- ? GpuKernelArgumentInfo::Type::Tensor_4D_t_Image
- : GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer;
-
- vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(input_type), "src");
-
- const GpuKernelArgumentInfo::Type weight_type = _settings.export_weights_to_cl_image()
- ? GpuKernelArgumentInfo::Type::Tensor_4D_t_Image
- : GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer;
-
- vtable.declare_variable(comp_group, _weight, GpuKernelArgumentInfo(weight_type), "weight");
-
- if (_bias != nullptr && _bias->has_valid_id()) // optional bias
- {
- vtable.declare_variable(comp_group, _bias, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Vector), "bias");
- }
- vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- "dst");
-}
-
-TagLUT ClTemplateDepthwiseConv2d::get_tag_lut(const GpuKernelVariableTable &vtable,
- const ComponentGroup &comp_group) const
-{
- TagLUT lut{};
-
- // Arguments and global shared variables
- lut["src"] = vtable.get_variable(_src);
- lut["weight"] = vtable.get_variable(_weight);
-
- if (_bias != nullptr && _bias->has_valid_id()) // optional bias
- {
- lut["bias"] = vtable.get_variable(_bias);
- lut["BIA_DATA_TYPE"] = get_cl_type_from_data_type(_bias->data_type());
- }
- lut["dst"] = vtable.get_variable(_dst);
-
- const auto dst_argument = vtable.get_variable(comp_group.get_any_dst_tensor());
- lut["arg_dst"] = dst_argument.uniq_name;
-
- // Local build options
- lut["meta_kernel_id"] = id();
- lut["ACC_DATA_TYPE"] = _src->data_type();
- lut["SRC_DATA_TYPE"] = _src->data_type();
- lut["WEI_DATA_TYPE"] = _weight->data_type();
-
- switch (vtable.get_variable(_src).kernel_argument_info.type)
- {
- case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D:
- case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D:
- case GpuKernelArgumentInfo::Type::Tensor_4D_t_Image:
- lut["SRC_TENSOR_TYPE"] = "IMAGE";
- break;
- default:
- lut["SRC_TENSOR_TYPE"] = "BUFFER";
- break;
- }
-
- switch (vtable.get_variable(_weight).kernel_argument_info.type)
- {
- case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D:
- case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D:
- case GpuKernelArgumentInfo::Type::Tensor_4D_t_Image:
- lut["WEI_TENSOR_TYPE"] = "IMAGE";
- break;
- default:
- lut["WEI_TENSOR_TYPE"] = "BUFFER";
- break;
- }
-
- // Data Layout is NHWC
- constexpr int width_idx = 1;
- constexpr int height_idx = 2;
-
- lut["WEI_WIDTH"] = _weight->dimension(width_idx);
- lut["WEI_HEIGHT"] = _weight->dimension(height_idx);
-
- lut["STRIDE_X"] = _attributes.stride().x();
- lut["STRIDE_Y"] = _attributes.stride().y();
-
- lut["PAD_LEFT"] = _attributes.pad().left;
- lut["PAD_TOP"] = _attributes.pad().top;
-
- lut["DILATION_X"] = _attributes.dilation().x();
- lut["DILATION_Y"] = _attributes.dilation().y();
-
- lut["DEPTH_MULTIPLIER"] = _attributes.depth_multiplier();
-
- return lut;
-}
-
-CLBuildOptions ClTemplateDepthwiseConv2d::get_build_options(const ComponentGroup &comp_group) const
-{
- ARM_COMPUTE_UNUSED(comp_group);
-
- constexpr unsigned int width_idx = 1; // Data Layout is NHWC
-
- const unsigned int n0 = _settings.n0();
- const unsigned int m0 = _settings.m0();
- const unsigned int m0_a = _weight->dimension(width_idx) + m0 - 1;
- const unsigned int n0_a = _attributes.depth_multiplier() > 1 ? 1 : n0;
- const unsigned int partial_store_n0 = _dst->dimension(0) % n0;
-
- CLBuildOptions build_opts{};
-
- if (_settings.fast_relaxed_math())
- {
- build_opts.add_option("-cl-fast-relaxed-math");
- }
- else
- {
- // -cl-fast-relaxed-math also sets -cl-finite-math-only and -cl-unsafe-math-optimizations
- // to disable -cl-finite-math-only, we only include -cl-unsafe-math-optimizations
- build_opts.add_option("-cl-unsafe-math-optimizations");
- }
-
- build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
- build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
- build_opts.add_option("-DN0_A=" + support::cpp11::to_string(n0_a));
- build_opts.add_option("-DM0_A=" + support::cpp11::to_string(m0_a));
- build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));
-
- return build_opts;
-}
-
-std::string ClTemplateDepthwiseConv2d::get_config_id() const
-{
- std::string config_id{};
-
- config_id += support::cpp11::to_string(_src->dimension(0));
- config_id += "_";
- config_id += support::cpp11::to_string(_src->dimension(1));
- config_id += "_";
- config_id += support::cpp11::to_string(_src->dimension(2));
- config_id += "_";
- config_id += support::cpp11::to_string(_dst->dimension(0));
- config_id += "_";
- config_id += support::cpp11::to_string(_dst->dimension(1));
- config_id += "_";
- config_id += support::cpp11::to_string(_dst->dimension(2));
- config_id += "_";
- config_id += string_from_data_type(_src->data_type());
-
- return config_id;
-}
-
-std::set<std::string> ClTemplateDepthwiseConv2d::get_headers_list() const
-{
- return std::set<std::string>{"helpers.h", "tile_helpers.h"};
-}
-
-Window ClTemplateDepthwiseConv2d::get_window() const
-{
- ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
-
- Window win = calculate_max_window(*_dst, Steps(_settings.n0(), _settings.m0()));
- return win.collapse(win, Window::DimZ);
-}
-
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.h
deleted file mode 100644
index 5d04c687c3..0000000000
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.h
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEDEPTHWISECONV2D
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEDEPTHWISECONV2D
-
-#include "arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h"
-
-#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-class ClTemplateDepthwiseConv2d final : public IGpuTemplateComponentWriter
-{
-public:
- using Attributes = ClComponentDepthwiseConv2d::Attributes;
- using Settings = ClComponentDepthwiseConv2d::Settings;
- /** Constructor
- *
- * Similar to @ref ClComponentDepthwiseConv2d::validate()
- *
- * @param[in] id Component id
- * @param[in] tensors Tensor arguments to the component
- * @param[in] attributes Component attributes
- * @param[in] settings Component settings
- */
- ClTemplateDepthwiseConv2d(ComponentId id,
- const ArgumentPack<ITensorInfo> &tensors,
- const Attributes &attributes,
- const Settings &settings);
- /** Prevent instances of this class from being copy constructed */
- ClTemplateDepthwiseConv2d(const ClTemplateDepthwiseConv2d &depthwise_conv2d) = delete;
- /** Prevent instances of this class from being copied */
- ClTemplateDepthwiseConv2d &operator=(const ClTemplateDepthwiseConv2d &depthwise_conv2d) = delete;
- /** Allow instances of this class to be move constructed */
- ClTemplateDepthwiseConv2d(ClTemplateDepthwiseConv2d &&depthwise_conv2d) = default;
- /** Allow instances of this class to be moved */
- ClTemplateDepthwiseConv2d &operator=(ClTemplateDepthwiseConv2d &&depthwise_conv2d) = default;
- /** Generate kernel component name */
- std::string get_name() const override;
- /** Generate kernel component code template
- *
- * @param[in] comp_group Component group of which the component is a part
- *
- * @return std::string Component code
- */
- std::string get_component_code(const ComponentGroup &comp_group) const override;
- /** Declare all variables used by the component in the @p vtable
- *
- * @param[out] vtable Variable table
- * @param[in] comp_group Component group of which the component is a part
- */
- void declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
- /** Generate the tag look-up table used to instantiate the component code.
- *
- * @param[in] vtable Variable table
- * @param[in] comp_group Component group of which the component is a part
- *
- * @return TagLUT Tag lookup table
- */
- TagLUT get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
- /** Generate the build options used in the component
- *
- * @param[in] comp_group Component group of which the component is a part
- *
- * @return CLBuildOptions Build options
- */
- CLBuildOptions get_build_options(const ComponentGroup &comp_group) const override;
- /** Generate the component config id string used for tuning */
- std::string get_config_id() const override;
- /** Generate the header list used in the component */
- std::set<std::string> get_headers_list() const override;
- /** Generate the execution window for the component */
- Window get_window() const override;
-
-private:
- const ITensorInfo *_src;
- const ITensorInfo *_weight;
- const ITensorInfo *_bias;
- const ITensorInfo *_dst;
- Attributes _attributes;
- Settings _settings;
-};
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEDEPTHWISECONV2D */
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp
deleted file mode 100644
index f6a7a58d1d..0000000000
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp
+++ /dev/null
@@ -1,393 +0,0 @@
-/*
- * Copyright (c) 2022-2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "ClTemplateDirectConv2d.h"
-
-#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/StringUtils.h"
-
-#include "src/core/helpers/WindowHelpers.h"
-#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-ClTemplateDirectConv2d::ClTemplateDirectConv2d(ComponentId id,
- const ArgumentPack<ITensorInfo> &tensors,
- const Attributes &attributes,
- const Settings &settings)
- : IGpuTemplateComponentWriter{id, tensors},
- _src{},
- _weight{},
- _bias{},
- _dst{},
- _attributes{attributes},
- _settings{settings}
-{
- _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
- _weight = this->tensors().get_const_tensor(TensorType::ACL_SRC_1);
- if (this->tensors().get_const_tensor(TensorType::ACL_SRC_2))
- {
- _bias = this->tensors().get_const_tensor(TensorType::ACL_SRC_2);
- }
- _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
- ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _weight, _dst);
-}
-
-std::string ClTemplateDirectConv2d::get_name() const
-{
- return "direct_conv2d";
-}
-
-std::string ClTemplateDirectConv2d::get_component_code(const ComponentGroup &comp_group) const
-{
- ARM_COMPUTE_UNUSED(comp_group);
-
- const auto channel_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::CHANNEL);
- const auto k0 = adjust_vec_size(_settings.direct_conv_descriptor().k0, _src->dimension(channel_idx));
- const bool leftover_loop = (_src->dimension(channel_idx) % k0) != 0;
-
- std::string code = R"_(
-//------------------ START KERNEL {{meta_kernel_id}} ---------------------
-// IN_0(src) {{src}}
-// IN_1(wei) {{weight}}
-)_";
- if (_bias && _bias->has_valid_id())
- {
- code += R"_(
-// IN_2(bia) {{bias}}
-)_";
- }
- code += R"_(
-// OUT(dst, accum) {{dst}}
-
-TILE(uint, M0, 1, g_dst_indirect_y);
-
-{
-#define _IWEI_WIDTH {{WEI_WIDTH}}
-#define _IWEI_HEIGHT {{WEI_HEIGHT}}
-#define _ISRC_WIDTH {{SRC_WIDTH}}
-#define _ISRC_HEIGHT {{SRC_HEIGHT}}
-#define _ISRC_CHANNELS {{SRC_CHANNELS}}
-#define _IDST_WIDTH {{DST_WIDTH}}
-#define _IDST_HEIGHT {{DST_HEIGHT}}
-#define _IDST_CHANNELS {{DST_CHANNELS}}
-#define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT)
-
- TILE(int, M0, 1, xi);
- TILE(int, M0, 1, yi);
-
- // Convert the linear index to coordinate
- LOOP_UNROLLING(int, i, 0, 1, M0,
- {
- xi[0].s[i] = ((g_ind_1 + i) % _IDST_WIDTH) * {{STRIDE_X}};
- yi[0].s[i] = ((g_ind_1 + i) / _IDST_WIDTH) * {{STRIDE_Y}};
- xi[0].s[i] -= {{PAD_LEFT}};
- yi[0].s[i] -= {{PAD_TOP}};
- })
-
- LOOP_UNROLLING(int, i, 0, 1, M0,
- {
- {{dst}}[i].v = 0;
- })
-
- for(int i = 0; i < (_IWEI_WIDTH * _IWEI_HEIGHT); ++i)
- {
- int xk = i % _IWEI_WIDTH;
- int yk = i / _IWEI_WIDTH;
-
- TILE(int, 1, M0, my);
-
- LOOP_UNROLLING(int, i, 0, 1, M0,
- {
- int x_s = xi[0].s[i] + xk;
- int y_s = yi[0].s[i] + yk;
- my[0].s[i] = x_s + y_s * _ISRC_WIDTH;
- my[0].s[i] = my[0].s[i] + g_ind_2 * (int)(_ISRC_WIDTH * _ISRC_HEIGHT);
- my[0].s[i] = select(-1, my[0].s[i], x_s >= 0);
- my[0].s[i] = select(-1, my[0].s[i], x_s < _ISRC_WIDTH);
- my[0].s[i] = select(-1, my[0].s[i], y_s >= 0);
- my[0].s[i] = select(-1, my[0].s[i], y_s < _ISRC_HEIGHT);
- })
-
- int ck = 0;
- for(; ck <= (_ISRC_CHANNELS - K0); ck += K0)
- {
- TILE({{SRC_DATA_TYPE}}, M0, K0, a);
- TILE({{WEI_DATA_TYPE}}, N0, K0, b);
-
- LOOP_UNROLLING(int, i, 0, 1, M0,
- {
- a[i].v = {{ZERO_VALUE}};
- })
-
- LOOP_UNROLLING(int, i, 0, 1, N0,
- {
- b[i].v = {{ZERO_VALUE}};
- })
-
- T_LOAD2D_INDIRECT({{SRC_DATA_TYPE}}, M0, K0, {{SRC_TENSOR_TYPE}}, {{src}}, ck, {{src}}_stride_y, my, a);
-
- T_LOAD({{WEI_DATA_TYPE}}, N0, K0, {{WEI_TENSOR_TYPE}}, {{weight}}, ck, g_ind_0 * _IY_MULTIPLIER + i, _IY_MULTIPLIER, {{weight}}_stride_y, b);
-
- T_MMUL({{SRC_DATA_TYPE}}, {{WEI_DATA_TYPE}}, {{ACC_DATA_TYPE}}, M0, N0, K0, NT, T, a, b, {{dst}});
- }
-)_";
-
- if (leftover_loop)
- {
- code += R"_(
- for(; ck < _ISRC_CHANNELS; ++ck)
- {
- TILE({{SRC_DATA_TYPE}}, M0, 1, a);
- TILE({{WEI_DATA_TYPE}}, N0, 1, b);
-
- LOOP_UNROLLING(int, i, 0, 1, M0,
- {
- a[i].v = {{ZERO_VALUE}};
- })
-
- LOOP_UNROLLING(int, i, 0, 1, N0,
- {
- b[i].v = {{ZERO_VALUE}};
- })
-
- T_LOAD2D_INDIRECT({{SRC_DATA_TYPE}}, M0, 1, {{SRC_TENSOR_TYPE}}, {{src}}, ck, {{src}}_stride_y, my, a);
-
- T_LOAD({{WEI_DATA_TYPE}}, N0, 1, BUFFER, {{weight}}, ck, g_ind_0 * _IY_MULTIPLIER + i, _IY_MULTIPLIER, {{weight}}_stride_y, b);
-
- T_MMUL({{SRC_DATA_TYPE}}, {{WEI_DATA_TYPE}}, {{ACC_DATA_TYPE}}, M0, N0, 1, NT, T, a, b, {{dst}});
- }
- )_";
- }
-
- code += R"_(
-#undef _IWEI_WIDTH
-#undef _IWEI_HEIGHT
-#undef _ISRC_WIDTH
-#undef _ISRC_HEIGHT
-#undef _ISRC_CHANNELS
-#undef _IDST_WIDTH
-#undef _IDST_HEIGHT
-#undef _IDST_CHANNELS
-#undef _IY_MULTIPLIER
-
- }
-)_";
-
- if (_bias && _bias->has_valid_id())
- {
- code += R"_(
- TILE({{BIA_DATA_TYPE}}, 1, N0, bias0);
-
- T_LOAD({{BIA_DATA_TYPE}}, 1, N0, BUFFER, {{bias}}, g_ind_0, 0, 1, 0, bias0);
-
- T_ELTWISE_BROADCAST_ADD_X({{ACC_DATA_TYPE}}, M0, N0, {{dst}}, bias0, {{dst}});
- )_";
- }
-
- code += R"_(
- LOOP_UNROLLING(int, i, 0, 1, M0,
- {
- g_dst_indirect_y[i].v = (uint)min(g_ind_1 + i, (int)({{DST_WIDTH}} * {{DST_HEIGHT}}) - 1);
- g_dst_indirect_y[i].v += g_ind_2 * (int)({{DST_WIDTH}} * {{DST_HEIGHT}});
- })
-}
-//------------------ END KERNEL {{meta_kernel_id}} ---------------------
-)_";
- return code;
-}
-
-void ClTemplateDirectConv2d::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
-{
- vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- "src");
-
- const GpuKernelArgumentInfo::Type weight_type = _settings.export_to_cl_image()
- ? GpuKernelArgumentInfo::Type::Tensor_4D_t_Image
- : GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer;
- vtable.declare_variable(comp_group, _weight, GpuKernelArgumentInfo(weight_type), "weight");
-
- if (_bias && _bias->has_valid_id()) // optional bias
- {
- vtable.declare_variable(comp_group, _bias, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Vector), "bias");
- }
- vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(common_tensor_type), "dst");
-}
-
-TagLUT ClTemplateDirectConv2d::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
-{
- TagLUT lut{};
- // Arguments and global shared variables
- lut["src"] = vtable.get_variable(_src);
- lut["weight"] = vtable.get_variable(_weight);
-
- if (_bias && _bias->has_valid_id()) // optional bias
- {
- lut["bias"] = vtable.get_variable(_bias);
- lut["BIA_DATA_TYPE"] = get_cl_type_from_data_type(_bias->data_type());
- }
- lut["dst"] = vtable.get_variable(_dst);
-
- const auto dst_argument = vtable.get_variable(comp_group.get_any_dst_tensor());
- lut["arg_dst"] = dst_argument.uniq_name;
-
- // Local build options
- lut["meta_kernel_id"] = id();
- lut["ACC_DATA_TYPE"] = _src->data_type();
- lut["SRC_DATA_TYPE"] = _src->data_type();
- lut["WEI_DATA_TYPE"] = _weight->data_type();
-
- lut["SRC_TENSOR_TYPE"] = "BUFFER";
- switch (vtable.get_variable(_weight).kernel_argument_info.type)
- {
- case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D:
- case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D:
- case GpuKernelArgumentInfo::Type::Tensor_4D_t_Image:
- {
- lut["WEI_TENSOR_TYPE"] = "IMAGE";
- break;
- }
- default:
- {
- lut["WEI_TENSOR_TYPE"] = "BUFFER";
- break;
- }
- }
- const auto width_idx = 1;
- const auto height_idx = 2;
- const auto channel_idx = 0;
-
- lut["SRC_WIDTH"] = _src->dimension(width_idx);
- lut["SRC_HEIGHT"] = _src->dimension(height_idx);
- lut["SRC_CHANNELS"] = _src->dimension(channel_idx);
-
- lut["WEI_WIDTH"] = _weight->dimension(width_idx);
- lut["WEI_HEIGHT"] = _weight->dimension(height_idx);
-
- lut["DST_WIDTH"] = _dst->dimension(width_idx);
- lut["DST_HEIGHT"] = _dst->dimension(height_idx);
- lut["DST_CHANNELS"] = _dst->dimension(channel_idx);
-
- lut["STRIDE_X"] = _attributes.stride().x();
- lut["STRIDE_Y"] = _attributes.stride().y();
-
- lut["PAD_LEFT"] = _attributes.pad().left;
- lut["PAD_TOP"] = _attributes.pad().top;
-
- lut["ZERO_VALUE"] = 0;
-
- return lut;
-}
-
-CLBuildOptions ClTemplateDirectConv2d::get_build_options(const ComponentGroup &comp_group) const
-{
- const unsigned int channel_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::CHANNEL);
-
- const auto root_window = comp_group.get_root_component()->template_writer()->get_window();
- const unsigned int n0 = root_window.x().step();
- const unsigned int m0 = root_window.y().step();
- const unsigned int k0 = adjust_vec_size(_settings.direct_conv_descriptor().k0, _src->dimension(channel_idx));
- const unsigned int partial_store_n0 = _dst->dimension(0) % n0;
-
- CLBuildOptions build_opts{};
- if (_settings.fast_relaxed_math())
- {
- build_opts.add_option("-cl-fast-relaxed-math");
- }
- else
- {
- // -cl-fast-relaxed-math also implies -cl-finite-math-only and -cl-unsafe-math-optimizations.
- // To avoid enabling -cl-finite-math-only, we only add -cl-unsafe-math-optimizations.
- build_opts.add_option("-cl-unsafe-math-optimizations");
- }
-
- build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
- build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
- build_opts.add_option("-DK0=" + support::cpp11::to_string(k0));
- build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));
-
- return build_opts;
-}
-
-std::string ClTemplateDirectConv2d::get_config_id() const
-{
- const DataType data_type = _src->data_type();
- const DataLayout data_layout = _src->data_layout();
-
- const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-
- const unsigned int kernel_size = _weight->dimension(width_idx);
-
- std::string config_id{};
- config_id += lower_string(string_from_data_type(data_type));
- config_id += "_";
- config_id += support::cpp11::to_string(kernel_size);
- config_id += "_";
- config_id += support::cpp11::to_string(_attributes.stride().x());
- config_id += "_";
- config_id += support::cpp11::to_string(_attributes.stride().y());
- config_id += "_";
- config_id += support::cpp11::to_string(_dst->dimension(width_idx));
- config_id += "_";
- config_id += support::cpp11::to_string(_dst->dimension(height_idx));
- config_id += "_";
- config_id += lower_string(string_from_data_layout(data_layout));
- return config_id;
-}
-
-std::set<std::string> ClTemplateDirectConv2d::get_headers_list() const
-{
- return std::set<std::string>{"helpers.h", "tile_helpers.h"};
-}
-
-Window ClTemplateDirectConv2d::get_window() const
-{
- ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
-
- const auto output_shape = _dst->tensor_shape();
- const auto desc = _settings.direct_conv_descriptor();
-
- const unsigned int n0 = adjust_vec_size(desc.n0, output_shape[0]);
- const unsigned int m0 = adjust_vec_size(desc.m0, output_shape[1] * output_shape[2]);
-
- // Create and configure kernel window
- Window win = calculate_max_window(output_shape, Steps(n0, m0));
-
- const size_t dim_y_collapsed = ceil_to_multiple(output_shape[1] * output_shape[2], m0);
- win.set(Window::DimY, Window::Dimension(0, dim_y_collapsed, m0));
- win.set(Window::DimZ, Window::Dimension(0, output_shape.total_size_upper(3), 1));
-
- return win;
-}
-
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h
deleted file mode 100644
index 03c8cd2f15..0000000000
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Copyright (c) 2022-2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEDIRECTCONV2D
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEDIRECTCONV2D
-
-#include "arm_compute/core/experimental/Types.h"
-#include "arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h"
-
-#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-class ClTemplateDirectConv2d final : public IGpuTemplateComponentWriter
-{
-public:
- using Attributes = ClComponentDirectConv2d::Attributes;
- using Settings = ClComponentDirectConv2d::Settings;
- /** Constructor
- *
- * Similar to @ref ClComponentDirectConv2d::validate()
- *
- * @param[in] id Component id
- * @param[in] tensors Tensor arguments to the component
- * @param[in] attributes Component attributes
- * @param[in] settings Component settings
- */
- ClTemplateDirectConv2d(ComponentId id,
- const ArgumentPack<ITensorInfo> &tensors,
- const Attributes &attributes,
- const Settings &settings);
- /** Destructor */
- ~ClTemplateDirectConv2d() override = default;
- /** Prevent instances of this class from being copy constructed */
- ClTemplateDirectConv2d(const ClTemplateDirectConv2d &direct_conv2d) = delete;
- /** Prevent instances of this class from being copied */
- ClTemplateDirectConv2d &operator=(const ClTemplateDirectConv2d &direct_conv2d) = delete;
- /** Allow instances of this class to be move constructed */
- ClTemplateDirectConv2d(ClTemplateDirectConv2d &&direct_conv2d) = default;
- /** Allow instances of this class to be moved */
- ClTemplateDirectConv2d &operator=(ClTemplateDirectConv2d &&direct_conv2d) = default;
- /** Generate kernel component name */
- std::string get_name() const override;
- /** Generate kernel component code template
- *
- * @param[in] comp_group Component group of which the component is a part
- *
- * @return std::string Component code
- */
- std::string get_component_code(const ComponentGroup &comp_group) const override;
- /** Declare all variables used by the component in the @p vtable
- *
- * @param[out] vtable Variable table
- * @param[in] comp_group Component group of which the component is a part
- */
- void declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
- /** Generate the tag look-up table used to instantiate the component code.
- *
- * @param[in] vtable Variable table
- * @param[in] comp_group Component group of which the component is a part
- *
- * @return TagLUT Tag lookup table
- */
- TagLUT get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
- /** Generate the build options used in the component
- *
- * @param[in] comp_group Component group of which the component is a part
- *
- * @return CLBuildOptions Build options
- */
- CLBuildOptions get_build_options(const ComponentGroup &comp_group) const override;
- /** Generate the component config id string used for tuning */
- std::string get_config_id() const override;
- /** Generate the header list used in the component */
- std::set<std::string> get_headers_list() const override;
- /** Generate the execution window for the component */
- Window get_window() const override;
-
-private:
- const ITensorInfo *_src;
- const ITensorInfo *_weight;
- const ITensorInfo *_bias;
- const ITensorInfo *_dst;
- Attributes _attributes;
- Settings _settings;
-};
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEDIRECTCONV2D */
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp
deleted file mode 100644
index 78bff3c3f3..0000000000
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp
+++ /dev/null
@@ -1,274 +0,0 @@
-/*
- * Copyright (c) 2022-2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "ClTemplateElementwiseBinary.h"
-
-#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/StringUtils.h"
-
-#include "src/core/helpers/WindowHelpers.h"
-#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-constexpr unsigned int vector_size_byte_opencl = 16;
-
-ClTemplateElementwiseBinary::ClTemplateElementwiseBinary(ComponentId id,
- const ArgumentPack<ITensorInfo> &tensors,
- const Attributes &attributes)
- : IGpuTemplateComponentWriter{id, tensors}, _lhs{}, _rhs{}, _dst{}, _attributes{attributes}
-{
- _lhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
- _rhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_1);
- _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
- ARM_COMPUTE_ERROR_ON_NULLPTR(_lhs, _rhs, _dst);
-}
-
-std::string ClTemplateElementwiseBinary::get_name() const
-{
- return "elementwise_binary";
-}
-
-std::string ClTemplateElementwiseBinary::get_component_code(const ComponentGroup &comp_group) const
-{
- std::string code;
- const bool is_root = (comp_group.get_root_component()->id() == this->id());
- const bool is_lhs_input = comp_group.is_input_tensor(_lhs);
- const bool is_rhs_input = comp_group.is_input_tensor(_rhs);
-
- code =
- R"_(
- //------------------ START KERNEL {{meta_kernel_id}} {{ELTWISE_OP}} ---------------------
-)_";
-
- if (is_root)
- {
- code +=
- R"_(
- TILE(uint, M0, 1, g_dst_indirect_y);
-)_";
- }
-
- if (is_lhs_input)
- {
- code +=
- R"_(
- TILE({{DATA_TYPE}}, {{lhs_m0}}, N0, {{lhs}});
-)_";
- }
-
- if (is_rhs_input)
- {
- code +=
- R"_(
- TILE({{DATA_TYPE}}, {{rhs_m0}}, N0, {{rhs}});
-)_";
- }
-
- code +=
- R"_(
- {
-)_";
-
- if (is_lhs_input)
- {
- code +=
- R"_(
- {{lhs}}_offset_first_element_in_bytes += g_ind_2 * {{lhs}}_stride_w;
- T_LOAD({{DATA_TYPE}}, {{lhs_m0}}, {{lhs_n0}}, BUFFER, {{lhs}}, {{lhs_start_ind_0}}, {{lhs_start_ind_1}}, 1, {{lhs}}_stride_y, {{lhs}});
-)_";
- }
-
- if (is_rhs_input)
- {
- code +=
- R"_(
- {{rhs}}_offset_first_element_in_bytes += g_ind_2 * {{rhs}}_stride_w;
- T_LOAD({{DATA_TYPE}}, {{rhs_m0}}, {{rhs_n0}}, BUFFER, {{rhs}}, {{rhs_start_ind_0}}, {{rhs_start_ind_1}}, 1, {{rhs}}_stride_y, {{rhs}});
-)_";
- }
-
- code +=
- R"_(
- T_ELTWISE_{{BROADCAST_OP}}{{ELTWISE_OP}}({{DATA_TYPE}}, M0, N0, {{lhs}}, {{rhs}}, {{dst}});
-)_";
-
- if (is_root)
- {
- // Calculate the destination indirect Y
- code +=
- R"_(
- LOOP_UNROLLING(int, i, 0, 1, M0,
- {
- g_dst_indirect_y[i].v = (uint)min(g_ind_1 + i, (int)({{arg_dst}}_w * {{arg_dst}}_h) - 1);
- g_dst_indirect_y[i].v += g_ind_2 * (int)({{arg_dst}}_w * {{arg_dst}}_h);
- })
-)_";
- }
-
- code +=
- R"_(
- }
- //------------------ END KERNEL {{meta_kernel_id}} {{ELTWISE_OP}} ---------------------
-)_";
-
- return code;
-}
-
-void ClTemplateElementwiseBinary::declare_variables(GpuKernelVariableTable &vtable,
- const ComponentGroup &comp_group) const
-{
- vtable.declare_variable(comp_group, _lhs, GpuKernelArgumentInfo(common_tensor_type), "lhs");
-
- vtable.declare_variable(comp_group, _rhs, GpuKernelArgumentInfo(common_tensor_type), "rhs");
-
- vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(common_tensor_type), "dst");
-}
-
-TagLUT ClTemplateElementwiseBinary::get_tag_lut(const GpuKernelVariableTable &vtable,
- const ComponentGroup &comp_group) const
-{
- TagLUT lut{};
-
- // Local build options
- lut["meta_kernel_id"] = id();
- lut["DATA_TYPE"] = get_cl_type_from_data_type(_lhs->data_type());
- // Arguments and global shared variables
-
- lut["lhs"] = vtable.get_variable(_lhs);
- lut["rhs"] = vtable.get_variable(_rhs);
- lut["dst"] = vtable.get_variable(_dst);
- lut["arg_dst"] = vtable.get_variable(comp_group.get_any_dst_tensor());
-
- switch (_attributes.operation())
- {
- case Attributes::ElementwiseOp::Add:
- lut["ELTWISE_OP"] = "ADD";
- break;
- case Attributes::ElementwiseOp::Sub:
- lut["ELTWISE_OP"] = "SUB";
- break;
- case Attributes::ElementwiseOp::Mul:
- lut["ELTWISE_OP"] = "MUL";
- break;
- default:
- ARM_COMPUTE_ERROR("Arithmetic Operation not supported");
- }
-
- ARM_COMPUTE_ERROR_ON(comp_group.is_intermediate_tensor(_lhs) &&
- detail::have_different_dimensions(_lhs->tensor_shape(), _dst->tensor_shape(), 0));
- ARM_COMPUTE_ERROR_ON(comp_group.is_intermediate_tensor(_rhs) &&
- detail::have_different_dimensions(_rhs->tensor_shape(), _dst->tensor_shape(), 0));
-
- // Set broadcast parameters
- // PRE: All tensors are broadcast-compatible
- const auto &lhs_dims = _lhs->tensor_shape();
- const auto &rhs_dims = _rhs->tensor_shape();
- const auto &dst_dims = _dst->tensor_shape();
-
- const auto lhs_broadcast_x = dst_dims[0] != 1 && lhs_dims[0] == 1;
- const auto rhs_broadcast_x = dst_dims[0] != 1 && rhs_dims[0] == 1;
- const auto lhs_broadcast_y = dst_dims[1] != 1 && lhs_dims[1] == 1;
- const auto rhs_broadcast_y = dst_dims[1] != 1 && rhs_dims[1] == 1;
- const auto lhs_broadcast_z = dst_dims[2] != 1 && lhs_dims[2] == 1;
- const auto rhs_broadcast_z = dst_dims[2] != 1 && rhs_dims[2] == 1;
-
- const auto lhs_broadcast_yz = lhs_broadcast_y && lhs_broadcast_z;
- const auto rhs_broadcast_yz = rhs_broadcast_y && rhs_broadcast_z;
-
- lut["lhs_n0"] = (lhs_broadcast_x) ? "1" : "N0";
- lut["lhs_start_ind_0"] = (lhs_broadcast_x) ? "0" : "g_ind_0";
- lut["rhs_n0"] = (rhs_broadcast_x) ? "1" : "N0";
- lut["rhs_start_ind_0"] = (rhs_broadcast_x) ? "0" : "g_ind_0";
-
- lut["lhs_m0"] = (lhs_broadcast_yz) ? "1" : "M0";
- lut["lhs_start_ind_1"] = (lhs_broadcast_yz) ? "0" : "g_ind_1";
- lut["rhs_m0"] = (rhs_broadcast_yz) ? "1" : "M0";
- lut["rhs_start_ind_1"] = (rhs_broadcast_yz) ? "0" : "g_ind_1";
-
- lut["BROADCAST_OP"] = (lhs_broadcast_yz) ? "BROADCAST_LHS_X_" : (rhs_broadcast_yz) ? "BROADCAST_RHS_X_" : "";
-
- return lut;
-}
-
-CLBuildOptions ClTemplateElementwiseBinary::get_build_options(const ComponentGroup &comp_group) const
-{
- CLBuildOptions build_opts{};
- /// NOTE: For now tile sizes (n0, m0) are set by the execution window. This may change in the future
- const auto root_window = comp_group.get_root_component()->template_writer()->get_window();
- const unsigned int n0 = root_window.x().step();
- const unsigned int m0 = root_window.y().step();
- const unsigned int partial_store_n0 = _dst->dimension(0) % n0;
-
- build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
- build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(_lhs->data_type()));
- build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));
-
- return build_opts;
-}
-
-std::string ClTemplateElementwiseBinary::get_config_id() const
-{
- std::string config_id{};
- config_id += lower_string(string_from_data_type(_dst->data_type()));
- config_id += "_";
- config_id += support::cpp11::to_string(_dst->dimension(0));
- config_id += "_";
- config_id += support::cpp11::to_string(_dst->dimension(1));
- config_id += "_";
- config_id += lower_string(string_from_data_layout(_dst->data_layout()));
-
- return config_id;
-}
-
-std::set<std::string> ClTemplateElementwiseBinary::get_headers_list() const
-{
- return std::set<std::string>{"helpers.h", "tile_helpers.h"};
-}
-
-Window ClTemplateElementwiseBinary::get_window() const
-{
- ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
-
- TensorShape output_shape = _dst->tensor_shape();
- // Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) and upper dimensions unchanged
- // This is in line with the collapsing convention used by operators like Conv2d
- output_shape.collapse(2U, 1U);
- const unsigned int num_elems_processed_per_iteration =
- adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0));
- Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration));
-
- return win;
-}
-
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.h
deleted file mode 100644
index 991c0eca44..0000000000
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.h
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * Copyright (c) 2022-2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEELEMENTWISEBINARY
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEELEMENTWISEBINARY
-
-#include "arm_compute/core/experimental/Types.h"
-
-#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-class ClTemplateElementwiseBinary final : public IGpuTemplateComponentWriter
-{
-public:
- using Attributes = ClComponentElementwiseBinary::Attributes;
-
- /** Constructor
- *
- * Similar to @ref ClComponentElementwiseBinary::validate()
- *
- * @param[in] id Component id
- * @param[in] tensors Tensor arguments to the component
- * @param[in] attributes Component attributes
- */
- ClTemplateElementwiseBinary(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes);
- /** Prevent instances of this class from being copy constructed */
- ClTemplateElementwiseBinary(const ClTemplateElementwiseBinary &elementwise) = delete;
- /** Prevent instances of this class from being copied */
- ClTemplateElementwiseBinary &operator=(const ClTemplateElementwiseBinary &elementwise) = delete;
- /** Allow instances of this class to be move constructed */
- ClTemplateElementwiseBinary(ClTemplateElementwiseBinary &&elementwise) = default;
- /** Allow instances of this class to be moved */
- ClTemplateElementwiseBinary &operator=(ClTemplateElementwiseBinary &&elementwise) = default;
-
- /** Generate kernel component name */
- std::string get_name() const override;
-
- /** Generate kernel component code template
- *
- * @param[in] comp_group Component group of which the component is a part
- *
- * @return std::string Component code
- */
- std::string get_component_code(const ComponentGroup &comp_group) const override;
-
- /** Declare all variables used by the component in the @p vtable
- *
- * @param[out] vtable Variable table
- * @param[in] comp_group Component group of which the component is a part
- */
- void declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
-
- /** Generate the tag look-up table used to instantiate the component code.
- *
- * @param[in] vtable Variable table
- * @param[in] comp_group Component group of which the component is a part
- *
- * @return TagLUT Tag lookup table
- */
- TagLUT get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
-
- /** Generate the build options used in the component
- *
- * @param[in] comp_group Component group of which the component is a part
- *
- * @return CLBuildOptions Build options
- */
- CLBuildOptions get_build_options(const ComponentGroup &comp_group) const override;
-
- /** Generate the component config id string used for tuning */
- std::string get_config_id() const override;
-
- /** Generate the header list used in the component */
- std::set<std::string> get_headers_list() const override;
-
- /** Generate the execution window for the component */
- Window get_window() const override;
-
-private:
- const ITensorInfo *_lhs;
- const ITensorInfo *_rhs;
- const ITensorInfo *_dst;
- Attributes _attributes;
-};
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEELEMENTWISEBINARY */
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.cpp
deleted file mode 100644
index 522c33a022..0000000000
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.cpp
+++ /dev/null
@@ -1,267 +0,0 @@
-/*
- * Copyright (c) 2022-2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.h"
-
-#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
-#include "arm_compute/core/utils/StringUtils.h"
-
-#include "src/core/helpers/WindowHelpers.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-namespace
-{
-constexpr unsigned int serial_vector_size = 8;
-} // namespace
-ClTemplateLogits1DMaxShiftExpSum::ClTemplateLogits1DMaxShiftExpSum(ComponentId id,
- const ArgumentPack<ITensorInfo> &tensors,
- const Attributes &attributes)
- : IGpuTemplateComponentWriter{id, tensors}, _src{}, _sum{}, _dst{}, _attributes{attributes}
-{
- _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
- _sum = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
- _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_1);
- ARM_COMPUTE_ERROR_ON_NULLPTR(_src);
- ARM_COMPUTE_ERROR_ON_NULLPTR(_sum);
- ARM_COMPUTE_ERROR_ON_NULLPTR(_dst);
-}
-
-std::string ClTemplateLogits1DMaxShiftExpSum::get_name() const
-{
- return "logits_1d_max_shift_exp_sum";
-}
-
-std::string ClTemplateLogits1DMaxShiftExpSum::get_component_code(const ComponentGroup &comp_group) const
-{
- ARM_COMPUTE_UNUSED(comp_group);
-
- std::string code = R"_(
-//------------------ START KERNEL {{meta_kernel_id}} ---------------------
-#define VEC_TYPE VEC_DATA_TYPE({{DATA_TYPE}}, N0)
-#define SELECT_TYPE SELECT_VEC_DATA_TYPE({{DATA_TYPE}}, N0)
-{
- __global uchar *src_addr = {{src}}_ptr + {{src}}_offset_first_element_in_bytes + g_ind_1 * {{src}}_stride_y + g_ind_2 * {{src}}_stride_z;
- __global uchar *dst_addr = {{dst}}_ptr + {{dst}}_offset_first_element_in_bytes + g_ind_1 * {{dst}}_stride_y + g_ind_2 * {{dst}}_stride_z;
- Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT({{sum}});
- VEC_TYPE max_val_vec = (VEC_TYPE)({{MINVAL}});
-)_";
-
- const bool beta_defined = (_attributes.beta() != 1.f);
-
- if (beta_defined)
- {
- code += R"_(
- VEC_TYPE beta = (VEC_TYPE){{BETA}};
-)_";
- }
-
- constexpr unsigned int _serial_vector_size = 8;
- const unsigned int reduction_dim_size = _src->dimension(0);
- const unsigned int vector_size = adjust_vec_size(_serial_vector_size, reduction_dim_size);
- const bool non_multiple_of_n0 = ((reduction_dim_size % vector_size) != 0);
-
- if (non_multiple_of_n0)
- {
- code += R"_(
- VEC_TYPE data = VLOAD(N0)(0, (__global {{DATA_TYPE}} *)src_addr);
- SELECT_TYPE widx = (SELECT_TYPE)PARTIAL_N0 > VEC_OFFS(SELECT_DATA_TYPE({{DATA_TYPE}}), N0);
- max_val_vec = max(max_val_vec, select((VEC_TYPE)({{MINVAL}}), data, widx));
-)_";
- }
-
- code += R"_(
- for(uint i = PARTIAL_N0; i < {{SRC_WIDTH}}; i += N0)
- {
- VEC_TYPE data = VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(src_addr + i * sizeof({{DATA_TYPE}})));
- max_val_vec = max(data, max_val_vec);
- }
-
- {{DATA_TYPE}} max_val = MAX_REDUCE(max_val_vec, N0);
- VEC_TYPE sum1D = 0;
-)_";
-
- if (non_multiple_of_n0)
- {
- code += R"_(
- data -= max_val;
-)_";
- if (beta_defined)
- {
- code += R"_(
- data *= beta;
-)_";
- }
-
- if (_attributes.is_log_softmax())
- {
- code += R"_(
- VSTORE_PARTIAL(N0, PARTIAL_N0)
- (data, 0, (__global {{DATA_TYPE}} *)dst_addr);
- data = exp(data);
- data = select(0, data, widx);
-)_";
- }
- else
- {
- code += R"_(
- data = exp(data);
- data = select(0, data, widx);
- VSTORE_PARTIAL(N0, PARTIAL_N0)
- (data, 0, (__global {{DATA_TYPE}} *)dst_addr);
-)_";
- }
-
- code += R"_(
- sum1D += data;
-)_";
- }
- code += R"_(
- for(uint i = PARTIAL_N0; i < {{SRC_WIDTH}}; i += N0)
- {
- VEC_TYPE data = VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(src_addr + i * sizeof({{DATA_TYPE}})));
- data -= max_val;
-)_";
-
- if (beta_defined)
- {
- code += R"_(
- data *= beta;
-)_";
- }
-
- if (_attributes.is_log_softmax())
- {
- code += R"_(
- VSTORE(N0)
- (data, 0, (__global {{DATA_TYPE}} *)(dst_addr + i * sizeof({{DATA_TYPE}})));
- data = exp(data);
-)_";
- }
- else
- {
- code += R"_(
- data = exp(data);
- VSTORE(N0)
- (data, 0, (__global {{DATA_TYPE}} *)(dst_addr + i * sizeof({{DATA_TYPE}})));
-)_";
- }
-
- code += R"_(
- sum1D += data;
- }
-)_";
-
- code += R"_(
- *((__global {{DATA_TYPE}} *)sum.ptr) = SUM_REDUCE(sum1D, N0);
-}
-//------------------ END KERNEL {{meta_kernel_id}} ---------------------
-)_";
-
- return code;
-}
-
-void ClTemplateLogits1DMaxShiftExpSum::declare_variables(GpuKernelVariableTable &vtable,
- const ComponentGroup &comp_group) const
-{
- vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), "src");
-
- vtable.declare_variable(comp_group, _sum, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), "sum");
-
- vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), "dst");
-}
-
-TagLUT ClTemplateLogits1DMaxShiftExpSum::get_tag_lut(const GpuKernelVariableTable &vtable,
- const ComponentGroup &comp_group) const
-{
- ARM_COMPUTE_UNUSED(comp_group);
-
- TagLUT lut{};
-
- // Arguments and global shared variables
- lut["src"] = vtable.get_variable(_src);
- lut["sum"] = vtable.get_variable(_sum);
- lut["dst"] = vtable.get_variable(_dst);
-
- // Local build options
- lut["meta_kernel_id"] = id();
-
- const DataType data_type = _src->data_type();
-
- lut["DATA_TYPE"] = get_cl_type_from_data_type(data_type);
- lut["BETA"] = float_to_string_with_full_precision(_attributes.beta());
- lut["MINVAL"] = (data_type == DataType::F16) ? std::string("-HALF_MAX") : std::string("-FLT_MAX");
- lut["SRC_WIDTH"] = support::cpp11::to_string(_src->dimension(0));
-
- return lut;
-}
-
-CLBuildOptions ClTemplateLogits1DMaxShiftExpSum::get_build_options(const ComponentGroup &comp_group) const
-{
- ARM_COMPUTE_UNUSED(comp_group);
- CLBuildOptions build_opts{};
-
- const unsigned int reduction_dim_size = _src->dimension(0);
- const unsigned int vector_size = adjust_vec_size(serial_vector_size, reduction_dim_size);
-
- build_opts.add_option("-DN0=" + support::cpp11::to_string(vector_size));
- build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string((reduction_dim_size % vector_size)));
-
- return build_opts;
-}
-
-std::string ClTemplateLogits1DMaxShiftExpSum::get_config_id() const
-{
- std::string config_id = get_name();
-
- config_id += "_";
- config_id += support::cpp11::to_string(_src->dimension(0));
- config_id += "_";
- config_id += string_from_data_type(_src->data_type());
-
- return config_id;
-}
-
-std::set<std::string> ClTemplateLogits1DMaxShiftExpSum::get_headers_list() const
-{
- return std::set<std::string>{"helpers.h", "tile_helpers.h"};
-}
-
-Window ClTemplateLogits1DMaxShiftExpSum::get_window() const
-{
- ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
-
- Window win = calculate_max_window(*_dst, Steps(_src->dimension(0)));
- return win.collapse(win, Window::DimZ);
-}
-
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.h
deleted file mode 100644
index ac9ddaa9d4..0000000000
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.h
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATELOGITS1DMAXSHIFTEXPSUM
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATELOGITS1DMAXSHIFTEXPSUM
-
-#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-class ClTemplateLogits1DMaxShiftExpSum final : public IGpuTemplateComponentWriter
-{
-public:
- using Attributes = ClComponentLogits1DMaxShiftExpSum::Attributes;
-
- /** Constructor
- *
- * @param[in] id Component id
- * @param[in] tensors Tensor arguments to the component
- * @param[in] attributes Component attributes
- */
- ClTemplateLogits1DMaxShiftExpSum(ComponentId id,
- const ArgumentPack<ITensorInfo> &tensors,
- const Attributes &attributes);
- /** Prevent instances of this class from being copy constructed */
- ClTemplateLogits1DMaxShiftExpSum(const ClTemplateLogits1DMaxShiftExpSum &) = delete;
- /** Prevent instances of this class from being copied */
- ClTemplateLogits1DMaxShiftExpSum &operator=(const ClTemplateLogits1DMaxShiftExpSum &) = delete;
- /** Allow instances of this class to be move constructed */
- ClTemplateLogits1DMaxShiftExpSum(ClTemplateLogits1DMaxShiftExpSum &&) = default;
- /** Allow instances of this class to be moved */
- ClTemplateLogits1DMaxShiftExpSum &operator=(ClTemplateLogits1DMaxShiftExpSum &&) = default;
- /** Generate kernel component name */
- std::string get_name() const override;
- /** Generate kernel component code template
- *
- * @param[in] comp_group Component group of which the component is a part
- *
- * @return std::string Component code
- */
- std::string get_component_code(const ComponentGroup &comp_group) const override;
- /** Declare all variables used by the component in the @p vtable
- *
- * @param[out] vtable Variable table
- * @param[in] comp_group Component group of which the component is a part
- */
- void declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
- /** Generate the tag look-up table used to instantiate the component code.
- *
- * @param[in] vtable Variable table
- * @param[in] comp_group Component group of which the component is a part
- *
- * @return TagLUT Tag lookup table
- */
- TagLUT get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
- /** Generate the build options used in the component
- *
- * @param[in] comp_group Component group of which the component is a part
- *
- * @return CLBuildOptions Build options
- */
- CLBuildOptions get_build_options(const ComponentGroup &comp_group) const override;
- /** Generate the component config id string used for tuning */
- std::string get_config_id() const override;
- /** Generate the header list used in the component */
- std::set<std::string> get_headers_list() const override;
- /** Generate the execution window for the component */
- Window get_window() const override;
-
-private:
- const ITensorInfo *_src; // input
- const ITensorInfo *_sum; // exponentiated and summed input
- const ITensorInfo *_dst; // exponentiated input
- Attributes _attributes;
-};
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATELOGITS1DMAXSHIFTEXPSUM */
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.cpp
deleted file mode 100644
index 7d7c3e6673..0000000000
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.cpp
+++ /dev/null
@@ -1,171 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.h"
-
-#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
-
-#include "src/core/helpers/WindowHelpers.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-ClTemplateLogits1DNorm::ClTemplateLogits1DNorm(ComponentId id,
- const ArgumentPack<ITensorInfo> &tensors,
- const Attributes &attributes)
- : IGpuTemplateComponentWriter{id, tensors}, _src{}, _sum{}, _dst{}, _attributes{attributes}
-{
- _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
- _sum = this->tensors().get_const_tensor(TensorType::ACL_SRC_1);
- _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
- ARM_COMPUTE_ERROR_ON_NULLPTR(_src);
- ARM_COMPUTE_ERROR_ON_NULLPTR(_sum);
- ARM_COMPUTE_ERROR_ON_NULLPTR(_dst);
-}
-
-std::string ClTemplateLogits1DNorm::get_name() const
-{
- return "logits_1d_norm";
-}
-
-std::string ClTemplateLogits1DNorm::get_component_code(const ComponentGroup &comp_group) const
-{
- ARM_COMPUTE_UNUSED(comp_group);
-
- std::string code = R"_(
-//------------------ START KERNEL {{meta_kernel_id}} ---------------------
-{
- const int x_offs = g_ind_0 * sizeof({{DATA_TYPE}});
- __global uchar *src_addr = {{src}}_ptr + {{src}}_offset_first_element_in_bytes + x_offs + g_ind_1 * {{src}}_stride_y + g_ind_2 * {{src}}_stride_z;
- __global uchar *dst_addr = {{dst}}_ptr + {{dst}}_offset_first_element_in_bytes + x_offs + g_ind_1 * {{dst}}_stride_y + g_ind_2 * {{dst}}_stride_z;
- Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP({{sum}});
-)_";
- // Load the sum value of the 1D logits vector (row)
- code += R"_(
- {{DATA_TYPE}} sum_val = *((__global {{DATA_TYPE}} *)offset(&sum, 0, g_ind_1));
- VEC_DATA_TYPE({{DATA_TYPE}}, N0)
- data0 = VLOAD(N0)(0, (__global {{DATA_TYPE}} *)src_addr);
-)_";
-
- if (_attributes.is_log_softmax())
- {
- code += R"_(
- sum_val = log(sum_val);
- data0 -= sum_val;
-)_";
- }
- else
- {
- code += R"_(
- data0 /= sum_val;
-)_";
- }
-
- code += R"_(
- STORE_VECTOR_SELECT(data, {{DATA_TYPE}}, dst_addr, N0, PARTIAL_N0, PARTIAL_N0 != 0 && g_ind_0 == 0);
-}
-//------------------ END KERNEL {{meta_kernel_id}} ---------------------
-)_";
-
- return code;
-}
-
-void ClTemplateLogits1DNorm::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
-{
- vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), "src");
-
- vtable.declare_variable(comp_group, _sum, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), "sum");
-
- vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), "dst");
-}
-
-TagLUT ClTemplateLogits1DNorm::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
-{
- ARM_COMPUTE_UNUSED(comp_group);
-
- TagLUT lut{};
-
- // Arguments and global shared variables
- lut["src"] = vtable.get_variable(_src);
- lut["sum"] = vtable.get_variable(_sum);
- lut["dst"] = vtable.get_variable(_dst);
-
- // Local build options
- lut["meta_kernel_id"] = id();
-
- const DataType data_type = _src->data_type();
-
- lut["DATA_TYPE"] = get_cl_type_from_data_type(data_type);
-
- return lut;
-}
-
-CLBuildOptions ClTemplateLogits1DNorm::get_build_options(const ComponentGroup &comp_group) const
-{
- ARM_COMPUTE_UNUSED(comp_group);
- CLBuildOptions build_opts{};
-
- const auto root_window = comp_group.get_root_component()->template_writer()->get_window();
- const unsigned int n0 = root_window.x().step();
- build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
- build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string((_src->dimension(0) % n0)));
-
- return build_opts;
-}
-
-std::string ClTemplateLogits1DNorm::get_config_id() const
-{
- std::string config_id = get_name();
-
- config_id += "_";
- config_id += support::cpp11::to_string(_src->dimension(0));
- config_id += "_";
- config_id += string_from_data_type(_src->data_type());
-
- return config_id;
-}
-
-std::set<std::string> ClTemplateLogits1DNorm::get_headers_list() const
-{
- return std::set<std::string>{"helpers.h", "tile_helpers.h"};
-}
-
-Window ClTemplateLogits1DNorm::get_window() const
-{
- ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
- constexpr unsigned int serial_vector_size = 16;
- const unsigned int vector_size = adjust_vec_size(serial_vector_size, _src->dimension(0));
-
- Window win = calculate_max_window(*_src, Steps(vector_size));
- return win.collapse(win, Window::DimZ);
-}
-
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
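For reference, the normalization step this template emits boils down to dividing each exponentiated logit by the row sum, or subtracting log(sum) in the log-softmax case. A minimal scalar C++ sketch of that arithmetic, independent of the OpenCL template machinery (function and variable names here are illustrative, not part of the library):

#include <cmath>
#include <cstddef>
#include <vector>

// Scalar reference of the template's two branches:
//   data0 /= sum_val;        (softmax)
//   data0 -= log(sum_val);   (log-softmax)
std::vector<float> normalize_row(const std::vector<float> &row, float sum, bool is_log_softmax)
{
    std::vector<float> out(row.size());
    for (std::size_t i = 0; i < row.size(); ++i)
    {
        out[i] = is_log_softmax ? row[i] - std::log(sum) : row[i] / sum;
    }
    return out;
}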
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.h
deleted file mode 100644
index 5a74be5842..0000000000
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.h
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATELOGITS1DNORM
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATELOGITS1DNORM
-
-#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-class ClTemplateLogits1DNorm final : public IGpuTemplateComponentWriter
-{
-public:
- using Attributes = ClComponentLogits1DNorm::Attributes;
-
- /** Constructor
- *
- * @param[in] id Component id
- * @param[in] tensors Tensor arguments to the components
- * @param[in] attributes Component attributes
- */
- ClTemplateLogits1DNorm(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes);
- /** Prevent instances of this class from being copy constructed */
- ClTemplateLogits1DNorm(const ClTemplateLogits1DNorm &) = delete;
- /** Prevent instances of this class from being copied */
- ClTemplateLogits1DNorm &operator=(const ClTemplateLogits1DNorm &) = delete;
- /** Allow instances of this class to be move constructed */
- ClTemplateLogits1DNorm(ClTemplateLogits1DNorm &&) = default;
- /** Allow instances of this class to be moved */
- ClTemplateLogits1DNorm &operator=(ClTemplateLogits1DNorm &&) = default;
- /** Generate kernel component name */
- std::string get_name() const override;
- /** Generate kernel component code template
- *
- * @param[in] comp_group Component group of which the component is a part of
- *
- * @return std::string Component code
- */
- std::string get_component_code(const ComponentGroup &comp_group) const override;
- /** Declare all variables used by the component in the @p vtable
- *
- * @param[out] vtable Variable table
- * @param[in] comp_group Component group of which the component is a part of
- */
- void declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
- /** Generate the tag look-up table used to instantiate the component code.
- *
- * @param[in] vtable Variable table
- * @param[in] comp_group Component group of which the component is a part of
- *
- * @return TagLUT Tag lookup table
- */
- TagLUT get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
- /** Generate the build options used in the component
- *
- * @param[in] comp_group Component group of which the component is a part of
- *
- * @return CLBuildOptions Build options
- */
- CLBuildOptions get_build_options(const ComponentGroup &comp_group) const override;
- /** Generate the component config id string used for tuning */
- std::string get_config_id() const override;
- /** Generate the header list used in the component */
- std::set<std::string> get_headers_list() const override;
- /** Generate the execution window for the component */
- Window get_window() const override;
-
-private:
- const ITensorInfo *_src; // exponentiated input
- const ITensorInfo *_sum; // exponentiated and summed input
- const ITensorInfo *_dst; // normalization of input with _sum
-
- Attributes _attributes;
-};
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATELOGITS1DNORM */
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp
deleted file mode 100644
index 8936db6abe..0000000000
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp
+++ /dev/null
@@ -1,470 +0,0 @@
-/*
- * Copyright (c) 2023-2024 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "ClTemplatePool2d.h"
-
-#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/StringUtils.h"
-
-#include "src/core/helpers/WindowHelpers.h"
-#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-namespace
-{
-// Shape indexes for NHWC Datalayout
-constexpr static int32_t height_idx = 2;
-constexpr static int32_t width_idx = 1;
-constexpr static int32_t channel_idx = 0;
-} // namespace
-ClTemplatePool2d::ClTemplatePool2d(ComponentId id,
- const ArgumentPack<ITensorInfo> &tensors,
- const Attributes &attributes,
- const Settings &settings)
- : IGpuTemplateComponentWriter{id, tensors}, _src{}, _dst{}, _attributes{attributes}, _settings{settings}
-{
- _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
- _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
- ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
-}
-
-std::string ClTemplatePool2d::get_name() const
-{
- return "pool2d";
-}
-
-std::string ClTemplatePool2d::get_component_code(const ComponentGroup &comp_group) const
-{
- ARM_COMPUTE_UNUSED(comp_group);
-
- // Condition to use 2x2 optimized kernel
- if (_attributes.pool_size() == Size2D(2, 2))
- {
- return get_2x2_kernel_code();
- }
- else
- {
- return get_MxN_kernel_code();
- }
-}
-
-std::string ClTemplatePool2d::get_MxN_kernel_code() const
-{
- const auto pool_type = _attributes.pool_type();
- const bool fp_mixed_precision = (_src->data_type() == DataType::F16) && pool_type != PoolingType::MAX;
-
- // Define pool op macro.
- std::string pool_op = (pool_type == PoolingType::AVG) ? R"_(#define POOL_OP(x,y) ((x) + (y)))_"
- : R"_(#define POOL_OP(x,y) (fmax((x), (y))) )_";
-
- // Kernel start
- // Note: If C is not a multiple of N0, we shift back by PARTIAL_N0 elements to compute the leftover elements for get_global_id(0) == 0
- // Note: If C is less than N0, N0 should be shrunk to the closest smaller N0. This operation is performed on the host side
- std::string code = R"_(
-//------------------ START KERNEL {{meta_kernel_id}} ---------------------
-// IN_0(src) {{src}}
-// OUT(dst, accum) {{dst}}
-
-{
- const int idx_out_c = g_ind_0;
- const int idx_out_w = g_ind_1;
-)_";
-
- // Add macro for POOL_OP
- code += "\n" + pool_op + "\n";
-
- code += R"_(
- const int idx_out_h = g_ind_2 % {{DST_HEIGHT}};
- const int idx_out_n = g_ind_2 / {{DST_HEIGHT}};
-)_";
-
- // Define common variables.
- code += R"_(
- __global unsigned char *in_base_ptr = {{src}}_ptr + {{src}}_offset_first_element_in_bytes + idx_out_c * sizeof({{DATA_TYPE}}) + idx_out_n * {{src}}_stride_w;
-
- __global unsigned char *out_base_ptr = {{dst}}_ptr + {{dst}}_offset_first_element_in_bytes + idx_out_c * sizeof({{DATA_TYPE}}) + idx_out_w * {{dst}}_stride_y + idx_out_h * {{dst}}_stride_z + idx_out_n * {{dst}}_stride_w;
-
- VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)
- res0 = {{INITIAL_VALUE}};
-
- const int idx_in_w = idx_out_w * {{STRIDE_X}} - {{PAD_X}};
- const int idx_in_h = idx_out_h * {{STRIDE_Y}} - {{PAD_Y}};
-
- const int pool_x_s = max((int)0, -idx_in_w);
- const int pool_x_e = min((int){{POOL_SIZE_X}}, (int){{SRC_WIDTH}} - idx_in_w);
- const int pool_y_s = max((int)0, -idx_in_h);
- const int pool_y_e = min((int){{POOL_SIZE_Y}}, (int){{SRC_HEIGHT}} - idx_in_h);
-)_";
-
- // Determine the filter size depending on whether padding is excluded
- if (_attributes.exclude_padding())
- {
- code += R"_(
- const int filter_size = (pool_y_e - pool_y_s) * (pool_x_e - pool_x_s);
-)_";
- }
- else
- {
- code += R"_(
- const int filter_size = {{POOL_SIZE_X}} * {{POOL_SIZE_Y}};
-)_";
- }
-
- // Loop over the pooling window
- // If global pooling
- if (_attributes.pool_size().x() == _src->dimension(width_idx) &&
- _attributes.pool_size().y() == _src->dimension(height_idx))
- {
- // Begin loop
- code += R"_(
- // Global pooling path
- for(int y = 0; y < {{POOL_SIZE_Y}}; ++y)
- {
- #pragma unroll 8
- for(int x = 0; x < {{POOL_SIZE_X}}; ++x)
- {
- VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)
- data0;
-)_";
- }
- else // if local pooling size
- {
- code += R"_(
- for(int y = pool_y_s; y < pool_y_e; ++y)
- {
- #pragma unroll 8
- for(int x = pool_x_s; x < pool_x_e; ++x)
- {
- VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)
- data0;
-)_";
- } // end else
-
- // Loop body: load data and accumulate; use a 32-bit accumulator if mixed precision is enabled.
- // The appended code also closes the pooling loops.
- if (fp_mixed_precision)
- {
- // With FP_MIXED_PRECISION, ACC_DATA_TYPE differs from DATA_TYPE
- code += R"_(
- data0 = CONVERT(VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + (x + idx_in_w) * {{src}}_stride_y + (y + idx_in_h) * {{src}}_stride_z)), VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0));
- res0 = POOL_OP(res0, data0);
- }
- }
-)_";
- }
- else // load data, compute result and end loop
- {
- code += R"_(
- data0 = VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + (x + idx_in_w) * {{src}}_stride_y + (y + idx_in_h) * {{src}}_stride_z));
- res0 = POOL_OP(res0, data0);
- }
- }
-)_";
- }
-
- // For average pooling only, divide the pool output by the filter size
- if (pool_type == PoolingType::AVG)
- {
- code += R"_(
- res0 /= (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0))filter_size;
-)_";
- }
-
- // If mixed precision, convert the data type before storing, then end the kernel.
- if (fp_mixed_precision)
- {
- code += R"_(
- VEC_DATA_TYPE({{DATA_TYPE}}, N0)
- res_converted0 = CONVERT(res0, VEC_DATA_TYPE({{DATA_TYPE}}, N0));
- STORE_VECTOR_SELECT(res_converted, {{DATA_TYPE}}, out_base_ptr, N0, PARTIAL_N0, (PARTIAL_N0 != 0) && g_ind_0 == 0);
-)_";
- }
- else
- {
- // Store data
- code += R"_(
- STORE_VECTOR_SELECT(res, {{DATA_TYPE}}, out_base_ptr, N0, PARTIAL_N0, (PARTIAL_N0 != 0) && g_ind_0 == 0);
-)_";
- }
-
- code += R"_(
-//------------------ END KERNEL {{meta_kernel_id}} ---------------------
-}
-)_";
-
- return code;
-}
-
-std::string ClTemplatePool2d::get_2x2_kernel_code() const
-{
- const auto pool_type = _attributes.pool_type();
- const bool fp_mixed_precision = (_src->data_type() == DataType::F16) && pool_type != PoolingType::MAX;
- std::string pool_op = (pool_type == PoolingType::AVG) ? R"_(#define POOL_OP(x,y) ((x) + (y)))_"
- : R"_(#define POOL_OP(x,y) (fmax((x), (y))) )_";
-
- std::string code = R"_(
-//------------------ START KERNEL {{meta_kernel_id}} ---------------------
-// IN_0(src) {{src}}
-// OUT(dst, accum) {{dst}}
-
-#define SELECT_TYPE SELECT_VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)
-
-{
- const int idx_out_c = g_ind_0;
- const int idx_out_w = g_ind_1;
-)_";
-
- // Add pool op macro
- code += "\n" + pool_op + "\n";
-
- // If batch size != 1, the batch size dimension is collapsed over the height dimension
- code += R"_(
- const int idx_out_h = g_ind_2 % {{DST_HEIGHT}};
- const int idx_out_n = g_ind_2 / {{DST_HEIGHT}};
-)_";
-
- code += R"_(
- const int idx_in_w = idx_out_w * {{STRIDE_X}} - {{PAD_X}};
- const int idx_in_h = idx_out_h * {{STRIDE_Y}} - {{PAD_Y}};
-
- __global unsigned char *in_base_ptr = {{src}}_ptr + {{src}}_offset_first_element_in_bytes + idx_out_c * sizeof({{DATA_TYPE}}) + idx_out_n * {{src}}_stride_w;
- __global unsigned char *out_base_ptr = {{dst}}_ptr + {{dst}}_offset_first_element_in_bytes + idx_out_c * sizeof({{DATA_TYPE}}) + idx_out_w * {{dst}}_stride_y + idx_out_h * {{dst}}_stride_z + idx_out_n *
- {{dst}}_stride_w;
- const int pool_x_s = max((int)0, -idx_in_w);
- const int pool_x_e = min((int)2, (int){{SRC_WIDTH}} - idx_in_w);
- const int pool_y_s = max((int)0, -idx_in_h);
- const int pool_y_e = min((int)2, (int){{SRC_HEIGHT}} - idx_in_h);
-
- const int filter_size = (pool_x_e - pool_x_s) * (pool_y_e - pool_y_s);
- const int x0 = pool_x_s + idx_in_w;
- const int y0 = pool_y_s + idx_in_h;
- const int x1 = pool_x_e - 1 + idx_in_w;
- const int y1 = pool_y_e - 1 + idx_in_h;
-
- REPEAT_VAR_INIT_TO_CONST(4, VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0), data, 0);
-)_";
-
- if (fp_mixed_precision)
- {
- // With FP_MIXED_PRECISION, ACC_DATA_TYPE differs from DATA_TYPE
- code += R"_(
- data0 = CONVERT(VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + x0 * {{src}}_stride_y + y0 * {{src}}_stride_z)), VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0));
- data1 = CONVERT(VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + x1 * {{src}}_stride_y + y0 * {{src}}_stride_z)), VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0));
- data2 = CONVERT(VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + x0 * {{src}}_stride_y + y1 * {{src}}_stride_z)), VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0));
- data3 = CONVERT(VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + x1 * {{src}}_stride_y + y1 * {{src}}_stride_z)), VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0));
-)_";
- }
- else
- {
- code += R"_(
- data0 = VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + x0 * {{src}}_stride_y + y0 * {{src}}_stride_z));
- data1 = VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + x1 * {{src}}_stride_y + y0 * {{src}}_stride_z));
- data2 = VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + x0 * {{src}}_stride_y + y1 * {{src}}_stride_z));
- data3 = VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + x1 * {{src}}_stride_y + y1 * {{src}}_stride_z));
-)_";
- }
-
- if (pool_type != PoolingType::MAX)
- {
- // Invalidate the loaded values if the x or y coordinate was clamped (out-of-bounds)
- code += R"_(
- if(filter_size != 4)
- {
- SELECT_TYPE cond_w_s = (SELECT_TYPE)idx_in_w < (SELECT_TYPE)0;
- SELECT_TYPE cond_w_e = (SELECT_TYPE)idx_in_w >= (SELECT_TYPE)({{SRC_WIDTH}} - 1);
- SELECT_TYPE cond_h_s = (SELECT_TYPE)idx_in_h < (SELECT_TYPE)0;
- SELECT_TYPE cond_h_e = (SELECT_TYPE)idx_in_h >= (SELECT_TYPE)({{SRC_HEIGHT}} - 1);
-
- data0 = select(data0, (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)){{INITIAL_VALUE}}, (SELECT_TYPE)(cond_w_s | cond_h_s));
- data1 = select(data1, (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)){{INITIAL_VALUE}}, (SELECT_TYPE)(cond_w_e | cond_h_s));
- data2 = select(data2, (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)){{INITIAL_VALUE}}, (SELECT_TYPE)(cond_w_s | cond_h_e));
- data3 = select(data3, (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)){{INITIAL_VALUE}}, (SELECT_TYPE)(cond_w_e | cond_h_e));
- }
-)_";
- }
-
- code += R"_(
- VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)
- res0 = data0;
- res0 = POOL_OP(res0, data1);
- res0 = POOL_OP(res0, data2);
- res0 = POOL_OP(res0, data3);
-)_";
-
- if (pool_type == PoolingType::AVG)
- {
- // For average pooling, divide the result accordingly.
- if (_attributes.exclude_padding())
- {
- code += R"_(
- res0 /= (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0))filter_size;
-)_";
- }
- else
- {
- code += R"_(
- res0 /= (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0))4;
-)_";
- }
- }
-
- // Store result
- if (fp_mixed_precision)
- {
- code += R"_(
- VEC_DATA_TYPE({{DATA_TYPE}}, N0)
- res_converted0 = CONVERT(res0, VEC_DATA_TYPE({{DATA_TYPE}}, N0));
- STORE_VECTOR_SELECT(res_converted, {{DATA_TYPE}}, out_base_ptr, N0, PARTIAL_N0, (PARTIAL_N0 != 0) && g_ind_0 == 0);
-)_";
- }
- else
- {
- code += R"_(
- STORE_VECTOR_SELECT(res, {{DATA_TYPE}}, out_base_ptr, N0, PARTIAL_N0, (PARTIAL_N0 != 0) && g_ind_0 == 0);
-)_";
- }
-
- code += R"_(
- //------------------ END KERNEL {{meta_kernel_id}} ---------------------
-}
-#undef SELECT_TYPE
-)_";
-
- return code;
-}
-
-void ClTemplatePool2d::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
-{
- vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- "src");
-
- vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- "dst");
-}
-
-TagLUT ClTemplatePool2d::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
-{
- ARM_COMPUTE_UNUSED(comp_group);
-
- TagLUT lut{};
- // Arguments and global shared variables
- lut["src"] = vtable.get_variable(_src);
- lut["dst"] = vtable.get_variable(_dst);
-
- // Local build options
- lut["meta_kernel_id"] = id();
-
- // Retrieve relevant data
- const auto padding = _attributes.pad();
- const auto stride = _attributes.stride();
- const auto pool_size = _attributes.pool_size();
- const auto data_type = _src->data_type();
- const auto use_fp_mixed_precision =
- (_src->data_type() == DataType::F16) && _attributes.pool_type() != PoolingType::MAX;
- const std::string max_initial_value =
- _settings.use_inf_as_limit() ? "(-INFINITY)"
- : float_to_string_with_full_precision(std::numeric_limits<float>::lowest());
-
- // pool specific
- lut["STRIDE_X"] = stride.x();
- lut["STRIDE_Y"] = stride.y();
- lut["PAD_X"] = padding.left;
- lut["PAD_Y"] = padding.top;
- lut["POOL_SIZE_X"] = pool_size.width;
- lut["POOL_SIZE_Y"] = pool_size.height;
-
- // Datatypes and variables
- lut["ACC_DATA_TYPE"] = get_cl_type_from_data_type(
- (use_fp_mixed_precision) ? (DataType::F32) : (data_type)); // Type of accumulators to use.
- lut["DATA_TYPE"] = get_cl_type_from_data_type(data_type);
- lut["SRC_WIDTH"] = _src->dimension(width_idx);
- lut["SRC_HEIGHT"] = _src->dimension(height_idx);
- lut["INITIAL_VALUE"] = (_attributes.pool_type() == PoolingType::MAX) ? max_initial_value : std::string("0");
-
- // Tensor specific data
- lut["DST_HEIGHT"] = _dst->dimension(height_idx);
-
- return lut;
-}
-
-CLBuildOptions ClTemplatePool2d::get_build_options(const ComponentGroup &comp_group) const
-{
- const auto root_window = comp_group.get_root_component()->template_writer()->get_window();
- const unsigned int n0 = root_window.x().step();
- const unsigned int partial_store_n0 = _dst->dimension(0) % n0;
-
- CLBuildOptions build_opts{};
- build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
- build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));
-
- return build_opts;
-}
-
-std::string ClTemplatePool2d::get_config_id() const
-{
- const DataType data_type = _src->data_type();
- const DataLayout data_layout = _src->data_layout();
-
- std::string config_id{};
- config_id += "pooling_layer_2d_";
- config_id += lower_string(string_from_data_type(data_type));
- config_id += "_";
- config_id += lower_string(string_from_data_layout(data_layout));
- config_id += "_";
- config_id += support::cpp11::to_string(_dst->dimension(width_idx));
- config_id += "_";
- config_id += support::cpp11::to_string(_dst->dimension(height_idx));
- config_id += "_";
- config_id += support::cpp11::to_string(_dst->dimension(channel_idx));
-
- return config_id;
-}
-
-std::set<std::string> ClTemplatePool2d::get_headers_list() const
-{
- return std::set<std::string>{"helpers.h", "tile_helpers.h", "repeat.h"};
-}
-
-Window ClTemplatePool2d::get_window() const
-{
- ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
- const auto output_shape = _dst->tensor_shape();
- const unsigned int vec_size = adjust_vec_size(((_dst->data_type() == DataType::F32) ? 2 : 4), _dst->dimension(0));
-
- // Create and configure kernel window
- auto win = calculate_max_window(output_shape, Steps(vec_size));
- win = win.collapse_if_possible(win, Window::DimZ); // collapse window on batch size.
- return win;
-}
-
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
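The MxN pooling template above derives its loop bounds and the exclude-padding filter size from the same arithmetic for every output element. A small host-side C++ sketch of that calculation, with illustrative names standing in for the {{STRIDE_X}}, {{PAD_X}}, {{POOL_SIZE_X}} and {{SRC_WIDTH}} tags (a reference of the math only, not library code):

#include <algorithm>

struct PoolBounds
{
    int x_start, x_end, y_start, y_end, filter_size;
};

// Mirrors pool_x_s / pool_x_e / pool_y_s / pool_y_e and filter_size in get_MxN_kernel_code().
PoolBounds compute_bounds(int out_x, int out_y, int stride_x, int stride_y,
                          int pad_x, int pad_y, int pool_w, int pool_h,
                          int src_w, int src_h, bool exclude_padding)
{
    const int in_x = out_x * stride_x - pad_x;
    const int in_y = out_y * stride_y - pad_y;

    PoolBounds b{};
    b.x_start = std::max(0, -in_x);
    b.x_end   = std::min(pool_w, src_w - in_x);
    b.y_start = std::max(0, -in_y);
    b.y_end   = std::min(pool_h, src_h - in_y);

    // With exclude_padding, the average is taken only over the valid (in-bounds) window.
    b.filter_size = exclude_padding ? (b.x_end - b.x_start) * (b.y_end - b.y_start)
                                    : pool_w * pool_h;
    return b;
}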
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.h
deleted file mode 100644
index d1d3c01669..0000000000
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.h
+++ /dev/null
@@ -1,132 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEPOOL2D
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEPOOL2D
-
-#include "arm_compute/core/experimental/Types.h"
-#include "arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h"
-#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h"
-
-#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-class ClTemplatePool2d final : public IGpuTemplateComponentWriter
-{
-public:
- using Attributes = ClComponentPool2d::Attributes;
- using Settings = ClComponentPool2d::Settings;
- /** Constructor
- *
- * @param[in] id Component id
- * @param[in] tensors Tensor arguments to the components
- * @param[in] attributes Component attributes
- * @param[in] settings Component settings
- */
- ClTemplatePool2d(ComponentId id,
- const ArgumentPack<ITensorInfo> &tensors,
- const Attributes &attributes,
- const Settings &settings);
-
- /** Prevent instances of this class from being copy constructed */
- ClTemplatePool2d(const ClTemplatePool2d &pool2d) = delete;
-
- /** Prevent instances of this class from being copied */
- ClTemplatePool2d &operator=(const ClTemplatePool2d &pool2d) = delete;
-
- /** Allow instances of this class to be move constructed */
- ClTemplatePool2d(ClTemplatePool2d &&pool2d) = default;
-
- /** Allow instances of this class to be moved */
- ClTemplatePool2d &operator=(ClTemplatePool2d &&pool2d) = default;
-
- /** Generate kernel component name */
- std::string get_name() const override;
-
- /** Generate kernel component code template
- *
- * @param[in] comp_group Component group of which the component is a part of
- *
- * @return std::string Component code
- */
- std::string get_component_code(const ComponentGroup &comp_group) const override;
- /** Declare all variables used by the component in the @p vtable
- *
- * @param[out] vtable Variable table
- * @param[in] comp_group Component group of which the component is a part of
- */
- void declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
- /** Generate the tag look-up table used to instantiate the component code.
- *
- * @param[in] vtable Variable table
- * @param[in] comp_group Component group of which the component is a part of
- *
- * @return TagLUT Tag lookup table
- */
- TagLUT get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
- /** Generate the build options used in the component
- *
- * @param[in] comp_group Component group of which the component is a part of
- *
- * @return CLBuildOptions Build options
- */
- CLBuildOptions get_build_options(const ComponentGroup &comp_group) const override;
-
- /** Generate the component config id string used for tuning */
- std::string get_config_id() const override;
-
- /** Generate the header list used in the component */
- std::set<std::string> get_headers_list() const override;
-
- /** Generate the execution window for the component */
- Window get_window() const override;
-
-private:
- /** Generate pooling kernel template code optimized for 2x2 pooling
- *
- * @return std::string Component code
- */
- std::string get_2x2_kernel_code() const;
-
- /** Generate generalised pooling kernel template code for MxN pooling
- *
- * @return std::string Component code
- */
- std::string get_MxN_kernel_code() const;
-
- const ITensorInfo *_src;
- const ITensorInfo *_dst;
- Attributes _attributes;
- Settings _settings;
-};
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEPOOL2D */
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.cpp
deleted file mode 100644
index c882353fcb..0000000000
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.cpp
+++ /dev/null
@@ -1,161 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "ClTemplateReshape.h"
-
-#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
-#include "arm_compute/core/utils/StringUtils.h"
-
-#include "src/core/helpers/WindowHelpers.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-constexpr unsigned int vector_size_byte_opencl = 16;
-
-ClTemplateReshape::ClTemplateReshape(ComponentId id, const ArgumentPack<ITensorInfo> &tensors)
- : IGpuTemplateComponentWriter{id, tensors}, _src{}, _dst{}
-{
- _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
- _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
- ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
-}
-
-std::string ClTemplateReshape::get_name() const
-{
- return "reshape";
-}
-
-std::string ClTemplateReshape::get_component_code(const ComponentGroup &comp_group) const
-{
- ARM_COMPUTE_UNUSED(comp_group);
- std::string code;
-
- code = R"_(
-//------------------ START KERNEL {{meta_kernel_id}} ---------------------
-
-// IN(src) {{src}}
-// OUT(dst, accum) {{dst}}
-
-TILE(uint, M0, 1, g_dst_indirect_y);
-{
- __global uchar * base_src_ptr = {{src}}_ptr + {{src}}_offset_first_element_in_bytes;
- const int tile_vertical_idx = g_ind_1 * {{arg_dst}}_c + g_ind_2 * {{arg_dst}}_c * {{arg_dst}}_w;
- LOOP_UNROLLING(int, _m0, 0, 1, M0,
- {
- const int row_idx = _m0 * {{arg_dst}}_c + tile_vertical_idx;
- const int tile_horizontal_idx = g_ind_0 + row_idx;
- LOOP_UNROLLING(int, _n0, 0, 1, N0,
- {
- {{src}}_ptr = base_src_ptr;
- const int linear_idx = tile_horizontal_idx + _n0;
- const int in_id_x = linear_idx % {{src}}_c;
- const int in_id_y = (linear_idx / {{src}}_c) % {{src}}_w;
- const int in_id_z = linear_idx / ({{src}}_c * {{src}}_w);
- {{src}}_ptr += in_id_x * sizeof({{DATA_TYPE}}) + in_id_y * {{src}}_stride_y + in_id_z * {{src}}_stride_z;
- {{dst}}[_m0].s[_n0] = *((__global {{DATA_TYPE}} *){{src}}_ptr);
- })
- })
-
- LOOP_UNROLLING(int, i, 0, 1, M0,
- {
- g_dst_indirect_y[i].v = (uint)min((int)(g_ind_1 + i), (int)({{arg_dst}}_w) - 1);
- g_dst_indirect_y[i].v += (int)(g_ind_2 % {{arg_dst}}_h) * (int)({{arg_dst}}_w);
- g_dst_indirect_y[i].v += (int)(g_ind_2 / {{arg_dst}}_h) * (int)({{arg_dst}}_w * {{arg_dst}}_h);
- })
-}
-//------------------ END KERNEL {{meta_kernel_id}} ---------------------
-)_";
- return code;
-}
-
-void ClTemplateReshape::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
-{
- vtable.declare_variable(comp_group, _src,
- GpuKernelArgumentInfo(common_tensor_type), // GpuKernelArgumentInfo::Type::Image_3D
- "src");
-
- vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(common_tensor_type), "dst");
-}
-
-TagLUT ClTemplateReshape::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
-{
- ARM_COMPUTE_UNUSED(comp_group);
- TagLUT lut{};
-
- // Arguments and global shared variables
- lut["src"] = vtable.get_variable(_src);
- lut["dst"] = vtable.get_variable(_dst);
- lut["arg_dst"] = vtable.get_variable(comp_group.get_any_dst_tensor());
- lut["meta_kernel_id"] = id();
- lut["DATA_TYPE"] = get_cl_type_from_data_type(_dst->data_type());
-
- return lut;
-}
-
-CLBuildOptions ClTemplateReshape::get_build_options(const ComponentGroup &comp_group) const
-{
- CLBuildOptions build_opts{};
- const auto root_window = comp_group.get_root_component()->template_writer()->get_window();
- const unsigned int n0 = root_window.x().step();
- const unsigned int m0 = root_window.y().step();
- const unsigned int partial_store_n0 = _dst->dimension(0) % n0;
- build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
- build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
- build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));
-
- return build_opts;
-}
-
-std::string ClTemplateReshape::get_config_id() const
-{
- std::string config_id{};
- config_id += lower_string(string_from_data_type(_dst->data_type()));
- config_id += "_";
- config_id += support::cpp11::to_string(_dst->dimension(0));
- config_id += "_";
- config_id += support::cpp11::to_string(_dst->dimension(1));
-
- return config_id;
-}
-
-std::set<std::string> ClTemplateReshape::get_headers_list() const
-{
- return std::set<std::string>{"helpers.h", "tile_helpers.h"};
-}
-
-Window ClTemplateReshape::get_window() const
-{
- ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
- const unsigned int n0 = adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0));
- Window win = calculate_max_window(*_dst, Steps(n0));
- return win.collapse(win, Window::DimZ);
-}
-
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
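The reshape template does not move data in bulk; for every destination element it recomputes the source coordinates from the flattened linear index. A scalar C++ sketch of that index arithmetic (names are illustrative):

#include <array>

// Mirrors the in_id_x / in_id_y / in_id_z computation in the template above,
// for an NHWC source with src_c channels and src_w width.
std::array<int, 3> src_coords_from_linear(int linear_idx, int src_c, int src_w)
{
    const int in_x = linear_idx % src_c;            // channel
    const int in_y = (linear_idx / src_c) % src_w;  // width
    const int in_z = linear_idx / (src_c * src_w);  // height (and batch)
    return {in_x, in_y, in_z};
}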
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.h
deleted file mode 100644
index 838a21db6d..0000000000
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.h
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATERESHAPE
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATERESHAPE
-
-#include "arm_compute/core/experimental/Types.h"
-
-#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-class ClTemplateReshape final : public IGpuTemplateComponentWriter
-{
-public:
- /** Constructor
- *
- * @param[in] id Component id
- * @param[in] tensors Tensor arguments to the components
- */
- ClTemplateReshape(ComponentId id, const ArgumentPack<ITensorInfo> &tensors);
- /** Prevent instances of this class from being copy constructed */
- ClTemplateReshape(const ClTemplateReshape &reshape) = delete;
- /** Prevent instances of this class from being copied */
- ClTemplateReshape &operator=(const ClTemplateReshape &reshape) = delete;
- /** Allow instances of this class to be move constructed */
- ClTemplateReshape(ClTemplateReshape &&reshape) = default;
- /** Allow instances of this class to be moved */
- ClTemplateReshape &operator=(ClTemplateReshape &&reshape) = default;
-
- /** Generate kernel component name */
- std::string get_name() const override;
-
- /** Generate kernel component code template
- *
- * @param[in] comp_group Component group of which the component is a part of
- *
- * @return std::string Component code
- */
- std::string get_component_code(const ComponentGroup &comp_group) const override;
-
- /** Declare all variables used by the component in the @p vtable
- *
- * @param[out] vtable Variable table
- * @param[in] comp_group Component group of which the component is a part of
- */
- void declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
-
- /** Generate the tag look-up table used to instantiate the component code.
- *
- * @param[in] vtable Variable table
- * @param[in] comp_group Component group of which the component is a part of
- *
- * @return TagLUT Tag lookup table
- */
- TagLUT get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
-
- /** Generate the build options used in the component
- *
- * @param[in] comp_group Component group of which the component is a part of
- *
- * @return CLBuildOptions Build options
- */
- CLBuildOptions get_build_options(const ComponentGroup &comp_group) const override;
-
- /** Generate the component config id string used for tuning */
- std::string get_config_id() const override;
-
- /** Generate the header list used in the component */
- std::set<std::string> get_headers_list() const override;
-
- /** Generate the execution window for the component */
- Window get_window() const override;
-
-private:
- const ITensorInfo *_src;
- const ITensorInfo *_dst;
-};
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATERESHAPE */
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp
deleted file mode 100644
index 846c712ceb..0000000000
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp
+++ /dev/null
@@ -1,279 +0,0 @@
-/*
- * Copyright (c) 2022-2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "ClTemplateResize.h"
-
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
-#include "arm_compute/core/utils/StringUtils.h"
-
-#include "src/core/helpers/WindowHelpers.h"
-#include "src/core/utils/ScaleUtils.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-ClTemplateResize::ClTemplateResize(ComponentId id,
- const ArgumentPack<ITensorInfo> &tensors,
- const ClTemplateResize::Attributes &attributes)
- : IGpuTemplateComponentWriter{id, tensors}, _src{}, _dst{}, _attributes{attributes}
-{
- _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
- _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
-
- ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
-}
-
-std::string ClTemplateResize::get_name() const
-{
- return _attributes.interpolation_policy() == InterpolationPolicy::BILINEAR ? "resize_bilinear" : "resize_nearest";
-}
-
-std::string ClTemplateResize::get_component_code(const IGpuTemplateComponentWriter::ComponentGroup &comp_group) const
-{
- ARM_COMPUTE_UNUSED(comp_group);
-
- std::string code = R"_(
-//------------------ START KERNEL {{meta_kernel_id}} ---------------------
-TILE(uint, 1, 1, g_dst_indirect_y);
-{
- const int yo = g_ind_2 % {{arg_dst}}_h;
- const int bout = g_ind_2 / {{arg_dst}}_h;
-)_";
-
- if (_attributes.interpolation_policy() == InterpolationPolicy::NEAREST_NEIGHBOR)
- {
- if (_attributes.sampling_policy() == SamplingPolicy::TOP_LEFT)
- {
- code += R"_(
- float xi_f = (g_ind_1 * {{SCALE_X}});
- float yi_f = (yo * {{SCALE_Y}});
-)_";
- }
- else
- {
- code += R"_(
- float xi_f = ((g_ind_1 + 0.5f) * {{SCALE_X}});
- float yi_f = ((yo + 0.5f) * {{SCALE_Y}});
-)_";
- }
-
- if (_attributes.align_corners())
- {
- code += R"_(
- xi_f = round(xi_f);
- yi_f = round(yi_f);
-)_";
- }
-
- code += R"_(
- const int xi0 = clamp((int)xi_f, 0, (int){{src}}_w - 1);
- const int yi0 = clamp((int)yi_f, 0, (int){{src}}_h - 1);
-
- T_LOAD_NHWC_WITH_DILATION({{SRC_DATA_TYPE}}, 1, 1, N0, {{SRC_TENSOR_TYPE}}, {{src}}, bout, yi0, xi0, g_ind_0, {{src}}_w, {{src}}_h, 1, 1, false, {{dst}});
-)_";
- }
- else if (_attributes.interpolation_policy() == InterpolationPolicy::BILINEAR)
- {
- if (_attributes.sampling_policy() == SamplingPolicy::TOP_LEFT)
- {
- code += R"_(
- float xi_f = (g_ind_1 * {{SCALE_X}});
- float yi_f = (yo * {{SCALE_Y}});
-)_";
- }
- else
- {
- code += R"_(
- float xi_f = ((g_ind_1 + 0.5f) * {{SCALE_X}} - 0.5f);
- float yi_f = ((yo + 0.5f) * {{SCALE_Y}} - 0.5f);
-)_";
- }
-
- code += R"_(
- const int xi = (int)floor(xi_f);
- const int yi = (int)floor(yi_f);
-
- TILE({{SRC_DATA_TYPE}}, 1, N0, in00);
- TILE({{SRC_DATA_TYPE}}, 1, N0, in01);
- TILE({{SRC_DATA_TYPE}}, 1, N0, in10);
- TILE({{SRC_DATA_TYPE}}, 1, N0, in11);
-
- in00[0].v = {{CONSTANT_VALUE}};
- in01[0].v = {{CONSTANT_VALUE}};
- in10[0].v = {{CONSTANT_VALUE}};
- in11[0].v = {{CONSTANT_VALUE}};
-
- const int xi0 = clamp(xi, 0, (int){{src}}_w - 1);
- const int yi0 = clamp(yi, 0, (int){{src}}_h - 1);
- const int xi1 = clamp(xi + 1, 0, (int){{src}}_w - 1);
- const int yi1 = clamp(yi + 1, 0, (int){{src}}_h - 1);
-
- T_LOAD_NHWC_WITH_DILATION({{SRC_DATA_TYPE}}, 1, 1, N0, {{SRC_TENSOR_TYPE}}, {{src}}, bout, yi0, xi0, g_ind_0, {{src}}_w, {{src}}_h, 1, 1, false, in00);
- T_LOAD_NHWC_WITH_DILATION({{SRC_DATA_TYPE}}, 1, 1, N0, {{SRC_TENSOR_TYPE}}, {{src}}, bout, yi0, xi1, g_ind_0, {{src}}_w, {{src}}_h, 1, 1, false, in01);
- T_LOAD_NHWC_WITH_DILATION({{SRC_DATA_TYPE}}, 1, 1, N0, {{SRC_TENSOR_TYPE}}, {{src}}, bout, yi1, xi0, g_ind_0, {{src}}_w, {{src}}_h, 1, 1, false, in10);
- T_LOAD_NHWC_WITH_DILATION({{SRC_DATA_TYPE}}, 1, 1, N0, {{SRC_TENSOR_TYPE}}, {{src}}, bout, yi1, xi1, g_ind_0, {{src}}_w, {{src}}_h, 1, 1, false, in11);
-)_";
-
- if (is_data_type_float(_src->data_type()))
- {
- code += R"_(
- const {{SRC_DATA_TYPE}} a = ({{SRC_DATA_TYPE}})(xi_f - (float)xi);
- const {{SRC_DATA_TYPE}} b = ({{SRC_DATA_TYPE}})(1.f - a);
- const {{SRC_DATA_TYPE}} a1 = ({{SRC_DATA_TYPE}})(yi_f - (float)yi);
- const {{SRC_DATA_TYPE}} b1 = ({{SRC_DATA_TYPE}})(1.f - a1);
-
- // Calculate the output
- {{dst}}[0].v = ((in00[0].v * b * b1) + (in01[0].v * a * b1) + (in10[0].v * b * a1) + (in11[0].v * a * a1));
-)_";
- }
- else
- {
- code += R"_(
- const float a = (xi_f - (float)xi);
- const float b = (1.f - a);
- const float a1 = (yi_f - (float)yi);
- const float b1 = (1.f - a1);
-
- {{dst}}[0].v = CONVERT_SAT(
- (CONVERT(in00[0].v, VEC_DATA_TYPE(float, N0)) * b * b1) +
- (CONVERT(in01[0].v, VEC_DATA_TYPE(float, N0)) * a * b1) +
- (CONVERT(in10[0].v, VEC_DATA_TYPE(float, N0)) * b * a1) +
- (CONVERT(in11[0].v, VEC_DATA_TYPE(float, N0)) * a * a1), VEC_DATA_TYPE({{DST_DATA_TYPE}}, N0));
-)_";
- }
- }
- else
- {
- ARM_COMPUTE_ERROR("Unsupported interpolation policy");
- }
-
- code += R"_(
- g_dst_indirect_y[0].v = g_ind_1 + (yo * (int)({{arg_dst}}_w)) + bout * (int)({{arg_dst}}_w * {{arg_dst}}_h);
-}
-//------------------ END KERNEL {{meta_kernel_id}} ---------------------
-)_";
-
- return code;
-}
-
-void ClTemplateResize::declare_variables(GpuKernelVariableTable &vtable,
- const IGpuTemplateComponentWriter::ComponentGroup &comp_group) const
-{
- vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- "src");
-
- vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- "dst");
-}
-
-TagLUT ClTemplateResize::get_tag_lut(const GpuKernelVariableTable &vtable,
- const IGpuTemplateComponentWriter::ComponentGroup &comp_group) const
-{
- TagLUT lut{};
-
- // Arguments and global shared variables
- lut["src"] = vtable.get_variable(_src);
- lut["dst"] = vtable.get_variable(_dst);
-
- const auto dst_argument = vtable.get_variable(comp_group.get_any_dst_tensor());
- lut["arg_dst"] = dst_argument.uniq_name;
-
- // Local build options
- lut["meta_kernel_id"] = id();
- lut["SRC_DATA_TYPE"] = get_cl_type_from_data_type(_src->data_type());
- lut["SRC_TENSOR_TYPE"] = "BUFFER";
- lut["DST_DATA_TYPE"] = get_cl_type_from_data_type(_dst->data_type());
- lut["CONSTANT_VALUE"] = string_from_pixel_value(0, _src->data_type());
-
- const float scale_x =
- scale_utils::calculate_resize_ratio(_src->dimension(1), _dst->dimension(1), _attributes.align_corners());
- const float scale_y =
- scale_utils::calculate_resize_ratio(_src->dimension(2), _dst->dimension(2), _attributes.align_corners());
-
- lut["SCALE_X"] = float_to_string_with_full_precision(scale_x);
- lut["SCALE_Y"] = float_to_string_with_full_precision(scale_y);
-
- return lut;
-}
-
-CLBuildOptions ClTemplateResize::get_build_options(const IGpuTemplateComponentWriter::ComponentGroup &comp_group) const
-{
- const Window root_window = comp_group.get_root_component()->template_writer()->get_window();
- const unsigned int n0 = root_window.x().step();
- const unsigned int m0 = root_window.y().step();
- const unsigned int partial_n0 = _dst->dimension(0) % n0;
-
- CLBuildOptions build_opts;
-
- build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
- build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
- build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_n0));
-
- return build_opts;
-}
-
-std::string ClTemplateResize::get_config_id() const
-{
- std::string config_id{};
-
- config_id += "resize_";
- config_id +=
- (_attributes.interpolation_policy() == InterpolationPolicy::NEAREST_NEIGHBOR ? "NEAREST_NEIGHBOR" : "");
- config_id += (_attributes.interpolation_policy() == InterpolationPolicy::BILINEAR ? "BILINEAR" : "");
- config_id += "_";
- config_id += (_attributes.sampling_policy() == SamplingPolicy::CENTER ? "center" : "topleft");
- config_id += "_";
- config_id += support::cpp11::to_string(_dst->dimension(0));
- config_id += "_";
- config_id += support::cpp11::to_string(_dst->dimension(1));
- config_id += "_";
- config_id += support::cpp11::to_string(_dst->dimension(2));
- config_id += "_";
- config_id += support::cpp11::to_string(_dst->dimension(3));
-
- return config_id;
-}
-
-std::set<std::string> ClTemplateResize::get_headers_list() const
-{
- return std::set<std::string>{"helpers.h", "tile_helpers.h"};
-}
-
-Window ClTemplateResize::get_window() const
-{
- ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
-
- const unsigned int n0 = adjust_vec_size(16 / _src->element_size(), _src->dimension(0));
- Window win = calculate_max_window(*_dst, Steps(n0));
- return win.collapse(win, Window::DimZ);
-}
-
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
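The BILINEAR branch of the resize template computes a fractional source coordinate, clamps the four neighbouring taps and blends them with the usual bilinear weights. A scalar C++ sketch for float data with CENTER sampling (names are illustrative; src is assumed to be a src_h x src_w plane):

#include <algorithm>
#include <cmath>

float bilinear_sample(const float *src, int src_w, int src_h,
                      int out_x, int out_y, float scale_x, float scale_y)
{
    // Half-pixel (CENTER) sampling, as in the template's non-TOP_LEFT branch.
    const float xi_f = (out_x + 0.5f) * scale_x - 0.5f;
    const float yi_f = (out_y + 0.5f) * scale_y - 0.5f;
    const int   xi   = static_cast<int>(std::floor(xi_f));
    const int   yi   = static_cast<int>(std::floor(yi_f));

    auto clamp = [](int v, int lo, int hi) { return std::min(std::max(v, lo), hi); };
    const int x0 = clamp(xi, 0, src_w - 1), x1 = clamp(xi + 1, 0, src_w - 1);
    const int y0 = clamp(yi, 0, src_h - 1), y1 = clamp(yi + 1, 0, src_h - 1);

    const float a  = xi_f - (float)xi, b  = 1.f - a;   // horizontal weights
    const float a1 = yi_f - (float)yi, b1 = 1.f - a1;  // vertical weights

    return src[y0 * src_w + x0] * b * b1 + src[y0 * src_w + x1] * a * b1 +
           src[y1 * src_w + x0] * b * a1 + src[y1 * src_w + x1] * a * a1;
}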
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.h
deleted file mode 100644
index 4c69007185..0000000000
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.h
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATERESIZE
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATERESIZE
-
-#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-class ClTemplateResize final : public IGpuTemplateComponentWriter
-{
-public:
- using Attributes = ClComponentResize::Attributes;
-
- /** Constructor
- *
- * @param[in] id Component id
- * @param[in] tensors Tensor arguments to the components
- * @param[in] attributes Component attributes
- */
- ClTemplateResize(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes);
-
- /** Destructor */
- ~ClTemplateResize() override = default;
-
- /** Prevent instances of this class from being copy constructed */
- ClTemplateResize(const ClTemplateResize &resize) = delete;
-
- /** Prevent instances of this class from being copied */
- ClTemplateResize &operator=(const ClTemplateResize &resize) = delete;
-
- /** Allow instances of this class to be move constructed */
- ClTemplateResize(ClTemplateResize &&resize) = default;
-
- /** Allow instances of this class to be moved */
- ClTemplateResize &operator=(ClTemplateResize &&resize) = default;
-
- /** Generate kernel component name */
- std::string get_name() const override;
-
- /** Generate kernel component code template
- *
- * @param[in] comp_group Component group of which the component is a part of
- *
- * @return std::string Component code
- */
- std::string get_component_code(const ComponentGroup &comp_group) const override;
-
- /** Declare all variables used by the component in the @p vtable
- *
- * @param[out] vtable Variable table
- * @param[in] comp_group Component group of which the component is a part of
- */
- void declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
-
- /** Generate the tag look-up table used to instantiate the component code.
- *
- * @param[in] vtable Variable table
- * @param[in] comp_group Component group of which the component is a part of
- *
- * @return TagLUT Tag lookup table
- */
- TagLUT get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
-
- /** Generate the build options used in the component
- *
- * @param[in] comp_group Component group of which the component is a part of
- *
- * @return CLBuildOptions Build options
- */
- CLBuildOptions get_build_options(const ComponentGroup &comp_group) const override;
-
- /** Generate the component config id string used for tuning */
- std::string get_config_id() const override;
-
- /** Generate the header list used in the component */
- std::set<std::string> get_headers_list() const override;
-
- /** Generate the execution window for the component */
- Window get_window() const override;
-
-private:
- const ITensorInfo *_src;
- const ITensorInfo *_dst;
- Attributes _attributes;
-};
-
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATERESIZE */
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp
deleted file mode 100644
index d0ec91e0a9..0000000000
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "ClTemplateStore.h"
-
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-ClTemplateStore::ClTemplateStore(ComponentId id, const ArgumentPack<ITensorInfo> &tensors)
- : IGpuTemplateComponentWriter{id, tensors}, _src{}, _dst{}
-{
- _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
- _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
-}
-
-std::string ClTemplateStore::get_name() const
-{
- return "store";
-}
-
-std::string ClTemplateStore::get_component_code(const ComponentGroup &comp_group) const
-{
- ARM_COMPUTE_UNUSED(comp_group);
-
- return R"_(
-//------------------ START KERNEL {{meta_kernel_id}} STORE ---------------------
-{
- bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
-
- T_STORE_INDIRECT_WIDTH_SELECT({{DST_DATA_TYPE}}, M0, N0, PARTIAL_N0, {{DST_TENSOR_TYPE}}, {{dst}}, g_ind_0, {{dst}}_stride_y, x_cond, {{src}}, g_dst_indirect_y);
-//------------------ END KERNEL {{meta_kernel_id}} STORE ---------------------
-}
-
-)_";
-}
-
-void ClTemplateStore::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
-{
- vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- "src");
- vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- "dst");
-}
-
-TagLUT ClTemplateStore::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
-{
- TagLUT lut{};
-
- // Arguments and global shared variables
- lut["src"] = vtable.get_variable(_src);
- lut["dst"] = vtable.get_variable(_dst);
-
- // Local build options
- lut["meta_kernel_id"] = id();
- lut["DST_TENSOR_TYPE"] = "BUFFER";
- lut["DST_DATA_TYPE"] = _dst->data_type();
-
- ARM_COMPUTE_UNUSED(comp_group);
- return lut;
-}
-
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
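For concreteness, once the tag LUT built by get_tag_lut() above is applied to the store template, the writer emits plain OpenCL text. Assuming a component id of 1, an F32 destination and purely illustrative variable names G0_src and G0_dst coming from the variable table, the instantiated fragment reads roughly:

//------------------ START KERNEL 1 STORE ---------------------
{
    bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;

    T_STORE_INDIRECT_WIDTH_SELECT(float, M0, N0, PARTIAL_N0, BUFFER, G0_dst, g_ind_0, G0_dst_stride_y, x_cond, G0_src, g_dst_indirect_y);
//------------------ END KERNEL 1 STORE ---------------------
}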
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h
deleted file mode 100644
index b8c82ceadd..0000000000
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATESTORE
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATESTORE
-
-#include "arm_compute/core/experimental/Types.h"
-
-#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-class ClTemplateStore final : public IGpuTemplateComponentWriter
-{
-public:
- /** Constructor
- *
- * @param[in] id Component id
- * @param[in] tensors Tensor arguments to the components
- */
- ClTemplateStore(ComponentId id, const ArgumentPack<ITensorInfo> &tensors);
- /** Prevent instances of this class from being copy constructed */
- ClTemplateStore(const ClTemplateStore &store) = delete;
- /** Prevent instances of this class from being copied */
- ClTemplateStore &operator=(const ClTemplateStore &store) = delete;
- /** Allow instances of this class to be move constructed */
- ClTemplateStore(ClTemplateStore &&store) = default;
- /** Allow instances of this class to be moved */
- ClTemplateStore &operator=(ClTemplateStore &&store) = default;
- /** Generate kernel component name */
- std::string get_name() const override;
- /** Generate kernel component code template
- *
- * @param[in] comp_group Component group of which the component is a part
- *
- * @return std::string Component code
- */
- std::string get_component_code(const ComponentGroup &comp_group) const override;
- /** Declare all variables used by the component in the @p vtable
- *
- * @param[out] vtable Variable table
- * @param[in] comp_group Component group of which the component is a part
- */
- void declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
- /** Generate the tag look-up table used to instantiate the component code.
- *
- * @param[in] vtable Variable table
- * @param[in] comp_group Component group of which the component is a part
- *
- * @return TagLUT Tag lookup table
- */
- TagLUT get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
-
-private:
- const ITensorInfo *_src;
- const ITensorInfo *_dst;
-};
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATESTORE */
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp
deleted file mode 100644
index d3d7c8db83..0000000000
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp
+++ /dev/null
@@ -1,325 +0,0 @@
-/*
- * Copyright (c) 2022-2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "ClTemplateWriter.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-
-#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-/// @note Some tags may go unused, since a tag may be referenced only by the macros or only by the component code
-std::string ClTemplateWriter::replace_tags(const std::string &code_template, const TagLUT &tags)
-{
- std::string replaced_code = "";
- bool scanning_pattern = false;
- std::string pattern_found = "";
- for (size_t i = 0; i + 1 < code_template.size(); ++i)
- {
- if (!scanning_pattern)
- {
- if (code_template[i] == '{' && code_template[i + 1] == '{')
- {
- i += 1;
- scanning_pattern = true;
- pattern_found = "";
- }
- else
- {
- replaced_code += code_template[i];
- }
- }
- else
- {
- if (code_template[i] == '}' && code_template[i + 1] == '}')
- {
- i += 1;
- scanning_pattern = false;
- std::string err = "Pattern " + pattern_found + " not found in tags";
- ARM_COMPUTE_ERROR_ON_MSG(tags.find(pattern_found) == tags.end(), err.c_str());
- replaced_code += tags.find(pattern_found)->second.value;
- }
- else
- {
- pattern_found += code_template[i];
- }
- }
- }
-
- return replaced_code;
-}
-ClTemplateWriter::~ClTemplateWriter()
-{
-}
-ClTemplateWriter::ClTemplateWriter(const GpuKernelComponentGroup &components) : _components{components}
-{
-}
-std::string ClTemplateWriter::get_name()
-{
- return write_kernel_name();
-}
-std::string ClTemplateWriter::get_code()
-{
- return write_code();
-}
-std::string ClTemplateWriter::get_config_id()
-{
- std::string config_id = get_name();
- for (const auto &comp : _components)
- {
- config_id += "--" + comp->template_writer()->get_config_id() + "--";
- }
-
- return config_id;
-}
-
-CLBuildOptions ClTemplateWriter::get_build_options()
-{
- CLBuildOptions build_opts{};
-
- for (const auto &comp : _components)
- {
- build_opts.add_options(comp->template_writer()->get_build_options(_components).options());
- }
-
- return build_opts;
-}
-
-Window ClTemplateWriter::get_window() const
-{
- const auto root_comp = _components.get_root_component();
- ARM_COMPUTE_ERROR_ON_MSG(root_comp == nullptr, "No root component found");
- return root_comp->template_writer()->get_window();
-}
-
-std::map<ITensorInfo::Id, GpuKernelArgument> ClTemplateWriter::get_tensors()
-{
- // Assemble GpuKernelArguments
- std::map<ITensorInfo::Id, GpuKernelArgument> tensors;
- for (const auto t : _components.get_argument_tensors())
- {
- tensors.emplace(t->id(), GpuKernelArgument{*t, _vtable.get_variable(t).kernel_argument_info});
- }
- return tensors;
-}
-
-std::string ClTemplateWriter::write_code()
-{
- ARM_COMPUTE_ERROR_ON_MSG(_components.empty(), "No components found");
-
- // These data structures will hold the data from all the components in the blueprint
- std::set<std::string> headers_list{};
- std::set<std::string> additional_macros{};
- std::vector<std::string> component_codes{}; // vector because order matters
-
- // Pass 1: Declare all kernel variables
- for (auto &component : _components)
- {
- component->template_writer()->declare_variables(_vtable, _components);
- }
- // Pass 2: Generate component codes
- for (auto &component : _components)
- {
- const auto component_writer = component->template_writer();
- auto curr_headers_list = component_writer->get_headers_list();
- auto curr_additional_macros = component_writer->get_additional_macros();
- auto curr_component_code = component_writer->get_component_code(_components);
- const auto var_lut = component_writer->get_tag_lut(
- _vtable,
- _components); // Ideally this could be merged with get_component_code once a finer-grained code generation technique is available
- component_codes.push_back(replace_tags(curr_component_code, var_lut));
-
- headers_list.insert(curr_headers_list.begin(), curr_headers_list.end());
- if (!curr_additional_macros.empty()) // Some components might not have any
- {
- additional_macros.insert(replace_tags(curr_additional_macros, var_lut));
- }
- }
-
- // Step 3: Assemble the data gathered by traversing the graph into the string "code"
- std::string code = "";
-
- for (auto &header : headers_list)
- {
-#if defined(EMBEDDED_KERNELS)
- code += CLKernelLibrary::get().get_program(header).first;
-#else // defined(EMBEDDED_KERNELS)
- code += "#include \"" + header + "\"\n";
-#endif // defined(EMBEDDED_KERNELS)
- }
-
- for (auto &macros : additional_macros)
- {
- code += macros;
- }
-
- auto arguments = _components.get_argument_tensors();
- std::sort(arguments.begin(), arguments.end(),
- [](const ITensorInfo *l, const ITensorInfo *r) { return l->id() < r->id(); });
- code += write_kernel_signature(_vtable.get_variable_list(arguments));
-
- code += "\n{\n\n";
-
- code += " //------------------ START KERNEL_BUILDER_COORDINATE ---------------------\n\n";
- code += write_global_section();
- code += " //------------------ END KERNEL_BUILDER_COORDINATE ---------------------\n";
-
- {
- const auto tiles = _components.get_tiles();
- std::stringstream tiles_ss;
-
- tiles_ss << " //------------------ START TILE DECLARATION ---------------------\n";
-
- for (auto tile : tiles)
- {
- const auto var = _vtable.get_variable(tile);
- const auto data_type = get_cl_type_from_data_type(tile->data_type());
- const auto var_name = var.uniq_name;
-
- tiles_ss << " TILE(" << data_type << ", M0, N0, " << var_name << ");\n";
- }
-
- tiles_ss << " //------------------ END TILE DECLARATION ---------------------\n";
-
- code += tiles_ss.str();
- }
-
- for (const auto &component_code : component_codes)
- {
- code += component_code;
- code += "\n";
- }
-
- code += "}\n";
-
- return code;
-}
-std::string ClTemplateWriter::write_global_section() const
-{
- const auto dst_info = _components.get_any_dst_tensor();
- const auto dst_w = dst_info->dimension(0);
- const auto tile_w = std::max(1, get_window().x().step());
- const auto tile_h = std::max(1, get_window().y().step());
- auto leftover_w = dst_w % tile_w;
-
- std::string code = "";
- code += std::string(" int g_ind_0 = GET_SPATIAL_IDX(0, ") + std::to_string(tile_w) + ", " +
- std::to_string(leftover_w) + ");\n";
- code += std::string(" int g_ind_1 = GET_SPATIAL_IDX(1, ") + std::to_string(tile_h) + ", " + "0);\n";
- code += std::string(" int g_ind_2 = GET_SPATIAL_IDX(2, 1, 0);\n\n");
-
- code += " const bool g_cond_x = (g_ind_0 == 0);\n";
- code += " const bool g_cond_y = (g_ind_1 == 0);\n";
-
- return code;
-}
-std::string ClTemplateWriter::write_argument_declaration(const GpuKernelVariableTable::TensorVariable &var) const
-{
- std::string code;
- switch (var.kernel_argument_info.type)
- {
- case GpuKernelArgumentInfo::Type::Vector:
- {
- code += "\n VECTOR_DECLARATION(" + var.uniq_name + ")";
- break;
- }
- case GpuKernelArgumentInfo::Type::Image:
- {
- code += "\n IMAGE_DECLARATION(" + var.uniq_name + ")";
- break;
- }
- case GpuKernelArgumentInfo::Type::Image_3D:
- {
- code += "\n IMAGE_DECLARATION(" + var.uniq_name + "),";
- code += "\n unsigned int " + var.uniq_name + "_stride_z";
- break;
- }
- case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D:
- {
- code += "\n __read_only image2d_t " + var.uniq_name + "_img,";
- code += "\n unsigned int " + var.uniq_name + "_stride_z";
- break;
- }
- case GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer:
- {
- code += "\n TENSOR4D_T(" + var.uniq_name + ", BUFFER)";
- break;
- }
- case GpuKernelArgumentInfo::Type::Tensor_4D_t_Image:
- {
- code += "\n TENSOR4D_T(" + var.uniq_name + ", IMAGE)";
- break;
- }
- case GpuKernelArgumentInfo::Type::Tensor_3D:
- {
- code += "\n TENSOR3D_DECLARATION(" + var.uniq_name + ")";
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Unsupported declaration generation for GpuKernelArgumentInfo::Type");
- }
- }
- return code;
-}
-std::string ClTemplateWriter::write_kernel_signature(const GpuKernelVariableTable::VariableList &argument_list) const
-{
- std::string code = "\n__kernel void " + write_kernel_name() + "(";
-
- for (int i = 0; i < static_cast<int>(argument_list.size()) - 1; ++i)
- {
- code += write_argument_declaration(argument_list[i]) + ",";
- }
- if (static_cast<int>(argument_list.size()) - 1 >= 0)
- {
- code += write_argument_declaration(argument_list[argument_list.size() - 1]);
- }
-
- code += ')';
-
- return code;
-}
-std::string ClTemplateWriter::write_kernel_name() const
-{
- if (_components.empty())
- {
- return "empty_kernel";
- }
- std::string name = _components[0]->template_writer()->get_name();
- for (size_t i = 1; i < _components.size(); ++i)
- {
- name += "___";
- name += _components[i]->template_writer()->get_name();
- }
-
- return name;
-}
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.h
deleted file mode 100644
index 83f617b6c6..0000000000
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.h
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEWRITER
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEWRITER
-
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
-#include "src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
-
-#include <map>
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-/** Use a templated-string-based method to write kernel code
- * It stitches the component code templates together based on the valid fusion configuration.
- * It then instantiates the actual kernel code from the template and the generated tag lookup table.
- */
-class ClTemplateWriter : public IGpuKernelWriter
-{
-public:
- /** Instantiates a kernel code string from the kernel code template
- * @note Some tags may go unused, since a tag may be referenced only by the macros or only by the component code
- *
- * @param[in] code_template Kernel code template
- * @param[in] tags Tag lookup table
- *
- * @return std::string Instantiated kernel string
- */
- static std::string replace_tags(const std::string &code_template, const TagLUT &tags);
- /** Default constructor */
- ClTemplateWriter() = default;
- /** Constructor
- *
- * @param[in] components Kernel component group from which the kernel will be generated
- */
- ClTemplateWriter(const GpuKernelComponentGroup &components);
- /** Destructor */
- ~ClTemplateWriter() override;
- /** Generate kernel name */
- std::string get_name() override;
- /** Generate kernel code */
- std::string get_code() override;
- /** Generate build options */
- CLBuildOptions get_build_options() override;
- /** Generate config id string of the entire kernel. This is used for tuning */
- std::string get_config_id() override;
- /** Generate execution window */
- Window get_window() const override;
- /** Get the kernel argument list of the kernel */
- std::map<ITensorInfo::Id, GpuKernelArgument> get_tensors() override;
-
-private:
- std::string write_kernel_name() const;
- std::string write_code();
- std::string write_global_section() const;
- std::string write_argument_declaration(const GpuKernelVariableTable::TensorVariable &var) const;
- std::string write_kernel_signature(const GpuKernelVariableTable::VariableList &argument_list) const;
-
-private:
- GpuKernelComponentGroup _components{};
- GpuKernelVariableTable _vtable{};
-};
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEWRITER */
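For orientation only, the sketch below shows how the public interface declared above was typically driven when a fused kernel was compiled. The variable components is assumed to be an already-populated GpuKernelComponentGroup; both it and the surrounding code are hypothetical, and only the methods declared above are used.

// Hypothetical driver; 'components' is assumed to be a populated GpuKernelComponentGroup.
ClTemplateWriter writer{components};

const std::string   kernel_name = writer.get_name();          // component names joined with "___"
const std::string   kernel_code = writer.get_code();          // fully instantiated OpenCL source
const CLBuildOptions build_opts = writer.get_build_options(); // union of the per-component options
const std::string   config_id   = writer.get_config_id();     // id string used for tuning
const Window        window      = writer.get_window();        // execution window of the root component
const auto          kernel_args = writer.get_tensors();       // maps ITensorInfo::Id to GpuKernelArgument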
diff --git a/tests/validation/dynamic_fusion/gpu/Integration.cpp b/tests/validation/dynamic_fusion/gpu/Integration.cpp
index 80dcaa8f90..453983c077 100644
--- a/tests/validation/dynamic_fusion/gpu/Integration.cpp
+++ b/tests/validation/dynamic_fusion/gpu/Integration.cpp
@@ -63,7 +63,7 @@ namespace validation
TEST_SUITE(CL)
TEST_SUITE(INTEGRATION)
TEST_SUITE(DYNAMIC_FUSION)
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF // Conv2d is not ported to ckw yet. COMPMID-6259
+
TEST_CASE(Conv2d, framework::DatasetMode::ALL)
{
/* Computation:
@@ -156,7 +156,7 @@ TEST_CASE(Conv2d, framework::DatasetMode::ALL)
0.001f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */
validate(CLAccessor(t_dst), ref_t_dst_nchw, tolerance_f32);
}
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
+
TEST_CASE(Add_Output_Add_Output, framework::DatasetMode::ALL)
{
/* Computation:
@@ -368,8 +368,9 @@ TEST_CASE(Add_Output_Add_Cast_Cast_Output, framework::DatasetMode::ALL)
validate(CLAccessor(t_out_1), ref_t_out_1, tolerance_cast_f32);
}
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF // Conv2d is not ported to ckw yet. COMPMID-6259
-TEST_CASE(Conv2d_Sigmoid_DepthwiseConv2d_Mul, framework::DatasetMode::ALL)
+/// TODO: COMPMID-6593: This integration test fails with the CKW backend.
+/// It was not enabled for CKW before, so the failure went unnoticed.
+TEST_CASE(Conv2d_Sigmoid_DepthwiseConv2d_Mul, framework::DatasetMode::DISABLED)
{
// (tensor0)
// |
@@ -580,7 +581,6 @@ TEST_CASE(Conv2d_Sigmoid_DepthwiseConv2d_Mul, framework::DatasetMode::ALL)
constexpr RelativeTolerance<float> tolerance(0.001f);
validate(CLAccessor(tensor6), ref_mul_dst_nchw, tolerance);
}
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
TEST_SUITE(Invalid_Fusion_Should_Fail)
TEST_CASE(Multiple_Complex_Ops_0, framework::DatasetMode::ALL)
diff --git a/tests/validation/dynamic_fusion/gpu/cl/DepthwiseConv2d.cpp b/tests/validation/dynamic_fusion/gpu/cl/DepthwiseConv2d.cpp
index 40e1ea8929..2f8c639cea 100644
--- a/tests/validation/dynamic_fusion/gpu/cl/DepthwiseConv2d.cpp
+++ b/tests/validation/dynamic_fusion/gpu/cl/DepthwiseConv2d.cpp
@@ -290,7 +290,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge,
{
validate(CLAccessor(_target), _reference, tolerance_f16);
}
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF // Do not include this test as dilation not supported yet in DepthwiseConv2d CKW kernel
+
TEST_SUITE(Dilation)
FIXTURE_DATA_TEST_CASE(RunSmall,
DynamicFusionGpuDepthwiseConv2dFixture<half>,
@@ -313,7 +313,6 @@ FIXTURE_DATA_TEST_CASE(RunLarge,
validate(CLAccessor(_target), _reference, tolerance_f16);
}
TEST_SUITE_END() // Dilation
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
TEST_SUITE_END() // W3x3
TEST_SUITE(Generic)
@@ -336,7 +335,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge,
{
validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
}
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF // Do not include this test as dilation not supported yet in DepthwiseConv2d CKW kernel
+
TEST_SUITE(Dilation)
FIXTURE_DATA_TEST_CASE(RunSmall,
DynamicFusionGpuDepthwiseConv2dFixture<half>,
@@ -359,7 +358,6 @@ FIXTURE_DATA_TEST_CASE(RunLarge,
validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
}
TEST_SUITE_END() // Dilation
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
TEST_SUITE_END() // Generic
TEST_SUITE_END() // FP16
@@ -385,7 +383,6 @@ FIXTURE_DATA_TEST_CASE(RunLarge,
validate(CLAccessor(_target), _reference, tolerance_f32);
}
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF // Do not include this test as dilation not supported yet in DepthwiseConv2d CKW kernel
TEST_SUITE(Dilation)
FIXTURE_DATA_TEST_CASE(RunSmall,
@@ -409,7 +406,6 @@ FIXTURE_DATA_TEST_CASE(RunLarge,
validate(CLAccessor(_target), _reference, tolerance_f32);
}
TEST_SUITE_END() // Dilation
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
TEST_SUITE_END() // W3x3
TEST_SUITE(Generic)
@@ -445,7 +441,6 @@ FIXTURE_DATA_TEST_CASE(RunLargeKernelSize,
validate(CLAccessor(_target), _reference, tolerance_f32);
}
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF // Do not include this test as dilation not supported yet in DepthwiseConv2d CKW kernel
TEST_SUITE(Dilation)
FIXTURE_DATA_TEST_CASE(RunSmall,
DynamicFusionGpuDepthwiseConv2dFixture<float>,
@@ -468,7 +463,6 @@ FIXTURE_DATA_TEST_CASE(RunLarge,
validate(CLAccessor(_target), _reference, tolerance_f32);
}
TEST_SUITE_END() // Dilation
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
TEST_SUITE_END() // Generic
TEST_SUITE_END() // FP32
TEST_SUITE_END() // Float
diff --git a/tests/validation/dynamic_fusion/gpu/cl/MatMul.cpp b/tests/validation/dynamic_fusion/gpu/cl/MatMul.cpp
index 96b79679c3..82d66ca6ce 100644
--- a/tests/validation/dynamic_fusion/gpu/cl/MatMul.cpp
+++ b/tests/validation/dynamic_fusion/gpu/cl/MatMul.cpp
@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifdef ACL_INTERNAL_TEST_CKW_IN_DF
+
#include "tests/AssetsLibrary.h"
#include "tests/CL/CLAccessor.h"
#include "tests/datasets/LargeMatMulDataset.h"
@@ -333,4 +333,3 @@ TEST_SUITE_END() // CL
} // namespace validation
} // namespace test
} // namespace arm_compute
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
diff --git a/tests/validation/dynamic_fusion/gpu/cl/Pool2d.cpp b/tests/validation/dynamic_fusion/gpu/cl/Pool2d.cpp
index e537826c71..be816b32b3 100644
--- a/tests/validation/dynamic_fusion/gpu/cl/Pool2d.cpp
+++ b/tests/validation/dynamic_fusion/gpu/cl/Pool2d.cpp
@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifdef ACL_INTERNAL_TEST_CKW_IN_DF
+
#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h"
#include "tests/CL/CLAccessor.h"
@@ -217,4 +217,3 @@ TEST_SUITE_END() // CL
} // namespace validation
} // namespace test
} // namespace arm_compute
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
diff --git a/tests/validation/dynamic_fusion/gpu/cl/Reshape.cpp b/tests/validation/dynamic_fusion/gpu/cl/Reshape.cpp
index 43617fe1be..a1495cf014 100644
--- a/tests/validation/dynamic_fusion/gpu/cl/Reshape.cpp
+++ b/tests/validation/dynamic_fusion/gpu/cl/Reshape.cpp
@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF // Do not include this test if ACL_INTERNAL_TEST_CKW_IN_DF and the op has not been ported to ckw
+
#include "tests/CL/CLAccessor.h"
#include "tests/datasets/ReshapeLayerDataset.h"
#include "tests/framework/datasets/Datasets.h"
@@ -82,7 +82,7 @@ using DynamicFusionGpuReshapeLayerFixture =
TEST_SUITE(F32)
FIXTURE_DATA_TEST_CASE(RunSmall,
DynamicFusionGpuReshapeLayerFixture<float>,
- framework::DatasetMode::ALL,
+ framework::DatasetMode::DISABLED,
combine(datasets::SmallReshapeLayerDataset(),
framework::dataset::make("DataType", DataType::F32)))
{
@@ -94,7 +94,7 @@ TEST_SUITE_END() // F32
TEST_SUITE(F16)
FIXTURE_DATA_TEST_CASE(RunSmall,
DynamicFusionGpuReshapeLayerFixture<half>,
- framework::DatasetMode::ALL,
+ framework::DatasetMode::DISABLED,
combine(datasets::SmallReshapeLayerDataset(),
framework::dataset::make("DataType", DataType::F16)))
{
@@ -106,7 +106,7 @@ TEST_SUITE_END() // F16
TEST_SUITE(U8)
FIXTURE_DATA_TEST_CASE(RunSmall,
DynamicFusionGpuReshapeLayerFixture<uint8_t>,
- framework::DatasetMode::ALL,
+ framework::DatasetMode::DISABLED,
combine(datasets::SmallReshapeLayerDataset(),
framework::dataset::make("DataType", DataType::U8)))
{
@@ -118,7 +118,7 @@ TEST_SUITE_END() // U8
TEST_SUITE(S8)
FIXTURE_DATA_TEST_CASE(RunSmall,
DynamicFusionGpuReshapeLayerFixture<int8_t>,
- framework::DatasetMode::ALL,
+ framework::DatasetMode::DISABLED,
combine(datasets::SmallReshapeLayerDataset(),
framework::dataset::make("DataType", DataType::S8)))
{
@@ -130,7 +130,7 @@ TEST_SUITE_END() // S8
TEST_SUITE(S16)
FIXTURE_DATA_TEST_CASE(RunSmall,
DynamicFusionGpuReshapeLayerFixture<int16_t>,
- framework::DatasetMode::ALL,
+ framework::DatasetMode::DISABLED,
combine(datasets::SmallReshapeLayerDataset(),
framework::dataset::make("DataType", DataType::S16)))
{
@@ -145,5 +145,3 @@ TEST_SUITE_END() // CL
} // namespace validation
} // namespace test
} // namespace arm_compute
-
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
diff --git a/tests/validation/dynamic_fusion/gpu/cl/Softmax.cpp b/tests/validation/dynamic_fusion/gpu/cl/Softmax.cpp
index b7cb6bace6..8f5a1ed14a 100644
--- a/tests/validation/dynamic_fusion/gpu/cl/Softmax.cpp
+++ b/tests/validation/dynamic_fusion/gpu/cl/Softmax.cpp
@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF // Do not include this test if ACL_INTERNAL_TEST_CKW_IN_DF and the op has not been ported to ckw
+
#include "arm_compute/core/Types.h"
#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.h"
@@ -46,62 +46,70 @@ namespace validation
RelativeTolerance<half> tolerance_f16(half(0.2));
RelativeTolerance<float> tolerance_f32(0.001f);
+using framework::dataset::make;
+
+/// TODO: COMPMID-6713
+/// Softmax is not implemented in CKW. Therefore, the tests are DISABLED.
+/// Enable the tests when Softmax is implemented in CKW.
+
TEST_SUITE(CL)
TEST_SUITE(DYNAMIC_FUSION)
TEST_SUITE(SOFTMAX)
// *INDENT-OFF*
// clang-format off
-DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
- framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U), 1, DataType::F32), // Mismatching data types
- TensorInfo(TensorShape(27U, 13U), 1, DataType::F32), // Mismatching shapes
- TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
- TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
- TensorInfo(TensorShape(32U, 13U), 1, DataType::S32), // Unsupported data type
- TensorInfo(TensorShape(32U, 13U), 1, DataType::F16),
- TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
- TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
- TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
- TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
-
- }),
- framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(27U, 13U), 1, DataType::F16),
- TensorInfo(TensorShape(27U, 11U), 1, DataType::F32),
- TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
- TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
- TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
- TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM16), // Unsupported data type
- TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
- TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
- TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
- TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
-
- })),
- framework::dataset::make("beta", { 1.0,
- 2.0,
- 2.0,
- 1.0,
- 1.0,
- 1.0,
- 1.0,
- 1.0,
- 1.0,
- 1.0,
- })),
- framework::dataset::make("axis", {
- 0,
- 0,
- 1, // Invalid as axis != 0
- 0,
- 0,
- 0,
- -3, // Invalid as axis != 0
- 2, // Invalid as axis != 0
- 1, // Invalid as axis != 0
- -1, // Invalid as axis != 0
- })),
- framework::dataset::make("Expected", { false, false, false, true, false, false, false, false, false, false})),
- input_info, output_info, beta, axis, expected)
+DATA_TEST_CASE(Validate, framework::DatasetMode::DISABLED,
+ zip(
+ make("InputInfo", {
+ TensorInfo(TensorShape(27U, 13U), 1, DataType::F32), // Mismatching data types
+ TensorInfo(TensorShape(27U, 13U), 1, DataType::F32), // Mismatching shapes
+ TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+ TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+ TensorInfo(TensorShape(32U, 13U), 1, DataType::S32), // Unsupported data type
+ TensorInfo(TensorShape(32U, 13U), 1, DataType::F16),
+ TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+ TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+ TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+ TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+ }),
+ make("OutputInfo",{
+ TensorInfo(TensorShape(27U, 13U), 1, DataType::F16),
+ TensorInfo(TensorShape(27U, 11U), 1, DataType::F32),
+ TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+ TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+ TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+ TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM16), // Unsupported data type
+ TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+ TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+ TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+ TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+ }),
+ make("beta", {
+ 1.0,
+ 2.0,
+ 2.0,
+ 1.0,
+ 1.0,
+ 1.0,
+ 1.0,
+ 1.0,
+ 1.0,
+ 1.0,
+ }),
+ make("axis", {
+ 0,
+ 0,
+ 1, // Invalid as axis != 0
+ 0,
+ 0,
+ 0,
+ -3, // Invalid as axis != 0
+ 2, // Invalid as axis != 0
+ 1, // Invalid as axis != 0
+ -1, // Invalid as axis != 0
+ }),
+ make("Expected", { false, false, false, true, false, false, false, false, false, false})),
+ input_info, output_info, beta, axis, expected)
{
// Create a new workload sketch
CLCompileContext cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
@@ -122,33 +130,39 @@ using DynamicFusionSoftmaxLayerFixture = DynamicFusionSoftmaxValidationFixture<C
TEST_SUITE(FLOAT)
TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionSoftmaxLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SoftmaxLayerSmallShapes(),
- framework::dataset::make("DataType", DataType::F32)),
- framework::dataset::make("Beta", { 1.0f, 2.0f })),
- framework::dataset::make("Axis", { 0 })),
- framework::dataset::make("is_log", {false, true})))
+FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionSoftmaxLayerFixture<float>, framework::DatasetMode::DISABLED,
+ combine(
+ datasets::SoftmaxLayerSmallShapes(),
+ make("DataType", DataType::F32),
+ make("Beta", { 1.0f, 2.0f }),
+ make("Axis", { 0 }),
+ make("is_log", {false, true})))
{
// Validate output
validate(CLAccessor(_target), _reference, tolerance_f32);
}
-FIXTURE_DATA_TEST_CASE(RunLarge, DynamicFusionSoftmaxLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::SoftmaxLayerLargeShapes(),
- framework::dataset::make("DataType", DataType::F32)),
- framework::dataset::make("Beta", { 1.0f, 2.0f })),
- framework::dataset::make("Axis", { 0 })),
- framework::dataset::make("is_log", {false, true})))
+FIXTURE_DATA_TEST_CASE(RunLarge, DynamicFusionSoftmaxLayerFixture<float>, framework::DatasetMode::DISABLED,
+ combine(
+ datasets::SoftmaxLayerLargeShapes(),
+ make("DataType", DataType::F32),
+ make("Beta", { 1.0f, 2.0f }),
+ make("Axis", { 0 }),
+ make("is_log", {false, true})))
{
// Validate output
validate(CLAccessor(_target), _reference, tolerance_f32);
}
-FIXTURE_DATA_TEST_CASE(Run4D, DynamicFusionSoftmaxLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::SoftmaxLayer4DShapes(),
- framework::dataset::make("DataType", DataType::F32)),
- framework::dataset::make("Beta", { 1.0f, 2.0f })),
- framework::dataset::make("Axis", { 0 })),
- framework::dataset::make("is_log", {false, true})))
+FIXTURE_DATA_TEST_CASE(Run4D, DynamicFusionSoftmaxLayerFixture<float>, framework::DatasetMode::DISABLED,
+ combine(
+ datasets::SoftmaxLayer4DShapes(),
+ make("DataType", DataType::F32),
+ make("Beta", { 1.0f, 2.0f }),
+ make("Axis", { 0 }),
+ make("is_log", {false, true})))
{
// Validate output
validate(CLAccessor(_target), _reference, tolerance_f32);
@@ -156,33 +170,39 @@ FIXTURE_DATA_TEST_CASE(Run4D, DynamicFusionSoftmaxLayerFixture<float>, framework
TEST_SUITE_END() // FP32
TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionSoftmaxLayerFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SoftmaxLayerSmallShapes(),
- framework::dataset::make("DataType", DataType::F16)),
- framework::dataset::make("Beta", { 1.0f, 2.0f })),
- framework::dataset::make("Axis", { 0 })),
- framework::dataset::make("is_log", {false, true})))
+FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionSoftmaxLayerFixture<half>, framework::DatasetMode::DISABLED,
+ combine(
+ datasets::SoftmaxLayerSmallShapes(),
+ make("DataType", DataType::F16),
+ make("Beta", { 1.0f, 2.0f }),
+ make("Axis", { 0 }),
+ make("is_log", {false, true})))
{
// Validate output
validate(CLAccessor(_target), _reference, tolerance_f16);
}
-FIXTURE_DATA_TEST_CASE(RunLarge, DynamicFusionSoftmaxLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::SoftmaxLayerLargeShapes(),
- framework::dataset::make("DataType", DataType::F16)),
- framework::dataset::make("Beta", { 1.0f, 2.0f })),
- framework::dataset::make("Axis", { 0 })),
- framework::dataset::make("is_log", {false, true})))
+FIXTURE_DATA_TEST_CASE(RunLarge, DynamicFusionSoftmaxLayerFixture<half>, framework::DatasetMode::DISABLED,
+ combine(
+ datasets::SoftmaxLayerLargeShapes(),
+ make("DataType", DataType::F16),
+ make("Beta", { 1.0f, 2.0f }),
+ make("Axis", { 0 }),
+ make("is_log", {false, true})))
{
// Validate output
validate(CLAccessor(_target), _reference, tolerance_f16);
}
-FIXTURE_DATA_TEST_CASE(Run4D, DynamicFusionSoftmaxLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::SoftmaxLayer4DShapes(),
- framework::dataset::make("DataType", DataType::F16)),
- framework::dataset::make("Beta", { 1.0f, 2.0f })),
- framework::dataset::make("Axis", { 0 })),
- framework::dataset::make("is_log", {false, true})))
+FIXTURE_DATA_TEST_CASE(Run4D, DynamicFusionSoftmaxLayerFixture<half>, framework::DatasetMode::DISABLED,
+ combine(
+ datasets::SoftmaxLayer4DShapes(),
+ make("DataType", DataType::F16),
+ make("Beta", { 1.0f, 2.0f }),
+ make("Axis", { 0 }),
+ make("is_log", {false, true})))
{
// Validate output
validate(CLAccessor(_target), _reference, tolerance_f16);
@@ -197,5 +217,3 @@ TEST_SUITE_END() // CL
} // namespace validation
} // namespace test
} // namespace arm_compute
-
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF