115 files changed, 230 insertions, 15816 deletions
diff --git a/Android.bp b/Android.bp
index 2983e2e21d..670138b209 100644
--- a/Android.bp
+++ b/Android.bp
@@ -172,6 +172,7 @@ cc_library_static {
     proprietary: true,
     local_include_dirs: ["build/android-arm64v8a/src/core",
                          "build/android-arm64v8a/src/core/CL",
+                         "compute_kernel_writer/include",
                          "src/core/common",
                          "src/core/helpers",
                          "src/core/NEON/kernels/arm_gemm",
@@ -621,7 +622,6 @@ cc_library_static {
         "src/dynamic_fusion/sketch/attributes/ReshapeAttributes.cpp",
         "src/dynamic_fusion/sketch/attributes/ResizeAttributes.cpp",
         "src/dynamic_fusion/sketch/attributes/SoftmaxAttributes.cpp",
-        "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.cpp",
         "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp",
         "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp",
         "src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp",
@@ -634,8 +634,6 @@ cc_library_static {
         "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp",
         "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp",
         "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp",
-        "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.cpp",
-        "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.cpp",
         "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.cpp",
         "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp",
         "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.cpp",
@@ -657,19 +655,6 @@ cc_library_static {
         "src/dynamic_fusion/sketch/gpu/operators/GpuSub.cpp",
         "src/dynamic_fusion/sketch/gpu/operators/GpuTanh.cpp",
         "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.cpp",
-        "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp",
-        "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp",
-        "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp",
-        "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp",
-        "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp",
-        "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp",
-        "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.cpp",
-        "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.cpp",
-        "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp",
-        "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.cpp",
-        "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp",
-        "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp",
-        "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp",
         "src/gpu/cl/ClContext.cpp",
         "src/gpu/cl/ClKernelLibrary.cpp",
         "src/gpu/cl/ClQueue.cpp",
diff --git a/SConscript b/SConscript
index f0c42979ce..a9986feb32 100644
--- a/SConscript
+++ b/SConscript
@@ -564,12 +564,6 @@ if env['fixed_format_kernels']:
 # Dynamic fusion
 if env['experimental_dynamic_fusion']:
     lib_files += filelist['experimental']['dynamic_fusion']['common']
-    lib_files += filelist['experimental']['dynamic_fusion']['template_writer']
-
-if "ACL_INTERNAL_TEST_CKW_IN_DF" in env["extra_cxx_flags"]:
-    if not env["experimental_dynamic_fusion"]:
-        print("To use ACL_INTERNAL_TEST_CKW_IN_DF experimental_dynamic_fusion must be set to 1")
-        Exit(1)
     lib_files += filelist['experimental']['dynamic_fusion']['ckw_driver']
 
 # Logging files
diff --git a/SConstruct b/SConstruct
index 6f498b51c8..bad85e503d 100644
--- a/SConstruct
+++ b/SConstruct
@@ -227,9 +227,6 @@ if env['experimental_dynamic_fusion']:
     # Dynamic Fusion on GPU has a direct dependency on OpenCL and Compute Kernel Writer
     env['opencl'] = 1
 
-    # Build CKW by default
-    env["extra_cxx_flags"] += ' -DACL_INTERNAL_TEST_CKW_IN_DF'
-
 if env['opencl'] and env['embed_kernels'] and env['compress_kernels'] and env['os'] not in ['android']:
     print("Compressed kernels are supported only for android builds")
     Exit(1)
diff --git a/compute_kernel_writer/prototype/CMakeLists.txt b/compute_kernel_writer/prototype/CMakeLists.txt
deleted file mode 100644
index 439dcd3b7e..0000000000
--- a/compute_kernel_writer/prototype/CMakeLists.txt
+++ /dev/null
@@ -1,78 +0,0 @@
-# Copyright (c) 2023 Arm Limited.
-#
-# SPDX-License-Identifier: MIT
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to
-# deal in the Software without restriction, including without limitation the
-# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
-# sell copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-
-cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
-
-#---------------------------------------------------------------------
-# Prototype
-
-add_library(ckw_prototype
-    src/TileInfo.cpp
-    src/TensorInfo.cpp
-    src/Kernel.cpp
-    src/KernelWriter.cpp
-    src/OperandBase.cpp
-    src/TileOperand.cpp
-    src/TensorOperand.cpp
-    src/TensorTileSampler.cpp
-    src/KernelArgument.cpp
-)
-
-target_compile_options(ckw_prototype
-    PUBLIC
-    ${CKW_CXX_FLAGS}
-    "$<$<CXX_COMPILER_ID:GNU>:${GNU_WARNINGS}>"
-    "$<$<CONFIG:Debug>:${CKW_ASSERTS_OPTS}>"
-    "$<$<BOOL:${CKW_ENABLE_ASSERTS}>:${CKW_ASSERTS_OPTS}>"
-    ${CMAKE_CXX_FLAGS}
-    PRIVATE
-    $<$<CONFIG:Release>:-Os>
-)
-
-target_compile_definitions(ckw_prototype PUBLIC
-    $<$<CONFIG:Debug>:COMPUTE_KERNEL_WRITER_DEBUG_ENABLED>
-    $<$<CONFIG:Debug>:COMPUTE_KERNEL_WRITER_ASSERTS_ENABLED>
-    $<$<BOOL:${CKW_ENABLE_ASSERTS}>:COMPUTE_KERNEL_WRITER_ASSERTS_ENABLED>
-    $<$<BOOL:${CKW_ENABLE_OPENCL}>:COMPUTE_KERNEL_WRITER_OPENCL_ENABLED>
-)
-
-target_include_directories(ckw_prototype
-    PUBLIC ${CMAKE_CURRENT_LIST_DIR}/include
-    PRIVATE ${CMAKE_CURRENT_LIST_DIR}
-)
-
-#---------------------------------------------------------------------
-# Examples
-
-add_library(ckw_prototype_examples_common
-    examples/common/ExampleKernelWriter.cpp
-    examples/common/ExampleScopedKernelWriter.cpp
-    examples/common/ExampleComponentArgument.cpp
-)
-
-target_link_libraries(ckw_prototype_examples_common PUBLIC ckw_prototype)
-
-add_executable(ckw_prototype_examples_add_exp_store examples/add_exp_store.cpp)
-target_link_libraries(ckw_prototype_examples_add_exp_store PUBLIC ckw_prototype_examples_common)
-
-add_executable(writer_helper examples/writer_helper.cpp)
-target_link_libraries(writer_helper PUBLIC ckw_prototype)
diff --git a/compute_kernel_writer/prototype/examples/add_exp_store.cpp b/compute_kernel_writer/prototype/examples/add_exp_store.cpp
deleted file mode 100644
index 2b640ca01b..0000000000
--- a/compute_kernel_writer/prototype/examples/add_exp_store.cpp
+++ /dev/null
@@ -1,206 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "ckw/Error.h"
-#include "ckw/KernelArgument.h"
-#include "ckw/KernelWriter.h"
-#include "ckw/TensorOperand.h"
-#include "ckw/TensorTileSampler.h"
-#include "ckw/TileOperand.h"
-
-#include "common/ExampleComponentArgument.h"
-#include "common/ExampleKernelWriter.h"
-#include "common/ExampleScopedKernelWriter.h"
-#include <iostream>
-#include <vector>
-
-using namespace ckw;
-
-TensorTileSampler create_simple_sampler(ExampleScopedKernelWriter writer)
-{
-    TensorTileSampler sampler;
-
-    constexpr int32_t m0 = 4;
-    constexpr int32_t n0 = 4;
-
-    auto &gid_0 = writer->declare_tile("gid_0", DataType::Int32);
-    auto &gid_1 = writer->declare_tile("gid_1", DataType::Int32);
-    auto &gid_2 = writer->declare_tile("gid_2", DataType::Int32);
-
-    auto &const_0 = writer->declare_tile("0", 0);
-
-    writer->op_get_global_id(gid_0, 0);
-    writer->op_get_global_id(gid_1, 1);
-    writer->op_get_global_id(gid_2, 2);
-
-    sampler.x(gid_0);
-    sampler.y(gid_1);
-    sampler.z(const_0);
-    sampler.b(gid_2);
-
-    sampler.width(n0);
-    sampler.height(m0);
-
-    sampler.format(TensorSamplerFormat::C_WH_1);
-    sampler.address_mode_x(TensorSamplerAddressModeX::None);
-    sampler.address_mode_y(TensorSamplerAddressModeY::ClampToBorder);
-    sampler.address_mode_z(TensorSamplerAddressModeZ::Skip);
-
-    return sampler;
-}
-
-void op_binary_elementwise(ExampleScopedKernelWriter writer, std::vector<ExampleComponentArgument *> operands)
-{
-    auto lhs = operands.at(0);
-    auto rhs = operands.at(1);
-    auto dst = operands.at(2);
-
-    // Load the LHS and RHS tile and prepare the tensor sampler.
-    if (!lhs->has_tile() && !rhs->has_tile())
-    {
-        const auto sampler = create_simple_sampler(writer);
-
-        writer->op_load_once(lhs, sampler);
-        writer->op_load_once(rhs, sampler);
-    }
-    else if (lhs->has_tile())
-    {
-        const auto &sampler = lhs->tile_sampler();
-        writer->op_load_once(rhs, sampler);
-    }
-    else
-    {
-        const auto &sampler = rhs->tile_sampler();
-        writer->op_load_once(lhs, sampler);
-    }
-
-    auto       &lhs_tile = lhs->tile();
-    auto       &rhs_tile = rhs->tile();
-    const auto &sampler  = lhs->tile_sampler();
-
-    // Prepare the output tile.
-    if (!dst->has_tile())
-    {
-        auto &tile = writer->declare_tile("dst_tile", lhs_tile.tile_info());
-        dst->init_virtual_tensor(tile, sampler);
-    }
-
-    auto &dst_tile = dst->tile();
-
-    // Perform the operation.
-    writer->op_binary_expression(dst_tile, lhs_tile, BinaryOp::Add, rhs_tile);
-}
-
-void op_exp(ExampleScopedKernelWriter writer, std::vector<ExampleComponentArgument *> operands)
-{
-    auto src = operands.at(0);
-    auto dst = operands.at(1);
-
-    // Load the source tile and prepare the sampler.
-    if (!src->has_tile())
-    {
-        const auto sampler = create_simple_sampler(writer);
-        writer->op_load_once(src, sampler);
-    }
-
-    auto       &src_tile = src->tile();
-    const auto &sampler  = src->tile_sampler();
-
-    // Prepare the output tile.
-    if (!dst->has_tile())
-    {
-        auto &tile = writer->declare_tile("dst_tile", src_tile.tile_info());
-        dst->init_virtual_tensor(tile, sampler);
-    }
-
-    auto &dst_tile = dst->tile();
-
-    // Perform the operation.
-    writer->op_unary_elementwise_function(dst_tile, UnaryFunction::Exp, src_tile);
-}
-
-void op_store(ExampleScopedKernelWriter writer, std::vector<ExampleComponentArgument *> operands)
-{
-    auto src = operands.at(0);
-    auto dst = operands.at(1);
-
-    auto       &src_tile   = src->tile();
-    const auto &sampler    = src->tile_sampler();
-    auto       &dst_tensor = dst->tensor();
-
-    writer->op_store(dst_tensor, src_tile, sampler);
-}
-
-int main()
-{
-    Kernel              kernel("example", GpuTargetLanguage::OpenCL);
-    ExampleKernelWriter root_writer(kernel);
-
-    ExampleScopedKernelWriter writer(&root_writer);
-
-    const TensorInfo src0_info(DataType::Fp32, TensorShape({3, 10, 20, 1, 1}), TensorDataLayout::Nhwc, 0);
-    const TensorInfo src1_info(DataType::Fp32, TensorShape({3, 10, 20, 1, 1}), TensorDataLayout::Nhwc, 1);
-    const TensorInfo dst_info(DataType::Fp32, TensorShape({3, 10, 20, 1, 1}), TensorDataLayout::Nhwc, 2);
-
-    ExampleComponentArgument src0(
-        writer->declare_tensor_argument("src0", src0_info, TensorStorageType::BufferUint8Ptr));
-    ExampleComponentArgument src1(
-        writer->declare_tensor_argument("src1", src1_info, TensorStorageType::BufferUint8Ptr));
-    ExampleComponentArgument dst(writer->declare_tensor_argument("dst", dst_info, TensorStorageType::BufferUint8Ptr));
-
-    ExampleComponentArgument ans;
-
-    op_binary_elementwise(writer, {&src0, &src1, &ans});
-    op_exp(writer, {&ans, &ans});
-    op_store(writer, {&ans, &dst});
-
-    const auto arguments = kernel.arguments();
-
-    std::cout << "\n====================\nArguments:\n====================\n";
-
-    for (auto &arg : arguments)
-    {
-        switch (arg.type())
-        {
-            case ckw::KernelArgument::Type::TensorStorage:
-                std::cout << "* Tensor storage:   ID = " << arg.id() << ", type = " << std::hex << "0x"
-                          << static_cast<uint32_t>(arg.tensor_storage_type()) << std::dec << "\n";
-                break;
-
-            case ckw::KernelArgument::Type::TensorComponent:
-                std::cout << "* Tensor component: ID = " << arg.id() << ", type = " << std::hex << "0x"
-                          << static_cast<uint32_t>(arg.tensor_component_type()) << std::dec << "\n";
-                break;
-
-            default:
-                CKW_ASSERT(false);
-        }
-    }
-
-    std::cout << "\n====================\nCode:\n====================\n";
-    const auto code = root_writer.generate_code();
-    std::cout << code;
-
-    return 0;
-}
diff --git a/compute_kernel_writer/prototype/examples/common/ExampleComponentArgument.cpp b/compute_kernel_writer/prototype/examples/common/ExampleComponentArgument.cpp
deleted file mode 100644
index 55223dae0e..0000000000
--- a/compute_kernel_writer/prototype/examples/common/ExampleComponentArgument.cpp
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "ExampleComponentArgument.h"
-
-#include "ckw/Error.h"
-
-ExampleComponentArgument::ExampleComponentArgument()
-{
-}
-
-ExampleComponentArgument::ExampleComponentArgument(ckw::TensorOperand &tensor) : _tensor(&tensor)
-{
-}
-
-ExampleComponentArgument &ExampleComponentArgument::init_virtual_tensor(ckw::TileOperand             &tile,
-                                                                        const ckw::TensorTileSampler &tile_sampler)
-{
-    CKW_ASSERT(_tile == nullptr);
-
-    _tile         = &tile;
-    _tile_sampler = tile_sampler;
-
-    return *this;
-}
-
-bool ExampleComponentArgument::has_tensor() const
-{
-    return _tensor != nullptr;
-}
-
-ckw::TensorOperand &ExampleComponentArgument::tensor()
-{
-    CKW_ASSERT(_tensor != nullptr);
-
-    return *_tensor;
-}
-
-const ckw::TensorOperand &ExampleComponentArgument::tensor() const
-{
-    CKW_ASSERT(_tensor != nullptr);
-
-    return *_tensor;
-}
-
-bool ExampleComponentArgument::has_tile() const
-{
-    return _tile != nullptr;
-}
-
-ckw::TileOperand &ExampleComponentArgument::tile()
-{
-    CKW_ASSERT(_tile != nullptr);
-
-    return *_tile;
-}
-
-const ckw::TileOperand &ExampleComponentArgument::tile() const
-{
-    CKW_ASSERT(_tile != nullptr);
-
-    return *_tile;
-}
-
-ckw::TensorTileSampler &ExampleComponentArgument::tile_sampler()
-{
-    CKW_ASSERT(_tile != nullptr);
-
-    return _tile_sampler;
-}
-
-const ckw::TensorTileSampler &ExampleComponentArgument::tile_sampler() const
-{
-    CKW_ASSERT(_tile != nullptr);
-
-    return _tile_sampler;
-}
diff --git a/compute_kernel_writer/prototype/examples/common/ExampleComponentArgument.h b/compute_kernel_writer/prototype/examples/common/ExampleComponentArgument.h
deleted file mode 100644
index 0e029b1157..0000000000
--- a/compute_kernel_writer/prototype/examples/common/ExampleComponentArgument.h
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef CKW_PROTOTYPE_EXAMPLES_COMMON_EXAMPLECOMPONENTARGUMENT_H
-#define CKW_PROTOTYPE_EXAMPLES_COMMON_EXAMPLECOMPONENTARGUMENT_H
-
-#include "ckw/TensorTileSampler.h"
-
-namespace ckw
-{
-class TensorOperand;
-
-class TileOperand;
-} // namespace ckw
-
-/** The argument of a dynamic fusion component which can be either user tensor or virtual tensor. */
-class ExampleComponentArgument
-{
-public:
-    /** Initialize a new instance of @ref ExampleComponentArgument class for empty virtual tensor. */
-    ExampleComponentArgument();
-
-    /** Initialize a new instance of @ref ExampleComponentArgument class for user tensor.
-     *
-     * @param[in] tensor The user tensor.
-     */
-    explicit ExampleComponentArgument(ckw::TensorOperand &tensor);
-
-    /** Set virtual tensor information (tile, sampler) for the argument.
-     *
-     * If the component is a user tensor, it can be treated as virtual tensor as well
-     * and won't be loaded again using @ref ExampleKernelWriter::op_load_once method.
-     *
-     * @param[in] tile    The tile that has been loaded.
-     * @param[in] sampler The tensor sampling information that has been used to load the tile.
-     */
-    ExampleComponentArgument &init_virtual_tensor(ckw::TileOperand &tile, const ckw::TensorTileSampler &sampler);
-
-    /** Get whether the argument is a user tensor. */
-    bool has_tensor() const;
-
-    /** Get the tensor operand.
-     *
-     * If the tensor is not available, throw an error.
-     */
-    ckw::TensorOperand &tensor();
-
-    /** Get the tensor operand.
-     *
-     * If the tensor is not available, throw an error.
-     */
-    const ckw::TensorOperand &tensor() const;
-
-    /** Get whether the argument contains a tile.
-     *
-     * The argument can be either a user tensor that has been loaded,
-     * or a virtual tensor (i.e. a tile with tensor sampling information).
-     */
-    bool has_tile() const;
-
-    /** Get the tile operand.
-     *
-     * If the tile is not available, throw an error.
-     */
-    ckw::TileOperand &tile();
-
-    /** Get the tile operand.
-     *
-     * If the tile is not available, throw an error.
-     */
-    const ckw::TileOperand &tile() const;
-
-    /** Get the tensor sampling information for the tile.
-     *
-     * If the tile is not available, throw an error.
-     */
-    ckw::TensorTileSampler &tile_sampler();
-
-    /** Get the tensor sampling information for the tile.
-     *
-     * If the tile is not available, throw an error.
-     */
-    const ckw::TensorTileSampler &tile_sampler() const;
-
-private:
-    ckw::TensorOperand    *_tensor{nullptr};
-    ckw::TileOperand      *_tile{nullptr};
-    ckw::TensorTileSampler _tile_sampler{};
-};
-
-#endif // CKW_PROTOTYPE_EXAMPLES_COMMON_EXAMPLECOMPONENTARGUMENT_H
diff --git a/compute_kernel_writer/prototype/examples/common/ExampleKernelWriter.cpp b/compute_kernel_writer/prototype/examples/common/ExampleKernelWriter.cpp
deleted file mode 100644
index 1734ce8823..0000000000
--- a/compute_kernel_writer/prototype/examples/common/ExampleKernelWriter.cpp
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "ExampleKernelWriter.h"
-
-#include "ckw/Error.h"
-#include "ckw/TileInfo.h"
-
-#include "ExampleComponentArgument.h"
-
-ExampleKernelWriter::ExampleKernelWriter(ckw::Kernel &kernel) : KernelWriter(kernel)
-{
-}
-
-void ExampleKernelWriter::op_load_once(ExampleComponentArgument *tensor_or_tile, const ckw::TensorTileSampler &sampler)
-{
-    if (!tensor_or_tile->has_tile())
-    {
-        CKW_ASSERT(tensor_or_tile->has_tensor());
-
-        auto &tensor = tensor_or_tile->tensor();
-
-        const auto tile_name = tensor.name() + "_tile";
-        auto      &tile =
-            declare_tile(tile_name.c_str(), ckw::TileInfo(tensor.data_type(), sampler.height(), sampler.width()));
-
-        op_load(tile, tensor, sampler);
-
-        tensor_or_tile->init_virtual_tensor(tile, sampler);
-    }
-}
diff --git a/compute_kernel_writer/prototype/examples/common/ExampleKernelWriter.h b/compute_kernel_writer/prototype/examples/common/ExampleKernelWriter.h
deleted file mode 100644
index 1528c3d933..0000000000
--- a/compute_kernel_writer/prototype/examples/common/ExampleKernelWriter.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef CKW_PROTOTYPE_EXAMPLES_COMMON_EXAMPLEKERNELWRITER_H
-#define CKW_PROTOTYPE_EXAMPLES_COMMON_EXAMPLEKERNELWRITER_H
-
-#include "ckw/KernelWriter.h"
-#include "ckw/TensorTileSampler.h"
-
-class ExampleComponentArgument;
-
-namespace ckw
-{
-class Kernel;
-} // namespace ckw
-
-/** Extended implementation of kernel writer for dynamic fusion. */
-class ExampleKernelWriter : public ckw::KernelWriter
-{
-public:
-    /** Initialize a new instance of @ref ExampleKernelWriter class.
-     *
-     * @param[in] kernel The kernel to be generated.
-     */
-    explicit ExampleKernelWriter(ckw::Kernel &kernel);
-
-    /** Load the user tensor to the tile in the same component argument if it hasn't been loaded.
-     *
-     * @param[in] tensor_or_tile The component argument that is either a user tensor or a virtual tensor.
-     * @param[in] sampler        The tensor sampling information to load the tile.
-     */
-    void op_load_once(ExampleComponentArgument *tensor_or_tile, const ckw::TensorTileSampler &sampler);
-};
-
-#endif // CKW_PROTOTYPE_EXAMPLES_COMMON_EXAMPLEKERNELWRITER_H
diff --git a/compute_kernel_writer/prototype/examples/common/ExampleScopedKernelWriter.cpp b/compute_kernel_writer/prototype/examples/common/ExampleScopedKernelWriter.cpp
deleted file mode 100644
index 784d5ffb96..0000000000
--- a/compute_kernel_writer/prototype/examples/common/ExampleScopedKernelWriter.cpp
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "ExampleScopedKernelWriter.h"
-
-#include "ExampleKernelWriter.h"
-
-ExampleScopedKernelWriter::ExampleScopedKernelWriter(ExampleKernelWriter *writer)
-    : _writer(writer), _parent_id_space(writer->id_space())
-{
-    _writer->next_id_space();
-}
-
-ExampleScopedKernelWriter::ExampleScopedKernelWriter(const ExampleScopedKernelWriter &other)
-    : _writer(other._writer), _parent_id_space(other._writer->id_space())
-{
-    _writer->next_id_space();
-}
-
-ExampleKernelWriter *ExampleScopedKernelWriter::operator->()
-{
-    return _writer;
-}
-
-const ExampleKernelWriter *ExampleScopedKernelWriter::operator->() const
-{
-    return _writer;
-}
-
-ExampleKernelWriter *ExampleScopedKernelWriter::writer()
-{
-    return _writer;
-}
-
-const ExampleKernelWriter *ExampleScopedKernelWriter::writer() const
-{
-    return _writer;
-}
diff --git a/compute_kernel_writer/prototype/examples/common/ExampleScopedKernelWriter.h b/compute_kernel_writer/prototype/examples/common/ExampleScopedKernelWriter.h
deleted file mode 100644
index 4655b1897e..0000000000
--- a/compute_kernel_writer/prototype/examples/common/ExampleScopedKernelWriter.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef CKW_PROTOTYPE_EXAMPLES_COMMON_EXAMPLESCOPEDKERNELWRITER_H
-#define CKW_PROTOTYPE_EXAMPLES_COMMON_EXAMPLESCOPEDKERNELWRITER_H
-
-#include <cstdint>
-
-class ExampleKernelWriter;
-
-/** Helper to automatically manage kernel writer ID space. */
-class ExampleScopedKernelWriter
-{
-public:
-    /** Initialize a new instance of @ref ExampleScopedKernelWriter class. */
-    explicit ExampleScopedKernelWriter(ExampleKernelWriter *writer);
-
-    /** Create a new scope from the specified scoped kernel writer. */
-    ExampleScopedKernelWriter(const ExampleScopedKernelWriter &other);
-
-    /** Assignment is disallowed. */
-    ExampleScopedKernelWriter &operator=(const ExampleScopedKernelWriter &) = delete;
-
-    /** Access the underlying kernel writer. */
-    ExampleKernelWriter *operator->();
-
-    /** Access the underlying kernel writer. */
-    const ExampleKernelWriter *operator->() const;
-
-    /** Get the kernel writer. */
-    ExampleKernelWriter *writer();
-
-    /** Get the kernel writer. */
-    const ExampleKernelWriter *writer() const;
-
-private:
-    ExampleKernelWriter *_writer;
-    int32_t              _parent_id_space;
-};
-
-#endif // CKW_PROTOTYPE_EXAMPLES_COMMON_EXAMPLESCOPEDKERNELWRITER_H
diff --git a/compute_kernel_writer/prototype/examples/writer_helper.cpp b/compute_kernel_writer/prototype/examples/writer_helper.cpp
deleted file mode 100644
index 8623afbf50..0000000000
--- a/compute_kernel_writer/prototype/examples/writer_helper.cpp
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
-* Copyright (c) 2023 Arm Limited.
-*
-* SPDX-License-Identifier: MIT
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to
-* deal in the Software without restriction, including without limitation the
-* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
-* sell copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in all
-* copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-
-#include "ckw/KernelWriter.h"
-#include "ckw/TensorTileSampler.h"
-
-#include "../include/ckw/KernelWriterHelper.h"
-#include <iostream>
-
-using namespace ckw;
-
-TensorTileSampler create_simple_sampler(KernelWriter &writer)
-{
-    TensorTileSampler sampler;
-
-    constexpr int32_t m0 = 1;
-    constexpr int32_t n0 = 1;
-
-    auto &gid_0 = writer.declare_tile("gid_0", DataType::Int32);
-    auto &gid_1 = writer.declare_tile("gid_1", DataType::Int32);
-    auto &gid_2 = writer.declare_tile("gid_2", DataType::Int32);
-
-    auto &const_0 = writer.declare_tile("0", 0);
-
-    writer.op_get_global_id(gid_0, 0);
-    writer.op_get_global_id(gid_1, 1);
-    writer.op_get_global_id(gid_2, 2);
-
-    sampler.x(gid_0);
-    sampler.y(gid_1);
-    sampler.z(gid_2);
-    sampler.b(const_0);
-
-    sampler.width(n0);
-    sampler.height(m0);
-
-    sampler.format(TensorSamplerFormat::C_WH_1);
-    sampler.address_mode_x(TensorSamplerAddressModeX::None);
-    sampler.address_mode_y(TensorSamplerAddressModeY::ClampToBorder);
-    sampler.address_mode_z(TensorSamplerAddressModeZ::Skip);
-
-    return sampler;
-}
-
-int main()
-{
-    Kernel                           kernel("test", GpuTargetLanguage::OpenCL);
-    KernelWriterHelper<KernelWriter> writer(kernel);
-
-    const TensorInfo src_info(DataType::Fp32, TensorShape({1, 1, 1, 1, 1}), TensorDataLayout::Nhwc, 0);
-    const TensorInfo dst_info(DataType::Fp32, TensorShape({1, 1, 1, 1, 1}), TensorDataLayout::Nhwc, 1);
-
-    auto &src_tensor = writer.declare_tensor_argument("src", src_info);
-    auto &dst_tensor = writer.declare_tensor_argument("dst", dst_info);
-
-    const auto sampler = create_simple_sampler(writer);
-
-    auto &src = writer.declare_tile("src_tile", TileInfo(src_tensor.data_type(), sampler.height(), sampler.width()));
-    auto &other =
-        writer.declare_tile("other_tile", TileInfo(src_tensor.data_type(), sampler.height(), sampler.width()));
-    auto &dst = writer.declare_tile("dst_tile", TileInfo(src_tensor.data_type(), sampler.height(), sampler.width()));
-
-    writer.op_load(src, src_tensor, sampler);
-    writer.op_load(other, src_tensor, sampler);
-    writer.op_load(dst, dst_tensor, sampler);
-
-    auto test       = dst ^ src ^ other;
-    auto other_test = logical_and(dst, src, other);
-    writer.op_assign(dst, logical_and(dst, src, other));
-    writer.op_assign(dst, test);
-    writer.op_assign(dst, other_test);
-    writer.op_assign(dst, operator^(operator^(dst, src), other));
-
-    writer.op_if(exp(src) == dst, [&] { writer.op_binary_expression(dst, src, BinaryOp::Add, src); })
-        .op_else_if(exp(src) > dst, [&] { writer.op_binary_expression(dst, src, BinaryOp::Add, src); })
-        .op_else([&] { writer.op_assign(dst, src); });
-
-    writer.op_assign(dst, src + src * src);
-    writer.op_assign(dst, src * max(src, dst) + src);
-    writer.op_assign(dst, src * select(src, dst, src) + src);
-
-    writer.op_assign(dst, src ^ dst);
-    writer.op_assign(dst, ~src);
-
-    writer.op_for_loop(dst < src, dst += src, [&] { writer.op_assign(dst, src + dst); });
-
-    writer.op_assign(dst += src);
-    writer.op_assign(dst += exp(src));
-
-    std::cout << "======== KERNEL ========" << std::endl;
-    std::cout << writer.generate_code() << std::endl;
-}
diff --git a/compute_kernel_writer/prototype/include/ckw/Error.h b/compute_kernel_writer/prototype/include/ckw/Error.h
deleted file mode 100644
index aab713c817..0000000000
--- a/compute_kernel_writer/prototype/include/ckw/Error.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef CKW_PROTOTYPE_INCLUDE_CKW_ERROR_H
-#define CKW_PROTOTYPE_INCLUDE_CKW_ERROR_H
-
-#include <stdexcept>
-#include <string>
-
-namespace ckw
-{
-
-/** If the condition is not met, throw an std::runtime_error with the specified message.
- *
- * @param[in] cond The condition that is expected to be true.
- * @param[in] msg  The error message when the condition is not met.
- */
-#define CKW_ASSERT_MSG(cond, msg)            \
-    do                                       \
-    {                                        \
-        if (!(cond))                         \
-        {                                    \
-            throw ::std::runtime_error(msg); \
-        }                                    \
-    } while (false)
-
-/** If the condition is not met, throw an std::runtime_error.
- *
- * @param[in] cond The condition that is expected to be true.
- */
-#define CKW_ASSERT(cond) CKW_ASSERT_MSG(cond, #cond)
-
-/** If the precondition is met but the consequence is not met, throw an std::runtime_error.
- *
- * @param[in] precond The condition if is met requires the consequence must also be met.
- * @param[in] cond    The condition that is expected to be true if the precondition is true.
- */
-#define CKW_ASSERT_IF(precond, cond) CKW_ASSERT_MSG(!(precond) || ((precond) && (cond)), #precond " |-> " #cond)
-
-/** Mark the variables as unused.
- *
- * @param[in] ... Variables which are unused.
- */
-#define CKW_UNUSED(...) ::ckw::ignore_unused(__VA_ARGS__) // NOLINT
-
-/** Mark the variables as unused.
- *
- * @param[in] ... Variables which are unused.
- */
-template <typename... T>
-inline void ignore_unused(T &&...)
-{
-}
-
-} // namespace ckw
-
-#endif // CKW_INCLUDE_CKW_ERROR_H
diff --git a/compute_kernel_writer/prototype/include/ckw/Kernel.h b/compute_kernel_writer/prototype/include/ckw/Kernel.h
deleted file mode 100644
index ba31a29ba7..0000000000
--- a/compute_kernel_writer/prototype/include/ckw/Kernel.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef CKW_PROTOTYPE_INCLUDE_CKW_KERNEL_H
-#define CKW_PROTOTYPE_INCLUDE_CKW_KERNEL_H
-
-#include "ckw/KernelArgument.h"
-#include "ckw/OperandBase.h"
-#include "ckw/types/GpuTargetLanguage.h"
-
-#include <map>
-#include <memory>
-#include <string>
-#include <vector>
-
-namespace ckw
-{
-
-class TileOperand;
-
-namespace prototype
-{
-class GpuKernelWriterDataHolder;
-} // namespace prototype
-
-/** The target for kernel writer to write into. */
-class Kernel
-{
-public:
-    /** Constructor
-     *
-     * @param[in] language The programming language to write the kernel.
-     */
-    Kernel(GpuTargetLanguage language);
-    /** Constructor
-     *
-     * @param[in] name     The name of the kernel function.
-     * @param[in] language The programming language to write the kernel.
-     */
-    Kernel(const char *name, GpuTargetLanguage language);
-
-    /** Destructor */
-    ~Kernel();
-
-    /** Get the name of the kernel function. */
-    const std::string &name() const;
-
-    /** Set the name of the kernel function.
-     *
-     * @param[in] name     The name of the kernel function.
-     */
-    void name(const std::string &name);
-
-    /** Get the list of kernel arguments. */
-    ::std::vector<KernelArgument> arguments() const;
-
-    /** (Internal use only) Register the tile operand.
-     *
-     * @param operand The tile operand to be registered.
-     */
-    TileOperand &register_operand(::std::unique_ptr<TileOperand> operand);
-
-    /** (Internal use only) Register the tensor operand.
-     *
-     * @param operand The tensor operand to be registered.
-     */
-    TensorOperand &register_operand(::std::unique_ptr<TensorOperand> operand);
-
-    /** (Internal use only) Get the implementation data. */
-    prototype::GpuKernelWriterDataHolder *impl();
-
-private:
-    ::std::string                                             _name;
-    ::std::unique_ptr<prototype::GpuKernelWriterDataHolder>   _kernel;
-    ::std::map<::std::string, ::std::unique_ptr<OperandBase>> _operands;
-    ::std::map<int32_t, TensorOperand *>                      _tensor_id_operands;
-};
-
-} // namespace ckw
-
-#endif // CKW_PROTOTYPE_INCLUDE_CKW_KERNEL_H
diff --git a/compute_kernel_writer/prototype/include/ckw/KernelArgument.h b/compute_kernel_writer/prototype/include/ckw/KernelArgument.h
deleted file mode 100644
index 3384a20aef..0000000000
--- a/compute_kernel_writer/prototype/include/ckw/KernelArgument.h
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef CKW_PROTOTYPE_INCLUDE_CKW_KERNELARGUMENT_H
-#define CKW_PROTOTYPE_INCLUDE_CKW_KERNELARGUMENT_H
-
-#include "ckw/TensorInfo.h"
-
-#include <cstdint>
-
-namespace ckw
-{
-
-class TensorOperand;
-class TensorComponentOperand;
-
-/** A kernel argument which can be either a tensor storage or a tensor component. */
-class KernelArgument
-{
-public:
-    /** The type of kernel argument. */
-    enum class Type : int32_t
-    {
-        /** The argument that provides the read and/or write access to the tensor data.
-         *
-         * See @ref ckw::TensorStorage to see the list of supported storage type.
-         */
-        TensorStorage,
-
-        /** The argument that provides extra information about the tensor.
-         *
-         * See @ref ckw::TensorComponent to see the list of supported component.
-         */
-        TensorComponent,
-    };
-
-    /** Initialize a new instance of kernel argument class for a tensor storage argument.
-     *
-     * @param[in] tensor The tensor whose storage is exposed to kernel arguments.
-     */
-    KernelArgument(TensorOperand &tensor);
-
-    /** Initialize a new instance of kernel argument class for a tensor component argument.
-     *
-     * @param[in] tensor_component The tensor component to be exposed to kernel arguments.
-     */
-    KernelArgument(TensorComponentOperand &tensor_component);
-
-    /** Get the type of kernel argument. */
-    Type type() const;
-
-    /** Get the argument ID.
-     *
-     * This method can be used to get the tensor info ID of both tensor storage and tensor component arguments.
-     */
-    int32_t id() const;
-
-    /** Get the type of tensor storage.
-     *
-     * This method can only be used for tensor storage argument.
-     */
-    TensorStorageType tensor_storage_type() const;
-
-    /** Get the tensor component type.
-     *
-     * This method can only be used for tensor component argument.
-     */
-    TensorComponentType tensor_component_type() const;
-
-private:
-    Type    _type;
-    int32_t _id;
-
-    union SubId
-    {
-        int32_t             unknown;
-        TensorStorageType   tensor_storage_type;
-        TensorComponentType tensor_component_type;
-    };
-
-    SubId _sub_id{0};
-};
-
-} // namespace ckw
-
-#endif // CKW_PROTOTYPE_INCLUDE_CKW_KERNELARGUMENT_H
diff --git a/compute_kernel_writer/prototype/include/ckw/KernelWriter.h b/compute_kernel_writer/prototype/include/ckw/KernelWriter.h
deleted file mode 100644
index f9e0066f91..0000000000
--- a/compute_kernel_writer/prototype/include/ckw/KernelWriter.h
+++ /dev/null
@@ -1,338 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef CKW_PROTOTYPE_INCLUDE_CKW_KERNELWRITER_H
-#define CKW_PROTOTYPE_INCLUDE_CKW_KERNELWRITER_H
-
-#include "ckw/Kernel.h"
-#include "ckw/TensorInfo.h"
-#include "ckw/TensorOperand.h"
-#include "ckw/TileInfo.h"
-#include "ckw/TileOperand.h"
-#include "ckw/types/ConvertPolicy.h"
-#include "ckw/types/Functions.h"
-#include "ckw/types/Operators.h"
-
-#include <memory>
-
-namespace ckw
-{
-
-namespace prototype
-{
-struct GpuKernelWriterAttribute;
-
-class IGpuKernelWriter;
-} // namespace prototype
-
-/** Kernel writer. */
-class KernelWriter
-{
-public:
-    // =============================================================================================
-    // Constructors and destructor
-    // =============================================================================================
-
-    /** Initialize a new instance of kernel writer.
-     *
-     * @param[in] kernel The kernel to be written to.
-     */
-    explicit KernelWriter(Kernel &kernel);
-
-    /** Destructor */
-    ~KernelWriter();
-
-    /** No copy constructor. */
-    KernelWriter(const KernelWriter &) = delete;
-
-    /** No copy assignment. */
-    KernelWriter &operator=(const KernelWriter &) = delete;
-
-    // =============================================================================================
-    // Scope management
-    // =============================================================================================
-
-    /** Get the current ID space. */
-    int32_t id_space() const;
-
-    /** Set the current ID space. */
-    KernelWriter &id_space(int32_t id_space);
-
-    /** Switch to and return a new ID space. */
-    int32_t next_id_space();
-
-    // =============================================================================================
-    // Tensor and tile declaration
-    // =============================================================================================
-
-    /** Declare a tensor argument.
-     *
-     * @param[in] name         The name of the tensor.
-     * @param[in] info         The tensor info.
-     * @param[in] storage_type The tensor storage type.
-     *
-     * @return The @ref TensorOperand object.
-     */
-    TensorOperand &declare_tensor_argument(const std::string &name,
-                                           const TensorInfo  &info,
-                                           TensorStorageType  storage_type = TensorStorageType::BufferUint8Ptr);
-
-    /** Declare a compile-time constant scalar argument.
-     *
-     * @param[in] name  The name of the tile.
-     * @param[in] value The value of the tile.
-     *
-     * @return The @ref TileOperand object.
-     */
-    TileOperand &declare_tile_argument(const std::string &name, int32_t value);
-
-    /** Declare a new tile.
-     *
-     * The name of the tile must be unique in the current ID space.
-     *
-     * @param[in] name The name of the tile.
-     * @param[in] ...  The necessary arguments to create a new @ref TileOperand.
-     *
-     * @return The @ref TileOperand object.
-     */
-    template <typename... TArgs>
-    TileOperand &declare_tile(const std::string &name, TArgs &&...args)
-    {
-        const auto var_name = generate_variable_name(name);
-        auto       operand  = std::make_unique<TileOperand>(var_name, ::std::forward<TArgs>(args)...);
-
-        return declare_tile_operand(std::move(operand));
-    }
-
-    // =============================================================================================
-    // Load and store
-    // =============================================================================================
-
-    /** Load the data from the tensor memory to the tile using the sampling information.
-     *
-     * @param[out] tile       The tile to be loaded.
-     * @param[in]  tensor     The tensor to be read.
-     * @param[in]  sampler    The tensor sampling information.
-     * @param[in]  dilation_y Dilation in the Y dimension.
-     */
-    void op_load(TileOperand             &tile,
-                 const TensorOperand     &tensor,
-                 const TensorTileSampler &sampler,
-                 const TileOperand       &dilation_y = TileOperand("dil_y", 1));
-
-    /** Load the data from the tensor memory to the tile using the indirect buffer approach and respective of the sampling information.
-     *
-     * @param[out] tile    The tile to be loaded.
-     * @param[in]  tensor  The tensor to be read.
-     * @param[in]  sampler The tensor sampling information.
-     */
-    void op_load_indirect(TileOperand &tile, const TensorOperand &tensor, const TensorTileSampler &sampler);
-
-    /** Construct an indirection buffer in @p tile containing the precalculated addresses of elements in the source tensor.
-     *
-     * @param[out] tile    The tile to be loaded.
-     * @param[in]  tensor  The tensor the be read.
-     * @param[in]  sampler The tensor sampling information.
-     * @param[in]  x       The X coordinate.
-     * @param[in]  y       The Y coordinate.
-     * @param[in]  x_off   Offset in the X dimension.
-     * @param[in]  y_off   Offset in the Y dimension.
-     */
-    void util_get_indirect_buffer(TileOperand             &tile,
-                                  const TensorOperand     &tensor,
-                                  const TensorTileSampler &sampler,
-                                  const TileOperand       &x,
-                                  const TileOperand       &y,
-                                  const TileOperand       &x_off,
-                                  const TileOperand       &y_off);
-
-    /** Store the tile to the tensor using the specified sampling information.
-     *
-     * @param[out] dst     The tensor that the tile is written to.
-     * @param[in]  src     The tile to be stored.
-     * @param[in]  sampler The tensor sampling information.
-     */
-    void op_store(TensorOperand &tensor, const TileOperand &tile, const TensorTileSampler &sampler);
-
-    // =============================================================================================
-    // Data processing
-    // =============================================================================================
-
-    /** Write assignment: `<dst> = <src>;`.
-     *
-     * @param[out] dst The destination tile.
-     * @param[in]  src The source tile.
-     */
-    void op_assign(const TileOperand &dst, const TileOperand &src);
-
-    /** Write the cast: `<dst> = convert_<dst.type><_sat>(<src>);`.
-     *
-     * @param[out] dst      The destination tile.
-     * @param[in]  src      The source tile.
-     * @param[in]  policy   The policy governing the behavior of the cast.
-     */
-    void op_cast_expression(const TileOperand &dst, const TileOperand &src, ConvertPolicy policy);
-
-    /** Write the unary expression: `<dst> = <op> <src>`.
-     *
-     * @param[out]  dst The destination tile.
-     * @param[in]   op  The unary operator.
-     * @param[in]   src The source tile.
-     */
-    void op_unary_expression(const TileOperand &dst, UnaryOp op, const TileOperand &src);
-
-    /** Write binary expression: `<dst> = <lhs> <op> <rhs>;`.
-     *
-     * @param[out] dst  The destination tile.
-     * @param[in]  lhs  The LHS tile.
-     * @param[in]  op   The binary operator.
-     * @param[in]  rhs  The RHS tile.
-     */
-    void op_binary_expression(const TileOperand &dst, const TileOperand &lhs, BinaryOp op, const TileOperand &rhs);
-
-    /** Write function applied to scalar value: `<dst> = <func>(<src>);`.
-     *
-     * @param[out] dst  The destination tile.
-     * @param[in]  func The function to be applied to the source tile.
-     * @param[in]  src  The source tile.
-     */
-    void op_unary_elementwise_function(const TileOperand &dst, UnaryFunction func, const TileOperand &src);
-
-    /** Write function applied to scalar value: `<dst> = <func>(<first>, <second>);`.
-     *
-     * @param[out] dst      The destination tile.
-     * @param[in]  func     The function to be applied to the source tiles.
-     * @param[in]  first    The first argument tile.
-     * @param[in]  second   The second argument tile.
-     */
-    void op_binary_elementwise_function(const TileOperand &dst,
-                                        BinaryFunction     func,
-                                        const TileOperand &first,
-                                        const TileOperand &second);
-
-    /** Write function applied to scalar value: `<dst> = <func>(<first>, <second>, <third>);`.
-     *
-     * @param[out] dst      The destination tile.
-     * @param[in]  func     The function to be applied to the source tiles.
-     * @param[in]  first    The first argument tile.
-     * @param[in]  second   The second argument tile.
-     * @param[in]  third    The third argument tile.
-     */
-    void op_ternary_elementwise_function(const TileOperand &dst,
-                                         TernaryFunction    func,
-                                         const TileOperand &first,
-                                         const TileOperand &second,
-                                         const TileOperand &third);
-
-    /** Write if-statement: `if(<lhs> <op> <rhs>) { <body> }`.
-     *
-     * @param[in] lhs   The LHS tile of the condition.
-     * @param[in] op    The relational binary operator.
-     * @param[in] rhs   The RHS tile of the condition.
-     * @param[in] body  The body of the if-statement.
-     */
-    void op_if(const TileOperand &lhs, BinaryOp op, const TileOperand &rhs, const std::function<void()> &body);
-
-    /** Write else-if-statement: `else if(<lhs> <op> <rhs>) { <body> }`.
-     *
-     * @param[in] lhs   The LHS tile of the condition.
-     * @param[in] op    The relational binary operator.
-     * @param[in] rhs   The RHS tile of the condition.
-     * @param[in] body  The body of the else-if-statement.
-     */
-    void op_else_if(const TileOperand &lhs, BinaryOp op, const TileOperand &rhs, const std::function<void()> &body);
-
-    /** Write an else-statement: `else { <body> }`.
-     *
-     * @param[in] body The body of the else-statement.
-     */
-    void op_else(const std::function<void()> &body);
-
-    /** Write for-loops: `for(; <var> <cond_op> <cond_value>; <var> <update_op> <update_value>) { body }`.
-     *
-     * @param[in]       var_name          The name of the variable used in condition.
-     * @param[in]       cond_op           The relational binary operator used in condition.
-     * @param[in]       cond_value_name   The value which the variable is compared against.
-     * @param[in]       update_var_name   The name of the variable which is updated.
-     * @param[in]       update_op         The assignment operator used for updating the update value.
-     * @param[in, out]  update_value      The value which is updated at every iteration.
-     * @param[in]       body              The body of the for-loop.
-     */
-    void op_for_loop(const TileOperand           &var_name,
-                     BinaryOp                     cond_op,
-                     const TileOperand           &cond_value_name,
-                     const TileOperand           &update_var_name,
-                     AssignmentOp                 update_op,
-                     const TileOperand           &update_value_name,
-                     const std::function<void()> &body);
-
-    /** Write the return statement: `return;`
-     */
-    void op_return();
-
-    // =============================================================================================
-    // Misc
-    // =============================================================================================
-
-    /** Set `dst` the global ID of dimension `dim`.
-     *
-     * @param[out] dst The tile to be written to.
-     * @param[in]  dim The global ID dimension.
-     */
-    void op_get_global_id(const TileOperand &dst, int32_t dim);
-
-    // =============================================================================================
-    // Code generation
-    // =============================================================================================
-
-    /** Generate the source code of the kernel. */
-    ::std::string generate_code();
-
-private:
-    /** Generate the full variable name based on the original name and the ID space.
-     *
-     * @param[in] name The name of the variable.
-     *
-     * @return The full variable name.
-     */
-    ::std::string generate_variable_name(const std::string &name) const;
-
-    /** Declare the tile operand.
-     *
-     * @param[in] operand   The tile operand to be declared.
-     */
-    TileOperand &declare_tile_operand(std::unique_ptr<TileOperand> operand);
-
-private:
-    Kernel                                                *_kernel;
-    ::std::unique_ptr<prototype::GpuKernelWriterAttribute> _impl_attr;
-    ::std::unique_ptr<prototype::IGpuKernelWriter>         _impl;
-
-    int32_t _id_space{0};
-    int32_t _max_id_space{0};
-};
-
-} // namespace ckw
-
-#endif // CKW_PROTOTYPE_INCLUDE_CKW_KERNELWRITER_H
diff --git a/compute_kernel_writer/prototype/include/ckw/KernelWriterHelper.h b/compute_kernel_writer/prototype/include/ckw/KernelWriterHelper.h
deleted file mode 100644
index 3ba079bbc2..0000000000
--- a/compute_kernel_writer/prototype/include/ckw/KernelWriterHelper.h
+++ /dev/null
@@ -1,1286 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef CKW_INCLUDE_CKW_KERNELWRITERHELPER_H
-#define CKW_INCLUDE_CKW_KERNELWRITERHELPER_H
-
-#include "ckw/KernelWriter.h"
-#include "ckw/TensorOperand.h"
-#include "ckw/TileOperand.h"
-
-#include <iostream>
-#include <type_traits>
-
-/*
- * By including this header file you will be able to supplement the default
- * Compute Kernel Writer API with additional syntax to help ease the use of CKW.
- *
- * To use the KernelWriterHelper you need to wrap your instance of KernelWriter
- * (or any class deriving from KernelWriter):
- *      KernelWriterHelper<KernelWriter> writer;
- * The resulting writer object comprises the original KernelWriter
- * functionality (drop-in replacement), but extends the syntax as follows.
- *
- * Common functions/operators have natural syntax:
- *  1. Unary expressions:
- *          writer.op_assign(dst, !src);        // Logical NOT
- *          writer.op_assign(dst, ~src);        // Bitwise NOT
- *
- *  2. Binary expressions:
- *          writer.op_assign(dst, lhs + rhs);   // Addition
- *          writer.op_assign(dst, lhs - rhs);   // Subtraction
- *          writer.op_assign(dst, lhs * rhs);   // Multiplication
- *          writer.op_assign(dst, lhs / rhs);   // Division
- *          writer.op_assign(dst, lhs % rhs);   // Modulo
- *          writer.op_assign(dst, lhs == rhs);  // Equality
- *          writer.op_assign(dst, lhs < rhs);   // Less-than
- *          writer.op_assign(dst, lhs <= rhs);  // Less-than-or-equal
- *          writer.op_assign(dst, lhs > rhs);   // Greater-than
- *          writer.op_assign(dst, lhs >= rhs);  // Greater-than-or-equal
- *          writer.op_assign(dst, lhs ^ rhs);   // Bitwise XOR
- *          writer.op_assign(dst, logical_and(lhs, rhs));  // Logical AND
- *          writer.op_assign(dst, logical_or(lhs, rhs));   // Logical OR
- *
- *  3. Unary elementwise functions:
- *          writer.op_assign(dst, exp(src));    // Exponent
- *          writer.op_assign(dst, tanh(src));   // Hyperbolic tangent
- *          writer.op_assign(dst, sqrt(src));   // Square root
- *          writer.op_assign(dst, erf(src));    // Error function
- *          writer.op_assign(dst, fabs(src));   // Absolute of floating-point number
- *          writer.op_assign(dst, log(src));    // Natural logarithm
- *          writer.op_assign(dst, round(src));  // Round
- *          writer.op_assign(dst, sizeOf(src)); // sizeof
- *
- *  4. Binary elementwise functions:
- *          writer.op_assign(dst, max(first, second));      // Max
- *          writer.op_assign(dst, min(first, second));      // Min
- *
- *  5. Ternary elementwise functions:
- *          writer.op_assign(dst, select(first, second, third));    // Select
- *
- * NOTE: All the above examples support nesting, so you could write
- * something like: writer.op_assign(dst, src * (log(arg) + sqrt(abs(arg)));
- *
- *
- *  6. If-statements. The preceding syntax also allows easier writing of if-statements:
- *          writer.op_if(<cond>, <body>);
- *
- *     For example:
- *          writer.op_if(exp(first_arg) == dst, [&]{
- *              //...
- *          }).op_else_if(exp(first_arg) > dst, [&]{
- *              //...
- *          }).op_else([&] {
- *              //...
- *          });
- *
- *  7. For-loops. A similar syntax exists for for-loops:
- *          writer.op_for_loop(<cond>, <updater>, <body>);
- *
- *     For example:
- *          writer.op_for_loop(index < limit, index += step, [&]{
- *              //...
- *          });
- *
- * NOTE: There are limitations on the for-loop <cond> and <updater> parameters.
- * In neither the <cond> (Binary expression) or <updater> (Increment/Decrement)
- * is it allowed to use nesting. For example, `(index + other) < limit` and
- * `index < round(limit)` are invalid <cond> parameters. This is because the
- * semantics of for-loops rely on the condition being evaluated at every iteration,
- * but as temporary variables might be defined for nested expressions the semantics
- * cannot be guaranteed.
- */
-
-namespace ckw
-{
-
-// ==================================================
-// Type traits
-// ==================================================
-
-/** Specifies if the type can be used as an operand for functions (e.g. max), operations (e.g. *), or assignments. */
-template <typename T>
-struct can_be_operand : ::std::false_type
-{
-};
-
-/** Specifies if the type can be assigned/written to. */
-template <typename T>
-struct can_be_assigned : ::std::false_type
-{
-};
-
-template <>
-struct can_be_operand<TileOperand &> : ::std::true_type
-{
-};
-
-template <>
-struct can_be_assigned<TileOperand &> : ::std::true_type
-{
-};
-
-// ==================================================
-// Assignment
-// ==================================================
-
-/** AST node for assignments.
- *
- * Note that \p TRight must be an operand, and \p TLeft must be assignable.
- *
- * @tparam TLeft The type of the destination of the assignment.
- * @tparam TRight The type of the source assigned to the destination.
- */
-template <typename TLeft,
-          typename TRight,
-          typename = ::std::enable_if<can_be_operand<TRight>::value && can_be_assigned<TLeft>::value>>
-struct Assignment
-{
-    TLeft        lhs;
-    TRight       rhs;
-    AssignmentOp opcode;
-};
-
-/** Represents the expression: `\p lhs += \p rhs`.
- *
- * @tparam      TLeft    The type of the LHS of the assignment.
- * @tparam      TRight   The type of the RHS of the assignment.
- * @param[in]   lhs      The LHS of the assignment.
- * @param[in]   rhs      The RHS of the assignment.
- * @return      The resulting AST node.
- */
-template <typename TLeft, typename TRight>
-inline Assignment<TLeft, TRight> operator+=(TLeft &&lhs, TRight &&rhs)
-{
-    return Assignment<TLeft, TRight>{std::forward<TLeft>(lhs), std::forward<TRight>(rhs), AssignmentOp::Increment};
-}
-
-/** Represents the expression: `\p lhs -= \p rhs`.
- *
- * @tparam      TLeft    The type of the LHS of the assignment.
- * @tparam      TRight   The type of the RHS of the assignment.
- * @param[in]   lhs    The LHS of the assignment.
- * @param[in]   rhs    The RHS of the assignment.
- * @return      The resulting AST node.
- */
-template <typename TLeft, typename TRight>
-inline Assignment<TLeft, TRight> operator-=(TLeft &&lhs, TRight &&rhs)
-{
-    return Assignment<TLeft, TRight>{std::forward<TLeft>(lhs), std::forward<TRight>(rhs), AssignmentOp::Decrement};
-}
-
-// ==================================================
-// Unary expression
-// ==================================================
-
-/** AST node for unary expressions.
- *
- * Note that \p TSrc must be an operand.
- *
- * @tparam TSrc The type of the argument to the expression.
- */
-template <typename TSrc, typename = ::std::enable_if<can_be_operand<TSrc>::value>>
-struct UnaryExpression
-{
-    TSrc    src;
-    UnaryOp opcode;
-};
-
-template <typename TLeft>
-struct can_be_operand<UnaryExpression<TLeft>> : ::std::true_type
-{
-};
-
-/** Represents the expression: `!\p src`.
- *
- * @tparam      TSrc The type of the argument.
- * @param[in]   src  The argument.
- * @return      The resulting AST node.
- */
-template <typename TSrc>
-inline UnaryExpression<TSrc> operator!(TSrc &&src)
-{
-    return UnaryExpression<TSrc>{std::forward<TSrc>(src), UnaryOp::LogicalNot};
-}
-
-/** Represents the expression: `~\p src`.
- *
- * @tparam      TSrc The type of the argument.
- * @param[in]   src  The argument.
- * @return      The resulting AST node.
- */
-template <typename TSrc>
-inline UnaryExpression<TSrc> operator~(TSrc &&src)
-{
-    return UnaryExpression<TSrc>{std::forward<TSrc>(src), UnaryOp::BitwiseNot};
-}
-
-// ==================================================
-// Binary expressions
-// ==================================================
-
-/** AST node for binary expressions.
- *
- * Note that both \p TLeft and \p TRight must be operands.
- *
- * @tparam TLeft  The type of the left argument of the expression.
- * @tparam TRight The type of the right argument of the expression.
- */
-template <typename TLeft,
-          typename TRight,
-          typename = ::std::enable_if_t<can_be_operand<TLeft>::value && can_be_operand<TRight>::value>>
-struct BinaryExpression
-{
-    TLeft    lhs;
-    TRight   rhs;
-    BinaryOp opcode;
-};
-
-template <typename TLeft, typename TRight>
-struct can_be_operand<BinaryExpression<TLeft, TRight>> : ::std::true_type
-{
-};
-
-/** Represents the expression: `\p lhs + \p rhs`.
- *
- * @tparam     TLeft  The type of the LHS of the expression.
- * @tparam     TRight The type of the RHS of the expression.
- * @param[in]  lhs    The LHS of the expression.
- * @param[in]  rhs    The RHS of the expression.
- * @return     The resulting AST node.
- */
-template <typename TLeft, typename TRight>
-inline BinaryExpression<TLeft, TRight> operator+(TLeft &&lhs, TRight &&rhs)
-{
-    return BinaryExpression<TLeft, TRight>{std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::Add};
-}
-
-/** Represents the expression: `\p lhs - \p rhs`.
- *
- * @tparam     TLeft  The type of the LHS of the expression.
- * @tparam     TRight The type of the RHS of the expression.
- * @param[in]  lhs    The LHS of the expression.
- * @param[in]  rhs    The RHS of the expression.
- * @return     The resulting AST node.
- */
-template <typename TLeft, typename TRight>
-inline BinaryExpression<TLeft, TRight> operator-(TLeft &&lhs, TRight &&rhs)
-{
-    return BinaryExpression<TLeft, TRight>{std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::Sub};
-}
-
-/** Represents the expression: `\p lhs * \p rhs`.
- *
- * @tparam     TLeft  The type of the LHS of the expression.
- * @tparam     TRight The type of the RHS of the expression.
- * @param[in]  lhs    The LHS of the expression.
- * @param[in]  rhs    The RHS of the expression.
- * @return     The resulting AST node.
- */
-template <typename TLeft, typename TRight>
-inline BinaryExpression<TLeft, TRight> operator*(TLeft &&lhs, TRight &&rhs)
-{
-    return BinaryExpression<TLeft, TRight>{std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::Mul};
-}
-
-/** Represents the expression: `\p lhs / \p rhs`.
- *
- * @tparam     TLeft  The type of the LHS of the expression.
- * @tparam     TRight The type of the RHS of the expression.
- * @param[in]  lhs    The LHS of the expression.
- * @param[in]  rhs    The RHS of the expression.
- * @return     The resulting AST node.
- */
-template <typename TLeft, typename TRight>
-inline BinaryExpression<TLeft, TRight> operator/(TLeft &&lhs, TRight &&rhs)
-{
-    return BinaryExpression<TLeft, TRight>{std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::Div};
-}
-
-/** Represents the expression: `\p lhs % \p rhs`.
- *
- * @tparam     TLeft  The type of the LHS of the expression.
- * @tparam     TRight The type of the RHS of the expression.
- * @param[in]  lhs    The LHS of the expression.
- * @param[in]  rhs    The RHS of the expression.
- * @return     The resulting AST node.
- */
-template <typename TLeft, typename TRight>
-inline BinaryExpression<TLeft, TRight> operator%(TLeft &&lhs, TRight &&rhs)
-{
-    return BinaryExpression<TLeft, TRight>{std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::Mod};
-}
-
-/** Represents the expression: `\p lhs == \p rhs`.
- *
- * @tparam     TLeft  The type of the LHS of the expression.
- * @tparam     TRight The type of the RHS of the expression.
- * @param[in]  lhs    The LHS of the expression.
- * @param[in]  rhs    The RHS of the expression.
- * @return     The resulting AST node.
- */
-template <typename TLeft, typename TRight>
-inline BinaryExpression<TLeft, TRight> operator==(TLeft &&lhs, TRight &&rhs)
-{
-    return BinaryExpression<TLeft, TRight>{std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::Equal};
-}
-
-/** Represents the expression: `\p lhs < \p rhs`.
- *
- * @tparam     TLeft  The type of the LHS of the expression.
- * @tparam     TRight The type of the RHS of the expression.
- * @param[in]  lhs    The LHS of the expression.
- * @param[in]  rhs    The RHS of the expression.
- * @return     The resulting AST node.
- */
-template <typename TLeft, typename TRight>
-inline BinaryExpression<TLeft, TRight> operator<(TLeft &&lhs, TRight &&rhs)
-{
-    return BinaryExpression<TLeft, TRight>{std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::Less};
-}
-
-/** Represents the expression: `\p lhs <= \p rhs`.
- *
- * @tparam     TLeft  The type of the LHS of the expression.
- * @tparam     TRight The type of the RHS of the expression.
- * @param[in]  lhs    The LHS of the expression.
- * @param[in]  rhs    The RHS of the expression.
- * @return     The resulting AST node.
- */
-template <typename TLeft, typename TRight>
-inline BinaryExpression<TLeft, TRight> operator<=(TLeft &&lhs, TRight &&rhs)
-{
-    return BinaryExpression<TLeft, TRight>{std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::LessEqual};
-}
-
-/** Represents the expression: `\p lhs > \p rhs`.
- *
- * @tparam     TLeft  The type of the LHS of the expression.
- * @tparam     TRight The type of the RHS of the expression.
- * @param[in]  lhs    The LHS of the expression.
- * @param[in]  rhs    The RHS of the expression.
- * @return     The resulting AST node.
- */
-template <typename TLeft, typename TRight>
-inline BinaryExpression<TLeft, TRight> operator>(TLeft &&lhs, TRight &&rhs)
-{
-    return BinaryExpression<TLeft, TRight>{std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::Greater};
-}
-
-/** Represents the expression: `\p lhs >= \p rhs`.
- *
- * @tparam     TLeft  The type of the LHS of the expression.
- * @tparam     TRight The type of the RHS of the expression.
- * @param[in]  lhs    The LHS of the expression.
- * @param[in]  rhs    The RHS of the expression.
- * @return     The resulting AST node.
- */
-template <typename TLeft, typename TRight>
-inline BinaryExpression<TLeft, TRight> operator>=(TLeft &&lhs, TRight &&rhs)
-{
-    return BinaryExpression<TLeft, TRight>{std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::GreaterEqual};
-}
-
-/** Represents the expression: `\p lhs ^ \p rhs`.
- *
- * @tparam     TLeft  The type of the LHS of the expression.
- * @tparam     TRight The type of the RHS of the expression.
- * @param[in]  lhs    The LHS of the expression.
- * @param[in]  rhs    The RHS of the expression.
- * @return     The resulting AST node.
- */
-template <typename TLeft, typename TRight>
-inline BinaryExpression<TLeft, TRight> operator^(TLeft &&lhs, TRight &&rhs)
-{
-    return BinaryExpression<TLeft, TRight>{std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::BitwiseXOR};
-}
-
-/** Represents the expression: `\p lhs && \p rhs`.
- *
- * @tparam     TLeft  The type of the LHS of the expression.
- * @tparam     TRight The type of the RHS of the expression.
- * @param[in]  lhs    The LHS of the expression.
- * @param[in]  rhs    The RHS of the expression.
- * @return     The resulting AST node.
- */
-template <typename TLeft, typename TRight>
-inline BinaryExpression<TLeft, TRight> logical_and(TLeft &&lhs, TRight &&rhs)
-{
-    return BinaryExpression<TLeft, TRight>{std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::LogicalAnd};
-}
-
-/** Represents the expression: `\p lhs && \p rhs`.
- *
- * @tparam     TLeft  The type of the LHS of the expression.
- * @tparam     TRight The type of the RHS of the expression.
- * @param[in]  lhs    The LHS of the expression.
- * @param[in]  rhs    The RHS of the expression.
- * @return     The resulting AST node.
- */
-template <typename TLeft, typename TRight, typename... TOps>
-inline BinaryExpression<BinaryExpression<TLeft, TRight>, TOps...> logical_and(TLeft &&lhs, TRight &&rhs, TOps &&...ops)
-{
-    return logical_and(
-        BinaryExpression<TLeft, TRight>{std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::LogicalAnd},
-        std::forward<TOps>(ops)...);
-}
-
-/** Represents the expression: `\p lhs || \p rhs`.
- *
- * @tparam     TLeft  The type of the LHS of the expression.
- * @tparam     TRight The type of the RHS of the expression.
- * @param[in]  lhs    The LHS of the expression.
- * @param[in]  rhs    The RHS of the expression.
- * @return     The resulting AST node.
- */
-template <typename TLeft, typename TRight>
-inline BinaryExpression<TLeft, TRight> logical_or(TLeft &&lhs, TRight &&rhs)
-{
-    return BinaryExpression<TLeft, TRight>{std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::LogicalOr};
-}
-
-/** Represents the expression: `\p lhs || \p rhs`.
- *
- * @tparam     TLeft  The type of the LHS of the expression.
- * @tparam     TRight The type of the RHS of the expression.
- * @param[in]  lhs    The LHS of the expression.
- * @param[in]  rhs    The RHS of the expression.
- * @return     The resulting AST node.
- */
-template <typename TLeft, typename TRight, typename... TOps>
-inline BinaryExpression<BinaryExpression<TLeft, TRight>, TOps...> logical_or(TLeft &&lhs, TRight &&rhs, TOps &&...ops)
-{
-    return logical_or(
-        BinaryExpression<TLeft, TRight>{std::forward<TLeft>(lhs), std::forward<TRight>(rhs), BinaryOp::LogicalOr},
-        std::forward<TOps>(ops)...);
-}
-
-// ==================================================
-// Unary elementwise functions
-// ==================================================
-
-/** AST node for unary elementwise functions.
- *
- * Note that \p TSrc must be an operand.
- *
- * @tparam TSrc The type of the argument to the function.
- */
-template <typename TSrc, typename = ::std::enable_if<can_be_operand<TSrc>::value>>
-struct UnaryElementwiseFunction
-{
-    TSrc          src;
-    UnaryFunction opcode;
-};
-
-template <typename TLeft>
-struct can_be_operand<UnaryElementwiseFunction<TLeft>> : ::std::true_type
-{
-};
-
-/** Represents the expression: `exp(\p src)`.
- *
- * @tparam      TSrc The type of the argument.
- * @param[in]   src  The argument.
- * @return      The resulting AST node.
- */
-template <typename TSrc>
-UnaryElementwiseFunction<TSrc> exp(TSrc &&src)
-{
-    return UnaryElementwiseFunction<TSrc>{std::forward<TSrc>(src), UnaryFunction::Exp};
-}
-
-/** Represents the expression: `tanh(\p src)`.
- *
- * @tparam      TSrc The type of the argument.
- * @param[in]   src  The argument.
- * @return      The resulting AST node.
- */
-template <typename TSrc>
-UnaryElementwiseFunction<TSrc> tanh(TSrc &&src)
-{
-    return UnaryElementwiseFunction<TSrc>{std::forward<TSrc>(src), UnaryFunction::Tanh};
-}
-
-/** Represents the expression: `sqrt(\p src)`.
- *
- * @tparam      TSrc The type of the argument.
- * @param[in]   src  The argument.
- * @return      The resulting AST node.
- */
-template <typename TSrc>
-UnaryElementwiseFunction<TSrc> sqrt(TSrc &&src)
-{
-    return UnaryElementwiseFunction<TSrc>{std::forward<TSrc>(src), UnaryFunction::Sqrt};
-}
-
-/** Represents the expression: `erf(\p src)`.
- *
- * @tparam      TSrc The type of the argument.
- * @param[in]   src  The argument.
- * @return      The resulting AST node.
- */
-template <typename TSrc>
-UnaryElementwiseFunction<TSrc> erf(TSrc &&src)
-{
-    return UnaryElementwiseFunction<TSrc>{std::forward<TSrc>(src), UnaryFunction::Erf};
-}
-
-/** Represents the expression: `fabs(\p src)`.
- *
- * @tparam      TSrc The type of the argument.
- * @param[in]   src  The argument.
- * @return      The resulting AST node.
- */
-template <typename TSrc>
-UnaryElementwiseFunction<TSrc> fabs(TSrc &&src)
-{
-    return UnaryElementwiseFunction<TSrc>{std::forward<TSrc>(src), UnaryFunction::Fabs};
-}
-
-/** Represents the expression: `log(\p src)`.
- *
- * @tparam      TSrc The type of the argument.
- * @param[in]   src  The argument.
- * @return      The resulting AST node.
- */
-template <typename TSrc>
-UnaryElementwiseFunction<TSrc> log(TSrc &&src)
-{
-    return UnaryElementwiseFunction<TSrc>{std::forward<TSrc>(src), UnaryFunction::Log};
-}
-
-/** Represents the expression: `round(\p src)`.
- *
- * @tparam      TSrc The type of the argument.
- * @param[in]   src  The argument.
- * @return      The resulting AST node.
- */
-template <typename TSrc>
-UnaryElementwiseFunction<TSrc> round(TSrc &&src)
-{
-    return UnaryElementwiseFunction<TSrc>{std::forward<TSrc>(src), UnaryFunction::Round};
-}
-
-/** Represents the expression: `sizeof(\p src)`.
- *
- * @tparam      TSrc The type of the argument.
- * @param[in]   src  The argument.
- * @return      The resulting AST node.
- */
-template <typename TSrc>
-UnaryElementwiseFunction<TSrc> sizeOf(TSrc &&src)
-{
-    return UnaryElementwiseFunction<TSrc>{std::forward<TSrc>(src), UnaryFunction::SizeOf};
-}
-
-// ==================================================
-// Binary elementwise functions
-// ==================================================
-
-/** AST node for binary elementwise functions.
- *
- * Note that both \p TFirst and \p TSecond must be operands.
- *
- * @tparam TFirst  The type of the left argument of the function.
- * @tparam TSecond The type of the right argument of the function.
- */
-template <typename TFirst,
-          typename TSecond,
-          typename = ::std::enable_if<can_be_operand<TFirst>::value && can_be_operand<TSecond>::value>>
-struct BinaryElementwiseFunction
-{
-    TFirst         first;
-    TSecond        second;
-    BinaryFunction opcode;
-};
-
-template <typename TFirst, typename TSecond>
-struct can_be_operand<BinaryElementwiseFunction<TFirst, TSecond>> : ::std::true_type
-{
-};
-
-/** Represents the function call: `max(\p first, \p second)`.
- *
- * @tparam      TFirst  The type of the first argument.
- * @tparam      TSecond The type of the second argument.
- * @param[in]   first   The first argument.
- * @param[in]   second  The second argument.
- * @return      The resulting AST node.
- */
-template <typename TFirst, typename TSecond>
-BinaryElementwiseFunction<TFirst, TSecond> max(TFirst &&first, TSecond &&second)
-{
-    return BinaryElementwiseFunction<TFirst, TSecond>{std::forward<TFirst>(first), std::forward<TSecond>(second),
-                                                      BinaryFunction::Max};
-}
-
-/** Represents the function call: `min(\p first, \p second)`.
- *
- * @tparam      TFirst  The type of the first argument.
- * @tparam      TSecond The type of the second argument.
- * @param[in]   first   The first argument.
- * @param[in]   second  The second argument.
- * @return      The resulting AST node.
- */
-template <typename TFirst, typename TSecond>
-BinaryElementwiseFunction<TFirst, TSecond> min(TFirst &&first, TSecond &&second)
-{
-    return BinaryElementwiseFunction<TFirst, TSecond>{std::forward<TFirst>(first), std::forward<TSecond>(second),
-                                                      BinaryFunction::Min};
-}
-
-// ==================================================
-// Ternary elementwise functions
-// ==================================================
-
-/** AST node for ternary elementwise functions.
- *
- * Note that \p TFirst, \p TSecond, and \p TThird all must be operands.
- *
- * @tparam TFirst The type of the first argument to the function.
- * @tparam TSecond The type of the second argument to the function.
- * @tparam TThird The type of the third argument to the function.
- */
-template <typename TFirst,
-          typename TSecond,
-          typename TThird,
-          typename = ::std::enable_if<can_be_operand<TFirst>::value && can_be_operand<TSecond>::value &&
-                                      can_be_operand<TThird>::value>>
-struct TernaryElementwiseFunction
-{
-    TFirst          first;
-    TSecond         second;
-    TThird          third;
-    TernaryFunction opcode;
-};
-
-template <typename TFirst, typename TSecond, typename TThird>
-struct can_be_operand<TernaryElementwiseFunction<TFirst, TSecond, TThird>> : ::std::true_type
-{
-};
-
-/** Represents the function call: `select(\p first, \p second, \p third)`.
- *
- * @tparam      TFirst  The type of the first argument.
- * @tparam      TSecond The type of the second argument.
- * @tparam      TThird  The type of the third argument.
- * @param[in]   first   The first argument.
- * @param[in]   second  The second argument.
- * @param[in]   third   The third argument.
- * @return      The resulting AST node.
- */
-template <typename TFirst, typename TSecond, typename TThird>
-TernaryElementwiseFunction<TFirst, TSecond, TThird> select(TFirst &&first, TSecond &&second, TThird &&third)
-{
-    return TernaryElementwiseFunction<TFirst, TSecond, TThird>{std::forward<TFirst>(first),
-                                                               std::forward<TSecond>(second),
-                                                               std::forward<TThird>(third), TernaryFunction::Select};
-}
-
-/** Helper class used to extend a KernelWriter with additional functionality
- * in order to make writing easier.
- *
- * This extension automatically handles creation of temporary variables, and
- * allows nested function calls and operations.
- *
- * @tparam TWriter The type of KernelWriter to be overloaded. This must inherit from KernelWriter.
- */
-template <class TWriter, typename = std::enable_if<std::is_base_of<KernelWriter, TWriter>::value>>
-class KernelWriterHelper : public TWriter
-{
-public:
-    using TWriter::TWriter;
-
-    // ==================================================
-    // If-statements
-    // ==================================================
-
-    // Un-hide original implementation, in case the original implementation is required.
-    using TWriter::op_if;
-
-    /** Represents the if-statement: `if(\p cond) { \p body }`.
-     *
-     * The BinaryExpression is unpacked and its components are forwarded to
-     * the underlying KernelWriter's implementation.
-     *
-     * @param[in] cond The BinaryExpression representing the condition.
-     * @param[in] body The body of the if-statement.
-     */
-    KernelWriterHelper<TWriter> &op_if(const BinaryExpression<TileOperand &, TileOperand &> &cond,
-                                       const std::function<void()>                          &body)
-    {
-        TWriter::op_if(cond.lhs, cond.opcode, cond.rhs, body);
-        return *this;
-    }
-
-    /** Represents the if-statement: `if(\p cond) { \p body }`.
-     *
-     * The BinaryExpression is unpacked and its components are forwarded to
-     * the underlying KernelWriter's implementation.
-     *
-     * @param[in] cond The BinaryExpression representing the condition.
-     * @param[in] body The body of the if-statement.
-     */
-    template <typename TRight>
-    KernelWriterHelper<TWriter> &op_if(const BinaryExpression<TileOperand &, TRight> &cond,
-                                       const std::function<void()>                   &body)
-    {
-        auto &tmp1 = declare_temp_tile(cond.lhs.tile_info());
-        op_assign(tmp1, cond.rhs);
-        TWriter::op_if(cond.lhs, cond.opcode, tmp1, body);
-        return *this;
-    }
-
-    /** Represents the if-statement: `if(\p cond) { \p body }`.
-     *
-     * The BinaryExpression is unpacked and its components are forwarded to
-     * the underlying KernelWriter's implementation.
-     *
-     * @param[in] cond The BinaryExpression representing the condition.
-     * @param[in] body The body of the if-statement.
-     */
-    template <typename TLeft>
-    KernelWriterHelper<TWriter> &op_if(const BinaryExpression<TLeft, TileOperand &> &cond,
-                                       const std::function<void()>                  &body)
-    {
-        auto &tmp1 = declare_temp_tile(cond.rhs.tile_info());
-        op_assign(tmp1, cond.lhs);
-        TWriter::op_if(tmp1, cond.opcode, cond.rhs, body);
-        return *this;
-    }
-
-    // Un-hide original implementation, in case the original implementation is required.
-    using TWriter::op_else_if;
-
-    /** Represents the else-if-statement: `else if(\p cond) { \p body }`.
-     *
-     * The BinaryExpression is unpacked and its components are forwarded to
-     * the underlying KernelWriter's implementation.
-     *
-     * @param[in] cond The BinaryExpression representing the condition.
-     * @param[in] body The body of the else-if-statement.
-     */
-    KernelWriterHelper<TWriter> &op_else_if(const BinaryExpression<TileOperand &, TileOperand &> &cond,
-                                            const std::function<void()>                          &body)
-    {
-        TWriter::op_else_if(cond.lhs, cond.opcode, cond.rhs, body);
-        return *this;
-    }
-
-    /** Represents the else-if-statement: `else if(\p cond) { \p body }`.
-     *
-     * The BinaryExpression is unpacked and its components are forwarded to
-     * the underlying KernelWriter's implementation.
-     *
-     * @param[in] cond The BinaryExpression representing the condition.
-     * @param[in] body The body of the else-if-statement.
-     */
-    template <typename TRight>
-    KernelWriterHelper<TWriter> &op_else_if(const BinaryExpression<TileOperand &, TRight> &cond,
-                                            const std::function<void()>                   &body)
-    {
-        auto &tmp1 = declare_temp_tile(cond.lhs.tile_info());
-        op_assign(tmp1, cond.rhs);
-        TWriter::op_else_if(cond.lhs, cond.opcode, tmp1, body);
-        return *this;
-    }
-
-    /** Represents the else-if-statement: `else if(\p cond) { \p body }`.
-     *
-     * The BinaryExpression is unpacked and its components are forwarded to
-     * the underlying KernelWriter's implementation.
-     *
-     * @param[in] cond The BinaryExpression representing the condition.
-     * @param[in] body The body of the else-if-statement.
-     */
-    template <typename TLeft>
-    KernelWriterHelper<TWriter> &op_else_if(const BinaryExpression<TLeft, TileOperand &> &cond,
-                                            const std::function<void()>                  &body)
-    {
-        auto &tmp1 = declare_temp_tile(cond.rhs.tile_info());
-        op_assign(tmp1, cond.lhs);
-        TWriter::op_else_if(tmp1, cond.opcode, cond.rhs, body);
-        return *this;
-    }
-
-    // ==================================================
-    // For-loops
-    // ==================================================
-
-    // Un-hide original implementation, in case the original implementation is required.
-    using TWriter::op_for_loop;
-
-    /** Represents the for-loop: `for(;\p cond; \p updater) { \p body }`.
-     *
-     * The BinaryExpression for the condition and the Assignment
-     * for the updater are unpacked and their components are forwarded to
-     * the underlying KernelWriter's implementation.
-     *
-     * @param[in] cond    The BinaryExpression representing the condition.
-     * @param[in] updater The Assignment representing the updater.
-     * @param[in] body    The body of the for-loop.
-     */
-    void op_for_loop(const BinaryExpression<TileOperand &, TileOperand &> &cond,
-                     const Assignment<TileOperand &, TileOperand &>       &updater,
-                     const std::function<void()>                          &body)
-    {
-        TWriter::op_for_loop(cond.lhs, cond.opcode, cond.rhs, updater.lhs, updater.opcode, updater.rhs, body);
-    }
-
-    // ==================================================
-    // Unary expressions
-    // ==================================================
-
-    // Un-hide original implementation, in case the original implementation is required.
-    using TWriter::op_assign;
-
-    /** Represents the assignment: `\p dst = \p exp`.
-     *
-     * The UnaryExpression is unpacked and its components are forwarded to
-     * the underlying KernelWriter's implementation.
-     *
-     * @param[in] dst The tile which is assigned to.
-     * @param[in] exp The UnaryExpression representing the expression to be evaluated and assigned.
-     */
-    void op_assign(const TileOperand &dst, const UnaryExpression<TileOperand &> &exp)
-    {
-        TWriter::op_unary_expression(dst, exp.opcode, exp.src);
-    }
-
-    // ==================================================
-    // Binary expressions
-    // ==================================================
-
-    /** Represents the assignment: `\p dst = \p exp`.
-     *
-     * The BinaryExpression is unpacked and its components are forwarded to
-     * the underlying KernelWriter's implementation.
-     *
-     * @param[in] dst The tile which is assigned to.
-     * @param[in] exp The BinaryExpression representing the expression to be evaluated and assigned.
-     */
-    void op_assign(const TileOperand &dst, const BinaryExpression<TileOperand &, TileOperand &> &exp)
-    {
-        TWriter::op_binary_expression(dst, exp.lhs, exp.opcode, exp.rhs);
-    }
-
-    /** Represents the assignment: `\p dst = \p exp`.
-     *
-     * The BinaryExpression is unpacked and its components are forwarded to
-     * the underlying KernelWriter's implementation.
-     *
-     * @param[in] dst The tile which is assigned to.
-     * @param[in] exp The BinaryExpression representing the expression to be evaluated and assigned.
-     */
-    template <typename TRight>
-    void op_assign(const TileOperand &dst, const BinaryExpression<TileOperand &, TRight> &exp)
-    {
-        std::cout << "Beginning assignment!" << std::endl;
-        auto &tmp1 = declare_temp_tile(dst.tile_info());
-        op_assign(tmp1, exp.rhs);
-        TWriter::op_binary_expression(dst, exp.lhs, exp.opcode, tmp1);
-    }
-
-    /** Represents the assignment: `\p dst = \p exp`.
-     *
-     * The BinaryExpression is unpacked and its components are forwarded to
-     * the underlying KernelWriter's implementation.
-     *
-     * @param[in] dst The tile which is assigned to.
-     * @param[in] exp The BinaryExpression representing the expression to be evaluated and assigned.
-     */
-    template <typename TLeft>
-    void op_assign(const TileOperand &dst, const BinaryExpression<TLeft, TileOperand &> &exp)
-    {
-        std::cout << "Beginning assignment!" << std::endl;
-        auto &tmp1 = declare_temp_tile(dst.tile_info());
-        op_assign(tmp1, exp.lhs);
-        TWriter::op_binary_expression(dst, tmp1, exp.opcode, exp.rhs);
-    }
-
-    /** Represents the assignment: `\p dst = \p exp`.
-     *
-     * The BinaryExpression is unpacked and its components are forwarded to
-     * the underlying KernelWriter's implementation.
-     *
-     * @param[in] dst The tile which is assigned to.
-     * @param[in] exp The BinaryExpression representing the expression to be evaluated and assigned.
-     */
-    template <typename TLeft, typename TRight>
-    void op_assign(const TileOperand &dst, const BinaryExpression<TLeft, TRight> &exp)
-    {
-        auto &tmp1 = declare_temp_tile(dst.tile_info());
-        auto &tmp2 = declare_temp_tile(dst.tile_info());
-        op_assign(tmp1, exp.lhs);
-        op_assign(tmp2, exp.rhs);
-        TWriter::op_binary_expression(dst, tmp1, exp.opcode, tmp2);
-    }
-
-    // ==================================================
-    // Unary elementwise functions
-    // ==================================================
-
-    /** Represents the assignment: `\p dst = \p exp`.
-     *
-     * The UnaryElementwiseFunction is unpacked and its components are forwarded to
-     * the underlying KernelWriter's implementation.
-     *
-     * @param[in] dst The tile which is assigned to.
-     * @param[in] exp The UnaryElementwiseFunction representing the expression to be evaluated and assigned.
-     */
-    void op_assign(const TileOperand &dst, const UnaryElementwiseFunction<TileOperand &> &exp)
-    {
-        TWriter::op_unary_elementwise_function(dst, exp.opcode, exp.src);
-    }
-
-    /** Represents the assignment: `\p dst = \p exp`.
-     *
-     * The UnaryElementwiseFunction is unpacked and its components are forwarded to
-     * the underlying KernelWriter's implementation.
-     *
-     * @param[in] dst The tile which is assigned to.
-     * @param[in] exp The UnaryElementwiseFunction representing the expression to be evaluated and assigned.
-     */
-    template <typename TArg>
-    void op_assign(const TileOperand &dst, const UnaryElementwiseFunction<TArg> &exp)
-    {
-        auto &tmp1 = declare_temp_tile(dst.tile_info());
-        op_assign(tmp1, exp.lhs);
-        TWriter::op_unary_elementwise_function(dst, exp.opcode, tmp1);
-    }
-
-    // ==================================================
-    // Binary elementwise functions
-    // ==================================================
-
-    /** Represents the assignment: `\p dst = \p exp`.
-     *
-     * The BinaryElementwiseFunction is unpacked and its components are forwarded to
-     * the underlying KernelWriter's implementation.
-     *
-     * @param[in] dst The tile which is assigned to.
-     * @param[in] exp The BinaryElementwiseFunction representing the expression to be evaluated and assigned.
-     */
-    void op_assign(const TileOperand &dst, const BinaryElementwiseFunction<TileOperand &, TileOperand &> &exp)
-    {
-        TWriter::op_binary_elementwise_function(dst, exp.opcode, exp.first, exp.second);
-    }
-
-    /** Represents the assignment: `\p dst = \p exp`.
-     *
-     * The BinaryElementwiseFunction is unpacked and its components are forwarded to
-     * the underlying KernelWriter's implementation.
-     *
-     * @param[in] dst The tile which is assigned to.
-     * @param[in] exp The BinaryElementwiseFunction representing the expression to be evaluated and assigned.
-     */
-    template <typename TRight>
-    void op_assign(const TileOperand &dst, const BinaryElementwiseFunction<TileOperand &, TRight> &exp)
-    {
-        auto &tmp1 = declare_temp_tile(dst.tile_info());
-        op_assign(tmp1, exp.second);
-        TWriter::op_binary_elementwise_function(dst, exp.opcode, exp.first, tmp1);
-    }
-
-    /** Represents the assignment: `\p dst = \p exp`.
-     *
-     * The BinaryElementwiseFunction is unpacked and its components are forwarded to
-     * the underlying KernelWriter's implementation.
-     *
-     * @param[in] dst The tile which is assigned to.
-     * @param[in] exp The BinaryElementwiseFunction representing the expression to be evaluated and assigned.
-     */
-    template <typename TLeft>
-    void op_assign(const TileOperand &dst, const BinaryElementwiseFunction<TLeft, TileOperand &> &exp)
-    {
-        auto &tmp1 = declare_temp_tile(dst.tile_info());
-        op_assign(tmp1, exp.first);
-        TWriter::op_binary_elementwise_function(dst, exp.opcode, tmp1, exp.second);
-    }
-
-    /** Represents the assignment: `\p dst = \p exp`.
-     *
-     * The BinaryElementwiseFunction is unpacked and its components are forwarded to
-     * the underlying KernelWriter's implementation.
-     *
-     * @param[in] dst The tile which is assigned to.
-     * @param[in] exp The BinaryElementwiseFunction representing the expression to be evaluated and assigned.
-     */
-    template <typename TLeft, typename TRight>
-    void op_assign(const TileOperand &dst, const BinaryElementwiseFunction<TLeft, TRight> &exp)
-    {
-        auto &tmp1 = declare_temp_tile(dst.tile_info());
-        auto &tmp2 = declare_temp_tile(dst.tile_info());
-        op_assign(tmp1, exp.first);
-        op_assign(tmp2, exp.second);
-        TWriter::op_binary_elementwise_function(dst, exp.opcode, tmp1, tmp2);
-    }
-
-    // ==================================================
-    // Ternary elementwise functions
-    // ==================================================
-
-    /** Represents the assignment: `\p dst = \p exp`.
-     *
-     * The TernaryElementwiseFunction is unpacked and its components are forwarded to
-     * the underlying KernelWriter's implementation.
-     *
-     * @param[in] dst The tile which is assigned to.
-     * @param[in] exp The TernaryElementwiseFunction representing the expression to be evaluated and assigned.
-     */
-    void op_assign(const TileOperand                                                             &dst,
-                   const TernaryElementwiseFunction<TileOperand &, TileOperand &, TileOperand &> &exp)
-    {
-        TWriter::op_ternary_elementwise_function(dst, exp.opcode, exp.first, exp.second, exp.third);
-    }
-
-    /** Represents the assignment: `\p dst = \p exp`.
-     *
-     * The TernaryElementwiseFunction is unpacked and its components are forwarded to
-     * the underlying KernelWriter's implementation.
-     *
-     * @param[in] dst The tile which is assigned to.
-     * @param[in] exp The TernaryElementwiseFunction representing the expression to be evaluated and assigned.
-     */
-    template <typename TFirst>
-    void op_assign(const TileOperand &dst, const TernaryElementwiseFunction<TFirst, TileOperand &, TileOperand &> &exp)
-    {
-        auto &tmp1 = declare_temp_tile(dst.tile_info());
-        op_assign(tmp1, exp.first);
-        TWriter::op_ternary_elementwise_function(dst, exp.opcode, tmp1, exp.second, exp.third);
-    }
-
-    /** Represents the assignment: `\p dst = \p exp`.
-     *
-     * The TernaryElementwiseFunction is unpacked and its components are forwarded to
-     * the underlying KernelWriter's implementation.
-     *
-     * @param[in] dst The tile which is assigned to.
-     * @param[in] exp The TernaryElementwiseFunction representing the expression to be evaluated and assigned.
-     */
-    template <typename TSecond>
-    void op_assign(const TileOperand &dst, const TernaryElementwiseFunction<TileOperand &, TSecond, TileOperand &> &exp)
-    {
-        auto &tmp1 = declare_temp_tile(dst.tile_info());
-        op_assign(tmp1, exp.second);
-        TWriter::op_ternary_elementwise_function(dst, exp.opcode, exp.first, tmp1, exp.third);
-    }
-
-    /** Represents the assignment: `\p dst = \p exp`.
-     *
-     * The TernaryElementwiseFunction is unpacked and its components are forwarded to
-     * the underlying KernelWriter's implementation.
-     *
-     * @param[in] dst The tile which is assigned to.
-     * @param[in] exp The TernaryElementwiseFunction representing the expression to be evaluated and assigned.
-     */
-    template <typename TThird>
-    void op_assign(const TileOperand &dst, const TernaryElementwiseFunction<TileOperand &, TileOperand &, TThird> &exp)
-    {
-        auto &tmp1 = declare_temp_tile(dst.tile_info());
-        op_assign(tmp1, exp.third);
-        TWriter::op_ternary_elementwise_function(dst, exp.opcode, exp.first, exp.second, tmp1);
-    }
-
-    /** Represents the assignment: `\p dst = \p exp`.
-     *
-     * The TernaryElementwiseFunction is unpacked and its components are forwarded to
-     * the underlying KernelWriter's implementation.
-     *
-     * @param[in] dst The tile which is assigned to.
-     * @param[in] exp The TernaryElementwiseFunction representing the expression to be evaluated and assigned.
-     */
-    template <typename TFirst, typename TSecond>
-    void op_assign(const TileOperand &dst, const TernaryElementwiseFunction<TFirst, TSecond, TileOperand &> &exp)
-    {
-        auto &tmp1 = declare_temp_tile(dst.tile_info());
-        auto &tmp2 = declare_temp_tile(dst.tile_info());
-        op_assign(tmp1, exp.first);
-        op_assign(tmp2, exp.second);
-        TWriter::op_ternary_elementwise_function(dst, exp.opcode, tmp1, tmp2, exp.third);
-    }
-
-    /** Represents the assignment: `\p dst = \p exp`.
-     *
-     * The TernaryElementwiseFunction is unpacked and its components are forwarded to
-     * the underlying KernelWriter's implementation.
-     *
-     * @param[in] dst The tile which is assigned to.
-     * @param[in] exp The TernaryElementwiseFunction representing the expression to be evaluated and assigned.
-     */
-    template <typename TFirst, typename TThird>
-    void op_assign(const TileOperand &dst, const TernaryElementwiseFunction<TFirst, TileOperand &, TThird> &exp)
-    {
-        auto &tmp1 = declare_temp_tile(dst.tile_info());
-        auto &tmp2 = declare_temp_tile(dst.tile_info());
-        op_assign(tmp1, exp.first);
-        op_assign(tmp2, exp.third);
-        TWriter::op_ternary_elementwise_function(dst, exp.opcode, tmp1, exp.second, tmp2);
-    }
-
-    /** Represents the assignment: `\p dst = \p exp`.
-     *
-     * The TernaryElementwiseFunction is unpacked and its components are forwarded to
-     * the underlying KernelWriter's implementation.
-     *
-     * @param[in] dst The tile which is assigned to.
-     * @param[in] exp The TernaryElementwiseFunction representing the expression to be evaluated and assigned.
-     */
-    template <typename TSecond, typename TThird>
-    void op_assign(const TileOperand &dst, const TernaryElementwiseFunction<TileOperand &, TSecond, TThird> &exp)
-    {
-        auto &tmp1 = declare_temp_tile(dst.tile_info());
-        auto &tmp2 = declare_temp_tile(dst.tile_info());
-        op_assign(tmp1, exp.second);
-        op_assign(tmp2, exp.third);
-        TWriter::op_ternary_elementwise_function(dst, exp.opcode, exp.first, tmp1, tmp2);
-    }
-
-    /** Represents the assignment: `\p dst = \p exp`.
-     *
-     * The TernaryElementwiseFunction is unpacked and its components are forwarded to
-     * the underlying KernelWriter's implementation.
-     *
-     * @param[in] dst The tile which is assigned to.
-     * @param[in] exp The TernaryElementwiseFunction representing the expression to be evaluated and assigned.
-     */
-    template <typename TFirst, typename TSecond, typename TThird>
-    void op_assign(const TileOperand &dst, const TernaryElementwiseFunction<TFirst, TSecond, TThird> &exp)
-    {
-        auto &tmp1 = declare_temp_tile(dst.tile_info(), dst.tile_info(), dst.tile_info());
-        auto &tmp2 = declare_temp_tile(dst.tile_info());
-        auto &tmp3 = declare_temp_tile(dst.tile_info());
-        op_assign(tmp1, exp.first);
-        op_assign(tmp2, exp.second);
-        op_assign(tmp3, exp.third);
-        TWriter::op_ternary_elementwise_function(dst, exp.opcode, tmp1, tmp2, tmp3);
-    }
-
-    // ==================================================
-    // Assignments
-    // ==================================================
-
-    /** Represents the assignment: `\p lhs += \p rhs` or `\p lhs -= \p rhs`.
-     *
-     * The Assignment is unpacked and its components are forwarded to
-     * the underlying KernelWriter's implementation.
-     *
-     * @param[in] exp The Assignment representing the expression to be evaluated.
-     */
-    void op_assign(const Assignment<TileOperand &, TileOperand &> &exp)
-    {
-        if (exp.opcode == AssignmentOp::Increment)
-        {
-            TWriter::op_binary_expression(exp.lhs, exp.lhs, BinaryOp::Add, exp.rhs);
-        }
-        else if (exp.opcode == AssignmentOp::Decrement)
-        {
-            TWriter::op_binary_expression(exp.lhs, exp.lhs, BinaryOp::Sub, exp.rhs);
-        }
-    }
-
-    /** Represents the assignment: `\p lhs += \p rhs` or `\p lhs -= \p rhs`.
-     *
-     * The Assignment is unpacked and its components are forwarded to
-     * the underlying KernelWriter's implementation.
-     *
-     * @tparam    TRight The type of the RHS of the assignment.
-     * @param[in] exp    The Assignment representing the expression to be evaluated.
-     */
-    template <typename TRight>
-    void op_assign(const Assignment<TileOperand &, TRight> &exp)
-    {
-        auto &tmp1 = declare_temp_tile(exp.lhs.tile_info());
-        op_assign(tmp1, exp.rhs);
-        op_assign(Assignment<TileOperand &, TileOperand &>{exp.lhs, tmp1, exp.opcode});
-    }
-
-private:
-    unsigned int temp_var_counter = 0;
-
-    /** Return the current counter value, then increment it.
-     *
-     * @return The current counter value.
-     */
-    int next_ctr()
-    {
-        return temp_var_counter++;
-    }
-
-    /** Gets the next temporary variable counter value,
-     * and returns a suitable temporary variable name.
-     *
-     * @return A temporary variable name.
-     */
-    std::string next_tmp_var_name()
-    {
-        return "tmp_" + std::to_string(next_ctr());
-    }
-
-    /** Returns the argument.
-     *
-     * Used for recursion with the variadic function version of this function.
-     *
-     * @param[in] arg The TileInfo to return.
-     * @return    The \p arg.
-     */
-    TileInfo get_largest_size(const TileInfo &arg)
-    {
-        return arg;
-    }
-
-    /** Returns a TileInfo object where the size in each dimension (width, height) is the largest
-     * of either TileInfo argument in the corresponding dimension.
-     *
-     * @tparam    TOps   Must be of TileInfo type.
-     * @param[in] first  A TileInfo object.
-     * @param[in] second A TileInfo object.
-     * @param[in] ops    A number of TileInfo objects.
-     * @return    A TileInfo object which represents the largest shape in each dimension across the arguments.
-     */
-    template <typename... TOps, typename = ::std::enable_if_t<std::is_same<TOps..., TileInfo>::value>>
-    TileInfo get_largest_size(const TileInfo &first, const TileInfo &second, const TOps &...ops)
-    {
-        TileInfo largest = {first.data_type(), std::max(first.width(), second.width()),
-                            std::max(first.height(), second.height())};
-        return get_largest_size(largest, ops...);
-    }
-
-    /** Helper function to define a suitable TileOperand with appropriate TileInfo
-     * such that broadcasting is taken into account, based on the arguments provided.
-     *
-     * @tparam     TArgs Must be of TileInfo type.
-     * @param[in]  args  A number of TileInfo which determine the shape of the TileOperand to declare.
-     * @return     A newly created TileOperand.
-     */
-    template <typename... TArgs, typename = ::std::enable_if_t<std::is_same<TArgs..., TileInfo>::value>>
-    TileOperand &declare_temp_tile(const TArgs &...args)
-    {
-        return TWriter::declare_tile(next_tmp_var_name().c_str(), get_largest_size(args...));
-    }
-};
-
-} // namespace ckw
-
-#endif // CKW_INCLUDE_CKW_KERNELWRITERHELPER_H
diff --git a/compute_kernel_writer/prototype/include/ckw/OperandBase.h b/compute_kernel_writer/prototype/include/ckw/OperandBase.h
deleted file mode 100644
index 9842127339..0000000000
--- a/compute_kernel_writer/prototype/include/ckw/OperandBase.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef CKW_PROTOTYPE_INCLUDE_CKW_OPERANDBASE_H
-#define CKW_PROTOTYPE_INCLUDE_CKW_OPERANDBASE_H
-
-#include "ckw/types/DataType.h"
-
-#include <string>
-
-namespace ckw
-{
-namespace prototype
-{
-class IGpuKernelWriter;
-
-class Operand;
-} // namespace prototype
-
-/** The base class for all operands. */
-class OperandBase
-{
-public:
-    /** Constructor
-     *
-     * @param[in] name The name of the operand.
-     */
-    explicit OperandBase(const ::std::string &name);
-
-    /** Destructor */
-    virtual ~OperandBase();
-
-    /** (Internal use only) Create the implementation operand.
-     *
-     * @param[in] writer The implementation kernel writer.
-     */
-    virtual prototype::Operand create_impl_operand(prototype::IGpuKernelWriter *writer) const = 0;
-
-    /** Get the name of the operand. */
-    const ::std::string &name() const;
-
-    /** Set the name of the operand. */
-    OperandBase &name(const ::std::string &name);
-
-    /** Get the data type of the operand. */
-    virtual DataType data_type() const = 0;
-
-    /** Get whether the operand is compile-time constant. */
-    virtual bool is_constant() const = 0;
-
-private:
-    ::std::string _name;
-};
-
-} // namespace ckw
-
-#endif // CKW_PROTOTYPE_INCLUDE_CKW_OPERANDBASE_H
diff --git a/compute_kernel_writer/prototype/include/ckw/ScalarValue.h b/compute_kernel_writer/prototype/include/ckw/ScalarValue.h
deleted file mode 100644
index 2a9c42acc8..0000000000
--- a/compute_kernel_writer/prototype/include/ckw/ScalarValue.h
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef CKW_PROTOTYPE_INCLUDE_CKW_SCALARVALUE_H
-#define CKW_PROTOTYPE_INCLUDE_CKW_SCALARVALUE_H
-
-#include "ckw/Error.h"
-
-#include <cstdint>
-
-namespace ckw
-{
-
-/** The scalar value known at compile-time. */
-class ScalarValue
-{
-public:
-    /** Initialize a new instance of @ref ScalarValue class with integer value 0. */
-    ScalarValue()
-    {
-        _type      = Type::INT;
-        _value.i64 = 0;
-    }
-
-    /** Initialize a new instance of @ref ScalarValue class with the specified value. */
-    template <typename T>
-    ScalarValue(T value)
-    {
-        set(value);
-    }
-
-    /** Set the value. */
-    template <typename T>
-    void set(T value)
-    {
-        CKW_ASSERT(::std::is_integral<T>::value || ::std::is_floating_point<T>::value);
-        CKW_ASSERT(sizeof(T) <= 8);
-
-        _size = sizeof(T);
-
-        if (::std::is_integral<T>::value)
-        {
-            if (::std::is_signed<T>::value)
-            {
-                _type      = Type::INT;
-                _value.i64 = value;
-            }
-            else
-            {
-                _type      = Type::UINT;
-                _value.u64 = value;
-            }
-        }
-        else
-        {
-            _type      = Type::FLOAT;
-            _value.f64 = value;
-        }
-    }
-
-    /** Get the value.
-     *
-     * The caller must make sure that what has been stored in the object must fit
-     * the output data type without data corruption or loss of accuracy.
-     */
-    template <typename T>
-    T get() const
-    {
-        CKW_ASSERT(::std::is_integral<T>::value || ::std::is_floating_point<T>::value);
-        CKW_ASSERT(sizeof(T) >= _size);
-
-        if (::std::is_integral<T>::value)
-        {
-            if (::std::is_signed<T>::value)
-            {
-                CKW_ASSERT(_type == Type::INT || _type == Type::UINT);
-                CKW_ASSERT_IF(_type == Type::UINT, sizeof(T) > _size);
-
-                return _value.i64;
-            }
-            else
-            {
-                CKW_ASSERT(_type == Type::INT);
-
-                return _value.u64;
-            }
-        }
-        else
-        {
-            return _value.f64;
-        }
-    }
-
-private:
-    union Value
-    {
-        int64_t  i64;
-        uint64_t u64;
-        double   f64;
-    };
-
-    enum class Type : int32_t
-    {
-        UINT,
-        INT,
-        FLOAT,
-    };
-
-    Value    _value{};
-    Type     _type{};
-    uint32_t _size{};
-};
-
-} // namespace ckw
-
-#endif // CKW_PROTOTYPE_INCLUDE_CKW_SCALARVALUE_H
diff --git a/compute_kernel_writer/prototype/include/ckw/TensorInfo.h b/compute_kernel_writer/prototype/include/ckw/TensorInfo.h
deleted file mode 100644
index 24da7dc8ab..0000000000
--- a/compute_kernel_writer/prototype/include/ckw/TensorInfo.h
+++ /dev/null
@@ -1,153 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef CKW_PROTOTYPE_INCLUDE_CKW_TENSORINFO_H
-#define CKW_PROTOTYPE_INCLUDE_CKW_TENSORINFO_H
-
-#include "ckw/types/DataType.h"
-
-#include <array>
-#include <cstdint>
-
-namespace ckw
-{
-/** Compute Kernel Writer tensor data layout (or memory format) */
-enum class TensorDataLayout
-{
-    Unknown,
-    Nhwc,
-    Ndhwc
-};
-
-/** Compute Kernel Writer tensor data layout component */
-enum class TensorDataLayoutComponent
-{
-    Unknown,
-    N,
-    D,
-    H,
-    W,
-    C,
-};
-
-/** Compute Kernel Writer tensor component bitmask. The bitmask can be used to retrieve
- *  the info from @ref TensorComponent.
- */
-enum class TensorComponentBitmask : uint32_t
-{
-    OffsetFirstElement = 0x01000000, // For example, OffsetFirstElement in @ref TensorComponent
-    Stride             = 0x02000000, // For example, stride0 in @ref TensorComponent
-    Dimension          = 0x04000000, // For example, Dim0 in @ref TensorComponent
-    FoldedDimensions   = 0x08000000, // For example, Dim0xDim1 in @ref TensorComponent
-};
-
-/** Compute Kernel Writer tensor component. The tensor components are used to access specific backend-agnostic tensor arguments,
- *  such as the tensor dimensions and tensor strides.
- *  The data type is represented as an integer. The value of the integer value
- *  is assigned to retrieve the information through the @ref TensorComponentBitmask.
- */
-enum class TensorComponentType : uint32_t
-{
-    Unknown            = 0x00000000,
-    OffsetFirstElement = 0x01000000,
-    Stride0            = 0x02000001,
-    Stride1            = 0x02000010,
-    Stride2            = 0x02000100,
-    Stride3            = 0x02001000,
-    Stride4            = 0x02010000,
-    Dim0               = 0x04000001,
-    Dim1               = 0x04000010,
-    Dim2               = 0x04000100,
-    Dim3               = 0x04001000,
-    Dim4               = 0x04010000,
-    Dim1xDim2          = 0x08000110,
-    Dim2xDim3          = 0x08001100,
-    Dim1xDim2xDim3     = 0x08001110
-};
-
-/** Compute Kernel Writer tensor storage. The tensor storage represents the type of tensor memory object.
- */
-enum class TensorStorageType : uint32_t
-{
-    Unknown            = 0x00000000,
-    BufferUint8Ptr     = 0x01000000,
-    Texture2dReadOnly  = 0x02000001,
-    Texture2dWriteOnly = 0x02000010,
-};
-
-/** Compute Kernel Writer tensor shape
- *  Negative dimensions can be interpreted as dynamic dimensions by the Compute Kernel Writer
- */
-using TensorShape = std::array<int32_t, 5>;
-
-/** Compute Kernel Writer tensor info */
-class TensorInfo
-{
-public:
-    /** Constructor
-     *
-     * @param[in] dt    Tensor data type
-     * @param[in] shape Tensor shape
-     * @param[in] dl    Tensor data layout
-     * @param[in] id    Tensor id. The id is used to keep track of the bound user tensor. Through the id,
-     *                  the user can know what tensor has been used by the Compute Kernel Writer.
-     *                  Possible id values:
-     *                  - greater than or equal to 0: bind a user specific tensors
-     *                  - less than 0: bind a virtual tensor (tile)
-     */
-    TensorInfo(DataType dt, const TensorShape &shape, TensorDataLayout dl, int32_t id);
-
-    /** Set shape */
-    TensorInfo &shape(const TensorShape &shape);
-
-    /** Get shape */
-    TensorShape shape() const;
-
-    /** Set data type */
-    TensorInfo &data_type(DataType dt);
-
-    /** Get data type */
-    DataType data_type() const;
-
-    /** Set data layout */
-    TensorInfo &data_layout(TensorDataLayout dl);
-
-    /** Get data layout */
-    TensorDataLayout data_layout() const;
-
-    /** Set id */
-    TensorInfo &id(int32_t id);
-
-    /** Get layout */
-    int32_t id() const;
-
-private:
-    TensorShape      _shape{{0}};
-    DataType         _dt{DataType::Unknown};
-    TensorDataLayout _dl{TensorDataLayout::Unknown};
-    int32_t          _id{-1};
-};
-} // namespace ckw
-
-#endif /* CKW_PROTOTYPE_INCLUDE_CKW_TENSORINFO_H */
diff --git a/compute_kernel_writer/prototype/include/ckw/TensorOperand.h b/compute_kernel_writer/prototype/include/ckw/TensorOperand.h
deleted file mode 100644
index c221b449fa..0000000000
--- a/compute_kernel_writer/prototype/include/ckw/TensorOperand.h
+++ /dev/null
@@ -1,196 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef CKW_PROTOTYPE_INCLUDE_CKW_TENSOROPERAND_H
-#define CKW_PROTOTYPE_INCLUDE_CKW_TENSOROPERAND_H
-
-#include "ckw/OperandBase.h"
-#include "ckw/TensorInfo.h"
-#include "ckw/TensorTileSampler.h"
-#include "ckw/TileOperand.h"
-#include "ckw/types/DataType.h"
-
-#include <memory>
-
-namespace ckw
-{
-
-class TensorComponentOperand;
-
-// =================================================================================================
-// TensorOperand
-// =================================================================================================
-
-/** Tensor operand */
-class TensorOperand : public OperandBase
-{
-public:
-    /** Initialize a new instance of @ref TensorOperand class.
-     *
-     * @param[in] name         The name of the tensor.
-     * @param[in] info         The tensor info.
-     * @param[in] storage_type The tensor storage type.
-     */
-    TensorOperand(const ::std::string &name, const TensorInfo &info, TensorStorageType storage_type);
-
-    /** No copy constructor. */
-    TensorOperand(const TensorOperand &other) = delete;
-
-    /** No copy assignment. */
-    TensorOperand &operator=(const TensorOperand &other) = delete;
-
-    /** (Internal use only) Create the implementation operand.
-     *
-     * @param[in] writer The implementation kernel writer.
-     */
-    virtual prototype::Operand create_impl_operand(prototype::IGpuKernelWriter *writer) const override;
-
-    /** Get the tensor info. */
-    const TensorInfo &info() const;
-
-    /** Get the tensor info. */
-    TensorInfo &info();
-
-    /** Get the tensor storage type. */
-    TensorStorageType storage_type() const;
-
-    /** Get the data type. */
-    virtual DataType data_type() const override;
-
-    /** Get whether the tensor is compile-time constant. */
-    virtual bool is_constant() const override;
-
-    /** Get the default tile attached to the tensor. */
-    const TileOperand &tile() const;
-
-    /** Get the default tile attached to the tensor. */
-    TileOperand &tile();
-
-    /** Set the default tile attached to the tensor. */
-    TensorOperand &tile(TileOperand &tile);
-
-    /** Get the tensor sampler of the default tile. */
-    const TensorTileSampler &tile_sampler() const;
-
-    /** Get the tensor sampler of the default tile. */
-    TensorTileSampler &tile_sampler();
-
-    /** Set the tensor sampler of the default tile. */
-    TensorOperand &tile_sampler(const TensorTileSampler &value);
-
-    /** Get the operand that contains the stride in y dimension of the tensor. */
-    TensorComponentOperand &stride1();
-
-    /** Get the operand that contains the stride in z dimension of the tensor. */
-    TensorComponentOperand &stride2();
-
-    /** Get the operand that contains the stride in w dimension of the tensor. */
-    TensorComponentOperand &stride3();
-
-    /** Get the operand that contains the stride in w dimension of the tensor. */
-    TensorComponentOperand &stride4();
-
-    /** Get the operand that contains the size of dimension 0 of the tensor. */
-    TensorComponentOperand &dim0();
-
-    /** Get the operand that contains the size of dimension 1 of the tensor. */
-    TensorComponentOperand &dim1();
-
-    /** Get the operand that contains the size of dimension 2 of the tensor. */
-    TensorComponentOperand &dim2();
-
-    /** Get the operand that contains the size of dimension 3 of the tensor. */
-    TensorComponentOperand &dim3();
-
-    /** Get the operand that contains the size of dimension 4 of the tensor. */
-    TensorComponentOperand &dim4();
-
-    /** Get the operand that contains the size of dimensions 1 and 2 collapsed. */
-    TensorComponentOperand &dim1_dim2();
-
-    /** Get the operand that contains the size of dimensions 1, 2 and 3 collapsed. */
-    TensorComponentOperand &dim1_dim2_dim3();
-
-    /** Get the operand that contains the offset in bytes to the first element. */
-    TensorComponentOperand &offset_first_element_in_bytes();
-
-private:
-    TensorInfo        _info;
-    TensorStorageType _storage_type;
-
-    TileOperand      *_tile{nullptr};
-    TensorTileSampler _tile_sampler{};
-
-    ::std::unique_ptr<TensorComponentOperand> _stride1{nullptr};
-    ::std::unique_ptr<TensorComponentOperand> _stride2{nullptr};
-    ::std::unique_ptr<TensorComponentOperand> _stride3{nullptr};
-    ::std::unique_ptr<TensorComponentOperand> _stride4{nullptr};
-    ::std::unique_ptr<TensorComponentOperand> _dim0{nullptr};
-    ::std::unique_ptr<TensorComponentOperand> _dim1{nullptr};
-    ::std::unique_ptr<TensorComponentOperand> _dim2{nullptr};
-    ::std::unique_ptr<TensorComponentOperand> _dim3{nullptr};
-    ::std::unique_ptr<TensorComponentOperand> _dim4{nullptr};
-    ::std::unique_ptr<TensorComponentOperand> _dim1_dim2{nullptr};
-    ::std::unique_ptr<TensorComponentOperand> _dim1_dim2_dim3{nullptr};
-    ::std::unique_ptr<TensorComponentOperand> _offset_first_element_in_bytes{nullptr};
-};
-
-// =================================================================================================
-// TensorComponentOperand
-// =================================================================================================
-
-/** Tile operand that contains tensor information. */
-class TensorComponentOperand : public TileOperand
-{
-public:
-    /** Initialize a new instance of @ref TensorComponentOperand class.
-     *
-     * @param[in] tensor    The tensor operand.
-     * @param[in] component The tensor info component.
-     */
-    TensorComponentOperand(TensorOperand &tensor, TensorComponentType component);
-
-    /** Get the tensor operand. */
-    TensorOperand &tensor();
-
-    /** Get the tensor operand. */
-    const TensorOperand &tensor() const;
-
-    /** Get the tensor component. */
-    TensorComponentType component_type() const;
-
-    /** (Internal use only) Create the implementation operand.
-     *
-     * @param[in] writer The implementation kernel writer.
-     */
-    virtual prototype::Operand create_impl_operand(prototype::IGpuKernelWriter *writer) const override;
-
-private:
-    TensorOperand      &_tensor;
-    TensorComponentType _component;
-};
-
-} // namespace ckw
-
-#endif // CKW_PROTOTYPE_INCLUDE_CKW_TENSOROPERAND_H
diff --git a/compute_kernel_writer/prototype/include/ckw/TensorTileSampler.h b/compute_kernel_writer/prototype/include/ckw/TensorTileSampler.h
deleted file mode 100644
index 606dec3535..0000000000
--- a/compute_kernel_writer/prototype/include/ckw/TensorTileSampler.h
+++ /dev/null
@@ -1,169 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef CKW_PROTOTYPE_INCLUDE_CKW_TENSORTILESAMPLER_H
-#define CKW_PROTOTYPE_INCLUDE_CKW_TENSORTILESAMPLER_H
-
-#include "ckw/types/TensorSamplerTypes.h"
-
-#include <functional>
-
-namespace ckw
-{
-
-class TileOperand;
-
-/** Tensor sampler
- *
- * It contains information about how the result tile should be stored to tensor memory.
- * It can also be used to dictate how the subsequent operators fetch the input tensor.
- */
-class TensorTileSampler
-{
-public:
-    /** Initialize a new instance of @ref TensorSampler class. */
-    TensorTileSampler();
-
-    /** Initialize a new instance of @ref TensorSampler class.
-     *
-     * @param[in] x              The coordinate in the x dimension.
-     * @param[in] y              The coordinate in the y dimension.
-     * @param[in] z              The coordinate in the z dimension.
-     * @param[in] b              The coordinate in the batch dimension.
-     * @param[in] format         The tensor data format.
-     * @param[in] address_mode_x The address mode of the x dimension.
-     * @param[in] address_mode_y The address mode of the y dimension.
-     * @param[in] address_mode_z The address mode of the z dimension.
-     */
-    TensorTileSampler(TileOperand              &x,
-                      TileOperand              &y,
-                      TileOperand              &z,
-                      TileOperand              &b,
-                      TensorSamplerFormat       format,
-                      TensorSamplerAddressModeX address_mode_x,
-                      TensorSamplerAddressModeY address_mode_y,
-                      TensorSamplerAddressModeZ address_mode_z);
-
-    /** Initialize a new instance of @ref TensorSampler class.
-     *
-     * @param[in] x              The coordinate in the x dimension.
-     * @param[in] y              The coordinate in the y dimension.
-     * @param[in] z              The coordinate in the z dimension.
-     * @param[in] b              The coordinate in the batch dimension.
-     * @param[in] height         The height of the tile.
-     * @param[in] width          The width of the tile.
-     * @param[in] format         The tensor data format.
-     * @param[in] address_mode_x The address mode of the x dimension.
-     * @param[in] address_mode_y The address mode of the y dimension.
-     * @param[in] address_mode_z The address mode of the z dimension.
-     */
-    TensorTileSampler(TileOperand              &x,
-                      TileOperand              &y,
-                      TileOperand              &z,
-                      TileOperand              &b,
-                      int32_t                   height,
-                      int32_t                   width,
-                      TensorSamplerFormat       format,
-                      TensorSamplerAddressModeX address_mode_x,
-                      TensorSamplerAddressModeY address_mode_y,
-                      TensorSamplerAddressModeZ address_mode_z);
-
-    /** Get the coordinate in the x dimension. */
-    const TileOperand &x() const;
-
-    /** Set the coordinate in the x dimension. */
-    TensorTileSampler &x(TileOperand &x);
-
-    /** Get the coordinate in the y dimension. */
-    const TileOperand &y() const;
-
-    /** Set the coordinate in the y dimension. */
-    TensorTileSampler &y(TileOperand &y);
-
-    /** Get the coordinate in the z dimension. */
-    const TileOperand &z() const;
-
-    /** Set the coordinate in the z dimension. */
-    TensorTileSampler &z(TileOperand &z);
-
-    /** Get the coordinate in the batch dimension. */
-    const TileOperand &b() const;
-
-    /** Set the coordinate in the batch dimension. */
-    TensorTileSampler &b(TileOperand &b);
-
-    /** Get the width of the tile. */
-    int32_t width() const;
-
-    /** Set the width of the tile. */
-    TensorTileSampler &width(int32_t width);
-
-    /** Get the height of the tile. */
-    int32_t height() const;
-
-    /** Set the height of the tile. */
-    TensorTileSampler &height(int32_t height);
-
-    /** Get the format of the tensor. */
-    TensorSamplerFormat format() const;
-
-    /** Set the format of the tensor. */
-    TensorTileSampler &format(TensorSamplerFormat format);
-
-    /** Get the address mode of the x dimension. */
-    TensorSamplerAddressModeX address_mode_x() const;
-
-    /** Set the address mode of the x-dimension. */
-    TensorTileSampler &address_mode_x(TensorSamplerAddressModeX address_mode_x);
-
-    /** Get the address mode of the y dimension. */
-    TensorSamplerAddressModeY address_mode_y() const;
-
-    /** Set the address mode of the y dimension. */
-    TensorTileSampler &address_mode_y(TensorSamplerAddressModeY address_mode_y);
-
-    /** Get the address mode of the z dimension. */
-    TensorSamplerAddressModeZ address_mode_z() const;
-
-    /** Set the address mode of the z dimension. */
-    TensorTileSampler &address_mode_z(TensorSamplerAddressModeZ address_mode_z);
-
-private:
-    TileOperand *_x{nullptr};
-    TileOperand *_y{nullptr};
-    TileOperand *_z{nullptr};
-    TileOperand *_b{nullptr};
-
-    int32_t _height{0};
-    int32_t _width{0};
-
-    TensorSamplerFormat       _format{TensorSamplerFormat::Unknown};
-    TensorSamplerAddressModeX _address_mode_x{TensorSamplerAddressModeX::Unknown};
-    TensorSamplerAddressModeY _address_mode_y{TensorSamplerAddressModeY::Unknown};
-    TensorSamplerAddressModeZ _address_mode_z{TensorSamplerAddressModeZ::Unknown};
-};
-
-} // namespace ckw
-
-#endif // CKW_PROTOTYPE_INCLUDE_CKW_TENSORTILESAMPLER_H
diff --git a/compute_kernel_writer/prototype/include/ckw/TileInfo.h b/compute_kernel_writer/prototype/include/ckw/TileInfo.h
deleted file mode 100644
index e0d064169e..0000000000
--- a/compute_kernel_writer/prototype/include/ckw/TileInfo.h
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef CKW_PROTOTYPE_INCLUDE_CKW_TILEINFO_H
-#define CKW_PROTOTYPE_INCLUDE_CKW_TILEINFO_H
-
-#include "ckw/types/DataType.h"
-
-#include <array>
-#include <cstdint>
-
-namespace ckw
-{
-// Constants to access the tile width and height in the TileShape
-constexpr int32_t kTileWidthIdx  = 0;
-constexpr int32_t kTileHeightIdx = 1;
-
-/** Compute Kernel Writer tile shape. It is used to define the shape of the tile */
-using TileShape = std::array<int32_t, 2>;
-
-/** Compute Kernel Writer tile info */
-class TileInfo
-{
-public:
-    /** Constructor used to initialize a scalar variable with a given data type
-     *
-     * @param[in] dt Tile data type
-     */
-    TileInfo(DataType dt);
-
-    /** Constructor used to initialize a vector with a given data type and vector length.
-     *
-     * @param[in] dt Tile data type
-     * @param[in] w  Tile width (or vector length)
-     */
-    TileInfo(DataType dt, int32_t w);
-
-    /** Constructor used to initialize a tile with a given data type and tile sizes.
-     *
-     * @param[in] dt Tile data type
-     * @param[in] h  Tile height
-     * @param[in] w  Tile width
-     */
-    TileInfo(DataType dt, int32_t h, int32_t w);
-
-    /** Set width */
-    TileInfo &width(int32_t w);
-
-    /** Get width */
-    int32_t width() const;
-
-    /** Set height */
-    TileInfo &height(int32_t h);
-
-    /** Get height */
-    int32_t height() const;
-
-    /** Set data type */
-    TileInfo &data_type(DataType dt);
-
-    /** Get data type */
-    DataType data_type() const;
-
-private:
-    DataType  _dt{DataType::Unknown};
-    TileShape _shape{};
-};
-
-} // namespace ckw
-
-#endif /* COMPUTE_KERNEL_WRITER_INCLUDE_CKW_TILEINFO_H */
diff --git a/compute_kernel_writer/prototype/include/ckw/TileOperand.h b/compute_kernel_writer/prototype/include/ckw/TileOperand.h
deleted file mode 100644
index 24ee373a24..0000000000
--- a/compute_kernel_writer/prototype/include/ckw/TileOperand.h
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef CKW_PROTOTYPE_INCLUDE_CKW_TILEOPERAND_H
-#define CKW_PROTOTYPE_INCLUDE_CKW_TILEOPERAND_H
-
-#include "ckw/Error.h"
-#include "ckw/OperandBase.h"
-#include "ckw/ScalarValue.h"
-#include "ckw/TileInfo.h"
-
-#include <vector>
-
-namespace ckw
-{
-
-class Kernel;
-
-using TileContainer = std::vector<std::vector<std::string>>;
-
-/** Tile operand which can be either scalar, vector or 2D tile. */
-class TileOperand : public OperandBase
-{
-public:
-    /** Initialize a new instance of @ref TileOperand class with the tile information.
-     *
-     * @param[in] name      The name of the tile.
-     * @param[in] tile_info The tile info.
-     */
-    TileOperand(const ::std::string &name, const TileInfo &tile_info);
-
-    /** Initialize a new instance of @ref TileOperand for scalar variable.
-     *
-     * @param[in] name      The name of the tile.
-     * @param[in] data_type The data type of the tile.
-     */
-    TileOperand(const ::std::string &name, DataType data_type);
-
-    /** Initialize a new instance of @ref TileOperand for compile-time constant scalar variable.
-     *
-     * @param[in] name  The name of the tile.
-     * @param[in] value The value of the tile.
-     */
-    TileOperand(const ::std::string &name, int32_t value);
-
-    /** Initialize a new instance of @ref TileOperand for compile-time constant scalar variable.
-     *
-     * @param[in] name  The name of the tile.
-     * @param[in] value The value of the tile.
-     */
-    TileOperand(const ::std::string &name, float value);
-
-    /** Initialize a new instance of @ref TileOperand for compile-time constant variable.
-     *
-     * @param[in] name  The name of the tile.
-     * @param[in] value The value of the tile.
-     */
-    TileOperand(const ::std::string &name, const ::std::vector<std::vector<std::string>> &value, DataType dt);
-
-    /** Prohibit copy of tile operand. */
-    TileOperand(const TileOperand &) = delete;
-
-    /** Prohibit copy of tile operand. */
-    TileOperand &operator=(const TileOperand &) = delete;
-
-    /** (Internal use only) Create the implementation operand.
-     *
-     * @param[in] writer The implementation kernel writer.
-     */
-    virtual prototype::Operand create_impl_operand(prototype::IGpuKernelWriter *writer) const override;
-
-    /** Get the tile info. */
-    const TileInfo &tile_info() const;
-
-    /** Get the data type of the tile. */
-    virtual DataType data_type() const override;
-
-    /** Get whether the tile is compile-time constant. */
-    virtual bool is_constant() const override;
-
-    /** Get whether the tile is a scalar value. */
-    bool is_scalar() const;
-
-    /** Get the scalar value of the tile.
-     *
-     * The tile must have the shape of 1, 1 (i.e. scalar).
-     *
-     * @return Scalar value as a string.
-     */
-    std::string scalar_value() const;
-
-    /** Get the values of the tile.
-     *
-     * @return 2D container of values.
-     */
-    const TileContainer &value() const;
-
-private:
-    TileInfo      _info;
-    TileContainer _value{};
-    bool          _constant;
-};
-
-} // namespace ckw
-
-#endif // CKW_PROTOTYPE_INCLUDE_CKW_TILEOPERAND_H
diff --git a/compute_kernel_writer/prototype/include/ckw/types/ConvertPolicy.h b/compute_kernel_writer/prototype/include/ckw/types/ConvertPolicy.h
deleted file mode 100644
index 2a198507eb..0000000000
--- a/compute_kernel_writer/prototype/include/ckw/types/ConvertPolicy.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef CKW_INCLUDE_CKW_CONVERTPOLICY_H
-#define CKW_INCLUDE_CKW_CONVERTPOLICY_H
-
-#include <cstdint>
-
-namespace ckw
-{
-
-enum class ConvertPolicy : int32_t
-{
-    None     = 0, // No policy specified.
-    Saturate = 1, // Saturated.
-};
-
-} // namespace ckw
-
-#endif //CKW_INCLUDE_CKW_CONVERTPOLICY_H
diff --git a/compute_kernel_writer/prototype/include/ckw/types/DataType.h b/compute_kernel_writer/prototype/include/ckw/types/DataType.h
deleted file mode 100644
index 3447dd61d6..0000000000
--- a/compute_kernel_writer/prototype/include/ckw/types/DataType.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
-* Copyright (c) 2023 Arm Limited.
-*
-* SPDX-License-Identifier: MIT
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to
-* deal in the Software without restriction, including without limitation the
-* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
-* sell copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in all
-* copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-
-#ifndef CKW_INCLUDE_CKW_DATATYPE_H
-#define CKW_INCLUDE_CKW_DATATYPE_H
-
-#include <cstdint>
-
-namespace ckw
-{
-
-/** Compute Kernel Writer data types. This data type is used by the code variables and tensor arguments. */
-enum class DataType : int32_t
-{
-    Unknown = 0x00,
-    Fp32    = 0x11,
-    Fp16    = 0x12,
-    Int32   = 0x21,
-    Int16   = 0x22,
-    Int8    = 0x24,
-    Uint32  = 0x31,
-    Uint16  = 0x32,
-    Uint8   = 0x34,
-    Bool    = 0x41
-};
-
-} // namespace ckw
-
-#endif //CKW_INCLUDE_CKW_DATATYPE_H
diff --git a/compute_kernel_writer/prototype/include/ckw/types/Functions.h b/compute_kernel_writer/prototype/include/ckw/types/Functions.h
deleted file mode 100644
index c6afaa0ac8..0000000000
--- a/compute_kernel_writer/prototype/include/ckw/types/Functions.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
-* Copyright (c) 2023 Arm Limited.
-*
-* SPDX-License-Identifier: MIT
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to
-* deal in the Software without restriction, including without limitation the
-* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
-* sell copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in all
-* copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-
-#ifndef CKW_INCLUDE_CKW_FUNCTIONS_H
-#define CKW_INCLUDE_CKW_FUNCTIONS_H
-
-#include <cstdint>
-
-namespace ckw
-{
-
-enum class UnaryFunction : int32_t
-{
-    Exp   = 0x0000,
-    Tanh  = 0x0001,
-    Sqrt  = 0x0002,
-    Erf   = 0x0003,
-    Fabs  = 0x0004,
-    Log   = 0x0006,
-    Round = 0x0007,
-    Floor = 0x0008,
-
-    // Misc
-    SizeOf = 0x0009,
-};
-
-enum class BinaryFunction : int32_t
-{
-    Min = 0x0000,
-    Max = 0x0001,
-};
-
-enum class TernaryFunction : int32_t
-{
-    Select = 0x0000,
-    Clamp  = 0x0001,
-};
-
-} // namespace ckw
-
-#endif //CKW_INCLUDE_CKW_FUNCTIONS_H
diff --git a/compute_kernel_writer/prototype/include/ckw/types/GpuTargetLanguage.h b/compute_kernel_writer/prototype/include/ckw/types/GpuTargetLanguage.h
deleted file mode 100644
index 6c08617949..0000000000
--- a/compute_kernel_writer/prototype/include/ckw/types/GpuTargetLanguage.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef CKW_INCLUDE_CKW_GPUTARGETLANGUAGE_H
-#define CKW_INCLUDE_CKW_GPUTARGETLANGUAGE_H
-
-#include <cstdint>
-
-namespace ckw
-{
-
-enum class GpuTargetLanguage : int32_t
-{
-    Unknown,
-    OpenCL
-};
-
-} // namespace ckw
-
-#endif //CKW_INCLUDE_CKW_GPUTARGETLANGUAGE_H
diff --git a/compute_kernel_writer/prototype/include/ckw/types/Operators.h b/compute_kernel_writer/prototype/include/ckw/types/Operators.h
deleted file mode 100644
index b560996837..0000000000
--- a/compute_kernel_writer/prototype/include/ckw/types/Operators.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
-* Copyright (c) 2023 Arm Limited.
-*
-* SPDX-License-Identifier: MIT
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to
-* deal in the Software without restriction, including without limitation the
-* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
-* sell copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in all
-* copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
-
-#ifndef CKW_INCLUDE_CKW_OPERATORS_H
-#define CKW_INCLUDE_CKW_OPERATORS_H
-
-#include <cstdint>
-
-namespace ckw
-{
-
-enum class UnaryOp : int32_t
-{
-    LogicalNot = 0x0000, // !
-    BitwiseNot = 0x0001, // ~
-    Negate     = 0x0002, // -
-};
-
-/* Binary operations
-*/
-enum class BinaryOp : int32_t
-{
-    // Elementwise
-    Add = 0x0000, // +
-    Sub = 0x0001, // -
-    Mul = 0x0002, // *
-    Div = 0x0003, // /
-    Mod = 0x0004, // %
-    // Relational
-    Equal        = 0x1000, // ==
-    Less         = 0x1001, // <
-    LessEqual    = 0x1002, // <=
-    Greater      = 0x1003, // >
-    GreaterEqual = 0x1004, // >=
-    // Algebra
-    MatMul_Nt_Nt = 0x2000, // X
-    MatMul_Nt_T  = 0x2001, // X
-    MatMul_T_Nt  = 0x2002, // X
-    MatMul_T_T   = 0x2003, // X
-    Dot          = 0x2004, // .
-    // Logical
-    LogicalAnd = 0x3000, // &&
-    LogicalOr  = 0x3001, // ||
-    // Bitwise
-    BitwiseXOR = 0x4000, // ^
-};
-
-enum class AssignmentOp : int32_t
-{
-    // Unary
-    Increment = 0x0000, // +=
-    Decrement = 0x0001, // -=
-};
-
-} // namespace ckw
-
-#endif //CKW_INCLUDE_CKW_OPERATORS_H
diff --git a/compute_kernel_writer/prototype/include/ckw/types/TensorSamplerTypes.h b/compute_kernel_writer/prototype/include/ckw/types/TensorSamplerTypes.h
deleted file mode 100644
index 63405a0764..0000000000
--- a/compute_kernel_writer/prototype/include/ckw/types/TensorSamplerTypes.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef CKW_INCLUDE_CKW_TENSORSAMPLERTYPES_H
-#define CKW_INCLUDE_CKW_TENSORSAMPLERTYPES_H
-
-#include <cstdint>
-
-namespace ckw
-{
-
-enum class TensorSamplerFormat : int32_t
-{
-    Unknown = 0,
-    C_WH_1  = 1,
-    C_W_H   = 2
-};
-
-enum class TensorSamplerAddressModeX : int32_t
-{
-    Unknown = 0,
-    None    = 1, // The user guarantees that the X coordinate is always in-bound
-    OverlappingMin =
-        2 // (FIXED shapes only) Reduce the load/store length when x == 0 (MIN). The load length will be width % original length
-    // Leftover elements can be handled using overlapping. This involves processing some of the elements in the array twice.
-};
-
-enum class TensorSamplerAddressModeY : int32_t
-{
-    Unknown = 0,
-    None    = 1, // The user guarantees that the Y coordinate is always in-bound
-    OverlappingMin =
-        2, // (FIXED shapes only) Reduce the load/store length when x == 0 (MIN). The load length will be width % original length
-    Skip = 3, // Skip the read/write
-    SkipMinEdgeOnly =
-        4, // Skip greater than or equal to max only. The user guarantees that the Y coordinate is always >= 0
-    SkipMaxEdgeOnly    = 5, // Skip less than 0 only
-    ClampToNearest     = 6, // Clamp the coordinate to nearest edge (0 or max value allowed on Y)
-    ClampToMinEdgeOnly = 7, // Clamp the negative coordinate to 0 only. Therefore, we expect Y to be always < MAX
-    ClampToMaxEdgeOnly = 8, // Clamp the coordinate to the max value allowed on Y only. We expect Y to be always >= 0
-    ClampToBorder      = 9, // Clamp to border which always has 0 value
-    ClampToBorderMinEdgeOnly = 10,
-    ClampToBorderMaxEdgeOnly = 11
-};
-
-enum class TensorSamplerAddressModeZ : int32_t
-{
-    Unknown = 0,
-    None    = 1, // The user guarantees that the Y coordinate is always in-bound
-    Skip    = 3, // Skip the read/write
-    SkipMinEdgeOnly =
-        4, // Skip greater than or equal to max only. The user guarantees that the Y coordinate is always >= 0
-    SkipMaxEdgeOnly    = 5, // Skip less than 0 only
-    ClampToNearest     = 6, // Clamp the coordinate to nearest edge (0 or max value allowed on Y)
-    ClampToMinEdgeOnly = 7, // Clamp the negative coordinate to 0 only. Therefore, we expect Y to be always < MAX
-    ClampToMaxEdgeOnly = 8, // Clamp the coordinate to the max value allowed on Y only. We expect Y to be always >= 0
-};
-
-} // namespace ckw
-
-#endif //CKW_INCLUDE_CKW_TENSORSAMPLERTYPES_H
diff --git a/compute_kernel_writer/prototype/src/Kernel.cpp b/compute_kernel_writer/prototype/src/Kernel.cpp
deleted file mode 100644
index 6228ed17d0..0000000000
--- a/compute_kernel_writer/prototype/src/Kernel.cpp
+++ /dev/null
@@ -1,163 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "ckw/Kernel.h"
-
-#include "ckw/TensorOperand.h"
-#include "ckw/types/GpuTargetLanguage.h"
-
-#include "src/Prototype.h"
-
-namespace ckw
-{
-
-Kernel::Kernel(GpuTargetLanguage language) : Kernel{"unnamed", language}
-{
-}
-
-Kernel::Kernel(const char *name, GpuTargetLanguage language)
-    : _name(name),
-      _kernel(std::make_unique<prototype::GpuKernelWriterDataHolder>(language)),
-      _operands{},
-      _tensor_id_operands{}
-{
-}
-
-Kernel::~Kernel()
-{
-}
-
-const std::string &Kernel::name() const
-{
-    return _name;
-}
-
-void Kernel::name(const std::string &name)
-{
-    _name = name;
-}
-std::vector<KernelArgument> Kernel::arguments() const
-{
-    std::vector<KernelArgument> arguments;
-
-    const auto impl_args = _kernel->arguments.tensor_argument_declarations();
-
-    for (auto tensor_arg : impl_args)
-    {
-        auto tensor = _tensor_id_operands.at(tensor_arg->format().id);
-        arguments.push_back(*tensor);
-
-        for (auto component_arg : tensor_arg->component_declarations())
-        {
-            switch (component_arg)
-            {
-                case TensorComponentType::OffsetFirstElement:
-                    arguments.push_back(tensor->offset_first_element_in_bytes());
-                    break;
-
-                case TensorComponentType::Stride1:
-                    arguments.push_back(tensor->stride1());
-                    break;
-
-                case TensorComponentType::Stride2:
-                    arguments.push_back(tensor->stride2());
-                    break;
-
-                case TensorComponentType::Stride3:
-                    arguments.push_back(tensor->stride3());
-                    break;
-
-                case TensorComponentType::Stride4:
-                    arguments.push_back(tensor->stride4());
-                    break;
-
-                case TensorComponentType::Dim0:
-                    arguments.push_back(tensor->dim0());
-                    break;
-
-                case TensorComponentType::Dim1:
-                    arguments.push_back(tensor->dim1());
-                    break;
-
-                case TensorComponentType::Dim2:
-                    arguments.push_back(tensor->dim2());
-                    break;
-
-                case TensorComponentType::Dim3:
-                    arguments.push_back(tensor->dim3());
-                    break;
-
-                case TensorComponentType::Dim4:
-                    arguments.push_back(tensor->dim4());
-                    break;
-
-                case TensorComponentType::Dim1xDim2:
-                    arguments.push_back(tensor->dim1_dim2());
-                    break;
-
-                case TensorComponentType::Dim1xDim2xDim3:
-                    arguments.push_back(tensor->dim1_dim2_dim3());
-                    break;
-
-                default:
-                    CKW_ASSERT(false);
-            }
-        }
-    }
-
-    return arguments;
-}
-
-TileOperand &Kernel::register_operand(std::unique_ptr<TileOperand> operand)
-{
-    const auto &name = operand->name();
-    auto        ptr  = operand.get();
-
-    CKW_ASSERT(_operands.find(name) == _operands.end());
-    _operands[name] = std::move(operand);
-
-    return *ptr;
-}
-
-TensorOperand &Kernel::register_operand(std::unique_ptr<TensorOperand> operand)
-{
-    const auto  id   = operand->info().id();
-    const auto &name = operand->name();
-    auto        ptr  = operand.get();
-
-    CKW_ASSERT(_tensor_id_operands.find(id) == _tensor_id_operands.end());
-    CKW_ASSERT(_operands.find(name) == _operands.end());
-
-    _tensor_id_operands[id] = operand.get();
-    _operands[name]         = std::move(operand);
-
-    return *ptr;
-}
-
-prototype::GpuKernelWriterDataHolder *Kernel::impl()
-{
-    return _kernel.get();
-}
-
-} // namespace ckw
diff --git a/compute_kernel_writer/prototype/src/KernelArgument.cpp b/compute_kernel_writer/prototype/src/KernelArgument.cpp
deleted file mode 100644
index 24ace28eb3..0000000000
--- a/compute_kernel_writer/prototype/src/KernelArgument.cpp
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "ckw/KernelArgument.h"
-
-#include "ckw/Error.h"
-#include "ckw/TensorOperand.h"
-
-namespace ckw
-{
-
-KernelArgument::KernelArgument(TensorOperand &tensor) : _type(Type::TensorStorage), _id(tensor.info().id())
-{
-    _sub_id.tensor_storage_type = tensor.storage_type();
-}
-
-KernelArgument::KernelArgument(TensorComponentOperand &tensor_component)
-    : _type(Type::TensorComponent), _id(tensor_component.tensor().info().id())
-{
-    _sub_id.tensor_component_type = tensor_component.component_type();
-}
-
-KernelArgument::Type KernelArgument::type() const
-{
-    return _type;
-}
-
-int32_t KernelArgument::id() const
-{
-    return _id;
-}
-
-TensorStorageType KernelArgument::tensor_storage_type() const
-{
-    CKW_ASSERT(_type == Type::TensorStorage);
-    return _sub_id.tensor_storage_type;
-}
-
-TensorComponentType KernelArgument::tensor_component_type() const
-{
-    CKW_ASSERT(_type == Type::TensorComponent);
-    return _sub_id.tensor_component_type;
-}
-
-} // namespace ckw
diff --git a/compute_kernel_writer/prototype/src/KernelWriter.cpp b/compute_kernel_writer/prototype/src/KernelWriter.cpp
deleted file mode 100644
index 9f58d9fefa..0000000000
--- a/compute_kernel_writer/prototype/src/KernelWriter.cpp
+++ /dev/null
@@ -1,371 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "ckw/KernelWriter.h"
-
-#include "ckw/Error.h"
-#include "ckw/TensorInfo.h"
-#include "ckw/TensorOperand.h"
-
-#include "src/Prototype.h"
-
-#include <sstream>
-
-namespace ckw
-{
-
-namespace
-{
-
-inline prototype::TensorInfo create_impl_tensor_info(const TensorInfo &info)
-{
-    return prototype::TensorInfo{info.shape(), info.data_type(), info.data_layout(), info.id()};
-}
-
-} // namespace
-
-// =================================================================================================
-// Constructors and destructor
-// =================================================================================================
-
-KernelWriter::KernelWriter(Kernel &kernel)
-    : _kernel(&kernel),
-      _impl_attr(std::make_unique<prototype::GpuKernelWriterAttribute>()),
-      _impl(prototype::GpuKernelWriterFactory::create(_impl_attr.get(), kernel.impl()))
-{
-    _impl->set_IdSpace(1);
-}
-
-KernelWriter::~KernelWriter()
-{
-}
-
-// =================================================================================================
-// Scope management
-// =================================================================================================
-
-int32_t KernelWriter::id_space() const
-{
-    return _id_space;
-}
-
-KernelWriter &KernelWriter::id_space(int32_t id_space)
-{
-    CKW_ASSERT(id_space <= _max_id_space);
-
-    _id_space = id_space;
-    return *this;
-}
-
-int32_t KernelWriter::next_id_space()
-{
-    id_space(++_max_id_space);
-    return _id_space;
-}
-
-// =================================================================================================
-// Tensor and tile declaration
-// =================================================================================================
-
-TensorOperand &
-KernelWriter::declare_tensor_argument(const std::string &name, const TensorInfo &info, TensorStorageType storage_type)
-{
-    const auto var_name = generate_variable_name(name);
-
-    _impl->declare_argument(var_name, create_impl_tensor_info(info));
-
-    auto &operand = _kernel->register_operand(std::make_unique<TensorOperand>(var_name, info, storage_type));
-
-    return operand;
-}
-
-TileOperand &KernelWriter::declare_tile_argument(const std::string &name, int32_t value)
-{
-    const auto var_name = generate_variable_name(name);
-
-    auto &operand = _kernel->register_operand(std::make_unique<TileOperand>(var_name, value));
-
-    return operand;
-}
-
-std::string KernelWriter::generate_variable_name(const std::string &name) const
-{
-    std::stringstream var_name;
-
-    var_name << "_" << _id_space << "_" << name;
-
-    return var_name.str();
-}
-
-TileOperand &KernelWriter::declare_tile_operand(std::unique_ptr<TileOperand> operand_ptr)
-{
-    auto       &operand = _kernel->register_operand(std::move(operand_ptr));
-    const auto &name    = operand.name();
-
-    if (!operand.is_constant())
-    {
-        const auto &info = operand.tile_info();
-
-        _impl->declare_tile(name, prototype::TileInfo(info.data_type(), info.width(), info.height()));
-    }
-    else
-    {
-        _impl->declare_const_tile(name, operand.value(), operand.data_type());
-    }
-
-    return operand;
-}
-
-// =================================================================================================
-// Load and store
-// =================================================================================================
-
-void KernelWriter::op_load(TileOperand             &tile,
-                           const TensorOperand     &tensor,
-                           const TensorTileSampler &sampler,
-                           const TileOperand       &dilation_y)
-{
-    prototype::TensorOperand impl_tensor(
-        tensor.name(),
-        prototype::GpuSampler{sampler.format(), prototype::to_gpu_tensor_storage(tensor.storage_type()),
-                              sampler.address_mode_x(), sampler.address_mode_y(), sampler.address_mode_z()});
-
-    auto impl_x = sampler.x().create_impl_operand(_impl.get());
-    auto impl_y = sampler.y().create_impl_operand(_impl.get());
-    auto impl_z = sampler.z().create_impl_operand(_impl.get());
-    auto impl_b = sampler.b().create_impl_operand(_impl.get());
-
-    auto impl_dilation_y = dilation_y.create_impl_operand(_impl.get());
-
-    auto impl_dst = tile.create_impl_operand(_impl.get());
-
-    _impl->op_load_immediate(impl_tensor, impl_dst, impl_x, impl_y, impl_z, impl_b, impl_dilation_y);
-}
-
-void KernelWriter::op_load_indirect(TileOperand &tile, const TensorOperand &tensor, const TensorTileSampler &sampler)
-{
-    prototype::TensorOperand impl_tensor(
-        tensor.name(),
-        prototype::GpuSampler{sampler.format(), prototype::to_gpu_tensor_storage(tensor.storage_type()),
-                              sampler.address_mode_x(), sampler.address_mode_y(), sampler.address_mode_z()});
-
-    auto impl_x = sampler.x().create_impl_operand(_impl.get());
-    auto impl_y = sampler.y().create_impl_operand(_impl.get());
-    auto impl_z = sampler.z().create_impl_operand(_impl.get());
-    auto impl_b = sampler.b().create_impl_operand(_impl.get());
-
-    auto impl_dst = tile.create_impl_operand(_impl.get());
-
-    _impl->op_load_indirect(impl_tensor, impl_dst, impl_x, impl_y, impl_z, impl_b);
-}
-
-void KernelWriter::util_get_indirect_buffer(TileOperand             &tile,
-                                            const TensorOperand     &tensor,
-                                            const TensorTileSampler &sampler,
-                                            const TileOperand       &x,
-                                            const TileOperand       &y,
-                                            const TileOperand       &x_off,
-                                            const TileOperand       &y_off)
-{
-    prototype::TensorOperand impl_tensor(
-        tensor.name(),
-        prototype::GpuSampler{sampler.format(), prototype::to_gpu_tensor_storage(tensor.storage_type()),
-                              sampler.address_mode_x(), sampler.address_mode_y(), sampler.address_mode_z()});
-
-    auto impl_x     = x.create_impl_operand(_impl.get());
-    auto impl_y     = y.create_impl_operand(_impl.get());
-    auto impl_x_off = x_off.create_impl_operand(_impl.get());
-    auto impl_y_off = y_off.create_impl_operand(_impl.get());
-
-    auto impl_dst = tile.create_impl_operand(_impl.get());
-
-    _impl->util_get_indirect_buffer(impl_dst, impl_tensor, impl_x, impl_y, impl_x_off, impl_y_off);
-}
-
-void KernelWriter::op_store(TensorOperand &tensor, const TileOperand &tile, const TensorTileSampler &sampler)
-{
-    prototype::TensorOperand impl_tensor(
-        tensor.name(),
-        prototype::GpuSampler{sampler.format(), prototype::to_gpu_tensor_storage(tensor.storage_type()),
-                              sampler.address_mode_x(), sampler.address_mode_y(), sampler.address_mode_z()});
-    auto impl_src = tile.create_impl_operand(_impl.get());
-    auto impl_x   = sampler.x().create_impl_operand(_impl.get());
-    auto impl_y   = sampler.y().create_impl_operand(_impl.get());
-    auto impl_z   = sampler.z().create_impl_operand(_impl.get());
-    auto impl_b   = sampler.b().create_impl_operand(_impl.get());
-
-    _impl->op_store_immediate(impl_tensor, impl_src, impl_x, impl_y, impl_z, impl_b);
-}
-
-// =================================================================================================
-// Data processing
-// =================================================================================================
-
-void KernelWriter::op_assign(const TileOperand &dst, const TileOperand &src)
-{
-    auto impl_dst = dst.create_impl_operand(_impl.get());
-    auto impl_src = src.create_impl_operand(_impl.get());
-
-    _impl->op_assign(impl_dst, impl_src);
-}
-
-void KernelWriter::op_cast_expression(const TileOperand &dst, const TileOperand &src, const ConvertPolicy policy)
-{
-    auto impl_dst = dst.create_impl_operand(_impl.get());
-    auto impl_src = src.create_impl_operand(_impl.get());
-
-    _impl->op_cast_expression(impl_dst, impl_src, policy);
-}
-
-void KernelWriter::op_binary_expression(const TileOperand &dst,
-                                        const TileOperand &lhs,
-                                        BinaryOp           op,
-                                        const TileOperand &rhs)
-{
-    auto impl_lhs = lhs.create_impl_operand(_impl.get());
-    auto impl_rhs = rhs.create_impl_operand(_impl.get());
-    auto impl_dst = dst.create_impl_operand(_impl.get());
-
-    _impl->op_binary_expression(impl_dst, impl_lhs, op, impl_rhs);
-}
-
-void KernelWriter::op_unary_expression(const TileOperand &dst, UnaryOp op, const TileOperand &src)
-{
-    auto impl_dst = dst.create_impl_operand(_impl.get());
-    auto impl_src = src.create_impl_operand(_impl.get());
-
-    _impl->op_unary_expression(impl_dst, op, impl_src);
-}
-
-void KernelWriter::op_unary_elementwise_function(const TileOperand &dst, UnaryFunction opcode, const TileOperand &src)
-{
-    auto impl_dst = dst.create_impl_operand(_impl.get());
-    auto impl_src = src.create_impl_operand(_impl.get());
-
-    _impl->op_unary_elementwise_function(impl_dst, opcode, impl_src);
-}
-
-void KernelWriter::op_binary_elementwise_function(const TileOperand &dst,
-                                                  BinaryFunction     opcode,
-                                                  const TileOperand &first,
-                                                  const TileOperand &second)
-{
-    auto impl_dst    = dst.create_impl_operand(_impl.get());
-    auto impl_first  = first.create_impl_operand(_impl.get());
-    auto impl_second = second.create_impl_operand(_impl.get());
-
-    _impl->op_binary_elementwise_function(impl_dst, opcode, impl_first, impl_second);
-}
-
-void KernelWriter::op_ternary_elementwise_function(const TileOperand &dst,
-                                                   TernaryFunction    opcode,
-                                                   const TileOperand &first,
-                                                   const TileOperand &second,
-                                                   const TileOperand &third)
-{
-    auto impl_dst    = dst.create_impl_operand(_impl.get());
-    auto impl_first  = first.create_impl_operand(_impl.get());
-    auto impl_second = second.create_impl_operand(_impl.get());
-    auto impl_third  = third.create_impl_operand(_impl.get());
-
-    _impl->op_ternary_elementwise_function(impl_dst, opcode, impl_first, impl_second, impl_third);
-}
-
-void KernelWriter::op_if(const TileOperand &lhs, BinaryOp op, const TileOperand &rhs, const std::function<void()> &body)
-{
-    auto impl_lhs = lhs.create_impl_operand(_impl.get());
-    auto impl_rhs = rhs.create_impl_operand(_impl.get());
-
-    _impl->op_if_header(impl_lhs, op, impl_rhs);
-    _impl->compound_statement_begin();
-    body();
-    _impl->compound_statement_end();
-}
-
-void KernelWriter::op_else_if(const TileOperand           &lhs,
-                              BinaryOp                     op,
-                              const TileOperand           &rhs,
-                              const std::function<void()> &body)
-{
-    auto impl_lhs = lhs.create_impl_operand(_impl.get());
-    auto impl_rhs = rhs.create_impl_operand(_impl.get());
-
-    _impl->op_else_if_header(impl_lhs, op, impl_rhs);
-    _impl->compound_statement_begin();
-    body();
-    _impl->compound_statement_end();
-}
-
-void KernelWriter::op_else(const std::function<void()> &body)
-{
-    _impl->op_else_header();
-    _impl->compound_statement_begin();
-    body();
-    _impl->compound_statement_end();
-}
-
-void KernelWriter::op_for_loop(const TileOperand           &var_name,
-                               BinaryOp                     cond_op,
-                               const TileOperand           &cond_value_name,
-                               const TileOperand           &update_var_name,
-                               AssignmentOp                 update_op,
-                               const TileOperand           &update_value_name,
-                               const std::function<void()> &body)
-{
-    auto impl_var_name          = var_name.create_impl_operand(_impl.get());
-    auto impl_cond_value_name   = cond_value_name.create_impl_operand(_impl.get());
-    auto impl_update_var_name   = update_var_name.create_impl_operand(_impl.get());
-    auto impl_update_value_name = update_value_name.create_impl_operand(_impl.get());
-
-    _impl->op_for_loop_header(impl_var_name, cond_op, impl_cond_value_name, impl_update_var_name, update_op,
-                              impl_update_value_name);
-    _impl->compound_statement_begin();
-    body();
-    _impl->compound_statement_end();
-}
-
-// =================================================================================================
-// Misc
-// =================================================================================================
-
-void KernelWriter::op_get_global_id(const TileOperand &dst, int32_t dim)
-{
-    _impl->op_get_global_id(prototype::Operand(dst.name()), dim);
-}
-
-void KernelWriter::op_return()
-{
-    _impl->op_return();
-}
-
-// =================================================================================================
-// Code generation
-// =================================================================================================
-
-std::string KernelWriter::generate_code()
-{
-    return prototype::generate_code(*_kernel->impl(), _kernel->name());
-}
-
-} // namespace ckw
diff --git a/compute_kernel_writer/prototype/src/OperandBase.cpp b/compute_kernel_writer/prototype/src/OperandBase.cpp
deleted file mode 100644
index e0617fdc06..0000000000
--- a/compute_kernel_writer/prototype/src/OperandBase.cpp
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "ckw/OperandBase.h"
-
-namespace ckw
-{
-
-OperandBase::OperandBase(const std::string &name) : _name(name)
-{
-}
-
-OperandBase::~OperandBase()
-{
-}
-
-const std::string &OperandBase::name() const
-{
-    return _name;
-}
-
-OperandBase &OperandBase::name(const std::string &name)
-{
-    _name = name;
-    return *this;
-}
-
-} // namespace ckw
diff --git a/compute_kernel_writer/prototype/src/Prototype.h b/compute_kernel_writer/prototype/src/Prototype.h
deleted file mode 100644
index b392fe2651..0000000000
--- a/compute_kernel_writer/prototype/src/Prototype.h
+++ /dev/null
@@ -1,4189 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef CKW_PROTOTYPE_SRC_PROTOTYPE_H
-#define CKW_PROTOTYPE_SRC_PROTOTYPE_H
-
-#include "ckw/Error.h"
-#include "ckw/TensorInfo.h"
-#include "ckw/types/ConvertPolicy.h"
-#include "ckw/types/DataType.h"
-#include "ckw/types/Functions.h"
-#include "ckw/types/GpuTargetLanguage.h"
-#include "ckw/types/Operators.h"
-#include "ckw/types/TensorSamplerTypes.h"
-
-#include <algorithm>
-#include <array>
-#include <cassert> // assert (to be removed)
-#include <chrono>
-#include <cmath>
-#include <cstdint> // int32_t
-#include <functional>
-#include <iostream> // cout (to be removed)
-#include <map>
-#include <memory>
-#include <stdexcept>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-namespace ckw
-{
-namespace prototype
-{
-
-// Dummy data structure for Size2D
-using Size2D = std::vector<int32_t>;
-
-// Dummy Status
-using Status = void;
-
-enum class ComponentType : int32_t
-{
-    Complex   = 0,
-    Simple    = 1,
-    Unfusable = 2
-};
-
-enum class GpuCompilationSpeed
-{
-    Fast = 0x00, // fast compilation may increase the latency of the network
-    Slow = 0x01  // slow compilation may decrease the latency of the network
-};
-
-enum class GpuExtensions
-{
-    Fp16,
-    Dot8,
-    Mmul,
-    FastMath
-};
-
-struct TensorInfo
-{
-    TensorShape      shape{{0}};
-    DataType         data_type{DataType::Unknown};
-    TensorDataLayout data_layout{TensorDataLayout::Nhwc};
-    int32_t          id{-1};
-};
-
-struct ComponentAttribute
-{
-    GpuCompilationSpeed compilation_speed{GpuCompilationSpeed::Fast};
-    bool                overwrite_tile{true};
-};
-
-inline std::string data_type_to_cl_type(DataType dt)
-{
-    switch (dt)
-    {
-        case DataType::Fp32:
-            return "float";
-        case DataType::Fp16:
-            return "half";
-        case DataType::Int8:
-            return "char";
-        case DataType::Uint8:
-            return "uchar";
-        case DataType::Uint16:
-            return "ushort";
-        case DataType::Int16:
-            return "short";
-        case DataType::Uint32:
-            return "uint";
-        case DataType::Int32:
-            return "int";
-        case DataType::Bool:
-            return "bool";
-        default:
-            assert(false);
-            return "";
-    }
-}
-
-inline int32_t width_to_cl_vector_size(int32_t width)
-{
-    switch (width)
-    {
-        case 1:
-            return 1;
-        case 2:
-            return 2;
-        case 3:
-            return 3;
-        case 4:
-            return 4;
-        case 5:
-        case 6:
-        case 7:
-        case 8:
-            return 8;
-        case 9:
-        case 10:
-        case 11:
-        case 12:
-        case 13:
-        case 14:
-        case 15:
-        case 16:
-            return 16;
-        default:
-            assert(false);
-            return 0;
-    }
-}
-
-inline std::string get_cl_data_type(DataType dt, int32_t width)
-{
-    std::string data_type;
-    int32_t     w = width_to_cl_vector_size(width);
-    data_type += data_type_to_cl_type(dt);
-    if (w != 1)
-    {
-        data_type += std::to_string(w);
-    }
-    return data_type;
-}
-
-inline std::string to_opencl_store(int32_t vector_length)
-{
-    if (vector_length != 1)
-    {
-        return "vstore" + std::to_string(vector_length) + "(";
-    }
-    else
-    {
-        return "*(";
-    }
-}
-
-struct TileInfo
-{
-    TileInfo()
-    {
-    }
-
-    TileInfo(DataType dt) : dt(dt), w(1), h(1)
-    {
-    }
-
-    TileInfo(DataType dt, int32_t width) : dt(dt), w(width), h(1)
-    {
-    }
-
-    TileInfo(DataType dt, int32_t width, int32_t height) : dt(dt), w(width), h(height)
-    {
-    }
-
-    DataType dt{DataType::Unknown}; // Data type of the tile
-    int32_t  w{0};                  // Width (i.e. c0 - portion of the channels)
-    int32_t  h{0};                  // Height (i.e. s0 - portion of the spatial dimensions)
-};
-
-inline std::ostream &operator<<(std::ostream &o, const TileInfo &a)
-{
-    o << a.w << " x " << a.h;
-    return o;
-}
-
-struct DataTypeAsString
-{
-    std::string str{""};
-    DataType    dt{DataType::Unknown};
-    int32_t     size{1};
-};
-
-struct ValueAsString
-{
-    std::string      str{""};
-    DataTypeAsString type{};
-};
-
-// https://stackoverflow.com/questions/51515378/storing-and-accessing-tile-properties-in-c
-// A Tile is a collection of variables used to express a 2D data.
-class IScalarTile
-{
-public:
-    virtual ~IScalarTile() = default;
-
-    /** Method to get the scalar variable from a tile
-     * @param[in] x X coordinate on the width of the tile. If out-of-bound, the coordinate is clamped to the nearest valid edge
-     * @param[in] y Y coordinate on the height of the tile. If out-of-bound, the coordinate is clamped to the nearest valid edge
-     *
-     * @return the scalar variable as a string
-     */
-    virtual ValueAsString scalar(int32_t x, int32_t y) const = 0;
-
-    /** Method to get the list of underlying variable names used by the tile
-     *
-     * @return the list of variable names
-     */
-    virtual std::vector<ValueAsString> underlying_source_variables() const = 0;
-
-    /** Method to get the name of the tile.
-     *
-     * @return the name of the tile
-     */
-    std::string name() const
-    {
-        return _basename;
-    }
-
-    /** Method to get the tile format
-     *
-     * @return the format
-     */
-    TileInfo format() const
-    {
-        return _format;
-    }
-
-    /** Method to know whether the tile is assignable or not (constant)
-     *
-     * @return true if the tile is assignable
-     */
-    virtual bool is_assignable() const = 0;
-
-    /** Method to know whether the tile needs to be declared
-     *
-     * @return true if the tile needs to be declared in the code before being used
-     */
-    virtual bool need_declaration() const = 0;
-
-protected:
-    TileInfo    _format{};     // Tile format
-    std::string _basename{""}; // Tile name
-};
-
-// A tile is a collection of variables used to express a 2D data. The variables are vectors in the GPU context.
-// The vector size is given by the width of the tile. The number of vectors height by depth defines the number of vectors
-class IVectorTile : public IScalarTile
-{
-public:
-    virtual ~IVectorTile() = default;
-
-    /** Method to get the vector variable from a tile. A vector is an ordered homogeneous collection of two or more scalars.
-     *  The user can query the list of supported width for the vectors through preferred_vector_sizes().
-     *
-     * @param[in] y Y coordinate on the height of the tile. If out-of-bound, the coordinate is clamped to the nearest valid edge
-     *
-     * @return the vector variable as a string
-     */
-    virtual ValueAsString vector(int32_t y) const = 0;
-
-    /** Method to get a vector variable from a tile. A vector is an ordered homogeneous collection of two or more scalars.
-     *
-     * @return the vector variable as a string
-     */
-    virtual ValueAsString vector(int32_t x_start, int32_t width, int32_t y) const = 0;
-    /** Method to get the preferred vector sizes.
-     *
-     * @return a vector with the preferred vector sizes
-     */
-    //virtual std::vector<int32_t> preferred_vector_sizes() const = 0;
-};
-
-class ClTile : public IVectorTile
-{
-public:
-    ClTile(const std::string &name, TileInfo format)
-    {
-        _format   = format;
-        _basename = name;
-    }
-
-    ValueAsString scalar(int32_t x, int32_t y) const override
-    {
-        x = std::max(std::min(x, _format.w - 1), static_cast<int32_t>(0));
-        y = std::max(std::min(y, _format.h - 1), static_cast<int32_t>(0));
-
-        ValueAsString t;
-        t.str       = build_variable_name(y);
-        t.type.str  = get_cl_data_type(_format.dt, 1);
-        t.type.dt   = _format.dt;
-        t.type.size = 1;
-
-        // Check required because if the width has only one element, we cannot use .s0
-        if (_format.w != 1)
-        {
-            // Automatic broadcasting
-            t.str += ".s" + std::to_string(x);
-        }
-
-        return t;
-    }
-
-    ValueAsString vector(int32_t y) const override
-    {
-        y = std::max(std::min(y, _format.h - 1), static_cast<int32_t>(0));
-
-        ValueAsString t;
-        t.str       = build_variable_name(y);
-        t.type.str  = get_cl_data_type(_format.dt, _format.w);
-        t.type.dt   = _format.dt;
-        t.type.size = _format.w;
-        return t;
-    }
-
-    ValueAsString vector(int32_t x_start, int32_t width, int32_t y) const override
-    {
-        y = std::max(std::min(y, _format.h - 1), static_cast<int32_t>(0));
-
-        ValueAsString t;
-        t.str       = build_variable_name(y);
-        t.type.str  = get_cl_data_type(_format.dt, width);
-        t.type.dt   = _format.dt;
-        t.type.size = width;
-
-        if (_format.w != 1)
-        {
-            t.str += ".s";
-            for (int i = 0; i < width; ++i)
-            {
-                t.str += to_scalar_hex(x_start + i);
-            }
-        }
-        return t;
-    }
-
-    std::vector<ValueAsString> underlying_source_variables() const override
-    {
-        std::vector<ValueAsString> vars;
-        for (int32_t y = 0; y < _format.h; ++y)
-        {
-            ValueAsString t;
-            t.str       = build_variable_name(y);
-            t.type.str  = get_cl_data_type(_format.dt, _format.w);
-            t.type.dt   = _format.dt;
-            t.type.size = _format.w;
-            vars.push_back(t);
-        }
-        return vars;
-    }
-
-    bool is_assignable() const override
-    {
-        return true;
-    }
-
-    bool need_declaration() const override
-    {
-        return true;
-    }
-
-private:
-    std::string build_variable_name(int32_t y) const
-    {
-        std::string var_name = _basename;
-
-        if (_format.h == 1)
-        {
-            return var_name;
-        }
-        else
-        {
-            var_name += "_";
-            var_name += std::to_string(y);
-        }
-
-        return var_name;
-    }
-
-    std::string to_scalar_hex(int32_t x) const
-    {
-        switch (x)
-        {
-            case 0:
-            case 1:
-            case 2:
-            case 3:
-            case 4:
-            case 5:
-            case 6:
-            case 7:
-            case 8:
-            case 9:
-                return std::to_string(x);
-            case 10:
-                return "A";
-            case 11:
-                return "B";
-            case 12:
-                return "C";
-            case 13:
-                return "D";
-            case 14:
-                return "E";
-            case 15:
-                return "F";
-            default:
-                std::cout << "Unsupported hexadecimal value" << std::endl;
-                assert(false);
-                return "";
-        }
-    }
-};
-
-// Unique features: It contains values in the form of string. The name used for this object is misleading since the variables can change the value over time.
-class ClConstantTile : public IVectorTile
-{
-public:
-    ClConstantTile(const std::vector<std::vector<std::string>> &in, DataType dt)
-    {
-        _format.w  = in[0].size();
-        _format.h  = in.size();
-        _format.dt = dt;
-
-        _data = std::vector<std::vector<std::string>>(_format.h, std::vector<std::string>(_format.w));
-
-        for (int32_t y = 0; y < _format.h; ++y)
-        {
-            for (int32_t x = 0; x < _format.w; ++x)
-            {
-                _data[y][x] = in[y][x];
-            }
-        }
-    }
-
-    ValueAsString scalar(int32_t x, int32_t y) const override
-    {
-        x = std::max(std::min(x, _format.w - 1), static_cast<int32_t>(0));
-        y = std::max(std::min(y, _format.h - 1), static_cast<int32_t>(0));
-
-        ValueAsString t;
-        t.str       = _data[y][x];
-        t.type.str  = get_cl_data_type(_format.dt, 1);
-        t.type.dt   = _format.dt;
-        t.type.size = 1;
-
-        return t;
-    }
-
-    ValueAsString vector(int32_t y) const override
-    {
-        y = std::max(std::min(y, _format.h - 1), static_cast<int32_t>(0));
-
-        return vector(0, _format.w, y);
-    }
-
-    ValueAsString vector(int32_t x_start, int32_t width, int32_t y) const override
-    {
-        y = std::max(std::min(y, _format.h - 1), static_cast<int32_t>(0));
-
-        ValueAsString t;
-        t.str       = "";
-        t.type.str  = get_cl_data_type(_format.dt, width);
-        t.type.dt   = _format.dt;
-        t.type.size = width;
-
-        if (width > 1)
-        {
-            t.str += "((" + get_cl_data_type(_format.dt, width) + ")(";
-        }
-
-        int32_t x = x_start;
-        for (; x < width - 1; ++x)
-        {
-            t.str += scalar(x, y).str;
-            t.str += ", ";
-        }
-        t.str += scalar(x, y).str;
-
-        if (width > 1)
-        {
-            t.str += "))";
-        }
-
-        return t;
-    }
-
-    std::vector<ValueAsString> underlying_source_variables() const override
-    {
-        std::vector<ValueAsString> vars;
-
-        for (int32_t y = 0; y < _format.h; ++y)
-        {
-            for (int32_t x = 0; x < _format.w; ++x)
-            {
-                ValueAsString t;
-                t.str       = _data[y][x];
-                t.type.str  = get_cl_data_type(_format.dt, 1);
-                t.type.dt   = _format.dt;
-                t.type.size = 1;
-                vars.push_back(t);
-            }
-        }
-
-        return vars;
-    }
-
-    bool is_assignable() const override
-    {
-        return false;
-    }
-
-    bool need_declaration() const override
-    {
-        return false;
-    }
-
-private:
-    std::vector<std::vector<std::string>> _data{};
-};
-
-enum class TensorComponentIndex : int32_t
-{
-    IndexMask = 0x0000000f,
-};
-
-enum class TensorComponentGroup : int32_t
-{
-    OffsetFirstElement = 0x00000100,
-    Stride             = 0x00001000,
-    Dimension          = 0x00010000,
-    FoldedDimension    = 0x00100000,
-    Constant           = 0x01000000
-};
-
-inline std::string to_string(TensorComponentType x)
-{
-    switch (x)
-    {
-        case TensorComponentType::Unknown:
-            return "Unknown";
-        case TensorComponentType::OffsetFirstElement:
-            return "OffsetFirstElement";
-        case TensorComponentType::Stride1:
-            return "Stride1";
-        case TensorComponentType::Stride2:
-            return "Stride2";
-        case TensorComponentType::Stride3:
-            return "Stride3";
-        case TensorComponentType::Stride4:
-            return "Stride4";
-        case TensorComponentType::Dim0:
-            return "Dim0";
-        case TensorComponentType::Dim1:
-            return "Dim1";
-        case TensorComponentType::Dim2:
-            return "Dim2";
-        case TensorComponentType::Dim3:
-            return "Dim3";
-        case TensorComponentType::Dim4:
-            return "Dim4";
-        case TensorComponentType::Dim1xDim2:
-            return "Dim1xDim2";
-        case TensorComponentType::Dim1xDim2xDim3:
-            return "Dim1xDim2xDim3";
-        default:
-            assert(false);
-            return "";
-    }
-}
-
-class ITensorArgument
-{
-public:
-    virtual ~ITensorArgument() = default;
-
-    /** Method to get the tensor component as a string
-     *
-     * @param[in] x tensor component to query
-     *
-     * @return  the tensor component as a string
-     */
-    virtual std::string component(TensorComponentType x) = 0;
-
-    /** Method to get the tensor component type declaration as a string
-     *
-     * @return  the tensor component type declaration as a string
-     */
-    virtual std::string component_type_declaration() const = 0;
-
-    /** Method to get the tensor component data type
-     *
-     * @return  the tensor component data type
-     */
-    virtual DataType component_data_type() const = 0;
-
-    /** Method to get the tensor component declarations
-     *
-     * @return a vector containing the tensor component declarations
-     */
-    virtual std::vector<TensorComponentType> component_declarations() const = 0;
-
-    /** Method to get the name of the tensor argument.
-     *
-     * @return the name of the tensor argument
-     */
-    std::string name() const
-    {
-        return _basename;
-    }
-
-    /** Method to get the tensor format
-     *
-     * @return the format
-     */
-    TensorInfo format() const
-    {
-        return _format;
-    }
-
-protected:
-    TensorInfo  _format{};
-    std::string _basename{};
-};
-
-enum class GpuTensorStorage : int32_t
-{
-    Unknown          = 0x0000,
-    BufferUint8Ptr   = 0x0012,
-    Image2dReadOnly  = 0x0020,
-    Image2dWriteOnly = 0x0021,
-    Image3dReadOnly  = 0x0030,
-    Image3dWriteOnly = 0x0031
-};
-
-inline GpuTensorStorage to_gpu_tensor_storage(TensorStorageType s)
-{
-    switch (s)
-    {
-        case TensorStorageType::Unknown:
-            return GpuTensorStorage::Unknown;
-
-        case TensorStorageType::BufferUint8Ptr:
-            return GpuTensorStorage::BufferUint8Ptr;
-
-        case TensorStorageType::Texture2dReadOnly:
-            return GpuTensorStorage::Image2dReadOnly;
-
-        case TensorStorageType::Texture2dWriteOnly:
-            return GpuTensorStorage::Image2dWriteOnly;
-
-        default:
-            assert(false);
-            return GpuTensorStorage::Unknown;
-    }
-}
-
-inline TensorStorageType to_tensor_storage(GpuTensorStorage s)
-{
-    switch (s)
-    {
-        case GpuTensorStorage::Unknown:
-            return TensorStorageType::Unknown;
-
-        case GpuTensorStorage::BufferUint8Ptr:
-            return TensorStorageType::BufferUint8Ptr;
-
-        case GpuTensorStorage::Image2dReadOnly:
-            return TensorStorageType::Texture2dReadOnly;
-
-        case GpuTensorStorage::Image2dWriteOnly:
-            return TensorStorageType::Texture2dWriteOnly;
-
-        default:
-            assert(false);
-            return TensorStorageType::Unknown;
-    }
-}
-
-class IGpuTensorArgument : public ITensorArgument
-{
-public:
-    virtual ~IGpuTensorArgument() = default;
-
-    /** Method to get the tensor storage, which is the underlying storage used to keep the data memory
-     *
-     * @param[in] x tensor storage to query
-     *
-     * @return  the tensor storage as a string
-     */
-    virtual std::string storage(GpuTensorStorage x) = 0;
-
-    /** Method to get the tensor storage type declaration as a string
-     *
-     * @param[in] x tensor component to query
-     *
-     * @return  the tensor storage type declaration as a string
-     */
-    virtual std::string storage_type_declaration(GpuTensorStorage x) const = 0;
-
-    /** Method to get the tensor storage declarations
-     *
-     * @return a vector containing the tensor storage declarations
-     */
-    virtual std::vector<GpuTensorStorage> storage_declarations() const = 0;
-};
-
-class ClTensorArgument : public IGpuTensorArgument
-{
-public:
-    ClTensorArgument(const std::string &name, const TensorInfo &x, bool return_by_value_when_possible)
-    {
-        _basename                      = name;
-        _format                        = x;
-        _return_by_value_when_possible = return_by_value_when_possible;
-    }
-
-    // Methods to override
-    std::string component(TensorComponentType x) override
-    {
-        if ((static_cast<int32_t>(x) & static_cast<int32_t>(TensorComponentGroup::Constant)))
-        {
-            int32_t idx = static_cast<int32_t>(x) & static_cast<int32_t>(TensorComponentIndex::IndexMask);
-            return std::to_string(idx - 1);
-        }
-
-        if (_return_by_value_when_possible)
-        {
-            if ((static_cast<int32_t>(x) & static_cast<int32_t>(TensorComponentGroup::Dimension)))
-            {
-                int32_t idx = static_cast<int32_t>(x) & static_cast<int32_t>(TensorComponentIndex::IndexMask);
-                return std::to_string(_format.shape[idx]);
-            }
-
-            if ((static_cast<int32_t>(x) & static_cast<int32_t>(TensorComponentGroup::FoldedDimension)))
-            {
-                switch (x)
-                {
-                    case TensorComponentType::Dim1xDim2:
-                        return std::to_string(_format.shape[1] * _format.shape[2]);
-                    case TensorComponentType::Dim1xDim2xDim3:
-                        return std::to_string(_format.shape[1] * _format.shape[2] * _format.shape[2]);
-                    default:
-                        std::cout << "Unsupported folded dimension" << std::endl;
-                        assert(false);
-                }
-            }
-        }
-
-        if (std::find(_components_required.begin(), _components_required.end(), x) == _components_required.end())
-        {
-            _components_required.push_back(x);
-        }
-
-        return build_component_name(x);
-    }
-
-    std::string component_type_declaration() const override
-    {
-        return "int";
-    };
-
-    DataType component_data_type() const override
-    {
-        return DataType::Int32;
-    }
-
-    std::string storage(GpuTensorStorage x) override
-    {
-        if (std::find(_storage_required.begin(), _storage_required.end(), x) == _storage_required.end())
-        {
-            _storage_required.push_back(x);
-        }
-
-        return build_storage_name(x);
-    }
-
-    std::string storage_type_declaration(GpuTensorStorage x) const override
-    {
-        switch (x)
-        {
-            case GpuTensorStorage::BufferUint8Ptr:
-                return "__global uchar*";
-            case GpuTensorStorage::Image2dReadOnly:
-                return "__read_only image2d_t";
-            case GpuTensorStorage::Image2dWriteOnly:
-                return "__write_only image2d_t";
-            case GpuTensorStorage::Image3dReadOnly:
-                return "__read_only image3d_t ";
-            case GpuTensorStorage::Image3dWriteOnly:
-                return "__write_only image3d_t ";
-            default:
-                std::cout << "Unsupported storage" << std::endl;
-                assert(false);
-                return "";
-        }
-    };
-
-    std::vector<GpuTensorStorage> storage_declarations() const override
-    {
-        return _storage_required;
-    }
-
-    std::vector<TensorComponentType> component_declarations() const override
-    {
-        return _components_required;
-    }
-
-private:
-    std::string build_storage_name(GpuTensorStorage x) const
-    {
-        std::string var_name = _basename;
-
-        switch (x)
-        {
-            case GpuTensorStorage::BufferUint8Ptr:
-                return var_name + "_ptr";
-            case GpuTensorStorage::Image2dReadOnly:
-            case GpuTensorStorage::Image2dWriteOnly:
-                return var_name + "_img2d";
-            case GpuTensorStorage::Image3dReadOnly:
-            case GpuTensorStorage::Image3dWriteOnly:
-                return var_name + "_img3d";
-            default:
-                std::cout << "Unsupported storage" << std::endl;
-                assert(false);
-        }
-
-        return var_name;
-    }
-
-    std::string build_component_name(TensorComponentType x) const
-    {
-        std::string var_name = _basename;
-
-        switch (x)
-        {
-            case TensorComponentType::OffsetFirstElement:
-                return var_name + "_offset_first_element";
-            case TensorComponentType::Stride1:
-                return var_name + "_stride1";
-            case TensorComponentType::Stride2:
-                return var_name + "_stride2";
-            case TensorComponentType::Stride3:
-                return var_name + "_stride3";
-            case TensorComponentType::Dim0:
-                return var_name + "_dim0";
-            case TensorComponentType::Dim1:
-                return var_name + "_dim1";
-            case TensorComponentType::Dim2:
-                return var_name + "_dim2";
-            case TensorComponentType::Dim3:
-                return var_name + "_dim3";
-            case TensorComponentType::Dim1xDim2:
-                return var_name + "_dim1xdim2";
-            case TensorComponentType::Dim1xDim2xDim3:
-                return var_name + "_dim1xdim2xdim3";
-            default:
-                std::cout << "Unsupported component" << std::endl;
-                assert(false);
-        }
-
-        return var_name;
-    }
-
-    bool                             _return_by_value_when_possible{false};
-    std::vector<GpuTensorStorage>    _storage_required{};
-    std::vector<TensorComponentType> _components_required{};
-};
-
-/**
- * @brief Data structure that contains the declared tiles by the components.
- * The registry is a linear data structure that follows the similar principle of the stack. The user can use the @p increment_registry_level() method to
- * increase the level of the stack (0 when it starts). When the user uses the @p decrement_registry_level() method, the registry decreases the level of the stack
- * and remove (pop) all the tiles from the level above.
- * When a tile is declared on the level 0, it is a global tile. A global tile is visible in all parts of the code.
- * Since different components may use the same name to define a tile, the registry adopts the IdSpace concept, an @p id to prevent name collisions
- * when declaring tiles among different components.
- *
- */
-class GpuTileRegistry
-{
-public:
-    enum class RegistryTileType
-    {
-        Tile,
-        Link
-    };
-
-    using RegistryIdSpace  = int32_t;
-    using RegistryLevel    = int32_t;
-    using RegistryTileName = std::string;
-
-    struct RegistryTileTableEntry
-    {
-        RegistryLevel                registry_level{0};
-        std::unique_ptr<IVectorTile> tile_object{nullptr};
-    };
-
-    struct RegistryTileTypeTableEntry
-    {
-        RegistryTileType tile_type{RegistryTileType::Tile};
-        RegistryTileName tile_name{};
-        RegistryIdSpace  registry_idspace{0};
-        RegistryLevel    registry_level{0};
-    };
-
-    using RegistryTileTable     = std::map<RegistryIdSpace, std::map<RegistryTileName, RegistryTileTableEntry>>;
-    using RegistryTileTypeTable = std::map<RegistryIdSpace, std::map<RegistryTileName, RegistryTileTypeTableEntry>>;
-
-    /**
-     * @brief Construct a new Gpu Tile Registry object
-     *
-     */
-    GpuTileRegistry()
-    {
-        _language = GpuTargetLanguage::Unknown;
-    }
-
-    /**
-     * @brief Construct a new Gpu Tile Registry object providing the Gpu programming language
-     *
-     * @param[in] language Gpu programming language to use
-     */
-    GpuTileRegistry(GpuTargetLanguage language)
-    {
-        _language = language;
-    }
-
-    /**
-     * @brief Default destructor. Destroy the Gpu Tile Registry object
-     *
-     */
-    ~GpuTileRegistry() = default;
-
-    /**
-     * @brief Set the working IdSpace for the tile registry. IdSpace is used to prevent name collisions when declaring tiles.
-     *        Therefore, the IdSpace should be set before declaring any tiles.
-     *
-     * @param[in] id The IdSpace id
-     */
-    void set_IdSpace(int32_t id)
-    {
-        _IdSpace = id;
-    }
-
-    /**
-     * @brief Get the current working IdSpace for the tile registry. IdSpace is used to prevent name collisions when declaring tiles
-     *
-     * @return The IdSpace id
-     */
-    int32_t IdSpace() const
-    {
-        return _IdSpace;
-    }
-
-    /**
-     * @brief Gets all the IdSpace declarations defined in the tile registry.
-     *
-     * @return all the IdSpace declarations defined in the tile registry as std::vector<int32_t>. It returns an empty vector if there are no IdSpace declarations.
-     */
-    std::vector<int32_t> IdSpace_declarations() const
-    {
-        std::vector<int32_t> x;
-
-        auto it = _frags.begin();
-
-        while (it != _frags.end())
-        {
-            x.push_back(it->first);
-
-            it++;
-        }
-
-        return x;
-    }
-
-    /**
-     * @brief Declare a tile from a previously created tile
-     */
-    void insert(const std::string &name, const IVectorTile *frag)
-    {
-        assert(_language == GpuTargetLanguage::OpenCL);
-        const int32_t     key_IdSpace  = _IdSpace;
-        const std::string key_var_name = name;
-        const std::string var_name     = frag->name();
-        TileInfo          format       = frag->format();
-
-        // First check whether a tile with the same name exists
-        IVectorTile *result = (*this)[key_var_name];
-        assert(result == nullptr);
-        if (result == nullptr)
-        {
-            std::unique_ptr<ClTile> tile = std::make_unique<ClTile>(var_name, format);
-
-            _frags[key_IdSpace][key_var_name].tile_object    = std::move(tile);
-            _frags[key_IdSpace][key_var_name].registry_level = _registry_level;
-
-            _frag_types[key_IdSpace][key_var_name].tile_type        = RegistryTileType::Link;
-            _frag_types[key_IdSpace][key_var_name].tile_name        = key_var_name;
-            _frag_types[key_IdSpace][key_var_name].registry_idspace = _IdSpace;
-            _frag_types[key_IdSpace][key_var_name].registry_level   = _registry_level;
-        }
-    }
-
-    /**
-     * @brief Declare a tile with TileInfo. The tile will be stored in the IdSpace set with @p set_IdSpace()
-     *
-     * @note The reference name used for declaring the tile should not be previously used in the IdSpace
-     *
-     * @param[in] name   Reference name for the tile. The reference name can be used to retrieve the tile stored in the registry.
-     * @param[in] format Tile format use to use
-     */
-    void insert(const std::string &name, const TileInfo &format)
-    {
-        assert(_language == GpuTargetLanguage::OpenCL);
-        const int32_t     key_IdSpace  = _IdSpace;
-        const std::string key_var_name = name;
-        const std::string var_name     = generate_tile_name(name);
-
-        // First check whether a tile with the same name exists
-        IVectorTile *result = (*this)[key_var_name];
-        assert(result == nullptr);
-        if (result == nullptr)
-        {
-            std::unique_ptr<ClTile> tile                     = std::make_unique<ClTile>(var_name, format);
-            _frags[key_IdSpace][key_var_name].tile_object    = std::move(tile);
-            _frags[key_IdSpace][key_var_name].registry_level = _registry_level;
-
-            _frag_types[key_IdSpace][key_var_name].tile_type        = RegistryTileType::Tile;
-            _frag_types[key_IdSpace][key_var_name].tile_name        = key_var_name;
-            _frag_types[key_IdSpace][key_var_name].registry_idspace = _IdSpace;
-            _frag_types[key_IdSpace][key_var_name].registry_level   = _registry_level;
-        }
-    }
-
-    /**
-     * @brief Declare a constant tile. The content of the tile is passed as a vector of std::string
-     *
-     * @note The reference name used for declaring the tile should not be previously used in the IdSpace
-     *
-     * @param[in] name Reference name for the tile. The reference name can be used to retrieve the tile stored in the registry.
-     * @param[in] in   A 3D std::vector of std::string. From the 3D std::vector we can know the dimensions for the tile
-     * @param[in] dt   The data type for the elements stored in the 3D std::vector as std::string. It is user's responsibilty to ensure
-     *                 that the data type is aligned with the content of the std::string.
-     */
-    void insert(const std::string &name, const std::vector<std::vector<std::string>> &in, DataType dt)
-    {
-        assert(_language == GpuTargetLanguage::OpenCL);
-        const int32_t     key_IdSpace  = _IdSpace;
-        const std::string key_var_name = name;
-
-        // First check whether a tile with the same name exists
-        IVectorTile *result = (*this)[key_var_name];
-        assert(result == nullptr);
-        if (result == nullptr)
-        {
-            std::unique_ptr<ClConstantTile> tile             = std::make_unique<ClConstantTile>(in, dt);
-            _frags[key_IdSpace][key_var_name].tile_object    = std::move(tile);
-            _frags[key_IdSpace][key_var_name].registry_level = _registry_level;
-
-            _frag_types[key_IdSpace][key_var_name].tile_type        = RegistryTileType::Tile;
-            _frag_types[key_IdSpace][key_var_name].tile_name        = key_var_name;
-            _frag_types[key_IdSpace][key_var_name].registry_idspace = _IdSpace;
-            _frag_types[key_IdSpace][key_var_name].registry_level   = _registry_level;
-        }
-    }
-
-    /**
-     * @brief Declare an anonymous constant tile. The content of the tile is passed as a vector of std::string
-     *
-     * @note This method can be used to declare temporary tiles that need to be accessed only once.
-     *
-     * @param[in] in   A 3D std::vector of std::string. From the 3D std::vector we can know the dimensions for the tile
-     * @param[in] dt   The data type for the elements stored in the 3D std::vector as std::string. It is user responsibilty to ensure
-     *                 that the data type is aligned with what passed with the std::string.
-     *
-     * @return IVectorTile* the anonymous constant tile
-     */
-    IVectorTile *insert(const std::vector<std::vector<std::string>> &in, DataType dt)
-    {
-        assert(_language == GpuTargetLanguage::OpenCL);
-        const int32_t     key_IdSpace  = _IdSpace;
-        const std::string key_var_name = "_" + std::to_string(_anonymous_frag_count++);
-
-        // First check whether a tile with the same name exists
-        IVectorTile *result = (*this)[key_var_name];
-        assert(result == nullptr);
-        if (result == nullptr)
-        {
-            std::unique_ptr<ClConstantTile> tile             = std::make_unique<ClConstantTile>(in, dt);
-            _frags[key_IdSpace][key_var_name].tile_object    = std::move(tile);
-            _frags[key_IdSpace][key_var_name].registry_level = _registry_level;
-
-            _frag_types[key_IdSpace][key_var_name].tile_type        = RegistryTileType::Tile;
-            _frag_types[key_IdSpace][key_var_name].tile_name        = key_var_name;
-            _frag_types[key_IdSpace][key_var_name].registry_idspace = _IdSpace;
-            _frag_types[key_IdSpace][key_var_name].registry_level   = _registry_level;
-        }
-
-        return (*this)[key_var_name];
-    }
-
-    /**
-     * @brief Get the tile from the registry. This method searches the tile in the IdSpace provided by the user
-     *
-     * @param[in] name         The name of the tile to retrieve
-     * @param[in] IdSpace The IdSpace id where to search the tile
-     *
-     * @return IVectorTile* The tile
-     */
-    IVectorTile *get(const std::string &name, int32_t IdSpace)
-    {
-        const int32_t     key_IdSpace  = IdSpace;
-        const std::string key_var_name = name;
-
-        IVectorTile *result         = nullptr;
-        auto         search_IdSpace = _frags.find(key_IdSpace);
-        if (search_IdSpace != _frags.end())
-        {
-            auto search_tile = _frags[key_IdSpace].find(key_var_name);
-            if (search_tile != _frags[key_IdSpace].end())
-            {
-                result = search_tile->second.tile_object.get();
-                assert(result != nullptr);
-            }
-        }
-
-        return result;
-    }
-
-    /**
-     * @brief Get the tile from the registry. This method searches the tile in the IdSpace set with @p set_IdSpace()
-     *
-     * @param[in] name The name of the tile to retrieve
-     *
-     * @return IVectorTile* The tile
-     */
-    IVectorTile *operator[](const std::string &name)
-    {
-        return get(name, _IdSpace);
-    }
-
-    /**
-     * @brief Check whether the tile in the in the IdSpace provided by the user exists
-     *
-     * @param[in] name         Name of the tile to search for
-     * @param[in] IdSpace The IdSpace id where to search the tile
-     *
-     * @return true if the tile exists
-     * @return false if the tile does not exist
-     */
-    bool has_tile(const std::string &name, int32_t IdSpace) const
-    {
-        const int32_t     key_IdSpace  = IdSpace;
-        const std::string key_var_name = name;
-
-        // IVectorTile* result = nullptr;
-        auto search_IdSpace = _frags.find(key_IdSpace);
-
-        return search_IdSpace != _frags.end();
-    }
-
-    /**
-     * @brief Check whether the tile within the current IdSpace exists
-     *
-     * @param[in] name Name of the tile to search for
-     *
-     * @return true if the tile exists
-     * @return false if the tile does not exist
-     */
-    bool has_tile(const std::string &name) const
-    {
-        return has_tile(name, _IdSpace);
-    }
-
-    /**
-     * @brief Get all the tiles declared within the IdSpace provided by the user
-     *
-     * @param[in] IdSpace IdSpace where to retrieve all the declared tiles
-     *
-     * @return std::vector<IVectorTile*> A vector with all the declared tiles in the IdSpace provided by the user
-     */
-    std::vector<IVectorTile *> tile_declarations(int32_t IdSpace)
-    {
-        std::vector<IVectorTile *> tiles;
-
-        std::map<RegistryTileName, RegistryTileTypeTableEntry>::iterator it = _frag_types[IdSpace].begin();
-
-        while (it != _frag_types[IdSpace].end())
-        {
-            // The following line should be enabled. However, we cannot at this stage
-            // because it used to retrieve the output tile produced by each component.
-            // However, this method should NOT be used to retrieve the output tile
-            //if(it->second.tile_type == RegistryTileType::Tile)
-            {
-                tiles.push_back(get(it->second.tile_name, it->second.registry_idspace));
-            }
-            it++;
-        }
-
-        return tiles;
-    }
-
-    /**
-     * @brief Increase the level of stack.
-     *
-     */
-    void increment_registry_level()
-    {
-        _registry_level++;
-    }
-
-    /**
-     * @brief Remove all the tiles declared at the current stack level and decrease the level of the stack.
-     *
-     */
-    void decrement_registry_level()
-    {
-        assert(_registry_level >= 0);
-
-        // Remove all variables in the local scope
-        std::map<RegistryTileName, RegistryTileTableEntry>::iterator it = _frags[_IdSpace].begin();
-
-        while (it != _frags[_IdSpace].end())
-        {
-            if (it->second.registry_level == _registry_level)
-            {
-                it = _frags[_IdSpace].erase(it);
-            }
-            else
-            {
-                it++;
-            }
-        }
-
-        std::map<RegistryTileName, RegistryTileTypeTableEntry>::iterator it_type = _frag_types[_IdSpace].begin();
-
-        while (it_type != _frag_types[_IdSpace].end())
-        {
-            if (it_type->second.registry_level == _registry_level)
-            {
-                it_type = _frag_types[_IdSpace].erase(it_type);
-            }
-            else
-            {
-                it_type++;
-            }
-        }
-
-        _registry_level--;
-    }
-
-    /**
-     * @brief Get the level of the stack
-     *
-     */
-    int32_t level() const
-    {
-        return _registry_level;
-    }
-
-private:
-    // This method ensures that the key is unique among different components
-    std::string generate_tile_name(const std::string &name)
-    {
-        assert(_IdSpace >= 0);
-        if (_registry_level == 0)
-        {
-            return "_G" + std::to_string(_IdSpace) + "_" + name;
-        }
-        else
-        {
-            return name;
-        }
-    }
-
-    RegistryTileTable     _frags{};
-    RegistryTileTypeTable _frag_types{};
-    RegistryLevel         _registry_level{0};
-    RegistryIdSpace       _IdSpace{-1};
-    int32_t               _anonymous_frag_count{0};              // Counter used to create the anonymous tiles
-    GpuTargetLanguage     _language{GpuTargetLanguage::Unknown}; // Gpu programming language
-};
-
-using TensorEntry = std::unique_ptr<IGpuTensorArgument>;
-
-/**
- * @brief Data structure that contains the tensors consumed by the components.
- * Since different components may use the same name as reference for a tensor, the registry adopts the IdSpace concept, an @p id to prevent name collisions
- * when declaring tensors among different components.
- *
- */
-class GpuTensorArgumentRegistry
-{
-public:
-    /**
-     * @brief Construct a new Gpu Tensor Registry object
-     *
-     */
-    GpuTensorArgumentRegistry()
-    {
-        _language = GpuTargetLanguage::Unknown;
-    }
-
-    /**
-     * @brief Construct a new Gpu Tensor Registry object
-     *
-     * @param[in] language Gpu programming language to use
-     */
-    GpuTensorArgumentRegistry(GpuTargetLanguage language)
-    {
-        _language = language;
-    }
-
-    /**
-     * @brief Default destructor. Destroy the Gpu Tensor Registry object
-     *
-     */
-    ~GpuTensorArgumentRegistry() = default;
-
-    /**
-     * @brief Set the working IdSpace for the tensor registry. IdSpace is used to prevent name collisions when declaring tensors.
-     *        Therefore, the IdSpace should be set before declaring any tensors.
-     *
-     * @param[in] id The IdSpace id
-     */
-    void set_IdSpace(int32_t id)
-    {
-        _IdSpace = id;
-    }
-
-    /**
-     * @brief Get the current working IdSpace for the tensor registry. IdSpace is used to prevent name collisions when declaring tensors
-     *
-     * @return The IdSpace id
-     */
-    int32_t IdSpace() const
-    {
-        return _IdSpace;
-    }
-
-    /**
-     * @brief Gets all the IdSpace declarations defined in the tensor registry.
-     *
-     * @return all the IdSpace declarations defined in the tensor registry as std::vector<int32_t>. It returns an empty vector if there are no IdSpace declarations.
-     */
-    std::vector<int32_t> IdSpace_declarations() const
-    {
-        std::vector<int32_t> x;
-
-        auto it = _refs.begin();
-
-        while (it != _refs.end())
-        {
-            x.push_back(it->first);
-
-            it++;
-        }
-
-        return x;
-    }
-
-    /**
-     * @brief Declare a tensor with TensorInfo. The tensor will be stored in the IdSpace set with @p set_IdSpace()
-     *
-     * @note The reference name used for declaring the tensor should not be previously used in the IdSpace
-     *
-     * @param[in] name                          Reference name for the tensor. The reference name can be used to retrieve the tensor stored in the registry.
-     * @param[in] x                             Pair of tensor info and tensor id
-     * @param[in] return_by_value_when_possible True if we want the value stored in the tensor components
-     */
-    void insert(const std::string &name, const TensorInfo &x, bool return_by_value_when_possible)
-    {
-        assert(_language == GpuTargetLanguage::OpenCL);
-        const int32_t     key_IdSpace  = _IdSpace;
-        const int32_t     tensor_id    = x.id;
-        const std::string key_var_name = name;
-        const std::string var_name     = generate_tensor_name(name, tensor_id);
-
-        // First, check whether the tensor has already a reference. If so, trigger an assert
-        assert(!has_tensor_argument(name));
-
-        // Check whether a tensor with that tensorID exists
-        auto result = _tensor_arguments.find(tensor_id);
-        if (result == _tensor_arguments.end())
-        {
-            // It means that we haven't added a tensor with that tensor_id yet. Create a IGpuTensorArgument before creating the reference
-            std::unique_ptr<ClTensorArgument> arg =
-                std::make_unique<ClTensorArgument>(var_name, x, return_by_value_when_possible);
-            _tensor_arguments[tensor_id] = std::move(arg);
-        }
-
-        _refs[key_IdSpace][key_var_name] = tensor_id;
-    }
-
-    /**
-     * @brief Get the tensor from the registry. This method searches the tensor in the IdSpace set with @p set_IdSpace()
-     *
-     * @param[in] name The name of the tensor to retrieve
-     *
-     * @return IGpuTensor* The tensor
-     */
-    IGpuTensorArgument *operator[](const std::string &name)
-    {
-        const int32_t     key_IdSpace  = _IdSpace;
-        const std::string key_var_name = name;
-
-        IGpuTensorArgument *result         = nullptr;
-        auto                search_IdSpace = _refs.find(key_IdSpace);
-        if (search_IdSpace != _refs.end())
-        {
-            auto search_tensor_id = _refs[key_IdSpace].find(key_var_name);
-
-            if (search_tensor_id != _refs[key_IdSpace].end())
-            {
-                const int32_t tensor_id              = search_tensor_id->second;
-                auto          search_tensor_argument = _tensor_arguments.find(tensor_id);
-                if (search_tensor_argument != _tensor_arguments.end())
-                {
-                    result = search_tensor_argument->second.get();
-                }
-                assert(result != nullptr);
-            }
-        }
-
-        return result;
-    }
-
-    /**
-     * @brief Get all the tensors declared in the IdSpace provided by the user
-     *
-     * @return std::vector<IGpuTensorArgument*> A vector with all the declared tensors
-     */
-    std::vector<IGpuTensorArgument *> tensor_argument_declarations()
-    {
-        std::vector<IGpuTensorArgument *> args;
-
-        auto it = _tensor_arguments.begin();
-
-        while (it != _tensor_arguments.end())
-        {
-            args.push_back(it->second.get());
-            it++;
-        }
-
-        return args;
-    }
-
-    /**
-     * @brief Check whether the tensor argument in the IdSpace set with @p set_IdSpace() exists
-     *
-     * @param[in] name Name of the tensor argument to search for
-     *
-     * @return true if the tensor argument exists
-     * @return false if the tensor argument does not exist
-     */
-    bool has_tensor_argument(const std::string &name)
-    {
-        const int32_t     key_IdSpace  = _IdSpace;
-        const std::string key_var_name = name;
-
-        auto search_IdSpace = _refs.find(key_IdSpace);
-
-        if (search_IdSpace != _refs.end())
-        {
-            auto search_tensor_id = _refs[key_IdSpace].find(key_var_name);
-
-            return search_tensor_id != _refs[key_IdSpace].end();
-        }
-        else
-        {
-            return false;
-        }
-    }
-
-    /**
-     * @brief Check whether the tensor argument is in the the IdSpace provided by the user
-     *
-     * @param[in] name    Name of the tensor argument to search for
-     * @param[in] IdSpace The IdSpace id where to search the tensor argument
-     *
-     * @return true if the tile exists
-     * @return false if the tile does not exist
-     */
-    bool has_tensor_argument(const std::string &name, int32_t IdSpace)
-    {
-        const int32_t     key_IdSpace  = IdSpace;
-        const std::string key_var_name = name;
-
-        auto search_IdSpace = _refs.find(key_IdSpace);
-
-        if (search_IdSpace != _refs.end())
-        {
-            auto search_tensor_id = _refs[key_IdSpace].find(key_var_name);
-
-            return search_tensor_id != _refs[key_IdSpace].end();
-        }
-        else
-        {
-            return false;
-        }
-    }
-
-private:
-    // This method ensures that the key is unique among different components
-    std::string generate_tensor_name(const std::string &name, int32_t tensor_id)
-    {
-        assert(tensor_id >= 0);
-
-        return name + std::to_string(tensor_id);
-    }
-
-    std::map<int32_t, TensorEntry>                    _tensor_arguments{};
-    std::map<int32_t, std::map<std::string, int32_t>> _refs{};
-    int32_t                                           _IdSpace{-1};
-    GpuTargetLanguage                                 _language{GpuTargetLanguage::Unknown}; // Gpu programming language
-};
-
-enum class OpType : int32_t
-{
-    Elementwise = 0x0000,
-    Relational  = 0x1000,
-    Algebra     = 0x2000
-};
-
-inline std::string to_string(AssignmentOp op)
-{
-    switch (op)
-    {
-        case AssignmentOp::Decrement:
-            return "-=";
-        case AssignmentOp::Increment:
-            return "+=";
-        default:
-            assert(false);
-            return "";
-    }
-}
-
-inline std::string to_string(UnaryOp op)
-{
-    switch (op)
-    {
-        case UnaryOp::LogicalNot:
-            return "!";
-        case UnaryOp::BitwiseNot:
-            return "~";
-        case UnaryOp::Negate:
-            return "-";
-        default:
-            assert(false);
-            return "";
-    }
-}
-
-inline std::string to_string(BinaryOp op)
-{
-    switch (op)
-    {
-        case BinaryOp::Add:
-            return "+";
-        case BinaryOp::Sub:
-            return "-";
-        case BinaryOp::Mul:
-            return "*";
-        case BinaryOp::Div:
-            return "/";
-        case BinaryOp::Mod:
-            return "%";
-        case BinaryOp::Equal:
-            return "==";
-        case BinaryOp::Less:
-            return "<";
-        case BinaryOp::LessEqual:
-            return "<=";
-        case BinaryOp::Greater:
-            return ">";
-        case BinaryOp::GreaterEqual:
-            return ">=";
-        case BinaryOp::LogicalAnd:
-            return "&&";
-        case BinaryOp::LogicalOr:
-            return "||";
-        case BinaryOp::BitwiseXOR:
-            return "^";
-        default:
-            assert(false);
-            return "";
-    }
-}
-
-inline std::string binary_op_string(BinaryOp op)
-{
-    switch (op)
-    {
-        case BinaryOp::Add:
-            return "add";
-        case BinaryOp::Sub:
-            return "sub";
-        case BinaryOp::Mul:
-            return "mul";
-        case BinaryOp::Div:
-            return "div";
-        case BinaryOp::Mod:
-            return "mod";
-        case BinaryOp::Equal:
-            return "eq";
-        case BinaryOp::Less:
-            return "gt";
-        case BinaryOp::LessEqual:
-            return "gteq";
-        case BinaryOp::Greater:
-            return "lt";
-        case BinaryOp::GreaterEqual:
-            return "lte";
-        default:
-            assert(false);
-            return "";
-    }
-}
-
-enum class OperandType : int32_t
-{
-    Unknown              = 0x00000000,
-    ScalarFp32           = 0x00001011, // Immediate scalar tile
-    ScalarFp16           = 0x00001012, // Immediate scalar tile
-    ScalarInt32          = 0x00001021, // Immediate scalar tile
-    ScalarInt16          = 0x00001022, // Immediate scalar tile
-    ScalarInt8           = 0x00001024, // Immediate scalar tile
-    ScalarUInt32         = 0x00001031, // Immediate scalar tile
-    ScalarUInt16         = 0x00001032, // Immediate scalar tile
-    ScalarUInt8          = 0x00001034, // Immediate scalar tile
-    ScalarBool           = 0x00001041, // Immediate scalar tile
-    ScalarTile           = 0x00001050, // Scalar from a tile
-    Tile                 = 0x00010000, // Tile
-    TensorStride1        = 0x00100001, // Tensor component
-    TensorStride2        = 0x00100002, // Tensor component
-    TensorStride3        = 0x00100003, // Tensor component
-    TensorStride4        = 0x00100004, // Tensor component
-    TensorDim0           = 0x00100010, // Tensor component
-    TensorDim1           = 0x00100020, // Tensor component
-    TensorDim2           = 0x00100030, // Tensor component
-    TensorDim3           = 0x00100040, // Tensor component
-    TensorDim4           = 0x00100050, // Tensor component
-    TensorC              = 0x00100010, // Tensor component
-    TensorW              = 0x00100020, // Tensor component
-    TensorH              = 0x00100030, // Tensor component
-    TensorD              = 0x00100040, // Tensor component
-    TensorN              = 0x00100050, // Tensor component
-    TensorDim1xDim2      = 0x00100100, // Tensor component
-    TensorDim1xDim2xDim3 = 0x00100200, // Tensor component
-    TensorWxH            = 0x00100300, // Tensor component
-    TensorWxHxD          = 0x00100400, // Tensor component
-    TensorDataOffset     = 0x00100500, // Tensor component
-};
-
-struct ScalarTileCoord
-{
-    ScalarTileCoord()
-    {
-    }
-
-    ScalarTileCoord(int32_t x0, int32_t y0) : x(x0), y(y0)
-    {
-    }
-
-    int32_t x{-1};
-    int32_t y{-1};
-};
-
-/**
- * @brief Operand class. This object is used to pass the operands to the operations performed by the writer.
- * Operand can be of three types:
- * -# Scalar immediate: constant expression
- * -# Tile: A tile
- * -# Tensor component: A component (scalar) of a tensor
- *
- */
-class Operand
-{
-public:
-    Operand(const std::string &val)
-    {
-        _str  = val;
-        _type = OperandType::Tile;
-    }
-
-    Operand(const std::string &val, const ScalarTileCoord &coord)
-    {
-        _str   = val;
-        _type  = OperandType::ScalarTile;
-        _coord = coord;
-    }
-
-    Operand(const std::string &val, OperandType type)
-    {
-        _str  = val;
-        _type = type;
-    }
-
-    Operand(const Operand &t)
-    {
-        _str  = t.value();
-        _type = t.type();
-    }
-
-    Operand &operator=(const Operand &t)
-    {
-        _str   = t.value();
-        _type  = t.type();
-        _coord = t.scalar_tile_coordinate();
-        return *this;
-    }
-
-    std::string value() const
-    {
-        return _str;
-    }
-
-    OperandType type() const
-    {
-        return _type;
-    }
-
-    ScalarTileCoord scalar_tile_coordinate() const
-    {
-        return _coord;
-    }
-
-private:
-    std::string     _str{};
-    OperandType     _type{OperandType::Unknown};
-    ScalarTileCoord _coord{};
-};
-
-using GpuSamplerTensorStorage = GpuTensorStorage;
-
-struct GpuSampler
-{
-    GpuSampler() = default;
-
-    TensorSamplerFormat       format{TensorSamplerFormat::Unknown};
-    GpuSamplerTensorStorage   storage{GpuSamplerTensorStorage::Unknown};
-    TensorSamplerAddressModeX address_mode_x{TensorSamplerAddressModeX::Unknown};
-    TensorSamplerAddressModeY address_mode_y{TensorSamplerAddressModeY::Unknown};
-    TensorSamplerAddressModeZ address_mode_z{TensorSamplerAddressModeZ::Unknown};
-};
-
-inline GpuSampler create_simple_sampler(
-    const TensorInfo *tensor_info_id, GpuSampler sampler, int32_t step_x, int32_t step_y, int32_t step_z)
-{
-    CKW_UNUSED(step_x, step_y, step_z);
-
-    auto tensor = tensor_info_id->shape;
-
-    GpuSampler dst_sampler;
-    dst_sampler.format         = sampler.format;
-    dst_sampler.storage        = GpuSamplerTensorStorage::BufferUint8Ptr;
-    dst_sampler.address_mode_x = sampler.address_mode_x;
-    dst_sampler.address_mode_y = sampler.address_mode_y;
-    dst_sampler.address_mode_z = sampler.address_mode_z;
-
-    int32_t dim_x = 0;
-    int32_t dim_y = 0;
-    int32_t dim_z = 0;
-
-    switch (sampler.format)
-    {
-        case TensorSamplerFormat::C_W_H:
-            dim_x = tensor[0];
-            dim_y = tensor[1];
-            dim_z = tensor[2];
-            break;
-        case TensorSamplerFormat::C_WH_1:
-            dim_x = tensor[0];
-            dim_y = tensor[1] * tensor[2];
-            dim_z = 1;
-            break;
-        default:
-            std::cout << "Unsupported tensor format" << std::endl;
-            assert(false);
-            break;
-    }
-
-    if (dim_x == 1)
-    {
-        assert(step_x == 1);
-        dst_sampler.address_mode_x = TensorSamplerAddressModeX::None;
-    }
-
-    if (dim_y == 1)
-    {
-        assert(step_y == 1);
-        dst_sampler.address_mode_y = TensorSamplerAddressModeY::None;
-    }
-
-    if (dim_z == 1)
-    {
-        assert(step_z == 1);
-        dst_sampler.address_mode_z = TensorSamplerAddressModeZ::None;
-    }
-
-    return dst_sampler;
-}
-
-class GpuOutputSampler
-{
-public:
-    GpuOutputSampler() = default;
-
-    /**
-     * @brief Method used to initialize the GpuOutputSampler. The GpuOutputSampler can be initialized only once
-     *        by the root component. Once initialized, all simpler components will need to used this sampler
-     *        or a broadcasted version of it
-     *
-     * @param[in] sampler GpuSampler
-     * @param[in] step_x  Increment step in the X direction. Not necessarily it is the same of n0 of tile!
-     * @param[in] step_y  Increment step in the Y direction. Not necessarily it is the same of m0 of tile!
-     * @param[in] step_z  Increment step in the Z direction. Not necessarily it is the same of d0 of tile!
-     */
-    void initialize(const TensorInfo       *tensor_info_id,
-                    GpuSamplerTensorStorage tensor_storage,
-                    TensorSamplerFormat     tensor_format,
-                    int32_t                 step_x,
-                    int32_t                 step_y,
-                    int32_t                 step_z)
-    {
-        assert(_is_initialized == false);
-
-        _step_x         = step_x;
-        _step_y         = step_y;
-        _step_z         = step_z;
-        _tensor_info_id = tensor_info_id;
-        _sampler        = create_sampler(tensor_storage, tensor_format);
-        _is_initialized = true;
-    };
-
-    GpuSampler sampler() const
-    {
-        return _sampler;
-    };
-
-    int32_t step_x() const
-    {
-        return _step_x;
-    };
-
-    int32_t step_y() const
-    {
-        return _step_y;
-    };
-
-    int32_t step_z() const
-    {
-        return _step_z;
-    };
-
-private:
-    GpuSampler create_sampler(GpuSamplerTensorStorage tensor_storage, TensorSamplerFormat tensor_format)
-    {
-        // Output can only be in output mode
-        assert(tensor_storage != GpuSamplerTensorStorage::Image2dReadOnly);
-        assert(tensor_storage != GpuSamplerTensorStorage::Image3dReadOnly);
-
-        auto tensor = _tensor_info_id->shape;
-
-        GpuSampler sampler;
-        sampler.format         = tensor_format;
-        sampler.storage        = tensor_storage;
-        sampler.address_mode_x = TensorSamplerAddressModeX::None;
-        sampler.address_mode_y = TensorSamplerAddressModeY::None;
-        sampler.address_mode_z = TensorSamplerAddressModeZ::None;
-
-        // In the case of texture, we do not need any special checks at the border
-        if (tensor_storage == GpuSamplerTensorStorage::BufferUint8Ptr)
-        {
-            int32_t dim_x = 0;
-            int32_t dim_y = 0;
-            int32_t dim_z = 0;
-
-            switch (tensor_format)
-            {
-                case TensorSamplerFormat::C_W_H:
-                    dim_x = tensor[0];
-                    dim_y = tensor[1];
-                    dim_z = tensor[2];
-                    break;
-                case TensorSamplerFormat::C_WH_1:
-                    dim_x = tensor[0];
-                    dim_y = tensor[1] * tensor[2];
-                    dim_z = 1;
-                    break;
-                default:
-                    std::cout << "Unsupported tensor format" << std::endl;
-                    assert(false);
-                    break;
-            }
-
-            if ((dim_x % _step_x) != 0 && dim_x != 1)
-            {
-                sampler.address_mode_x = TensorSamplerAddressModeX::OverlappingMin;
-            }
-
-            if ((dim_y % _step_y) != 0 && dim_y != 1)
-            {
-                sampler.address_mode_y = TensorSamplerAddressModeY::ClampToMaxEdgeOnly;
-            }
-
-            if ((dim_z % _step_z) != 0 && dim_z != 1)
-            {
-                sampler.address_mode_z = TensorSamplerAddressModeZ::ClampToMaxEdgeOnly;
-            }
-        }
-
-        return sampler;
-    }
-
-    GpuSampler        _sampler{}; // GpuSampler
-    int32_t           _step_x{1};
-    int32_t           _step_y{1};
-    int32_t           _step_z{1};
-    const TensorInfo *_tensor_info_id{nullptr};
-    bool              _is_initialized{false};
-};
-
-/**
- * @brief Tensor operand class. This object is used to pass the operands as tensor to the operations performed by the writer.
- */
-class TensorOperand
-{
-public:
-    TensorOperand(const std::string &val, GpuSampler sampler) : _str(val), _sampler(sampler)
-    {
-    }
-
-    TensorOperand &operator=(const TensorOperand &t)
-    {
-        _str     = t.value();
-        _sampler = t.sampler();
-        return *this;
-    }
-
-    std::string value() const
-    {
-        return _str;
-    }
-
-    GpuSampler sampler() const
-    {
-        return _sampler;
-    }
-
-private:
-    std::string _str{};
-    GpuSampler  _sampler{};
-};
-
-/**
- * @brief Data structure that contains all the necessary information to write the Gpu kernel with the Gpu kernel Writer
- *        This data structure must be initialized before being passed to the Gpu Kernel Writer
- *
- */
-class GpuKernelWriterDataHolder
-{
-public:
-    /**
-     * @brief Construct a new Gpu Kernel Data object. In this phase, we should also store
-     *        the GPU target and target specific capabilities (extensions). For now, we just initialize the
-     *        programming language
-     *
-     * @param[in] language Gpu programming language to use
-     */
-    GpuKernelWriterDataHolder(GpuTargetLanguage language)
-        : tiles(language), arguments(language), code(""), _language(language)
-    {
-    }
-
-    /**
-     * @brief Get the Gpu programming language used
-     *
-     * @return GpuTargetLanguage the Gpu programming language
-     */
-    GpuTargetLanguage programming_language() const
-    {
-        return _language;
-    }
-
-    /**
-     * @brief @ref GpuTileRegistry
-     *
-     */
-    GpuTileRegistry tiles{};
-    /**
-     * @brief @ref GpuTensorArgumentRegistry
-     *
-     */
-    GpuTensorArgumentRegistry arguments{};
-    /**
-     * @brief @ref GpuOutputSampler.
-     *
-     */
-    GpuOutputSampler output_sampler{};
-    /**
-     * @brief Source code
-     *
-     */
-    std::string code{};
-
-    // GpuExtensionRegistry extensions{};
-private:
-    GpuTargetLanguage _language;
-};
-
-struct LWS
-{
-    int32_t x{1};
-    int32_t y{1};
-    int32_t z{1};
-};
-
-/**
- * @brief Utility class used to get the tile from the operand. If the operand is not a tile, @ref OperandUnpacker
- *        declare an anonymous tile in the tile registry.
- */
-class OperandUnpacker
-{
-public:
-    OperandUnpacker(GpuTileRegistry &tiles, GpuTensorArgumentRegistry &arguments) : _tiles(tiles), _arguments(arguments)
-    {
-        // Increase the level of the stack to allocate possible temporary tiles
-        _tiles.increment_registry_level();
-    };
-
-    ~OperandUnpacker()
-    {
-        // Decrease the level of the stack to deallocate any temporary tiles
-        _tiles.decrement_registry_level();
-    }
-
-    IVectorTile *unpack(const Operand &src)
-    {
-        // Get the tile
-        if (src.type() == OperandType::Tile)
-        {
-            assert(_tiles.has_tile(src.value()));
-            return _tiles[src.value()];
-        }
-        // Create an anonymous tile with a constant
-        else if (static_cast<int32_t>(src.type()) & 0x00001000)
-        {
-            if (src.type() == OperandType::ScalarTile)
-            {
-                ScalarTileCoord coord = src.scalar_tile_coordinate();
-                assert(_tiles.has_tile(src.value()));
-                assert(coord.x >= 0);
-                assert(coord.y >= 0);
-                auto val = _tiles[src.value()]->scalar(coord.x, coord.y);
-                return _tiles.insert({{{val.str}}}, val.type.dt);
-            }
-            else
-            {
-                return _tiles.insert({{{src.value()}}}, to_tile_data_type(src.type()));
-            }
-        }
-        // Create an anonymous tile with the tensor component
-        else
-        {
-            assert(_arguments.has_tensor_argument(src.value()));
-            auto              x   = _arguments[src.value()];
-            const std::string val = x->component(to_tensor_component(src.type()));
-            const DataType    dt  = x->component_data_type();
-            return _tiles.insert({{{val}}}, dt);
-        }
-    }
-
-private:
-    DataType to_tile_data_type(OperandType x)
-    {
-        return static_cast<DataType>(static_cast<int32_t>(x) & 0x00ff);
-    }
-
-    TensorComponentType to_tensor_component(OperandType x)
-    {
-        switch (x)
-        {
-            case OperandType::TensorDim0:
-                return TensorComponentType::Dim0;
-            case OperandType::TensorDim1:
-                return TensorComponentType::Dim1;
-            case OperandType::TensorDim2:
-                return TensorComponentType::Dim2;
-            case OperandType::TensorDim3:
-                return TensorComponentType::Dim3;
-            case OperandType::TensorDim4:
-                return TensorComponentType::Dim4;
-            case OperandType::TensorStride1:
-                return TensorComponentType::Stride1;
-            case OperandType::TensorStride2:
-                return TensorComponentType::Stride2;
-            case OperandType::TensorStride3:
-                return TensorComponentType::Stride3;
-            case OperandType::TensorStride4:
-                return TensorComponentType::Stride4;
-            case OperandType::TensorDim1xDim2:
-                return TensorComponentType::Dim1xDim2;
-            case OperandType::TensorDim1xDim2xDim3:
-                return TensorComponentType::Dim1xDim2xDim3;
-            case OperandType::TensorDataOffset:
-                return TensorComponentType::OffsetFirstElement;
-            default:
-                assert(false);
-                return TensorComponentType::Unknown;
-        }
-    }
-
-    GpuTileRegistry           &_tiles;
-    GpuTensorArgumentRegistry &_arguments;
-};
-
-/**
- * @brief Utility class used to get the tensor argument from the operand. If the operand is not a tile, @ref OperandUnpacker
- *        declare an anonymous tile in the tile registry.
- *        Tensor dimension reduction aims for reducing the tensor data dimension while keeping data's tensor structure.
- */
-class TensorOperandUnpacker
-{
-public:
-    TensorOperandUnpacker(GpuTensorArgumentRegistry &arguments) : _arguments(arguments){};
-
-    IGpuTensorArgument *unpack(const TensorOperand &src)
-    {
-        assert(_arguments.has_tensor_argument(src.value()));
-        return _arguments[src.value()];
-    }
-
-private:
-    GpuTensorArgumentRegistry &_arguments;
-};
-
-/**
- * @brief The GpuKernel will be used in three occasions (stages):
- * #- Compilation stage
- * #- Tuning stage
- * #- Dispatch stage
- */
-struct GpuKernel
-{
-    // Compilation stage
-    std::string                code{};            // Source code, required for the compilation stage
-    std::vector<GpuExtensions> list_extensions{}; // Extensions, required for the compilation stage
-    // Tuning stage
-    std::string      config_id{}; // Unique id, required for the tuning stage
-    std::vector<LWS> list_lws{};  // LWS to test, required for the tuning stage
-    // Dispatch stage
-    GpuOutputSampler output_sampler{}; // GpuOutputSampler, required for the dispatch stage
-    std::vector<std::pair<int32_t, GpuTensorStorage>>
-        list_tensor_storages; // List of tensor storages, required for the dispatch stage
-    std::vector<std::pair<int32_t, TensorComponentType>>
-        list_tensor_components; // List of tensor components (width, stride,..), required for the dispatch stage)
-};
-
-// Generate all extension pragmas (hardcoded for now)
-inline std::string generate_extensions()
-{
-    std::string ext = R"(
-#if defined(cl_khr_fp16)
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-#endif // defined(cl_khr_fp16)
-
-#if defined(cl_arm_integer_dot_product_int8)
-#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
-#endif // defined(cl_arm_integer_dot_product_int8)
-
-#if defined(cl_arm_integer_dot_product_accumulate_int8)
-#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
-#endif // defined(cl_arm_integer_dot_product_accumulate_int8)
-
-#if defined(cl_arm_printf)
-#pragma OPENCL EXTENSION cl_arm_printf : enable
-#endif // defined(cl_arm_printf);
-)";
-    return ext;
-}
-
-// This function should produce an object with the source
-inline std::string generate_code(GpuKernelWriterDataHolder &in, const std::string &name)
-{
-    std::string code;
-    code += generate_extensions();
-    code += "__kernel void ";
-    code += name;
-    code += "(\n";
-
-    auto IdSpaces = in.arguments.IdSpace_declarations();
-
-    std::vector<std::string> arg_str;
-
-    auto tensor_args = in.arguments.tensor_argument_declarations();
-
-    for (auto &i : tensor_args)
-    {
-        // For each tensor used, get the storage and tensor components
-        auto storages   = i->storage_declarations();
-        auto components = i->component_declarations();
-
-        for (auto &y : storages)
-        {
-            std::string str;
-            str += i->storage_type_declaration(y);
-            str += " ";
-            str += i->storage(y);
-            arg_str.push_back(str);
-        }
-
-        for (auto &y : components)
-        {
-            std::string str;
-            str += i->component_type_declaration();
-            str += " ";
-            str += i->component(y);
-            arg_str.push_back(str);
-        }
-    }
-
-    for (size_t i = 0; i < arg_str.size(); ++i)
-    {
-        code += arg_str[i];
-        if (i + 1 < arg_str.size())
-        {
-            code += ",\n";
-        }
-    }
-
-    code += ")\n";
-    code += "{\n";
-    code += in.code;
-    code += "}\n";
-
-    return code;
-}
-
-/**
- * @brief This class is responsible to map a N-Tensor to a 3d tensor. The mapper needs the GpuSampler to know
- * how to reduce the dimensionality of a tensor
- *
- */
-class GpuTensor3dMapper
-{
-public:
-    GpuTensor3dMapper(IGpuTensorArgument *tensor, GpuSampler sampler) : _sampler(sampler), _tensor(tensor){};
-
-    std::string tensor_component_x() const
-    {
-        const auto format = _sampler.format;
-        switch (format)
-        {
-            case TensorSamplerFormat::C_WH_1:
-            case TensorSamplerFormat::C_W_H:
-                return _tensor->component(TensorComponentType::Dim0);
-            default:
-                std::cout << "Unsupported tensor format" << std::endl;
-                assert(false);
-                return "";
-        }
-    }
-
-    std::string tensor_component_y() const
-    {
-        const auto format = _sampler.format;
-        switch (format)
-        {
-            case TensorSamplerFormat::C_WH_1:
-                return _tensor->component(TensorComponentType::Dim1xDim2);
-            case TensorSamplerFormat::C_W_H:
-                return _tensor->component(TensorComponentType::Dim1);
-            default:
-                std::cout << "Unsupported tensor format" << std::endl;
-                assert(false);
-                return "";
-        }
-    }
-
-    std::string tensor_component_z() const
-    {
-        const auto format = _sampler.format;
-        switch (format)
-        {
-            case TensorSamplerFormat::C_WH_1:
-                return "1";
-            case TensorSamplerFormat::C_W_H:
-                return _tensor->component(TensorComponentType::Dim2);
-            default:
-                std::cout << "Unsupported tensor format" << std::endl;
-                assert(false);
-                return "";
-        }
-    }
-
-    std::string tensor_component_stride_y() const
-    {
-        const auto format = _sampler.format;
-        switch (format)
-        {
-            case TensorSamplerFormat::C_WH_1:
-            case TensorSamplerFormat::C_W_H:
-                return _tensor->component(TensorComponentType::Stride1);
-            default:
-                std::cout << "Unsupported tensor format" << std::endl;
-                assert(false);
-                return "";
-        }
-    }
-
-    std::string tensor_component_stride_z() const
-    {
-        const auto format = _sampler.format;
-        switch (format)
-        {
-            case TensorSamplerFormat::C_WH_1:
-                return "0";
-            case TensorSamplerFormat::C_W_H:
-                return _tensor->component(TensorComponentType::Stride2);
-            default:
-                std::cout << "Unsupported tensor format" << std::endl;
-                assert(false);
-                return "";
-        }
-    }
-
-    std::string tensor_component_stride_batch() const
-    {
-        const auto format = _sampler.format;
-        switch (format)
-        {
-            case TensorSamplerFormat::C_WH_1:
-            case TensorSamplerFormat::C_W_H:
-                return _tensor->component(TensorComponentType::Stride3);
-            default:
-                std::cout << "Unsupported tensor format" << std::endl;
-                assert(false);
-                return "";
-        }
-    }
-
-    bool is_one_component_x() const
-    {
-        auto       t      = _tensor->format();
-        const auto format = _sampler.format;
-        switch (format)
-        {
-            case TensorSamplerFormat::C_WH_1:
-            case TensorSamplerFormat::C_W_H:
-                return t.shape[0] == 1;
-            default:
-                std::cout << "Unsupported tensor format" << std::endl;
-                assert(false);
-                return "";
-        }
-    }
-
-    bool is_one_component_y() const
-    {
-        auto       t      = _tensor->format();
-        const auto format = _sampler.format;
-        switch (format)
-        {
-            case TensorSamplerFormat::C_WH_1:
-                return (t.shape[1] * t.shape[2]) == 1;
-            case TensorSamplerFormat::C_W_H:
-                return t.shape[1] == 1;
-            default:
-                std::cout << "Unsupported tensor format" << std::endl;
-                assert(false);
-                return "";
-        }
-    }
-
-    bool is_one_component_z() const
-    {
-        auto       t      = _tensor->format();
-        const auto format = _sampler.format;
-        switch (format)
-        {
-            case TensorSamplerFormat::C_WH_1:
-                return true;
-            case TensorSamplerFormat::C_W_H:
-                return t.shape[2] == 1;
-            default:
-                std::cout << "Unsupported tensor format" << std::endl;
-                assert(false);
-                return "";
-        }
-    }
-
-    bool is_one_component_batch() const
-    {
-        auto       t      = _tensor->format();
-        const auto format = _sampler.format;
-        switch (format)
-        {
-            case TensorSamplerFormat::C_WH_1:
-            case TensorSamplerFormat::C_W_H:
-                return t.shape[3] == 1;
-            default:
-                std::cout << "Unsupported tensor format" << std::endl;
-                assert(false);
-                return "";
-        }
-    }
-
-    GpuSampler gpu_sampler() const
-    {
-        return _sampler;
-    }
-
-    IGpuTensorArgument *tensor_argument() const
-    {
-        return _tensor;
-    }
-
-private:
-    GpuSampler          _sampler;
-    IGpuTensorArgument *_tensor;
-};
-
-struct GpuKernelWriterAttribute
-{
-    bool return_tensor_component_by_value{false};
-};
-
-enum class RoundingMode
-{
-    None,
-    Rte,
-    Rtz,
-    Rtp,
-    Rtn
-};
-
-// https://llvm.org/docs/tutorial/MyFirstLanguageFrontend/LangImpl05.html
-class IGpuKernelWriter
-{
-public:
-    virtual ~IGpuKernelWriter() = default;
-
-    virtual void set_IdSpace(int32_t id) = 0;
-
-    virtual void import_tile(const std::string &dst, const IVectorTile *src) = 0;
-
-    virtual void declare_argument(const std::string &name, const TensorInfo &tensor) = 0;
-
-    virtual void declare_tile(const std::string &name, const TileInfo &info) = 0;
-
-    virtual void
-    declare_const_tile(const std::string &name, const std::vector<std::vector<std::string>> &in, DataType dt) = 0;
-
-    virtual void write_text(const std::string &x) = 0;
-
-    virtual void compound_statement_begin() = 0;
-
-    virtual void compound_statement_end() = 0;
-
-    // Operations
-    virtual void op_get_global_id(const Operand &dst_var, int32_t dim) = 0;
-
-    virtual void
-    op_get_global_coord(const Operand &dst, const Operand &step, const TensorOperand &tensor, int32_t dim) = 0;
-
-    virtual void op_get_global_batch(const Operand &dst, const TensorOperand &tensor) = 0;
-
-    virtual void op_get_global_size(const Operand &dst_var, int32_t dim) = 0;
-
-    virtual void op_unary_expression(const Operand &dst, UnaryOp op, const Operand &src) = 0;
-
-    virtual void op_binary_expression(const Operand &dst, const Operand &lhs, BinaryOp op, const Operand &rhs) = 0;
-
-    virtual void op_assign(const Operand &dst_name, const Operand &src_name) = 0;
-
-    virtual void
-    op_unary_elementwise_function(const Operand &dst_name, UnaryFunction func, const Operand &src_name) = 0;
-
-    virtual void op_binary_elementwise_function(const Operand &dst_name,
-                                                BinaryFunction func,
-                                                const Operand &first_name,
-                                                const Operand &second_name) = 0;
-
-    virtual void op_ternary_elementwise_function(const Operand  &dst_name,
-                                                 TernaryFunction func,
-                                                 const Operand  &first_name,
-                                                 const Operand  &second_name,
-                                                 const Operand  &third_name) = 0;
-
-    virtual void op_if_header(const Operand &lhs, BinaryOp op, const Operand &rhs) = 0;
-
-    virtual void op_else_if_header(const Operand &lhs, BinaryOp op, const Operand &rhs) = 0;
-
-    virtual void op_else_header() = 0;
-
-    virtual void op_for_loop_header(const Operand &var_name,
-                                    BinaryOp       cond_op,
-                                    const Operand &cond_value,
-                                    const Operand &update_var,
-                                    AssignmentOp   update_op,
-                                    const Operand &update_value) = 0;
-
-    virtual void op_load_indirect(const TensorOperand &tensor,
-                                  const Operand       &dst,
-                                  const Operand       &x,
-                                  const Operand       &y_indirect,
-                                  const Operand       &z,
-                                  const Operand       &b = Operand("0", OperandType::ScalarInt32)) = 0;
-
-    virtual void op_load_immediate(const TensorOperand &tensor,
-                                   const Operand       &dst,
-                                   const Operand       &x,
-                                   const Operand       &y,
-                                   const Operand       &z,
-                                   const Operand       &b          = Operand("0", OperandType::ScalarInt32),
-                                   const Operand       &dilation_y = Operand("1", OperandType::ScalarInt32)) = 0;
-
-    virtual void op_store_immediate(const TensorOperand &tensor,
-                                    const Operand       &src,
-                                    const Operand       &x,
-                                    const Operand       &y,
-                                    const Operand       &z,
-                                    const Operand       &b = Operand("0", OperandType::ScalarInt32)) = 0;
-
-    virtual void op_cast_expression(const Operand &dst, const Operand &src, ConvertPolicy policy) = 0;
-
-    virtual void op_return() = 0;
-
-    // Utils
-    // It is the process of converting
-    virtual void util_get_indirect_buffer(const Operand       &dst,
-                                          const TensorOperand &tensor,
-                                          const Operand       &x,
-                                          const Operand       &y,
-                                          const Operand       &x_off,
-                                          const Operand       &y_off) = 0;
-};
-
-enum class GpuLoadStoreType
-{
-    Load  = 1,
-    Store = 2
-};
-
-class IGpuLoadStoreHelperWriter
-{
-public:
-    IGpuLoadStoreHelperWriter(IGpuKernelWriter *x, GpuTensor3dMapper mapper, GpuLoadStoreType type)
-        : _writer(x), _mapper(mapper), _type(type)
-    {
-    }
-
-    IGpuLoadStoreHelperWriter(const IGpuLoadStoreHelperWriter &) = default;
-
-    IGpuLoadStoreHelperWriter &operator=(const IGpuLoadStoreHelperWriter &) = default;
-
-    virtual ~IGpuLoadStoreHelperWriter() = default;
-
-    virtual void initialize(IVectorTile *dst, IVectorTile *x, IVectorTile *z, IVectorTile *b) = 0;
-
-    virtual void write(const std::pair<int32_t, std::string> &y) = 0;
-
-    virtual void finalize() = 0;
-
-protected:
-    IGpuKernelWriter *_writer;
-    GpuTensor3dMapper _mapper;
-    GpuLoadStoreType  _type;
-};
-
-class ClLoadStoreBufferHelperWriter : public IGpuLoadStoreHelperWriter
-{
-public:
-    ClLoadStoreBufferHelperWriter(IGpuKernelWriter *x, const GpuTensor3dMapper &mapper, GpuLoadStoreType type)
-        : IGpuLoadStoreHelperWriter(x, mapper, type)
-    {
-    }
-
-    ClLoadStoreBufferHelperWriter(const ClLoadStoreBufferHelperWriter &) = default;
-
-    ClLoadStoreBufferHelperWriter &operator=(const ClLoadStoreBufferHelperWriter &) = default;
-
-    static bool validate(IGpuKernelWriter *x, GpuTensor3dMapper mapper, GpuLoadStoreType type, IVectorTile *dst)
-    {
-        CKW_UNUSED(x, type, dst);
-
-        if (mapper.gpu_sampler().storage != GpuSamplerTensorStorage::BufferUint8Ptr)
-        {
-            return false;
-        }
-        return true;
-    }
-
-    void initialize(IVectorTile *dst, IVectorTile *x, IVectorTile *z, IVectorTile *b) override
-    {
-        assert(validate(_writer, _mapper, _type, dst));
-
-        _dst           = dst;
-        _ls_width_full = dst->format().w;
-
-        _coord_x      = x->scalar(0, 0).str;
-        _coord_z      = z->scalar(0, 0).str;
-        _coord_b      = b->scalar(0, 0).str;
-        _coord_orig_z = _coord_z;
-
-        out_of_bound_initialize_x(_coord_x);
-        out_of_bound_initialize_z(_coord_z);
-
-        /*
-        meaning of else:
-        - x: partial load/store
-        - y: no load/store operation
-        - z: no load/store operation
-        if(x)
-        {
-            if(z)
-            {
-                if(y)
-                {
-                    // full load/store width
-                }
-                else
-                {
-                    // no load/store
-                }
-            }
-            else
-            {
-                // no load/store
-            }
-        }
-        else
-        {
-            if(z)
-            {
-                if(y)
-                {
-                    // partial load/store width
-                }
-                else
-                {
-                    // no load/store
-                }
-            }
-            else
-            {
-                // no load/store
-            }
-        }
-        */
-    }
-
-    void write(const std::pair<int32_t, std::string> &y) override
-    {
-        int32_t     idx_y   = y.first;
-        std::string coord_y = y.second;
-
-        // The only check required is on Y.
-        out_of_bound_initialize_y(coord_y);
-
-        const std::string dst     = _dst->vector(idx_y).str;
-        const std::string address = to_ls_buffer_address(_coord_x, coord_y, _coord_z, _coord_b);
-        const std::string ls_buf  = to_ls_buffer(_type, _ls_width_full, dst, address);
-
-        _writer->write_text(ls_buf);
-        _writer->write_text(";\n");
-
-        out_of_bound_finalize_y(dst);
-
-        // The left over load/store will be written in the finalize stage
-        if (_ls_width_part.size() != 0)
-        {
-            int32_t w = 0;
-            for (auto &p : _ls_width_part)
-            {
-                const std::string dst0    = _dst->vector(w, p, idx_y).str;
-                const std::string coord_x = _coord_x + " + " + std::to_string(w);
-                const std::string address = to_ls_buffer_address(coord_x, coord_y, _coord_z, _coord_b);
-                const std::string ls_buf0 = to_ls_buffer(_type, p, dst0, address);
-                _leftovers_x.push_back(std::make_pair(std::make_pair(dst0, coord_y), ls_buf0));
-
-                w += p;
-            }
-        }
-    }
-
-    void finalize() override
-    {
-        out_of_bound_finalize_z();
-        out_of_bound_finalize_x();
-    }
-
-private:
-    IVectorTile                                                             *_dst{nullptr};
-    int32_t                                                                  _ls_width_full{0};
-    std::vector<int32_t>                                                     _ls_width_part{};
-    std::vector<std::pair<std::pair<std::string, std::string>, std::string>> _leftovers_x{};
-    std::string                                                              _coord_x{};
-    std::string                                                              _coord_z{};
-    std::string                                                              _coord_orig_z{};
-    std::string                                                              _coord_b{};
-
-    void out_of_bound_initialize_x(std::string &coord)
-    {
-        if (_mapper.gpu_sampler().address_mode_x == TensorSamplerAddressModeX::OverlappingMin)
-        {
-            auto tensor_format = _mapper.tensor_argument()->format();
-            auto shape         = tensor_format.shape;
-
-            _ls_width_part = decompose_leftover_ls_vector_width(shape[0] % _ls_width_full);
-            if (_ls_width_part.size() != 0)
-            {
-                _writer->write_text("if(" + coord + " > 0)\n");
-                _writer->compound_statement_begin();
-            }
-        }
-    };
-
-    void out_of_bound_finalize_x()
-    {
-        if (_mapper.gpu_sampler().address_mode_x == TensorSamplerAddressModeX::OverlappingMin)
-        {
-            if (_ls_width_part.size() != 0)
-            {
-                _writer->compound_statement_end();
-                _writer->write_text("else\n");
-                _writer->compound_statement_begin();
-
-                out_of_bound_initialize_z(_coord_orig_z);
-                for (auto &i : _leftovers_x)
-                {
-                    out_of_bound_initialize_y(i.first.second);
-                    _writer->write_text(i.second);
-                    _writer->write_text(";\n");
-                    out_of_bound_finalize_y(i.first.first);
-                }
-                out_of_bound_finalize_z();
-                _writer->compound_statement_end();
-            }
-        }
-    };
-
-    void out_of_bound_initialize_y(std::string &coord)
-    {
-        std::string max = "";
-
-        const auto address_mode_y = _mapper.gpu_sampler().address_mode_y;
-
-        switch (address_mode_y)
-        {
-            case TensorSamplerAddressModeY::Skip:
-            case TensorSamplerAddressModeY::ClampToBorder:
-                // NOTE: This line should not be moved outside of the switch statement.
-                // The reason for that is because when we query the component, the component is marked as used
-                // and added to the list of arguments of the kernel. Since, not in all cases this component is required,
-                // we should request the component only when used
-                max = _mapper.tensor_component_y();
-                _writer->write_text("if((" + coord + " >= 0) && (" + coord + " < " + max + "))\n");
-                _writer->compound_statement_begin();
-                break;
-            case TensorSamplerAddressModeY::SkipMinEdgeOnly:
-            case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly:
-                _writer->write_text("if(" + coord + " >= 0)\n");
-                _writer->compound_statement_begin();
-                break;
-            case TensorSamplerAddressModeY::SkipMaxEdgeOnly:
-            case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly:
-                max = _mapper.tensor_component_y();
-                _writer->write_text("if(" + coord + " < " + max + ")\n");
-                _writer->compound_statement_begin();
-                break;
-            case TensorSamplerAddressModeY::ClampToNearest:
-                max   = _mapper.tensor_component_y();
-                coord = "clamp(" + coord + ", 0, " + max + " - 1)";
-                break;
-            case TensorSamplerAddressModeY::ClampToMaxEdgeOnly:
-                max   = _mapper.tensor_component_y();
-                coord = "min(" + coord + ", " + max + " - 1)";
-                break;
-            case TensorSamplerAddressModeY::ClampToMinEdgeOnly:
-                coord = "max(" + coord + ", 0)";
-                break;
-            case TensorSamplerAddressModeY::None:
-                break;
-            default:
-                std::cout << "Unsupported address mode for write_out_of_bound_check_yz" << std::endl;
-                assert(false);
-        }
-    };
-
-    void out_of_bound_finalize_y(const std::string &dst)
-    {
-        const auto address_mode_y = _mapper.gpu_sampler().address_mode_y;
-
-        switch (address_mode_y)
-        {
-            case TensorSamplerAddressModeY::ClampToBorder:
-            case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly:
-            case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly:
-            case TensorSamplerAddressModeY::Skip:
-            case TensorSamplerAddressModeY::SkipMaxEdgeOnly:
-            case TensorSamplerAddressModeY::SkipMinEdgeOnly:
-                _writer->compound_statement_end();
-                break;
-            case TensorSamplerAddressModeY::None:
-                break;
-
-            default:
-                assert(false);
-        }
-
-        switch (address_mode_y)
-        {
-            case TensorSamplerAddressModeY::ClampToBorder:
-            case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly:
-            case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly:
-                _writer->write_text("else\n");
-                _writer->compound_statement_begin();
-                _writer->write_text(dst);
-                _writer->write_text(" = 0.0f;\n");
-                _writer->compound_statement_end();
-                break;
-            case TensorSamplerAddressModeY::None:
-                break;
-
-            default:
-                assert(false);
-        }
-    };
-
-    void out_of_bound_initialize_z(std::string &coord)
-    {
-        std::string max = "";
-
-        const auto address_mode_z = _mapper.gpu_sampler().address_mode_z;
-
-        switch (address_mode_z)
-        {
-            case TensorSamplerAddressModeZ::Skip:
-                max = _mapper.tensor_component_z();
-                _writer->write_text("if((" + coord + " >= 0) && (" + coord + " < " + max + "))\n");
-                _writer->compound_statement_begin();
-                break;
-            case TensorSamplerAddressModeZ::SkipMinEdgeOnly:
-                _writer->write_text("if(" + coord + " >= 0)\n");
-                _writer->compound_statement_begin();
-                break;
-            case TensorSamplerAddressModeZ::SkipMaxEdgeOnly:
-                max = _mapper.tensor_component_z();
-                _writer->write_text("if(" + coord + " < " + max + ")\n");
-                _writer->compound_statement_begin();
-                break;
-            case TensorSamplerAddressModeZ::ClampToNearest:
-                max   = _mapper.tensor_component_z();
-                coord = "clamp(" + coord + ", 0, " + max + " - 1)";
-                break;
-            case TensorSamplerAddressModeZ::ClampToMaxEdgeOnly:
-                max   = _mapper.tensor_component_z();
-                coord = "min(" + coord + ", " + max + " - 1)";
-                break;
-            case TensorSamplerAddressModeZ::ClampToMinEdgeOnly:
-                coord = "max(" + coord + ", 0)";
-                break;
-            case TensorSamplerAddressModeZ::None:
-                break;
-            default:
-                std::cout << "Unsupported address mode for write_out_of_bound_check_yz" << std::endl;
-                assert(false);
-        }
-    };
-
-    void out_of_bound_finalize_z()
-    {
-        const auto address_mode_z = _mapper.gpu_sampler().address_mode_z;
-
-        switch (address_mode_z)
-        {
-            case TensorSamplerAddressModeZ::Skip:
-            case TensorSamplerAddressModeZ::SkipMinEdgeOnly:
-            case TensorSamplerAddressModeZ::SkipMaxEdgeOnly:
-                _writer->compound_statement_end();
-                break;
-            case TensorSamplerAddressModeZ::None:
-                break;
-
-            default:
-                assert(false);
-        }
-    };
-
-    std::vector<int32_t> decompose_leftover_ls_vector_width(int32_t ls_leftover_vector_width) const
-    {
-        std::vector<int32_t> x;
-
-        switch (ls_leftover_vector_width)
-        {
-            case 0:
-                break;
-            case 1:
-            case 2:
-            case 3:
-            case 4:
-            case 8:
-            case 16:
-                x.push_back(ls_leftover_vector_width);
-                break;
-            case 5:
-                x.push_back(4);
-                x.push_back(1);
-                break;
-            case 6:
-                x.push_back(4);
-                x.push_back(2);
-                break;
-            case 7:
-                x.push_back(4);
-                x.push_back(3);
-                break;
-            case 9:
-                x.push_back(8);
-                x.push_back(1);
-                break;
-            case 10:
-                x.push_back(8);
-                x.push_back(2);
-                break;
-            case 11:
-                x.push_back(8);
-                x.push_back(3);
-                break;
-            case 12:
-                x.push_back(8);
-                x.push_back(4);
-                break;
-            case 13:
-                x.push_back(8);
-                x.push_back(4);
-                x.push_back(1);
-                break;
-            case 14:
-                x.push_back(8);
-                x.push_back(4);
-                x.push_back(2);
-                break;
-            case 15:
-                x.push_back(8);
-                x.push_back(4);
-                x.push_back(3);
-                break;
-
-            default:
-                assert(false);
-        }
-        return x;
-    }
-
-    std::string
-    to_ls_buffer(GpuLoadStoreType type, int32_t vector_width, const std::string &data, const std::string &address)
-    {
-        switch (type)
-        {
-            case GpuLoadStoreType::Load:
-                if (vector_width != 1)
-                {
-                    return data + " = vload" + std::to_string(vector_width) + "(0, " + address + ")";
-                }
-                else
-                {
-                    return data + " = *(" + address + ")";
-                }
-                break;
-            case GpuLoadStoreType::Store:
-                if (vector_width != 1)
-                {
-                    return "vstore" + std::to_string(vector_width) + "(" + data + ", 0, " + address + ")";
-                }
-                else
-                {
-                    return "*(" + address + ") = " + data;
-                }
-                break;
-            default:
-                std::cout << "Unsupported GpuLoadStoreType" << std::endl;
-                assert(false);
-                return "";
-        }
-    }
-
-    std::string
-    to_ls_buffer_address(const std::string &x, const std::string &y, const std::string &z, const std::string &b) const
-    {
-        auto tensor_storage = static_cast<GpuTensorStorage>(_mapper.gpu_sampler().storage);
-        assert(tensor_storage == GpuTensorStorage::BufferUint8Ptr);
-        const std::string ptr_buf  = _mapper.tensor_argument()->storage(tensor_storage);
-        const std::string dst_type = get_cl_data_type(_dst->format().dt, 1);
-
-        std::string address;
-        address += "(__global ";
-        address += dst_type;
-        address += "*)(";
-        address += ptr_buf;
-        if (x != "0" && (_mapper.is_one_component_x() != true))
-        {
-            address += " + (";
-            address += x + ") * sizeof(" + dst_type + ")";
-        }
-        if (y != "0")
-        {
-            const std::string stride_y = _mapper.tensor_component_stride_y();
-            address += " + (";
-            address += y + ")";
-            address += " * ";
-            address += stride_y;
-        }
-        if (z != "0")
-        {
-            const std::string stride_z = _mapper.tensor_component_stride_z();
-            address += " + (";
-            address += z + ")";
-            address += " * ";
-            address += stride_z;
-        }
-        if (b != "0" && (_mapper.is_one_component_batch() != true))
-        {
-            const std::string stride_b = _mapper.tensor_component_stride_batch();
-            address += " + (";
-            address += b + ")";
-            address += " * ";
-            address += stride_b;
-        }
-        address += ")";
-        return address;
-    }
-};
-
-class ClLoadStoreImage2dHelperWriter : public IGpuLoadStoreHelperWriter
-{
-public:
-    static bool validate(IGpuKernelWriter *x, const GpuTensor3dMapper &mapper, GpuLoadStoreType type, IVectorTile *dst)
-    {
-        CKW_UNUSED(x);
-
-        if (dst->format().w != 4)
-        {
-            return false;
-        }
-        if (mapper.gpu_sampler().address_mode_x != TensorSamplerAddressModeX::None)
-        {
-            return false;
-        }
-        if (mapper.gpu_sampler().address_mode_z != TensorSamplerAddressModeZ::None)
-        {
-            return false;
-        }
-        if (mapper.gpu_sampler().storage != GpuSamplerTensorStorage::Image2dReadOnly && type == GpuLoadStoreType::Load)
-        {
-            return false;
-        }
-        if (mapper.gpu_sampler().storage != GpuSamplerTensorStorage::Image2dWriteOnly &&
-            type == GpuLoadStoreType::Store)
-        {
-            return false;
-        }
-        if ((dst->format().dt != DataType::Fp32) && (dst->format().dt != DataType::Fp16))
-        {
-            return false;
-        }
-        return true;
-        /*
-        - x: Only GpuSamplerAddressModeX::None is supported and vector length = 4
-        - z: Only GpuSamplerAddressModeZ::None is supported
-        */
-    }
-
-    ClLoadStoreImage2dHelperWriter(IGpuKernelWriter *x, const GpuTensor3dMapper &mapper, GpuLoadStoreType type)
-        : IGpuLoadStoreHelperWriter(x, mapper, type)
-    {
-    }
-
-    ClLoadStoreImage2dHelperWriter(const ClLoadStoreImage2dHelperWriter &) = default;
-
-    ClLoadStoreImage2dHelperWriter &operator=(const ClLoadStoreImage2dHelperWriter &) = default;
-
-    void initialize(IVectorTile *dst, IVectorTile *x, IVectorTile *z, IVectorTile *b) override
-    {
-        assert(validate(_writer, _mapper, _type, dst));
-
-        _dst           = dst;
-        _ls_width_full = dst->format().w;
-        _coord_x       = x->scalar(0, 0).str;
-        _coord_z       = z->scalar(0, 0).str;
-        _coord_b       = b->scalar(0, 0).str;
-
-        /*
-        if(y)
-        {
-            // full load/store width
-        }
-        else
-        {
-            // no load/store
-        }
-        */
-    }
-
-    void write(const std::pair<int32_t, std::string> &y) override
-    {
-        int32_t     idx_y   = y.first;
-        std::string coord_y = y.second;
-
-        // The only check required is on Y.
-        out_of_bound_initialize_y(coord_y);
-
-        const std::string dst     = _dst->vector(idx_y).str;
-        const std::string sampler = to_ls_image2d_sampler();
-        const std::string coord   = to_ls_image2d_coord(_coord_x, coord_y, _coord_z, _coord_b);
-        const std::string ls_buf  = to_ls_image2d(_type, _ls_width_full, dst, sampler, coord);
-
-        _writer->write_text(ls_buf);
-        _writer->write_text(";\n");
-
-        out_of_bound_finalize_y(dst);
-    }
-
-    void finalize() override
-    {
-    }
-
-private:
-    IVectorTile *_dst{nullptr};
-    int32_t      _ls_width_full{0};
-    std::string  _coord_x{};
-    std::string  _coord_z{};
-    std::string  _coord_b{};
-
-    void out_of_bound_initialize_y(std::string &coord)
-    {
-        std::string max = "";
-
-        const auto address_mode_y = _mapper.gpu_sampler().address_mode_y;
-
-        switch (address_mode_y)
-        {
-            case TensorSamplerAddressModeY::Skip:
-                max = _mapper.tensor_component_y();
-                _writer->write_text("if((" + coord + " >= 0) && (" + coord + " < " + max + "))\n");
-                _writer->compound_statement_begin();
-                break;
-            case TensorSamplerAddressModeY::SkipMinEdgeOnly:
-                _writer->write_text("if(" + coord + " >= 0)\n");
-                _writer->compound_statement_begin();
-                break;
-            case TensorSamplerAddressModeY::SkipMaxEdgeOnly:
-                max = _mapper.tensor_component_y();
-                _writer->write_text("if(" + coord + " < " + max + ")\n");
-                _writer->compound_statement_begin();
-                break;
-            case TensorSamplerAddressModeY::ClampToBorder:
-            case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly:
-            case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly:
-            case TensorSamplerAddressModeY::ClampToNearest:
-            case TensorSamplerAddressModeY::ClampToMaxEdgeOnly:
-            case TensorSamplerAddressModeY::ClampToMinEdgeOnly:
-            case TensorSamplerAddressModeY::None:
-                break;
-            default:
-                std::cout << "Unsupported address mode for write_out_of_bound_check_y" << std::endl;
-                assert(false);
-        }
-    };
-
-    void out_of_bound_finalize_y(const std::string &dst)
-    {
-        CKW_UNUSED(dst);
-
-        const auto address_mode_y = _mapper.gpu_sampler().address_mode_y;
-
-        switch (address_mode_y)
-        {
-            case TensorSamplerAddressModeY::Skip:
-            case TensorSamplerAddressModeY::SkipMinEdgeOnly:
-            case TensorSamplerAddressModeY::SkipMaxEdgeOnly:
-                _writer->compound_statement_end();
-                break;
-
-            default:
-                assert(false);
-        }
-    };
-
-    std::string to_ls_image2d(GpuLoadStoreType   type,
-                              int32_t            vector_width,
-                              const std::string &data,
-                              const std::string &sampler,
-                              const std::string &coord)
-    {
-        CKW_UNUSED(vector_width);
-
-        auto              tensor_storage = static_cast<GpuTensorStorage>(_mapper.gpu_sampler().storage);
-        const std::string image2d_obj    = _mapper.tensor_argument()->storage(tensor_storage);
-        const std::string post_fix       = _dst->format().dt == DataType::Fp32 ? "f" : "h";
-
-        switch (type)
-        {
-            case GpuLoadStoreType::Load:
-                return data + " = read_image" + post_fix + "(" + image2d_obj + ", " + sampler + ", " + coord + ")";
-                break;
-            case GpuLoadStoreType::Store:
-                return "write_image" + post_fix + "(" + image2d_obj + ", " + coord + ", " + data + ")";
-            default:
-                assert(false);
-                std::cout << "Unsupported GpuLoadStoreType" << std::endl;
-                assert(false);
-                return "";
-        }
-    }
-
-    std::string to_ls_image2d_sampler() const
-    {
-        const auto address_mode_y = _mapper.gpu_sampler().address_mode_y;
-
-        switch (address_mode_y)
-        {
-            case TensorSamplerAddressModeY::None:
-                return "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST";
-            case TensorSamplerAddressModeY::Skip:
-            case TensorSamplerAddressModeY::SkipMinEdgeOnly:
-            case TensorSamplerAddressModeY::SkipMaxEdgeOnly:
-            case TensorSamplerAddressModeY::ClampToBorder:
-            case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly:
-            case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly:
-                return "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST";
-            case TensorSamplerAddressModeY::ClampToNearest:
-            case TensorSamplerAddressModeY::ClampToMaxEdgeOnly:
-            case TensorSamplerAddressModeY::ClampToMinEdgeOnly:
-                return "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST";
-            default:
-                std::cout << "Unsupported address_mode_coord" << std::endl;
-                assert(false);
-                return "";
-        }
-    }
-
-    std::string
-    to_ls_image2d_coord(const std::string &x, const std::string &y, const std::string &z, const std::string &b) const
-    {
-        std::string coord_x = "(" + x + ") >> 2";
-        std::string coord_y = "(";
-
-        if (y != "0")
-        {
-            coord_y += y;
-        }
-        if (z != "0" && (_mapper.is_one_component_z() != true))
-        {
-            const std::string dim = _mapper.tensor_component_y();
-            coord_y += " + (";
-            coord_y += z + ")";
-            coord_y += " * ";
-            coord_y += dim;
-        }
-        if (b != "0" && (_mapper.is_one_component_batch() != true))
-        {
-            const std::string dim0 = _mapper.tensor_component_y();
-            const std::string dim1 = _mapper.tensor_component_z();
-            coord_y += " + (";
-            coord_y += b + ")";
-            coord_y += " * ";
-            coord_y += dim0;
-            coord_y += " * ";
-            coord_y += dim1;
-        }
-        coord_y += ")";
-        return "(int2)(" + coord_x + ", " + coord_y + ")";
-    }
-};
-
-/** IGpuLoadStoreHelperWriter factory class */
-class ClLoadStoreHelperWriterFactory final
-{
-public:
-    /** Static method to call the IGpuLoadStoreHelperWriter class accordingly with the tensor storage set in the mapper
-     *
-     *
-     * @return IGpuLoadStoreHelperWriter
-     */
-    static std::unique_ptr<IGpuLoadStoreHelperWriter>
-    create(IGpuKernelWriter *x, const GpuTensor3dMapper &mapper, GpuLoadStoreType type)
-    {
-        const auto tensor_storage = mapper.gpu_sampler().storage;
-        switch (tensor_storage)
-        {
-            case GpuSamplerTensorStorage::BufferUint8Ptr:
-                return std::make_unique<ClLoadStoreBufferHelperWriter>(x, mapper, type);
-            case GpuSamplerTensorStorage::Image2dReadOnly:
-            case GpuSamplerTensorStorage::Image2dWriteOnly:
-                return std::make_unique<ClLoadStoreImage2dHelperWriter>(x, mapper, type);
-            default:
-                std::cout << "Unsupported Gpu tensor storage" << std::endl;
-                assert(false);
-                return nullptr;
-        }
-    }
-};
-
-// This utility method needs to go in utils.h
-inline bool is_tile_scalar(const IVectorTile *x)
-{
-    return x->format().w == 1 && x->format().h == 1;
-}
-
-class ClKernelWriter : public IGpuKernelWriter
-{
-public:
-    ClKernelWriter(GpuKernelWriterAttribute *attr, GpuKernelWriterDataHolder *x)
-    {
-        _data = x;
-        _attr = attr;
-    }
-
-    ClKernelWriter(const ClKernelWriter &) = default;
-
-    ClKernelWriter &operator=(const ClKernelWriter &) = default;
-
-    // A IdSpaced ID is a term used to describe a fragment that is registered in ICode to ensure
-    // there are no conflicts or ambiguity in the code
-    void set_IdSpace(int32_t id) override
-    {
-        _data->tiles.set_IdSpace(id);
-        _data->arguments.set_IdSpace(id);
-    }
-
-    void import_tile(const std::string &dst_name, const IVectorTile *src) override
-    {
-        _data->tiles.insert(dst_name, src);
-    }
-
-    void declare_argument(const std::string &name, const TensorInfo &tensor) override
-    {
-        assert(_data->arguments[name] == nullptr);
-        _data->arguments.insert(name, tensor, _attr->return_tensor_component_by_value);
-    }
-
-    void declare_tile(const std::string &name, const TileInfo &format) override
-    {
-        assert(_data->tiles[name] == nullptr);
-        _data->tiles.insert(name, format);
-
-        IVectorTile *x = _data->tiles[name];
-
-        for (auto &t : x->underlying_source_variables())
-        {
-            _data->code += t.type.str + " " + t.str + ";\n";
-        }
-    }
-
-    void
-    declare_const_tile(const std::string &name, const std::vector<std::vector<std::string>> &in, DataType dt) override
-    {
-        assert(_data->tiles[name] == nullptr);
-        _data->tiles.insert(name, in, dt);
-        // Note: A constant does not need to be declared in the code
-    }
-
-    void write_text(const std::string &x) override
-    {
-        _data->code += x;
-    }
-
-    void compound_statement_begin() override
-    {
-        _data->tiles.increment_registry_level();
-        _data->code += "{\n";
-    }
-
-    void compound_statement_end() override
-    {
-        _data->tiles.decrement_registry_level();
-        _data->code += "}\n";
-    }
-
-    void op_get_global_id(const Operand &dst_var, int32_t dim) override
-    {
-        assert(dst_var.type() == OperandType::Tile);
-        assert(_data->tiles.has_tile(dst_var.value()));
-        assert(_data->tiles[dst_var.value()]->format().w == 1 &&
-               _data->tiles[dst_var.value()]->format().h == 1); // It must be a scalar variable
-
-        auto var = _data->tiles[dst_var.value()];
-
-        _data->code += var->scalar(0, 0).str;
-        _data->code += " = get_global_id(";
-        _data->code += std::to_string(dim);
-        _data->code += ");\n";
-    };
-
-    void op_get_global_coord(const Operand       &o_dst,
-                             const Operand       &o_step,
-                             const TensorOperand &o_tensor,
-                             int32_t              dim) override
-    {
-        OperandUnpacker operands(_data->tiles, _data->arguments);
-        auto            dst  = operands.unpack(o_dst);
-        auto            step = operands.unpack(o_step);
-
-        // Validation: Check that x, y and z are scalar
-
-        TensorOperandUnpacker tensor_operands(_data->arguments);
-        auto                  tensor      = tensor_operands.unpack(o_tensor);
-        auto                  gpu_sampler = o_tensor.sampler();
-
-        GpuTensor3dMapper mapper(tensor, gpu_sampler);
-
-        switch (dim)
-        {
-            case 0:
-                if (mapper.is_one_component_x())
-                {
-                    _data->code += dst->scalar(0, 0).str;
-                    _data->code += " = 0;\n";
-                }
-                else
-                {
-                    if (mapper.gpu_sampler().address_mode_x == TensorSamplerAddressModeX::OverlappingMin)
-                    {
-                        // Validation: Check: fixed tensor shape
-                        // TO BE CHANGED
-                        _data->code += dst->scalar(0, 0).str;
-                        _data->code += " = get_global_id(0) * ";
-                        _data->code += step->scalar(0, 0).str;
-                        _data->code += ";\n";
-                    }
-                    else
-                    {
-                        _data->code += dst->scalar(0, 0).str;
-                        _data->code += " = get_global_id(0) * ";
-                        _data->code += step->scalar(0, 0).str;
-                        _data->code += ";\n";
-                    }
-                }
-                break;
-            case 1:
-                if (mapper.is_one_component_y())
-                {
-                    _data->code += dst->scalar(0, 0).str;
-                    _data->code += " = 0;\n";
-                }
-                else
-                {
-                    if (mapper.gpu_sampler().address_mode_y == TensorSamplerAddressModeY::OverlappingMin)
-                    {
-                    }
-                    else
-                    {
-                        _data->code += dst->scalar(0, 0).str;
-                        _data->code += " = get_global_id(1) * ";
-                        _data->code += step->scalar(0, 0).str;
-                        _data->code += ";\n";
-                    }
-                }
-                break;
-            case 2:
-                if (mapper.is_one_component_z())
-                {
-                    _data->code += dst->scalar(0, 0).str;
-                    _data->code += " = 0;\n";
-                }
-                else
-                {
-                    _data->code += dst->scalar(0, 0).str;
-                    _data->code += " = get_global_id(2) * ";
-                    _data->code += step->scalar(0, 0).str;
-                    _data->code += ";\n";
-                }
-                break;
-            default:
-                break;
-        }
-    };
-
-    void op_get_global_batch(const Operand &o_dst, const TensorOperand &o_tensor) override
-    {
-        OperandUnpacker    operands(_data->tiles, _data->arguments);
-        const IVectorTile *dst = operands.unpack(o_dst);
-
-        TensorOperandUnpacker tensor_operands(_data->arguments);
-        IGpuTensorArgument   *tensor      = tensor_operands.unpack(o_tensor);
-        auto                  gpu_sampler = o_tensor.sampler();
-
-        GpuTensor3dMapper mapper(tensor, gpu_sampler);
-
-        if (mapper.is_one_component_batch())
-        {
-            _data->code += dst->scalar(0, 0).str;
-            _data->code += " = 0;\n";
-        }
-        else
-        {
-            std::cout << "Unsupported batched computation" << std::endl;
-            assert(false);
-        }
-    };
-
-    void op_get_global_size(const Operand &dst_var, int32_t dim) override
-    {
-        assert(dst_var.type() == OperandType::Tile);
-        assert(_data->tiles.has_tile(dst_var.value()));
-        assert(_data->tiles[dst_var.value()]->format().w == 1 &&
-               _data->tiles[dst_var.value()]->format().h == 1); // It must be a scalar variable
-
-        auto var = _data->tiles[dst_var.value()];
-
-        _data->code += var->scalar(0, 0).str;
-        _data->code += " = get_global_size(";
-        _data->code += std::to_string(dim);
-        _data->code += ");\n";
-    }
-
-    void op_unary_expression(const Operand &dst_name, UnaryOp op, const Operand &src_name) override
-    {
-        OperandUnpacker    operands(_data->tiles, _data->arguments);
-        const IVectorTile *src = operands.unpack(src_name);
-        const IVectorTile *dst = operands.unpack(dst_name);
-
-        const int32_t     dst_w = dst->format().w;
-        const int32_t     dst_h = dst->format().h;
-        const int32_t     src_w = src->format().w;
-        const std::string dt    = dst->underlying_source_variables()[0].type.str;
-
-        const bool broadcast_src_x = dst_w != 1 && src_w == 1;
-
-        const std::string src_prefix = broadcast_src_x ? "(" + dt + ")" : "";
-
-        // Broadcasting on Y is automatic
-        for (int32_t y = 0; y < dst_h; ++y)
-        {
-            _data->code += dst->vector(y).str;
-            _data->code += " = ";
-            _data->code += to_string(op);
-            _data->code += src_prefix + src->vector(y).str;
-            _data->code += ";\n";
-        }
-    }
-
-    void op_binary_expression(const Operand &dst_name,
-                              const Operand &lhs_name,
-                              BinaryOp       op,
-                              const Operand &rhs_name) override
-    {
-        OperandUnpacker    operands(_data->tiles, _data->arguments);
-        const IVectorTile *lhs = operands.unpack(lhs_name);
-        const IVectorTile *rhs = operands.unpack(rhs_name);
-        const IVectorTile *dst = operands.unpack(dst_name);
-
-        const int32_t dst_w = dst->format().w;
-        const int32_t dst_h = dst->format().h;
-        assert(lhs != nullptr);
-        const int32_t lhs_w = lhs->format().w;
-        const int32_t rhs_w = rhs->format().w;
-
-        if (op == BinaryOp::MatMul_Nt_T)
-        {
-            assert((dst->format().dt == DataType::Fp32) || (dst->format().dt == DataType::Fp16));
-            for (int32_t y = 0; y < dst_h; ++y)
-            {
-                for (int32_t x = 0; x < dst_w; ++x)
-                {
-                    for (int32_t k = 0; k < lhs_w; ++k)
-                    {
-                        _data->code += dst->scalar(x, y).str;
-                        _data->code += " = fma(";
-                        _data->code += lhs->scalar(k, y).str;
-                        _data->code += ", ";
-                        _data->code += rhs->scalar(k, x).str;
-                        _data->code += ", ";
-                        _data->code += dst->scalar(x, y).str;
-                        _data->code += ");\n";
-                    }
-                }
-            }
-
-            return;
-        }
-
-        const bool broadcast_lhs_x = dst_w != 1 && lhs_w == 1;
-        const bool broadcast_rhs_x = dst_w != 1 && rhs_w == 1;
-
-        const std::string lhs_prefix =
-            broadcast_lhs_x ? "(" + dst->underlying_source_variables()[0].type.str + ")" : "";
-        const std::string rhs_prefix =
-            broadcast_rhs_x ? "(" + dst->underlying_source_variables()[0].type.str + ")" : "";
-        const std::string op_str = to_string(op);
-
-        // Broadcasting on Y is automatic
-        for (int32_t y = 0; y < dst_h; ++y)
-        {
-            _data->code += dst->vector(y).str;
-            _data->code += " = ";
-            _data->code += lhs_prefix + lhs->vector(y).str;
-            _data->code += " ";
-            _data->code += op_str;
-            _data->code += " ";
-            _data->code += rhs_prefix + rhs->vector(y).str;
-            _data->code += ";\n";
-        }
-    };
-
-    void op_cast_expression(const Operand &o_dst, const Operand &o_src, ConvertPolicy policy) override
-    {
-        OperandUnpacker    operands(_data->tiles, _data->arguments);
-        const IVectorTile *src = operands.unpack(o_src);
-        const IVectorTile *dst = operands.unpack(o_dst);
-        // const int32_t dst_w  = dst->format().w;
-        const int32_t     dst_h    = dst->format().h;
-        const std::string dt       = dst->underlying_source_variables()[0].type.str;
-        const bool        is_float = (dst->format().dt == DataType::Fp32) || (dst->format().dt == DataType::Fp16);
-        const std::string sat      = ((policy == ConvertPolicy::Saturate && !is_float) ? "_sat" : "");
-
-        // Broadcasting on Y is automatic
-        for (int32_t y = 0; y < dst_h; ++y)
-        {
-            _data->code += dst->vector(y).str;
-            _data->code += " = convert_" + dt + sat + "(";
-            _data->code += src->vector(y).str;
-            _data->code += ");\n";
-        }
-    };
-
-    void op_assign(const Operand &dst_name, const Operand &src_name) override
-    {
-        OperandUnpacker    operands(_data->tiles, _data->arguments);
-        const IVectorTile *src = operands.unpack(src_name);
-        const IVectorTile *dst = operands.unpack(dst_name);
-
-        const int32_t     dst_w = dst->format().w;
-        const int32_t     dst_h = dst->format().h;
-        const int32_t     src_w = src->format().w;
-        const std::string dt    = dst->underlying_source_variables()[0].type.str;
-
-        const bool broadcast_src_x = dst_w != 1 && src_w == 1;
-
-        const std::string src_prefix = broadcast_src_x ? "(" + dt + ")" : "";
-
-        // Broadcasting on Y is automatic
-        for (int32_t y = 0; y < dst_h; ++y)
-        {
-            _data->code += dst->vector(y).str;
-            _data->code += " = ";
-            _data->code += src_prefix + src->vector(y).str;
-            _data->code += ";\n";
-        }
-    }
-
-    void op_unary_elementwise_function(const Operand &dst_name, UnaryFunction func, const Operand &src_name) override
-    {
-        OperandUnpacker    operands(_data->tiles, _data->arguments);
-        const IVectorTile *src = operands.unpack(src_name);
-        const IVectorTile *dst = operands.unpack(dst_name);
-
-        const int32_t     dst_h = dst->format().h;
-        const std::string dt    = dst->underlying_source_variables()[0].type.str;
-
-        // Always perform an explicit cast. This automatically covers at least the 2 scenarios:
-        // 1. Widen a scalar into a vector type. This enables scalar-vector broadcasting
-        // 2. Ensure non-ambiguity over function overloads.
-        //    E.g. a constant tile may be accidentally initialized with a double literal. By casting it to single float,
-        //    it avoids ambiguous function calls
-        const std::string src_prefix = "(" + dt + ")";
-
-        // Broadcasting on Y is automatic
-        for (int32_t y = 0; y < dst_h; ++y)
-        {
-            _data->code += dst->vector(y).str;
-            _data->code += " = ";
-
-            switch (func)
-            {
-                case UnaryFunction::Exp:
-                    _data->code += "exp(";
-                    break;
-                case UnaryFunction::Tanh:
-                    _data->code += "tanh(";
-                    break;
-                case UnaryFunction::Sqrt:
-                    _data->code += "sqrt(";
-                    break;
-                case UnaryFunction::Erf:
-                    _data->code += "erf(";
-                    break;
-                case UnaryFunction::Fabs:
-                    _data->code += "fabs(";
-                    break;
-                case UnaryFunction::Log:
-                    _data->code += "log(";
-                    break;
-                case UnaryFunction::SizeOf:
-                    _data->code += "sizeof(";
-                    break;
-                case UnaryFunction::Round:
-                    _data->code += "round(";
-                    break;
-                case UnaryFunction::Floor:
-                    _data->code += "floor(";
-                    break;
-                default:
-                    CKW_ASSERT_MSG(false, "Unexpected UnaryFunction used.");
-            }
-
-            _data->code += src_prefix + src->vector(y).str;
-            _data->code += ");\n";
-        }
-    }
-
-    void op_binary_elementwise_function(const Operand &dst_name,
-                                        BinaryFunction func,
-                                        const Operand &first_name,
-                                        const Operand &second_name) override
-    {
-        OperandUnpacker    operands(_data->tiles, _data->arguments);
-        const IVectorTile *first  = operands.unpack(first_name);
-        const IVectorTile *second = operands.unpack(second_name);
-        const IVectorTile *dst    = operands.unpack(dst_name);
-
-        const int32_t     dst_h        = dst->format().h;
-        const auto        datatype     = dst->underlying_source_variables()[0].type;
-        const std::string datatype_str = datatype.str;
-
-        // Always perform an explicit cast. See similar comments in op_unary_elementwise_function
-        const std::string first_prefix  = "(" + datatype_str + ")";
-        const std::string second_prefix = "(" + datatype_str + ")";
-
-        const bool is_float = (datatype.dt == DataType::Fp32 || datatype.dt == DataType::Fp16);
-
-        // Broadcasting on Y is automatic
-        for (int32_t y = 0; y < dst_h; ++y)
-        {
-            _data->code += dst->vector(y).str;
-            _data->code += " = ";
-
-            switch (func)
-            {
-                case BinaryFunction::Min:
-                    _data->code += is_float ? "fmin(" : "min(";
-                    break;
-                case BinaryFunction::Max:
-                    _data->code += is_float ? "fmax(" : "max(";
-                    break;
-                default:
-                    CKW_ASSERT_MSG(false, "Unexpected BinaryFunction used.");
-            }
-
-            _data->code += first_prefix + first->vector(y).str;
-            _data->code += ", ";
-            _data->code += second_prefix + second->vector(y).str;
-            _data->code += ");\n";
-        }
-    }
-
-    void op_ternary_elementwise_function(const Operand  &dst_name,
-                                         TernaryFunction func,
-                                         const Operand  &first_name,
-                                         const Operand  &second_name,
-                                         const Operand  &third_name) override
-    {
-        OperandUnpacker    operands(_data->tiles, _data->arguments);
-        const IVectorTile *first  = operands.unpack(first_name);
-        const IVectorTile *second = operands.unpack(second_name);
-        const IVectorTile *third  = operands.unpack(third_name);
-        const IVectorTile *dst    = operands.unpack(dst_name);
-
-        const int32_t     dst_h = dst->format().h;
-        const std::string dt    = dst->underlying_source_variables()[0].type.str;
-
-        // Always perform an explicit cast. See similar comments in op_unary_elementwise_function
-        const std::string first_prefix  = "(" + dt + ")";
-        const std::string second_prefix = "(" + dt + ")";
-        const std::string third_prefix  = "(" + dt + ")";
-
-        // Broadcasting on Y is automatic
-        for (int32_t y = 0; y < dst_h; ++y)
-        {
-            _data->code += dst->vector(y).str;
-            _data->code += " = ";
-
-            switch (func)
-            {
-                case TernaryFunction::Select:
-                    _data->code += "select(";
-                    break;
-                case TernaryFunction::Clamp:
-                    _data->code += "clamp(";
-                    break;
-                default:
-                    CKW_ASSERT_MSG(false, "Unexpected TernaryFunction used.");
-            }
-
-            _data->code += first_prefix + first->vector(y).str;
-            _data->code += ", ";
-            _data->code += second_prefix + second->vector(y).str;
-            _data->code += ", ";
-            _data->code += third_prefix + third->vector(y).str;
-            _data->code += ");\n";
-        }
-    }
-
-    void op_if_header(const Operand &o_lhs, BinaryOp op, const Operand &o_rhs) override
-    {
-        OperandUnpacker    operands(_data->tiles, _data->arguments);
-        const IVectorTile *lhs = operands.unpack(o_lhs);
-        const IVectorTile *rhs = operands.unpack(o_rhs);
-
-        assert(is_tile_scalar(lhs));
-        assert(is_tile_scalar(rhs));
-
-        _data->code += "if(";
-        _data->code += lhs->scalar(0, 0).str;
-        _data->code += " ";
-        _data->code += to_string(op);
-        _data->code += " ";
-        _data->code += rhs->scalar(0, 0).str;
-        _data->code += ")\n";
-    }
-
-    void op_else_if_header(const Operand &o_lhs, BinaryOp op, const Operand &o_rhs) override
-    {
-        _data->code += "else ";
-        op_if_header(o_lhs, op, o_rhs);
-    }
-
-    void op_else_header() override
-    {
-        _data->code += "else\n";
-    }
-
-    void op_for_loop_header(const Operand &var_name,
-                            BinaryOp       cond_op,
-                            const Operand &cond_value_name,
-                            const Operand &update_var_name,
-                            AssignmentOp   update_op,
-                            const Operand &update_value_name) override
-    {
-        OperandUnpacker    operands(_data->tiles, _data->arguments);
-        const IVectorTile *var          = operands.unpack(var_name);
-        const IVectorTile *cond_value   = operands.unpack(cond_value_name);
-        const IVectorTile *update_var   = operands.unpack(update_var_name);
-        const IVectorTile *update_value = operands.unpack(update_value_name);
-
-        const int32_t dst_w = var->format().w;
-        const int32_t dst_h = var->format().h;
-
-        // It must be a scalar variable
-        CKW_UNUSED(dst_w, dst_h);
-        assert(dst_w == 1);
-        assert(dst_h == 1);
-
-        _data->code += "for(; ";
-        _data->code += var->scalar(0, 0).str;
-        _data->code += " ";
-        _data->code += to_string(cond_op);
-        _data->code += " " + cond_value->scalar(0, 0).str + "; ";
-        _data->code += update_var->scalar(0, 0).str;
-        _data->code += " ";
-        _data->code += to_string(update_op);
-        _data->code += " " + update_value->scalar(0, 0).str + ")";
-        _data->code += "\n";
-    }
-
-    void op_load_immediate(const TensorOperand &o_tensor,
-                           const Operand       &o_dst,
-                           const Operand       &o_x,
-                           const Operand       &o_y,
-                           const Operand       &o_z,
-                           const Operand       &o_batch_idx,
-                           const Operand       &dilation_y) override
-    {
-        OperandUnpacker operands(_data->tiles, _data->arguments);
-
-        // Not const as it requires changes to 'load_writer'.
-        IVectorTile *dst   = operands.unpack(o_dst);
-        IVectorTile *x     = operands.unpack(o_x);
-        IVectorTile *y     = operands.unpack(o_y);
-        IVectorTile *z     = operands.unpack(o_z);
-        IVectorTile *dil_y = operands.unpack(dilation_y);
-        IVectorTile *b     = operands.unpack(o_batch_idx);
-
-        TensorOperandUnpacker tensor_operands(_data->arguments);
-        IGpuTensorArgument   *tensor      = tensor_operands.unpack(o_tensor);
-        auto                  gpu_sampler = o_tensor.sampler();
-
-        GpuTensor3dMapper mapper(tensor, gpu_sampler);
-
-        auto load_writer = ClLoadStoreHelperWriterFactory::create(this, mapper, GpuLoadStoreType::Load);
-
-        // Initialize the constant part
-        load_writer->initialize(dst, x, z, b);
-
-        for (int i = 0; i < dst->format().h; ++i)
-        {
-            std::string coord_y = y->scalar(0, 0).str + " + " + std::to_string(i);
-            if (dil_y->scalar(0, 0).str != "1")
-            {
-                coord_y += " * " + dil_y->scalar(0, 0).str;
-            }
-            load_writer->write(std::make_pair(i, coord_y));
-        }
-
-        load_writer->finalize();
-    }
-
-    void op_load_indirect(const TensorOperand &o_tensor,
-                          const Operand       &o_dst,
-                          const Operand       &o_x,
-                          const Operand       &o_indirect_h,
-                          const Operand       &o_z,
-                          const Operand       &o_batch_idx) override
-    {
-        OperandUnpacker operands(_data->tiles, _data->arguments);
-
-        // Not const as it requires changes to 'load_writer'.
-        IVectorTile *dst   = operands.unpack(o_dst);
-        IVectorTile *x     = operands.unpack(o_x);
-        IVectorTile *y_ind = operands.unpack(o_indirect_h);
-        IVectorTile *z     = operands.unpack(o_z);
-        IVectorTile *b     = operands.unpack(o_batch_idx);
-
-        TensorOperandUnpacker tensor_operands(_data->arguments);
-        IGpuTensorArgument   *tensor      = tensor_operands.unpack(o_tensor);
-        auto                  gpu_sampler = o_tensor.sampler();
-
-        GpuTensor3dMapper mapper(tensor, gpu_sampler);
-
-        auto load_writer = ClLoadStoreHelperWriterFactory::create(this, mapper, GpuLoadStoreType::Load);
-
-        // Initialize the constant part
-        load_writer->initialize(dst, x, z, b);
-
-        for (int i = 0; i < dst->format().h; ++i)
-        {
-            load_writer->write(std::make_pair(i, y_ind->scalar(0, i).str));
-        }
-
-        load_writer->finalize();
-    }
-
-    void op_store_immediate(const TensorOperand &tensor_name,
-                            const Operand       &src_name,
-                            const Operand       &x_name,
-                            const Operand       &y_name,
-                            const Operand       &z_name,
-                            const Operand       &batch_index_name) override
-    {
-        OperandUnpacker operands(_data->tiles, _data->arguments);
-
-        // Not const as it requires changes to 'load_writer'.
-        IVectorTile *src = operands.unpack(src_name);
-        IVectorTile *x   = operands.unpack(x_name);
-        IVectorTile *y   = operands.unpack(y_name);
-        IVectorTile *z   = operands.unpack(z_name);
-        IVectorTile *b   = operands.unpack(batch_index_name);
-
-        TensorOperandUnpacker tensor_operands(_data->arguments);
-        IGpuTensorArgument   *tensor      = tensor_operands.unpack(tensor_name);
-        auto                  gpu_sampler = tensor_name.sampler();
-
-        GpuTensor3dMapper mapper(tensor, gpu_sampler);
-
-        auto store_writer = ClLoadStoreHelperWriterFactory::create(this, mapper, GpuLoadStoreType::Store);
-
-        // Initialize the constant part
-        store_writer->initialize(src, x, z, b);
-
-        int32_t tile_h = src->format().h;
-
-        for (int m0 = tile_h - 1; m0 >= 0; m0--)
-        {
-            store_writer->write(std::make_pair(m0, y->scalar(0, 0).str + " + " + std::to_string(m0)));
-        }
-
-        store_writer->finalize();
-    }
-
-    void op_return() override
-    {
-        _data->code += "return;\n";
-    }
-
-    void util_get_indirect_buffer(const Operand       &o_dst,
-                                  const TensorOperand &o_tensor,
-                                  const Operand       &o_x,
-                                  const Operand       &o_y,
-                                  const Operand       &o_x_off,
-                                  const Operand       &o_y_off) override
-    {
-        OperandUnpacker    operands(_data->tiles, _data->arguments);
-        const IVectorTile *dst   = operands.unpack(o_dst);
-        const IVectorTile *x     = operands.unpack(o_x);
-        const IVectorTile *y     = operands.unpack(o_y);
-        const IVectorTile *x_off = operands.unpack(o_x_off);
-        const IVectorTile *y_off = operands.unpack(o_y_off);
-
-        TensorOperandUnpacker tensor_operands(_data->arguments);
-        IGpuTensorArgument   *tensor = tensor_operands.unpack(o_tensor);
-
-        assert(dst->format().w == 1);
-        assert(x->format().w == 1);
-        assert(y->format().w == 1);
-        assert(x_off->format().w == 1);
-        assert(y_off->format().w == 1);
-        assert(dst->format().dt == DataType::Int32);
-        assert(x->format().dt == DataType::Int32);
-        assert(y->format().dt == DataType::Int32);
-        assert(x_off->format().dt == DataType::Int32);
-        assert(y_off->format().dt == DataType::Int32);
-
-        const std::string width  = tensor->component(TensorComponentType::Dim1);
-        const std::string height = tensor->component(TensorComponentType::Dim2);
-        const std::string wxh    = tensor->component(TensorComponentType::Dim1xDim2);
-        /*
-        int x_s;
-        int y_s;
-        x_s = (xi_0 + x_k);
-        y_s = (yi_0 + y_k);
-        mi_0 = x_s + y_s * width + b * widthxheight;
-        mi_0 = select(-1, mi_0, x_s >= 0);
-        mi_0 = select(-1, mi_0, y_s >= 0);
-        mi_0 = select(-1, mi_0, x_s < 128);
-        mi_0 = select(-1, mi_0, y_s < 128);
-        */
-        compound_statement_begin();
-        declare_tile("_x_s", TileInfo(DataType::Int32));
-        declare_tile("_y_s", TileInfo(DataType::Int32));
-        auto x_s = operands.unpack(Operand("_x_s"));
-        auto y_s = operands.unpack(Operand("_y_s"));
-        for (int i = 0; i < dst->format().h; ++i)
-        {
-            // x_s = (xi_0 + x_k);
-            // y_s = (yi_0 + y_k);
-            _data->code += x_s->scalar(0, i).str;
-            _data->code += " = (";
-            _data->code += x->scalar(0, i).str;
-            _data->code += " + ";
-            _data->code += x_off->scalar(0, i).str;
-            _data->code += ");\n";
-            _data->code += y_s->scalar(0, i).str;
-            _data->code += " = (";
-            _data->code += y->scalar(0, i).str;
-            _data->code += " + ";
-            _data->code += y_off->scalar(0, i).str;
-            _data->code += ");\n";
-            // mi_0 = x_s + y_s * width;
-            _data->code += dst->scalar(0, i).str;
-            _data->code += " = ";
-            _data->code += x_s->scalar(0, i).str;
-            _data->code += " + ";
-            _data->code += y_s->scalar(0, i).str;
-            _data->code += " * " + width + ";\n";
-            // mi_0 = select(wxh, mi_0, x_s >= 0);
-            _data->code += dst->scalar(0, i).str;
-            _data->code += " = select(-1, ";
-            _data->code += dst->scalar(0, i).str;
-            _data->code += ", ";
-            _data->code += x_s->scalar(0, i).str;
-            _data->code += " >= 0);\n";
-            // mi_0 = select(wxh, mi_0, x_s < width);
-            _data->code += dst->scalar(0, i).str;
-            _data->code += " = select(-1, ";
-            _data->code += dst->scalar(0, i).str;
-            _data->code += ", ";
-            _data->code += x_s->scalar(0, i).str;
-            _data->code += " < ";
-            _data->code += width + ");\n";
-            // mi_0 = select(wxh, mi_0, y_s >= 0);
-            _data->code += dst->scalar(0, i).str;
-            _data->code += " = select(-1, ";
-            _data->code += dst->scalar(0, i).str;
-            _data->code += ", ";
-            _data->code += y_s->scalar(0, i).str;
-            _data->code += " >= 0);\n";
-            // mi_0 = select(wxh, mi_0, y_s < height);
-            _data->code += dst->scalar(0, i).str;
-            _data->code += " = select(-1, ";
-            _data->code += dst->scalar(0, i).str;
-            _data->code += ", ";
-            _data->code += y_s->scalar(0, i).str;
-            _data->code += " < ";
-            _data->code += height + ");\n";
-        }
-        compound_statement_end();
-    }
-
-private:
-    GpuKernelWriterDataHolder *_data{nullptr};
-    GpuKernelWriterAttribute  *_attr{nullptr};
-};
-
-/** IGpuKernelWriter factory class */
-class GpuKernelWriterFactory final
-{
-public:
-    /** Static method to call the IGpuKernelWriter class accordingly with the Gpu programming language
-     *
-     * @param[in] gpu GPU target
-     *
-     * @return IGpuKernelWriter
-     */
-    static std::unique_ptr<IGpuKernelWriter> create(GpuKernelWriterAttribute *attr, GpuKernelWriterDataHolder *x)
-    {
-        switch (x->programming_language())
-        {
-            case GpuTargetLanguage::OpenCL:
-                return std::make_unique<ClKernelWriter>(attr, x);
-            default:
-                std::cout << "Unsupported Gpu programming language" << std::endl;
-                assert(false);
-                return nullptr;
-        }
-    }
-};
-
-inline int32_t
-adjust_step(TensorSamplerFormat tensor_format, int32_t step, const TensorInfo *tensor_info_id, int32_t idx)
-{
-    auto tensor = tensor_info_id->shape;
-
-    int32_t dim[3] = {0};
-
-    switch (tensor_format)
-    {
-        case TensorSamplerFormat::C_W_H:
-            dim[0] = tensor[0];
-            dim[1] = tensor[1];
-            dim[2] = tensor[2];
-            break;
-        case TensorSamplerFormat::C_WH_1:
-            dim[0] = tensor[0];
-            dim[1] = tensor[1] * tensor[2];
-            dim[2] = 1;
-            break;
-        default:
-            std::cout << "Unsupported tensor format" << std::endl;
-            assert(false);
-            break;
-    }
-
-    return std::min(step, dim[idx]);
-}
-
-} // namespace prototype
-} // namespace ckw
-
-#endif // CKW_PROTOTYPE_SRC_PROTOTYPE_H
diff --git a/compute_kernel_writer/prototype/src/TensorInfo.cpp b/compute_kernel_writer/prototype/src/TensorInfo.cpp
deleted file mode 100644
index 561c126469..0000000000
--- a/compute_kernel_writer/prototype/src/TensorInfo.cpp
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "ckw/TensorInfo.h"
-
-namespace ckw
-{
-TensorInfo::TensorInfo(DataType dt, const TensorShape &shape, TensorDataLayout dl, int32_t id)
-    : _shape(shape), _dt(dt), _dl(dl), _id(id)
-{
-}
-
-TensorInfo &TensorInfo::shape(const TensorShape &shape)
-{
-    _shape = shape;
-    return *this;
-}
-
-TensorShape TensorInfo::shape() const
-{
-    return _shape;
-}
-
-TensorInfo &TensorInfo::data_type(DataType dt)
-{
-    _dt = dt;
-    return *this;
-}
-
-DataType TensorInfo::data_type() const
-{
-    return _dt;
-}
-
-TensorInfo &TensorInfo::data_layout(TensorDataLayout dl)
-{
-    _dl = dl;
-    return *this;
-}
-
-TensorDataLayout TensorInfo::data_layout() const
-{
-    return _dl;
-}
-
-TensorInfo &TensorInfo::id(int32_t id)
-{
-    _id = id;
-    return *this;
-}
-
-int32_t TensorInfo::id() const
-{
-    return _id;
-}
-} // namespace ckw
diff --git a/compute_kernel_writer/prototype/src/TensorOperand.cpp b/compute_kernel_writer/prototype/src/TensorOperand.cpp
deleted file mode 100644
index d1aefbbb71..0000000000
--- a/compute_kernel_writer/prototype/src/TensorOperand.cpp
+++ /dev/null
@@ -1,272 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "ckw/TensorOperand.h"
-
-#include "ckw/Error.h"
-#include "ckw/Kernel.h"
-#include "ckw/TensorInfo.h"
-#include "ckw/TileOperand.h"
-
-#include "src/Prototype.h"
-
-namespace ckw
-{
-
-namespace
-{
-
-TensorComponentOperand &get_or_create_component(TensorOperand                           &tensor,
-                                                std::unique_ptr<TensorComponentOperand> &ptr,
-                                                TensorComponentType                      component)
-{
-    if (ptr == nullptr)
-    {
-        ptr = std::make_unique<TensorComponentOperand>(tensor, component);
-    }
-
-    return *ptr;
-}
-
-} // namespace
-
-// =================================================================================================
-// TensorOperand
-// =================================================================================================
-
-TensorOperand::TensorOperand(const std::string &name, const TensorInfo &info, TensorStorageType storage_type)
-    : OperandBase(name), _info(info), _storage_type(storage_type)
-{
-}
-
-prototype::Operand TensorOperand::create_impl_operand(prototype::IGpuKernelWriter *writer) const
-{
-    CKW_UNUSED(writer);
-    return {name()};
-}
-
-const TensorInfo &TensorOperand::info() const
-{
-    return _info;
-}
-
-TensorInfo &TensorOperand::info()
-{
-    return _info;
-}
-
-TensorStorageType TensorOperand::storage_type() const
-{
-    return _storage_type;
-}
-
-DataType TensorOperand::data_type() const
-{
-    return _info.data_type();
-}
-
-bool TensorOperand::is_constant() const
-{
-    return false;
-}
-
-const TileOperand &TensorOperand::tile() const
-{
-    return *_tile;
-}
-
-TileOperand &TensorOperand::tile()
-{
-    return *_tile;
-}
-
-TensorOperand &TensorOperand::tile(TileOperand &tile)
-{
-    _tile = &tile;
-    return *this;
-}
-
-const TensorTileSampler &TensorOperand::tile_sampler() const
-{
-    return _tile_sampler;
-}
-
-TensorTileSampler &TensorOperand::tile_sampler()
-{
-    return _tile_sampler;
-}
-
-TensorOperand &TensorOperand::tile_sampler(const TensorTileSampler &value)
-{
-    _tile_sampler = value;
-    return *this;
-}
-
-TensorComponentOperand &TensorOperand::stride1()
-{
-    return get_or_create_component(*this, _stride1, TensorComponentType::Stride1);
-}
-
-TensorComponentOperand &TensorOperand::stride2()
-{
-    return get_or_create_component(*this, _stride2, TensorComponentType::Stride2);
-}
-
-TensorComponentOperand &TensorOperand::stride3()
-{
-    return get_or_create_component(*this, _stride3, TensorComponentType::Stride3);
-}
-
-TensorComponentOperand &TensorOperand::stride4()
-{
-    return get_or_create_component(*this, _stride4, TensorComponentType::Stride4);
-}
-
-TensorComponentOperand &TensorOperand::dim0()
-{
-    return get_or_create_component(*this, _dim0, TensorComponentType::Dim0);
-}
-
-TensorComponentOperand &TensorOperand::dim1()
-{
-    return get_or_create_component(*this, _dim1, TensorComponentType::Dim1);
-}
-
-TensorComponentOperand &TensorOperand::dim2()
-{
-    return get_or_create_component(*this, _dim2, TensorComponentType::Dim2);
-}
-
-TensorComponentOperand &TensorOperand::dim3()
-{
-    return get_or_create_component(*this, _dim3, TensorComponentType::Dim3);
-}
-
-TensorComponentOperand &TensorOperand::dim4()
-{
-    return get_or_create_component(*this, _dim4, TensorComponentType::Dim4);
-}
-
-TensorComponentOperand &TensorOperand::dim1_dim2()
-{
-    return get_or_create_component(*this, _dim1_dim2, TensorComponentType::Dim1xDim2);
-}
-
-TensorComponentOperand &TensorOperand::dim1_dim2_dim3()
-{
-    return get_or_create_component(*this, _dim1_dim2_dim3, TensorComponentType::Dim1xDim2xDim3);
-}
-
-TensorComponentOperand &TensorOperand::offset_first_element_in_bytes()
-{
-    return get_or_create_component(*this, _offset_first_element_in_bytes, TensorComponentType::OffsetFirstElement);
-}
-
-// =================================================================================================
-// TensorComponentOperand
-// =================================================================================================
-
-TensorComponentOperand::TensorComponentOperand(TensorOperand &tensor, TensorComponentType component)
-    : TileOperand(tensor.name(), DataType::Int32), _tensor(tensor), _component(component)
-{
-}
-
-TensorOperand &TensorComponentOperand::tensor()
-{
-    return _tensor;
-}
-
-const TensorOperand &TensorComponentOperand::tensor() const
-{
-    return _tensor;
-}
-
-TensorComponentType TensorComponentOperand::component_type() const
-{
-    return _component;
-}
-
-prototype::Operand TensorComponentOperand::create_impl_operand(prototype::IGpuKernelWriter *writer) const
-{
-    CKW_UNUSED(writer);
-    prototype::OperandType type{prototype::OperandType::Unknown};
-
-    switch (_component)
-    {
-        case TensorComponentType::OffsetFirstElement:
-            type = prototype::OperandType::TensorDataOffset;
-            break;
-
-        case TensorComponentType::Stride1:
-            type = prototype::OperandType::TensorStride1;
-            break;
-
-        case TensorComponentType::Stride2:
-            type = prototype::OperandType::TensorStride2;
-            break;
-
-        case TensorComponentType::Stride3:
-            type = prototype::OperandType::TensorStride3;
-            break;
-
-        case TensorComponentType::Stride4:
-            type = prototype::OperandType::TensorStride4;
-            break;
-
-        case TensorComponentType::Dim0:
-            type = prototype::OperandType::TensorDim0;
-            break;
-
-        case TensorComponentType::Dim1:
-            type = prototype::OperandType::TensorDim1;
-            break;
-
-        case TensorComponentType::Dim2:
-            type = prototype::OperandType::TensorDim2;
-            break;
-
-        case TensorComponentType::Dim3:
-            type = prototype::OperandType::TensorDim3;
-            break;
-
-        case TensorComponentType::Dim4:
-            type = prototype::OperandType::TensorDim4;
-            break;
-
-        case TensorComponentType::Dim1xDim2:
-            type = prototype::OperandType::TensorDim1xDim2;
-            break;
-
-        case TensorComponentType::Dim1xDim2xDim3:
-            type = prototype::OperandType::TensorDim1xDim2xDim3;
-            break;
-
-        default:
-            CKW_ASSERT(false);
-    }
-
-    return prototype::Operand(name(), type);
-}
-
-} // namespace ckw
diff --git a/compute_kernel_writer/prototype/src/TensorTileSampler.cpp b/compute_kernel_writer/prototype/src/TensorTileSampler.cpp
deleted file mode 100644
index bf9f946ce8..0000000000
--- a/compute_kernel_writer/prototype/src/TensorTileSampler.cpp
+++ /dev/null
@@ -1,191 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "ckw/TensorTileSampler.h"
-
-#include "ckw/TileOperand.h"
-#include "ckw/types/TensorSamplerTypes.h"
-
-namespace ckw
-{
-
-TensorTileSampler::TensorTileSampler()
-{
-}
-
-TensorTileSampler::TensorTileSampler(TileOperand              &x,
-                                     TileOperand              &y,
-                                     TileOperand              &z,
-                                     TileOperand              &b,
-                                     TensorSamplerFormat       format,
-                                     TensorSamplerAddressModeX address_mode_x,
-                                     TensorSamplerAddressModeY address_mode_y,
-                                     TensorSamplerAddressModeZ address_mode_z)
-    : _x(&x),
-      _y(&y),
-      _z(&z),
-      _b(&b),
-      _height(0),
-      _width(0),
-      _format(format),
-      _address_mode_x(address_mode_x),
-      _address_mode_y(address_mode_y),
-      _address_mode_z(address_mode_z)
-{
-}
-
-TensorTileSampler::TensorTileSampler(TileOperand              &x,
-                                     TileOperand              &y,
-                                     TileOperand              &z,
-                                     TileOperand              &b,
-                                     int32_t                   height,
-                                     int32_t                   width,
-                                     TensorSamplerFormat       format,
-                                     TensorSamplerAddressModeX address_mode_x,
-                                     TensorSamplerAddressModeY address_mode_y,
-                                     TensorSamplerAddressModeZ address_mode_z)
-    : _x(&x),
-      _y(&y),
-      _z(&z),
-      _b(&b),
-      _height(height),
-      _width(width),
-      _format(format),
-      _address_mode_x(address_mode_x),
-      _address_mode_y(address_mode_y),
-      _address_mode_z(address_mode_z)
-{
-}
-
-const TileOperand &TensorTileSampler::x() const
-{
-    return *_x;
-}
-
-TensorTileSampler &TensorTileSampler::x(TileOperand &x)
-{
-    _x = &x;
-    return *this;
-}
-
-const TileOperand &TensorTileSampler::y() const
-{
-    return *_y;
-}
-
-TensorTileSampler &TensorTileSampler::y(TileOperand &y)
-{
-    _y = &y;
-    return *this;
-}
-
-const TileOperand &TensorTileSampler::z() const
-{
-    return *_z;
-}
-
-TensorTileSampler &TensorTileSampler::z(TileOperand &z)
-{
-    _z = &z;
-    return *this;
-}
-
-const TileOperand &TensorTileSampler::b() const
-{
-    return *_b;
-}
-
-TensorTileSampler &TensorTileSampler::b(TileOperand &b)
-{
-    _b = &b;
-    return *this;
-}
-
-int32_t TensorTileSampler::width() const
-{
-    return _width;
-}
-
-TensorTileSampler &TensorTileSampler::width(int32_t width)
-{
-    _width = width;
-    return *this;
-}
-
-int32_t TensorTileSampler::height() const
-{
-    return _height;
-}
-
-TensorTileSampler &TensorTileSampler::height(int32_t height)
-{
-    _height = height;
-    return *this;
-}
-
-TensorSamplerFormat TensorTileSampler::format() const
-{
-    return _format;
-}
-
-TensorTileSampler &TensorTileSampler::format(TensorSamplerFormat format)
-{
-    _format = format;
-    return *this;
-}
-
-TensorSamplerAddressModeX TensorTileSampler::address_mode_x() const
-{
-    return _address_mode_x;
-}
-
-TensorTileSampler &TensorTileSampler::address_mode_x(TensorSamplerAddressModeX address_mode_x)
-{
-    _address_mode_x = address_mode_x;
-    return *this;
-}
-
-TensorSamplerAddressModeY TensorTileSampler::address_mode_y() const
-{
-    return _address_mode_y;
-}
-
-TensorTileSampler &TensorTileSampler::address_mode_y(TensorSamplerAddressModeY address_mode_y)
-{
-    _address_mode_y = address_mode_y;
-    return *this;
-}
-
-TensorSamplerAddressModeZ TensorTileSampler::address_mode_z() const
-{
-    return _address_mode_z;
-}
-
-TensorTileSampler &TensorTileSampler::address_mode_z(TensorSamplerAddressModeZ address_mode_z)
-{
-    _address_mode_z = address_mode_z;
-    return *this;
-}
-
-} // namespace ckw
diff --git a/compute_kernel_writer/prototype/src/TileInfo.cpp b/compute_kernel_writer/prototype/src/TileInfo.cpp
deleted file mode 100644
index 273266eedc..0000000000
--- a/compute_kernel_writer/prototype/src/TileInfo.cpp
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "ckw/TileInfo.h"
-
-namespace ckw
-{
-TileInfo::TileInfo(DataType dt) : _dt(dt), _shape({{1, 1}})
-{
-}
-
-TileInfo::TileInfo(DataType dt, int32_t w) : _dt(dt), _shape({{w, 1}})
-{
-}
-
-TileInfo::TileInfo(DataType dt, int32_t h, int32_t w) : _dt(dt), _shape({{w, h}})
-{
-}
-
-TileInfo &TileInfo::width(int32_t w)
-{
-    _shape[kTileWidthIdx] = w;
-    return *this;
-}
-
-int32_t TileInfo::width() const
-{
-    return _shape[kTileWidthIdx];
-}
-
-TileInfo &TileInfo::height(int32_t h)
-{
-    _shape[kTileHeightIdx] = h;
-    return *this;
-}
-
-int32_t TileInfo::height() const
-{
-    return _shape[kTileHeightIdx];
-}
-
-TileInfo &TileInfo::data_type(DataType dt)
-{
-    _dt = dt;
-    return *this;
-}
-
-DataType TileInfo::data_type() const
-{
-    return _dt;
-}
-} // namespace ckw
diff --git a/compute_kernel_writer/prototype/src/TileOperand.cpp b/compute_kernel_writer/prototype/src/TileOperand.cpp
deleted file mode 100644
index e09c833d96..0000000000
--- a/compute_kernel_writer/prototype/src/TileOperand.cpp
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "ckw/TileOperand.h"
-
-#include "ckw/Error.h"
-
-#include "src/Prototype.h"
-
-namespace ckw
-{
-
-TileOperand::TileOperand(const std::string &name, const TileInfo &info)
-    : OperandBase(name), _info(info), _value{std::vector<std::string>{"0"}}, _constant(false)
-{
-}
-
-TileOperand::TileOperand(const std::string &name, DataType data_type)
-    : OperandBase(name), _info(TileInfo{data_type}), _value{std::vector<std::string>{"0"}}, _constant(false)
-{
-}
-
-TileOperand::TileOperand(const std::string &name, int32_t value)
-    : OperandBase(name),
-      _info(TileInfo{DataType::Int32}),
-      _value{std::vector<std::string>{std::to_string(value)}},
-      _constant(true)
-{
-}
-
-TileOperand::TileOperand(const std::string &name, float value)
-    : OperandBase(name),
-      _info(TileInfo{DataType::Fp32}),
-      _value{std::vector<std::string>{std::to_string(value)}},
-      _constant(true)
-{
-}
-
-TileOperand::TileOperand(const std::string &name, const TileContainer &vals, DataType dt)
-    : OperandBase(name),
-      _info(TileInfo{dt, static_cast<int32_t>(vals.size()), static_cast<int32_t>(vals[0].size())}),
-      _value(vals),
-      _constant(true)
-{
-}
-
-prototype::Operand TileOperand::create_impl_operand(prototype::IGpuKernelWriter *writer) const
-{
-    CKW_UNUSED(writer);
-
-    if (_constant)
-    {
-        if (is_scalar())
-        {
-            switch (_info.data_type())
-            {
-                case DataType::Int32:
-                    return prototype::Operand(_value[0][0], prototype::OperandType::ScalarInt32);
-
-                case DataType::Fp32:
-                    return prototype::Operand(_value[0][0], prototype::OperandType::ScalarFp32);
-
-                case DataType::Fp16:
-                    return prototype::Operand(_value[0][0], prototype::OperandType::ScalarFp16);
-
-                default:
-                    CKW_ASSERT(false);
-            }
-        }
-        else
-        {
-            return prototype::Operand(name());
-        }
-    }
-    else
-    {
-        return prototype::Operand(name(), prototype::OperandType::Tile);
-    }
-}
-
-const TileInfo &TileOperand::tile_info() const
-{
-    return _info;
-}
-
-DataType TileOperand::data_type() const
-{
-    return _info.data_type();
-}
-
-bool TileOperand::is_constant() const
-{
-    return _constant;
-}
-
-bool TileOperand::is_scalar() const
-{
-    return _info.width() == 1 && _info.height() == 1;
-}
-
-std::string TileOperand::scalar_value() const
-{
-    CKW_ASSERT(is_scalar());
-    CKW_ASSERT(is_constant());
-
-    return _value[0][0];
-}
-
-const TileContainer &TileOperand::value() const
-{
-    return _value;
-}
-
-} // namespace ckw
diff --git a/filelist.json b/filelist.json
index 2f33b5cd5e..dcf3204ecd 100644
--- a/filelist.json
+++ b/filelist.json
@@ -2324,7 +2324,6 @@
         "src/dynamic_fusion/sketch/attributes/ResizeAttributes.cpp",
         "src/dynamic_fusion/sketch/attributes/SoftmaxAttributes.cpp",
         "src/dynamic_fusion/sketch/attributes/ReshapeAttributes.cpp",
-        "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.cpp",
         "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp",
         "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp",
         "src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp",
@@ -2339,8 +2338,6 @@
         "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.cpp",
         "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp",
         "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp",
-        "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.cpp",
-        "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.cpp",
         "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.cpp",
         "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.cpp",
         "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp",
@@ -2361,21 +2358,6 @@
         "src/dynamic_fusion/sketch/gpu/operators/GpuTanh.cpp",
         "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.cpp"
       ],
-      "template_writer": [
-        "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp",
-        "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp",
-        "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp",
-        "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp",
-        "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp",
-        "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp",
-        "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.cpp",
-        "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.cpp",
-        "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.cpp",
-        "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp",
-        "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp",
-        "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp",
-        "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp"
-      ],
       "ckw_driver": [
         "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp",
         "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp",
diff --git a/scripts/clang_tidy_rules.py b/scripts/clang_tidy_rules.py
index 1e1ab7f545..f244017dbd 100755
--- a/scripts/clang_tidy_rules.py
+++ b/scripts/clang_tidy_rules.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 #
-# Copyright (c) 2017-2023 Arm Limited.
+# Copyright (c) 2017-2024 Arm Limited.
 #
 # SPDX-License-Identifier: MIT
 #
@@ -28,7 +28,7 @@ import re
 import sys
 
 def get_list_includes():
-    return "compute_kernel_writer/prototype/include " \
+    return "compute_kernel_writer/include " \
            "src/cpu/kernels/assembly " \
            "src/core/NEON/kernels/assembly " \
            "src/core/NEON/kernels/convolution/winograd " \
@@ -43,8 +43,6 @@ def get_list_flags( filename, arch):
     flags.append("-DARM_COMPUTE_OPENCL_ENABLED")
     if arch == "aarch64":
         flags.append("-DARM_COMPUTE_AARCH64_V8_2")
-    if "ckw_driver" in filename:
-        flags.append("-DACL_INTERNAL_TEST_CKW_IN_DF")
 
     return flags
 
diff --git a/scripts/generate_android_bp.py b/scripts/generate_android_bp.py
index f7ecbc468b..6efd072acd 100755
--- a/scripts/generate_android_bp.py
+++ b/scripts/generate_android_bp.py
@@ -108,6 +108,7 @@ cc_library_static {
     proprietary: true,
     local_include_dirs: ["build/android-arm64v8a/src/core",
                          "build/android-arm64v8a/src/core/CL",
+                         "compute_kernel_writer/include",
                          "src/core/common",
                          "src/core/helpers",
                          "src/core/NEON/kernels/arm_gemm",
diff --git a/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp
index 9ca20fa152..eab5cddd07 100644
--- a/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp
+++ b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,12 +26,11 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 
 #include "src/core/CL/CLUtils.h"
-#ifdef ACL_INTERNAL_TEST_CKW_IN_DF
 #include "src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h"
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h"
 #include "src/gpu/cl/ClKernelLibrary.h"
 #include "support/Cast.h"
+
 namespace arm_compute
 {
 namespace experimental
@@ -61,128 +60,6 @@ void ClKernelRuntime::configure(const ClCompileContext &compile_ctx, const GpuKe
     _arguments = code.arguments();
 }
 
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-
-inline void ClKernelRuntime::add_tensor_argument(unsigned int                &idx,
-                                                 const GpuKernelArgumentInfo &arg,
-                                                 const ICLTensor             *tensor,
-                                                 const Window                &arg_slice,
-                                                 std::vector<cl::Image2D>    &cl_images)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
-
-    switch (arg.type)
-    {
-        case GpuKernelArgumentInfo::Type::Scalar:
-        {
-            ARM_COMPUTE_ERROR("Unsupported yet");
-            break;
-        }
-
-        case GpuKernelArgumentInfo::Type::Vector:
-        {
-            add_1D_tensor_argument(idx, tensor, arg_slice);
-            break;
-        }
-
-        case GpuKernelArgumentInfo::Type::Image:
-        {
-            add_2D_tensor_argument(idx, tensor, arg_slice);
-            break;
-        }
-        case GpuKernelArgumentInfo::Type::Image_Reinterpret_As_3D:
-        {
-            add_2D_tensor_argument(idx, tensor, arg_slice);
-            const unsigned int total_cross_plane_pad = tensor->info()->padding().top + tensor->info()->padding().bottom;
-            _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(total_cross_plane_pad));
-            break;
-        }
-        case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D:
-        {
-            const TensorShape shape2d(tensor->info()->dimension(0) / 4, tensor->info()->dimension(1) *
-                                                                            tensor->info()->dimension(2) *
-                                                                            tensor->info()->dimension(3));
-            const size_t      image_row_pitch = tensor->info()->strides_in_bytes()[1];
-            cl::Image2D       tensor_image2d =
-                create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), shape2d,
-                                           tensor->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
-            cl_images.push_back(tensor_image2d);
-            _kernel.setArg(idx++, tensor_image2d);
-            break;
-        }
-
-        case GpuKernelArgumentInfo::Type::Image_3D:
-        {
-            add_2D_tensor_argument(idx, tensor, arg_slice);
-            _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(tensor->info()->strides_in_bytes()[2]));
-            break;
-        }
-        case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D:
-        {
-            const TensorShape shape2d(tensor->info()->dimension(0) / 4, tensor->info()->dimension(1) *
-                                                                            tensor->info()->dimension(2) *
-                                                                            tensor->info()->dimension(3));
-            const size_t      image_row_pitch = tensor->info()->strides_in_bytes()[1];
-            cl::Image2D       tensor_image2d =
-                create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), shape2d,
-                                           tensor->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
-            cl_images.push_back(tensor_image2d);
-            _kernel.setArg(idx++, tensor_image2d);
-            _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(tensor->info()->strides_in_bytes()[2]));
-            break;
-        }
-
-        case GpuKernelArgumentInfo::Type::Tensor_3D:
-        {
-            add_3D_tensor_argument(idx, tensor, arg_slice);
-            break;
-        }
-
-        case GpuKernelArgumentInfo::Type::Tensor_4D:
-        {
-            add_4D_tensor_argument(idx, tensor, arg_slice);
-            break;
-        }
-        case GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer:
-        {
-            add_4d_tensor_nhwc_argument(idx, tensor);
-            break;
-        }
-        case GpuKernelArgumentInfo::Type::Tensor_4D_t_Image:
-        {
-            const size_t image_w        = tensor->info()->dimension(0) / 4;
-            const size_t image_h        = tensor->info()->tensor_shape().total_size_upper(1);
-            const size_t image_stride_y = tensor->info()->strides_in_bytes()[1];
-
-            cl::Image2D tensor_image2d = create_image2d_from_buffer(
-                CLKernelLibrary::get().context(), tensor->cl_buffer(), TensorShape(image_w, image_h),
-                tensor->info()->data_type(), image_stride_y, CLImage2DType::ReadOnly);
-            cl_images.push_back(tensor_image2d);
-
-            _kernel.setArg(idx++, tensor_image2d);
-            add_4d_tensor_nhwc_argument(idx, tensor);
-            break;
-        }
-        case GpuKernelArgumentInfo::Type::Tensor_Special_0:
-        {
-            const ITensorInfo *info    = tensor->info();
-            const Strides     &strides = info->strides_in_bytes();
-
-            _kernel.setArg(idx++, tensor->cl_buffer());
-            const size_t dim1xdim2 = info->tensor_shape()[1] * info->tensor_shape()[2];
-            _kernel.setArg<cl_int>(idx++, static_cast<int32_t>(dim1xdim2));
-            const size_t stride1 = strides[1];
-            _kernel.setArg<cl_int>(idx++, static_cast<int32_t>(stride1));
-            break;
-        }
-        default:
-        {
-            ARM_COMPUTE_ERROR("Unsupported");
-        }
-    }
-}
-
-#else // ACL_INTERNAL_TEST_CKW_IN_DF
 inline void ClKernelRuntime::add_kernel_argument(unsigned int                   &idx,
                                                  const GpuKernelArgumentBinding &arg,
                                                  const ICLTensor                *tensor,
@@ -234,7 +111,6 @@ inline void ClKernelRuntime::add_kernel_argument(unsigned int
     }
 }
 
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
 void ClKernelRuntime::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
@@ -253,17 +129,7 @@ void ClKernelRuntime::run_op(ITensorPack &tensors, const Window &window, cl::Com
         // Set kernel arguments
         // CLImages created from tensor arguments. Need to be retained until enqueue
         std::vector<cl::Image2D> cl_images;
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-        for (auto id_arg : _arguments)
-        {
-            const auto arg    = id_arg.second;
-            auto       tensor = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(id_arg.first));
-            ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
-            ARM_COMPUTE_ERROR_ON_NULLPTR(tensor->info());
-            add_tensor_argument(idx, *arg.kernel_argument_info(), tensor, slice, cl_images);
-        }
 
-#else  // ACL_INTERNAL_TEST_CKW_IN_DF
         for (const auto &arg : _arguments)
         {
             auto tensor = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(arg.id()));
@@ -271,7 +137,6 @@ void ClKernelRuntime::run_op(ITensorPack &tensors, const Window &window, cl::Com
             ARM_COMPUTE_ERROR_ON_NULLPTR(tensor->info());
             add_kernel_argument(idx, arg, tensor, cl_images);
         }
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
 
         // Dispatch kernel
         enqueue(queue, *this, slice, lws_hint(), use_dummy_work_items);
diff --git a/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h
index e78567eb9d..148e4db581 100644
--- a/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h
+++ b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef SRC_DYNAMIC_FUSION_RUNTIME_GPU_CL_CLKERNELRUNTIME
-#define SRC_DYNAMIC_FUSION_RUNTIME_GPU_CL_CLKERNELRUNTIME
+#ifndef ACL_SRC_DYNAMIC_FUSION_RUNTIME_GPU_CL_CLKERNELRUNTIME_H
+#define ACL_SRC_DYNAMIC_FUSION_RUNTIME_GPU_CL_CLKERNELRUNTIME_H
 
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h"
@@ -59,21 +59,6 @@ public:
     virtual void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
 
 private:
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-    /** Set a kernel tensor argument
-     *
-     * @param[in,out] idx       Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
-     * @param[in]     arg       Kernel argument descriptor accompanying @p tensor
-     * @param[in]     tensor    Tensor to set as an argument of the object's kernel
-     * @param[in]     arg_slice Window the kernel will be run on
-     * @param[out]    cl_images Extra cl images created from the tensor (will need to be retained until the kernel is enqueued)
-     */
-    inline void add_tensor_argument(unsigned int                &idx,
-                                    const GpuKernelArgumentInfo &arg,
-                                    const ICLTensor             *tensor,
-                                    const Window                &arg_slice,
-                                    std::vector<cl::Image2D>    &cl_images);
-#else  // ACL_INTERNAL_TEST_CKW_IN_DF
     /** Set a kernel argument as part of a tensor
      *
      * @param[in,out] idx       Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
@@ -85,7 +70,6 @@ private:
                                     const GpuKernelArgumentBinding &arg,
                                     const ICLTensor                *tensor,
                                     std::vector<cl::Image2D>       &cl_images);
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
 
 private:
     GpuKernelArgumentList _arguments{};
@@ -94,4 +78,4 @@ private:
 } // namespace dynamic_fusion
 } // namespace experimental
 } // namespace arm_compute
-#endif /* SRC_DYNAMIC_FUSION_RUNTIME_GPU_CL_CLKERNELRUNTIME */
+#endif // ACL_SRC_DYNAMIC_FUSION_RUNTIME_GPU_CL_CLKERNELRUNTIME_H
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.cpp
deleted file mode 100644
index 9cecfc2ffd..0000000000
--- a/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.cpp
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-bool operator==(const GpuKernelArgumentInfo &info0, const GpuKernelArgumentInfo &info1)
-{
-    return info0.type == info1.type;
-}
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h b/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h
index 03817173f4..c923bf9c16 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELARGUMENT
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELARGUMENT
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELARGUMENT_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELARGUMENT_H
 
 #include "arm_compute/core/TensorInfo.h"
 
@@ -32,96 +32,6 @@ namespace experimental
 {
 namespace dynamic_fusion
 {
-/** Contain information required to set up a kernel argument at run time
- * @deprecated To be removed along with ClTemplateWriter
- */
-struct GpuKernelArgumentInfo
-{
-    /** Enumerate all the tensor arguments variants used by all kernel implementations.  */
-    enum class Type : int
-    {
-        Scalar,
-
-        Vector,
-
-        Image,
-        Image_Reinterpret_As_3D,
-        Image_Export_To_ClImage2D,
-
-        Image_3D, // 3D Tensor represented as a 2D Image + stride_z
-        Image_3D_Export_To_ClImage2D,
-
-        Tensor_3D,
-        Tensor_4D,
-        Tensor_4D_t_Buffer,
-        Tensor_4D_t_Image,
-
-        Tensor_Special_0,
-    };
-    /** Default constructor */
-    GpuKernelArgumentInfo() = default;
-    /** Constructor */
-    GpuKernelArgumentInfo(Type type) : type{type}
-    {
-    }
-    Type type{Type::Tensor_4D_t_Buffer};
-};
-bool operator==(const GpuKernelArgumentInfo &info0, const GpuKernelArgumentInfo &info1);
-/** Kernel argument information linked with its corresponding @ref ITensorInfo
- * @deprecated To be removed along with ClTemplateWriter
- */
-class GpuKernelArgument
-{
-public:
-    /** Constructor
-     *
-     * @param[in] tensor_info     Associated @ref ITensorInfo
-     * @param[in] kernel_arg_info Associated @ref GpuKernelArgumentInfo
-     */
-    GpuKernelArgument(const ITensorInfo &tensor_info, const GpuKernelArgumentInfo &kernel_arg_info)
-        : _tensor_info{tensor_info}, _kernel_arg_info{kernel_arg_info}
-    {
-    }
-    /** Get workload tensor id */
-    ITensorInfo::Id id() const
-    {
-        return _tensor_info.id();
-    }
-    /** Get associated @ref ITensorInfo */
-    ITensorInfo *tensor_info()
-    {
-        return &_tensor_info;
-    }
-    /** Get associated @ref ITensorInfo */
-    const ITensorInfo *tensor_info() const
-    {
-        return &_tensor_info;
-    }
-    /** Get associated @ref GpuKernelArgumentInfo */
-    GpuKernelArgumentInfo *kernel_argument_info()
-    {
-        return &_kernel_arg_info;
-    }
-    /** Get associated @ref GpuKernelArgumentInfo */
-    const GpuKernelArgumentInfo *kernel_argument_info() const
-    {
-        return &_kernel_arg_info;
-    }
-    /** Check if the associated workload tensor has valid id
-     *
-     * @return true if has valid id
-     * @return false  otherwise
-     */
-    bool has_valid_id() const
-    {
-        return _tensor_info.has_valid_id();
-    }
-
-private:
-    TensorInfo            _tensor_info{};
-    GpuKernelArgumentInfo _kernel_arg_info{};
-};
-#ifdef ACL_INTERNAL_TEST_CKW_IN_DF
 /** Describe how the tensor runtime memory can be accessed
  *
  * Please see documentation under @ref GpuKernelArgumentBinding
@@ -243,9 +153,8 @@ private:
     };
     Value _value;
 };
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
 
 } // namespace dynamic_fusion
 } // namespace experimental
 } // namespace arm_compute
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELARGUMENT */
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELARGUMENT_H
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h b/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h
index 24812cd8a7..11d916eec9 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,19 +21,15 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELSOURCECODE
-#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELSOURCECODE
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELSOURCECODE_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELSOURCECODE_H
 
 #include "arm_compute/core/CL/CLCompileContext.h"
 #include "arm_compute/core/Window.h"
 
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
 
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-#include <map>
-#else // ACL_INTERNAL_TEST_CKW_IN_DF
 #include <deque>
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
 #include <string>
 
 namespace arm_compute
@@ -43,11 +39,7 @@ namespace experimental
 namespace dynamic_fusion
 {
 /** The argument list of a @ref GpuKernelSourceCode */
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-using GpuKernelArgumentList = std::map<ITensorInfo::Id, GpuKernelArgument>;
-#else  // ACL_INTERNAL_TEST_CKW_IN_DF
 using GpuKernelArgumentList = std::deque<GpuKernelArgumentBinding>;
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
 
 /** Container of kernel code to be compiled and run in a @ref GpuUnitWorkload
  */
@@ -132,4 +124,4 @@ private:
 } // namespace dynamic_fusion
 } // namespace experimental
 } // namespace arm_compute
-#endif /* ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELSOURCECODE */
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELSOURCECODE_H
diff --git a/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp b/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp
index 502ceab807..725a46e91c 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp
+++ b/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,14 +26,10 @@
 #include "arm_compute/core/experimental/Types.h"
 
 #include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h"
 #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h"
 #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuComponentServices.h"
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.h"
-#else // ACL_INTERNAL_TEST_CKW_IN_DF
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h"
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
 
 namespace arm_compute
 {
@@ -41,8 +37,8 @@ namespace experimental
 {
 namespace dynamic_fusion
 {
-GpuLogicalKernel::GpuLogicalKernel(GpuComponentServices *services, const GpuKernelComponentGroup &components)
-    : _comp_group{components}, _store_components{}
+GpuLogicalKernel::GpuLogicalKernel(GpuComponentServices *services, GpuKernelComponentGroup components) // NOLINT
+    : _comp_group{std::move(components)}, _store_components{}
 {
     ARM_COMPUTE_UNUSED(services);
 }
@@ -50,19 +46,11 @@ GpuLogicalKernel::GpuLogicalKernel(GpuComponentServices *services, const GpuKern
 GpuKernelSourceCode GpuLogicalKernel::write_kernel_code()
 {
     GpuKernelSourceCode code;
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-    ClTemplateWriter writer{_comp_group};
-#else  // ACL_INTERNAL_TEST_CKW_IN_DF
-    GpuCkwDriver writer{_comp_group};
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
+    GpuCkwDriver        writer{_comp_group};
 
     code.name(writer.get_name());
     code.code(writer.get_code());
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-    code.arguments(writer.get_tensors());
-#else  // ACL_INTERNAL_TEST_CKW_IN_DF
     code.arguments(writer.get_kernel_arguments());
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
     code.build_options(writer.get_build_options());
     code.config_id(writer.get_config_id());
     code.window(writer.get_window());
diff --git a/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.h b/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.h
index 1fd40f0acd..e2bc83b286 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.h
+++ b/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPULOGICALKERNEL
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPULOGICALKERNEL
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPULOGICALKERNEL_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPULOGICALKERNEL_H
 
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h"
@@ -52,7 +52,7 @@ public:
      * @param[in] services   @ref GpuComponentServices to be used
      * @param[in] components Component group from which this logical kernel is initialized
      */
-    explicit GpuLogicalKernel(GpuComponentServices *services, const GpuKernelComponentGroup &components);
+    explicit GpuLogicalKernel(GpuComponentServices *services, GpuKernelComponentGroup components); // NOLINT
     /** Allow instances of this class to be copy constructed */
     GpuLogicalKernel(const GpuLogicalKernel &) = default;
     /** Allow instances of this class to be copied */
@@ -71,4 +71,4 @@ private:
 } // namespace dynamic_fusion
 } // namespace experimental
 } // namespace arm_compute
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPULOGICALKERNEL */
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPULOGICALKERNEL_H
diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h
index 43bcc47fa0..5d75bcaaa0 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h
+++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSOURCECODE
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSOURCECODE
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSOURCECODE_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSOURCECODE_H
 
 #include "arm_compute/core/experimental/Types.h"
 #include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h"
@@ -36,7 +36,6 @@ namespace experimental
 {
 namespace dynamic_fusion
 {
-#ifdef ACL_INTERNAL_TEST_CKW_IN_DF
 namespace
 {
 /** Extract kernel arguments of one tensor from a flat list of kernel arguments.
@@ -70,7 +69,6 @@ GpuKernelArgumentList extract_kernel_args_for_one_tensor(GpuKernelArgumentList &
     return tensor_kargs;
 }
 } // namespace
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
 /** Uniquely identifies a @ref GpuUnitWorkload within a @ref GpuWorkloadSourceCode */
 using UnitWorkloadId = int32_t;
 
@@ -83,25 +81,11 @@ class GpuWorkloadArgument
 public:
     /** Default constructor */
     GpuWorkloadArgument() = default;
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
     /** Constructor
      *
-     * @param[in] tensor_info     @ref ITensorInfo of the workload argument
-     * @param[in] mem_desc        @ref MemoryDescriptor of the workload argument
-     * @param[in] kernel_arg_info @ref GpuKernelArgumentInfo of the workload argument
-     */
-    GpuWorkloadArgument(const ITensorInfo           &tensor_info,
-                        const MemoryDescriptor      &mem_desc,
-                        const GpuKernelArgumentInfo &kernel_arg_info)
-        : _tensor_info{tensor_info}, _mem_desc{mem_desc}, _kernel_arg_info{kernel_arg_info}
-    {
-    }
-#else  // ACL_INTERNAL_TEST_CKW_IN_DF
-    /** Constructor
-     *
-     * @param[in] tensor_info     @ref ITensorInfo of the workload argument
-     * @param[in] mem_desc        @ref MemoryDescriptor of the workload argument
-     * @param[in] kernel_arg_list @ref GpuKernelArgumentList of the workload argument
+     * @param[in] tensor_info @ref ITensorInfo of the workload argument
+     * @param[in] mem_desc    @ref MemoryDescriptor of the workload argument
+     * @param[in] kernel_args @ref GpuKernelArgumentList of the workload argument
      */
     GpuWorkloadArgument(const ITensorInfo           &tensor_info,
                         const MemoryDescriptor      &mem_desc,
@@ -109,7 +93,6 @@ public:
         : _tensor_info{tensor_info}, _mem_desc{mem_desc}, _kernel_args{kernel_args}
     {
     }
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
     /** Get tensor id within workload */
     ITensorInfo::Id id() const
     {
@@ -135,18 +118,6 @@ public:
     {
         return &_mem_desc;
     }
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-    /** Get @ref GpuKernelArgumentInfo of the argument */
-    GpuKernelArgumentInfo *kernel_argument_info()
-    {
-        return &_kernel_arg_info;
-    }
-    /** Get @ref GpuKernelArgumentInfo of the argument */
-    const GpuKernelArgumentInfo *kernel_argument_info() const
-    {
-        return &_kernel_arg_info;
-    }
-#else  // ACL_INTERNAL_TEST_CKW_IN_DF
     /** Get @ref GpuKernelArgumentList of the workload tensor */
     GpuKernelArgumentList *kernel_argument_list()
     {
@@ -157,7 +128,6 @@ public:
     {
         return &_kernel_args;
     }
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
     /** Check if the workload argument has valid id
      *
      * @return true   If has valid id
@@ -169,13 +139,9 @@ public:
     }
 
 private:
-    TensorInfo       _tensor_info{};
-    MemoryDescriptor _mem_desc{};
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-    GpuKernelArgumentInfo _kernel_arg_info{};
-#else  // ACL_INTERNAL_TEST_CKW_IN_DF
+    TensorInfo            _tensor_info{};
+    MemoryDescriptor      _mem_desc{};
     GpuKernelArgumentList _kernel_args{};
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
 };
 
 /** Describes when a unit workload is run.
@@ -259,22 +225,7 @@ public:
         const auto uwk_id    = static_cast<UnitWorkloadId>(_unit_workloads.size());
         const auto unit_work = GpuUnitWorkload(uwk_id, kernel_code, stage);
         _unit_workloads.push_back(unit_work);
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-        ARM_COMPUTE_UNUSED(context);
-        // Assemble kernel argument with memory descriptor to form workload argument
-        for (const auto &id_arg : kernel_code.arguments())
-        {
-            const auto arg_id = id_arg.first;
-            const auto arg    = id_arg.second;
-            _workload_arguments[arg_id] =
-                GpuWorkloadArgument{*arg.tensor_info(), mem_map.at(arg_id), *arg.kernel_argument_info()};
-            if (_tensor_uwork_map.find(arg_id) == _tensor_uwork_map.end())
-            {
-                _tensor_uwork_map[arg_id] = std::set<UnitWorkloadId>();
-            }
-            _tensor_uwork_map[arg_id].insert(uwk_id);
-        }
-#else  // ACL_INTERNAL_TEST_CKW_IN_DF
+
         GpuKernelArgumentList flat_kernel_args = kernel_code.arguments();
         GpuKernelArgumentList tensor_kargs{};
         while (true)
@@ -296,7 +247,7 @@ public:
                 _tensor_uwork_map[tensor_id].insert(uwk_id);
             }
         }
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
+
         return uwk_id;
     }
     /** Get a unit workload from its id */
@@ -346,4 +297,4 @@ private:
 } // namespace dynamic_fusion
 } // namespace experimental
 } // namespace arm_compute
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSOURCECODE */
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSOURCECODE_H
diff --git a/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h b/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h
index ad474674f9..84972501de 100644
--- a/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h
+++ b/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_IGPUKERNELWRITER
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_IGPUKERNELWRITER
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_IGPUKERNELWRITER_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_IGPUKERNELWRITER_H
 
 #include "arm_compute/core/CL/CLCompileContext.h"
 #include "arm_compute/core/Window.h"
@@ -62,23 +62,14 @@ public:
     virtual std::string get_config_id() = 0;
     /** Generate execution window */
     virtual Window get_window() const = 0;
-    /** Get the kernel argument lists of the kernel
-     * @deprecated To be removed along with ClTemplateWriter
-     */
-    virtual std::map<ITensorInfo::Id, GpuKernelArgument> get_tensors()
-    {
-        return {};
-    }
-#ifdef ACL_INTERNAL_TEST_CKW_IN_DF
     /** Get the flat list of arguments of the kernel*/
     virtual GpuKernelArgumentList get_kernel_arguments()
     {
         return {};
     }
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
 };
 
 } // namespace dynamic_fusion
 } // namespace experimental
 } // namespace arm_compute
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_IGPUKERNELWRITER */
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_IGPUKERNELWRITER_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h
index b80ce0d816..f8770920b7 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h
@@ -24,15 +24,12 @@
 #ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWDRIVER_H
 #define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWDRIVER_H
 
-#include "ckw/Kernel.h"
-
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
 #include "src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h"
 
 #include "compute_kernel_writer/include/ckw/Kernel.h"
 #include "compute_kernel_writer/include/ckw/KernelArgument.h"
-#include <map>
 #include <string>
 
 namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h
index f1f0e6747b..c9ce7eb269 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWSTORE
-#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWSTORE
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWSTORE_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWSTORE_H
 
 #include "src/core/common/Macros.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h"
@@ -33,8 +33,6 @@ namespace experimental
 {
 namespace dynamic_fusion
 {
-/** An interface used by @ref ClTemplateWriter to write source code for a kernel component
- */
 class GpuCkwStore : public IGpuCkwComponentDriver
 {
 public:
@@ -61,4 +59,4 @@ private:
 } // namespace experimental
 } // namespace arm_compute
 
-#endif /* ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWSTORE */
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWSTORE_H
diff --git a/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h b/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h
index 4b8eea2f57..6678c929e9 100644
--- a/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h
+++ b/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_IGPUKERNELCOMPONENT
-#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_IGPUKERNELCOMPONENT
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_IGPUKERNELCOMPONENT_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_IGPUKERNELCOMPONENT_H
 
 #include "src/dynamic_fusion/sketch/ArgumentPack.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h"
@@ -100,10 +100,6 @@ public:
         return _properties;
     }
     /** Get writer for the component */
-    virtual const IGpuTemplateComponentWriter *template_writer() const
-    {
-        return nullptr;
-    }
     virtual const IGpuCkwComponentDriver *ckw_component_driver() const
     {
         return nullptr;
@@ -119,4 +115,4 @@ private:
 } // namespace dynamic_fusion
 } // namespace experimental
 } // namespace arm_compute
-#endif /* ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_IGPUKERNELCOMPONENT */
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_IGPUKERNELCOMPONENT_H
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.cpp
index fdf528a65d..e316bdf46d 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,11 +24,7 @@
 #include "ClComponentActivation.h"
 
 #include "src/core/CL/CLValidate.h"
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.h"
-#else //ACL_INTERNAL_TEST_CKW_IN_DF
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.h"
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 
 namespace arm_compute
 {
@@ -69,11 +65,7 @@ ClComponentActivation::ClComponentActivation(ComponentId
                                              const ArgumentPack<ITensorInfo>       &tensors,
                                              const Attributes                      &attributes)
     : IGpuKernelComponent{id, properties, tensors},
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-      _component_writer{std::make_unique<ClTemplateActivation>(id, tensors, attributes)}
-#else  //ACL_INTERNAL_TEST_CKW_IN_DF
       _component_writer{std::make_unique<GpuCkwActivation>(id, tensors, attributes)}
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 {
 }
 
@@ -81,11 +73,7 @@ ClComponentActivation::~ClComponentActivation()
 {
 }
 
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-const IGpuTemplateComponentWriter *ClComponentActivation::template_writer() const
-#else  //ACL_INTERNAL_TEST_CKW_IN_DF
 const IGpuCkwComponentDriver *ClComponentActivation::ckw_component_driver() const
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 {
     return _component_writer.get();
 }
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h
index 02c854356a..b8185158f3 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTACTIVATION
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTACTIVATION
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTACTIVATION_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTACTIVATION_H
 
 #include "arm_compute/function_info/ActivationLayerInfo.h"
 
@@ -41,11 +41,7 @@ template <typename T>
 class ArgumentPack;
 
 /** Forward declaration */
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-class ClTemplateActivation;
-#else  //ACL_INTERNAL_TEST_CKW_IN_DF
 class GpuCkwActivation;
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 
 class ClComponentActivation final : public IGpuKernelComponent
 {
@@ -106,11 +102,7 @@ public:
     ClComponentActivation &operator=(ClComponentActivation &&component) = default;
 
     /** Get writer for the component */
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-    const IGpuTemplateComponentWriter *template_writer() const override;
-#else  //ACL_INTERNAL_TEST_CKW_IN_DF
-    const IGpuCkwComponentDriver     *ckw_component_driver() const override;
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
+    const IGpuCkwComponentDriver *ckw_component_driver() const override;
 
     /** Get component type */
     GpuComponentType type() const override
@@ -119,13 +111,9 @@ public:
     }
 
 private:
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-    std::unique_ptr<ClTemplateActivation> _component_writer;
-#else  //ACL_INTERNAL_TEST_CKW_IN_DF
     std::unique_ptr<GpuCkwActivation> _component_writer;
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 };
 } // namespace dynamic_fusion
 } // namespace experimental
 } // namespace arm_compute
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTACTIVATION */
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTACTIVATION_H
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.cpp
index b1636795a3..e1850d78c4 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,11 +27,7 @@
 
 #include "src/core/CL/CLValidate.h"
 #include "src/dynamic_fusion/sketch/ArgumentPack.h"
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.h"
-#else //ACL_INTERNAL_TEST_CKW_IN_DF
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.h"
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 
 namespace arm_compute
 {
@@ -72,22 +68,16 @@ ClComponentCast::ClComponentCast(ComponentId                      id,
                                  const Attributes                &attributes,
                                  const Settings                  &settings)
     : IGpuKernelComponent{id, properties, tensors},
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-      _component_writer{std::make_unique<ClTemplateCast>(id, tensors, attributes)}
-#else  //ACL_INTERNAL_TEST_CKW_IN_DF
       _component_writer{std::make_unique<GpuCkwCast>(id, tensors, attributes)}
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 {
     ARM_COMPUTE_UNUSED(attributes, settings);
 }
+
 ClComponentCast::~ClComponentCast()
 {
 }
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-const IGpuTemplateComponentWriter *ClComponentCast::template_writer() const
-#else  //ACL_INTERNAL_TEST_CKW_IN_DF
+
 const IGpuCkwComponentDriver *ClComponentCast::ckw_component_driver() const
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 {
     return _component_writer.get();
 }
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h
index ed77b1203b..201dacc288 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTCAST
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTCAST
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTCAST_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTCAST_H
 
 #include "arm_compute/dynamic_fusion/sketch/attributes/CastAttributes.h"
 
@@ -49,11 +49,7 @@ private:
 };
 
 /** Forward declaration */
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-class ClTemplateCast;
-#else  //ACL_INTERNAL_TEST_CKW_IN_DF
 class GpuCkwCast;
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 
 class ClComponentCast final : public IGpuKernelComponent
 {
@@ -120,11 +116,7 @@ public:
     /** Allow instances of this class to be moved */
     ClComponentCast &operator=(ClComponentCast &&component) = default;
     /** Get writer for the component */
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-    const IGpuTemplateComponentWriter *template_writer() const override;
-#else  //ACL_INTERNAL_TEST_CKW_IN_DF
     const IGpuCkwComponentDriver *ckw_component_driver() const override;
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
     /** Get component type */
     GpuComponentType type() const override
     {
@@ -132,14 +124,10 @@ public:
     }
 
 private:
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-    std::unique_ptr<ClTemplateCast> _component_writer;
-#else  //ACL_INTERNAL_TEST_CKW_IN_DF
-    std::unique_ptr<GpuCkwCast>   _component_writer;
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
+    std::unique_ptr<GpuCkwCast> _component_writer;
 };
 } // namespace dynamic_fusion
 } // namespace experimental
 } // namespace arm_compute
 
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTCAST */
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTCAST_H
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp
index ca8037c393..7cd23d6115 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,11 +28,7 @@
 #include "arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h"
 
 #include "src/core/CL/CLValidate.h"
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.h"
-#else //ACL_INTERNAL_TEST_CKW_IN_DF
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.h"
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 
 namespace arm_compute
 {
@@ -212,22 +208,14 @@ ClComponentDepthwiseConv2d::ClComponentDepthwiseConv2d(ComponentId
                                                        const Attributes                &attributes,
                                                        const Settings                  &settings)
     : IGpuKernelComponent{id, properties, tensors},
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-      _component_writer{std::make_unique<ClTemplateDepthwiseConv2d>(id, tensors, attributes, settings)}
-#else  //ACL_INTERNAL_TEST_CKW_IN_DF
       _component_writer{std::make_unique<GpuCkwDepthwiseConv2d>(id, tensors, attributes, settings)}
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 {
     ARM_COMPUTE_UNUSED(attributes, settings);
 }
 ClComponentDepthwiseConv2d::~ClComponentDepthwiseConv2d()
 {
 }
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-const IGpuTemplateComponentWriter *ClComponentDepthwiseConv2d::template_writer() const
-#else  //ACL_INTERNAL_TEST_CKW_IN_DF
 const IGpuCkwComponentDriver *ClComponentDepthwiseConv2d::ckw_component_driver() const
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 {
     return _component_writer.get();
 }
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h
index 01168e9ded..7526361f1c 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -44,11 +44,7 @@ class ArgumentPack;
 class DepthwiseConv2dAttributes;
 
 /** Forward declaration */
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-class ClTemplateDepthwiseConv2d;
-#else  //ACL_INTERNAL_TEST_CKW_IN_DF
 class GpuCkwDepthwiseConv2d;
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 
 /** Component specific settings
  */
@@ -161,13 +157,8 @@ public:
     ClComponentDepthwiseConv2d(ClComponentDepthwiseConv2d &&component) = default;
     /** Allow instances of this class to be moved */
     ClComponentDepthwiseConv2d &operator=(ClComponentDepthwiseConv2d &&component) = default;
-    /** Get template writer for the component */
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-    const IGpuTemplateComponentWriter *template_writer() const override;
-#else  //ACL_INTERNAL_TEST_CKW_IN_DF
-    const IGpuCkwComponentDriver          *ckw_component_driver() const override;
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
-
+    /** Get writer for the component */
+    const IGpuCkwComponentDriver *ckw_component_driver() const override;
     /** Get component type */
     GpuComponentType type() const override
     {
@@ -175,11 +166,7 @@ public:
     }
 
 private:
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-    std::unique_ptr<ClTemplateDepthwiseConv2d> _component_writer;
-#else  //ACL_INTERNAL_TEST_CKW_IN_DF
     std::unique_ptr<GpuCkwDepthwiseConv2d> _component_writer;
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 };
 } // namespace dynamic_fusion
 } // namespace experimental
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp
index 98f3d6a882..783a17df30 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,12 +28,7 @@
 #include "arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h"
 
 #include "src/core/CL/CLValidate.h"
-
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h"
-#else // ACL_INTERNAL_TEST_CKW_IN_DF
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h"
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
 
 namespace arm_compute
 {
@@ -153,11 +148,7 @@ ClComponentDirectConv2d::ClComponentDirectConv2d(ComponentId
                                                  const Attributes                &attributes,
                                                  const Settings                  &settings)
     : IGpuKernelComponent{id, properties, tensors},
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-      _component_writer{std::make_unique<ClTemplateDirectConv2d>(id, tensors, attributes, settings)}
-#else  // ACL_INTERNAL_TEST_CKW_IN_DF
       _component_writer{std::make_unique<GpuCkwDirectConv2d>(id, tensors, attributes, settings)}
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
 {
 }
 
@@ -165,11 +156,7 @@ ClComponentDirectConv2d::~ClComponentDirectConv2d()
 {
 }
 
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-const IGpuTemplateComponentWriter *ClComponentDirectConv2d::template_writer() const
-#else  // ACL_INTERNAL_TEST_CKW_IN_DF
 const IGpuCkwComponentDriver *ClComponentDirectConv2d::ckw_component_driver() const
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
 {
     return _component_writer.get();
 }
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h
index d6d9705d3c..c50b0fa0ce 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDIRECTCONV2D
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDIRECTCONV2D
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDIRECTCONV2D_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDIRECTCONV2D_H
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/KernelDescriptors.h"
@@ -68,11 +68,7 @@ private:
 };
 
 /** Forward declaration */
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-class ClTemplateDirectConv2d;
-#else  // ACL_INTERNAL_TEST_CKW_IN_DF
 class GpuCkwDirectConv2d;
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
 
 class ClComponentDirectConv2d final : public IGpuKernelComponent
 {
@@ -139,11 +135,7 @@ public:
     /** Allow instances of this class to be moved */
     ClComponentDirectConv2d &operator=(ClComponentDirectConv2d &&component) = default;
     /** Get writer for the component */
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-    const IGpuTemplateComponentWriter *template_writer() const override;
-#else  // ACL_INTERNAL_TEST_CKW_IN_DF
-    const IGpuCkwComponentDriver       *ckw_component_driver() const override;
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
+    const IGpuCkwComponentDriver *ckw_component_driver() const override;
     /** Get component type */
     GpuComponentType type() const override
     {
@@ -151,13 +143,9 @@ public:
     }
 
 private:
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-    std::unique_ptr<ClTemplateDirectConv2d> _component_writer;
-#else  // ACL_INTERNAL_TEST_CKW_IN_DF
     std::unique_ptr<GpuCkwDirectConv2d> _component_writer;
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
 };
 } // namespace dynamic_fusion
 } // namespace experimental
 } // namespace arm_compute
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDIRECTCONV2D */
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDIRECTCONV2D_H
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp
index 5b136427e4..209c73dbee 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,11 +26,7 @@
 #include "arm_compute/core/Validate.h"
 
 #include "src/core/CL/CLValidate.h"
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.h"
-#else //ACL_INTERNAL_TEST_CKW_IN_DF
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h"
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 
 namespace arm_compute
 {
@@ -117,19 +113,11 @@ ClComponentElementwiseBinary::ClComponentElementwiseBinary(ComponentId
                                                            const ArgumentPack<ITensorInfo> &tensors,
                                                            const Attributes                &attributes)
     : IGpuKernelComponent{id, properties, tensors},
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-      _component_writer{std::make_unique<ClTemplateElementwiseBinary>(id, tensors, attributes)}
-#else  //ACL_INTERNAL_TEST_CKW_IN_DF
       _component_writer{std::make_unique<GpuCkwElementwiseBinary>(id, tensors, attributes)}
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 {
 }
 
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-const IGpuTemplateComponentWriter *ClComponentElementwiseBinary::template_writer() const
-#else  //ACL_INTERNAL_TEST_CKW_IN_DF
 const IGpuCkwComponentDriver *ClComponentElementwiseBinary::ckw_component_driver() const
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 {
     return _component_writer.get();
 }
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h
index 7589b9732c..a4395a6219 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTELEMENTWISEBINARY
-#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTELEMENTWISEBINARY
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTELEMENTWISEBINARY_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTELEMENTWISEBINARY_H
 
 #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
 #include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h"
@@ -40,11 +40,7 @@ template <typename T>
 class ArgumentPack;
 
 /** Forward declaration */
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-class ClTemplateElementwiseBinary;
-#else  //ACL_INTERNAL_TEST_CKW_IN_DF
 class GpuCkwElementwiseBinary;
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 
 class ClComponentElementwiseBinary final : public IGpuKernelComponent
 {
@@ -105,12 +101,7 @@ public:
     /** Allow instances of this class to be moved */
     ClComponentElementwiseBinary &operator=(ClComponentElementwiseBinary &&component) = default;
     /** Get writer for the component */
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-    const IGpuTemplateComponentWriter *template_writer() const override;
-#else  //ACL_INTERNAL_TEST_CKW_IN_DF
-    const IGpuCkwComponentDriver            *ckw_component_driver() const override;
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
-
+    const IGpuCkwComponentDriver *ckw_component_driver() const override;
     /** Get component type */
     GpuComponentType type() const override
     {
@@ -118,13 +109,9 @@ public:
     }
 
 private:
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-    std::unique_ptr<ClTemplateElementwiseBinary> _component_writer;
-#else  //ACL_INTERNAL_TEST_CKW_IN_DF
     std::unique_ptr<GpuCkwElementwiseBinary> _component_writer;
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 };
 } // namespace dynamic_fusion
 } // namespace experimental
 } // namespace arm_compute
-#endif /* ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTELEMENTWISEBINARY */
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTELEMENTWISEBINARY_H
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.cpp
deleted file mode 100644
index 27c13bd654..0000000000
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.cpp
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright (c) 2022-2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/dynamic_fusion/sketch/attributes/SoftmaxAttributes.h"
-
-#include "src/core/CL/CLValidate.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-Status ClComponentLogits1DMaxShiftExpSum::validate(const Properties                &properties,
-                                                   const ArgumentPack<ITensorInfo> &tensors,
-                                                   const Attributes                &attributes)
-{
-    ARM_COMPUTE_UNUSED(properties, attributes);
-
-    const ITensorInfo *src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
-    const ITensorInfo *sum = tensors.get_const_tensor(TensorType::ACL_DST_0);
-    const ITensorInfo *dst = tensors.get_const_tensor(TensorType::ACL_DST_1);
-
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(sum);
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst);
-
-    // 1. Check validity
-    // All tensor infos are initialized
-    ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() == 0);
-    ARM_COMPUTE_RETURN_ERROR_ON(sum->tensor_shape().total_size() == 0);
-    ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
-
-    // Check for mismatches in shapes and data types
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst, sum);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
-
-    // Device requirements are met
-    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
-
-    // 2. Check support level
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
-
-    return Status{};
-}
-
-ClComponentLogits1DMaxShiftExpSum::ClComponentLogits1DMaxShiftExpSum(ComponentId                      id,
-                                                                     const Properties                &properties,
-                                                                     const ArgumentPack<ITensorInfo> &tensors,
-                                                                     const Attributes                &attributes)
-    : IGpuKernelComponent{id, properties, tensors},
-      _component_writer{std::make_unique<ClTemplateLogits1DMaxShiftExpSum>(id, tensors, attributes)}
-{
-}
-
-ClComponentLogits1DMaxShiftExpSum::~ClComponentLogits1DMaxShiftExpSum()
-{
-}
-
-const IGpuTemplateComponentWriter *ClComponentLogits1DMaxShiftExpSum::template_writer() const
-{
-    return _component_writer.get();
-}
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.h
deleted file mode 100644
index 91ab5de3b5..0000000000
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.h
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Copyright (c) 2022-2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTLOGITS1DMAXSHIFTEXPSUM
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTLOGITS1DMAXSHIFTEXPSUM
-
-#include "arm_compute/dynamic_fusion/sketch/attributes/SoftmaxAttributes.h"
-
-#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
-
-namespace arm_compute
-{
-/** Forward declaration */
-class ITensorInfo;
-namespace experimental
-{
-namespace dynamic_fusion
-{
-/** Forward declaration */
-template <typename T>
-class ArgumentPack;
-
-/** Forward declaration */
-class ClTemplateLogits1DMaxShiftExpSum;
-
-/** Component to calculate max-shifted exponentials and their sum
- *
- *  1D example:
- *      input:  [x1, x2, ... , xn], shape: (1 x d)
- *
- *      Let max(x1...xn) = m
- *
- *      (output) sum: [exp(x1-m) + ... + exp(xn-m)], shape: (1 x 1)
- *      (output) dst: [exp(x1-m) ... exp(xn-m)], shape: (1 x d)
- *
- *  This component is used by the softmax operator. The subsequent
- *  operation normalizes dst with sum, therefore the max-shifting
- *  since exp(m) will be cancelled in numerator and denominator.
-*/
-class ClComponentLogits1DMaxShiftExpSum final : public IGpuKernelComponent
-{
-public:
-    /** Attributes are a set of backend-agnostic parameters that define what a component does */
-    using Attributes = SoftmaxAttributes;
-
-    /** Validate the component
-     *
-     * @param[in] properties Component properties @ref Properties
-     * @param[in] tensors    Tensor arguments to the component
-     * @param[in] attributes Component attributes @ref Attributes
-     *
-     * @return Status        Validation results
-     *
-     * Tensor argument names:
-     * - ACL_SRC_0: Input
-     * - ACL_DST_0: Output
-     * - ACL_DST_1: Output
-     *
-     * Tensor argument constness:
-     * - ACL_SRC_0: Const
-     * - ACL_DST_0: Const
-     * - ACL_DST_1: Const
-     *
-     * Valid data layouts:
-     * - All
-     *
-     ** Valid data type configurations:
-     * |ACL_SRC_0  |ACL_DST_0  |ACL_DST_1  |
-     * |:----------|:----------|:----------|
-     * |F16        | F16       | F16       |
-     * |F32        | F32       | F32       |
-     */
-    static Status
-    validate(const Properties &properties, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes);
-
-    /** Constructor
-     *
-     * Similar to @ref ClComponentLogits1DMaxShiftExpSum::validate()
-     */
-    ClComponentLogits1DMaxShiftExpSum(ComponentId                      id,
-                                      const Properties                &properties,
-                                      const ArgumentPack<ITensorInfo> &tensors,
-                                      const Attributes                &attributes);
-
-    /** Destructor */
-    ~ClComponentLogits1DMaxShiftExpSum() override;
-    /** Prevent instances of this class from being copy constructed */
-    ClComponentLogits1DMaxShiftExpSum(const ClComponentLogits1DMaxShiftExpSum &component) = delete;
-    /** Prevent instances of this class from being copied */
-    ClComponentLogits1DMaxShiftExpSum &operator=(const ClComponentLogits1DMaxShiftExpSum &component) = delete;
-    /** Allow instances of this class to be move constructed */
-    ClComponentLogits1DMaxShiftExpSum(ClComponentLogits1DMaxShiftExpSum &&component) = default;
-    /** Allow instances of this class to be moved */
-    ClComponentLogits1DMaxShiftExpSum &operator=(ClComponentLogits1DMaxShiftExpSum &&component) = default;
-    /** Get template writer for the component */
-    const IGpuTemplateComponentWriter *template_writer() const override;
-    /** Get component type */
-    GpuComponentType type() const override
-    {
-        return GpuComponentType::Unfusable;
-    }
-
-private:
-    std::unique_ptr<ClTemplateLogits1DMaxShiftExpSum> _component_writer;
-};
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTLOGITS1DMAXSHIFTEXPSUM */
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.cpp
deleted file mode 100644
index fb2544385c..0000000000
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.cpp
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/dynamic_fusion/sketch/attributes/SoftmaxAttributes.h"
-
-#include "src/core/CL/CLValidate.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-Status ClComponentLogits1DNorm::validate(const Properties                &properties,
-                                         const ArgumentPack<ITensorInfo> &tensors,
-                                         const Attributes                &attributes)
-{
-    ARM_COMPUTE_UNUSED(properties, attributes);
-
-    const ITensorInfo *src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
-    const ITensorInfo *sum = tensors.get_const_tensor(TensorType::ACL_SRC_1);
-    const ITensorInfo *dst = tensors.get_const_tensor(TensorType::ACL_DST_0);
-
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(sum);
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst);
-
-    // 1. Check validity
-    // All tensor infos are initialized
-    ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() == 0);
-    ARM_COMPUTE_RETURN_ERROR_ON(sum->tensor_shape().total_size() == 0);
-    ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
-
-    // Check for mismatches in shapes and data types
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst, sum);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
-
-    ARM_COMPUTE_RETURN_ERROR_ON(attributes.is_log_softmax() && !is_data_type_float(src->data_type()));
-
-    // Device requirements are met
-    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
-
-    // 2. Check support level
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
-
-    return Status{};
-}
-
-ClComponentLogits1DNorm::ClComponentLogits1DNorm(ComponentId                      id,
-                                                 const Properties                &properties,
-                                                 const ArgumentPack<ITensorInfo> &tensors,
-                                                 const Attributes                &attributes)
-    : IGpuKernelComponent{id, properties, tensors},
-      _component_writer{std::make_unique<ClTemplateLogits1DNorm>(id, tensors, attributes)}
-{
-}
-
-ClComponentLogits1DNorm::~ClComponentLogits1DNorm()
-{
-}
-
-const IGpuTemplateComponentWriter *ClComponentLogits1DNorm::template_writer() const
-{
-    return _component_writer.get();
-}
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.h
deleted file mode 100644
index 74c0273604..0000000000
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.h
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTLOGITS1DNORM
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTLOGITS1DNORM
-
-#include "arm_compute/dynamic_fusion/sketch/attributes/SoftmaxAttributes.h"
-
-#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
-
-namespace arm_compute
-{
-/** Forward declaration */
-class ITensorInfo;
-namespace experimental
-{
-namespace dynamic_fusion
-{
-/** Forward declaration */
-template <typename T>
-class ArgumentPack;
-
-/** Forward declaration */
-class ClTemplateLogits1DNorm;
-
-/** Component to calculate the final step of the Softmax Layer
- * where each logit value is multiplied by the inverse of the sum of the logits.
- *
- *  1D example:
- *
- *      (input)  src: [x1 x2 ... xn], shape: (1 x d)
- *      (input)  sum: [x1 + x2 + ... + xn], shape: (1 x 1)
- *      (output) dst: [x1/sum x2/sum ... xn/sum], shape: (1 x d)
- *
- *  This component is used by the softmax operator to get the final result.
-*/
-class ClComponentLogits1DNorm final : public IGpuKernelComponent
-{
-public:
-    /** Attributes are a set of backend-agnostic parameters that define what a component does */
-    using Attributes = SoftmaxAttributes;
-
-    /** Validate the component
-     *
-     * @param[in] properties Component properties @ref Properties
-     * @param[in] tensors    Tensor arguments to the component
-     * @param[in] attributes Component attributes @ref Attributes
-     *
-     * @return Status        Validation results
-     *
-     * Tensor argument names:
-     * - ACL_SRC_0: Input
-     * - ACL_SRC_1: Input
-     * - ACL_DST_0: Output
-     *
-     * Tensor argument constness:
-     * - ACL_SRC_0: Const
-     * - ACL_SRC_1: Const
-     * - ACL_DST_0: Const
-     *
-     * Valid data layouts:
-     * - All
-     *
-     ** Valid data type configurations:
-     * |ACL_SRC_0  |ACL_SRC_1  |ACL_DST_0  |
-     * |:----------|:----------|:----------|
-     * |F16        | F16       | F16       |
-     * |F32        | F32       | F32       |
-     */
-    static Status
-    validate(const Properties &properties, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes);
-
-    /** Constructor
-     *
-     * Similar to @ref ClComponentLogits1DNorm::validate()
-     */
-    ClComponentLogits1DNorm(ComponentId                      id,
-                            const Properties                &properties,
-                            const ArgumentPack<ITensorInfo> &tensors,
-                            const Attributes                &attributes);
-
-    /** Destructor */
-    ~ClComponentLogits1DNorm() override;
-    /** Prevent instances of this class from being copy constructed */
-    ClComponentLogits1DNorm(const ClComponentLogits1DNorm &component) = delete;
-    /** Prevent instances of this class from being copied */
-    ClComponentLogits1DNorm &operator=(const ClComponentLogits1DNorm &component) = delete;
-    /** Allow instances of this class to be move constructed */
-    ClComponentLogits1DNorm(ClComponentLogits1DNorm &&component) = default;
-    /** Allow instances of this class to be moved */
-    ClComponentLogits1DNorm &operator=(ClComponentLogits1DNorm &&component) = default;
-    /** Get template writer for the component */
-    const IGpuTemplateComponentWriter *template_writer() const override;
-    /** Get component type */
-    GpuComponentType type() const override
-    {
-        return GpuComponentType::Unfusable;
-    }
-
-private:
-    std::unique_ptr<ClTemplateLogits1DNorm> _component_writer;
-};
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTLOGITS1DNORM */
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.cpp
index f238d42d98..53ac8da41f 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,7 +21,6 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifdef ACL_INTERNAL_TEST_CKW_IN_DF
 
 #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.h"
 
@@ -147,5 +146,3 @@ const IGpuCkwComponentDriver *ClComponentMatMul::ckw_component_driver() const
 } // namespace dynamic_fusion
 } // namespace experimental
 } // namespace arm_compute
-
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp
index 5544963b3f..6e7243dc04 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp
@@ -30,7 +30,6 @@
 
 #include "src/core/CL/CLValidate.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.h"
 #include "src/dynamic_fusion/utils/Utils.h"
 
 #include <memory>
@@ -93,27 +92,16 @@ ClComponentPool2d::ClComponentPool2d(ComponentId                      id,
                                      const Attributes                &attributes,
                                      const Settings                  &settings)
     : IGpuKernelComponent{id, properties, tensors},
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-      _component_writer{std::make_unique<ClTemplatePool2d>(id, tensors, attributes, settings)}
-#else  //ACL_INTERNAL_TEST_CKW_IN_DF
       _component_writer{std::make_unique<GpuCkwPool2d>(id, tensors, attributes, settings)}
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 {
 }
 ClComponentPool2d::~ClComponentPool2d()
 {
 }
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-const IGpuTemplateComponentWriter *ClComponentPool2d::template_writer() const
-{
-    return _component_writer.get();
-}
-#else  //ACL_INTERNAL_TEST_CKW_IN_DF
 const IGpuCkwComponentDriver *ClComponentPool2d::ckw_component_driver() const
 {
     return _component_writer.get();
 }
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 } // namespace dynamic_fusion
 } // namespace experimental
 } // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h
index 98fed65004..d33e601f18 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -42,11 +42,7 @@ class ArgumentPack;
 class Pool2dAttributes;
 
 /** Forward declaration */
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-class ClTemplatePool2d;
-#else  // ACL_INTERNAL_TEST_CKW_IN_DF
 class GpuCkwPool2d;
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
 
 class ClComponentPool2d final : public IGpuKernelComponent
 {
@@ -116,13 +112,9 @@ public:
 
     /** Allow instances of this class to be moved */
     ClComponentPool2d &operator=(ClComponentPool2d &&component) = default;
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-    /** Get template writer for the component */
-    const IGpuTemplateComponentWriter *template_writer() const override;
-#else  // ACL_INTERNAL_TEST_CKW_IN_DF
+
     /** Get GPU kernel writer for the component */
     const IGpuCkwComponentDriver *ckw_component_driver() const override;
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
 
     /** Get component type */
     GpuComponentType type() const override
@@ -131,11 +123,7 @@ public:
     }
 
 private:
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-    std::unique_ptr<ClTemplatePool2d> _component_writer;
-#else  // ACL_INTERNAL_TEST_CKW_IN_DF
     std::unique_ptr<GpuCkwPool2d> _component_writer;
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
 };
 } // namespace dynamic_fusion
 } // namespace experimental
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.cpp
index 0ece9de970..dce85c424e 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,7 +27,6 @@
 #include "arm_compute/core/Validate.h"
 
 #include "src/core/CL/CLValidate.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.h"
 
 namespace arm_compute
 {
@@ -54,15 +53,16 @@ Status ClComponentReshape::validate(const ArgumentPack<ITensorInfo> &tensors)
 ClComponentReshape::ClComponentReshape(ComponentId                      id,
                                        const Properties                &properties,
                                        const ArgumentPack<ITensorInfo> &tensors)
-    : IGpuKernelComponent{id, properties, tensors}, _component_writer{std::make_unique<ClTemplateReshape>(id, tensors)}
+    : IGpuKernelComponent{id, properties, tensors}
 {
 }
 ClComponentReshape::~ClComponentReshape()
 {
 }
-const IGpuTemplateComponentWriter *ClComponentReshape::template_writer() const
+const IGpuCkwComponentDriver *ClComponentReshape::ckw_component_driver() const
 {
-    return _component_writer.get();
+    /* NOT IMPLEMENTED: no CKW driver is created for this component, so nullptr is returned. NOTE(review): confirm callers handle a null driver. */
+    return nullptr;
 }
 } // namespace dynamic_fusion
 } // namespace experimental
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h
index 78163d6603..fd0f966da1 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTRESHAPE
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTRESHAPE
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTRESHAPE_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTRESHAPE_H
 
 #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
 
@@ -85,8 +85,8 @@ public:
     ClComponentReshape(ClComponentReshape &&component) = default;
     /** Allow instances of this class to be moved */
     ClComponentReshape &operator=(ClComponentReshape &&component) = default;
-    /** Get template writer for the component */
-    const IGpuTemplateComponentWriter *template_writer() const override;
+    /** Get writer for the component */
+    const IGpuCkwComponentDriver *ckw_component_driver() const override;
     /** Get component type */
     GpuComponentType type() const override
     {
@@ -94,10 +94,9 @@ public:
     }
 
 private:
-    std::unique_ptr<ClTemplateReshape> _component_writer;
 };
 } // namespace dynamic_fusion
 } // namespace experimental
 } // namespace arm_compute
 
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTRESHAPE */
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTRESHAPE_H
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.cpp
index b05eb04698..411eeca802 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,12 +29,7 @@
 #include "src/core/CL/CLValidate.h"
 #include "src/core/utils/ScaleUtils.h"
 #include "src/dynamic_fusion/sketch/ArgumentPack.h"
-
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.h"
-#else // ACL_INTERNAL_TEST_CKW_IN_DF
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.h"
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
 
 namespace arm_compute
 {
@@ -43,11 +38,7 @@ namespace experimental
 namespace dynamic_fusion
 {
 /** Forward declaration */
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-class ClTemplateResize;
-#else  // ACL_INTERNAL_TEST_CKW_IN_DF
 class GpuCkwResize;
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
 
 Status ClComponentResize::validate(const IGpuKernelComponent::Properties &properties,
                                    const ArgumentPack<ITensorInfo>       &tensors,
@@ -82,11 +73,7 @@ ClComponentResize::ClComponentResize(ComponentId                            id,
                                      const ArgumentPack<ITensorInfo>       &tensors,
                                      const ClComponentResize::Attributes   &attributes)
     : IGpuKernelComponent{id, properties, tensors},
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-      _component_writer{std::make_unique<ClTemplateResize>(id, tensors, attributes)}
-#else  // ACL_INTERNAL_TEST_CKW_IN_DF
       _component_writer{std::make_unique<GpuCkwResize>(id, tensors, attributes)}
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
 {
 }
 
@@ -94,11 +81,7 @@ ClComponentResize::~ClComponentResize()
 {
 }
 
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-const IGpuTemplateComponentWriter *ClComponentResize::template_writer() const
-#else  // ACL_INTERNAL_TEST_CKW_IN_DF
 const IGpuCkwComponentDriver *ClComponentResize::ckw_component_driver() const
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
 {
     return _component_writer.get();
 }
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h
index 29276c3257..9a1169c45f 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,8 +22,8 @@
  * SOFTWARE.
  */
 
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTRESIZE
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTRESIZE
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTRESIZE_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTRESIZE_H
 
 #include "arm_compute/dynamic_fusion/sketch/attributes/ResizeAttributes.h"
 
@@ -42,11 +42,7 @@ template <typename T>
 class ArgumentPack;
 
 /** Forward declaration */
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-class ClTemplateResize;
-#else  // ACL_INTERNAL_TEST_CKW_IN_DF
 class GpuCkwResize;
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
 
 class ClComponentResize final : public IGpuKernelComponent
 {
@@ -111,11 +107,7 @@ public:
     ClComponentResize &operator=(ClComponentResize &&component) = default;
 
     /** Get writer for the component */
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-    const IGpuTemplateComponentWriter *template_writer() const override;
-#else  // ACL_INTERNAL_TEST_CKW_IN_DF
     const IGpuCkwComponentDriver *ckw_component_driver() const override;
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
 
     /** Get component type */
     GpuComponentType type() const override
@@ -124,15 +116,11 @@ public:
     }
 
 private:
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-    std::unique_ptr<ClTemplateResize> _component_writer;
-#else  // ACL_INTERNAL_TEST_CKW_IN_DF
     std::unique_ptr<GpuCkwResize> _component_writer;
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
 };
 
 } // namespace dynamic_fusion
 } // namespace experimental
 } // namespace arm_compute
 
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTRESIZE */
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTRESIZE_H
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp
index dcbecaff35..3db6c5cd2d 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,11 +24,7 @@
 #include "ClComponentStore.h"
 
 #include "src/dynamic_fusion/sketch/ArgumentPack.h"
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h"
-#else //ACL_INTERNAL_TEST_CKW_IN_DF
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h"
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 
 #include <memory>
 
@@ -46,22 +42,13 @@ Status ClComponentStore::validate(const Properties &properties, const ArgumentPa
 ClComponentStore::ClComponentStore(ComponentId                      id,
                                    const Properties                &properties,
                                    const ArgumentPack<ITensorInfo> &tensors)
-    : IGpuKernelComponent{id, properties, tensors},
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-      _component_writer{std::make_unique<ClTemplateStore>(id, tensors)}
-#else  //ACL_INTERNAL_TEST_CKW_IN_DF
-      _component_writer{std::make_unique<GpuCkwStore>(id, tensors)}
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
+    : IGpuKernelComponent{id, properties, tensors}, _component_writer{std::make_unique<GpuCkwStore>(id, tensors)}
 {
 }
 ClComponentStore::~ClComponentStore()
 {
 }
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-const IGpuTemplateComponentWriter *ClComponentStore::template_writer() const
-#else  //ACL_INTERNAL_TEST_CKW_IN_DF
 const IGpuCkwComponentDriver *ClComponentStore::ckw_component_driver() const
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 {
     return _component_writer.get();
 }
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h
index 948785c480..2c1dd0f6fc 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTSTORE
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTSTORE
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTSTORE_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTSTORE_H
 
 #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
 
@@ -39,11 +39,7 @@ namespace dynamic_fusion
 /** Forward declaration */
 template <typename T>
 class ArgumentPack;
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-class ClTemplateStore;
-#else  //ACL_INTERNAL_TEST_CKW_IN_DF
 class GpuCkwStore;
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
 
 class ClComponentStore final : public IGpuKernelComponent
 {
@@ -88,11 +84,7 @@ public:
     /** Allow instances of this class to be moved */
     ClComponentStore &operator=(ClComponentStore &&component) = default;
     /** Get writer for the component */
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-    const IGpuTemplateComponentWriter *template_writer() const override;
-#else  //ACL_INTERNAL_TEST_CKW_IN_DF
     const IGpuCkwComponentDriver *ckw_component_driver() const override;
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
     /** Get component type */
     GpuComponentType type() const override
     {
@@ -100,13 +92,9 @@ public:
     }
 
 private:
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-    std::unique_ptr<ClTemplateStore> _component_writer;
-#else  //ACL_INTERNAL_TEST_CKW_IN_DF
-    std::unique_ptr<GpuCkwStore>  _component_writer;
-#endif //ACL_INTERNAL_TEST_CKW_IN_DF
+    std::unique_ptr<GpuCkwStore> _component_writer;
 };
 } // namespace dynamic_fusion
 } // namespace experimental
 } // namespace arm_compute
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTSTORE */
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTSTORE_H
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuClamp.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuClamp.cpp
index 697b7d4e1f..4d6e7f81bb 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/GpuClamp.cpp
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuClamp.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,7 +30,6 @@
 #include "src/dynamic_fusion/sketch/ArgumentPack.h"
 #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.h"
 
 namespace arm_compute
 {
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuMatMul.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuMatMul.cpp
index e24629a036..2997b28ec1 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/GpuMatMul.cpp
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuMatMul.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,7 +21,6 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifdef ACL_INTERNAL_TEST_CKW_IN_DF
 
 #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMatMul.h"
 
@@ -244,4 +243,3 @@ ITensorInfo *GpuMatMul::create_op(GpuWorkloadSketch &sketch,
 } // namespace dynamic_fusion
 } // namespace experimental
 } // namespace arm_compute
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.cpp
index 431c9110fc..d385752201 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.cpp
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.cpp
@@ -28,8 +28,6 @@
 #include "src/common/utils/Log.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/dynamic_fusion/sketch/ArgumentPack.h"
-#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.h"
-#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuOperatorProperties.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
 
@@ -88,9 +86,8 @@ Status GpuSoftmax::is_supported_op(const GpuWorkloadContext &context,
         arguments_norm.add_const_tensor(ACL_SRC_1, &sum);
         arguments_norm.add_const_tensor(ACL_DST_0, &dst_info_to_validate);
 
-        ARM_COMPUTE_RETURN_ON_ERROR(
-            ClComponentLogits1DMaxShiftExpSum::validate(properties, arguments_exp_sum, attributes));
-        ARM_COMPUTE_RETURN_ON_ERROR(ClComponentLogits1DNorm::validate(properties, arguments_norm, attributes));
+        ARM_COMPUTE_UNUSED(properties, attributes);
+        return Status(ErrorCode::RUNTIME_ERROR, "GpuSoftmax is not implemented");
     }
     else
     {
@@ -177,8 +174,8 @@ void GpuSoftmax::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, ITensorI
             arguments_norm.add_const_tensor(ACL_SRC_1, sum);
             arguments_norm.add_const_tensor(ACL_DST_0, dst);
 
-            comp_graph.add_new_component<ClComponentLogits1DMaxShiftExpSum>(properties, arguments_exp_sum, attributes);
-            comp_graph.add_new_component<ClComponentLogits1DNorm>(properties, arguments_norm, attributes);
+            // Add to component graph -- NOT IMPLEMENTED
+            ARM_COMPUTE_UNUSED(comp_graph, attributes);
         }
     }
     else
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuTanh.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuTanh.cpp
index bf0f274c5c..b9d01966b3 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/GpuTanh.cpp
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuTanh.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -31,7 +31,6 @@
 #include "src/dynamic_fusion/sketch/ArgumentPack.h"
 #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.h"
 
 namespace arm_compute
 {
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp
deleted file mode 100644
index 775b0a0c8c..0000000000
--- a/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Copyright (c) 2022-2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "GpuKernelVariableTable.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/ITensorInfo.h"
-
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-void GpuKernelVariableTable::declare_variable(const GpuKernelComponentGroup &comp_group,
-                                              const ITensorInfo             *tensor,
-                                              GpuKernelArgumentInfo          argument_info,
-                                              const std::string             &alias)
-{
-    ARM_COMPUTE_ERROR_ON_MSG(!tensor->has_valid_id(), "Tensor info with valid id expected");
-
-    // Do not re-declare if the variable associated with the tensor has already been declared
-    auto it = _vars.find(tensor->id());
-
-    if (it != _vars.end())
-    {
-        ARM_COMPUTE_ERROR_ON(!(it->second.kernel_argument_info == argument_info));
-        return;
-    }
-
-    const auto target = comp_group.get_tile_for_tensor(tensor);
-
-    if (target != tensor)
-    {
-        // If the tensor uses a shared tile, don't declare another variable.
-        it = _vars.find(target->id());
-
-        ARM_COMPUTE_ERROR_ON_MSG(it == _vars.end(), "The variable used for this tensor must have been declared.");
-
-        _vars[tensor->id()] = it->second;
-    }
-    else
-    {
-        // Declare variable associated with the tensor
-        std::stringstream ss;
-        ss << alias << "_t" << abs(tensor->id());
-        const auto     uniq_name = ss.str();
-        TensorVariable var{tensor->id(), uniq_name, argument_info};
-
-        _vars.emplace(tensor->id(), var);
-    }
-}
-
-GpuKernelVariableTable::TensorVariable GpuKernelVariableTable::get_variable(const ITensorInfo *tensor) const
-{
-    const auto var = _vars.at(tensor->id());
-    return var;
-}
-
-GpuKernelVariableTable::VariableList
-GpuKernelVariableTable::get_variable_list(const std::vector<const ITensorInfo *> &tensors) const
-{
-    VariableList vars{};
-    for (const auto &tensor : tensors)
-    {
-        if (!tensor->has_valid_id())
-        {
-            continue;
-        }
-        vars.push_back(get_variable(tensor));
-    }
-    return vars;
-}
-
-TagVal::TagVal(const GpuKernelVariableTable::TensorVariable &var) : value{var.uniq_name}
-{
-}
-
-TagVal::TagVal(const std::string &val) : value{val}
-{
-}
-
-TagVal::TagVal(const char *val) : value{std::string(val)}
-{
-}
-
-TagVal::TagVal(const DataType &data_type) : value{get_cl_type_from_data_type(data_type)}
-{
-}
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h b/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h
deleted file mode 100644
index c17f131ada..0000000000
--- a/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * Copyright (c) 2022-2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_GPUKERNELVARIABLETABLE
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_GPUKERNELVARIABLETABLE
-
-#include "arm_compute/core/ITensorInfo.h"
-
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
-#include "support/AclRequires.h"
-#include "support/StringSupport.h"
-
-#include <set>
-#include <string>
-#include <type_traits>
-#include <unordered_map>
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-class GpuKernelComponentGroup;
-
-/** A table of all the variables used in the kernel.
- * Each kernel has exactly one variable table.
- */
-class GpuKernelVariableTable
-{
-public:
-    /** A tensor variable whose main purposes are:
-     *  - Hold the newly assigned @ref GpuKernelArgumentInfo for the associated tensor info
-     *  - Hold the generated variable name for the associated tensor info
-     */
-    struct TensorVariable
-    {
-    public:
-        TensorVariable()                                        = default;
-        TensorVariable(const TensorVariable &)                  = default;
-        TensorVariable       &operator=(const TensorVariable &) = default;
-        ITensorInfo::Id       id{ITensorInfo::invalid_tensor_id};
-        std::string           uniq_name{"empty"}; // Unique name, also the final variable name used in the built code
-        GpuKernelArgumentInfo kernel_argument_info{};
-        bool                  has_valid_id() const
-        {
-            return id != ITensorInfo::invalid_tensor_id;
-        }
-    };
-    using VariableList = std::vector<TensorVariable>;
-
-public:
-    /** Declare a @ref TensorVariable for a corresponding tensor info.
-     *
-     * @param[in] comp_group    Component group the tensor belongs to
-     * @param[in] tensor        Tensor info with which the new variable is associated
-     * @param[in] argument_info Kernel argument information
-     * @param[in] alias         Alias for the variable. Will be used as part of the variable name
-     */
-    void declare_variable(const GpuKernelComponentGroup &comp_group,
-                          const ITensorInfo             *tensor,
-                          GpuKernelArgumentInfo          argument_info,
-                          const std::string             &alias = "unnamed");
-    /** Get the @ref TensorVariable associated with @p tensor
-     *
-     * @param[in] tensor Tensor info to be queried
-     *
-     * @return TensorVariable
-     */
-    TensorVariable get_variable(const ITensorInfo *tensor) const;
-    /** Get the @ref TensorVariable list associated with @p tensors
-     * @note Empty tensors are skipped
-     *
-     * @param[in] tensors List of tensor infos to be queried
-     *
-     * @return VariableList
-     */
-    VariableList get_variable_list(const std::vector<const ITensorInfo *> &tensors) const;
-
-private:
-    std::map<ITensorInfo::Id, TensorVariable> _vars{};
-};
-
-/** A tag value will substitute a tag in a string template during its instantiation */
-struct TagVal
-{
-    /** Default constructor */
-    TagVal() = default;
-    /** Construct a @ref TagVal from a @ref GpuKernelVariableTable::TensorVariable */
-    TagVal(const GpuKernelVariableTable::TensorVariable &var);
-    /** Construct a @ref TagVal from an integral type */
-    template <typename T, ARM_COMPUTE_REQUIRES_TA(std::is_integral<T>::value)>
-    TagVal(T val) : value{support::cpp11::to_string(val)}
-    {
-    }
-    /** Construct a @ref TagVal from a string */
-    TagVal(const std::string &val);
-    /** Construct a @ref TagVal from a c-style string */
-    TagVal(const char *val);
-    /** Construct a @ref TagVal from a @ref DataType */
-    TagVal(const DataType &data_type);
-    /** Get the value of the TagVal as a converted string */
-    std::string value{};
-};
-
-/** A tag used in a string template is a placeholder string to be substituted by real values during template instantiation */
-using Tag = std::string;
-
-/** Tag lookup table. It is used to instantiate a string template */
-using TagLUT = std::unordered_map<Tag, TagVal>;
-
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_GPUKERNELVARIABLETABLE */
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h b/src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h
deleted file mode 100644
index 9d0b4f592a..0000000000
--- a/src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_IGPUTEMPLATECOMPONENTWRITER
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_IGPUTEMPLATECOMPONENTWRITER
-
-#include "arm_compute/core/CL/CLCompileContext.h"
-#include "arm_compute/core/ITensorInfo.h"
-#include "arm_compute/core/Window.h"
-
-#include "src/dynamic_fusion/sketch/ArgumentPack.h"
-#include "src/dynamic_fusion/sketch/gpu/components/Types.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-/** Forward declaration */
-class GpuKernelComponentGroup;
-class GpuKernelVariableTable;
-
-/** An interface used by @ref ClTemplateWriter to write source code for a kernel component
- */
-class IGpuTemplateComponentWriter
-{
-public:
-    using ComponentGroup = GpuKernelComponentGroup;
-
-    /**For now all kernel intermeditate/destination tensors are expected to be of type Tensor_4D_t_Buffer*/
-    static constexpr GpuKernelArgumentInfo::Type common_tensor_type = GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer;
-
-public:
-    /** Constructor
-     *
-     * @param[in] id      Component id
-     * @param[in] tensors Tensor arguments to the components
-     */
-    IGpuTemplateComponentWriter(ComponentId id, const ArgumentPack<ITensorInfo> &tensors) : _id{id}, _tensors{tensors}
-    {
-    }
-    /** Destructor */
-    virtual ~IGpuTemplateComponentWriter()
-    {
-    }
-    /** Generate kernel component name */
-    virtual std::string get_name() const = 0;
-    /** Generate kernel component code template
-     *
-     * @param[in] comp_group Component group of which the component is a part of
-     *
-     * @return std::string Component code
-     */
-    virtual std::string get_component_code(const ComponentGroup &comp_group) const = 0;
-    /** Declare all variables used by the component in the @p vtable
-     *
-     * @param[out] vtable     Variable table
-     * @param[in]  comp_group Component group of which the component is a part of
-     */
-    virtual void declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const = 0;
-    /** Generate the tag look-up table used to instantiate the component code.
-     *
-     * @param[in] vtable     Variable table
-     * @param[in] comp_group Component group of which the component is a part of
-     *
-     * @return TagLUT  Tag lookup table
-     */
-    virtual TagLUT get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const = 0;
-    /** Generate additional macros used in the component */
-    virtual std::string get_additional_macros() const
-    {
-        return "";
-    }
-    /** Generate the build options used in the component
-     *
-     * @param[in] comp_group Component group of which the component is a part of
-     *
-     * @return CLBuildOptions Build options
-     */
-    virtual CLBuildOptions get_build_options(const ComponentGroup &comp_group) const
-    {
-        ARM_COMPUTE_UNUSED(comp_group);
-        return CLBuildOptions{};
-    }
-    /** Generate the component config id string used for tuning */
-    virtual std::string get_config_id() const
-    {
-        return "";
-    }
-    /** Generate the header list used in the component */
-    virtual std::set<std::string> get_headers_list() const
-    {
-        return std::set<std::string>{};
-    }
-    /** Generate the execution window for the component */
-    virtual Window get_window() const
-    {
-        return Window{};
-    }
-    /** Get tensor arguments */
-    ArgumentPack<ITensorInfo> tensors() const
-    {
-        return _tensors;
-    }
-    /** Get component id */
-    ComponentId id() const
-    {
-        return _id;
-    }
-
-private:
-    ComponentId               _id{-1};
-    ArgumentPack<ITensorInfo> _tensors{};
-};
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_IGPUTEMPLATECOMPONENTWRITER */
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp
deleted file mode 100644
index c165fb5f33..0000000000
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp
+++ /dev/null
@@ -1,181 +0,0 @@
-/*
- * Copyright (c) 2022-2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "ClTemplateActivation.h"
-
-#include "arm_compute/core/utils/ActivationFunctionUtils.h"
-#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
-#include "arm_compute/core/utils/StringUtils.h"
-
-#include "src/core/helpers/WindowHelpers.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-ClTemplateActivation::ClTemplateActivation(ComponentId                      id,
-                                           const ArgumentPack<ITensorInfo> &tensors,
-                                           const Attributes                &attributes)
-    : IGpuTemplateComponentWriter{id, tensors}, _src{}, _dst{}, _attributes{attributes}
-{
-    _src = this->tensors().get_const_tensor(TensorType::ACL_SRC);
-    _dst = this->tensors().get_const_tensor(TensorType::ACL_DST);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
-}
-
-std::string ClTemplateActivation::get_name() const
-{
-    return "activation";
-}
-
-std::string ClTemplateActivation::get_component_code(const ComponentGroup &comp_group) const
-{
-    std::string code;
-    const bool  is_root = (comp_group.get_root_component()->id() == this->id());
-
-    code = R"_(
-//------------------ START KERNEL {{meta_kernel_id}} ---------------------
-)_";
-    if (is_root)
-    {
-        code += R"_(
-// IN(src)              {{src}}
-// OUT(dst, accum)      {{dst}}
-
-TILE({{DATA_TYPE}}, M0, N0, {{src}});
-TILE(uint, M0, 1, g_dst_indirect_y);
-{
-    {{src}}_offset_first_element_in_bytes += g_ind_2 * {{src}}_stride_z;
-
-    T_LOAD({{DATA_TYPE}}, M0, N0, {{TENSOR_TYPE}}, {{src}}, g_ind_0, g_ind_1, 1, {{src}}_stride_y, {{src}});
-
-    T_ACTIVATION({{DATA_TYPE}}, M0, N0, {{ACT}}, {{A_VAL}}, {{B_VAL}}, {{src}}, {{dst}});
-}
-
-LOOP_UNROLLING(int, i, 0, 1, M0,
-{
-    g_dst_indirect_y[i].v = (uint)min((int)(g_ind_1 + i), (int)({{arg_dst}}_w) - 1);
-    g_dst_indirect_y[i].v += (int)(g_ind_2 % {{arg_dst}}_h) * (int)({{arg_dst}}_w);
-    g_dst_indirect_y[i].v += (int)(g_ind_2 / {{arg_dst}}_h) * (int)({{arg_dst}}_w * {{arg_dst}}_h);
-})
-)_";
-    }
-    else
-    {
-        code += R"_(
-// IN/OUT(src, accum)   {{src}}
-
-{
-    T_ACTIVATION({{DATA_TYPE}}, M0, N0, {{ACT}}, {{A_VAL}}, {{B_VAL}}, {{src}}, {{dst}});
-}
-)_";
-    }
-    code += R"_(
-//------------------ END KERNEL {{meta_kernel_id}} ---------------------
-)_";
-    return code;
-}
-
-void ClTemplateActivation::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
-{
-    vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
-                            "src");
-
-    vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
-                            "dst");
-}
-
-TagLUT ClTemplateActivation::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
-{
-    ARM_COMPUTE_UNUSED(comp_group);
-
-    TagLUT lut{};
-    // Arguments and global shared variables
-    lut["src"] = vtable.get_variable(_src);
-    lut["dst"] = vtable.get_variable(_dst);
-
-    const auto dst_argument = vtable.get_variable(comp_group.get_any_dst_tensor());
-    lut["arg_dst"]          = dst_argument.uniq_name;
-
-    // Local build options
-    lut["meta_kernel_id"] = id();
-    lut["DATA_TYPE"]      = get_cl_type_from_data_type(_src->data_type());
-    lut["TENSOR_TYPE"]    = "BUFFER";
-
-    const auto f_act = lower_string(string_from_activation_func(_attributes.activation()));
-
-    lut["ACT"]   = f_act;
-    lut["A_VAL"] = float_to_string_with_full_precision(_attributes.a());
-    lut["B_VAL"] = float_to_string_with_full_precision(_attributes.b());
-
-    return lut;
-}
-
-CLBuildOptions ClTemplateActivation::get_build_options(const ComponentGroup &comp_group) const
-{
-    /// NOTE: For now tile sizes (n0, m0) are set by the execution window. This may change in the future
-    const auto         root_window      = comp_group.get_root_component()->template_writer()->get_window();
-    const unsigned int n0               = root_window.x().step();
-    const unsigned int m0               = root_window.y().step();
-    const unsigned int partial_store_n0 = _dst->dimension(0) % n0;
-
-    CLBuildOptions build_opts;
-    build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
-    build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
-    build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));
-
-    return build_opts;
-}
-
-std::string ClTemplateActivation::get_config_id() const
-{
-    std::string config_id{};
-    config_id += "activation_";
-    config_id += lower_string(string_from_data_type(_src->data_type()));
-    config_id += "_";
-    config_id += support::cpp11::to_string(_src->dimension(0));
-    config_id += "_";
-    config_id += support::cpp11::to_string(_src->dimension(1));
-    return config_id;
-}
-
-std::set<std::string> ClTemplateActivation::get_headers_list() const
-{
-    return std::set<std::string>{"helpers.h", "tile_helpers.h", "activation_float_helpers.h"};
-}
-
-Window ClTemplateActivation::get_window() const
-{
-    ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
-    const unsigned int n0  = adjust_vec_size(16 / _dst->element_size(), _dst->dimension(0));
-    Window             win = calculate_max_window(*_dst, Steps(n0));
-    return win.collapse(win, Window::DimZ);
-}
-
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.h
deleted file mode 100644
index 88ee370342..0000000000
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.h
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright (c) 2022-2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEACTIVATION
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEACTIVATION
-
-#include "arm_compute/core/experimental/Types.h"
-#include "arm_compute/function_info/ActivationLayerInfo.h"
-
-#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-class ClTemplateActivation final : public IGpuTemplateComponentWriter
-{
-public:
-    using Attributes = ClComponentActivation::Attributes;
-
-    /** Constructor
-     *
-     * @param[in] id         Component id
-     * @param[in] tensors    Tensor arguments to the components
-     * @param[in] attributes Component attributes
-     */
-    ClTemplateActivation(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes);
-
-    /** Destructor */
-    ~ClTemplateActivation() override = default;
-
-    /** Prevent instances of this class from being copy constructed */
-    ClTemplateActivation(const ClTemplateActivation &activation) = delete;
-
-    /** Prevent instances of this class from being copied */
-    ClTemplateActivation &operator=(const ClTemplateActivation &activation) = delete;
-
-    /** Allow instances of this class to be move constructed */
-    ClTemplateActivation(ClTemplateActivation &&activation) = default;
-
-    /** Allow instances of this class to be moved */
-    ClTemplateActivation &operator=(ClTemplateActivation &&activation) = default;
-
-    /** Generate kernel component name */
-    std::string get_name() const override;
-
-    /** Generate kernel component code template
-     *
-     * @param[in] comp_group Component group of which the component is a part of
-     *
-     * @return std::string Component code
-     */
-    std::string get_component_code(const ComponentGroup &comp_group) const override;
-
-    /** Declare all variables used by the component in the @p vtable
-     *
-     * @param[out] vtable     Variable table
-     * @param[in]  comp_group Component group of which the component is a part of
-     */
-    void declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
-
-    /** Generate the tag look-up table used to instantiate the component code.
-     *
-     * @param[in] vtable     Variable table
-     * @param[in] comp_group Component group of which the component is a part of
-     *
-     * @return TagLUT  Tag lookup table
-     */
-    TagLUT get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
-
-    /** Generate the build options used in the component
-     *
-     * @param[in] comp_group Component group of which the component is a part of
-     *
-     * @return CLBuildOptions Build options
-     */
-    CLBuildOptions get_build_options(const ComponentGroup &comp_group) const override;
-
-    /** Generate the component config id string used for tuning */
-    std::string get_config_id() const override;
-
-    /** Generate the header list used in the component */
-    std::set<std::string> get_headers_list() const override;
-
-    /** Generate the execution window for the component */
-    Window get_window() const override;
-
-private:
-    const ITensorInfo *_src;
-    const ITensorInfo *_dst;
-    Attributes         _attributes;
-};
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEACTIVATION */
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp
deleted file mode 100644
index 0da3a73801..0000000000
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp
+++ /dev/null
@@ -1,212 +0,0 @@
-/*
- * Copyright (c) 2022-2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "ClTemplateCast.h"
-
-#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
-#include "arm_compute/core/utils/StringUtils.h"
-
-#include "src/core/helpers/WindowHelpers.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-ClTemplateCast::ClTemplateCast(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes)
-    : IGpuTemplateComponentWriter{id, tensors}, _src{}, _dst{}, _attributes{attributes}
-{
-    _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
-    _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
-
-    ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
-}
-
-std::string ClTemplateCast::get_name() const
-{
-    const size_t src_size = data_size_from_type(_src->data_type());
-    const size_t dst_size = data_size_from_type(_dst->data_type());
-
-    return (src_size >= dst_size) ? "cast_down" : "cast_up";
-}
-
-std::string ClTemplateCast::get_component_code(const ComponentGroup &comp_group) const
-{
-    ARM_COMPUTE_UNUSED(comp_group);
-
-    const std::string kernel_name = get_name();
-    const auto        is_root     = (comp_group.get_root_component()->id() == this->id());
-
-    std::string code = R"_(
-//------------------ START KERNEL {{meta_kernel_id}} CAST ---------------------
-)_";
-
-    if (is_root)
-    {
-        code += R"_(
-// IN_0(src)            {{src}}
-// OUT(dst, accum)      {{dst}}
-
-TILE(uint, M0, 1, g_dst_indirect_y);
-{
-    {{src}}_offset_first_element_in_bytes += get_global_id(2) * {{src}}_stride_z;
-
-    TILE({{DATA_TYPE_IN}}, M0, N0, {{tmp}});
-    T_LOAD({{DATA_TYPE_IN}}, M0, N0, BUFFER, {{src}}, g_ind_0, g_ind_1, 1, {{src}}_stride_y, {{tmp}});
-)_";
-    }
-
-    code += R"_(
-    LOOP_UNROLLING(int, m0, 0, 1, M0,
-    {
-)_";
-
-    if (kernel_name == "cast_down" && is_data_type_quantized(_src->data_type()))
-    {
-        code += R"_(
-    {{tmp}}[m0].v ^= (VEC_DATA_TYPE({{DATA_TYPE_IN}}, N0))0x80;
-)_";
-    }
-
-    if (kernel_name == "cast_down" &&
-        (is_data_type_float(_src->data_type()) || _attributes.convert_policy() == ConvertPolicy::SATURATE))
-    {
-        code += R"_(
-    {{dst}}[m0].v = CONVERT_SAT({{tmp}}[m0].v, VEC_DATA_TYPE({{DATA_TYPE_OUT}}, N0));
-)_";
-    }
-    else
-    {
-        code += R"_(
-    {{dst}}[m0].v = CONVERT({{tmp}}[m0].v, VEC_DATA_TYPE({{DATA_TYPE_OUT}}, N0));
-)_";
-    }
-
-    code += R"_(
-    })
-)_";
-
-    if (is_root)
-    {
-        code += R"_(
-    LOOP_UNROLLING(int, i, 0, 1, M0,
-    {
-        g_dst_indirect_y[i].v = (uint)min((int)(g_ind_1 + i), (int)({{arg_dst}}_w) - 1);
-        g_dst_indirect_y[i].v += (int)(g_ind_2 % {{arg_dst}}_h) * (int)({{arg_dst}}_w);
-        g_dst_indirect_y[i].v += (int)(g_ind_2 / {{arg_dst}}_h) * (int)({{arg_dst}}_w * {{arg_dst}}_h);
-    })
-}
-)_";
-    }
-
-    code += R"_(
-//------------------ END KERNEL {{meta_kernel_id}} CAST ---------------------
-)_";
-
-    return code;
-}
-
-void ClTemplateCast::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
-{
-    vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
-                            "src");
-
-    vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
-                            "dst");
-}
-
-TagLUT ClTemplateCast::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
-{
-    const auto is_root = (comp_group.get_root_component()->id() == this->id());
-
-    TagLUT lut{};
-
-    // Arguments and global shared variables
-    lut["src"] = vtable.get_variable(_src);
-    lut["dst"] = vtable.get_variable(_dst);
-    lut["tmp"] = (is_root) ? lut["src"].value + "_in_data" : lut["src"];
-
-    const auto dst_argument = vtable.get_variable(comp_group.get_any_dst_tensor());
-    lut["arg_dst"]          = dst_argument.uniq_name;
-
-    // Local build options
-    lut["meta_kernel_id"] = id();
-
-    lut["DATA_TYPE_IN"]  = get_cl_type_from_data_type(_src->data_type());
-    lut["DATA_TYPE_OUT"] = get_cl_type_from_data_type(_dst->data_type());
-
-    return lut;
-}
-
-CLBuildOptions ClTemplateCast::get_build_options(const ComponentGroup &comp_group) const
-{
-    ARM_COMPUTE_UNUSED(comp_group);
-
-    const auto         root_window = comp_group.get_root_component()->template_writer()->get_window();
-    const unsigned int n0          = root_window.x().step();
-    const unsigned int m0          = root_window.y().step();
-
-    // Set build options
-    CLBuildOptions build_opts{};
-    build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
-    build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(_src->dimension(0) % n0));
-    build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
-
-    return build_opts;
-}
-
-std::string ClTemplateCast::get_config_id() const
-{
-    std::string config_id{};
-
-    config_id += "_";
-    config_id += lower_string(string_from_data_type(_src->data_type()));
-    config_id += "_";
-    config_id += lower_string(string_from_data_type(_dst->data_type()));
-    config_id += "_";
-    config_id += support::cpp11::to_string(_src->dimension(0));
-    config_id += "_";
-    config_id += support::cpp11::to_string(_src->dimension(1));
-
-    return config_id;
-}
-
-std::set<std::string> ClTemplateCast::get_headers_list() const
-{
-    return std::set<std::string>{"helpers.h", "tile_helpers.h"};
-}
-
-Window ClTemplateCast::get_window() const
-{
-    ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
-
-    const unsigned int n0  = adjust_vec_size(16 / _dst->element_size(), _dst->dimension(0));
-    Window             win = calculate_max_window(*_dst, Steps(n0));
-    return win.collapse(win, Window::DimZ);
-}
-
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.h
deleted file mode 100644
index 3adca4edc9..0000000000
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATECAST
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATECAST
-
-#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-class ClTemplateCast final : public IGpuTemplateComponentWriter
-{
-public:
-    using Attributes = ClComponentCast::Attributes;
-
-    /** Constructor
-     *
-     * @param[in] id         Component id
-     * @param[in] tensors    Tensor arguments to the components
-     * @param[in] attributes Component attributes
-     */
-    ClTemplateCast(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes);
-    /** Prevent instances of this class from being copy constructed */
-    ClTemplateCast(const ClTemplateCast &cast) = delete;
-    /** Prevent instances of this class from being copied */
-    ClTemplateCast &operator=(const ClTemplateCast &cast) = delete;
-    /** Allow instances of this class to be move constructed */
-    ClTemplateCast(ClTemplateCast &&cast) = default;
-    /** Allow instances of this class to be moved */
-    ClTemplateCast &operator=(ClTemplateCast &&cast) = default;
-    /** Generate kernel component name */
-    std::string get_name() const override;
-    /** Generate kernel component code template
-     *
-     * @param[in] comp_group Component group of which the component is a part of
-     *
-     * @return std::string Component code
-     */
-    std::string get_component_code(const ComponentGroup &comp_group) const override;
-    /** Declare all variables used by the component in the @p vtable
-     *
-     * @param[out] vtable     Variable table
-     * @param[in]  comp_group Component group of which the component is a part of
-     */
-    void declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
-    /** Generate the tag look-up table used to instantiate the component code.
-     *
-     * @param[in] vtable     Variable table
-     * @param[in] comp_group Component group of which the component is a part of
-     *
-     * @return TagLUT  Tag lookup table
-     */
-    TagLUT get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
-    /** Generate the build options used in the component
-     *
-     * @param[in] comp_group Component group of which the component is a part of
-     *
-     * @return CLBuildOptions Build options
-     */
-    CLBuildOptions get_build_options(const ComponentGroup &comp_group) const override;
-    /** Generate the component config id string used for tuning */
-    std::string get_config_id() const override;
-    /** Generate the header list used in the component */
-    std::set<std::string> get_headers_list() const override;
-    /** Generate the execution window for the component */
-    Window get_window() const override;
-
-private:
-    const ITensorInfo *_src;
-    const ITensorInfo *_dst;
-    Attributes         _attributes;
-};
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATECAST */
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp
deleted file mode 100644
index 8380620ab2..0000000000
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp
+++ /dev/null
@@ -1,364 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "ClTemplateDepthwiseConv2d.h"
-
-#include "src/core/helpers/WindowHelpers.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-ClTemplateDepthwiseConv2d::ClTemplateDepthwiseConv2d(ComponentId                      id,
-                                                     const ArgumentPack<ITensorInfo> &tensors,
-                                                     const Attributes                &attributes,
-                                                     const Settings                  &settings)
-    : IGpuTemplateComponentWriter{id, tensors},
-      _src{},
-      _weight{},
-      _bias{},
-      _dst{},
-      _attributes{attributes},
-      _settings{settings}
-{
-    _src    = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
-    _weight = this->tensors().get_const_tensor(TensorType::ACL_SRC_1);
-    if (this->tensors().get_const_tensor(TensorType::ACL_SRC_2))
-    {
-        _bias = this->tensors().get_const_tensor(TensorType::ACL_SRC_2);
-    }
-    _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _weight, _dst);
-}
-
-std::string ClTemplateDepthwiseConv2d::get_name() const
-{
-    return "depthwise_conv2d";
-}
-
-std::string ClTemplateDepthwiseConv2d::get_component_code(const ComponentGroup &comp_group) const
-{
-    ARM_COMPUTE_UNUSED(comp_group);
-
-    constexpr int height_idx = 2; // Data Layout is NHWC
-
-    std::string code = R"_(
-//------------------ START KERNEL {{meta_kernel_id}} ---------------------
-// IN_0(src)            {{src}}
-// IN_1(wei)            {{weight}}
-)_";
-
-    if (_bias != nullptr && _bias->has_valid_id())
-    {
-        code += R"_(
-// IN_1(bia)            {{bias}}
-)_";
-    }
-
-    code += R"_(
-// OUT(dst, accum)      {{dst}}
-
-TILE(uint, M0, 1, g_dst_indirect_y);
-
-{
-#define _IWEI_WIDTH {{WEI_WIDTH}}
-#define _IWEI_HEIGHT {{WEI_HEIGHT}}
-#define _IDST_WIDTH {{arg_dst}}_w
-#define _IDST_HEIGHT {{arg_dst}}_h
-#define _IM0_A M0_A
-#define _IN0_A N0_A
-#define _IM0_B _IWEI_WIDTH
-#define _IN0_B N0
-#define _IBOUNDARY_CHECK (!((_IWEI_WIDTH == 1 && _IWEI_HEIGHT == 1 && {{PAD_LEFT}} == 0 && {{PAD_TOP}} == 0 && M0 == 1)))
-)_";
-
-    code += R"_(
-    const int yo = g_ind_2 % {{arg_dst}}_h;
-    const int bout = g_ind_2 / {{arg_dst}}_h;
-)_";
-
-    code += R"_(
-
-    int xi = g_ind_1 * {{STRIDE_X}};
-    int yi = yo * {{STRIDE_Y}};
-    xi -= {{PAD_LEFT}};
-    yi -= {{PAD_TOP}};
-
-    LOOP_UNROLLING(int, i, 0, 1, M0,
-    {
-        {{dst}}[i].v = 0;
-    })
-)_";
-
-    if (_weight->dimension(height_idx) < 5)
-    {
-        code += R"_(
-    LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT,
-)_";
-    }
-    else
-    {
-        code += R"_(
-    for(int yk = 0; yk < _IWEI_HEIGHT; ++yk)
-)_";
-    }
-
-    code += R"_(
-    {
-        TILE({{SRC_DATA_TYPE}}, _IM0_A, _IN0_A, a);
-
-        LOOP_UNROLLING(int, i, 0, 1, _IM0_A,
-        {
-            a[i].v = 0;
-        })
-
-        T_LOAD_NHWC_WITH_DILATION({{SRC_DATA_TYPE}}, 1, _IM0_A, _IN0_A, {{SRC_TENSOR_TYPE}}, {{src}}, bout, yi + yk * {{DILATION_Y}}, xi, (g_ind_0 / {{DEPTH_MULTIPLIER}}), {{src}}_w, {{src}}_h, {{DILATION_X}}, 1, _IBOUNDARY_CHECK, a);
-
-        TILE({{WEI_DATA_TYPE}}, _IM0_B, _IN0_B, b);
-
-        T_LOAD({{WEI_DATA_TYPE}}, _IM0_B, _IN0_B, {{WEI_TENSOR_TYPE}}, {{weight}}, g_ind_0, yk * _IM0_B, 1, {{weight}}_stride_y, b);
-
-        LOOP_UNROLLING(int, m0, 0, 1, M0,
-        {
-            LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH,
-            {
-)_";
-
-    if (!_settings.is_fma_available())
-    {
-        code += R"_(
-                {{dst}}[m0].v += a[xk + m0].v * b[xk].v;
-)_";
-    }
-    else
-    {
-        code += R"_(
-                {{dst}}[m0].v = fma(a[xk + m0].v, b[xk].v, {{dst}}[m0].v);
-)_";
-    }
-
-    code += R"_(
-            })
-        })
-    }
-)_";
-
-    if (_weight->dimension(height_idx) < 5)
-    {
-        code += R"_(
-    )
-)_";
-    }
-
-    if (_bias && _bias->has_valid_id())
-    {
-        code += R"_(
-        TILE({{BIA_DATA_TYPE}}, 1, N0, {{bias}});
-
-        T_LOAD({{BIA_DATA_TYPE}}, 1, N0, BUFFER, {{bias}}, g_ind_0, 0, 0, 0, {{bias}});
-
-        T_ELTWISE_BROADCAST_ADD_X({{ACC_DATA_TYPE}}, M0, N0, {{dst}}, {{bias}}, {{dst}});
-)_";
-    }
-
-    code += R"_(
-    LOOP_UNROLLING(int, i, 0, 1, M0,
-    {
-        g_dst_indirect_y[i].v = (uint)min((int)(g_ind_1 + i), (int)({{arg_dst}}_w) - 1);
-        g_dst_indirect_y[i].v += (int)(g_ind_2 % {{arg_dst}}_h) * (int)({{arg_dst}}_w);
-        g_dst_indirect_y[i].v += (int)(g_ind_2 / {{arg_dst}}_h) * (int)({{arg_dst}}_w * {{arg_dst}}_h);
-    })
-}
-//------------------ END KERNEL {{meta_kernel_id}} ---------------------
-)_";
-
-    return code;
-}
-
-void ClTemplateDepthwiseConv2d::declare_variables(GpuKernelVariableTable &vtable,
-                                                  const ComponentGroup   &comp_group) const
-{
-    const GpuKernelArgumentInfo::Type input_type = _settings.export_input_to_cl_image()
-                                                       ? GpuKernelArgumentInfo::Type::Tensor_4D_t_Image
-                                                       : GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer;
-
-    vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(input_type), "src");
-
-    const GpuKernelArgumentInfo::Type weight_type = _settings.export_weights_to_cl_image()
-                                                        ? GpuKernelArgumentInfo::Type::Tensor_4D_t_Image
-                                                        : GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer;
-
-    vtable.declare_variable(comp_group, _weight, GpuKernelArgumentInfo(weight_type), "weight");
-
-    if (_bias != nullptr && _bias->has_valid_id()) // optional bias
-    {
-        vtable.declare_variable(comp_group, _bias, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Vector), "bias");
-    }
-    vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
-                            "dst");
-}
-
-TagLUT ClTemplateDepthwiseConv2d::get_tag_lut(const GpuKernelVariableTable &vtable,
-                                              const ComponentGroup         &comp_group) const
-{
-    TagLUT lut{};
-
-    // Arguments and global shared variables
-    lut["src"]    = vtable.get_variable(_src);
-    lut["weight"] = vtable.get_variable(_weight);
-
-    if (_bias != nullptr && _bias->has_valid_id()) // optional bias
-    {
-        lut["bias"]          = vtable.get_variable(_bias);
-        lut["BIA_DATA_TYPE"] = get_cl_type_from_data_type(_bias->data_type());
-    }
-    lut["dst"] = vtable.get_variable(_dst);
-
-    const auto dst_argument = vtable.get_variable(comp_group.get_any_dst_tensor());
-    lut["arg_dst"]          = dst_argument.uniq_name;
-
-    // Local build options
-    lut["meta_kernel_id"] = id();
-    lut["ACC_DATA_TYPE"]  = _src->data_type();
-    lut["SRC_DATA_TYPE"]  = _src->data_type();
-    lut["WEI_DATA_TYPE"]  = _weight->data_type();
-
-    switch (vtable.get_variable(_src).kernel_argument_info.type)
-    {
-        case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D:
-        case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D:
-        case GpuKernelArgumentInfo::Type::Tensor_4D_t_Image:
-            lut["SRC_TENSOR_TYPE"] = "IMAGE";
-            break;
-        default:
-            lut["SRC_TENSOR_TYPE"] = "BUFFER";
-            break;
-    }
-
-    switch (vtable.get_variable(_weight).kernel_argument_info.type)
-    {
-        case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D:
-        case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D:
-        case GpuKernelArgumentInfo::Type::Tensor_4D_t_Image:
-            lut["WEI_TENSOR_TYPE"] = "IMAGE";
-            break;
-        default:
-            lut["WEI_TENSOR_TYPE"] = "BUFFER";
-            break;
-    }
-
-    // Data Layout is NHWC
-    constexpr int width_idx  = 1;
-    constexpr int height_idx = 2;
-
-    lut["WEI_WIDTH"]  = _weight->dimension(width_idx);
-    lut["WEI_HEIGHT"] = _weight->dimension(height_idx);
-
-    lut["STRIDE_X"] = _attributes.stride().x();
-    lut["STRIDE_Y"] = _attributes.stride().y();
-
-    lut["PAD_LEFT"] = _attributes.pad().left;
-    lut["PAD_TOP"]  = _attributes.pad().top;
-
-    lut["DILATION_X"] = _attributes.dilation().x();
-    lut["DILATION_Y"] = _attributes.dilation().y();
-
-    lut["DEPTH_MULTIPLIER"] = _attributes.depth_multiplier();
-
-    return lut;
-}
-
-CLBuildOptions ClTemplateDepthwiseConv2d::get_build_options(const ComponentGroup &comp_group) const
-{
-    ARM_COMPUTE_UNUSED(comp_group);
-
-    constexpr unsigned int width_idx = 1; // Data Layout is NHWC
-
-    const unsigned int n0               = _settings.n0();
-    const unsigned int m0               = _settings.m0();
-    const unsigned int m0_a             = _weight->dimension(width_idx) + m0 - 1;
-    const unsigned int n0_a             = _attributes.depth_multiplier() > 1 ? 1 : n0;
-    const unsigned int partial_store_n0 = _dst->dimension(0) % n0;
-
-    CLBuildOptions build_opts{};
-
-    if (_settings.fast_relaxed_math())
-    {
-        build_opts.add_option("-cl-fast-relaxed-math");
-    }
-    else
-    {
-        // -cl-fast-relaxed-math also sets -cl-finite-math-only and -cl-unsafe-math-optimizations
-        // to disable -cl-finite-math-only, we only include -cl-unsafe-math-optimizations
-        build_opts.add_option("-cl-unsafe-math-optimizations");
-    }
-
-    build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
-    build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
-    build_opts.add_option("-DN0_A=" + support::cpp11::to_string(n0_a));
-    build_opts.add_option("-DM0_A=" + support::cpp11::to_string(m0_a));
-    build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));
-
-    return build_opts;
-}
-
-std::string ClTemplateDepthwiseConv2d::get_config_id() const
-{
-    std::string config_id{};
-
-    config_id += support::cpp11::to_string(_src->dimension(0));
-    config_id += "_";
-    config_id += support::cpp11::to_string(_src->dimension(1));
-    config_id += "_";
-    config_id += support::cpp11::to_string(_src->dimension(2));
-    config_id += "_";
-    config_id += support::cpp11::to_string(_dst->dimension(0));
-    config_id += "_";
-    config_id += support::cpp11::to_string(_dst->dimension(1));
-    config_id += "_";
-    config_id += support::cpp11::to_string(_dst->dimension(2));
-    config_id += "_";
-    config_id += string_from_data_type(_src->data_type());
-
-    return config_id;
-}
-
-std::set<std::string> ClTemplateDepthwiseConv2d::get_headers_list() const
-{
-    return std::set<std::string>{"helpers.h", "tile_helpers.h"};
-}
-
-Window ClTemplateDepthwiseConv2d::get_window() const
-{
-    ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
-
-    Window win = calculate_max_window(*_dst, Steps(_settings.n0(), _settings.m0()));
-    return win.collapse(win, Window::DimZ);
-}
-
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.h
deleted file mode 100644
index 5d04c687c3..0000000000
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.h
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEDEPTHWISECONV2D
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEDEPTHWISECONV2D
-
-#include "arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h"
-
-#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-class ClTemplateDepthwiseConv2d final : public IGpuTemplateComponentWriter
-{
-public:
-    using Attributes = ClComponentDepthwiseConv2d::Attributes;
-    using Settings   = ClComponentDepthwiseConv2d::Settings;
-    /** Constructor
-     *
-     * Similar to @ref ClComponentDepthwiseConv2d::validate()
-     *
-     * @param[in] id         Component id
-     * @param[in] tensors    Tensor arguments to the components
-     * @param[in] attributes Component attributes
-     * @param[in] settings   Component settings
-     */
-    ClTemplateDepthwiseConv2d(ComponentId                      id,
-                              const ArgumentPack<ITensorInfo> &tensors,
-                              const Attributes                &attributes,
-                              const Settings                  &settings);
-    /** Prevent instances of this class from being copy constructed */
-    ClTemplateDepthwiseConv2d(const ClTemplateDepthwiseConv2d &depthwise_conv2d) = delete;
-    /** Prevent instances of this class from being copied */
-    ClTemplateDepthwiseConv2d &operator=(const ClTemplateDepthwiseConv2d &depthwise_conv2d) = delete;
-    /** Allow instances of this class to be move constructed */
-    ClTemplateDepthwiseConv2d(ClTemplateDepthwiseConv2d &&depthwise_conv2d) = default;
-    /** Allow instances of this class to be moved */
-    ClTemplateDepthwiseConv2d &operator=(ClTemplateDepthwiseConv2d &&depthwise_conv2d) = default;
-    /** Generate kernel component name */
-    std::string get_name() const override;
-    /** Generate kernel component code template
-     *
-     * @param[in] comp_group Component group of which the component is a part of
-     *
-     * @return std::string Component code
-     */
-    std::string get_component_code(const ComponentGroup &comp_group) const override;
-    /** Declare all variables used by the component in the @p vtable
-     *
-     * @param[out] vtable     Variable table
-     * @param[in]  comp_group Component group of which the component is a part of
-     */
-    void declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
-    /** Generate the tag look-up table used to instantiate the component code.
-     *
-     * @param[in] vtable     Variable table
-     * @param[in] comp_group Component group of which the component is a part of
-     *
-     * @return TagLUT  Tag lookup table
-     */
-    TagLUT get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
-    /** Generate the build options used in the component
-     *
-     * @param[in] comp_group Component group of which the component is a part of
-     *
-     * @return CLBuildOptions Build options
-     */
-    CLBuildOptions get_build_options(const ComponentGroup &comp_group) const override;
-    /** Generate the component config id string used for tuning */
-    std::string get_config_id() const override;
-    /** Generate the header list used in the component */
-    std::set<std::string> get_headers_list() const override;
-    /** Generate the execution window for the component */
-    Window get_window() const override;
-
-private:
-    const ITensorInfo *_src;
-    const ITensorInfo *_weight;
-    const ITensorInfo *_bias;
-    const ITensorInfo *_dst;
-    Attributes         _attributes;
-    Settings           _settings;
-};
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEDEPTHWISECONV2D */
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp
deleted file mode 100644
index f6a7a58d1d..0000000000
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp
+++ /dev/null
@@ -1,393 +0,0 @@
-/*
- * Copyright (c) 2022-2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "ClTemplateDirectConv2d.h"
-
-#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/StringUtils.h"
-
-#include "src/core/helpers/WindowHelpers.h"
-#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-ClTemplateDirectConv2d::ClTemplateDirectConv2d(ComponentId                      id,
-                                               const ArgumentPack<ITensorInfo> &tensors,
-                                               const Attributes                &attributes,
-                                               const Settings                  &settings)
-    : IGpuTemplateComponentWriter{id, tensors},
-      _src{},
-      _weight{},
-      _bias{},
-      _dst{},
-      _attributes{attributes},
-      _settings{settings}
-{
-    _src    = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
-    _weight = this->tensors().get_const_tensor(TensorType::ACL_SRC_1);
-    if (this->tensors().get_const_tensor(TensorType::ACL_SRC_2))
-    {
-        _bias = this->tensors().get_const_tensor(TensorType::ACL_SRC_2);
-    }
-    _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _weight, _dst);
-}
-
-std::string ClTemplateDirectConv2d::get_name() const
-{
-    return "direct_conv2d";
-}
-
-std::string ClTemplateDirectConv2d::get_component_code(const ComponentGroup &comp_group) const
-{
-    ARM_COMPUTE_UNUSED(comp_group);
-
-    const auto channel_idx   = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::CHANNEL);
-    const auto k0            = adjust_vec_size(_settings.direct_conv_descriptor().k0, _src->dimension(channel_idx));
-    const bool leftover_loop = (_src->dimension(channel_idx) % k0) != 0;
-
-    std::string code = R"_(
-//------------------ START KERNEL {{meta_kernel_id}} ---------------------
-// IN_0(src)            {{src}}
-// IN_1(wei)            {{weight}}
-)_";
-    if (_bias && _bias->has_valid_id())
-    {
-        code += R"_(
-// IN_1(bia)            {{bias}}
-)_";
-    }
-    code += R"_(
-// OUT(dst, accum)      {{dst}}
-
-TILE(uint, M0, 1, g_dst_indirect_y);
-
-{
-#define _IWEI_WIDTH {{WEI_WIDTH}}
-#define _IWEI_HEIGHT {{WEI_HEIGHT}}
-#define _ISRC_WIDTH {{SRC_WIDTH}}
-#define _ISRC_HEIGHT {{SRC_HEIGHT}}
-#define _ISRC_CHANNELS {{SRC_CHANNELS}}
-#define _IDST_WIDTH {{DST_WIDTH}}
-#define _IDST_HEIGHT {{DST_HEIGHT}}
-#define _IDST_CHANNELS {{DST_CHANNELS}}
-#define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT)
-
-    TILE(int, M0, 1, xi);
-    TILE(int, M0, 1, yi);
-
-    // Convert the linear index to coordinate
-    LOOP_UNROLLING(int, i, 0, 1, M0,
-    {
-        xi[0].s[i] = ((g_ind_1 + i) % _IDST_WIDTH) * {{STRIDE_X}};
-        yi[0].s[i] = ((g_ind_1 + i) / _IDST_WIDTH) * {{STRIDE_Y}};
-        xi[0].s[i] -= {{PAD_LEFT}};
-        yi[0].s[i] -= {{PAD_TOP}};
-    })
-
-    LOOP_UNROLLING(int, i, 0, 1, M0,
-    {
-        {{dst}}[i].v = 0;
-    })
-
-    for(int i = 0; i < (_IWEI_WIDTH * _IWEI_HEIGHT); ++i)
-    {
-        int xk = i % _IWEI_WIDTH;
-        int yk = i / _IWEI_WIDTH;
-
-        TILE(int, 1, M0, my);
-
-        LOOP_UNROLLING(int, i, 0, 1, M0,
-        {
-            int x_s    = xi[0].s[i] + xk;
-            int y_s    = yi[0].s[i] + yk;
-            my[0].s[i] = x_s + y_s *_ISRC_WIDTH;
-            my[0].s[i] = my[0].s[i] + g_ind_2 * (int)(_ISRC_WIDTH * _ISRC_HEIGHT);
-            my[0].s[i] = select(-1, my[0].s[i], x_s >= 0);
-            my[0].s[i] = select(-1, my[0].s[i], x_s < _ISRC_WIDTH);
-            my[0].s[i] = select(-1, my[0].s[i], y_s >= 0);
-            my[0].s[i] = select(-1, my[0].s[i], y_s < _ISRC_HEIGHT);
-        })
-
-        int ck = 0;
-        for(; ck <= (_ISRC_CHANNELS - K0); ck += K0)
-        {
-            TILE({{SRC_DATA_TYPE}}, M0, K0, a);
-            TILE({{WEI_DATA_TYPE}}, N0, K0, b);
-
-            LOOP_UNROLLING(int, i, 0, 1, M0,
-            {
-                a[i].v = {{ZERO_VALUE}};
-            })
-
-            LOOP_UNROLLING(int, i, 0, 1, N0,
-            {
-                b[i].v = {{ZERO_VALUE}};
-            })
-
-            T_LOAD2D_INDIRECT({{SRC_DATA_TYPE}}, M0, K0, {{SRC_TENSOR_TYPE}}, {{src}}, ck, {{src}}_stride_y, my, a);
-
-            T_LOAD({{WEI_DATA_TYPE}}, N0, K0, {{WEI_TENSOR_TYPE}}, {{weight}}, ck, g_ind_0 * _IY_MULTIPLIER + i, _IY_MULTIPLIER, {{weight}}_stride_y, b);
-
-            T_MMUL({{SRC_DATA_TYPE}}, {{WEI_DATA_TYPE}}, {{ACC_DATA_TYPE}}, M0, N0, K0, NT, T, a, b, {{dst}});
-        }
-)_";
-
-    if (leftover_loop)
-    {
-        code += R"_(
-        for(; ck < _ISRC_CHANNELS; ++ck)
-        {
-            TILE({{SRC_DATA_TYPE}}, M0, 1, a);
-            TILE({{WEI_DATA_TYPE}}, N0, 1, b);
-
-            LOOP_UNROLLING(int, i, 0, 1, M0,
-            {
-                a[i].v = {{ZERO_VALUE}};
-            })
-
-            LOOP_UNROLLING(int, i, 0, 1, N0,
-            {
-                b[i].v = {{ZERO_VALUE}};
-            })
-
-            T_LOAD2D_INDIRECT({{SRC_DATA_TYPE}}, M0, 1, {{SRC_TENSOR_TYPE}}, {{src}}, ck, {{src}}_stride_y, my, a);
-
-            T_LOAD({{WEI_DATA_TYPE}}, N0, 1, BUFFER, {{weight}}, ck, g_ind_0 * _IY_MULTIPLIER + i, _IY_MULTIPLIER, {{weight}}_stride_y, b);
-
-            T_MMUL({{SRC_DATA_TYPE}}, {{WEI_DATA_TYPE}}, {{ACC_DATA_TYPE}}, M0, N0, 1, NT, T, a, b, {{dst}});
-        }
-    )_";
-    }
-
-    code += R"_(
-#undef _I_WEI_WIDTH
-#undef _I_WEI_HEIGHT
-#undef _ISRC_WIDTH
-#undef _ISRC_HEIGHT
-#undef _ISRC_CHANNELS
-#undef _IDST_WIDTH
-#undef _IDST_HEIGHT
-#undef _IDST_CHANNELS
-#undef _IY_MULTIPLIER
-
-    }
-)_";
-
-    if (_bias && _bias->has_valid_id())
-    {
-        code += R"_(
-        TILE({{BIA_DATA_TYPE}}, 1, N0, bias0);
-
-        T_LOAD({{BIA_DATA_TYPE}}, 1, N0, BUFFER, {{bias}}, g_ind_0, 0, 1, 0, bias0);
-
-        T_ELTWISE_BROADCAST_ADD_X({{ACC_DATA_TYPE}}, M0, N0, {{dst}}, bias0, {{dst}});
-    )_";
-    }
-
-    code += R"_(
-    LOOP_UNROLLING(int, i, 0, 1, M0,
-    {
-        g_dst_indirect_y[i].v = (uint)min(g_ind_1 + i, (int)({{DST_WIDTH}} * {{DST_HEIGHT}}) - 1);
-        g_dst_indirect_y[i].v += g_ind_2 * (int)({{DST_WIDTH}} * {{DST_HEIGHT}});
-    })
-}
-//------------------ END KERNEL {{meta_kernel_id}} ---------------------
-)_";
-    return code;
-}
-
-void ClTemplateDirectConv2d::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
-{
-    vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
-                            "src");
-
-    const GpuKernelArgumentInfo::Type weight_type = _settings.export_to_cl_image()
-                                                        ? GpuKernelArgumentInfo::Type::Tensor_4D_t_Image
-                                                        : GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer;
-    vtable.declare_variable(comp_group, _weight, GpuKernelArgumentInfo(weight_type), "weight");
-
-    if (_bias && _bias->has_valid_id()) // optional bias
-    {
-        vtable.declare_variable(comp_group, _bias, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Vector), "bias");
-    }
-    vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(common_tensor_type), "dst");
-}
-
-TagLUT ClTemplateDirectConv2d::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
-{
-    TagLUT lut{};
-    // Arguments and global shared variables
-    lut["src"]    = vtable.get_variable(_src);
-    lut["weight"] = vtable.get_variable(_weight);
-
-    if (_bias && _bias->has_valid_id()) // optional bias
-    {
-        lut["bias"]          = vtable.get_variable(_bias);
-        lut["BIA_DATA_TYPE"] = get_cl_type_from_data_type(_bias->data_type());
-    }
-    lut["dst"] = vtable.get_variable(_dst);
-
-    const auto dst_argument = vtable.get_variable(comp_group.get_any_dst_tensor());
-    lut["arg_dst"]          = dst_argument.uniq_name;
-
-    // Local build options
-    lut["meta_kernel_id"] = id();
-    lut["ACC_DATA_TYPE"]  = _src->data_type();
-    lut["SRC_DATA_TYPE"]  = _src->data_type();
-    lut["WEI_DATA_TYPE"]  = _weight->data_type();
-
-    lut["SRC_TENSOR_TYPE"] = "BUFFER";
-    switch (vtable.get_variable(_weight).kernel_argument_info.type)
-    {
-        case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D:
-        case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D:
-        case GpuKernelArgumentInfo::Type::Tensor_4D_t_Image:
-        {
-            lut["WEI_TENSOR_TYPE"] = "IMAGE";
-            break;
-        }
-        default:
-        {
-            lut["WEI_TENSOR_TYPE"] = "BUFFER";
-            break;
-        }
-    }
-    const auto width_idx   = 1;
-    const auto height_idx  = 2;
-    const auto channel_idx = 0;
-
-    lut["SRC_WIDTH"]    = _src->dimension(width_idx);
-    lut["SRC_HEIGHT"]   = _src->dimension(height_idx);
-    lut["SRC_CHANNELS"] = _src->dimension(channel_idx);
-
-    lut["WEI_WIDTH"]  = _weight->dimension(width_idx);
-    lut["WEI_HEIGHT"] = _weight->dimension(height_idx);
-
-    lut["DST_WIDTH"]    = _dst->dimension(width_idx);
-    lut["DST_HEIGHT"]   = _dst->dimension(height_idx);
-    lut["DST_CHANNELS"] = _dst->dimension(channel_idx);
-
-    lut["STRIDE_X"] = _attributes.stride().x();
-    lut["STRIDE_Y"] = _attributes.stride().y();
-
-    lut["PAD_LEFT"] = _attributes.pad().left;
-    lut["PAD_TOP"]  = _attributes.pad().top;
-
-    lut["ZERO_VALUE"] = 0;
-
-    return lut;
-}
-
-CLBuildOptions ClTemplateDirectConv2d::get_build_options(const ComponentGroup &comp_group) const
-{
-    const unsigned int channel_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::CHANNEL);
-
-    const auto         root_window = comp_group.get_root_component()->template_writer()->get_window();
-    const unsigned int n0          = root_window.x().step();
-    const unsigned int m0          = root_window.y().step();
-    const unsigned int k0 = adjust_vec_size(_settings.direct_conv_descriptor().k0, _src->dimension(channel_idx));
-    const unsigned int partial_store_n0 = _dst->dimension(0) % n0;
-
-    CLBuildOptions build_opts{};
-    if (_settings.fast_relaxed_math())
-    {
-        build_opts.add_option("-cl-fast-relaxed-math");
-    }
-    else
-    {
-        // -cl-fast-relaxed-math also sets -cl-finite-math-only and -cl-unsafe-math-optimizations
-        // to disable -cl-finite-math-only, we only include -cl-unsafe-math-optimizations
-        build_opts.add_option("-cl-unsafe-math-optimizations");
-    }
-
-    build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
-    build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
-    build_opts.add_option("-DK0=" + support::cpp11::to_string(k0));
-    build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));
-
-    return build_opts;
-}
-
-std::string ClTemplateDirectConv2d::get_config_id() const
-{
-    const DataType   data_type   = _src->data_type();
-    const DataLayout data_layout = _src->data_layout();
-
-    const unsigned int width_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-
-    const unsigned int kernel_size = _weight->dimension(width_idx);
-
-    std::string config_id{};
-    config_id += lower_string(string_from_data_type(data_type));
-    config_id += "_";
-    config_id += support::cpp11::to_string(kernel_size);
-    config_id += "_";
-    config_id += support::cpp11::to_string(_attributes.stride().x());
-    config_id += "_";
-    config_id += support::cpp11::to_string(_attributes.stride().y());
-    config_id += "_";
-    config_id += support::cpp11::to_string(_dst->dimension(width_idx));
-    config_id += "_";
-    config_id += support::cpp11::to_string(_dst->dimension(height_idx));
-    config_id += "_";
-    config_id += lower_string(string_from_data_layout(data_layout));
-    return config_id;
-}
-
-std::set<std::string> ClTemplateDirectConv2d::get_headers_list() const
-{
-    return std::set<std::string>{"helpers.h", "tile_helpers.h"};
-}
-
-Window ClTemplateDirectConv2d::get_window() const
-{
-    ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
-
-    const auto output_shape = _dst->tensor_shape();
-    const auto desc         = _settings.direct_conv_descriptor();
-
-    const unsigned int n0 = adjust_vec_size(desc.n0, output_shape[0]);
-    const unsigned int m0 = adjust_vec_size(desc.m0, output_shape[1] * output_shape[2]);
-
-    // Create and configure kernel window
-    Window win = calculate_max_window(output_shape, Steps(n0, m0));
-
-    const size_t dim_y_collapsed = ceil_to_multiple(output_shape[1] * output_shape[2], m0);
-    win.set(Window::DimY, Window::Dimension(0, dim_y_collapsed, m0));
-    win.set(Window::DimZ, Window::Dimension(0, output_shape.total_size_upper(3), 1));
-
-    return win;
-}
-
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h
deleted file mode 100644
index 03c8cd2f15..0000000000
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Copyright (c) 2022-2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEDIRECTCONV2D
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEDIRECTCONV2D
-
-#include "arm_compute/core/experimental/Types.h"
-#include "arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h"
-
-#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-class ClTemplateDirectConv2d final : public IGpuTemplateComponentWriter
-{
-public:
-    using Attributes = ClComponentDirectConv2d::Attributes;
-    using Settings   = ClComponentDirectConv2d::Settings;
-    /** Constructor
-     *
-     * Similar to @ref ClComponentDirectConv2d::validate()
-     *
-     * @param[in] id         Component id
-     * @param[in] tensors    Tensor arguments to the components
-     * @param[in] attributes Component attributes
-     * @param[in] settings   Component settings
-     */
-    ClTemplateDirectConv2d(ComponentId                      id,
-                           const ArgumentPack<ITensorInfo> &tensors,
-                           const Attributes                &attributes,
-                           const Settings                  &settings);
-    /** Destructor */
-    ~ClTemplateDirectConv2d() override = default;
-    /** Prevent instances of this class from being copy constructed */
-    ClTemplateDirectConv2d(const ClTemplateDirectConv2d &direct_conv2d) = delete;
-    /** Prevent instances of this class from being copied */
-    ClTemplateDirectConv2d &operator=(const ClTemplateDirectConv2d &direct_conv2d) = delete;
-    /** Allow instances of this class to be move constructed */
-    ClTemplateDirectConv2d(ClTemplateDirectConv2d &&direct_conv2d) = default;
-    /** Allow instances of this class to be moved */
-    ClTemplateDirectConv2d &operator=(ClTemplateDirectConv2d &&direct_conv2d) = default;
-    /** Generate kernel component name */
-    std::string get_name() const override;
-    /** Generate kernel component code template
-     *
-     * @param[in] comp_group Component group of which the component is a part of
-     *
-     * @return std::string Component code
-     */
-    std::string get_component_code(const ComponentGroup &comp_group) const override;
-    /** Declare all variables used by the component in the @p vtable
-     *
-     * @param[out] vtable     Variable table
-     * @param[in]  comp_group Component group of which the component is a part of
-     */
-    void declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
-    /** Generate the tag look-up table used to instantiate the component code.
-     *
-     * @param[in] vtable     Variable table
-     * @param[in] comp_group Component group of which the component is a part of
-     *
-     * @return TagLUT  Tag lookup table
-     */
-    TagLUT get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
-    /** Generate the build options used in the component
-     *
-     * @param[in] comp_group Component group of which the component is a part of
-     *
-     * @return CLBuildOptions Build options
-     */
-    CLBuildOptions get_build_options(const ComponentGroup &comp_group) const override;
-    /** Generate the component config id string used for tuning */
-    std::string get_config_id() const override;
-    /** Generate the header list used in the component */
-    std::set<std::string> get_headers_list() const override;
-    /** Generate the execution window for the component */
-    Window get_window() const override;
-
-private:
-    const ITensorInfo *_src;
-    const ITensorInfo *_weight;
-    const ITensorInfo *_bias;
-    const ITensorInfo *_dst;
-    Attributes         _attributes;
-    Settings           _settings;
-};
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEDIRECTCONV2D */
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp
deleted file mode 100644
index 78bff3c3f3..0000000000
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp
+++ /dev/null
@@ -1,274 +0,0 @@
-/*
- * Copyright (c) 2022-2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "ClTemplateElementwiseBinary.h"
-
-#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/StringUtils.h"
-
-#include "src/core/helpers/WindowHelpers.h"
-#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-constexpr unsigned int vector_size_byte_opencl = 16;
-
-ClTemplateElementwiseBinary::ClTemplateElementwiseBinary(ComponentId                      id,
-                                                         const ArgumentPack<ITensorInfo> &tensors,
-                                                         const Attributes                &attributes)
-    : IGpuTemplateComponentWriter{id, tensors}, _lhs{}, _rhs{}, _dst{}, _attributes{attributes}
-{
-    _lhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
-    _rhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_1);
-    _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(_lhs, _rhs, _dst);
-}
-
-std::string ClTemplateElementwiseBinary::get_name() const
-{
-    return "elementwise_binary";
-}
-
-std::string ClTemplateElementwiseBinary::get_component_code(const ComponentGroup &comp_group) const
-{
-    std::string code;
-    const bool  is_root      = (comp_group.get_root_component()->id() == this->id());
-    const bool  is_lhs_input = comp_group.is_input_tensor(_lhs);
-    const bool  is_rhs_input = comp_group.is_input_tensor(_rhs);
-
-    code =
-        R"_(
-    //------------------ START KERNEL {{meta_kernel_id}} {{ELTWISE_OP}} ---------------------
-)_";
-
-    if (is_root)
-    {
-        code +=
-            R"_(
-    TILE(uint, M0, 1, g_dst_indirect_y);
-)_";
-    }
-
-    if (is_lhs_input)
-    {
-        code +=
-            R"_(
-    TILE({{DATA_TYPE}}, {{lhs_m0}}, N0, {{lhs}});
-)_";
-    }
-
-    if (is_rhs_input)
-    {
-        code +=
-            R"_(
-    TILE({{DATA_TYPE}}, {{rhs_m0}}, N0, {{rhs}});
-)_";
-    }
-
-    code +=
-        R"_(
-    {
-)_";
-
-    if (is_lhs_input)
-    {
-        code +=
-            R"_(
-        {{lhs}}_offset_first_element_in_bytes += g_ind_2 * {{lhs}}_stride_w;
-        T_LOAD({{DATA_TYPE}}, {{lhs_m0}}, {{lhs_n0}}, BUFFER, {{lhs}}, {{lhs_start_ind_0}}, {{lhs_start_ind_1}}, 1, {{lhs}}_stride_y, {{lhs}});
-)_";
-    }
-
-    if (is_rhs_input)
-    {
-        code +=
-            R"_(
-        {{rhs}}_offset_first_element_in_bytes += g_ind_2 * {{rhs}}_stride_w;
-        T_LOAD({{DATA_TYPE}}, {{rhs_m0}}, {{rhs_n0}}, BUFFER, {{rhs}}, {{rhs_start_ind_0}}, {{rhs_start_ind_1}}, 1, {{rhs}}_stride_y, {{rhs}});
-)_";
-    }
-
-    code +=
-        R"_(
-        T_ELTWISE_{{BROADCAST_OP}}{{ELTWISE_OP}}({{DATA_TYPE}}, M0, N0, {{lhs}}, {{rhs}}, {{dst}});
-)_";
-
-    if (is_root)
-    {
-        // Calculate the destination indirect Y
-        code +=
-            R"_(
-        LOOP_UNROLLING(int, i, 0, 1, M0,
-        {
-            g_dst_indirect_y[i].v = (uint)min(g_ind_1 + i, (int)({{arg_dst}}_w * {{arg_dst}}_h) - 1);
-            g_dst_indirect_y[i].v += g_ind_2 * (int)({{arg_dst}}_w * {{arg_dst}}_h);
-        })
-)_";
-    }
-
-    code +=
-        R"_(
-    }
-    //------------------ END KERNEL {{meta_kernel_id}} {{ELTWISE_OP}} ---------------------
-)_";
-
-    return code;
-}
-
-void ClTemplateElementwiseBinary::declare_variables(GpuKernelVariableTable &vtable,
-                                                    const ComponentGroup   &comp_group) const
-{
-    vtable.declare_variable(comp_group, _lhs, GpuKernelArgumentInfo(common_tensor_type), "lhs");
-
-    vtable.declare_variable(comp_group, _rhs, GpuKernelArgumentInfo(common_tensor_type), "rhs");
-
-    vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(common_tensor_type), "dst");
-}
-
-TagLUT ClTemplateElementwiseBinary::get_tag_lut(const GpuKernelVariableTable &vtable,
-                                                const ComponentGroup         &comp_group) const
-{
-    TagLUT lut{};
-
-    // Local build options
-    lut["meta_kernel_id"] = id();
-    lut["DATA_TYPE"]      = get_cl_type_from_data_type(_lhs->data_type());
-    // Arguments and global shared variables
-
-    lut["lhs"]     = vtable.get_variable(_lhs);
-    lut["rhs"]     = vtable.get_variable(_rhs);
-    lut["dst"]     = vtable.get_variable(_dst);
-    lut["arg_dst"] = vtable.get_variable(comp_group.get_any_dst_tensor());
-
-    switch (_attributes.operation())
-    {
-        case Attributes::ElementwiseOp::Add:
-            lut["ELTWISE_OP"] = "ADD";
-            break;
-        case Attributes::ElementwiseOp::Sub:
-            lut["ELTWISE_OP"] = "SUB";
-            break;
-        case Attributes::ElementwiseOp::Mul:
-            lut["ELTWISE_OP"] = "MUL";
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Arithmetic Operation not supported");
-    }
-
-    ARM_COMPUTE_ERROR_ON(comp_group.is_intermediate_tensor(_lhs) &&
-                         detail::have_different_dimensions(_lhs->tensor_shape(), _dst->tensor_shape(), 0));
-    ARM_COMPUTE_ERROR_ON(comp_group.is_intermediate_tensor(_rhs) &&
-                         detail::have_different_dimensions(_rhs->tensor_shape(), _dst->tensor_shape(), 0));
-
-    // Set broadcast parameters
-    // PRE: All tensors are broadcast-compatible
-    const auto &lhs_dims = _lhs->tensor_shape();
-    const auto &rhs_dims = _rhs->tensor_shape();
-    const auto &dst_dims = _dst->tensor_shape();
-
-    const auto lhs_broadcast_x = dst_dims[0] != 1 && lhs_dims[0] == 1;
-    const auto rhs_broadcast_x = dst_dims[0] != 1 && rhs_dims[0] == 1;
-    const auto lhs_broadcast_y = dst_dims[1] != 1 && lhs_dims[1] == 1;
-    const auto rhs_broadcast_y = dst_dims[1] != 1 && rhs_dims[1] == 1;
-    const auto lhs_broadcast_z = dst_dims[2] != 1 && lhs_dims[2] == 1;
-    const auto rhs_broadcast_z = dst_dims[2] != 1 && rhs_dims[2] == 1;
-
-    const auto lhs_broadcast_yz = lhs_broadcast_y && lhs_broadcast_z;
-    const auto rhs_broadcast_yz = rhs_broadcast_y && rhs_broadcast_z;
-
-    lut["lhs_n0"]          = (lhs_broadcast_x) ? "1" : "N0";
-    lut["lhs_start_ind_0"] = (lhs_broadcast_x) ? "0" : "g_ind_0";
-    lut["rhs_n0"]          = (rhs_broadcast_x) ? "1" : "N0";
-    lut["rhs_start_ind_0"] = (rhs_broadcast_x) ? "0" : "g_ind_0";
-
-    lut["lhs_m0"]          = (lhs_broadcast_yz) ? "1" : "M0";
-    lut["lhs_start_ind_1"] = (lhs_broadcast_yz) ? "0" : "g_ind_1";
-    lut["rhs_m0"]          = (rhs_broadcast_yz) ? "1" : "M0";
-    lut["rhs_start_ind_1"] = (rhs_broadcast_yz) ? "0" : "g_ind_1";
-
-    lut["BROADCAST_OP"] = (lhs_broadcast_yz) ? "BROADCAST_LHS_X_" : (rhs_broadcast_yz) ? "BROADCAST_RHS_X_" : "";
-
-    return lut;
-}
-
-CLBuildOptions ClTemplateElementwiseBinary::get_build_options(const ComponentGroup &comp_group) const
-{
-    CLBuildOptions build_opts{};
-    /// NOTE: For now tile sizes (n0, m0) are set by the execution window. This may change in the future
-    const auto         root_window      = comp_group.get_root_component()->template_writer()->get_window();
-    const unsigned int n0               = root_window.x().step();
-    const unsigned int m0               = root_window.y().step();
-    const unsigned int partial_store_n0 = _dst->dimension(0) % n0;
-
-    build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
-    build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(_lhs->data_type()));
-    build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));
-
-    return build_opts;
-}
-
-std::string ClTemplateElementwiseBinary::get_config_id() const
-{
-    std::string config_id{};
-    config_id += lower_string(string_from_data_type(_dst->data_type()));
-    config_id += "_";
-    config_id += support::cpp11::to_string(_dst->dimension(0));
-    config_id += "_";
-    config_id += support::cpp11::to_string(_dst->dimension(1));
-    config_id += "_";
-    config_id += lower_string(string_from_data_layout(_dst->data_layout()));
-
-    return config_id;
-}
-
-std::set<std::string> ClTemplateElementwiseBinary::get_headers_list() const
-{
-    return std::set<std::string>{"helpers.h", "tile_helpers.h"};
-}
-
-Window ClTemplateElementwiseBinary::get_window() const
-{
-    ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
-
-    TensorShape output_shape = _dst->tensor_shape();
-    // Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) and upper dimensions unchanged
-    // This is in line with the collapsing convention used by operators like Conv2d
-    output_shape.collapse(2U, 1U);
-    const unsigned int num_elems_processed_per_iteration =
-        adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0));
-    Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration));
-
-    return win;
-}
-
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.h
deleted file mode 100644
index 991c0eca44..0000000000
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.h
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * Copyright (c) 2022-2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEELEMENTWISEBINARY
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEELEMENTWISEBINARY
-
-#include "arm_compute/core/experimental/Types.h"
-
-#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-class ClTemplateElementwiseBinary final : public IGpuTemplateComponentWriter
-{
-public:
-    using Attributes = ClComponentElementwiseBinary::Attributes;
-
-    /** Constructor
-     *
-     * Similar to @ref ClComponentElementwiseBinary::validate()
-     *
-     * @param[in] id         Component id
-     * @param[in] tensors    Tensor arguments to the components
-     * @param[in] attributes Component attributes
-     */
-    ClTemplateElementwiseBinary(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes);
-    /** Prevent instances of this class from being copy constructed */
-    ClTemplateElementwiseBinary(const ClTemplateElementwiseBinary &elementwise) = delete;
-    /** Prevent instances of this class from being copied */
-    ClTemplateElementwiseBinary &operator=(const ClTemplateElementwiseBinary &elementwise) = delete;
-    /** Allow instances of this class to be move constructed */
-    ClTemplateElementwiseBinary(ClTemplateElementwiseBinary &&elementwise) = default;
-    /** Allow instances of this class to be moved */
-    ClTemplateElementwiseBinary &operator=(ClTemplateElementwiseBinary &&elementwise) = default;
-
-    /** Generate kernel component name */
-    std::string get_name() const override;
-
-    /** Generate kernel component code template
-     *
-     * @param[in] comp_group Component group of which the component is a part of
-     *
-     * @return std::string Component code
-     */
-    std::string get_component_code(const ComponentGroup &comp_group) const override;
-
-    /** Declare all variables used by the component in the @p vtable
-     *
-     * @param[out] vtable     Variable table
-     * @param[in]  comp_group Component group of which the component is a part of
-     */
-    void declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
-
-    /** Generate the tag look-up table used to instantiate the component code.
-     *
-     * @param[in] vtable     Variable table
-     * @param[in] comp_group Component group of which the component is a part of
-     *
-     * @return TagLUT  Tag lookup table
-     */
-    TagLUT get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
-
-    /** Generate the build options used in the component
-     *
-     * @param[in] comp_group Component group of which the component is a part of
-     *
-     * @return CLBuildOptions Build options
-     */
-    CLBuildOptions get_build_options(const ComponentGroup &comp_group) const override;
-
-    /** Generate the component config id string used for tuning */
-    std::string get_config_id() const override;
-
-    /** Generate the header list used in the component */
-    std::set<std::string> get_headers_list() const override;
-
-    /** Generate the execution window for the component */
-    Window get_window() const override;
-
-private:
-    const ITensorInfo *_lhs;
-    const ITensorInfo *_rhs;
-    const ITensorInfo *_dst;
-    Attributes         _attributes;
-};
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEELEMENTWISEBINARY */
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.cpp
deleted file mode 100644
index 522c33a022..0000000000
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.cpp
+++ /dev/null
@@ -1,267 +0,0 @@
-/*
- * Copyright (c) 2022-2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.h"
-
-#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
-#include "arm_compute/core/utils/StringUtils.h"
-
-#include "src/core/helpers/WindowHelpers.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-namespace
-{
-constexpr unsigned int serial_vector_size = 8;
-} // namespace
-ClTemplateLogits1DMaxShiftExpSum::ClTemplateLogits1DMaxShiftExpSum(ComponentId                      id,
-                                                                   const ArgumentPack<ITensorInfo> &tensors,
-                                                                   const Attributes                &attributes)
-    : IGpuTemplateComponentWriter{id, tensors}, _src{}, _sum{}, _dst{}, _attributes{attributes}
-{
-    _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
-    _sum = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
-    _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_1);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(_src);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(_sum);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(_dst);
-}
-
-std::string ClTemplateLogits1DMaxShiftExpSum::get_name() const
-{
-    return "logits_1d_max_shift_exp_sum";
-}
-
-std::string ClTemplateLogits1DMaxShiftExpSum::get_component_code(const ComponentGroup &comp_group) const
-{
-    ARM_COMPUTE_UNUSED(comp_group);
-
-    std::string code = R"_(
-//------------------ START KERNEL {{meta_kernel_id}} ---------------------
-#define VEC_TYPE VEC_DATA_TYPE({{DATA_TYPE}}, N0)
-#define SELECT_TYPE SELECT_VEC_DATA_TYPE({{DATA_TYPE}}, N0)
-{
-    __global uchar *src_addr = {{src}}_ptr + {{src}}_offset_first_element_in_bytes + g_ind_1 * {{src}}_stride_y + g_ind_2 * {{src}}_stride_z;
-    __global uchar *dst_addr = {{dst}}_ptr + {{dst}}_offset_first_element_in_bytes + g_ind_1 * {{dst}}_stride_y + g_ind_2 * {{dst}}_stride_z;
-    Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT({{sum}});
-    VEC_TYPE max_val_vec = (VEC_TYPE)({{MINVAL}});
-)_";
-
-    const bool beta_defined = (_attributes.beta() != 1.f);
-
-    if (beta_defined)
-    {
-        code += R"_(
-    VEC_TYPE beta = (VEC_TYPE){{BETA}};
-)_";
-    }
-
-    constexpr unsigned int _serial_vector_size = 8;
-    const unsigned int     reduction_dim_size  = _src->dimension(0);
-    const unsigned int     vector_size         = adjust_vec_size(_serial_vector_size, reduction_dim_size);
-    const bool             non_multiple_of_n0  = ((reduction_dim_size % vector_size) != 0);
-
-    if (non_multiple_of_n0)
-    {
-        code += R"_(
-    VEC_TYPE data    = VLOAD(N0)(0, (__global {{DATA_TYPE}} *)src_addr);
-    SELECT_TYPE widx = (SELECT_TYPE)PARTIAL_N0 > VEC_OFFS(SELECT_DATA_TYPE({{DATA_TYPE}}), N0);
-    max_val_vec      = max(max_val_vec, select((VEC_TYPE)({{MINVAL}}), data, widx));
-)_";
-    }
-
-    code += R"_(
-    for(uint i = PARTIAL_N0; i < {{SRC_WIDTH}}; i += N0)
-    {
-        VEC_TYPE data = VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(src_addr + i * sizeof({{DATA_TYPE}})));
-        max_val_vec   = max(data, max_val_vec);
-    }
-
-    {{DATA_TYPE}} max_val = MAX_REDUCE(max_val_vec, N0);
-    VEC_TYPE sum1D = 0;
-)_";
-
-    if (non_multiple_of_n0)
-    {
-        code += R"_(
-    data -= max_val;
-)_";
-        if (beta_defined)
-        {
-            code += R"_(
-    data *= beta;
-)_";
-        }
-
-        if (_attributes.is_log_softmax())
-        {
-            code += R"_(
-    VSTORE_PARTIAL(N0, PARTIAL_N0)
-    (data, 0, (__global {{DATA_TYPE}} *)dst_addr);
-    data = exp(data);
-    data = select(0, data, widx);
-)_";
-        }
-        else
-        {
-            code += R"_(
-    data = exp(data);
-    data = select(0, data, widx);
-    VSTORE_PARTIAL(N0, PARTIAL_N0)
-    (data, 0, (__global {{DATA_TYPE}} *)dst_addr);
-)_";
-        }
-
-        code += R"_(
-    sum1D += data;
-)_";
-    }
-    code += R"_(
-    for(uint i = PARTIAL_N0; i < {{SRC_WIDTH}}; i += N0)
-    {
-        VEC_TYPE data = VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(src_addr + i * sizeof({{DATA_TYPE}})));
-        data -= max_val;
-)_";
-
-    if (beta_defined)
-    {
-        code += R"_(
-    data *= beta;
-)_";
-    }
-
-    if (_attributes.is_log_softmax())
-    {
-        code += R"_(
-    VSTORE(N0)
-    (data, 0, (__global {{DATA_TYPE}} *)(dst_addr + i * sizeof({{DATA_TYPE}})));
-    data = exp(data);
-)_";
-    }
-    else
-    {
-        code += R"_(
-    data = exp(data);
-    VSTORE(N0)
-    (data, 0, (__global {{DATA_TYPE}} *)(dst_addr + i * sizeof({{DATA_TYPE}})));
-)_";
-    }
-
-    code += R"_(
-    sum1D += data;
-    }
-)_";
-
-    code += R"_(
-    *((__global {{DATA_TYPE}} *)sum.ptr) = SUM_REDUCE(sum1D, N0);
-}
-//------------------ END KERNEL {{meta_kernel_id}} ---------------------
-)_";
-
-    return code;
-}
-
-void ClTemplateLogits1DMaxShiftExpSum::declare_variables(GpuKernelVariableTable &vtable,
-                                                         const ComponentGroup   &comp_group) const
-{
-    vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), "src");
-
-    vtable.declare_variable(comp_group, _sum, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), "sum");
-
-    vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), "dst");
-}
-
-TagLUT ClTemplateLogits1DMaxShiftExpSum::get_tag_lut(const GpuKernelVariableTable &vtable,
-                                                     const ComponentGroup         &comp_group) const
-{
-    ARM_COMPUTE_UNUSED(comp_group);
-
-    TagLUT lut{};
-
-    // Arguments and global shared variables
-    lut["src"] = vtable.get_variable(_src);
-    lut["sum"] = vtable.get_variable(_sum);
-    lut["dst"] = vtable.get_variable(_dst);
-
-    // Local build options
-    lut["meta_kernel_id"] = id();
-
-    const DataType data_type = _src->data_type();
-
-    lut["DATA_TYPE"] = get_cl_type_from_data_type(data_type);
-    lut["BETA"]      = float_to_string_with_full_precision(_attributes.beta());
-    lut["MINVAL"]    = (data_type == DataType::F16) ? std::string("-HALF_MAX") : std::string("-FLT_MAX");
-    lut["SRC_WIDTH"] = support::cpp11::to_string(_src->dimension(0));
-
-    return lut;
-}
-
-CLBuildOptions ClTemplateLogits1DMaxShiftExpSum::get_build_options(const ComponentGroup &comp_group) const
-{
-    ARM_COMPUTE_UNUSED(comp_group);
-    CLBuildOptions build_opts{};
-
-    const unsigned int reduction_dim_size = _src->dimension(0);
-    const unsigned int vector_size        = adjust_vec_size(serial_vector_size, reduction_dim_size);
-
-    build_opts.add_option("-DN0=" + support::cpp11::to_string(vector_size));
-    build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string((reduction_dim_size % vector_size)));
-
-    return build_opts;
-}
-
-std::string ClTemplateLogits1DMaxShiftExpSum::get_config_id() const
-{
-    std::string config_id = get_name();
-
-    config_id += "_";
-    config_id += support::cpp11::to_string(_src->dimension(0));
-    config_id += "_";
-    config_id += string_from_data_type(_src->data_type());
-
-    return config_id;
-}
-
-std::set<std::string> ClTemplateLogits1DMaxShiftExpSum::get_headers_list() const
-{
-    return std::set<std::string>{"helpers.h", "tile_helpers.h"};
-}
-
-Window ClTemplateLogits1DMaxShiftExpSum::get_window() const
-{
-    ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
-
-    Window win = calculate_max_window(*_dst, Steps(_src->dimension(0)));
-    return win.collapse(win, Window::DimZ);
-}
-
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.h
deleted file mode 100644
index ac9ddaa9d4..0000000000
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.h
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATELOGITS1DMAXSHIFTEXPSUM
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATELOGITS1DMAXSHIFTEXPSUM
-
-#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-class ClTemplateLogits1DMaxShiftExpSum final : public IGpuTemplateComponentWriter
-{
-public:
-    using Attributes = ClComponentLogits1DMaxShiftExpSum::Attributes;
-
-    /** Constructor
-     *
-     * @param[in] id         Component id
-     * @param[in] tensors    Tensor arguments to the components
-     * @param[in] attributes Component attributes
-     */
-    ClTemplateLogits1DMaxShiftExpSum(ComponentId                      id,
-                                     const ArgumentPack<ITensorInfo> &tensors,
-                                     const Attributes                &attributes);
-    /** Prevent instances of this class from being copy constructed */
-    ClTemplateLogits1DMaxShiftExpSum(const ClTemplateLogits1DMaxShiftExpSum &) = delete;
-    /** Prevent instances of this class from being copied */
-    ClTemplateLogits1DMaxShiftExpSum &operator=(const ClTemplateLogits1DMaxShiftExpSum &) = delete;
-    /** Allow instances of this class to be move constructed */
-    ClTemplateLogits1DMaxShiftExpSum(ClTemplateLogits1DMaxShiftExpSum &&) = default;
-    /** Allow instances of this class to be moved */
-    ClTemplateLogits1DMaxShiftExpSum &operator=(ClTemplateLogits1DMaxShiftExpSum &&) = default;
-    /** Generate kernel component name */
-    std::string get_name() const override;
-    /** Generate kernel component code template
-     *
-     * @param[in] comp_group Component group of which the component is a part of
-     *
-     * @return std::string Component code
-     */
-    std::string get_component_code(const ComponentGroup &comp_group) const override;
-    /** Declare all variables used by the component in the @p vtable
-     *
-     * @param[out] vtable     Variable table
-     * @param[in]  comp_group Component group of which the component is a part of
-     */
-    void declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
-    /** Generate the tag look-up table used to instantiate the component code.
-     *
-     * @param[in] vtable     Variable table
-     * @param[in] comp_group Component group of which the component is a part of
-     *
-     * @return TagLUT  Tag lookup table
-     */
-    TagLUT get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
-    /** Generate the build options used in the component
-     *
-     * @param[in] comp_group Component group of which the component is a part of
-     *
-     * @return CLBuildOptions Build options
-     */
-    CLBuildOptions get_build_options(const ComponentGroup &comp_group) const override;
-    /** Generate the component config id string used for tuning */
-    std::string get_config_id() const override;
-    /** Generate the header list used in the component */
-    std::set<std::string> get_headers_list() const override;
-    /** Generate the execution window for the component */
-    Window get_window() const override;
-
-private:
-    const ITensorInfo *_src; // input
-    const ITensorInfo *_sum; // exponentiated and summed input
-    const ITensorInfo *_dst; // exponentiated input
-    Attributes         _attributes;
-};
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATELOGITS1DMAXSHIFTEXPSUM */
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.cpp
deleted file mode 100644
index 7d7c3e6673..0000000000
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.cpp
+++ /dev/null
@@ -1,171 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.h"
-
-#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
-
-#include "src/core/helpers/WindowHelpers.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-ClTemplateLogits1DNorm::ClTemplateLogits1DNorm(ComponentId                      id,
-                                               const ArgumentPack<ITensorInfo> &tensors,
-                                               const Attributes                &attributes)
-    : IGpuTemplateComponentWriter{id, tensors}, _src{}, _sum{}, _dst{}, _attributes{attributes}
-{
-    _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
-    _sum = this->tensors().get_const_tensor(TensorType::ACL_SRC_1);
-    _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(_src);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(_sum);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(_dst);
-}
-
-std::string ClTemplateLogits1DNorm::get_name() const
-{
-    return "logits_1d_norm";
-}
-
-std::string ClTemplateLogits1DNorm::get_component_code(const ComponentGroup &comp_group) const
-{
-    ARM_COMPUTE_UNUSED(comp_group);
-
-    std::string code = R"_(
-//------------------ START KERNEL {{meta_kernel_id}} ---------------------
-{
-    const int x_offs = g_ind_0 * sizeof({{DATA_TYPE}});
-    __global uchar *src_addr = {{src}}_ptr + {{src}}_offset_first_element_in_bytes + x_offs + g_ind_1 * {{src}}_stride_y + g_ind_2 * {{src}}_stride_z;
-    __global uchar *dst_addr = {{dst}}_ptr + {{dst}}_offset_first_element_in_bytes + x_offs + g_ind_1 * {{dst}}_stride_y + g_ind_2 * {{dst}}_stride_z;
-    Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP({{sum}});
-)_";
-    // Load max value of 1D logits vector (row)
-    code += R"_(
-    {{DATA_TYPE}} sum_val = *((__global {{DATA_TYPE}} *)offset(&sum, 0, g_ind_1));
-    VEC_DATA_TYPE({{DATA_TYPE}}, N0)
-    data0 = VLOAD(N0)(0, (__global {{DATA_TYPE}} *)src_addr);
-)_";
-
-    if (_attributes.is_log_softmax())
-    {
-        code += R"_(
-    sum_val = log(sum_val);
-    data0 -= sum_val;
-)_";
-    }
-    else
-    {
-        code += R"_(
-    data0 /= sum_val;
-)_";
-    }
-
-    code += R"_(
-    STORE_VECTOR_SELECT(data, {{DATA_TYPE}}, dst_addr, N0, PARTIAL_N0, PARTIAL_N0 != 0 && g_ind_0 == 0);
-}
-//------------------ END KERNEL {{meta_kernel_id}} ---------------------
-)_";
-
-    return code;
-}
-
-void ClTemplateLogits1DNorm::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
-{
-    vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), "src");
-
-    vtable.declare_variable(comp_group, _sum, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), "sum");
-
-    vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), "dst");
-}
-
-TagLUT ClTemplateLogits1DNorm::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
-{
-    ARM_COMPUTE_UNUSED(comp_group);
-
-    TagLUT lut{};
-
-    // Arguments and global shared variables
-    lut["src"] = vtable.get_variable(_src);
-    lut["sum"] = vtable.get_variable(_sum);
-    lut["dst"] = vtable.get_variable(_dst);
-
-    // Local build options
-    lut["meta_kernel_id"] = id();
-
-    const DataType data_type = _src->data_type();
-
-    lut["DATA_TYPE"] = get_cl_type_from_data_type(data_type);
-
-    return lut;
-}
-
-CLBuildOptions ClTemplateLogits1DNorm::get_build_options(const ComponentGroup &comp_group) const
-{
-    ARM_COMPUTE_UNUSED(comp_group);
-    CLBuildOptions build_opts{};
-
-    const auto         root_window = comp_group.get_root_component()->template_writer()->get_window();
-    const unsigned int n0          = root_window.x().step();
-    build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
-    build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string((_src->dimension(0) % n0)));
-
-    return build_opts;
-}
-
-std::string ClTemplateLogits1DNorm::get_config_id() const
-{
-    std::string config_id = get_name();
-
-    config_id += "_";
-    config_id += support::cpp11::to_string(_src->dimension(0));
-    config_id += "_";
-    config_id += string_from_data_type(_src->data_type());
-
-    return config_id;
-}
-
-std::set<std::string> ClTemplateLogits1DNorm::get_headers_list() const
-{
-    return std::set<std::string>{"helpers.h", "tile_helpers.h"};
-}
-
-Window ClTemplateLogits1DNorm::get_window() const
-{
-    ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
-    constexpr unsigned int serial_vector_size = 16;
-    const unsigned int     vector_size        = adjust_vec_size(serial_vector_size, _src->dimension(0));
-
-    Window win = calculate_max_window(*_src, Steps(vector_size));
-    return win.collapse(win, Window::DimZ);
-}
-
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.h
deleted file mode 100644
index 5a74be5842..0000000000
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.h
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATELOGITS1DNORM
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATELOGITS1DNORM
-
-#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-class ClTemplateLogits1DNorm final : public IGpuTemplateComponentWriter
-{
-public:
-    using Attributes = ClComponentLogits1DNorm::Attributes;
-
-    /** Constructor
-     *
-     * @param[in] id         Component id
-     * @param[in] tensors    Tensor arguments to the components
-     * @param[in] attributes Component attributes
-     */
-    ClTemplateLogits1DNorm(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes);
-    /** Prevent instances of this class from being copy constructed */
-    ClTemplateLogits1DNorm(const ClTemplateLogits1DNorm &) = delete;
-    /** Prevent instances of this class from being copied */
-    ClTemplateLogits1DNorm &operator=(const ClTemplateLogits1DNorm &) = delete;
-    /** Allow instances of this class to be move constructed */
-    ClTemplateLogits1DNorm(ClTemplateLogits1DNorm &&) = default;
-    /** Allow instances of this class to be moved */
-    ClTemplateLogits1DNorm &operator=(ClTemplateLogits1DNorm &&) = default;
-    /** Generate kernel component name */
-    std::string get_name() const override;
-    /** Generate kernel component code template
-     *
-     * @param[in] comp_group Component group of which the component is a part of
-     *
-     * @return std::string Component code
-     */
-    std::string get_component_code(const ComponentGroup &comp_group) const override;
-    /** Declare all variables used by the component in the @p vtable
-     *
-     * @param[out] vtable     Variable table
-     * @param[in]  comp_group Component group of which the component is a part of
-     */
-    void declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
-    /** Generate the tag look-up table used to instantiate the component code.
-     *
-     * @param[in] vtable     Variable table
-     * @param[in] comp_group Component group of which the component is a part of
-     *
-     * @return TagLUT  Tag lookup table
-     */
-    TagLUT get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
-    /** Generate the build options used in the component
-     *
-     * @param[in] comp_group Component group of which the component is a part of
-     *
-     * @return CLBuildOptions Build options
-     */
-    CLBuildOptions get_build_options(const ComponentGroup &comp_group) const override;
-    /** Generate the component config id string used for tuning */
-    std::string get_config_id() const override;
-    /** Generate the header list used in the component */
-    std::set<std::string> get_headers_list() const override;
-    /** Generate the execution window for the component */
-    Window get_window() const override;
-
-private:
-    const ITensorInfo *_src; // exponentiated input
-    const ITensorInfo *_sum; // exponentiated and summed input
-    const ITensorInfo *_dst; // normalization of input with _sum
-
-    Attributes _attributes;
-};
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATELOGITS1DNORM */
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp
deleted file mode 100644
index 8936db6abe..0000000000
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp
+++ /dev/null
@@ -1,470 +0,0 @@
-/*
- * Copyright (c) 2023-2024 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "ClTemplatePool2d.h"
-
-#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/StringUtils.h"
-
-#include "src/core/helpers/WindowHelpers.h"
-#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-namespace
-{
-// Shape indexes for NHWC Datalayout
-constexpr static int32_t height_idx  = 2;
-constexpr static int32_t width_idx   = 1;
-constexpr static int32_t channel_idx = 0;
-} // namespace
-ClTemplatePool2d::ClTemplatePool2d(ComponentId                      id,
-                                   const ArgumentPack<ITensorInfo> &tensors,
-                                   const Attributes                &attributes,
-                                   const Settings                  &settings)
-    : IGpuTemplateComponentWriter{id, tensors}, _src{}, _dst{}, _attributes{attributes}, _settings{settings}
-{
-    _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
-    _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
-}
-
-std::string ClTemplatePool2d::get_name() const
-{
-    return "pool2d";
-}
-
-std::string ClTemplatePool2d::get_component_code(const ComponentGroup &comp_group) const
-{
-    ARM_COMPUTE_UNUSED(comp_group);
-
-    // Condition to use 2x2 optimized kernel
-    if (_attributes.pool_size() == Size2D(2, 2))
-    {
-        return get_2x2_kernel_code();
-    }
-    else
-    {
-        return get_MxN_kernel_code();
-    }
-}
-
-std::string ClTemplatePool2d::get_MxN_kernel_code() const
-{
-    const auto pool_type          = _attributes.pool_type();
-    const bool fp_mixed_precision = (_src->data_type() == DataType::F16) && pool_type != PoolingType::MAX;
-
-    // Define pool op macro.
-    std::string pool_op = (pool_type == PoolingType::AVG) ? R"_(#define POOL_OP(x,y) ((x) + (y)))_"
-                                                          : R"_(#define POOL_OP(x,y) (fmax((x), (y))) )_";
-
-    // Kernel start
-    // Note: If C is not multiple of N0, we shift back of PARTIAL_N0 elements to compute the leftover elements for get_global_id(0) == 0
-    // Note: If C is less than N0, N0 should be SHRINKED to the closest smaller N0. This operation is performed on the host side
-    std::string code = R"_(
-//------------------ START KERNEL {{meta_kernel_id}} ---------------------
-// IN_0(src)            {{src}}
-// OUT(dst, accum)      {{dst}}
-
-{
-    const int idx_out_c = g_ind_0;
-    const int idx_out_w = g_ind_1;
-)_";
-
-    // Add macro for POOL_OP
-    code += "\n" + pool_op + "\n";
-
-    code += R"_(
-    const int idx_out_h = g_ind_2 % {{DST_HEIGHT}};
-    const int idx_out_n = g_ind_2 / {{DST_HEIGHT}};
-)_";
-
-    // Define common variables.
-    code += R"_(
-    __global unsigned char *in_base_ptr = {{src}}_ptr + {{src}}_offset_first_element_in_bytes + idx_out_c * sizeof({{DATA_TYPE}}) + idx_out_n * {{src}}_stride_w;
-
-    __global unsigned char *out_base_ptr = {{dst}}_ptr + {{dst}}_offset_first_element_in_bytes + idx_out_c * sizeof({{DATA_TYPE}}) + idx_out_w * {{dst}}_stride_y + idx_out_h * {{dst}}_stride_z + idx_out_n * {{dst}}_stride_w;
-
-    VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)
-    res0 = {{INITIAL_VALUE}};
-
-    const int idx_in_w = idx_out_w * {{STRIDE_X}} - {{PAD_X}};
-    const int idx_in_h = idx_out_h * {{STRIDE_Y}} - {{PAD_Y}};
-
-    const int pool_x_s = max((int)0, -idx_in_w);
-    const int pool_x_e = min((int){{POOL_SIZE_X}}, (int){{SRC_WIDTH}} - idx_in_w);
-    const int pool_y_s = max((int)0, -idx_in_h);
-    const int pool_y_e = min((int){{POOL_SIZE_Y}}, (int){{SRC_HEIGHT}} - idx_in_h);
-)_";
-
-    // Determine filter size depending on if padding is excluded or not
-    if (_attributes.exclude_padding())
-    {
-        code += R"_(
-    const int filter_size = (pool_y_e - pool_y_s) * (pool_x_e - pool_x_s);
-)_";
-    }
-    else
-    {
-        code += R"_(
-    const int filter_size = {{POOL_SIZE_X}} * {{POOL_SIZE_Y}};
-)_";
-    }
-
-    // Loop through pool size
-    // if global pooling
-    if (_attributes.pool_size().x() == _src->dimension(width_idx) &&
-        _attributes.pool_size().y() == _src->dimension(height_idx))
-    {
-        // Begin loop
-        code += R"_(
-    // Global pooling path
-    for(int y = 0; y < {{POOL_SIZE_Y}}; ++y)
-    {
-    #pragma unroll 8
-        for(int x = 0; x < {{POOL_SIZE_X}}; ++x)
-        {
-            VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)
-            data0;
-)_";
-    }
-    else // if local pooling size
-    {
-        code += R"_(
-    for(int y = pool_y_s; y < pool_y_e; ++y)
-    {
-    #pragma unroll 8
-        for(int x = pool_x_s; x < pool_x_e; ++x)
-        {
-            VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)
-            data0;
-)_";
-    } // end else
-
-    // if condition inside loop - use 32bit acc if mixed_precision.
-    // End loop through pooling section.
-    if (fp_mixed_precision)
-    {
-        // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE
-        code += R"_(
-            data0 = CONVERT(VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + (x + idx_in_w) * {{src}}_stride_y + (y + idx_in_h) * {{src}}_stride_z)), VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0));
-            res0 = POOL_OP(res0, data0);
-        }
-    }
-)_";
-    }
-    else // load data, compute result and end loop
-    {
-        code += R"_(
-            data0 = VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + (x + idx_in_w) * {{src}}_stride_y + (y + idx_in_h) * {{src}}_stride_z));
-            res0 = POOL_OP(res0, data0);
-        }
-    }
-)_";
-    }
-
-    // For Pool AVG ONLY, divide pool output by filter size
-    if (pool_type == PoolingType::AVG)
-    {
-        code += R"_(
-    res0 /= (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0))filter_size;
-)_";
-    }
-
-    // If mixed precision convert datatype before storing. Then end kernel.
-    if (fp_mixed_precision)
-    {
-        code += R"_(
-    VEC_DATA_TYPE({{DATA_TYPE}}, N0)
-    res_converted0 = CONVERT(res0, VEC_DATA_TYPE({{DATA_TYPE}}, N0));
-    STORE_VECTOR_SELECT(res_converted, {{DATA_TYPE}}, out_base_ptr, N0, PARTIAL_N0, (PARTIAL_N0 != 0) && g_ind_0 == 0);
-)_";
-    }
-    else
-    {
-        // Store data
-        code += R"_(
-    STORE_VECTOR_SELECT(res, {{DATA_TYPE}}, out_base_ptr, N0, PARTIAL_N0, (PARTIAL_N0 != 0) && g_ind_0 == 0);
-)_";
-    }
-
-    code += R"_(
-//------------------ END KERNEL {{meta_kernel_id}} ---------------------
-}
-)_";
-
-    return code;
-}
-
-std::string ClTemplatePool2d::get_2x2_kernel_code() const
-{
-    const auto  pool_type          = _attributes.pool_type();
-    const bool  fp_mixed_precision = (_src->data_type() == DataType::F16) && pool_type != PoolingType::MAX;
-    std::string pool_op            = (pool_type == PoolingType::AVG) ? R"_(#define POOL_OP(x,y) ((x) + (y)))_"
-                                                                     : R"_(#define POOL_OP(x,y) (fmax((x), (y))) )_";
-
-    std::string code = R"_(
-//------------------ START KERNEL {{meta_kernel_id}} ---------------------
-// IN_0(src)            {{src}}
-// OUT(dst, accum)      {{dst}}
-
-#define SELECT_TYPE SELECT_VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)
-
-{
-    const int idx_out_c = g_ind_0;
-    const int idx_out_w = g_ind_1;
-)_";
-
-    // Add pool op macro
-    code += "\n" + pool_op + "\n";
-
-    // If batch size != 1, the batch size dimension is collapsed over the height dimension
-    code += R"_(
-    const int idx_out_h = g_ind_2 % {{DST_HEIGHT}};
-    const int idx_out_n = g_ind_2 / {{DST_HEIGHT}};
-)_";
-
-    code += R"_(
-    const int idx_in_w = idx_out_w * {{STRIDE_X}} - {{PAD_X}};
-    const int idx_in_h = idx_out_h * {{STRIDE_Y}} - {{PAD_Y}};
-
-    __global unsigned char *in_base_ptr = {{src}}_ptr + {{src}}_offset_first_element_in_bytes + idx_out_c * sizeof({{DATA_TYPE}}) + idx_out_n * {{src}}_stride_w;
-    __global unsigned char *out_base_ptr = {{dst}}_ptr + {{dst}}_offset_first_element_in_bytes + idx_out_c * sizeof({{DATA_TYPE}}) + idx_out_w * {{dst}}_stride_y + idx_out_h * {{dst}}_stride_z + idx_out_n *
-                                           {{dst}}_stride_w;
-    const int pool_x_s = max((int)0, -idx_in_w);
-    const int pool_x_e = min((int)2, (int){{SRC_WIDTH}} - idx_in_w);
-    const int pool_y_s = max((int)0, -idx_in_h);
-    const int pool_y_e = min((int)2, (int){{SRC_HEIGHT}} - idx_in_h);
-
-    const int filter_size = (pool_x_e - pool_x_s) * (pool_y_e - pool_y_s);
-    const int x0 = pool_x_s + idx_in_w;
-    const int y0 = pool_y_s + idx_in_h;
-    const int x1 = pool_x_e - 1 + idx_in_w;
-    const int y1 = pool_y_e - 1 + idx_in_h;
-
-    REPEAT_VAR_INIT_TO_CONST(4, VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0), data, 0);
-)_";
-
-    if (fp_mixed_precision)
-    {
-        // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE
-        code += R"_(
-    data0 = CONVERT(VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + x0 * {{src}}_stride_y + y0 * {{src}}_stride_z)), VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0));
-    data1 = CONVERT(VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + x1 * {{src}}_stride_y + y0 * {{src}}_stride_z)), VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0));
-    data2 = CONVERT(VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + x0 * {{src}}_stride_y + y1 * {{src}}_stride_z)), VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0));
-    data3 = CONVERT(VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + x1 * {{src}}_stride_y + y1 * {{src}}_stride_z)), VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0));
-)_";
-    }
-    else
-    {
-        code += R"_(
-    data0         = VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + x0 * {{src}}_stride_y + y0 * {{src}}_stride_z));
-    data1         = VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + x1 * {{src}}_stride_y + y0 * {{src}}_stride_z));
-    data2         = VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + x0 * {{src}}_stride_y + y1 * {{src}}_stride_z));
-    data3         = VLOAD(N0)(0, (__global {{DATA_TYPE}} *)(in_base_ptr + x1 * {{src}}_stride_y + y1 * {{src}}_stride_z));
-)_";
-    }
-
-    if (pool_type != PoolingType::MAX)
-    {
-        // Make invalid the values loaded if the x or y coordinate was clamped (out-of-bound)
-        code += R"_(
-    if(filter_size != 4)
-    {
-        SELECT_TYPE cond_w_s = (SELECT_TYPE)idx_in_w < (SELECT_TYPE)0;
-        SELECT_TYPE cond_w_e = (SELECT_TYPE)idx_in_w >= (SELECT_TYPE)({{SRC_WIDTH}} - 1);
-        SELECT_TYPE cond_h_s = (SELECT_TYPE)idx_in_h < (SELECT_TYPE)0;
-        SELECT_TYPE cond_h_e = (SELECT_TYPE)idx_in_h >= (SELECT_TYPE)({{SRC_HEIGHT}} - 1);
-
-        data0 = select(data0, (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)){{INITIAL_VALUE}}, (SELECT_TYPE)(cond_w_s | cond_h_s));
-        data1 = select(data1, (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)){{INITIAL_VALUE}}, (SELECT_TYPE)(cond_w_e | cond_h_s));
-        data2 = select(data2, (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)){{INITIAL_VALUE}}, (SELECT_TYPE)(cond_w_s | cond_h_e));
-        data3 = select(data3, (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)){{INITIAL_VALUE}}, (SELECT_TYPE)(cond_w_e | cond_h_e));
-    }
-)_";
-    }
-
-    code += R"_(
-    VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0)
-    res0 = data0;
-    res0 = POOL_OP(res0, data1);
-    res0 = POOL_OP(res0, data2);
-    res0 = POOL_OP(res0, data3);
-)_";
-
-    if (pool_type == PoolingType::AVG)
-    {
-        // If avg pooling divide result accordingly.
-        if (_attributes.exclude_padding())
-        {
-            code += R"_(
-    res0 /= (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0))filter_size;
-)_";
-        }
-        else
-        {
-            code += R"_(
-    res0 /= (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0))4;
-)_";
-        }
-    }
-
-    // Store result
-    if (fp_mixed_precision)
-    {
-        code += R"_(
-    VEC_DATA_TYPE({{DATA_TYPE}}, N0)
-    res_converted0 = CONVERT(res0, VEC_DATA_TYPE({{DATA_TYPE}}, N0));
-    STORE_VECTOR_SELECT(res_converted, {{DATA_TYPE}}, out_base_ptr, N0, PARTIAL_N0, (PARTIAL_N0 != 0) && g_ind_0 == 0);
-)_";
-    }
-    else
-    {
-        code += R"_(
-    STORE_VECTOR_SELECT(res, {{DATA_TYPE}}, out_base_ptr, N0, PARTIAL_N0, (PARTIAL_N0 != 0) && g_ind_0 == 0);
-)_";
-    }
-
-    code += R"_(
-    //------------------ END KERNEL {{meta_kernel_id}} ---------------------
-}
-#undef SELECT_TYPE
-)_";
-
-    return code;
-}
-
-void ClTemplatePool2d::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
-{
-    vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
-                            "src");
-
-    vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
-                            "dst");
-}
-
-TagLUT ClTemplatePool2d::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
-{
-    ARM_COMPUTE_UNUSED(comp_group);
-
-    TagLUT lut{};
-    // Arguments and global shared variables
-    lut["src"] = vtable.get_variable(_src);
-    lut["dst"] = vtable.get_variable(_dst);
-
-    // Local build options
-    lut["meta_kernel_id"] = id();
-
-    // Retrieve relevant data
-    const auto padding   = _attributes.pad();
-    const auto stride    = _attributes.stride();
-    const auto pool_size = _attributes.pool_size();
-    const auto data_type = _src->data_type();
-    const auto use_fp_mixed_precision =
-        (_src->data_type() == DataType::F16) && _attributes.pool_type() != PoolingType::MAX;
-    const std::string max_initial_value =
-        _settings.use_inf_as_limit() ? "(-INFINITY)"
-                                     : float_to_string_with_full_precision(std::numeric_limits<float>::lowest());
-
-    // pool specific
-    lut["STRIDE_X"]    = stride.x();
-    lut["STRIDE_Y"]    = stride.y();
-    lut["PAD_X"]       = padding.left;
-    lut["PAD_Y"]       = padding.top;
-    lut["POOL_SIZE_X"] = pool_size.width;
-    lut["POOL_SIZE_Y"] = pool_size.height;
-
-    // Datatypes and variables
-    lut["ACC_DATA_TYPE"] = get_cl_type_from_data_type(
-        (use_fp_mixed_precision) ? (DataType::F32) : (data_type)); // Type of accumulators to use.
-    lut["DATA_TYPE"]     = get_cl_type_from_data_type(data_type);
-    lut["SRC_WIDTH"]     = _src->dimension(width_idx);
-    lut["SRC_HEIGHT"]    = _src->dimension(height_idx);
-    lut["INITIAL_VALUE"] = (_attributes.pool_type() == PoolingType::MAX) ? max_initial_value : std::string("0");
-
-    // Tensor specific data
-    lut["DST_HEIGHT"] = _dst->dimension(height_idx);
-
-    return lut;
-}
-
-CLBuildOptions ClTemplatePool2d::get_build_options(const ComponentGroup &comp_group) const
-{
-    const auto         root_window      = comp_group.get_root_component()->template_writer()->get_window();
-    const unsigned int n0               = root_window.x().step();
-    const unsigned int partial_store_n0 = _dst->dimension(0) % n0;
-
-    CLBuildOptions build_opts{};
-    build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
-    build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));
-
-    return build_opts;
-}
-
-std::string ClTemplatePool2d::get_config_id() const
-{
-    const DataType   data_type   = _src->data_type();
-    const DataLayout data_layout = _src->data_layout();
-
-    std::string config_id{};
-    config_id += "pooling_layer_2d_";
-    config_id += lower_string(string_from_data_type(data_type));
-    config_id += "_";
-    config_id += lower_string(string_from_data_layout(data_layout));
-    config_id += "_";
-    config_id += support::cpp11::to_string(_dst->dimension(width_idx));
-    config_id += "_";
-    config_id += support::cpp11::to_string(_dst->dimension(height_idx));
-    config_id += "_";
-    config_id += support::cpp11::to_string(_dst->dimension(channel_idx));
-
-    return config_id;
-}
-
-std::set<std::string> ClTemplatePool2d::get_headers_list() const
-{
-    return std::set<std::string>{"helpers.h", "tile_helpers.h", "repeat.h"};
-}
-
-Window ClTemplatePool2d::get_window() const
-{
-    ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
-    const auto         output_shape = _dst->tensor_shape();
-    const unsigned int vec_size = adjust_vec_size(((_dst->data_type() == DataType::F32) ? 2 : 4), _dst->dimension(0));
-
-    // Create and configure kernel window
-    auto win = calculate_max_window(output_shape, Steps(vec_size));
-    win      = win.collapse_if_possible(win, Window::DimZ); // collapse window on batch size.
-    return win;
-}
-
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.h
deleted file mode 100644
index d1d3c01669..0000000000
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.h
+++ /dev/null
@@ -1,132 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEPOOL2D
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEPOOL2D
-
-#include "arm_compute/core/experimental/Types.h"
-#include "arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h"
-#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h"
-
-#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-class ClTemplatePool2d final : public IGpuTemplateComponentWriter
-{
-public:
-    using Attributes = ClComponentPool2d::Attributes;
-    using Settings   = ClComponentPool2d::Settings;
-    /** Constructor
-     *
-     * @param[in] id         Component id
-     * @param[in] tensors    Tensor arguments to the components
-     * @param[in] attributes Component attributes
-     * @param[in] settings   Component settings
-     */
-    ClTemplatePool2d(ComponentId                      id,
-                     const ArgumentPack<ITensorInfo> &tensors,
-                     const Attributes                &attributes,
-                     const Settings                  &settings);
-
-    /** Prevent instances of this class from being copy constructed */
-    ClTemplatePool2d(const ClTemplatePool2d &direct_conv2d) = delete;
-
-    /** Prevent instances of this class from being copied */
-    ClTemplatePool2d &operator=(const ClTemplatePool2d &direct_conv2d) = delete;
-
-    /** Allow instances of this class to be move constructed */
-    ClTemplatePool2d(ClTemplatePool2d &&direct_conv2d) = default;
-
-    /** Allow instances of this class to be moved */
-    ClTemplatePool2d &operator=(ClTemplatePool2d &&direct_conv2d) = default;
-
-    /** Generate kernel component name */
-    std::string get_name() const override;
-
-    /** Generate kernel component code template
-     *
-     * @param[in] comp_group Component group of which the component is a part of
-     *
-     * @return std::string Component code
-     */
-    std::string get_component_code(const ComponentGroup &comp_group) const override;
-    /** Declare all variables used by the component in the @p vtable
-     *
-     * @param[out] vtable     Variable table
-     * @param[in]  comp_group Component group of which the component is a part of
-     */
-    void declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
-    /** Generate the tag look-up table used to instantiate the component code.
-     *
-     * @param[in] vtable     Variable table
-     * @param[in] comp_group Component group of which the component is a part of
-     *
-     * @return TagLUT  Tag lookup table
-     */
-    TagLUT get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
-    /** Generate the build options used in the component
-     *
-     * @param[in] comp_group Component group of which the component is a part of
-     *
-     * @return CLBuildOptions Build options
-     */
-    CLBuildOptions get_build_options(const ComponentGroup &comp_group) const override;
-
-    /** Generate the component config id string used for tuning */
-    std::string get_config_id() const override;
-
-    /** Generate the header list used in the component */
-    std::set<std::string> get_headers_list() const override;
-
-    /** Generate the execution window for the component */
-    Window get_window() const override;
-
-private:
-    /** Generate pooling kernel template code optimized for 2x2 pooling
-     *
-     * @return std::String Component code
-     */
-    std::string get_2x2_kernel_code() const;
-
-    /** Generate generalised pooling kernel template code for MxN pooling
-     *
-     * @return std::String Component code
-     */
-    std::string get_MxN_kernel_code() const;
-
-    const ITensorInfo *_src;
-    const ITensorInfo *_dst;
-    Attributes         _attributes;
-    Settings           _settings;
-};
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEPOOL2D */
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.cpp
deleted file mode 100644
index c882353fcb..0000000000
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.cpp
+++ /dev/null
@@ -1,161 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "ClTemplateReshape.h"
-
-#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
-#include "arm_compute/core/utils/StringUtils.h"
-
-#include "src/core/helpers/WindowHelpers.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-constexpr unsigned int vector_size_byte_opencl = 16;
-
-ClTemplateReshape::ClTemplateReshape(ComponentId id, const ArgumentPack<ITensorInfo> &tensors)
-    : IGpuTemplateComponentWriter{id, tensors}, _src{}, _dst{}
-{
-    _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
-    _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
-}
-
-std::string ClTemplateReshape::get_name() const
-{
-    return "reshape";
-}
-
-std::string ClTemplateReshape::get_component_code(const ComponentGroup &comp_group) const
-{
-    ARM_COMPUTE_UNUSED(comp_group);
-    std::string code;
-
-    code = R"_(
-//------------------ START KERNEL {{meta_kernel_id}} ---------------------
-
-// IN(src)              {{src}}
-// OUT(dst, accum)      {{dst}}
-
-TILE(uint, M0, 1, g_dst_indirect_y);
-{
-    __global uchar * base_src_ptr = {{src}}_ptr + {{src}}_offset_first_element_in_bytes;
-    const int tile_vertical_idx = g_ind_1 * {{arg_dst}}_c + g_ind_2 * {{arg_dst}}_c * {{arg_dst}}_w;
-    LOOP_UNROLLING(int, _m0, 0, 1, M0,
-    {
-        const int row_idx = _m0 * {{arg_dst}}_c + tile_vertical_idx;
-        const int tile_horizontal_idx = g_ind_0 + row_idx;
-        LOOP_UNROLLING(int, _n0, 0, 1, N0,
-        {
-            {{src}}_ptr = base_src_ptr;
-            const int linear_idx = tile_horizontal_idx + _n0;
-            const int in_id_x = linear_idx % {{src}}_c;
-            const int in_id_y = (linear_idx / {{src}}_c) % {{src}}_w;
-            const int in_id_z = linear_idx / ({{src}}_c * {{src}}_w);
-            {{src}}_ptr += in_id_x * sizeof({{DATA_TYPE}}) + in_id_y * {{src}}_stride_y + in_id_z * {{src}}_stride_z;
-            {{dst}}[_m0].s[_n0] = *((__global {{DATA_TYPE}} *){{src}}_ptr);
-        })
-    })
-
-    LOOP_UNROLLING(int, i, 0, 1, M0,
-    {
-        g_dst_indirect_y[i].v = (uint)min((int)(g_ind_1 + i), (int)({{arg_dst}}_w) - 1);
-        g_dst_indirect_y[i].v += (int)(g_ind_2 % {{arg_dst}}_h) * (int)({{arg_dst}}_w);
-        g_dst_indirect_y[i].v += (int)(g_ind_2 / {{arg_dst}}_h) * (int)({{arg_dst}}_w * {{arg_dst}}_h);
-    })
-}
-//------------------ END KERNEL {{meta_kernel_id}} ---------------------
-)_";
-    return code;
-}
-
-void ClTemplateReshape::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
-{
-    vtable.declare_variable(comp_group, _src,
-                            GpuKernelArgumentInfo(common_tensor_type), // GpuKernelArgumentInfo::Type::Image_3D
-                            "src");
-
-    vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(common_tensor_type), "dst");
-}
-
-TagLUT ClTemplateReshape::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
-{
-    ARM_COMPUTE_UNUSED(comp_group);
-    TagLUT lut{};
-
-    // Arguments and global shared variables
-    lut["src"]            = vtable.get_variable(_src);
-    lut["dst"]            = vtable.get_variable(_dst);
-    lut["arg_dst"]        = vtable.get_variable(comp_group.get_any_dst_tensor());
-    lut["meta_kernel_id"] = id();
-    lut["DATA_TYPE"]      = get_cl_type_from_data_type(_dst->data_type());
-
-    return lut;
-}
-
-CLBuildOptions ClTemplateReshape::get_build_options(const ComponentGroup &comp_group) const
-{
-    CLBuildOptions     build_opts{};
-    const auto         root_window      = comp_group.get_root_component()->template_writer()->get_window();
-    const unsigned int n0               = root_window.x().step();
-    const unsigned int m0               = root_window.y().step();
-    const unsigned int partial_store_n0 = _dst->dimension(0) % n0;
-    build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
-    build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
-    build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));
-
-    return build_opts;
-}
-
-std::string ClTemplateReshape::get_config_id() const
-{
-    std::string config_id{};
-    config_id += lower_string(string_from_data_type(_dst->data_type()));
-    config_id += "_";
-    config_id += support::cpp11::to_string(_dst->dimension(0));
-    config_id += "_";
-    config_id += support::cpp11::to_string(_dst->dimension(1));
-
-    return config_id;
-}
-
-std::set<std::string> ClTemplateReshape::get_headers_list() const
-{
-    return std::set<std::string>{"helpers.h", "tile_helpers.h"};
-}
-
-Window ClTemplateReshape::get_window() const
-{
-    ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
-    const unsigned int n0  = adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0));
-    Window             win = calculate_max_window(*_dst, Steps(n0));
-    return win.collapse(win, Window::DimZ);
-}
-
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.h
deleted file mode 100644
index 838a21db6d..0000000000
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.h
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATERESHAPE
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATERESHAPE
-
-#include "arm_compute/core/experimental/Types.h"
-
-#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-class ClTemplateReshape final : public IGpuTemplateComponentWriter
-{
-public:
-    /** Constructor
-     *
-     * @param[in] id      Component id
-     * @param[in] tensors Tensor arguments to the components
-     */
-    ClTemplateReshape(ComponentId id, const ArgumentPack<ITensorInfo> &tensors);
-    /** Prevent instances of this class from being copy constructed */
-    ClTemplateReshape(const ClTemplateReshape &reshape) = delete;
-    /** Prevent instances of this class from being copied */
-    ClTemplateReshape &operator=(const ClTemplateReshape &reshape) = delete;
-    /** Allow instances of this class to be move constructed */
-    ClTemplateReshape(ClTemplateReshape &&reshape) = default;
-    /** Allow instances of this class to be moved */
-    ClTemplateReshape &operator=(ClTemplateReshape &&reshape) = default;
-
-    /** Generate kernel component name */
-    std::string get_name() const override;
-
-    /** Generate kernel component code template
-     *
-     * @param[in] comp_group Component group of which the component is a part of
-     *
-     * @return std::string Component code
-     */
-    std::string get_component_code(const ComponentGroup &comp_group) const override;
-
-    /** Declare all variables used by the component in the @p vtable
-     *
-     * @param[out] vtable     Variable table
-     * @param[in]  comp_group Component group of which the component is a part of
-     */
-    void declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
-
-    /** Generate the tag look-up table used to instantiate the component code.
-     *
-     * @param[in] vtable     Variable table
-     * @param[in] comp_group Component group of which the component is a part of
-     *
-     * @return TagLUT  Tag lookup table
-     */
-    TagLUT get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
-
-    /** Generate the build options used in the component
-     *
-     * @param[in] comp_group Component group of which the component is a part of
-     *
-     * @return CLBuildOptions Build options
-     */
-    CLBuildOptions get_build_options(const ComponentGroup &comp_group) const override;
-
-    /** Generate the component config id string used for tuning */
-    std::string get_config_id() const override;
-
-    /** Generate the header list used in the component */
-    std::set<std::string> get_headers_list() const override;
-
-    /** Generate the execution window for the component */
-    Window get_window() const override;
-
-private:
-    const ITensorInfo *_src;
-    const ITensorInfo *_dst;
-};
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATERESHAPE */
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp
deleted file mode 100644
index 846c712ceb..0000000000
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp
+++ /dev/null
@@ -1,279 +0,0 @@
-/*
- * Copyright (c) 2022-2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "ClTemplateResize.h"
-
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
-#include "arm_compute/core/utils/StringUtils.h"
-
-#include "src/core/helpers/WindowHelpers.h"
-#include "src/core/utils/ScaleUtils.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-ClTemplateResize::ClTemplateResize(ComponentId                         id,
-                                   const ArgumentPack<ITensorInfo>    &tensors,
-                                   const ClTemplateResize::Attributes &attributes)
-    : IGpuTemplateComponentWriter{id, tensors}, _src{}, _dst{}, _attributes{attributes}
-{
-    _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
-    _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
-
-    ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
-}
-
-std::string ClTemplateResize::get_name() const
-{
-    return _attributes.interpolation_policy() == InterpolationPolicy::BILINEAR ? "resize_bilinear" : "resize_nearest";
-}
-
-std::string ClTemplateResize::get_component_code(const IGpuTemplateComponentWriter::ComponentGroup &comp_group) const
-{
-    ARM_COMPUTE_UNUSED(comp_group);
-
-    std::string code = R"_(
-//------------------ START KERNEL {{meta_kernel_id}} ---------------------
-TILE(uint, 1, 1, g_dst_indirect_y);
-{
-    const int yo = g_ind_2 % {{arg_dst}}_h;
-    const int bout = g_ind_2 / {{arg_dst}}_h;
-)_";
-
-    if (_attributes.interpolation_policy() == InterpolationPolicy::NEAREST_NEIGHBOR)
-    {
-        if (_attributes.sampling_policy() == SamplingPolicy::TOP_LEFT)
-        {
-            code += R"_(
-    float xi_f = (g_ind_1 * {{SCALE_X}});
-    float yi_f = (yo * {{SCALE_Y}});
-)_";
-        }
-        else
-        {
-            code += R"_(
-    float xi_f = ((g_ind_1 + 0.5f) * {{SCALE_X}});
-    float yi_f = ((yo + 0.5f) * {{SCALE_Y}});
-)_";
-        }
-
-        if (_attributes.align_corners())
-        {
-            code += R"_(
-    xi_f = round(xi_f);
-    yi_f = round(yi_f);
-)_";
-        }
-
-        code += R"_(
-    const int xi0 = clamp((int)xi_f, 0, (int){{src}}_w - 1);
-    const int yi0 = clamp((int)yi_f, 0, (int){{src}}_h - 1);
-
-    T_LOAD_NHWC_WITH_DILATION({{SRC_DATA_TYPE}}, 1, 1, N0, {{SRC_TENSOR_TYPE}}, {{src}}, bout, yi0, xi0, g_ind_0, {{src}}_w, {{src}}_h, 1, 1, false, {{dst}});
-)_";
-    }
-    else if (_attributes.interpolation_policy() == InterpolationPolicy::BILINEAR)
-    {
-        if (_attributes.sampling_policy() == SamplingPolicy::TOP_LEFT)
-        {
-            code += R"_(
-    float xi_f = (g_ind_1 * {{SCALE_X}});
-    float yi_f = (yo * {{SCALE_Y}});
-)_";
-        }
-        else
-        {
-            code += R"_(
-    float xi_f = ((g_ind_1 + 0.5f) * {{SCALE_X}} - 0.5f);
-    float yi_f = ((yo + 0.5f) * {{SCALE_Y}} - 0.5f);
-)_";
-        }
-
-        code += R"_(
-    const int xi = (int)floor(xi_f);
-    const int yi = (int)floor(yi_f);
-
-    TILE({{SRC_DATA_TYPE}}, 1, N0, in00);
-    TILE({{SRC_DATA_TYPE}}, 1, N0, in01);
-    TILE({{SRC_DATA_TYPE}}, 1, N0, in10);
-    TILE({{SRC_DATA_TYPE}}, 1, N0, in11);
-
-    in00[0].v = {{CONSTANT_VALUE}};
-    in01[0].v = {{CONSTANT_VALUE}};
-    in10[0].v = {{CONSTANT_VALUE}};
-    in11[0].v = {{CONSTANT_VALUE}};
-
-    const int xi0  = clamp(xi, 0, (int){{src}}_w - 1);
-    const int yi0  = clamp(yi, 0, (int){{src}}_h - 1);
-    const int xi1  = clamp(xi + 1, 0, (int){{src}}_w - 1);
-    const int yi1  = clamp(yi + 1, 0, (int){{src}}_h - 1);
-
-    T_LOAD_NHWC_WITH_DILATION({{SRC_DATA_TYPE}}, 1, 1, N0, {{SRC_TENSOR_TYPE}}, {{src}}, bout, yi0, xi0, g_ind_0, {{src}}_w, {{src}}_h, 1, 1, false, in00);
-    T_LOAD_NHWC_WITH_DILATION({{SRC_DATA_TYPE}}, 1, 1, N0, {{SRC_TENSOR_TYPE}}, {{src}}, bout, yi0, xi1, g_ind_0, {{src}}_w, {{src}}_h, 1, 1, false, in01);
-    T_LOAD_NHWC_WITH_DILATION({{SRC_DATA_TYPE}}, 1, 1, N0, {{SRC_TENSOR_TYPE}}, {{src}}, bout, yi1, xi0, g_ind_0, {{src}}_w, {{src}}_h, 1, 1, false, in10);
-    T_LOAD_NHWC_WITH_DILATION({{SRC_DATA_TYPE}}, 1, 1, N0, {{SRC_TENSOR_TYPE}}, {{src}}, bout, yi1, xi1, g_ind_0, {{src}}_w, {{src}}_h, 1, 1, false, in11);
-)_";
-
-        if (is_data_type_float(_src->data_type()))
-        {
-            code += R"_(
-    const {{SRC_DATA_TYPE}} a  = ({{SRC_DATA_TYPE}})(xi_f - (float)xi);
-    const {{SRC_DATA_TYPE}} b  = ({{SRC_DATA_TYPE}})(1.f - a);
-    const {{SRC_DATA_TYPE}} a1 = ({{SRC_DATA_TYPE}})(yi_f - (float)yi);
-    const {{SRC_DATA_TYPE}} b1 = ({{SRC_DATA_TYPE}})(1.f - a1);
-
-    // Calculate the output
-    {{dst}}[0].v = ((in00[0].v * b * b1) + (in01[0].v * a * b1) + (in10[0].v * b * a1) + (in11[0].v * a * a1));
-)_";
-        }
-        else
-        {
-            code += R"_(
-    const float a  = (xi_f - (float)xi);
-    const float b  = (1.f - a);
-    const float a1 = (yi_f - (float)yi);
-    const float b1 = (1.f - a1);
-
-    {{dst}}[0].v = CONVERT_SAT(
-        (CONVERT(in00[0].v, VEC_DATA_TYPE(float, N0)) * b * b1) +
-        (CONVERT(in01[0].v, VEC_DATA_TYPE(float, N0)) * a * b1) +
-        (CONVERT(in10[0].v, VEC_DATA_TYPE(float, N0)) * b * a1) +
-        (CONVERT(in11[0].v, VEC_DATA_TYPE(float, N0)) * a * a1), VEC_DATA_TYPE({{DST_DATA_TYPE}}, N0));
-)_";
-        }
-    }
-    else
-    {
-        ARM_COMPUTE_ERROR("Unsupported interpolation policy");
-    }
-
-    code += R"_(
-    g_dst_indirect_y[0].v = g_ind_1 + (yo * (int)({{arg_dst}}_w)) + bout * (int)({{arg_dst}}_w * {{arg_dst}}_h);
-}
-//------------------ END KERNEL {{meta_kernel_id}} ---------------------
-)_";
-
-    return code;
-}
-
-void ClTemplateResize::declare_variables(GpuKernelVariableTable                            &vtable,
-                                         const IGpuTemplateComponentWriter::ComponentGroup &comp_group) const
-{
-    vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
-                            "src");
-
-    vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
-                            "dst");
-}
-
-TagLUT ClTemplateResize::get_tag_lut(const GpuKernelVariableTable                      &vtable,
-                                     const IGpuTemplateComponentWriter::ComponentGroup &comp_group) const
-{
-    TagLUT lut{};
-
-    // Arguments and global shared variables
-    lut["src"] = vtable.get_variable(_src);
-    lut["dst"] = vtable.get_variable(_dst);
-
-    const auto dst_argument = vtable.get_variable(comp_group.get_any_dst_tensor());
-    lut["arg_dst"]          = dst_argument.uniq_name;
-
-    // Local build options
-    lut["meta_kernel_id"]  = id();
-    lut["SRC_DATA_TYPE"]   = get_cl_type_from_data_type(_src->data_type());
-    lut["SRC_TENSOR_TYPE"] = "BUFFER";
-    lut["DST_DATA_TYPE"]   = get_cl_type_from_data_type(_dst->data_type());
-    lut["CONSTANT_VALUE"]  = string_from_pixel_value(0, _src->data_type());
-
-    const float scale_x =
-        scale_utils::calculate_resize_ratio(_src->dimension(1), _dst->dimension(1), _attributes.align_corners());
-    const float scale_y =
-        scale_utils::calculate_resize_ratio(_src->dimension(2), _dst->dimension(2), _attributes.align_corners());
-
-    lut["SCALE_X"] = float_to_string_with_full_precision(scale_x);
-    lut["SCALE_Y"] = float_to_string_with_full_precision(scale_y);
-
-    return lut;
-}
-
-CLBuildOptions ClTemplateResize::get_build_options(const IGpuTemplateComponentWriter::ComponentGroup &comp_group) const
-{
-    const Window       root_window = comp_group.get_root_component()->template_writer()->get_window();
-    const unsigned int n0          = root_window.x().step();
-    const unsigned int m0          = root_window.y().step();
-    const unsigned int partial_n0  = _dst->dimension(0) % n0;
-
-    CLBuildOptions build_opts;
-
-    build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
-    build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
-    build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_n0));
-
-    return build_opts;
-}
-
-std::string ClTemplateResize::get_config_id() const
-{
-    std::string config_id{};
-
-    config_id += "resize_";
-    config_id +=
-        (_attributes.interpolation_policy() == InterpolationPolicy::NEAREST_NEIGHBOR ? "NEAREST_NEIGHBOR" : "");
-    config_id += (_attributes.interpolation_policy() == InterpolationPolicy::BILINEAR ? "BILINEAR" : "");
-    config_id += "_";
-    config_id += (_attributes.sampling_policy() == SamplingPolicy::CENTER ? "center" : "topleft");
-    config_id += "_";
-    config_id += support::cpp11::to_string(_dst->dimension(0));
-    config_id += "_";
-    config_id += support::cpp11::to_string(_dst->dimension(1));
-    config_id += "_";
-    config_id += support::cpp11::to_string(_dst->dimension(2));
-    config_id += "_";
-    config_id += support::cpp11::to_string(_dst->dimension(3));
-
-    return config_id;
-}
-
-std::set<std::string> ClTemplateResize::get_headers_list() const
-{
-    return std::set<std::string>{"helpers.h", "tile_helpers.h"};
-}
-
-Window ClTemplateResize::get_window() const
-{
-    ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
-
-    const unsigned int n0  = adjust_vec_size(16 / _src->element_size(), _src->dimension(0));
-    Window             win = calculate_max_window(*_dst, Steps(n0));
-    return win.collapse(win, Window::DimZ);
-}
-
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.h
deleted file mode 100644
index 4c69007185..0000000000
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.h
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATERESIZE
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATERESIZE
-
-#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-class ClTemplateResize final : public IGpuTemplateComponentWriter
-{
-public:
-    using Attributes = ClComponentResize::Attributes;
-
-    /** Constructor
-     *
-     * @param[in] id         Component id
-     * @param[in] tensors    Tensor arguments to the components
-     * @param[in] attributes Component attributes
-     */
-    ClTemplateResize(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes);
-
-    /** Destructor */
-    ~ClTemplateResize() override = default;
-
-    /** Prevent instances of this class from being copy constructed */
-    ClTemplateResize(const ClTemplateResize &resize) = delete;
-
-    /** Prevent instances of this class from being copied */
-    ClTemplateResize &operator=(const ClTemplateResize &resize) = delete;
-
-    /** Allow instances of this class to be move constructed */
-    ClTemplateResize(ClTemplateResize &&resize) = default;
-
-    /** Allow instances of this class to be moved */
-    ClTemplateResize &operator=(ClTemplateResize &&resize) = default;
-
-    /** Generate kernel component name */
-    std::string get_name() const override;
-
-    /** Generate kernel component code template
-     *
-     * @param[in] comp_group Component group of which the component is a part of
-     *
-     * @return std::string Component code
-     */
-    std::string get_component_code(const ComponentGroup &comp_group) const override;
-
-    /** Declare all variables used by the component in the @p vtable
-     *
-     * @param[out] vtable     Variable table
-     * @param[in]  comp_group Component group of which the component is a part of
-     */
-    void declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
-
-    /** Generate the tag look-up table used to instantiate the component code.
-     *
-     * @param[in] vtable     Variable table
-     * @param[in] comp_group Component group of which the component is a part of
-     *
-     * @return TagLUT  Tag lookup table
-     */
-    TagLUT get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
-
-    /** Generate the build options used in the component
-     *
-     * @param[in] comp_group Component group of which the component is a part of
-     *
-     * @return CLBuildOptions Build options
-     */
-    CLBuildOptions get_build_options(const ComponentGroup &comp_group) const override;
-
-    /** Generate the component config id string used for tuning */
-    std::string get_config_id() const override;
-
-    /** Generate the header list used in the component */
-    std::set<std::string> get_headers_list() const override;
-
-    /** Generate the execution window for the component */
-    Window get_window() const override;
-
-private:
-    const ITensorInfo *_src;
-    const ITensorInfo *_dst;
-    Attributes         _attributes;
-};
-
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATERESIZE */
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp
deleted file mode 100644
index d0ec91e0a9..0000000000
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "ClTemplateStore.h"
-
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-ClTemplateStore::ClTemplateStore(ComponentId id, const ArgumentPack<ITensorInfo> &tensors)
-    : IGpuTemplateComponentWriter{id, tensors}, _src{}, _dst{}
-{
-    _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
-    _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
-}
-
-std::string ClTemplateStore::get_name() const
-{
-    return "store";
-}
-
-std::string ClTemplateStore::get_component_code(const ComponentGroup &comp_group) const
-{
-    ARM_COMPUTE_UNUSED(comp_group);
-
-    return R"_(
-//------------------ START KERNEL {{meta_kernel_id}} STORE ---------------------
-{
-    bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
-
-    T_STORE_INDIRECT_WIDTH_SELECT({{DST_DATA_TYPE}}, M0, N0, PARTIAL_N0, {{DST_TENSOR_TYPE}}, {{dst}}, g_ind_0, {{dst}}_stride_y, x_cond, {{src}}, g_dst_indirect_y);
-//------------------ END KERNEL {{meta_kernel_id}} STORE ---------------------
-}
-
-)_";
-}
-
-void ClTemplateStore::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
-{
-    vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
-                            "src");
-    vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
-                            "dst");
-}
-
-TagLUT ClTemplateStore::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
-{
-    TagLUT lut{};
-
-    // Arguments and global shared variables
-    lut["src"] = vtable.get_variable(_src);
-    lut["dst"] = vtable.get_variable(_dst);
-
-    // Local build options
-    lut["meta_kernel_id"]  = id();
-    lut["DST_TENSOR_TYPE"] = "BUFFER";
-    lut["DST_DATA_TYPE"]   = _dst->data_type();
-
-    ARM_COMPUTE_UNUSED(comp_group);
-    return lut;
-}
-
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h
deleted file mode 100644
index b8c82ceadd..0000000000
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATESTORE
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATESTORE
-
-#include "arm_compute/core/experimental/Types.h"
-
-#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-class ClTemplateStore final : public IGpuTemplateComponentWriter
-{
-public:
-    /** Constructor
-     *
-     * @param[in] id      Component id
-     * @param[in] tensors Tensor arguments to the components
-     */
-    ClTemplateStore(ComponentId id, const ArgumentPack<ITensorInfo> &tensors);
-    /** Prevent instances of this class from being copy constructed */
-    ClTemplateStore(const ClTemplateStore &store) = delete;
-    /** Prevent instances of this class from being copied */
-    ClTemplateStore &operator=(const ClTemplateStore &store) = delete;
-    /** Allow instances of this class to be move constructed */
-    ClTemplateStore(ClTemplateStore &&store) = default;
-    /** Allow instances of this class to be moved */
-    ClTemplateStore &operator=(ClTemplateStore &&store) = default;
-    /** Generate kernel component name */
-    std::string get_name() const override;
-    /** Generate kernel component code template
-     *
-     * @param[in] comp_group Component group of which the component is a part of
-     *
-     * @return std::string Component code
-     */
-    std::string get_component_code(const ComponentGroup &comp_group) const override;
-    /** Declare all variables used by the component in the @p vtable
-     *
-     * @param[out] vtable     Variable table
-     * @param[in]  comp_group Component group of which the component is a part of
-     */
-    void declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
-    /** Generate the tag look-up table used to instantiate the component code.
-     *
-     * @param[in] vtable     Variable table
-     * @param[in] comp_group Component group of which the component is a part of
-     *
-     * @return TagLUT  Tag lookup table
-     */
-    TagLUT get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
-
-private:
-    const ITensorInfo *_src;
-    const ITensorInfo *_dst;
-};
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATESTORE */
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp
deleted file mode 100644
index d3d7c8db83..0000000000
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp
+++ /dev/null
@@ -1,325 +0,0 @@
-/*
- * Copyright (c) 2022-2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "ClTemplateWriter.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-
-#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-/// @note: some tags can be unused since they could be used only for the macros, or only for the component code
-std::string ClTemplateWriter::replace_tags(const std::string &code_template, const TagLUT &tags)
-{
-    std::string replaced_code    = "";
-    bool        scanning_pattern = false;
-    std::string pattern_found    = "";
-    for (size_t i = 0; i < code_template.size() - 1; ++i)
-    {
-        if (!scanning_pattern)
-        {
-            if (code_template[i] == '{' && code_template[i + 1] == '{')
-            {
-                i += 1;
-                scanning_pattern = true;
-                pattern_found    = "";
-            }
-            else
-            {
-                replaced_code += code_template[i];
-            }
-        }
-        else
-        {
-            if (code_template[i] == '}' && code_template[i + 1] == '}')
-            {
-                i += 1;
-                scanning_pattern = false;
-                std::string err  = "Pattern " + pattern_found + " not found in tags";
-                ARM_COMPUTE_ERROR_ON_MSG(tags.find(pattern_found) == tags.end(), err.c_str());
-                replaced_code += tags.find(pattern_found)->second.value;
-            }
-            else
-            {
-                pattern_found += code_template[i];
-            }
-        }
-    }
-
-    return replaced_code;
-}
-ClTemplateWriter::~ClTemplateWriter()
-{
-}
-ClTemplateWriter::ClTemplateWriter(const GpuKernelComponentGroup &components) : _components{components}
-{
-}
-std::string ClTemplateWriter::get_name()
-{
-    return write_kernel_name();
-}
-std::string ClTemplateWriter::get_code()
-{
-    return write_code();
-}
-std::string ClTemplateWriter::get_config_id()
-{
-    std::string config_id = get_name();
-    for (const auto &comp : _components)
-    {
-        config_id += "--" + comp->template_writer()->get_config_id() + "--";
-    }
-
-    return config_id;
-}
-
-CLBuildOptions ClTemplateWriter::get_build_options()
-{
-    CLBuildOptions build_opts{};
-
-    for (const auto &comp : _components)
-    {
-        build_opts.add_options(comp->template_writer()->get_build_options(_components).options());
-    }
-
-    return build_opts;
-}
-
-Window ClTemplateWriter::get_window() const
-{
-    const auto root_comp = _components.get_root_component();
-    ARM_COMPUTE_ERROR_ON_MSG(root_comp == nullptr, "No root component found");
-    return root_comp->template_writer()->get_window();
-}
-
-std::map<ITensorInfo::Id, GpuKernelArgument> ClTemplateWriter::get_tensors()
-{
-    // Assemble GpuKernelArguments
-    std::map<ITensorInfo::Id, GpuKernelArgument> tensors;
-    for (const auto t : _components.get_argument_tensors())
-    {
-        tensors.emplace(t->id(), GpuKernelArgument{*t, _vtable.get_variable(t).kernel_argument_info});
-    }
-    return tensors;
-}
-
-std::string ClTemplateWriter::write_code()
-{
-    ARM_COMPUTE_ERROR_ON_MSG(_components.empty(), "No components found");
-
-    // These data structures will hold the data from all the components in the blueprint
-    std::set<std::string>    headers_list{};
-    std::set<std::string>    additional_macros{};
-    std::vector<std::string> component_codes{}; // vector because order matters
-
-    // Pass 1: Declare all kernel variables
-    for (auto &component : _components)
-    {
-        component->template_writer()->declare_variables(_vtable, _components);
-    }
-    // Pass 2: Generate component codes
-    for (auto &component : _components)
-    {
-        const auto component_writer       = component->template_writer();
-        auto       curr_headers_list      = component_writer->get_headers_list();
-        auto       curr_additional_macros = component_writer->get_additional_macros();
-        auto       curr_component_code    = component_writer->get_component_code(_components);
-        const auto var_lut                = component_writer->get_tag_lut(
-                           _vtable,
-                           _components); // Ideally can be merged with get_component_code once we have finer-grained code generation technique
-        component_codes.push_back(replace_tags(curr_component_code, var_lut));
-
-        headers_list.insert(curr_headers_list.begin(), curr_headers_list.end());
-        if (!additional_macros.empty()) // Some components might not have any
-        {
-            additional_macros.insert(replace_tags(curr_additional_macros, var_lut));
-        }
-    }
-
-    // Step 3: Assemble the data gathered by traversing the graph into the string "code"
-    std::string code = "";
-
-    for (auto &header : headers_list)
-    {
-#if defined(EMBEDDED_KERNELS)
-        code += CLKernelLibrary::get().get_program(header).first;
-#else  // defined(EMBEDDED_KERNELS)
-        code += "#include \"" + header + "\"\n";
-#endif // defined(EMBEDDED_KERNELS)
-    }
-
-    for (auto &macros : additional_macros)
-    {
-        code += macros;
-    }
-
-    auto arguments = _components.get_argument_tensors();
-    std::sort(arguments.begin(), arguments.end(),
-              [](const ITensorInfo *l, const ITensorInfo *r) { return l->id() < r->id(); });
-    code += write_kernel_signature(_vtable.get_variable_list(arguments));
-
-    code += "\n{\n\n";
-
-    code += "    //------------------ START KERNEL_BUILDER_COORDINATE ---------------------\n\n";
-    code += write_global_section();
-    code += "    //------------------ END KERNEL_BUILDER_COORDINATE ---------------------\n";
-
-    {
-        const auto        tiles = _components.get_tiles();
-        std::stringstream tiles_ss;
-
-        tiles_ss << "    //------------------ START TILE DECLARATION ---------------------\n";
-
-        for (auto tile : tiles)
-        {
-            const auto var       = _vtable.get_variable(tile);
-            const auto data_type = get_cl_type_from_data_type(tile->data_type());
-            const auto var_name  = var.uniq_name;
-
-            tiles_ss << "    TILE(" << data_type << ", M0, N0, " << var_name << ");\n";
-        }
-
-        tiles_ss << "    //------------------ END TILE DECLARATION ---------------------\n";
-
-        code += tiles_ss.str();
-    }
-
-    for (const auto &component_code : component_codes)
-    {
-        code += component_code;
-        code += "\n";
-    }
-
-    code += "}\n";
-
-    return code;
-}
-std::string ClTemplateWriter::write_global_section() const
-{
-    const auto dst_info   = _components.get_any_dst_tensor();
-    const auto dst_w      = dst_info->dimension(0);
-    const auto tile_w     = std::max(1, get_window().x().step());
-    const auto tile_h     = std::max(1, get_window().y().step());
-    auto       leftover_w = dst_w % tile_w;
-
-    std::string code = "";
-    code += std::string("    int g_ind_0 = GET_SPATIAL_IDX(0, ") + std::to_string(tile_w) + ", " +
-            std::to_string(leftover_w) + ");\n";
-    code += std::string("    int g_ind_1 = GET_SPATIAL_IDX(1, ") + std::to_string(tile_h) + ", " + "0);\n";
-    code += std::string("    int g_ind_2 = GET_SPATIAL_IDX(2, 1, 0);\n\n");
-
-    code += "    const bool g_cond_x = (g_ind_0 == 0);\n";
-    code += "    const bool g_cond_y = (g_ind_1 == 0);\n";
-
-    return code;
-}
-std::string ClTemplateWriter::write_argument_declaration(const GpuKernelVariableTable::TensorVariable &var) const
-{
-    std::string code;
-    switch (var.kernel_argument_info.type)
-    {
-        case GpuKernelArgumentInfo::Type::Vector:
-        {
-            code += "\n    VECTOR_DECLARATION(" + var.uniq_name + ")";
-            break;
-        }
-        case GpuKernelArgumentInfo::Type::Image:
-        {
-            code += "\n    IMAGE_DECLARATION(" + var.uniq_name + ")";
-            break;
-        }
-        case GpuKernelArgumentInfo::Type::Image_3D:
-        {
-            code += "\n    IMAGE_DECLARATION(" + var.uniq_name + "),";
-            code += "\n    unsigned int " + var.uniq_name + "_stride_z";
-            break;
-        }
-        case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D:
-        {
-            code += "\n    __read_only image2d_t " + var.uniq_name + "_img,";
-            code += "\n    unsigned int " + var.uniq_name + "_stride_z";
-            break;
-        }
-        case GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer:
-        {
-            code += "\n    TENSOR4D_T(" + var.uniq_name + ", BUFFER)";
-            break;
-        }
-        case GpuKernelArgumentInfo::Type::Tensor_4D_t_Image:
-        {
-            code += "\n    TENSOR4D_T(" + var.uniq_name + ", IMAGE)";
-            break;
-        }
-        case GpuKernelArgumentInfo::Type::Tensor_3D:
-        {
-            code += "\n    TENSOR3D_DECLARATION(" + var.uniq_name + ")";
-            break;
-        }
-        default:
-        {
-            ARM_COMPUTE_ERROR("Unsupported declaration generation for GpuKernelArgumentInfo::Type");
-        }
-    }
-    return code;
-}
-std::string ClTemplateWriter::write_kernel_signature(const GpuKernelVariableTable::VariableList &argument_list) const
-{
-    std::string code = "\n__kernel void " + write_kernel_name() + "(";
-
-    for (int i = 0; i < static_cast<int>(argument_list.size()) - 1; ++i)
-    {
-        code += write_argument_declaration(argument_list[i]) + ",";
-    }
-    if (static_cast<int>(argument_list.size()) - 1 >= 0)
-    {
-        code += write_argument_declaration(argument_list[argument_list.size() - 1]);
-    }
-
-    code += ')';
-
-    return code;
-}
-std::string ClTemplateWriter::write_kernel_name() const
-{
-    if (_components.empty())
-    {
-        return "empty_kernel";
-    }
-    std::string name = _components.empty() ? "" : _components[0]->template_writer()->get_name();
-    for (size_t i = 1; i < _components.size(); ++i)
-    {
-        name += "___";
-        name += _components[i]->template_writer()->get_name();
-    }
-
-    return name;
-}
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.h
deleted file mode 100644
index 83f617b6c6..0000000000
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.h
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEWRITER
-#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEWRITER
-
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
-#include "src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
-
-#include <map>
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-/** Use a templated-string-based method to write kernel code
- *  It stitches the component code templates together based on the valid fusion configuration.
- *  It then instantiates the actual kernel code from the template and the generated tag lookup table.
- */
-class ClTemplateWriter : public IGpuKernelWriter
-{
-public:
-    /** Instantiates a kernel code string from the kernel code template
-     * @note: some tags can be unused since they could be used only for the macros, or only for the component code
-     *
-     * @param[in] code_template Kernel code template
-     * @param[in] tags          Tag lookup table
-     *
-     * @return std::string  Instantiated kernel string
-     */
-    static std::string replace_tags(const std::string &code_template, const TagLUT &tags);
-    /** Default constructor */
-    ClTemplateWriter() = default;
-    /** Constructor
-     *
-     * @param[in] components Kernel component group from which the kernel will be generated
-     */
-    ClTemplateWriter(const GpuKernelComponentGroup &components);
-    /** Destructor */
-    ~ClTemplateWriter() override;
-    /** Generate kernel name */
-    std::string get_name() override;
-    /** Generate kernel code */
-    std::string get_code() override;
-    /** Generate build options */
-    CLBuildOptions get_build_options() override;
-    /** Generate config id string of the entire kernel. This is used for tuning */
-    std::string get_config_id() override;
-    /** Generate execution window */
-    Window get_window() const override;
-    /** Get the kernel argument lists of the kernel*/
-    std::map<ITensorInfo::Id, GpuKernelArgument> get_tensors() override;
-
-private:
-    std::string write_kernel_name() const;
-    std::string write_code();
-    std::string write_global_section() const;
-    std::string write_argument_declaration(const GpuKernelVariableTable::TensorVariable &var) const;
-    std::string write_kernel_signature(const GpuKernelVariableTable::VariableList &argument_list) const;
-
-private:
-    GpuKernelComponentGroup _components{};
-    GpuKernelVariableTable  _vtable{};
-};
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEWRITER */
diff --git a/tests/validation/dynamic_fusion/gpu/Integration.cpp b/tests/validation/dynamic_fusion/gpu/Integration.cpp
index 80dcaa8f90..453983c077 100644
--- a/tests/validation/dynamic_fusion/gpu/Integration.cpp
+++ b/tests/validation/dynamic_fusion/gpu/Integration.cpp
@@ -63,7 +63,7 @@ namespace validation
 TEST_SUITE(CL)
 TEST_SUITE(INTEGRATION)
 TEST_SUITE(DYNAMIC_FUSION)
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF // Conv2d is not ported to ckw yet. COMPMID-6259
+
 TEST_CASE(Conv2d, framework::DatasetMode::ALL)
 {
     /* Computation:
@@ -156,7 +156,7 @@ TEST_CASE(Conv2d, framework::DatasetMode::ALL)
         0.001f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */
     validate(CLAccessor(t_dst), ref_t_dst_nchw, tolerance_f32);
 }
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
+
 TEST_CASE(Add_Output_Add_Output, framework::DatasetMode::ALL)
 {
     /* Computation:
@@ -368,8 +368,9 @@ TEST_CASE(Add_Output_Add_Cast_Cast_Output, framework::DatasetMode::ALL)
     validate(CLAccessor(t_out_1), ref_t_out_1, tolerance_cast_f32);
 }
 
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF // Conv2d is not ported to ckw yet. COMPMID-6259
-TEST_CASE(Conv2d_Sigmoid_DepthwiseConv2d_Mul, framework::DatasetMode::ALL)
+/// TODO: COMPMID-6593 : This integration test fails with CKW backend.
+/// It was not enabled for CKW before, therefore went unnoticed.
+TEST_CASE(Conv2d_Sigmoid_DepthwiseConv2d_Mul, framework::DatasetMode::DISABLED)
 {
     //   (tensor0)
     //       |
@@ -580,7 +581,6 @@ TEST_CASE(Conv2d_Sigmoid_DepthwiseConv2d_Mul, framework::DatasetMode::ALL)
     constexpr RelativeTolerance<float> tolerance(0.001f);
     validate(CLAccessor(tensor6), ref_mul_dst_nchw, tolerance);
 }
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
 
 TEST_SUITE(Invalid_Fusion_Should_Fail)
 TEST_CASE(Multiple_Complex_Ops_0, framework::DatasetMode::ALL)
diff --git a/tests/validation/dynamic_fusion/gpu/cl/DepthwiseConv2d.cpp b/tests/validation/dynamic_fusion/gpu/cl/DepthwiseConv2d.cpp
index 40e1ea8929..2f8c639cea 100644
--- a/tests/validation/dynamic_fusion/gpu/cl/DepthwiseConv2d.cpp
+++ b/tests/validation/dynamic_fusion/gpu/cl/DepthwiseConv2d.cpp
@@ -290,7 +290,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge,
 {
     validate(CLAccessor(_target), _reference, tolerance_f16);
 }
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF // Do not include this test as dilation not supported yet in DepthwiseConv2d CKW kernel
+
 TEST_SUITE(Dilation)
 FIXTURE_DATA_TEST_CASE(RunSmall,
                        DynamicFusionGpuDepthwiseConv2dFixture<half>,
@@ -313,7 +313,6 @@ FIXTURE_DATA_TEST_CASE(RunLarge,
     validate(CLAccessor(_target), _reference, tolerance_f16);
 }
 TEST_SUITE_END() // Dilation
-#endif           // ACL_INTERNAL_TEST_CKW_IN_DF
 TEST_SUITE_END() // W3x3
 
 TEST_SUITE(Generic)
@@ -336,7 +335,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge,
 {
     validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
 }
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF // Do not include this test as dilation not supported yet in DepthwiseConv2d CKW kernel
+
 TEST_SUITE(Dilation)
 FIXTURE_DATA_TEST_CASE(RunSmall,
                        DynamicFusionGpuDepthwiseConv2dFixture<half>,
@@ -359,7 +358,6 @@ FIXTURE_DATA_TEST_CASE(RunLarge,
     validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num);
 }
 TEST_SUITE_END() // Dilation
-#endif           // ACL_INTERNAL_TEST_CKW_IN_DF
 TEST_SUITE_END() // Generic
 TEST_SUITE_END() // FP16
 
@@ -385,7 +383,6 @@ FIXTURE_DATA_TEST_CASE(RunLarge,
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
 
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF // Do not include this test as dilation not supported yet in DepthwiseConv2d CKW kernel
 TEST_SUITE(Dilation)
 
 FIXTURE_DATA_TEST_CASE(RunSmall,
@@ -409,7 +406,6 @@ FIXTURE_DATA_TEST_CASE(RunLarge,
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
 TEST_SUITE_END() // Dilation
-#endif           // ACL_INTERNAL_TEST_CKW_IN_DF
 TEST_SUITE_END() // W3x3
 
 TEST_SUITE(Generic)
@@ -445,7 +441,6 @@ FIXTURE_DATA_TEST_CASE(RunLargeKernelSize,
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
 
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF // Do not include this test as dilation not supported yet in DepthwiseConv2d CKW kernel
 TEST_SUITE(Dilation)
 FIXTURE_DATA_TEST_CASE(RunSmall,
                        DynamicFusionGpuDepthwiseConv2dFixture<float>,
@@ -468,7 +463,6 @@ FIXTURE_DATA_TEST_CASE(RunLarge,
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
 TEST_SUITE_END() // Dilation
-#endif           // ACL_INTERNAL_TEST_CKW_IN_DF
 TEST_SUITE_END() // Generic
 TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
diff --git a/tests/validation/dynamic_fusion/gpu/cl/MatMul.cpp b/tests/validation/dynamic_fusion/gpu/cl/MatMul.cpp
index 96b79679c3..82d66ca6ce 100644
--- a/tests/validation/dynamic_fusion/gpu/cl/MatMul.cpp
+++ b/tests/validation/dynamic_fusion/gpu/cl/MatMul.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifdef ACL_INTERNAL_TEST_CKW_IN_DF
+
 #include "tests/AssetsLibrary.h"
 #include "tests/CL/CLAccessor.h"
 #include "tests/datasets/LargeMatMulDataset.h"
@@ -333,4 +333,3 @@ TEST_SUITE_END() // CL
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
diff --git a/tests/validation/dynamic_fusion/gpu/cl/Pool2d.cpp b/tests/validation/dynamic_fusion/gpu/cl/Pool2d.cpp
index e537826c71..be816b32b3 100644
--- a/tests/validation/dynamic_fusion/gpu/cl/Pool2d.cpp
+++ b/tests/validation/dynamic_fusion/gpu/cl/Pool2d.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifdef ACL_INTERNAL_TEST_CKW_IN_DF
+
 #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h"
 
 #include "tests/CL/CLAccessor.h"
@@ -217,4 +217,3 @@ TEST_SUITE_END() // CL
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
diff --git a/tests/validation/dynamic_fusion/gpu/cl/Reshape.cpp b/tests/validation/dynamic_fusion/gpu/cl/Reshape.cpp
index 43617fe1be..a1495cf014 100644
--- a/tests/validation/dynamic_fusion/gpu/cl/Reshape.cpp
+++ b/tests/validation/dynamic_fusion/gpu/cl/Reshape.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF // Do not include this test if ACL_INTERNAL_TEST_CKW_IN_DF and the op has not been ported to ckw
+
 #include "tests/CL/CLAccessor.h"
 #include "tests/datasets/ReshapeLayerDataset.h"
 #include "tests/framework/datasets/Datasets.h"
@@ -82,7 +82,7 @@ using DynamicFusionGpuReshapeLayerFixture =
 TEST_SUITE(F32)
 FIXTURE_DATA_TEST_CASE(RunSmall,
                        DynamicFusionGpuReshapeLayerFixture<float>,
-                       framework::DatasetMode::ALL,
+                       framework::DatasetMode::DISABLED,
                        combine(datasets::SmallReshapeLayerDataset(),
                                framework::dataset::make("DataType", DataType::F32)))
 {
@@ -94,7 +94,7 @@ TEST_SUITE_END() // F32
 TEST_SUITE(F16)
 FIXTURE_DATA_TEST_CASE(RunSmall,
                        DynamicFusionGpuReshapeLayerFixture<half>,
-                       framework::DatasetMode::ALL,
+                       framework::DatasetMode::DISABLED,
                        combine(datasets::SmallReshapeLayerDataset(),
                                framework::dataset::make("DataType", DataType::F16)))
 {
@@ -106,7 +106,7 @@ TEST_SUITE_END() // F16
 TEST_SUITE(U8)
 FIXTURE_DATA_TEST_CASE(RunSmall,
                        DynamicFusionGpuReshapeLayerFixture<uint8_t>,
-                       framework::DatasetMode::ALL,
+                       framework::DatasetMode::DISABLED,
                        combine(datasets::SmallReshapeLayerDataset(),
                                framework::dataset::make("DataType", DataType::U8)))
 {
@@ -118,7 +118,7 @@ TEST_SUITE_END() // U8
 TEST_SUITE(S8)
 FIXTURE_DATA_TEST_CASE(RunSmall,
                        DynamicFusionGpuReshapeLayerFixture<int8_t>,
-                       framework::DatasetMode::ALL,
+                       framework::DatasetMode::DISABLED,
                        combine(datasets::SmallReshapeLayerDataset(),
                                framework::dataset::make("DataType", DataType::S8)))
 {
@@ -130,7 +130,7 @@ TEST_SUITE_END() // S8
 TEST_SUITE(S16)
 FIXTURE_DATA_TEST_CASE(RunSmall,
                        DynamicFusionGpuReshapeLayerFixture<int16_t>,
-                       framework::DatasetMode::ALL,
+                       framework::DatasetMode::DISABLED,
                        combine(datasets::SmallReshapeLayerDataset(),
                                framework::dataset::make("DataType", DataType::S16)))
 {
@@ -145,5 +145,3 @@ TEST_SUITE_END() // CL
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF
diff --git a/tests/validation/dynamic_fusion/gpu/cl/Softmax.cpp b/tests/validation/dynamic_fusion/gpu/cl/Softmax.cpp
index b7cb6bace6..8f5a1ed14a 100644
--- a/tests/validation/dynamic_fusion/gpu/cl/Softmax.cpp
+++ b/tests/validation/dynamic_fusion/gpu/cl/Softmax.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF // Do not include this test if ACL_INTERNAL_TEST_CKW_IN_DF and the op has not been ported to ckw
+
 #include "arm_compute/core/Types.h"
 #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.h"
 
@@ -46,62 +46,70 @@ namespace validation
 RelativeTolerance<half>  tolerance_f16(half(0.2));
 RelativeTolerance<float> tolerance_f32(0.001f);
 
+using framework::dataset::make;
+
+/// TODO(COMPMID-6713): Softmax is not implemented in CKW yet, so the test
+/// cases below are registered with framework::DatasetMode::DISABLED.
+/// Re-enable these tests once Softmax is implemented in CKW.
+
 TEST_SUITE(CL)
 TEST_SUITE(DYNAMIC_FUSION)
 TEST_SUITE(SOFTMAX)
 
 // *INDENT-OFF*
 // clang-format off
-DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
-               framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U), 1, DataType::F32), // Mismatching data types
-                                                       TensorInfo(TensorShape(27U, 13U), 1, DataType::F32), // Mismatching shapes
-                                                       TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(32U, 13U), 1, DataType::S32), // Unsupported data type
-                                                       TensorInfo(TensorShape(32U, 13U), 1, DataType::F16),
-                                                       TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
-
-                                                      }),
-               framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(27U, 13U), 1, DataType::F16),
-                                                       TensorInfo(TensorShape(27U, 11U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM16), // Unsupported data type
-                                                       TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
-
-                                                     })),
-               framework::dataset::make("beta", { 1.0,
-                                                  2.0,
-                                                  2.0,
-                                                  1.0,
-                                                  1.0,
-                                                  1.0,
-                                                  1.0,
-                                                  1.0,
-                                                  1.0,
-                                                  1.0,
-                                                })),
-               framework::dataset::make("axis", {
-                                                  0,
-                                                  0,
-                                                  1,  // Invalid as axis != 0
-                                                  0,
-                                                  0,
-                                                  0,
-                                                  -3, // Invalid as axis != 0
-                                                  2,  // Invalid as axis != 0
-                                                  1,  // Invalid as axis != 0
-                                                  -1, // Invalid as axis != 0
-                                                })),
-               framework::dataset::make("Expected", { false, false, false, true, false, false, false, false, false, false})),
-               input_info, output_info, beta, axis, expected)
+DATA_TEST_CASE(Validate, framework::DatasetMode::DISABLED,
+    zip(
+        make("InputInfo", {
+            TensorInfo(TensorShape(27U, 13U), 1, DataType::F32), // Mismatching data types
+            TensorInfo(TensorShape(27U, 13U), 1, DataType::F32), // Mismatching shapes
+            TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+            TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+            TensorInfo(TensorShape(32U, 13U), 1, DataType::S32), // Unsupported data type
+            TensorInfo(TensorShape(32U, 13U), 1, DataType::F16),
+            TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+            TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+            TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+            TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+        }),
+        make("OutputInfo",{
+            TensorInfo(TensorShape(27U, 13U), 1, DataType::F16),
+            TensorInfo(TensorShape(27U, 11U), 1, DataType::F32),
+            TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+            TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+            TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+            TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM16), // Unsupported data type
+            TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+            TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+            TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+            TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+        }),
+        make("beta", {
+            1.0,
+            2.0,
+            2.0,
+            1.0,
+            1.0,
+            1.0,
+            1.0,
+            1.0,
+            1.0,
+            1.0,
+        }),
+        make("axis", {
+            0,
+            0,
+            1,  // Invalid as axis != 0
+            0,
+            0,
+            0,
+            -3, // Invalid as axis != 0
+            2,  // Invalid as axis != 0
+            1,  // Invalid as axis != 0
+            -1, // Invalid as axis != 0
+        }),
+        make("Expected", { false, false, false, true, false, false, false, false, false, false})),
+        input_info, output_info, beta, axis, expected)
 {
     // Create a new workload sketch
     CLCompileContext   cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
@@ -122,33 +130,39 @@ using DynamicFusionSoftmaxLayerFixture = DynamicFusionSoftmaxValidationFixture<C
 TEST_SUITE(FLOAT)
 TEST_SUITE(FP32)
 
-FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionSoftmaxLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SoftmaxLayerSmallShapes(),
-                                                                                                                   framework::dataset::make("DataType", DataType::F32)),
-                                                                                                           framework::dataset::make("Beta", { 1.0f, 2.0f })),
-                                                                                                   framework::dataset::make("Axis", { 0 })),
-                                                                                                   framework::dataset::make("is_log", {false, true})))
+FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionSoftmaxLayerFixture<float>, framework::DatasetMode::DISABLED,
+    combine(
+        datasets::SoftmaxLayerSmallShapes(),
+        make("DataType", DataType::F32),
+        make("Beta", { 1.0f, 2.0f }),
+        make("Axis", { 0 }),
+        make("is_log", {false, true})))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
 
 
-FIXTURE_DATA_TEST_CASE(RunLarge, DynamicFusionSoftmaxLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::SoftmaxLayerLargeShapes(),
-                                                                                                                   framework::dataset::make("DataType", DataType::F32)),
-                                                                                                           framework::dataset::make("Beta", { 1.0f, 2.0f })),
-                                                                                                   framework::dataset::make("Axis", { 0 })),
-                                                                                                   framework::dataset::make("is_log", {false, true})))
+FIXTURE_DATA_TEST_CASE(RunLarge, DynamicFusionSoftmaxLayerFixture<float>, framework::DatasetMode::DISABLED,
+    combine(
+        datasets::SoftmaxLayerLargeShapes(),
+        make("DataType", DataType::F32),
+        make("Beta", { 1.0f, 2.0f }),
+        make("Axis", { 0 }),
+        make("is_log", {false, true})))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
 
 
-FIXTURE_DATA_TEST_CASE(Run4D, DynamicFusionSoftmaxLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::SoftmaxLayer4DShapes(),
-                                                                                                                   framework::dataset::make("DataType", DataType::F32)),
-                                                                                                           framework::dataset::make("Beta", { 1.0f, 2.0f })),
-                                                                                                   framework::dataset::make("Axis", { 0 })),
-                                                                                                   framework::dataset::make("is_log", {false, true})))
+FIXTURE_DATA_TEST_CASE(Run4D, DynamicFusionSoftmaxLayerFixture<float>, framework::DatasetMode::DISABLED,
+    combine(
+        datasets::SoftmaxLayer4DShapes(),
+        make("DataType", DataType::F32),
+        make("Beta", { 1.0f, 2.0f }),
+        make("Axis", { 0 }),
+        make("is_log", {false, true})))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f32);
@@ -156,33 +170,39 @@ FIXTURE_DATA_TEST_CASE(Run4D, DynamicFusionSoftmaxLayerFixture<float>, framework
 TEST_SUITE_END() // FP32
 TEST_SUITE(FP16)
 
-FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionSoftmaxLayerFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SoftmaxLayerSmallShapes(),
-                                                                                                                   framework::dataset::make("DataType", DataType::F16)),
-                                                                                                           framework::dataset::make("Beta", { 1.0f, 2.0f })),
-                                                                                                   framework::dataset::make("Axis", { 0 })),
-                                                                                                   framework::dataset::make("is_log", {false, true})))
+FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionSoftmaxLayerFixture<half>, framework::DatasetMode::DISABLED,
+    combine(
+        datasets::SoftmaxLayerSmallShapes(),
+        make("DataType", DataType::F16),
+        make("Beta", { 1.0f, 2.0f }),
+        make("Axis", { 0 }),
+        make("is_log", {false, true})))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16);
 }
 
 
-FIXTURE_DATA_TEST_CASE(RunLarge, DynamicFusionSoftmaxLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::SoftmaxLayerLargeShapes(),
-                                                                                                                   framework::dataset::make("DataType", DataType::F16)),
-                                                                                                           framework::dataset::make("Beta", { 1.0f, 2.0f })),
-                                                                                                   framework::dataset::make("Axis", { 0 })),
-                                                                                                   framework::dataset::make("is_log", {false, true})))
+FIXTURE_DATA_TEST_CASE(RunLarge, DynamicFusionSoftmaxLayerFixture<half>, framework::DatasetMode::DISABLED,
+    combine(
+        datasets::SoftmaxLayerLargeShapes(),
+        make("DataType", DataType::F16),
+        make("Beta", { 1.0f, 2.0f }),
+        make("Axis", { 0 }),
+        make("is_log", {false, true})))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16);
 }
 
 
-FIXTURE_DATA_TEST_CASE(Run4D, DynamicFusionSoftmaxLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::SoftmaxLayer4DShapes(),
-                                                                                                                   framework::dataset::make("DataType", DataType::F16)),
-                                                                                                           framework::dataset::make("Beta", { 1.0f, 2.0f })),
-                                                                                                   framework::dataset::make("Axis", { 0 })),
-                                                                                                   framework::dataset::make("is_log", {false, true})))
+FIXTURE_DATA_TEST_CASE(Run4D, DynamicFusionSoftmaxLayerFixture<half>, framework::DatasetMode::DISABLED,
+    combine(
+        datasets::SoftmaxLayer4DShapes(),
+        make("DataType", DataType::F16),
+        make("Beta", { 1.0f, 2.0f }),
+        make("Axis", { 0 }),
+        make("is_log", {false, true})))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16);
@@ -197,5 +217,3 @@ TEST_SUITE_END() // CL
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF