aboutsummaryrefslogtreecommitdiff
path: root/src/dynamic_fusion
diff options
context:
space:
mode:
authorGunes Bayir <gunes.bayir@arm.com>2024-01-17 16:07:03 +0000
committerViet-Hoa Do <viet-hoa.do@arm.com>2024-02-01 16:00:34 +0000
commit2b9fa593a0a172bf36a02b5cdb840c6b9b361d7c (patch)
treea4e2d5ce46443a79a0778e4960462ce3edf106ec /src/dynamic_fusion
parent7ab7fca87cca8775f82b0e9efec6a40975910c17 (diff)
downloadComputeLibrary-2b9fa593a0a172bf36a02b5cdb840c6b9b361d7c.tar.gz
Use the stable CKW API in the GPU dynamic fusion backend
- Refactor all kernels to work with the CKW stable API - Add support for sub-tile in the op_load/op_store CKW operator - Fix mismatch in resize - Add comments in all kernels written with CKW to help developers understand the structure of the code - Add texture image support in depthwise convolution written with CKW - Add support for different block sizes in depthwise convolution - Remove the use of the dynamic fusion helper functions. - Add support for floor in the op_unary() of CKW Resolves: COMPMID-6708, COMPMID-6743, COMPMID-6530 Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com> Signed-off-by: Gunes Bayir <gunes.bayir@arm.com> Signed-off-by: Viet-Hoa Do <viet-hoa.do@arm.com> Signed-off-by: Jakub Sujak <jakub.sujak@arm.com> Change-Id: I8104ce4d04a3138a1aeb0b84940e1f1c89e76069 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10914 Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Jakub Sujak <jakub.sujak@arm.com> Reviewed-by: Gunes Bayir <gunes.bayir@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Benchmark: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/dynamic_fusion')
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp50
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h39
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp36
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h19
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp18
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h19
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp6
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h11
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp259
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp249
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.cpp454
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.h7
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp431
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h17
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp399
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h9
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.cpp346
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.h7
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp419
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.cpp680
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp94
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.cpp (renamed from src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.cpp)43
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h65
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h181
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.cpp162
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h194
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.cpp (renamed from src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h)56
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h34
-rw-r--r--src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp10
-rw-r--r--src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp5
-rw-r--r--src/dynamic_fusion/sketch/gpu/operators/GpuCast.cpp10
-rw-r--r--src/dynamic_fusion/sketch/gpu/operators/GpuPool2d.cpp18
-rw-r--r--src/dynamic_fusion/sketch/gpu/operators/GpuResize.cpp6
-rw-r--r--src/dynamic_fusion/sketch/gpu/operators/GpuSub.cpp8
-rw-r--r--src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp28
35 files changed, 2553 insertions, 1836 deletions
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp
index c4ab110c92..a42b39700c 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,7 @@
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h"
-#include "ckw/Error.h"
+#include "compute_kernel_writer/include/ckw/Error.h"
namespace arm_compute
{
@@ -33,75 +33,71 @@ namespace experimental
namespace dynamic_fusion
{
-GpuCkwComponentArgument::GpuCkwComponentArgument()
+GpuCkwComponentArgument::GpuCkwComponentArgument(ckw::TensorOperand tensor) : _tensor(tensor)
{
}
-GpuCkwComponentArgument::GpuCkwComponentArgument(ckw::TensorOperand &tensor) : _tensor(&tensor)
-{
-}
-
-GpuCkwComponentArgument &GpuCkwComponentArgument::init_virtual_tensor(ckw::TileOperand &tile,
- const ckw::TensorTileSampler &tile_sampler)
+GpuCkwComponentArgument &GpuCkwComponentArgument::init_virtual_tensor(ckw::TileOperand &tile,
+ const ckw::TensorSampler &sampler)
{
CKW_ASSERT(_tile == nullptr);
- _tile = &tile;
- _tile_sampler = tile_sampler;
+ _tile = tile;
+ _sampler = sampler;
return *this;
}
bool GpuCkwComponentArgument::has_tensor() const
{
- return _tensor != nullptr;
+ return _tensor.is_valid();
}
ckw::TensorOperand &GpuCkwComponentArgument::tensor()
{
- CKW_ASSERT(_tensor != nullptr);
+ CKW_ASSERT(_tensor.is_valid());
- return *_tensor;
+ return _tensor;
}
const ckw::TensorOperand &GpuCkwComponentArgument::tensor() const
{
- CKW_ASSERT(_tensor != nullptr);
+ CKW_ASSERT(_tensor.is_valid());
- return *_tensor;
+ return _tensor;
}
bool GpuCkwComponentArgument::has_tile() const
{
- return _tile != nullptr;
+ return _tile.is_valid();
}
ckw::TileOperand &GpuCkwComponentArgument::tile()
{
- CKW_ASSERT(_tile != nullptr);
+ CKW_ASSERT(_tile.is_valid());
- return *_tile;
+ return _tile;
}
const ckw::TileOperand &GpuCkwComponentArgument::tile() const
{
- CKW_ASSERT(_tile != nullptr);
+ CKW_ASSERT(_tile.is_valid());
- return *_tile;
+ return _tile;
}
-ckw::TensorTileSampler &GpuCkwComponentArgument::tile_sampler()
+ckw::TensorSampler &GpuCkwComponentArgument::tensor_sampler()
{
- CKW_ASSERT(_tile != nullptr);
+ CKW_ASSERT(_tile.is_valid());
- return _tile_sampler;
+ return _sampler;
}
-const ckw::TensorTileSampler &GpuCkwComponentArgument::tile_sampler() const
+const ckw::TensorSampler &GpuCkwComponentArgument::tensor_sampler() const
{
- CKW_ASSERT(_tile != nullptr);
+ CKW_ASSERT(_tile.is_valid());
- return _tile_sampler;
+ return _sampler;
}
} // namespace dynamic_fusion
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h
index 863989a7bd..7a57c81e5f 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,13 +25,9 @@
#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWCOMPONENTARGUMENT_H
#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWCOMPONENTARGUMENT_H
-#include "ckw/TensorTileSampler.h"
-
-namespace ckw
-{
-class TensorOperand;
-class TileOperand;
-} // namespace ckw
+#include "compute_kernel_writer/include/ckw/TensorOperand.h"
+#include "compute_kernel_writer/include/ckw/TensorSampler.h"
+#include "compute_kernel_writer/include/ckw/TileOperand.h"
namespace arm_compute
{
@@ -44,24 +40,27 @@ namespace dynamic_fusion
class GpuCkwComponentArgument
{
public:
- /** Initialize a new instance of @ref GpuCkwComponentArgument class for empty virtual tensor. */
- GpuCkwComponentArgument();
+ /** Default constructor */
+ GpuCkwComponentArgument() = default;
/** Initialize a new instance of @ref GpuCkwComponentArgument class for user tensor.
*
* @param[in] tensor The user tensor.
*/
- explicit GpuCkwComponentArgument(ckw::TensorOperand &tensor);
+ explicit GpuCkwComponentArgument(ckw::TensorOperand tensor);
- /** Set virtual tensor information (tile, sampler) for the argument.
+ /** Bind the tile and sampler to the tensor argument.
*
- * If the component is a user tensor, it can be treated as virtual tensor as well
- * and won't be loaded again using @ref GpuCkwKernelWriter::op_load_once method.
+ * This method can be used to share a tile and sampler associated to a tensor
+ * among different kernel components. For example, when we create the destination
+ * tile and destination sampler for the first time (root component), this method can be
+ * used to bind these two information to the destination tensor so that the following
+ * simple components know the tile size and how to access the elements from memory.
*
* @param[in] tile The tile that has been loaded.
* @param[in] sampler The tensor sampling information that has been used to load the tile.
*/
- GpuCkwComponentArgument &init_virtual_tensor(ckw::TileOperand &tile, const ckw::TensorTileSampler &sampler);
+ GpuCkwComponentArgument &init_virtual_tensor(ckw::TileOperand &tile, const ckw::TensorSampler &sampler);
/** Get whether the argument is a user tensor. */
bool has_tensor() const;
@@ -101,18 +100,18 @@ public:
*
* If the tile is not available, throw an error.
*/
- ckw::TensorTileSampler &tile_sampler();
+ ckw::TensorSampler &tensor_sampler();
/** Get the tensor sampling information for the tile.
*
* If the tile is not available, throw an error.
*/
- const ckw::TensorTileSampler &tile_sampler() const;
+ const ckw::TensorSampler &tensor_sampler() const;
private:
- ckw::TensorOperand *_tensor{nullptr};
- ckw::TileOperand *_tile{nullptr};
- ckw::TensorTileSampler _tile_sampler{};
+ ckw::TensorOperand _tensor{};
+ ckw::TileOperand _tile{};
+ ckw::TensorSampler _sampler{};
};
} // namespace dynamic_fusion
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp
index c927f32bde..a0e5e16aa0 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,13 +28,17 @@
#include "src/common/utils/Log.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h"
#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+#include "compute_kernel_writer/include/ckw/KernelWriter.h"
+#include "compute_kernel_writer/include/ckw/types/TargetArchitecture.h"
+#include "compute_kernel_writer/include/ckw/types/TargetLanguage.h"
+
using namespace ckw;
+
namespace arm_compute
{
namespace experimental
@@ -42,21 +46,22 @@ namespace experimental
namespace dynamic_fusion
{
GpuCkwDriver::GpuCkwDriver(const GpuKernelComponentGroup &components)
- : _components{components}, _kernel{GpuTargetLanguage::OpenCL}, _code{}
{
+ _components = components;
+
// Generate kernel name
- std::string name = "";
+ std::string kernel_name;
for (auto &comp : _components)
{
auto ckw_driver = comp->ckw_component_driver();
ARM_COMPUTE_ERROR_ON(ckw_driver == nullptr);
- name += ckw_driver->get_name(_components) + "__";
+ kernel_name += ckw_driver->get_name(_components) + "__";
}
// Generate kernel code
- _kernel.name(name);
- GpuCkwKernelWriter root_writer(_kernel);
- GpuCkwScopedKernelWriter writer(&root_writer);
+ auto root_writer =
+ KernelWriter::create_instance(ckw::TargetArchitecture::GpuArmMaliValhall, ckw::TargetLanguage::OpenCL);
+ GpuCkwScopedKernelWriter writer(root_writer.get());
GpuCkwVariableTable vtable{};
for (auto &comp : _components)
@@ -65,22 +70,27 @@ GpuCkwDriver::GpuCkwDriver(const GpuKernelComponentGroup &components)
ARM_COMPUTE_ERROR_ON(ckw_driver == nullptr);
ckw_driver->write_component_code(_components, vtable, writer);
}
- _code = root_writer.generate_code();
+ auto kernel = root_writer->emit_kernel(kernel_name);
+
+ // Set the kernel name, kernel arguments and source code
+ _kernel_name = kernel_name;
+ _kernel_args = kernel->arguments();
+ _kernel_code = kernel->source_code();
}
std::string GpuCkwDriver::get_name()
{
- return _kernel.name();
+ return _kernel_name;
}
std::string GpuCkwDriver::get_code()
{
- return _code;
+ return _kernel_code;
}
std::string GpuCkwDriver::get_config_id()
{
- std::string id = "";
+ std::string id;
for (auto &comp : _components)
{
auto ckw_driver = comp->ckw_component_driver();
@@ -100,7 +110,7 @@ Window GpuCkwDriver::get_window() const
GpuKernelArgumentList GpuCkwDriver::get_kernel_arguments()
{
GpuKernelArgumentList args{};
- for (const auto &arg : _kernel.arguments())
+ for (const auto &arg : _kernel_args)
{
switch (arg.type())
{
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h
index 2ca5fb435c..b80ce0d816 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWDRIVER
-#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWDRIVER
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWDRIVER_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWDRIVER_H
#include "ckw/Kernel.h"
@@ -30,6 +30,8 @@
#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
#include "src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h"
+#include "compute_kernel_writer/include/ckw/Kernel.h"
+#include "compute_kernel_writer/include/ckw/KernelArgument.h"
#include <map>
#include <string>
@@ -49,7 +51,7 @@ class GpuCkwDriver : public IGpuKernelWriter
{
public:
/** Default constructor */
- GpuCkwDriver() = default;
+ GpuCkwDriver() = delete;
/** Constructor
*
* @param[in] components Kernel component group from which the kernel will be generated
@@ -69,13 +71,14 @@ public:
GpuKernelArgumentList get_kernel_arguments() override;
private:
- GpuKernelComponentGroup _components{};
- ckw::Kernel _kernel;
- std::string _code;
+ GpuKernelComponentGroup _components{};
+ std::string _kernel_name{};
+ std::vector<ckw::KernelArgument> _kernel_args{};
+ std::string _kernel_code{};
};
} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
-#endif /* ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWDRIVER */
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWDRIVER_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp
index cbadbd9639..ae12d13e5a 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,8 +24,6 @@
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
-
namespace arm_compute
{
namespace experimental
@@ -33,34 +31,34 @@ namespace experimental
namespace dynamic_fusion
{
-GpuCkwScopedKernelWriter::GpuCkwScopedKernelWriter(GpuCkwKernelWriter *writer)
+GpuCkwScopedKernelWriter::GpuCkwScopedKernelWriter(ckw::KernelWriter *writer)
: _writer(writer), _parent_id_space(writer->id_space())
{
- _writer->next_id_space();
+ _writer->new_id_space();
}
GpuCkwScopedKernelWriter::GpuCkwScopedKernelWriter(const GpuCkwScopedKernelWriter &other)
: _writer(other._writer), _parent_id_space(other._writer->id_space())
{
- _writer->next_id_space();
+ _writer->new_id_space();
}
-GpuCkwKernelWriter *GpuCkwScopedKernelWriter::operator->()
+ckw::KernelWriter *GpuCkwScopedKernelWriter::operator->()
{
return _writer;
}
-const GpuCkwKernelWriter *GpuCkwScopedKernelWriter::operator->() const
+const ckw::KernelWriter *GpuCkwScopedKernelWriter::operator->() const
{
return _writer;
}
-GpuCkwKernelWriter *GpuCkwScopedKernelWriter::writer()
+ckw::KernelWriter *GpuCkwScopedKernelWriter::writer()
{
return _writer;
}
-const GpuCkwKernelWriter *GpuCkwScopedKernelWriter::writer() const
+const ckw::KernelWriter *GpuCkwScopedKernelWriter::writer() const
{
return _writer;
}
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h
index 81049bfe37..84dd706cd0 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,6 +25,7 @@
#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWSCOPEDKERNELWRITER_H
#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWSCOPEDKERNELWRITER_H
+#include "compute_kernel_writer/include/ckw/KernelWriter.h"
#include <cstdint>
namespace arm_compute
@@ -34,14 +35,12 @@ namespace experimental
namespace dynamic_fusion
{
-class GpuCkwKernelWriter;
-
/** Helper to automatically manage kernel writer ID space. */
class GpuCkwScopedKernelWriter
{
public:
/** Initialize a new instance of @ref GpuCkwScopedKernelWriter class. */
- explicit GpuCkwScopedKernelWriter(GpuCkwKernelWriter *writer);
+ explicit GpuCkwScopedKernelWriter(ckw::KernelWriter *writer);
/** Create a new scope from the specified scoped kernel writer. */
GpuCkwScopedKernelWriter(const GpuCkwScopedKernelWriter &other);
@@ -50,20 +49,20 @@ public:
GpuCkwScopedKernelWriter &operator=(const GpuCkwScopedKernelWriter &) = delete;
/** Access the underlying kernel writer. */
- GpuCkwKernelWriter *operator->();
+ ckw::KernelWriter *operator->();
/** Access the underlying kernel writer. */
- const GpuCkwKernelWriter *operator->() const;
+ const ckw::KernelWriter *operator->() const;
/** Get the kernel writer. */
- GpuCkwKernelWriter *writer();
+ ckw::KernelWriter *writer();
/** Get the kernel writer. */
- const GpuCkwKernelWriter *writer() const;
+ const ckw::KernelWriter *writer() const;
private:
- GpuCkwKernelWriter *_writer;
- int32_t _parent_id_space;
+ ckw::KernelWriter *_writer;
+ int32_t _parent_id_space;
};
} // namespace dynamic_fusion
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp
index 88a0cf7f43..66ccc1ac34 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,6 @@
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
@@ -40,7 +39,6 @@ namespace dynamic_fusion
GpuCkwComponentArgument *GpuCkwVariableTable::declare_variable(const GpuKernelComponentGroup &comp_group,
GpuCkwScopedKernelWriter &writer,
const ITensorInfo *tensor,
- TensorStorageType storage,
const std::string &alias)
{
ARM_COMPUTE_ERROR_ON_MSG(!tensor->has_valid_id(), "Tensor info with valid id expected");
@@ -65,7 +63,7 @@ GpuCkwComponentArgument *GpuCkwVariableTable::declare_variable(const GpuKernelCo
std::stringstream ss;
ss << alias << "_t" << abs(tensor->id());
const auto uniq_name = ss.str();
- GpuCkwComponentArgument var{writer->declare_tensor_argument(uniq_name, to_ckw(*tensor), to_ckw(storage))};
+ GpuCkwComponentArgument var{writer->declare_tensor_argument(uniq_name, to_ckw(*tensor))};
auto &&inserted = _vars.emplace(tensor->id(), var);
return &(inserted.first->second);
}
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h
index 2b118911b8..fc8764c3e2 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWVARIABLETABLE
-#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWVARIABLETABLE
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWVARIABLETABLE_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWVARIABLETABLE_H
#include "arm_compute/core/ITensorInfo.h"
@@ -38,7 +38,6 @@ namespace dynamic_fusion
{
class GpuKernelComponentGroup;
class GpuCkwScopedKernelWriter;
-enum class TensorStorageType;
/** A table of all the variables used in the kernel.
*
@@ -54,7 +53,6 @@ public:
* @param[in] comp_group Component group the tensor belongs to
* @param[in] writer Compute Kernel Writer
* @param[in] tensor Tensor info with which the new variable is associated
- * @param[in] storage Tensor storage type associated with the tensor
* @param[in] alias Alias for the variable. Will be used as part of the variable name
*
* @return GpuCkwComponentArgument*
@@ -62,7 +60,6 @@ public:
GpuCkwComponentArgument *declare_variable(const GpuKernelComponentGroup &comp_group,
GpuCkwScopedKernelWriter &writer,
const ITensorInfo *tensor,
- TensorStorageType storage,
const std::string &alias = "unnamed");
private:
@@ -72,4 +69,4 @@ private:
} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
-#endif /* ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWVARIABLETABLE */
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWVARIABLETABLE_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp
index c3b1b3c8bc..68f478a7ce 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,69 +26,29 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/Validate.h"
-#include "ckw/TensorTileSampler.h"
#include "src/core/helpers/WindowHelpers.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "compute_kernel_writer/include/ckw/KernelWriter.h"
+#include <cstdint>
#include <string>
-using namespace ckw;
namespace arm_compute
{
namespace experimental
{
namespace dynamic_fusion
{
-namespace
-{
-/** Create a simple sampler from tile of dimension [m0, n0]
- */
-inline TensorTileSampler create_sampler(GpuCkwScopedKernelWriter &writer, int32_t m0, int32_t n0)
-{
- TensorTileSampler sampler;
-
- auto &gid_0 = writer->declare_tile("gid_0", ckw::DataType::Int32);
- auto &gid_1 = writer->declare_tile("gid_1", ckw::DataType::Int32);
- auto &gid_2 = writer->declare_tile("gid_2", ckw::DataType::Int32);
-
- auto &const_0 = writer->declare_tile("0", 0);
- writer->op_get_global_id(gid_0, 0);
- writer->op_get_global_id(gid_1, 1);
- writer->op_get_global_id(gid_2, 2);
-
- auto &x_coord = writer->declare_tile("x_coord", ckw::DataType::Int32);
- auto &y_coord = writer->declare_tile("y_coord", ckw::DataType::Int32);
- auto &m0_t = writer->declare_tile("m0", m0);
- auto &n0_t = writer->declare_tile("n0", n0);
- writer->op_binary_expression(x_coord, gid_0, BinaryOp::Mul, n0_t);
- writer->op_binary_expression(y_coord, gid_1, BinaryOp::Mul, m0_t);
-
- sampler.x(x_coord);
- sampler.y(y_coord);
- sampler.z(const_0); // 3rd dimension collapsed with 2nd dimension
- sampler.b(gid_2);
-
- sampler.width(n0);
- sampler.height(m0);
-
- sampler.format(TensorSamplerFormat::C_WH_1); // 3rd dimension collapsed with 2nd dimension
- sampler.address_mode_x(TensorSamplerAddressModeX::None);
- sampler.address_mode_y(TensorSamplerAddressModeY::ClampToBorder);
- sampler.address_mode_z(TensorSamplerAddressModeZ::Skip); // Dimensions higher than 3 not supported yet
-
- return sampler;
-}
-} // namespace
GpuCkwActivation::GpuCkwActivation(ComponentId id,
const ArgumentPack<ITensorInfo> &tensors,
- const Attributes &attributes)
+ const Attributes &attributes) // NOLINT
: IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{}, _attributes{attributes}
{
_src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
@@ -100,78 +60,223 @@ void GpuCkwActivation::write_component_code(const ComponentGroup &comp_group,
GpuCkwVariableTable &vtable,
GpuCkwScopedKernelWriter writer) const
{
- const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
- const unsigned int n0 = root_window.x().step();
- const unsigned int m0 = root_window.y().step();
+ /********************************************************************************
+ * 1 - Define tensors
+ ********************************************************************************/
+ GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src");
+ GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
+
+ /********************************************************************************
+ * 2 - Define CKW constants
+ ********************************************************************************/
+ const auto dst_h = static_cast<int32_t>(_dst->dimension(1));
+ const auto dst_dt = to_ckw(_dst->data_type());
+
+ // CKW constants
+ auto const_dst_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_h}}, ckw::DataType::Int32));
+ auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32));
+ auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32));
+ auto const_neg_1_fp = writer->declare_constant_tile(ckw::ConstantData({{-1.0f}}, dst_dt));
+ auto const_pos_1_fp = writer->declare_constant_tile(ckw::ConstantData({{1.0f}}, dst_dt));
+ auto const_0_fp = writer->declare_constant_tile(ckw::ConstantData({{0.0f}}, dst_dt));
+ auto const_A_fp = writer->declare_constant_tile(ckw::ConstantData({{_attributes.a()}}, dst_dt));
+ auto const_B_fp = writer->declare_constant_tile(ckw::ConstantData({{_attributes.b()}}, dst_dt));
+
+ /********************************************************************************
+ * 3 - Define the compute block parameters and destination tile (if not root component)
+ * Bind the tile to the tensor to share it among different components and
+ * initialize the compute block parameters
+ ********************************************************************************/
+ // The compute block parameters depend on the employed tensor format
+
+ // Destination compute block size
+ int32_t dst_n0 = -1;
+ int32_t dst_m0 = -1;
+
+ // Destination compute block size left-over
+ int32_t dst_n0_partial = -1;
+ int32_t dst_m0_partial = -1;
+
+ // Shift-back for the overlapping-min strategy
+ int32_t dst_shift_back = -1;
+
+ if (!dst->has_tile())
+ {
+ // If ROOT component, we use ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1
+ // as tensor format
+ const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
+
+ dst_n0 = root_window.x().step();
+ dst_m0 = root_window.y().step();
+ dst_n0_partial = _dst->dimension(0) % dst_n0;
+ dst_m0_partial = (_dst->dimension(1) * _dst->dimension(2)) % dst_m0;
+ dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0;
- GpuCkwComponentArgument *src =
- vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src");
- GpuCkwComponentArgument *dst =
- vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
+ ckw::TensorSampler sampler_dst;
+ sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1);
- load_src_dst_tiles_and_prepare_sampler(writer, src, dst, m0, n0, create_sampler);
+ if (dst_n0_partial == 0)
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin);
+ }
- auto &src_tile = src->tile();
- auto &dst_tile = dst->tile();
+ if (dst_m0_partial == 0)
+ {
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::ClampToBorderMaxOnly);
+ }
+
+ sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+ // Declare destination tile
+ auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, dst_m0, dst_n0));
+
+ // Bind tile to the tensor
+ dst->init_virtual_tensor(tile_dst, sampler_dst);
+ }
+ else
+ {
+ // dst_m0_partial depends on the TensorSamplerFormat
+ dst_n0 = dst->tile().tile_info().width();
+ dst_m0 = dst->tile().tile_info().height();
+ dst_n0_partial = _dst->dimension(0) % dst_n0;
+
+ ckw::TensorSampler sampler_dst = dst->tensor_sampler();
+
+ if (sampler_dst.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1)
+ {
+ dst_m0_partial = (_dst->dimension(1) * _dst->dimension(2)) % dst_m0;
+ }
+ else if (sampler_dst.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2)
+ {
+ dst_m0_partial = _dst->dimension(1) % dst_m0;
+ }
+
+ // Shift-back for the overlapping-min strategy
+ dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0;
+ }
+
+ const auto &tile_dst = dst->tile();
+
+ /********************************************************************************
+ * 4 - Define the compute block parameters CKW constants
+ ********************************************************************************/
+ // Only now we can declare the N0 and M0 as constant
+ auto const_dst_n0 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32));
+ auto const_dst_m0 = writer->declare_constant_tile(ckw::ConstantData({{dst_m0}}, ckw::DataType::Int32));
+ auto const_dst_shift_back_n0 =
+ writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32));
+
+ /********************************************************************************
+ * 5 - Define the sampler for the input tensor
+ ********************************************************************************/
+ if (!src->has_tile())
+ {
+ // Sampler
+ ckw::TensorSampler sampler_src = dst->tensor_sampler();
+
+ auto tile_gid_0 = writer->declare_tile("gid_0_src", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_1 = writer->declare_tile("gid_1_src", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_2 = writer->declare_tile("gid_2_src", ckw::TileInfo(ckw::DataType::Int32));
+
+ writer->op_get_global_id(tile_gid_0, 0);
+ writer->op_get_global_id(tile_gid_1, 1);
+ writer->op_get_global_id(tile_gid_2, 2);
+
+ auto tile_nout0 = writer->declare_tile("nout0_src", ckw::TileInfo(ckw::DataType::Int32)); // OFM
+ auto tile_mout0 =
+ writer->declare_tile("mout0_src", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH or WIDTH x HEIGHT
+ auto tile_mout1 = writer->declare_tile("mout1_src", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT or 0
+ auto tile_bout0 = writer->declare_tile("bout0_src", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX
+
+ get_coordinate_from_gws_overlapping_min(writer, tile_nout0, tile_gid_0, const_dst_n0, const_dst_shift_back_n0,
+ const_0_i32);
+ get_coordinate_from_gws(writer, tile_mout0, tile_gid_1, const_dst_m0);
+
+ // Get the boundary aware coordinates at each global dimension index
+ if (sampler_src.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1)
+ {
+ writer->op_assign(tile_mout1, const_0_i32);
+ get_coordinate_from_gws(writer, tile_bout0, tile_gid_2, const_pos_1_i32);
+ }
+ else if (sampler_src.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2)
+ {
+ writer->op_binary(tile_mout1, ckw::BinaryOp::Mod, tile_gid_2, const_dst_h_i32);
+ writer->op_binary(tile_bout0, ckw::BinaryOp::Div, tile_gid_2, const_dst_h_i32);
+ }
+
+ auto tile_src = writer->declare_tile("src", ckw::TileInfo(dst_dt, dst_m0, dst_n0));
+
+ writer->op_load(tile_src, src->tensor(), sampler_src, tile_nout0, tile_mout0, tile_mout1, tile_bout0);
+
+ // Here, init_virtual_tensor() it is used to bring the tile_src outside the compound statement
+ src->init_virtual_tensor(tile_src, sampler_src);
+ }
- // Constants
- const auto &constant_minus_1 = writer->declare_tile("minus_1", -1);
- const auto &constant_pos_1 = writer->declare_tile("one", 1);
- const auto &constant_zero = writer->declare_tile("zero", 0);
- const auto &constant_A = writer->declare_tile("A_VAL", _attributes.a());
- const auto &constant_B = writer->declare_tile("B_VAL", _attributes.b());
+ const auto &tile_src = src->tile();
- // Perform the operation.
+ /********************************************************************************
+ * 7 - Write the rest of the code
+ ********************************************************************************/
switch (_attributes.activation())
{
case ActivationLayerInfo::ActivationFunction::LOGISTIC:
{
// dst = src * -1
- writer->op_binary_expression(dst_tile, src_tile, BinaryOp::Mul, constant_minus_1);
+ writer->op_binary(tile_dst, ckw::BinaryOp::Mul, tile_src, const_neg_1_fp);
// dst = exp(src * -1)
- writer->op_unary_elementwise_function(dst_tile, UnaryFunction::Exp, dst_tile);
+ writer->op_unary(tile_dst, ckw::UnaryOp::Exp, tile_dst);
// dst = 1 + (exp(src * -1))
- writer->op_binary_expression(dst_tile, dst_tile, BinaryOp::Add, constant_pos_1);
+ writer->op_binary(tile_dst, ckw::BinaryOp::Add, tile_dst, const_pos_1_fp);
// dst = 1 / 1 + (exp(src * -1))
- writer->op_binary_expression(dst_tile, constant_pos_1, BinaryOp::Div, dst_tile);
+ writer->op_binary(tile_dst, ckw::BinaryOp::Div, const_pos_1_fp, tile_dst);
break;
}
case ActivationLayerInfo::ActivationFunction::TANH:
{
// dst = B_VAL * src
- writer->op_binary_expression(dst_tile, src_tile, BinaryOp::Mul, constant_B);
+ writer->op_binary(tile_dst, ckw::BinaryOp::Mul, tile_src, const_B_fp);
// dst = tanh(B_VAL * src)
- writer->op_unary_elementwise_function(dst_tile, UnaryFunction::Tanh, dst_tile);
+ writer->op_unary(tile_dst, ckw::UnaryOp::Tanh, tile_dst);
// dst = A_VAL * tanh(B_VAL * src)
- writer->op_binary_expression(dst_tile, dst_tile, BinaryOp::Mul, constant_A);
+ writer->op_binary(tile_dst, ckw::BinaryOp::Mul, tile_dst, const_A_fp);
break;
}
case ActivationLayerInfo::ActivationFunction::RELU:
{
// dst = max(src, 0)
- writer->op_binary_elementwise_function(dst_tile, ckw::BinaryFunction::Max, src_tile, constant_zero);
+ writer->op_binary(tile_dst, ckw::BinaryOp::Max, tile_src, const_0_fp);
break;
}
case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
{
//dst = max(src, 0)
- writer->op_binary_elementwise_function(dst_tile, ckw::BinaryFunction::Max, src_tile, constant_zero);
+ writer->op_binary(tile_dst, ckw::BinaryOp::Max, tile_src, const_0_fp);
//dst = min(max(src, 0), A_VAL)
- writer->op_binary_elementwise_function(dst_tile, ckw::BinaryFunction::Min, dst_tile, constant_A);
+ writer->op_binary(tile_dst, ckw::BinaryOp::Min, tile_dst, const_A_fp);
break;
}
case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
{
//dst = max(src, B_VAL)
- writer->op_binary_elementwise_function(dst_tile, ckw::BinaryFunction::Max, src_tile, constant_B);
+ writer->op_binary(tile_dst, ckw::BinaryOp::Max, tile_src, const_B_fp);
//dst = min(max(src, B_VAL), A_VAL)
- writer->op_binary_elementwise_function(dst_tile, ckw::BinaryFunction::Min, dst_tile, constant_A);
+ writer->op_binary(tile_dst, ckw::BinaryOp::Min, tile_dst, const_A_fp);
break;
}
default:
CKW_ASSERT(false);
break;
}
+ ARM_COMPUTE_ERROR_ON_MSG(dst->has_tile() == false, "You must bind a tile before appending another component");
}
Window GpuCkwActivation::get_window() const
@@ -182,8 +287,8 @@ Window GpuCkwActivation::get_window() const
// Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) unchanged
// This is in line with the collapsing convention used by operators like Conv2d
output_shape.collapse(2U, 1U);
- constexpr unsigned int vector_size_byte_opencl = 16;
- const unsigned int num_elems_processed_per_iteration =
+ constexpr uint32_t vector_size_byte_opencl = 16;
+ const uint32_t num_elems_processed_per_iteration =
adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0));
Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration));
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp
index e8e5087633..d3e0dbafd4 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,65 +26,25 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/Validate.h"
-#include "ckw/TensorTileSampler.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "compute_kernel_writer/include/ckw/KernelWriter.h"
+#include <cstdint>
#include <string>
-using namespace ckw;
namespace arm_compute
{
namespace experimental
{
namespace dynamic_fusion
{
-namespace
-{
-/** Create a simple sampler from tile of dimension [m0, n0]
- */
-inline TensorTileSampler create_sampler(GpuCkwScopedKernelWriter &writer, int32_t m0, int32_t n0)
-{
- TensorTileSampler sampler;
-
- auto &gid_0 = writer->declare_tile("gid_0", ckw::DataType::Int32);
- auto &gid_1 = writer->declare_tile("gid_1", ckw::DataType::Int32);
- auto &gid_2 = writer->declare_tile("gid_2", ckw::DataType::Int32);
-
- auto &const_0 = writer->declare_tile("0", 0);
- writer->op_get_global_id(gid_0, 0);
- writer->op_get_global_id(gid_1, 1);
- writer->op_get_global_id(gid_2, 2);
-
- auto &x_coord = writer->declare_tile("x_coord", ckw::DataType::Int32);
- auto &y_coord = writer->declare_tile("y_coord", ckw::DataType::Int32);
- auto &m0_t = writer->declare_tile("m0", m0);
- auto &n0_t = writer->declare_tile("n0", n0);
- writer->op_binary_expression(x_coord, gid_0, BinaryOp::Mul, n0_t);
- writer->op_binary_expression(y_coord, gid_1, BinaryOp::Mul, m0_t);
-
- sampler.x(x_coord);
- sampler.y(y_coord);
- sampler.z(const_0); // 3rd dimension collapsed with 2nd dimension
- sampler.b(gid_2);
-
- sampler.width(n0);
- sampler.height(m0);
-
- sampler.format(TensorSamplerFormat::C_WH_1); // 3rd dimension collapsed with 2nd dimension
- sampler.address_mode_x(TensorSamplerAddressModeX::None);
- sampler.address_mode_y(TensorSamplerAddressModeY::ClampToBorder);
- sampler.address_mode_z(TensorSamplerAddressModeZ::Skip); // Dimensions higher than 3 not supported yet
-
- return sampler;
-}
-} // namespace
GpuCkwCast::GpuCkwCast(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes)
: IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{}, _attributes{attributes}
@@ -92,72 +52,187 @@ GpuCkwCast::GpuCkwCast(ComponentId id, const ArgumentPack<ITensorInfo> &tensors,
_src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
_dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
+ ARM_COMPUTE_ERROR_ON_MSG(is_data_type_float(_src->data_type()) == false,
+ "The source data type must be a floating-point data type");
}
void GpuCkwCast::write_component_code(const ComponentGroup &comp_group,
GpuCkwVariableTable &vtable,
GpuCkwScopedKernelWriter writer) const
{
- const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
- const unsigned int n0 = root_window.x().step();
- const unsigned int m0 = root_window.y().step();
+ /********************************************************************************
+ * 1 - Define tensors
+ ********************************************************************************/
+ GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src");
+ GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
- GpuCkwComponentArgument *src =
- vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src");
- GpuCkwComponentArgument *dst =
- vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
+ /********************************************************************************
+ * 2 - Define CKW constants
+ ********************************************************************************/
+ const auto dst_h = static_cast<int32_t>(_dst->dimension(1));
- // Load the source tile and prepare the sampler.
- if (!src->has_tile())
+ // CKW constants
+ auto const_dst_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_h}}, ckw::DataType::Int32));
+ auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32));
+ auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32));
+
+ /********************************************************************************
+ * 3 - Define the compute block parameters and destination tile (if not root component)
+ * Bind the tile to the tensor to share it among different components and
+ * initialize the compute block parameters
+ ********************************************************************************/
+ // The compute block parameters depend on the employed tensor format
+
+ // Destination compute block size
+ int32_t dst_n0 = -1;
+ int32_t dst_m0 = -1;
+
+ // Destination compute block size left-over
+ int32_t dst_n0_partial = -1;
+ int32_t dst_m0_partial = -1;
+
+ // Shift-back for the overlapping-min strategy
+ int32_t dst_shift_back = -1;
+
+ if (!dst->has_tile())
{
- const auto sampler = create_sampler(writer, m0, n0);
- writer->op_load_once(src, sampler);
+ // If ROOT component, we use ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1
+ // as tensor format
+ const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
+
+ dst_n0 = root_window.x().step();
+ dst_m0 = root_window.y().step();
+ dst_n0_partial = _dst->dimension(0) % dst_n0;
+ dst_m0_partial = (_dst->dimension(1) * _dst->dimension(2)) % dst_m0;
+ dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0;
+
+ ckw::TensorSampler sampler_dst;
+ sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1);
+ if (dst_n0_partial == 0)
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin);
+ }
+
+ if (dst_m0_partial == 0)
+ {
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::ClampToBorderMaxOnly);
+ }
+
+ sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+ // Declare destination tile
+ ckw::DataType dst_dt = to_ckw(_dst->data_type());
+ auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, dst_m0, dst_n0));
+
+ // Bind tile to the tensor
+ dst->init_virtual_tensor(tile_dst, sampler_dst);
}
else
{
- const auto &sampler = src->tile_sampler();
- writer->op_load_once(src, sampler);
+ // Change dst_n0 and dst_m0 if NOT root component!
+ // ATTENTION:
+ // dst_m0_partial depends on the TensorSamplerFormat
+ dst_n0 = dst->tile().tile_info().width();
+ dst_m0 = dst->tile().tile_info().height();
+ dst_n0_partial = _dst->dimension(0) % dst_n0;
+
+ ckw::TensorSampler sampler_dst = dst->tensor_sampler();
+
+ if (sampler_dst.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1)
+ {
+ dst_m0_partial = (_dst->dimension(1) * _dst->dimension(2)) % dst_m0;
+ }
+ else if (sampler_dst.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2)
+ {
+ dst_m0_partial = _dst->dimension(1) % dst_m0;
+ }
+
+ // Shift-back for the overlapping-min strategy
+ dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0;
}
- const auto &src_tile = src->tile();
- const auto &sampler = src->tile_sampler();
+ const auto &tile_dst = dst->tile();
- // Prepare the output tile.
- if (!dst->has_tile())
+ /********************************************************************************
+ * 4 - Define the compute block parameters CKW constants
+ ********************************************************************************/
+ // Only now we can declare the N0 and M0 as constants
+ auto const_dst_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32));
+ auto const_dst_m0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_m0}}, ckw::DataType::Int32));
+ auto const_dst_shift_back_n0_i32 =
+ writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32));
+
+ /********************************************************************************
+ * 5 - Define the sampler for the input tensor
+ ********************************************************************************/
+ if (!src->has_tile())
{
- // Get Target datatype and convert it to ckw::DataType.
- ckw::DataType target_dt = dynamic_fusion::to_ckw(_attributes.data_type());
+ // Sampler
+ ckw::TensorSampler sampler_src = dst->tensor_sampler();
- // Create dst_tile based on src_tile dimensions and with target DataType.
- const TileInfo src_tile_info = src_tile.tile_info();
- const TileInfo dst_tile_info = TileInfo(target_dt, src_tile_info.height(), src_tile_info.width());
+ auto tile_gid_0 = writer->declare_tile("gid_0", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_1 = writer->declare_tile("gid_1", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_2 = writer->declare_tile("gid_2", ckw::TileInfo(ckw::DataType::Int32));
- // Declare dst_tile
- auto &tile = writer->declare_tile("dst_tile", dst_tile_info);
- dst->init_virtual_tensor(tile, sampler);
- }
+ writer->op_get_global_id(tile_gid_0, 0);
+ writer->op_get_global_id(tile_gid_1, 1);
+ writer->op_get_global_id(tile_gid_2, 2);
- const auto &dst_tile = dst->tile();
+ auto tile_cout0 = writer->declare_tile("cout0", ckw::TileInfo(ckw::DataType::Int32)); // OFM
+ auto tile_mout0 = writer->declare_tile("mout0", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH or WIDTH x HEIGHT
+ auto tile_mout1 = writer->declare_tile("mout1", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT or 0
+ auto tile_bout0 = writer->declare_tile("bout0", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX
- // Check if this op is cast-down or cast-up
- const size_t src_size = data_size_from_type(_src->data_type());
- const size_t dst_size = data_size_from_type(_dst->data_type());
- const bool cast_down = (src_size >= dst_size);
+ // Calculate coordinates
+ get_coordinate_from_gws_overlapping_min(writer, tile_cout0, tile_gid_0, const_dst_n0_i32,
+ const_dst_shift_back_n0_i32, const_0_i32);
+ get_coordinate_from_gws(writer, tile_mout0, tile_gid_1, const_dst_m0_i32);
- if (cast_down && is_data_type_quantized(_src->data_type()))
- {
- const auto &constant_x80 = writer->declare_tile("0x80", 0x80);
- writer->op_binary_expression(src_tile, src_tile, BinaryOp::BitwiseXOR, constant_x80);
- }
+ // Get the boundary aware coordinates at each global dimension index
+ if (sampler_src.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1)
+ {
+ writer->op_assign(tile_mout1, const_0_i32);
+ get_coordinate_from_gws(writer, tile_bout0, tile_gid_2, const_pos_1_i32);
+ }
+ else if (sampler_src.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2)
+ {
+ writer->op_binary(tile_mout1, ckw::BinaryOp::Mod, tile_gid_2, const_dst_h_i32);
+ writer->op_binary(tile_bout0, ckw::BinaryOp::Div, tile_gid_2, const_dst_h_i32);
+ }
+ ckw::DataType src_dt = to_ckw(_src->data_type());
+ auto tile_src = writer->declare_tile("src", ckw::TileInfo(src_dt, dst_m0, dst_n0));
- ckw::ConvertPolicy convert_policy = ckw::ConvertPolicy::None;
+ writer->op_load(tile_src, src->tensor(), sampler_src, tile_cout0, tile_mout0, tile_mout1, tile_bout0);
- if (cast_down && (is_data_type_float(_src->data_type()) || _attributes.convert_policy() == ConvertPolicy::SATURATE))
- {
- convert_policy = ckw::ConvertPolicy::Saturate;
+ // Here, init_virtual_tensor() is used to bring the tile_src outside the compound statement
+ src->init_virtual_tensor(tile_src, sampler_src);
}
- writer->op_cast_expression(dst_tile, src_tile, convert_policy);
+ auto tile_src = src->tile();
+
+ /********************************************************************************
+ * 6 - Extra operations required before writing the main code (optional)
+ ********************************************************************************/
+
+ // Not required
+
+ /********************************************************************************
+ * 7 - Write the rest of the code
+ ********************************************************************************/
+ // Only None ConvertPolicy is supported for floating-point data types
+ ckw::ConvertPolicy convert_policy = ckw::ConvertPolicy::None;
+
+ writer->op_cast(tile_dst, tile_src, convert_policy);
+ ARM_COMPUTE_ERROR_ON_MSG(dst->has_tile() == false, "You must bind a tile before appending another component");
}
Window GpuCkwCast::get_window() const
@@ -168,8 +243,8 @@ Window GpuCkwCast::get_window() const
// Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) unchanged
// This is in line with the collapsing convention used by operators like Conv2d
output_shape.collapse(2U, 1U);
- constexpr unsigned int vector_size_byte_opencl = 16;
- const unsigned int num_elems_processed_per_iteration =
+ constexpr uint32_t vector_size_byte_opencl = 16;
+ const uint32_t num_elems_processed_per_iteration =
adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0));
Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration));
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.cpp
index 1e09c78f47..cfccab186b 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,21 +24,21 @@
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.h"
#include "arm_compute/core/Error.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/Validate.h"
-#include "ckw/TensorTileSampler.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "compute_kernel_writer/include/ckw/KernelWriter.h"
#include <cstdint>
#include <string>
-using namespace ckw;
namespace arm_compute
{
namespace experimental
@@ -49,22 +49,16 @@ GpuCkwDepthwiseConv2d::GpuCkwDepthwiseConv2d(ComponentId id
const ArgumentPack<ITensorInfo> &tensors,
const Attributes &attributes,
const Settings &settings)
- : IGpuCkwComponentDriver{id, tensors},
- _src{},
- _weight{},
- _bias{},
- _dst{},
- _attributes{attributes},
- _settings{settings}
+ : IGpuCkwComponentDriver{id, tensors}, _src{}, _wei{}, _bia{}, _dst{}, _attributes{attributes}, _settings{settings}
{
- _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
- _weight = this->tensors().get_const_tensor(TensorType::ACL_SRC_1);
+ _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
+ _wei = this->tensors().get_const_tensor(TensorType::ACL_SRC_1);
if (this->tensors().get_const_tensor(TensorType::ACL_SRC_2))
{
- _bias = this->tensors().get_const_tensor(TensorType::ACL_SRC_2);
+ _bia = this->tensors().get_const_tensor(TensorType::ACL_SRC_2);
}
_dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
- ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _weight, _bias, _dst);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _wei, _bia, _dst);
}
void GpuCkwDepthwiseConv2d::write_component_code(const ComponentGroup &comp_group,
@@ -72,192 +66,294 @@ void GpuCkwDepthwiseConv2d::write_component_code(const ComponentGroup &comp_g
GpuCkwScopedKernelWriter writer) const
{
// Data Layout is NHWC
- constexpr int32_t width_idx = 1;
- constexpr int32_t height_idx = 2;
+ const uint32_t width_idx = get_data_layout_dimension_index(_wei->data_layout(), DataLayoutDimension::WIDTH);
+ const uint32_t height_idx = get_data_layout_dimension_index(_wei->data_layout(), DataLayoutDimension::HEIGHT);
+ /********************************************************************************
+ * 1 - Define tensors
+ ********************************************************************************/
+ GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src");
+ GpuCkwComponentArgument *wei = vtable.declare_variable(comp_group, writer, _wei, "wei");
+ GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
+ GpuCkwComponentArgument *bia = nullptr;
+
+ const bool using_bias = _bia != nullptr;
+
+ if (using_bias)
+ {
+ bia = vtable.declare_variable(comp_group, writer, _bia, "bia");
+ }
+
+ /********************************************************************************
+ * 2 - Define CKW constants
+ ********************************************************************************/
+ const auto dst_dt = to_ckw(_dst->data_type());
+ const auto kernel_height = static_cast<int32_t>(_wei->dimension(height_idx));
+ const auto kernel_width = static_cast<int32_t>(_wei->dimension(width_idx));
+ const auto src_w = static_cast<int32_t>(_src->dimension(width_idx));
+ const auto src_h = static_cast<int32_t>(_src->dimension(height_idx));
+ const auto dst_h = static_cast<int32_t>(_dst->dimension(height_idx));
+ const auto stride_x = static_cast<int32_t>(_attributes.stride().x());
+ const auto stride_y = static_cast<int32_t>(_attributes.stride().y());
+ const auto pad_x = static_cast<int32_t>(_attributes.pad().left);
+ const auto pad_y = static_cast<int32_t>(_attributes.pad().top);
+ const auto depth_multiplier = static_cast<int32_t>(_attributes.depth_multiplier());
+ const auto dilation_x = static_cast<int32_t>(_attributes.dilation().x());
+ const auto dilation_y = static_cast<int32_t>(_attributes.dilation().y());
+ const auto kernel_size = kernel_width * kernel_height;
+
+ // CKW constants
+ auto const_kernel_w_i32 = writer->declare_constant_tile(ckw::ConstantData({{kernel_width}}, ckw::DataType::Int32));
+ auto const_kernel_size_i32 =
+ writer->declare_constant_tile(ckw::ConstantData({{kernel_size}}, ckw::DataType::Int32));
+ auto const_dst_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_h}}, ckw::DataType::Int32));
+ auto const_src_w_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_w}}, ckw::DataType::Int32));
+ auto const_src_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_h}}, ckw::DataType::Int32));
+ auto const_stride_x_i32 = writer->declare_constant_tile(ckw::ConstantData({{stride_x}}, ckw::DataType::Int32));
+ auto const_stride_y_i32 = writer->declare_constant_tile(ckw::ConstantData({{stride_y}}, ckw::DataType::Int32));
+ auto const_pad_x_i32 = writer->declare_constant_tile(ckw::ConstantData({{pad_x}}, ckw::DataType::Int32));
+ auto const_pad_y_i32 = writer->declare_constant_tile(ckw::ConstantData({{pad_y}}, ckw::DataType::Int32));
+ auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32));
+ auto const_neg_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{-1}}, ckw::DataType::Int32));
+ auto const_depth_multiplier_i32 =
+ writer->declare_constant_tile(ckw::ConstantData({{depth_multiplier}}, ckw::DataType::Int32));
+ auto const_dilation_x_i32 = writer->declare_constant_tile(ckw::ConstantData({{dilation_x}}, ckw::DataType::Int32));
+ auto const_dilation_y_i32 = writer->declare_constant_tile(ckw::ConstantData({{dilation_y}}, ckw::DataType::Int32));
+ auto const_0_fp = writer->declare_constant_tile(ckw::ConstantData({{0.0f}}, dst_dt));
+
+ /********************************************************************************
+ * 3 - Define the compute block parameters and destination tile (if not root component)
+ * Bind the tile to the tensor to share it among different components and
+ * initialize the compute block parameters
+ ********************************************************************************/
+ // The compute block parameters depend on the employed tensor format
const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
- // Tunable parameters
- // Currently only m0 and n0 = 1 are supported.
- const int32_t m0 = root_window.y().step();
- const int32_t n0 = root_window.x().step();
- constexpr int32_t m0_a_val = 1;
- constexpr int32_t n0_a_val = 1;
- constexpr int32_t m0_b_val = 1;
-
- GpuCkwComponentArgument *src =
- vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src");
- GpuCkwComponentArgument *wei =
- vtable.declare_variable(comp_group, writer, _weight, TensorStorageType::ClBufferUint8Ptr, "wei");
- GpuCkwComponentArgument *bia = nullptr;
+ // Destination compute block size
+ const int32_t dst_n0 = root_window.x().step();
+ const int32_t dst_m0 = root_window.y().step();
+
+ // Destination compute block size left-over
+ const int32_t dst_n0_partial = _dst->dimension(0) % dst_n0;
+ const int32_t dst_m0_partial = _dst->dimension(1) % dst_m0;
+
+ // Shift-back for the overlapping-min strategy
+ const int32_t dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0;
+
+ const int32_t src_m0 = kernel_width + (dst_m0 - 1);
+ const int32_t src_n0 = depth_multiplier > 1 ? 1 : dst_n0;
+ const int32_t wei_m0 = kernel_width;
+ const int32_t wei_n0 = dst_n0;
+
+ ckw::TensorSampler sampler_dst;
+ sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+ if (dst_n0_partial == 0)
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin);
+ }
- if (_bias && _bias->has_valid_id())
+ if (dst_m0_partial == 0)
{
- bia = vtable.declare_variable(comp_group, writer, _bias, TensorStorageType::ClBufferUint8Ptr, "bia");
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None);
}
- GpuCkwComponentArgument *dst =
- vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
-
- // Constants
- const auto &const_1 = writer->declare_tile("1", 1);
- const auto &wei_height = writer->declare_tile("WEI_HEIGHT", static_cast<int32_t>(_weight->dimension(height_idx)));
- const auto &wei_width = writer->declare_tile("WEI_WIDTH", static_cast<int32_t>(_weight->dimension(width_idx)));
- const auto &dst_height = writer->declare_tile("DST_HEIGHT", static_cast<int32_t>(_dst->dimension(height_idx)));
- const auto &stride_x = writer->declare_tile("STRIDE_X", static_cast<int32_t>(_attributes.stride().x()));
- const auto &stride_y = writer->declare_tile("STRIDE_Y", static_cast<int32_t>(_attributes.stride().y()));
- const auto &pad_left = writer->declare_tile("PAD_LEFT", static_cast<int32_t>(_attributes.pad().left));
- const auto &pad_top = writer->declare_tile("PAD_TOP", static_cast<int32_t>(_attributes.pad().top));
- const auto &depth_multiplier =
- writer->declare_tile("DEPTH_MULTIPLIER", static_cast<int32_t>(_attributes.depth_multiplier()));
- auto &const_0 = writer->declare_tile("0", 0);
- auto &yo = writer->declare_tile("yo", ckw::DataType::Int32);
-
- auto &gid_0 = writer->declare_tile("gid_0", ckw::DataType::Int32);
- auto &gid_1 = writer->declare_tile("gid_1", ckw::DataType::Int32);
- auto &gid_2 = writer->declare_tile("gid_2", ckw::DataType::Int32);
-
- writer->op_get_global_id(gid_0, 0);
- writer->op_get_global_id(gid_1, 1);
- writer->op_get_global_id(gid_2, 2);
-
- auto &bout = writer->declare_tile("bout", ckw::DataType::Int32);
- writer->op_binary_expression(bout, gid_2, ckw::BinaryOp::Div, dst_height); // gid_2 / h
- writer->op_binary_expression(yo, gid_2, ckw::BinaryOp::Mod, dst_height); // gid_2 % h
-
- const int32_t dst_partial_n0_v = _dst->tensor_shape()[0] % n0;
- const int32_t dst_partial_m0_v = _dst->tensor_shape()[1] % m0;
- auto &g_ind_0 = writer->declare_tile("g_ind_0", ckw::DataType::Int32);
- auto &g_ind_1 = writer->declare_tile("g_ind_1", ckw::DataType::Int32);
- get_coord(writer, g_ind_0, gid_0, n0, dst_partial_n0_v, "dst_x_", const_0);
- get_coord(writer, g_ind_1, gid_1, m0, dst_partial_m0_v, "dst_y_", const_0);
-
- TensorTileSampler src_sampler;
- src_sampler.width(m0_a_val);
- src_sampler.height(n0_a_val);
- src_sampler.format(TensorSamplerFormat::C_W_H);
- src_sampler.address_mode_x(TensorSamplerAddressModeX::None);
- src_sampler.address_mode_y(TensorSamplerAddressModeY::Skip);
- src_sampler.address_mode_z(TensorSamplerAddressModeZ::Skip);
-
- TensorTileSampler wei_sampler;
- wei_sampler.width(m0_b_val);
- wei_sampler.height(n0);
- wei_sampler.format(TensorSamplerFormat::C_W_H);
- wei_sampler.address_mode_x(TensorSamplerAddressModeX::None);
- wei_sampler.address_mode_y(TensorSamplerAddressModeY::None);
- wei_sampler.address_mode_z(TensorSamplerAddressModeZ::None);
-
- TensorTileSampler dst_sampler;
- dst_sampler.width(n0);
- dst_sampler.height(m0);
- dst_sampler.format(TensorSamplerFormat::C_W_H);
- dst_sampler.address_mode_x(TensorSamplerAddressModeX::None);
- dst_sampler.address_mode_y(TensorSamplerAddressModeY::None);
- dst_sampler.address_mode_z(TensorSamplerAddressModeZ::Skip);
- dst_sampler.x(g_ind_0);
- dst_sampler.y(g_ind_1);
- dst_sampler.z(yo);
- dst_sampler.b(bout);
-
- if (!dst->has_tile())
+ else
{
- auto &dst_tile = writer->declare_tile("dst_tile", ckw::TileInfo(to_ckw(_dst->data_type()), m0, n0));
- dst->init_virtual_tensor(dst_tile, dst_sampler);
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::ClampToBorderMaxOnly);
}
- auto &dst_tile = dst->tile();
- writer->op_assign(dst_tile, const_0);
+ sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+ // Declare destination tile
+ auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, dst_m0, dst_n0));
+
+ // Initialize the destination tile
+ writer->op_assign(tile_dst, const_0_fp);
+
+ // Bind tile to the tensor
+ dst->init_virtual_tensor(tile_dst, sampler_dst);
- auto &xi = writer->declare_tile("xi", ckw::DataType::Int32);
- writer->op_binary_expression(xi, g_ind_1, ckw::BinaryOp::Mul, stride_x);
- writer->op_binary_expression(xi, xi, ckw::BinaryOp::Sub, pad_left);
+ /********************************************************************************
+ * 4 - Define the compute block parameters CKW constants
+ ********************************************************************************/
+ // Only now we can declare the N0 and M0 as constants
+ auto const_dst_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32));
+ auto const_dst_m0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_m0}}, ckw::DataType::Int32));
+ auto const_shift_back_dst_n0_i32 =
+ writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32));
+
+ /********************************************************************************
+ * 5 - Define the sampler for the input tensors
+ ********************************************************************************/
+ // SOURCE SAMPLER
+ ckw::TensorSampler sampler_src;
+ sampler_src.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+ sampler_src.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ sampler_src.address_mode_y(ckw::TensorSamplerAddressModeY::SkipLessThanZero);
+ sampler_src.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_src.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+ // WEIGHTS SAMPLER
+ // We cannot have out-of-bounds accesses for the weights
+ ckw::TensorSampler sampler_wei;
+ sampler_wei.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+ sampler_wei.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ sampler_wei.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ sampler_wei.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ if (_settings.export_weights_to_cl_image())
+ {
+ sampler_wei.storage(ckw::TensorStorageType::Texture2dReadOnly);
+ }
+ else
+ {
+ sampler_wei.storage(ckw::TensorStorageType::BufferUint8Ptr);
+ }
- auto &yi = writer->declare_tile("yi", ckw::DataType::Int32);
- writer->op_binary_expression(yi, yo, ckw::BinaryOp::Mul, stride_y);
- writer->op_binary_expression(yi, yi, ckw::BinaryOp::Sub, pad_top);
+ // BIAS SAMPLER
+ ckw::TensorSampler sampler_bia;
+ sampler_bia.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+ sampler_bia.address_mode_x(sampler_dst.address_mode_x());
+ sampler_bia.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ sampler_bia.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_bia.storage(ckw::TensorStorageType::BufferUint8Ptr);
- auto &a_x = writer->declare_tile("a_x", ckw::DataType::Int32);
- writer->op_binary_expression(a_x, g_ind_0, BinaryOp::Div, depth_multiplier);
+ /********************************************************************************
+ * 6 - Extra operations required before writing the main code (Optional)
+ ********************************************************************************/
+ // Not required
- // src_tile
- auto &a = writer->declare_tile("a", ckw::TileInfo(to_ckw(_src->data_type()), m0_a_val, n0_a_val));
- // wei_tile
- auto &b = writer->declare_tile("b", ckw::TileInfo(to_ckw(_weight->data_type()), m0_b_val, n0));
+ /********************************************************************************
+ * 7 - Get the coordinates of the destination tile
+ ********************************************************************************/
+ auto tile_gid_0 = writer->declare_tile("gid_0", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_1 = writer->declare_tile("gid_1", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_2 = writer->declare_tile("gid_2", ckw::TileInfo(ckw::DataType::Int32));
+
+ writer->op_get_global_id(tile_gid_0, 0);
+ writer->op_get_global_id(tile_gid_1, 1);
+ writer->op_get_global_id(tile_gid_2, 2);
+
+ auto tile_cout0 = writer->declare_tile("cout0", ckw::TileInfo(ckw::DataType::Int32)); // OFM
+ auto tile_mout0 = writer->declare_tile("mout0", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH
+ auto tile_mout1 = writer->declare_tile("mout1", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT
+ auto tile_bout0 = writer->declare_tile("bout0", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX
+
+ // Calculate coordinates
+ get_coordinate_from_gws_overlapping_min(writer, tile_cout0, tile_gid_0, const_dst_n0_i32,
+ const_shift_back_dst_n0_i32, const_0_i32);
+ get_coordinate_from_gws(writer, tile_mout0, tile_gid_1, const_dst_m0_i32);
+ writer->op_binary(tile_mout1, ckw::BinaryOp::Mod, tile_gid_2, const_dst_h_i32);
+ writer->op_binary(tile_bout0, ckw::BinaryOp::Div, tile_gid_2, const_dst_h_i32);
+
+ auto tile_src_ci = writer->declare_tile("src_ci", ckw::DataType::Int32);
+ writer->op_binary(tile_src_ci, ckw::BinaryOp::Div, tile_cout0, const_depth_multiplier_i32);
+
+ auto tile_src_xi = writer->declare_tile("src_xi", ckw::DataType::Int32);
+ writer->op_binary(tile_src_xi, ckw::BinaryOp::Mul, tile_mout0, const_stride_x_i32);
+ writer->op_binary(tile_src_xi, ckw::BinaryOp::Sub, tile_src_xi, const_pad_x_i32);
+
+ auto tile_src_yi = writer->declare_tile("src_yi", ckw::DataType::Int32);
+ writer->op_binary(tile_src_yi, ckw::BinaryOp::Mul, tile_mout1, const_stride_y_i32);
+ writer->op_binary(tile_src_yi, ckw::BinaryOp::Sub, tile_src_yi, const_pad_y_i32);
// Loop variables
- auto &yk = writer->declare_tile("yk", ckw::DataType::Int32);
- auto &xk = writer->declare_tile("xk", ckw::DataType::Int32);
-
- // Because 1x1 blocks are being used here, scalar values are being loaded from memory instead of using tiles, since tile vector access currently is not available. Hence the values are loaded in the inner loop.
- // This loop will be reworked.
- writer->op_assign(yk, const_0);
- writer->op_for_loop(yk, BinaryOp::Less, wei_height, yk, AssignmentOp::Increment, const_1,
- [&]()
- {
- // xk = 0
- writer->op_assign(xk, const_0);
- writer->op_for_loop(
- xk, BinaryOp::Less, wei_width, xk, AssignmentOp::Increment, const_1,
- [&]()
- {
- writer->op_assign(b, const_0);
- writer->op_assign(a, const_0);
-
- // src_tile loading
- auto &xi_curr = writer->declare_tile("xi_curr", ckw::DataType::Int32);
- writer->op_binary_expression(xi_curr, xi, BinaryOp::Add, xk);
- auto &a_y = writer->declare_tile("a_y", ckw::DataType::Int32);
- writer->op_binary_expression(a_y, yi, BinaryOp::Add, yk);
- src_sampler.x(a_x);
- src_sampler.y(xi_curr);
- src_sampler.z(a_y);
- src_sampler.b(bout);
- writer->op_load(a, src->tensor(), src_sampler);
-
- // wei_tile loading
- auto &b_y = writer->declare_tile("b_y", ckw::DataType::Int32);
- writer->op_binary_expression(b_y, wei_width, BinaryOp::Mul, yk);
- writer->op_binary_expression(b_y, b_y, BinaryOp::Add, xk);
- wei_sampler.x(g_ind_0);
- wei_sampler.y(b_y);
- wei_sampler.z(const_0);
- wei_sampler.b(const_0);
- writer->op_load(b, wei->tensor(), wei_sampler);
-
- // Do the accumulation
- auto &mul_result = writer->declare_tile("mul_results", a.data_type());
- writer->op_binary_expression(mul_result, a, BinaryOp::Mul, b);
- writer->op_binary_expression(dst_tile, dst_tile, BinaryOp::Add, mul_result);
- });
- });
-
- // Add Bias
- if (_bias && _bias->has_valid_id())
+ auto tile_yk = writer->declare_tile("yk", ckw::DataType::Int32);
+
+ writer->op_assign(tile_yk, const_0_i32);
+
+ // clang-format off
+ writer->op_for_loop(tile_yk, ckw::BinaryOp::Less, const_kernel_size_i32, tile_yk, ckw::AssignmentOp::Increment, const_kernel_w_i32,
+ [&]()
{
- TensorTileSampler bias_sampler;
- bias_sampler.width(n0);
- bias_sampler.height(1);
- bias_sampler.format(TensorSamplerFormat::C_W_H);
- bias_sampler.address_mode_x(TensorSamplerAddressModeX::None);
- bias_sampler.address_mode_y(TensorSamplerAddressModeY::None);
- bias_sampler.address_mode_z(TensorSamplerAddressModeZ::None);
- bias_sampler.x(g_ind_0);
- bias_sampler.y(const_0);
- bias_sampler.z(const_0);
- bias_sampler.b(const_0);
-
- auto &bias_tile = writer->declare_tile("bias_tile", ckw::TileInfo(to_ckw(_bias->data_type()), 1, n0));
- writer->op_load(bias_tile, bia->tensor(), bias_sampler);
- writer->op_binary_expression(dst_tile, dst_tile, BinaryOp::Add, bias_tile);
+ auto tile_src = writer->declare_tile("a", ckw::TileInfo(to_ckw(_src->data_type()), src_m0, src_n0));
+ auto tile_wei = writer->declare_tile("b", ckw::TileInfo(to_ckw(_wei->data_type()), wei_m0, wei_n0));
+
+ writer->op_assign(tile_src, const_0_fp);
+
+ auto tile_x_gte_0 = writer->declare_tile("x_gte_0", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_y_gte_0 = writer->declare_tile("y_gte_0", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_x_lt_w = writer->declare_tile("x_lt_w", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_y_lt_h = writer->declare_tile("y_lt_h", ckw::TileInfo(ckw::DataType::Int32));
+
+ // Check if yi + yk * DILATION_Y is out-of-bound
+ writer->op_binary(tile_y_gte_0, ckw::BinaryOp::GreaterEqual, tile_src_yi, const_0_i32);
+ writer->op_binary(tile_y_lt_h, ckw::BinaryOp::Less, tile_src_yi, const_src_h_i32);
+
+ auto tile_src_mi = writer->declare_tile("src_mi", ckw::TileInfo(ckw::DataType::Int32));
+
+ // Load src
+ for(int32_t xk = 0; xk < src_m0; ++xk)
+ {
+ auto const_xk_i32 = writer->declare_constant_tile(ckw::ConstantData({{xk}}, ckw::DataType::Int32));
+
+ // xi + xk * DILATION_X
+ writer->op_binary(tile_src_mi, ckw::BinaryOp::Mul, const_xk_i32, const_dilation_x_i32);
+ writer->op_binary(tile_src_mi, ckw::BinaryOp::Add, tile_src_mi, tile_src_xi);
+
+ // Check if xi + xk * DILATION_X is out-of-bound
+ writer->op_binary(tile_x_gte_0, ckw::BinaryOp::GreaterEqual, tile_src_mi, const_0_i32);
+ writer->op_binary(tile_x_lt_w, ckw::BinaryOp::Less, tile_src_mi, const_src_w_i32);
+
+ // Set mi to -1 if we have out-of-bound memory accesses
+ writer->op_ternary(tile_src_mi, ckw::TernaryOp::Select, const_neg_1_i32, tile_src_mi, tile_x_gte_0);
+ writer->op_ternary(tile_src_mi, ckw::TernaryOp::Select, const_neg_1_i32, tile_src_mi, tile_x_lt_w);
+ writer->op_ternary(tile_src_mi, ckw::TernaryOp::Select, const_neg_1_i32, tile_src_mi, tile_y_gte_0);
+ writer->op_ternary(tile_src_mi, ckw::TernaryOp::Select, const_neg_1_i32, tile_src_mi, tile_y_lt_h);
+
+ writer->op_load(tile_src.row(xk), src->tensor(), sampler_src, tile_src_ci, tile_src_mi, tile_src_yi, tile_bout0);
+ }
+
+ // Load wei
+ writer->op_load(tile_wei, wei->tensor(), sampler_wei, tile_cout0, tile_yk, const_0_i32, const_0_i32);
+
+ // Attention: MAC (Multiply-and-Accumulate) ternary operator is currently unsupported in CKW
+    // Therefore, this part should be replaced with the MAC ternary operator when available
+ auto tile_tmp = writer->declare_tile("tmp", ckw::TileInfo(to_ckw(_src->data_type()), 1, dst_n0));
+ for(int32_t m0 = 0; m0 < dst_m0; ++m0)
+ {
+ for(int32_t xk = 0; xk < kernel_width; ++xk)
+ {
+ auto tile_a = tile_src.row(m0 + xk);
+ auto tile_b = tile_wei.row(xk);
+ auto tile_c = tile_dst.row(m0);
+
+ writer->op_binary(tile_tmp, ckw::BinaryOp::Mul, tile_a, tile_b);
+ writer->op_binary(tile_c, ckw::BinaryOp::Add, tile_c, tile_tmp);
+ }
+ }
+ writer->op_binary(tile_src_yi, ckw::BinaryOp::Add, tile_src_yi, const_dilation_y_i32);
+ });
+ // clang-format on
+
+ // Bias addition
+    // NOTE: This operation will be removed from this kernel once the interface is standardized. The intended way of
+    // performing bias addition is to fuse this convolution kernel with a following elementwise addition kernel.
+ if (using_bias)
+ {
+ if (!bia->has_tile())
+ {
+ auto tile_bia = writer->declare_tile("bia", ckw::TileInfo(to_ckw(_src->data_type()), 1, dst_n0));
+ writer->op_load(tile_bia, bia->tensor(), sampler_bia, tile_cout0, const_0_i32, const_0_i32, const_0_i32);
+ bia->init_virtual_tensor(tile_bia, sampler_bia);
+ }
+ auto &tile_bia = bia->tile();
+
+ writer->op_binary(tile_dst, ckw::BinaryOp::Add, tile_dst, tile_bia);
}
+
+ ARM_COMPUTE_ERROR_ON_MSG(dst->has_tile() == false, "You must bind a tile before appending another component");
}
Window GpuCkwDepthwiseConv2d::get_window() const
{
ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
TensorShape output_shape = _dst->tensor_shape();
- // Currently only m0 and n0 = 1 are supported.
- Window win = calculate_max_window(output_shape, Steps(1U, 1U));
+
+ Window win = calculate_max_window(output_shape, Steps(_settings.n0(), _settings.m0()));
return win.collapse(win, Window::DimZ);
}
} // namespace dynamic_fusion
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.h
index f9bcaabaec..a15d3ee710 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,6 +30,7 @@
#include "src/core/common/Macros.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h"
#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h"
+
namespace arm_compute
{
namespace experimental
@@ -67,8 +68,8 @@ public:
private:
const ITensorInfo *_src;
- const ITensorInfo *_weight;
- const ITensorInfo *_bias;
+ const ITensorInfo *_wei;
+ const ITensorInfo *_bia;
const ITensorInfo *_dst;
Attributes _attributes;
Settings _settings;
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp
index 7833da2334..eb4f644eb6 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,19 +26,18 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
-#include "arm_compute/core/utils/StringUtils.h"
#include "arm_compute/core/Validate.h"
-#include "ckw/TensorTileSampler.h"
-#include "ckw/TileInfo.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+
+#include "compute_kernel_writer/include/ckw/KernelWriter.h"
+#include <cstdint>
+#include <string>
+#include <vector>
namespace arm_compute
{
@@ -47,7 +46,7 @@ namespace experimental
namespace dynamic_fusion
{
-using TileContainer = std::vector<std::vector<std::string>>;
+using TileContainer = std::vector<std::vector<int32_t>>;
GpuCkwDirectConv2d::GpuCkwDirectConv2d(ComponentId id,
const ArgumentPack<ITensorInfo> &tensors,
@@ -70,20 +69,126 @@ void GpuCkwDirectConv2d::write_component_code(const ComponentGroup &comp_grou
ARM_COMPUTE_ERROR_ON_MSG(desc.export_input_to_cl_image || desc.export_output_to_cl_image,
"Only the weights tensor can be exported to cl_image");
- const unsigned int channel_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::CHANNEL);
- const unsigned int width_idx = get_data_layout_dimension_index(_wei->data_layout(), DataLayoutDimension::WIDTH);
- const unsigned int height_idx = get_data_layout_dimension_index(_wei->data_layout(), DataLayoutDimension::HEIGHT);
+ const uint32_t channel_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::CHANNEL);
+ const uint32_t width_idx = get_data_layout_dimension_index(_wei->data_layout(), DataLayoutDimension::WIDTH);
+ const uint32_t height_idx = get_data_layout_dimension_index(_wei->data_layout(), DataLayoutDimension::HEIGHT);
+
+ /********************************************************************************
+ * 1 - Define tensors
+ ********************************************************************************/
+ GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src");
+ GpuCkwComponentArgument *wei = vtable.declare_variable(comp_group, writer, _wei, "wei");
+ GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
+ GpuCkwComponentArgument *bia = nullptr;
+
+ const bool using_bias = _bia != nullptr;
+ if (using_bias)
+ {
+ bia = vtable.declare_variable(comp_group, writer, _bia, "bia");
+ }
+
+ /********************************************************************************
+ * 2 - Define CKW constants
+ ********************************************************************************/
+ const auto dst_dt = to_ckw(_dst->data_type());
+ const auto kernel_height = static_cast<int32_t>(_wei->dimension(height_idx));
+ const auto kernel_width = static_cast<int32_t>(_wei->dimension(width_idx));
+ const auto src_c = static_cast<int32_t>(_src->dimension(channel_idx));
+ const auto src_w = static_cast<int32_t>(_src->dimension(width_idx));
+ const auto src_h = static_cast<int32_t>(_src->dimension(height_idx));
+ const auto dst_w = static_cast<int32_t>(_dst->dimension(width_idx));
+ const auto stride_x = static_cast<int32_t>(_attributes.stride().x());
+ const auto stride_y = static_cast<int32_t>(_attributes.stride().y());
+ const auto pad_x = static_cast<int32_t>(_attributes.pad().left);
+ const auto pad_y = static_cast<int32_t>(_attributes.pad().top);
+ const auto kernel_size = kernel_width * kernel_height;
+ const auto k0 =
+ static_cast<int32_t>(adjust_vec_size(_settings.direct_conv_descriptor().k0, _src->dimension(channel_idx)));
+
+ // CKW constants
+ auto const_kernel_w_i32 = writer->declare_constant_tile(ckw::ConstantData({{kernel_width}}, ckw::DataType::Int32));
+ auto const_kernel_size_i32 =
+ writer->declare_constant_tile(ckw::ConstantData({{kernel_size}}, ckw::DataType::Int32));
+ auto const_src_c_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_c}}, ckw::DataType::Int32));
+ auto const_src_w_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_w}}, ckw::DataType::Int32));
+ auto const_src_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_h}}, ckw::DataType::Int32));
+ auto const_dst_w_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_w}}, ckw::DataType::Int32));
+ auto const_stride_x_i32 = writer->declare_constant_tile(ckw::ConstantData({{stride_x}}, ckw::DataType::Int32));
+ auto const_stride_y_i32 = writer->declare_constant_tile(ckw::ConstantData({{stride_y}}, ckw::DataType::Int32));
+ auto const_pad_x_i32 = writer->declare_constant_tile(ckw::ConstantData({{pad_x}}, ckw::DataType::Int32));
+ auto const_pad_y_i32 = writer->declare_constant_tile(ckw::ConstantData({{pad_y}}, ckw::DataType::Int32));
+ auto const_k0_i32 = writer->declare_constant_tile(ckw::ConstantData({{k0}}, ckw::DataType::Int32));
+ auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32));
+ auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32));
+ auto const_neg_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{-1}}, ckw::DataType::Int32));
+ auto const_0_fp = writer->declare_constant_tile(ckw::ConstantData({{0.0f}}, dst_dt));
+ auto const_src_c_i32_minus_k0_i32 =
+ writer->declare_constant_tile(ckw::ConstantData({{src_c - k0}}, ckw::DataType::Int32));
+
+ /********************************************************************************
+ * 3 - Define the compute block parameters and destination tile (if not root component)
+ * Bind the tile to the tensor to share it among different components and
+ * initialize the compute block parameters
+ ********************************************************************************/
+ // The compute block parameters depend on the employed tensor format
const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
- // Tunable parameters
- const int32_t m0 = root_window.y().step();
- const int32_t n0 = root_window.x().step();
- const int32_t k0 = adjust_vec_size(_settings.direct_conv_descriptor().k0, _src->dimension(channel_idx));
- const int32_t partial_n0 = _dst->dimension(0) % n0;
+ // Destination compute block size
+ const int32_t dst_n0 = root_window.x().step();
+ const int32_t dst_m0 = root_window.y().step();
+
+ // Destination compute block size left-over
+ const int32_t dst_n0_partial = _dst->dimension(0) % dst_n0;
+ const int32_t dst_m0_partial = (_dst->dimension(1) * _dst->dimension(2)) % dst_m0;
+
+ // Shift-back for the overlapping-min strategy
+ const int32_t dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0;
+
+ ckw::TensorSampler sampler_dst;
+ sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1);
+ if (dst_n0_partial == 0)
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin);
+ }
+
+ if (dst_m0_partial == 0)
+ {
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::ClampToBorderMaxOnly);
+ }
+
+ sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr);
- const int32_t K = _src->dimension(channel_idx);
+ // Declare destination tile
+ auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, dst_m0, dst_n0));
+ // Initialize destination tile
+ writer->op_assign(tile_dst, const_0_fp);
+
+ // Bind tile to the tensor
+ dst->init_virtual_tensor(tile_dst, sampler_dst);
+
+ /********************************************************************************
+ * 4 - Define the compute block parameters CKW constants
+ ********************************************************************************/
+ // Only now we can declare the N0 and M0 as constant
+ auto const_dst_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32));
+ auto const_dst_m0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_m0}}, ckw::DataType::Int32));
+ auto const_shift_back_dst_n0_i32 =
+ writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32));
+
+ /********************************************************************************
+ * 5 - Define the samplers for the input tensors
+ ********************************************************************************/
// Exporting the weights tensor to an OpenCL image object is currently only supported when:
// a) k0 is equal to 4
// The current implementation expects to read a vector of 4 float values into the OpenCL image object.
@@ -92,143 +197,123 @@ void GpuCkwDirectConv2d::write_component_code(const ComponentGroup &comp_grou
// information about the TensorStorageType rather than the TensorTileSampler. As a result, TensorStorageType cannot
// be reassigned, and we cannot use a texture object for the weights tensor in cases where we expect to have an
// extra loop to compute the left-over elements.
- const bool use_cl_image_for_weights = desc.export_weights_to_cl_image && (k0 == 4) && (K % 4 == 0);
-
- GpuCkwComponentArgument *src =
- vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src");
- GpuCkwComponentArgument *wei = vtable.declare_variable(
- comp_group, writer, _wei,
- use_cl_image_for_weights ? TensorStorageType::ClImage2dReadOnly : TensorStorageType::ClBufferUint8Ptr, "wei");
- GpuCkwComponentArgument *dst =
- vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
- GpuCkwComponentArgument *bia = nullptr;
+ const bool use_cl_image_for_weights = desc.export_weights_to_cl_image && (k0 == 4) && (src_c % 4 == 0);
+
+ // SOURCE SAMPLER
+ // - We cannot have out-of-bounds reads in the X dimension (mapped to the IFMs) as we have an extra loop to
+ // compute left-over elements
+ // - We cannot have out-of-bounds reads when the kernel height is equal to 1. In all other cases, we need to ensure the
+ // indirection buffer mi does not contain negative values representing out-of-bounds reads.
+ auto address_mode_y_src =
+ kernel_height == 1 ? ckw::TensorSamplerAddressModeY::None : ckw::TensorSamplerAddressModeY::SkipLessThanZero;
+ ckw::TensorSampler sampler_src;
+ sampler_src.format(ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1); // 3rd dimension collapsed with 2nd dimension
+ sampler_src.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ sampler_src.address_mode_y(address_mode_y_src);
+ sampler_src.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_src.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+ // WEIGHTS SAMPLER
+ // We cannot have out-of-bounds accesses for the weights
+ ckw::TensorSampler sampler_wei;
+ sampler_wei.format(ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1); // 3rd dimension collapsed with 2nd dimension
+ sampler_wei.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ sampler_wei.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ sampler_wei.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ if (use_cl_image_for_weights)
+ {
+ sampler_wei.storage(ckw::TensorStorageType::Texture2dReadOnly);
+ }
+ else
+ {
+ sampler_wei.storage(ckw::TensorStorageType::BufferUint8Ptr);
+ }
- const bool using_bias = _bia != nullptr;
+ // BIAS SAMPLER
+ ckw::TensorSampler sampler_bia;
if (using_bias)
{
- bia = vtable.declare_variable(comp_group, writer, _bia, TensorStorageType::ClBufferUint8Ptr, "bia");
+ sampler_bia.format(ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1);
+ sampler_bia.address_mode_x(sampler_dst.address_mode_x());
+ sampler_bia.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ sampler_bia.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_bia.storage(ckw::TensorStorageType::BufferUint8Ptr);
}
- // Constants
- const auto kernel_height = static_cast<int32_t>(_wei->dimension(height_idx));
- const auto kernel_width = static_cast<int32_t>(_wei->dimension(width_idx));
- const auto src_channels = static_cast<int32_t>(_src->dimension(channel_idx));
- auto &tile_kernel_w = writer->declare_tile("kernel_w", kernel_width);
- auto &tile_kernel_size = writer->declare_tile("kernel_size", kernel_width * kernel_height);
- auto &tile_src_c = writer->declare_tile("src_c", static_cast<int32_t>(_src->dimension(channel_idx)));
- auto &tile_dst_w = writer->declare_tile("dst_w", static_cast<int32_t>(_dst->dimension(width_idx)));
- auto &tile_stride_x = writer->declare_tile("stride_x", static_cast<int32_t>(_attributes.stride().x()));
- auto &tile_stride_y = writer->declare_tile("stride_y", static_cast<int32_t>(_attributes.stride().y()));
- auto &tile_pad_x = writer->declare_tile("pad_x", static_cast<int32_t>(_attributes.pad().left));
- auto &tile_pad_y = writer->declare_tile("pad_y", static_cast<int32_t>(_attributes.pad().top));
- auto &tile_k0 = writer->declare_tile("k0", k0);
- auto &tile_0 = writer->declare_tile("0", 0);
- auto &tile_1 = writer->declare_tile("1", 1);
-
- auto &tile_gid_0 = writer->declare_tile("gid_0", ckw::DataType::Int32);
- auto &tile_gid_1 = writer->declare_tile("gid_1", ckw::DataType::Int32);
- auto &tile_gid_2 = writer->declare_tile("gid_2", ckw::DataType::Int32);
+ /********************************************************************************
+ * 6 - Extra operations required before writing the main code (optional)
+ ********************************************************************************/
+
+ // Not required
+
+ /********************************************************************************
+ * 7 - Get the coordinates of the destination tile
+ ********************************************************************************/
+ auto tile_gid_0 = writer->declare_tile("gid_0", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_1 = writer->declare_tile("gid_1", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_2 = writer->declare_tile("gid_2", ckw::TileInfo(ckw::DataType::Int32));
writer->op_get_global_id(tile_gid_0, 0);
writer->op_get_global_id(tile_gid_1, 1);
writer->op_get_global_id(tile_gid_2, 2);
- auto &tile_cout = writer->declare_tile("cout", ckw::DataType::Int32); // OFM
- auto &tile_mout = writer->declare_tile("mout", ckw::DataType::Int32); // WIDTH x HEIGHT
- auto &tile_bout = writer->declare_tile("bout", ckw::DataType::Int32); // BATCH SIZE IDX
-
- // Get the boundary aware coordinates at each global dimension index
- get_coord(writer, tile_cout, tile_gid_0, n0, partial_n0, tile_cout.name() + "_dim0_", tile_0);
- get_coord(writer, tile_mout, tile_gid_1, m0, 0, tile_mout.name() + "_dim1_", tile_0);
- get_coord(writer, tile_bout, tile_gid_2, 1, 0, tile_bout.name() + "_dim2_", tile_0);
-
- TensorTileSampler src_sampler;
- src_sampler.width(k0);
- src_sampler.height(m0);
- src_sampler.format(TensorSamplerFormat::C_WH_1);
- // We cannot have out-of-bounds reads in the X dimension (mapped to the IFMs) as we have an extra loop to
- // compute left-over elements
- src_sampler.address_mode_x(TensorSamplerAddressModeX::None);
- // We cannot have out-of-bounds reads when the kernel height is equal to 1. Otherwise, we need to ensure the
- // indirection buffer mi does not contain negative values representing out-of-bounds reads.
- src_sampler.address_mode_y(kernel_height == 1 ? TensorSamplerAddressModeY::None
- : TensorSamplerAddressModeY::SkipMinEdgeOnly);
- src_sampler.address_mode_z(TensorSamplerAddressModeZ::None);
-
- TensorTileSampler wei_sampler;
- wei_sampler.width(k0);
- wei_sampler.height(n0);
- wei_sampler.format(TensorSamplerFormat::C_WH_1);
- // We cannot have out-of-bounds accesses for the weights
- wei_sampler.address_mode_x(TensorSamplerAddressModeX::None);
- wei_sampler.address_mode_y(TensorSamplerAddressModeY::None);
- wei_sampler.address_mode_z(TensorSamplerAddressModeZ::None);
-
- TensorTileSampler dst_sampler;
- dst_sampler.width(n0);
- dst_sampler.height(m0);
- dst_sampler.format(TensorSamplerFormat::C_WH_1);
- dst_sampler.address_mode_x(TensorSamplerAddressModeX::OverlappingMin);
- dst_sampler.address_mode_y(TensorSamplerAddressModeY::ClampToMaxEdgeOnly);
- dst_sampler.address_mode_z(TensorSamplerAddressModeZ::None);
- dst_sampler.x(tile_cout);
- dst_sampler.y(tile_mout);
- dst_sampler.z(tile_0);
- dst_sampler.b(tile_bout);
-
- if (!dst->has_tile())
- {
- auto &tile = writer->declare_tile("dst", TileInfo(to_ckw(_dst->data_type()), m0, n0));
- dst->init_virtual_tensor(tile, dst_sampler);
- }
- auto &tile_dst = dst->tile();
+ auto tile_cout = writer->declare_tile("cout", ckw::TileInfo(ckw::DataType::Int32)); // OFM
+ auto tile_mout = writer->declare_tile("mout", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH x HEIGHT
+ auto tile_bout = writer->declare_tile("bout", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX
- writer->op_assign(tile_dst, tile_0);
+ // Calculate coordinates
+ get_coordinate_from_gws_overlapping_min(writer, tile_cout, tile_gid_0, const_dst_n0_i32,
+ const_shift_back_dst_n0_i32, const_0_i32);
+ get_coordinate_from_gws(writer, tile_mout, tile_gid_1, const_dst_m0_i32);
+ get_coordinate_from_gws(writer, tile_bout, tile_gid_2, const_pos_1_i32);
- // We create a 2d container of size (M0, 1) to store the indices for iteration
+ /********************************************************************************
+ * 8 - Write the rest of the code
+ ********************************************************************************/
+ // We create a 2d container of size (dst_m0, 1) to store the indices for iteration
TileContainer it;
- for (int m = 0; m < m0; ++m)
+ for (int32_t m = 0; m < dst_m0; ++m)
{
- std::vector<std::string> idx{std::to_string(m)};
+ std::vector<int32_t> idx{m};
it.push_back({idx});
}
- const auto &tile_it = writer->declare_tile("it", it, ckw::DataType::Int32);
- auto &tile_xi = writer->declare_tile("xi", TileInfo(ckw::DataType::Int32, m0, 1));
- auto &tile_yi = writer->declare_tile("yi", TileInfo(ckw::DataType::Int32, m0, 1));
+ const auto &const_idxs = writer->declare_constant_tile(ckw::ConstantData(it, ckw::DataType::Int32));
+
+ auto tile_xi = writer->declare_tile("xi", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1));
+ auto tile_yi = writer->declare_tile("yi", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1));
// Convert the linear index to coordinate
// xi = ((mout + i) % dst_w) * stride_x - pad_x
// yi = ((mout + i) / dst_w) * stride_y - pad_y
- writer->op_binary_expression(tile_xi, tile_mout, BinaryOp::Add, tile_it);
- writer->op_binary_expression(tile_yi, tile_mout, BinaryOp::Add, tile_it);
- writer->op_binary_expression(tile_xi, tile_xi, BinaryOp::Mod, tile_dst_w);
- writer->op_binary_expression(tile_yi, tile_yi, BinaryOp::Div, tile_dst_w);
- writer->op_binary_expression(tile_xi, tile_xi, BinaryOp::Mul, tile_stride_x);
- writer->op_binary_expression(tile_yi, tile_yi, BinaryOp::Mul, tile_stride_y);
- writer->op_binary_expression(tile_xi, tile_xi, BinaryOp::Sub, tile_pad_x);
- writer->op_binary_expression(tile_yi, tile_yi, BinaryOp::Sub, tile_pad_y);
+ writer->op_binary(tile_xi, ckw::BinaryOp::Add, tile_mout, const_idxs);
+ writer->op_binary(tile_yi, ckw::BinaryOp::Add, tile_mout, const_idxs);
+ writer->op_binary(tile_xi, ckw::BinaryOp::Mod, tile_xi, const_dst_w_i32);
+ writer->op_binary(tile_yi, ckw::BinaryOp::Div, tile_yi, const_dst_w_i32);
+ writer->op_binary(tile_xi, ckw::BinaryOp::Mul, tile_xi, const_stride_x_i32);
+ writer->op_binary(tile_yi, ckw::BinaryOp::Mul, tile_yi, const_stride_y_i32);
+ writer->op_binary(tile_xi, ckw::BinaryOp::Sub, tile_xi, const_pad_x_i32);
+ writer->op_binary(tile_yi, ckw::BinaryOp::Sub, tile_yi, const_pad_y_i32);
- auto &tile_y_b = writer->declare_tile("y_b", ckw::DataType::Int32);
- writer->op_binary_expression(tile_y_b, tile_cout, BinaryOp::Mul, tile_kernel_size);
+ auto tile_y_b = writer->declare_tile("y_b", ckw::TileInfo(ckw::DataType::Int32));
+ writer->op_binary(tile_y_b, ckw::BinaryOp::Mul, tile_cout, const_kernel_size_i32);
- auto &tile_i = writer->declare_tile("i", ckw::DataType::Int32);
- writer->op_assign(tile_i, tile_0);
+ auto tile_i = writer->declare_tile("i", ckw::TileInfo(ckw::DataType::Int32));
+ writer->op_assign(tile_i, const_0_i32);
// clang-format off
- writer->op_for_loop(tile_i, BinaryOp::Less, tile_kernel_size, tile_i, AssignmentOp::Increment, tile_1, [&]()
+ writer->op_for_loop(tile_i, ckw::BinaryOp::Less, const_kernel_size_i32, tile_i, ckw::AssignmentOp::Increment, const_pos_1_i32, [&]()
{
- auto &tile_x_k = writer->declare_tile("x_k", ckw::DataType::Int32);
- auto &tile_y_k = writer->declare_tile("y_k", ckw::DataType::Int32);
+ auto tile_x_k = writer->declare_tile("x_k", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_y_k = writer->declare_tile("y_k", ckw::TileInfo(ckw::DataType::Int32));
- writer->op_binary_expression(tile_x_k, tile_i, BinaryOp::Mod, tile_kernel_w);
- writer->op_binary_expression(tile_y_k, tile_i, BinaryOp::Div, tile_kernel_w);
+ writer->op_binary(tile_x_k, ckw::BinaryOp::Mod, tile_i, const_kernel_w_i32);
+ writer->op_binary(tile_y_k, ckw::BinaryOp::Div, tile_i, const_kernel_w_i32);
- auto &tile_ck = writer->declare_tile("ck", ckw::DataType::Int32);
- writer->op_assign(tile_ck, tile_0);
+ auto tile_ck = writer->declare_tile("ck", ckw::TileInfo(ckw::DataType::Int32));
+ writer->op_assign(tile_ck, const_0_i32);
- auto &tile_mi = writer->declare_tile("mi", TileInfo(ckw::DataType::Int32, m0, 1));
// Construct an indirection buffer containing the precalculated addresses of elements in the source tensor
// x_s = xi + x_k
// y_s = yi + y_k
@@ -237,68 +322,78 @@ void GpuCkwDirectConv2d::write_component_code(const ComponentGroup &comp_grou
// mi = select(-1, mi, x_s < width);
// mi = select(-1, mi, y_s >= 0);
// mi = select(-1, mi, y_s < height);
- writer->util_get_indirect_buffer(tile_mi, src->tensor(), src_sampler, tile_xi, tile_yi, tile_x_k, tile_y_k);
-
- src_sampler.x(tile_ck);
- src_sampler.y(tile_mi);
- src_sampler.z(tile_0);
- src_sampler.b(tile_bout);
-
- wei_sampler.x(tile_ck);
- wei_sampler.y(tile_y_b);
- wei_sampler.z(tile_0);
- wei_sampler.b(tile_0);
-
- auto &tile_src_c_minus_k0 = writer->declare_tile("src_c_minus_k0", src_channels - k0);
-
- writer->op_for_loop(tile_ck, BinaryOp::LessEqual, tile_src_c_minus_k0, tile_ck, AssignmentOp::Increment, tile_k0, [&]()
+ auto tile_xs = writer->declare_tile("xs", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1));
+ auto tile_ys = writer->declare_tile("ys", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1));
+ auto tile_mi = writer->declare_tile("mi", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1));
+
+ auto tile_xs_gte_0 = writer->declare_tile("xs_gte_0", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1));
+ auto tile_ys_gte_0 = writer->declare_tile("ys_gte_0", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1));
+ auto tile_xs_lt_w = writer->declare_tile("xs_lt_w", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1));
+ auto tile_ys_lt_h = writer->declare_tile("ys_lt_h", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1));
+
+ writer->op_binary(tile_xs, ckw::BinaryOp::Add, tile_xi, tile_x_k);
+ writer->op_binary(tile_ys, ckw::BinaryOp::Add, tile_yi, tile_y_k);
+ writer->op_binary(tile_mi, ckw::BinaryOp::Mul, tile_ys, const_src_w_i32);
+ writer->op_binary(tile_mi, ckw::BinaryOp::Add, tile_mi, tile_xs);
+ writer->op_binary(tile_xs_gte_0, ckw::BinaryOp::GreaterEqual, tile_xs, const_0_i32);
+ writer->op_binary(tile_ys_gte_0, ckw::BinaryOp::GreaterEqual, tile_ys, const_0_i32);
+ writer->op_binary(tile_xs_lt_w, ckw::BinaryOp::Less, tile_xs, const_src_w_i32);
+ writer->op_binary(tile_ys_lt_h, ckw::BinaryOp::Less, tile_ys, const_src_h_i32);
+ writer->op_ternary(tile_mi, ckw::TernaryOp::Select, const_neg_1_i32, tile_mi, tile_xs_gte_0);
+ writer->op_ternary(tile_mi, ckw::TernaryOp::Select, const_neg_1_i32, tile_mi, tile_ys_gte_0);
+ writer->op_ternary(tile_mi, ckw::TernaryOp::Select, const_neg_1_i32, tile_mi, tile_xs_lt_w);
+ writer->op_ternary(tile_mi, ckw::TernaryOp::Select, const_neg_1_i32, tile_mi, tile_ys_lt_h);
+
+ writer->op_for_loop(tile_ck, ckw::BinaryOp::LessEqual, const_src_c_i32_minus_k0_i32, tile_ck, ckw::AssignmentOp::Increment, const_k0_i32, [&]()
{
- auto &tile_lhs = writer->declare_tile("lhs", TileInfo(to_ckw(_src->data_type()), m0, k0));
- auto &tile_rhs = writer->declare_tile("rhs", TileInfo(to_ckw(_wei->data_type()), n0, k0));
- writer->op_assign(tile_lhs, tile_0);
- writer->op_assign(tile_rhs, tile_0);
+ auto tile_lhs = writer->declare_tile("lhs", ckw::TileInfo(to_ckw(_src->data_type()), dst_m0, k0));
+ auto tile_rhs = writer->declare_tile("rhs", ckw::TileInfo(to_ckw(_wei->data_type()), dst_n0, k0));
+ writer->op_assign(tile_lhs, const_0_fp);
+ writer->op_assign(tile_rhs, const_0_fp);
- writer->op_load_indirect(tile_lhs, src->tensor(), src_sampler);
- writer->op_load(tile_rhs, wei->tensor(), wei_sampler, tile_kernel_size);
+ writer->op_load_indirect(tile_lhs, src->tensor(), sampler_src, tile_ck, tile_mi, const_0_i32, tile_bout);
+ writer->op_load_dilated(tile_rhs, wei->tensor(), sampler_wei, tile_ck, tile_y_b, const_0_i32, const_0_i32, const_pos_1_i32, const_kernel_size_i32);
- writer->op_binary_expression(tile_dst, tile_lhs, BinaryOp::MatMul_Nt_T, tile_rhs);
+ writer->op_binary(tile_dst, ckw::BinaryOp::MatMul_Nt_T, tile_lhs, tile_rhs);
});
// Left-over accumulations for when K is not a multiple of k0
- if(!(K % k0 == 0))
+ if(((src_c % k0) != 0))
{
- writer->op_for_loop(tile_ck, BinaryOp::Less, tile_src_c, tile_ck, AssignmentOp::Increment, tile_1, [&]()
+ writer->op_for_loop(tile_ck, ckw::BinaryOp::Less, const_src_c_i32, tile_ck, ckw::AssignmentOp::Increment, const_pos_1_i32, [&]()
{
- auto &tile_lhs = writer->declare_tile("lhs_leftover", TileInfo(to_ckw(_src->data_type()), m0, 1));
- auto &tile_rhs = writer->declare_tile("rhs_leftover", TileInfo(to_ckw(_wei->data_type()), n0, 1));
- writer->op_assign(tile_lhs, tile_0);
- writer->op_assign(tile_rhs, tile_0);
+ auto tile_lhs = writer->declare_tile("lhs_leftover", ckw::TileInfo(to_ckw(_src->data_type()), dst_m0, 1));
+ auto tile_rhs = writer->declare_tile("rhs_leftover", ckw::TileInfo(to_ckw(_wei->data_type()), dst_n0, 1));
+ writer->op_assign(tile_lhs, const_0_fp);
+ writer->op_assign(tile_rhs, const_0_fp);
- writer->op_load_indirect(tile_lhs, src->tensor(), src_sampler);
- writer->op_load(tile_rhs, wei->tensor(), wei_sampler, tile_kernel_size);
+ writer->op_load_indirect(tile_lhs, src->tensor(), sampler_src, tile_ck, tile_mi, const_0_i32, tile_bout);
+ writer->op_load_dilated(tile_rhs, wei->tensor(), sampler_wei, tile_ck, tile_y_b, const_0_i32, const_0_i32, const_pos_1_i32, const_kernel_size_i32);
- writer->op_binary_expression(tile_dst, tile_lhs, BinaryOp::MatMul_Nt_T, tile_rhs);
+ writer->op_binary(tile_dst, ckw::BinaryOp::MatMul_Nt_T, tile_lhs, tile_rhs);
});
}
- writer->op_binary_expression(tile_y_b, tile_y_b, BinaryOp::Add, tile_1);
+ writer->op_binary(tile_y_b, ckw::BinaryOp::Add, tile_y_b, const_pos_1_i32);
});
// clang-format on
- // Bias addition
- // NOTE: This operation will be removed from this kernel as the interface is standardized. The intended way of
+ // NOTE: The bias addition will be removed from this kernel as the interface is standardized. The intended way of
// performing bias addition is to fuse this convolution kernel with a following elementwise addition kernel.
if (using_bias)
{
if (!bia->has_tile())
{
- // Reuse the destination sampler for the bias
- writer->op_load_once(bia, dst_sampler);
+ auto tile_bia = writer->declare_tile("bia", ckw::TileInfo(to_ckw(_src->data_type()), 1, dst_n0));
+ writer->op_load(tile_bia, bia->tensor(), sampler_bia, tile_cout, const_0_i32, const_0_i32, const_0_i32);
+ bia->init_virtual_tensor(tile_bia, sampler_bia);
}
auto &tile_bia = bia->tile();
- writer->op_binary_expression(tile_dst, tile_dst, BinaryOp::Add, tile_bia);
+ writer->op_binary(tile_dst, ckw::BinaryOp::Add, tile_dst, tile_bia);
}
+
+ ARM_COMPUTE_ERROR_ON_MSG(dst->has_tile() == false, "You must bind a tile before appending another component");
}
Window GpuCkwDirectConv2d::get_window() const
@@ -308,13 +403,13 @@ Window GpuCkwDirectConv2d::get_window() const
const auto dst_shape = _dst->tensor_shape();
const auto desc = _settings.direct_conv_descriptor();
- const unsigned int n0 = adjust_vec_size(desc.n0, dst_shape[0]);
- const unsigned int m0 = adjust_vec_size(desc.m0, dst_shape[1] * dst_shape[2]);
+ const uint32_t dst_n0 = adjust_vec_size(desc.n0, dst_shape[0]);
+ const uint32_t dst_m0 = adjust_vec_size(desc.m0, dst_shape[1] * dst_shape[2]);
- Window win = calculate_max_window(dst_shape, Steps(n0, m0));
+ Window win = calculate_max_window(dst_shape, Steps(dst_n0, dst_m0));
- const size_t dim_y_collapsed = ceil_to_multiple(dst_shape[1] * dst_shape[2], m0);
- win.set(Window::DimY, Window::Dimension(0, dim_y_collapsed, m0));
+ const size_t dim_y_collapsed = ceil_to_multiple(dst_shape[1] * dst_shape[2], dst_m0);
+ win.set(Window::DimY, Window::Dimension(0, dim_y_collapsed, dst_m0));
win.set(Window::DimZ, Window::Dimension(0, dst_shape.total_size_upper(3), 1));
return win;
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h
index ac32d2df32..139cf620e2 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,14 +22,15 @@
* SOFTWARE.
*/
-#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWDIRECTCONV2D
-#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWDIRECTCONV2D
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWDIRECTCONV2D_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWDIRECTCONV2D_H
#include "arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h"
#include "src/core/common/Macros.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h"
#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
namespace arm_compute
{
@@ -37,7 +38,7 @@ namespace experimental
{
namespace dynamic_fusion
{
-class GpuCkwDirectConv2d final : public IGpuCkwComponentDriver
+class GpuCkwDirectConv2d : public IGpuCkwComponentDriver
{
public:
using Attributes = ClComponentDirectConv2d::Attributes;
@@ -57,9 +58,7 @@ public:
const ArgumentPack<ITensorInfo> &tensors,
const Attributes &attributes,
const Settings &settings);
-
ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwDirectConv2d);
-
/** Destructor */
~GpuCkwDirectConv2d() override = default;
@@ -76,11 +75,11 @@ private:
const ITensorInfo *_bia;
const ITensorInfo *_dst;
- const Attributes _attributes;
- const Settings _settings;
+ Attributes _attributes;
+ Settings _settings;
};
} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
-#endif /* ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWDIRECTCONV2D */
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWDIRECTCONV2D_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp
index 2935ba45ea..fb55acad53 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,14 +27,11 @@
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/StringUtils.h"
#include "arm_compute/core/Validate.h"
-#include "ckw/TensorTileSampler.h"
-#include "ckw/types/TensorSamplerTypes.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
#include "src/dynamic_fusion/sketch/gpu/components/utils/type_printer/ElementwiseBinary.h"
@@ -42,10 +39,12 @@
#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
#include "support/StringSupport.h"
-#include <algorithm>
+#include "compute_kernel_writer/include/ckw/KernelWriter.h"
+#include "compute_kernel_writer/include/ckw/types/ConstantData.h"
+#include "compute_kernel_writer/include/ckw/types/TensorSamplerTypes.h"
+#include <cstdint>
#include <string>
-using namespace ckw;
namespace arm_compute
{
namespace experimental
@@ -67,67 +66,339 @@ void GpuCkwElementwiseBinary::write_component_code(const ComponentGroup &comp
GpuCkwVariableTable &vtable,
GpuCkwScopedKernelWriter writer) const
{
- const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
- const auto n0 = static_cast<int32_t>(root_window.x().step());
- const auto m0 = static_cast<int32_t>(root_window.y().step());
+ /********************************************************************************
+ * 1 - Define tensors
+ ********************************************************************************/
+ GpuCkwComponentArgument *lhs = vtable.declare_variable(comp_group, writer, _lhs, "lhs");
+ GpuCkwComponentArgument *rhs = vtable.declare_variable(comp_group, writer, _rhs, "rhs");
+ GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
- GpuCkwComponentArgument *lhs =
- vtable.declare_variable(comp_group, writer, _lhs, TensorStorageType::ClBufferUint8Ptr, "lhs");
- GpuCkwComponentArgument *rhs =
- vtable.declare_variable(comp_group, writer, _rhs, TensorStorageType::ClBufferUint8Ptr, "rhs");
- GpuCkwComponentArgument *dst =
- vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
+ /********************************************************************************
+ * 2 - Define CKW constants
+ ********************************************************************************/
+ const auto dst_h = static_cast<int32_t>(_dst->dimension(1));
- auto &gid_0 = writer->declare_tile("gid_0", ckw::DataType::Int32);
- auto &gid_1 = writer->declare_tile("gid_1", ckw::DataType::Int32);
- auto &gid_2 = writer->declare_tile("gid_2", ckw::DataType::Int32);
+ // CKW constants
+ auto const_dst_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_h}}, ckw::DataType::Int32));
+ auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32));
+ auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32));
- writer->op_get_global_id(gid_0, 0);
- writer->op_get_global_id(gid_1, 1);
- writer->op_get_global_id(gid_2, 2);
+ /********************************************************************************
+ * 3 - Define the compute block parameters and destination tile (if not root component)
+ * Bind the tile to the tensor to share it among different components and
+ * initialize the compute block parameters
+ ********************************************************************************/
+ // The compute block parameters depend on the employed tensor format
- auto &const_0 = writer->declare_tile("0", 0);
+ // Destination compute block size
+ int32_t dst_n0 = -1;
+ int32_t dst_m0 = -1;
- // Load the LHS and RHS tiles
- if (!lhs->has_tile())
+ // Destination compute block size left-over
+ int32_t dst_n0_partial = -1;
+ int32_t dst_m0_partial = -1;
+
+ if (!dst->has_tile())
{
- auto sampler = create_boundary_aware_2d_sampler(writer, gid_0, gid_1, _lhs->dimension(0), _lhs->dimension(1),
- n0, m0, "lhs_", const_0);
- sampler.format(TensorSamplerFormat::C_WH_1); // 3rd dimension collapsed with 2nd dimension
- sampler.z(const_0);
- sampler.b(gid_2);
- writer->op_load_once(lhs, sampler);
+ // If ROOT component, we use ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1
+ // as tensor format
+ const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
+
+ dst_n0 = root_window.x().step();
+ dst_m0 = root_window.y().step();
+ dst_n0_partial = _dst->dimension(0) % dst_n0;
+ dst_m0_partial = (_dst->dimension(1) * _dst->dimension(2)) % dst_m0;
+
+ ckw::TensorSampler sampler_dst;
+ sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1);
+ if (dst_n0_partial == 0)
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin);
+ }
+
+ if (dst_m0_partial == 0)
+ {
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::ClampToBorderMaxOnly);
+ }
+ sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+ // Declare destination tile
+ ckw::DataType dst_dt = to_ckw(_dst->data_type());
+ auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, dst_m0, dst_n0));
+
+ // Bind tile to the tensor
+ dst->init_virtual_tensor(tile_dst, sampler_dst);
}
- if (!rhs->has_tile())
+ else
{
- auto sampler = create_boundary_aware_2d_sampler(writer, gid_0, gid_1, _rhs->dimension(0), _rhs->dimension(1),
- n0, m0, "rhs_", const_0);
- sampler.format(TensorSamplerFormat::C_WH_1); // 3rd dimension collapsed with 2nd dimension
- sampler.z(const_0);
- sampler.b(gid_2);
- writer->op_load_once(rhs, sampler);
+ // Change dst_n0 and dst_m0 if NOT root component!
+ dst_n0 = dst->tile().tile_info().width();
+ dst_m0 = dst->tile().tile_info().height();
+
+ // Here, it is not required the calculation of dst_n0_partial and dst_m0_partial
+ // because if we enter this condition it means that the element-wise op is not the
+ // root component and the address modes have been already set.
}
- auto dst_sampler = create_boundary_aware_2d_sampler(writer, gid_0, gid_1, _dst->dimension(0), _dst->dimension(1),
- n0, m0, "dst_", const_0);
- dst_sampler.format(TensorSamplerFormat::C_WH_1); // 3rd dimension collapsed with 2nd dimension
- dst_sampler.z(const_0);
- dst_sampler.b(gid_2);
+ const auto &tile_dst = dst->tile();
- // Prepare the output tile.
- if (!dst->has_tile())
+ /********************************************************************************
+ * 4 - Define the compute block parameters CKW constants
+ ********************************************************************************/
+ // ...
+
+ /********************************************************************************
+ * 5 - Define the samplers for the input tensors
+ ********************************************************************************/
+ // Check whether the lhs tensor is a tile or tensor
+ // If it is a tile, create a sampler and load the content in a tile
+ if (!lhs->has_tile())
{
- auto &tile = writer->declare_tile(
- "dst_tile", ckw::TileInfo(to_ckw(_dst->data_type()), dst_sampler.height(), dst_sampler.width()));
- dst->init_virtual_tensor(tile, dst_sampler);
+ // Sampler
+ ckw::TensorSampler sampler_lhs = dst->tensor_sampler();
+
+ bool broadcast_x = false;
+ bool broadcast_y = false;
+
+ int32_t lhs_n0 = dst_n0;
+ int32_t lhs_m0 = dst_m0;
+
+ // Check whether we have broadcasting
+ // In case of broadcast, lhs can only be a vector or scalar.
+ // Broadcasting in other dimensions is not supported
+ if (_dst->dimension(0) != _lhs->dimension(0))
+ {
+ broadcast_x = true;
+ lhs_n0 = 1;
+ }
+
+ if (sampler_lhs.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1)
+ {
+ if (_dst->dimension(1) * _dst->dimension(2) != _lhs->dimension(1) * _lhs->dimension(2))
+ {
+ broadcast_y = true;
+ lhs_m0 = 1;
+ }
+ }
+ else if (sampler_lhs.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2)
+ {
+ if (_dst->dimension(1) != _lhs->dimension(1))
+ {
+ broadcast_y = true;
+ lhs_m0 = 1;
+ }
+ }
+
+ const int32_t lhs_partial_n0 = _lhs->dimension(0) % lhs_n0;
+ const int32_t lhs_shift_back = (lhs_n0 - lhs_partial_n0) % lhs_n0;
+
+ // Constants
+ auto const_lhs_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{lhs_n0}}, ckw::DataType::Int32));
+ auto const_lhs_m0_i32 = writer->declare_constant_tile(ckw::ConstantData({{lhs_m0}}, ckw::DataType::Int32));
+ auto const_lhs_shift_back_n0_i32 =
+ writer->declare_constant_tile(ckw::ConstantData({{lhs_shift_back}}, ckw::DataType::Int32));
+
+ auto tile_gid_0 = writer->declare_tile("gid_0_lhs", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_1 = writer->declare_tile("gid_1_lhs", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_2 = writer->declare_tile("gid_2_lhs", ckw::TileInfo(ckw::DataType::Int32));
+
+ writer->op_get_global_id(tile_gid_0, 0);
+ writer->op_get_global_id(tile_gid_1, 1);
+ writer->op_get_global_id(tile_gid_2, 2);
+
+ auto tile_cout0 = writer->declare_tile("cout0_lhs", ckw::TileInfo(ckw::DataType::Int32)); // OFM
+ auto tile_mout0 =
+ writer->declare_tile("mout0_lhs", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH or WIDTH x HEIGHT
+ auto tile_mout1 = writer->declare_tile("mout1_lhs", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT or 0
+ auto tile_bout0 = writer->declare_tile("bout0_lhs", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX
+
+ // Calculate coordinates
+ if (!broadcast_x)
+ {
+ get_coordinate_from_gws_overlapping_min(writer, tile_cout0, tile_gid_0, const_lhs_n0_i32,
+ const_lhs_shift_back_n0_i32, const_0_i32);
+ }
+ else
+ {
+ writer->op_assign(tile_cout0, const_0_i32);
+ }
+
+ if (!broadcast_y)
+ {
+ get_coordinate_from_gws(writer, tile_mout0, tile_gid_1, const_lhs_m0_i32);
+ }
+ else
+ {
+ writer->op_assign(tile_mout0, const_0_i32);
+ }
+
+ // Get the boundary aware coordinates at each global dimension index
+ if (sampler_lhs.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1)
+ {
+ writer->op_assign(tile_mout1, const_0_i32);
+ get_coordinate_from_gws(writer, tile_bout0, tile_gid_2, const_pos_1_i32);
+ }
+ else if (sampler_lhs.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2)
+ {
+ // For tile_mout1 and tile_bout0 the step can only be 1
+ if (!broadcast_y)
+ {
+ writer->op_binary(tile_mout1, ckw::BinaryOp::Mod, tile_gid_2, const_dst_h_i32);
+ }
+ else
+ {
+ // If broadcast_y == true, it means that we have either a scalar or vector
+ // because broadcasting in other dimensions is not supported
+ writer->op_assign(tile_mout1, const_0_i32);
+ }
+
+ writer->op_binary(tile_bout0, ckw::BinaryOp::Div, tile_gid_2, const_dst_h_i32);
+ }
+
+ ckw::DataType lhs_dt = to_ckw(_lhs->data_type());
+ auto tile_lhs = writer->declare_tile("lhs", ckw::TileInfo(lhs_dt, lhs_m0, lhs_n0));
+
+ writer->op_load(tile_lhs, lhs->tensor(), sampler_lhs, tile_cout0, tile_mout0, tile_mout1, tile_bout0);
+
+ // Here, init_virtual_tensor() is used to bring the tile_lhs outside the compound statement
+ lhs->init_virtual_tensor(tile_lhs, sampler_lhs);
}
- auto &lhs_tile = lhs->tile();
- auto &rhs_tile = rhs->tile();
- auto &dst_tile = dst->tile();
+ // Check whether the rhs tensor is a tile or tensor
+ // If it is a tile, create a sampler and load the content in a tile
+ if (!rhs->has_tile())
+ {
+ // Sampler
+ ckw::TensorSampler sampler_rhs = dst->tensor_sampler();
- // Perform the operation.
- writer->op_binary_expression(dst_tile, lhs_tile, to_ckw(_attributes), rhs_tile);
+ bool broadcast_x = false;
+ bool broadcast_y = false;
+
+ int32_t rhs_n0 = dst_n0;
+ int32_t rhs_m0 = dst_m0;
+
+ // Check whether we have broadcasting
+ // In case of broadcast, rhs can only be a vector or scalar.
+ // Broadcasting in other dimensions is not supported
+ if (_dst->dimension(0) != _rhs->dimension(0))
+ {
+ broadcast_x = true;
+ rhs_n0 = 1;
+ }
+
+ if (sampler_rhs.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1)
+ {
+ if (_dst->dimension(1) * _dst->dimension(2) != _rhs->dimension(1) * _rhs->dimension(2))
+ {
+ broadcast_y = true;
+ rhs_m0 = 1;
+ }
+ }
+ else if (sampler_rhs.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2)
+ {
+ if (_dst->dimension(1) != _rhs->dimension(1))
+ {
+ broadcast_y = true;
+ rhs_m0 = 1;
+ }
+ }
+
+ const int32_t rhs_partial_n0 = _rhs->dimension(0) % rhs_n0;
+ const int32_t rhs_shift_back = (rhs_n0 - rhs_partial_n0) % rhs_n0;
+
+ // Constants
+ auto const_rhs_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{rhs_n0}}, ckw::DataType::Int32));
+ auto const_rhs_m0_i32 = writer->declare_constant_tile(ckw::ConstantData({{rhs_m0}}, ckw::DataType::Int32));
+ auto const_rhs_shift_back_n0_i32 =
+ writer->declare_constant_tile(ckw::ConstantData({{rhs_shift_back}}, ckw::DataType::Int32));
+
+ auto tile_gid_0 = writer->declare_tile("gid_0_rhs", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_1 = writer->declare_tile("gid_1_rhs", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_2 = writer->declare_tile("gid_2_rhs", ckw::TileInfo(ckw::DataType::Int32));
+
+ writer->op_get_global_id(tile_gid_0, 0);
+ writer->op_get_global_id(tile_gid_1, 1);
+ writer->op_get_global_id(tile_gid_2, 2);
+
+ auto tile_cout0 = writer->declare_tile("cout0_rhs", ckw::TileInfo(ckw::DataType::Int32)); // OFM
+ auto tile_mout0 =
+ writer->declare_tile("mout0_rhs", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH or WIDTH x HEIGHT
+ auto tile_mout1 = writer->declare_tile("mout1_rhs", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT or 0
+ auto tile_bout0 = writer->declare_tile("bout0_rhs", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX
+
+ // Calculate coordinates
+ if (!broadcast_x)
+ {
+ get_coordinate_from_gws_overlapping_min(writer, tile_cout0, tile_gid_0, const_rhs_n0_i32,
+ const_rhs_shift_back_n0_i32, const_0_i32);
+ }
+ else
+ {
+ writer->op_assign(tile_cout0, const_0_i32);
+ }
+
+ if (!broadcast_y)
+ {
+ get_coordinate_from_gws(writer, tile_mout0, tile_gid_1, const_rhs_m0_i32);
+ }
+ else
+ {
+ writer->op_assign(tile_mout0, const_0_i32);
+ }
+
+ // Get the boundary aware coordinates at each global dimension index
+ if (sampler_rhs.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1)
+ {
+ writer->op_assign(tile_mout1, const_0_i32);
+ get_coordinate_from_gws(writer, tile_bout0, tile_gid_2, const_pos_1_i32);
+ }
+ else if (sampler_rhs.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2)
+ {
+ // For tile_mout1 and tile_bout0 the step can only be 1
+ const auto src_w = static_cast<int32_t>(_rhs->dimension(1));
+ auto const_src_w = writer->declare_constant_tile(ckw::ConstantData({{src_w}}, ckw::DataType::Int32));
+ if (!broadcast_y)
+ {
+ writer->op_binary(tile_mout1, ckw::BinaryOp::Mod, tile_mout1, const_src_w);
+ }
+ else
+ {
+ // If broadcast_y == true, it means that we have either a scalar or vector
+ // because broadcasting in other dimensions is not supported
+ writer->op_assign(tile_mout1, const_0_i32);
+ }
+
+ writer->op_binary(tile_bout0, ckw::BinaryOp::Div, tile_mout1, const_src_w);
+ }
+
+ ckw::DataType rhs_dt = to_ckw(_rhs->data_type());
+ auto tile_rhs = writer->declare_tile("rhs", ckw::TileInfo(rhs_dt, rhs_m0, rhs_n0));
+
+ writer->op_load(tile_rhs, rhs->tensor(), sampler_rhs, tile_cout0, tile_mout0, tile_mout1, tile_bout0);
+
+ // Here, init_virtual_tensor() is used to bring the tile_rhs outside the compound statement
+ rhs->init_virtual_tensor(tile_rhs, sampler_rhs);
+ }
+
+ const auto &tile_lhs = lhs->tile();
+ const auto &tile_rhs = rhs->tile();
+
+ /********************************************************************************
+ * 7 - Write the rest of the code
+ ********************************************************************************/
+ // Perform the element-wise operation
+ writer->op_binary(tile_dst, to_ckw(_attributes), tile_lhs, tile_rhs);
+
+ ARM_COMPUTE_ERROR_ON_MSG(dst->has_tile() == false, "You must bind a tile before appending another component");
}
Window GpuCkwElementwiseBinary::get_window() const
@@ -138,8 +409,8 @@ Window GpuCkwElementwiseBinary::get_window() const
// Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) unchanged
// This is in line with the collapsing convention used by operators like Conv2d
output_shape.collapse(2U, 1U);
- constexpr unsigned int vector_size_byte_opencl = 16;
- const unsigned int num_elems_processed_per_iteration =
+ constexpr uint32_t vector_size_byte_opencl = 16;
+ const uint32_t num_elems_processed_per_iteration =
adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0));
Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration));
@@ -158,24 +429,6 @@ std::string GpuCkwElementwiseBinary::get_name(const ComponentGroup &comp_group)
};
return join(build_params, "_");
}
-
-std::string GpuCkwElementwiseBinary::get_tuner_id(const ComponentGroup &comp_group) const
-{
- ARM_COMPUTE_UNUSED(comp_group);
- /// NOTE: Hardcoded for now, the parameters should ideally be exported by ckw (a selection of constant tiles)
- std::vector<std::string> build_params = {
- "elementwise_binary",
- "op",
- to_string(_attributes.operation()),
- "dt",
- lower_string(string_from_data_type(_dst->data_type())),
- "dst_dim0",
- support::cpp11::to_string(_dst->dimension(0)),
- "dst_dim1",
- support::cpp11::to_string(_dst->dimension(1)),
- };
- return join(build_params, "_");
-}
} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h
index 1a20d4c533..c6cbba28d3 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWELEMENTWISEBINARY
-#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWELEMENTWISEBINARY
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWELEMENTWISEBINARY_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWELEMENTWISEBINARY_H
#include "src/core/common/Macros.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h"
@@ -56,7 +56,6 @@ public:
GpuCkwScopedKernelWriter writer) const override;
Window get_window() const override;
std::string get_name(const ComponentGroup &comp_group) const override;
- std::string get_tuner_id(const ComponentGroup &comp_group) const override;
private:
const ITensorInfo *_lhs;
@@ -68,4 +67,4 @@ private:
} // namespace experimental
} // namespace arm_compute
-#endif /* ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWELEMENTWISEBINARY */
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWELEMENTWISEBINARY_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.cpp
index 9beba03598..14ad3847fc 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,18 +24,20 @@
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.h"
+#include "arm_compute/core/Error.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/Validate.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
#include "support/StringSupport.h"
-using namespace ckw;
+#include "compute_kernel_writer/include/ckw/KernelWriter.h"
+#include <cstdint>
+
namespace arm_compute
{
namespace experimental
@@ -59,189 +61,189 @@ void GpuCkwMatMul::write_component_code(const ComponentGroup &comp_group,
GpuCkwVariableTable &vtable,
GpuCkwScopedKernelWriter writer) const
{
+ /********************************************************************************
+ * 1 - Define tensors
+ ********************************************************************************/
+ GpuCkwComponentArgument *lhs = vtable.declare_variable(comp_group, writer, _lhs, "lhs");
+ GpuCkwComponentArgument *rhs = vtable.declare_variable(comp_group, writer, _rhs, "rhs");
+ GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
+
+ /********************************************************************************
+ * 2 - Define CKW constants
+ ********************************************************************************/
+ const auto k =
+ _attributes.adj_lhs() ? static_cast<int32_t>(_lhs->dimension(1)) : static_cast<int32_t>(_lhs->dimension(0));
+ const auto k0 = static_cast<int32_t>(adjust_vec_size(_settings.k0(), k));
+ const auto dst_dt = to_ckw(_dst->data_type());
+
+ // CKW constants
+ auto const_k_i32 = writer->declare_constant_tile(ckw::ConstantData({{k}}, ckw::DataType::Int32));
+ auto const_k0_i32 = writer->declare_constant_tile(ckw::ConstantData({{k0}}, ckw::DataType::Int32));
+ auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32));
+ auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32));
+ auto const_0_fp = writer->declare_constant_tile(ckw::ConstantData({{0.0f}}, dst_dt));
+ auto const_k_minus_k0_i32 = writer->declare_constant_tile(ckw::ConstantData({{k - k0}}, ckw::DataType::Int32));
+
+ /********************************************************************************
+ * 3 - Define the compute block parameters and destination tile (if not root component)
+ * Bind the tile to the tensor to share it among different components and
+ * initialize the compute block parameters
+ ********************************************************************************/
+    // The n0 and m0 parameters from root_window only refer to the output
const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
- GpuCkwComponentArgument *lhs =
- vtable.declare_variable(comp_group, writer, _lhs, TensorStorageType::ClBufferUint8Ptr, "lhs");
- GpuCkwComponentArgument *rhs =
- vtable.declare_variable(comp_group, writer, _rhs, TensorStorageType::ClBufferUint8Ptr, "rhs");
- GpuCkwComponentArgument *dst =
- vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
-
- // Constants
- const int height_idx = get_data_layout_dimension_index(_lhs->data_layout(), DataLayoutDimension::HEIGHT);
- const auto &rhs_h = writer->declare_tile("rhs_h", static_cast<int32_t>(_rhs->dimension(height_idx)));
- const int m = static_cast<int>(_dst->dimension(1));
- const int n = static_cast<int>(_dst->dimension(0));
- const int k =
- _attributes.adj_lhs() ? static_cast<int>(_lhs->tensor_shape().y()) : static_cast<int>(_lhs->tensor_shape().x());
- const int m0 = root_window.y().step();
- const int n0 = root_window.x().step();
- const int k0 = _settings.k0();
- const int partial_store_m0 = m % m0;
- const int partial_store_n0 = n % n0;
-
- const auto &const_1 = writer->declare_tile("1", 1);
- auto &const_0 = writer->declare_tile("0", 0);
- auto &k0_tile = writer->declare_tile("k0", k0);
- auto &k_tile = writer->declare_tile("k", k);
-
- auto &gid_0 = writer->declare_tile("gid_0", ckw::DataType::Int32);
- auto &gid_1 = writer->declare_tile("gid_1", ckw::DataType::Int32);
- auto &gid_2 = writer->declare_tile("gid_2", ckw::DataType::Int32);
-
- writer->op_get_global_id(gid_0, 0);
- writer->op_get_global_id(gid_1, 1);
- writer->op_get_global_id(gid_2, 2);
-
- auto &x = writer->declare_tile("x", ckw::DataType::Int32);
- auto &y = writer->declare_tile("y", ckw::DataType::Int32);
- auto &z = writer->declare_tile("z", ckw::DataType::Int32);
-
- get_coord(writer, x, gid_0, n0, partial_store_n0, "gid_x_", const_0);
- get_coord(writer, y, gid_1, m0, partial_store_m0, "gid_y_", const_0);
- get_coord(writer, z, gid_2, 1, 0, "gid_z_", const_0);
-
- TensorTileSampler lhs_sampler;
- lhs_sampler.height(m0);
- lhs_sampler.width(k0);
- lhs_sampler.format(TensorSamplerFormat::C_W_H);
- lhs_sampler.address_mode_x(TensorSamplerAddressModeX::None);
- lhs_sampler.address_mode_y(TensorSamplerAddressModeY::None);
- lhs_sampler.address_mode_z(TensorSamplerAddressModeZ::None);
-
- TensorTileSampler rhs_sampler;
- rhs_sampler.height(k0);
- rhs_sampler.width(n0);
- rhs_sampler.format(TensorSamplerFormat::C_WH_1);
- rhs_sampler.address_mode_x(TensorSamplerAddressModeX::None);
- rhs_sampler.address_mode_y(TensorSamplerAddressModeY::None);
- rhs_sampler.address_mode_z(TensorSamplerAddressModeZ::None);
-
- TensorTileSampler dst_sampler;
- dst_sampler.width(n0);
- dst_sampler.height(m0);
- dst_sampler.format(TensorSamplerFormat::C_W_H);
- dst_sampler.address_mode_x(TensorSamplerAddressModeX::OverlappingMin);
- dst_sampler.address_mode_y(TensorSamplerAddressModeY::None);
- dst_sampler.address_mode_z(TensorSamplerAddressModeZ::None);
- dst_sampler.x(x);
- dst_sampler.y(y);
- dst_sampler.z(z);
- dst_sampler.b(const_0);
-
- if (!dst->has_tile())
+ // Destination compute block size
+ const int32_t dst_n0 = root_window.x().step();
+ const int32_t dst_m0 = root_window.y().step();
+
+ // Destination compute block size left-over
+ const int32_t dst_n0_partial = _dst->dimension(0) % dst_n0;
+ const int32_t dst_m0_partial = _dst->dimension(1) % dst_m0;
+
+ // Shift-back for the overlapping-min strategy
+ const int32_t dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0;
+
+ ckw::TensorSampler sampler_dst;
+ sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+ if (dst_n0_partial == 0)
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ }
+ else
{
- auto &dst_tile = writer->declare_tile("dst_tile", ckw::TileInfo(to_ckw(_dst->data_type()), m0, n0));
- dst->init_virtual_tensor(dst_tile, dst_sampler);
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin);
}
- auto &dst_tile = dst->tile();
- // Initialize the accumulators
- writer->op_assign(dst_tile, const_0);
+ if (dst_m0_partial == 0)
+ {
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::ClampToBorderMaxOnly);
+ }
- auto &rhs_z = writer->declare_tile("rhs_z", ckw::DataType::Int32);
- writer->op_binary_expression(rhs_z, z, BinaryOp::Mul, rhs_h);
+ sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+ // Declare destination tile
+ auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, dst_m0, dst_n0));
+
+ // Initialize destination tile
+ writer->op_assign(tile_dst, const_0_fp);
+
+ // Bind tile to the tensor
+ dst->init_virtual_tensor(tile_dst, sampler_dst);
+
+ /********************************************************************************
+     * 4 - Define the compute block parameters as CKW constants
+ ********************************************************************************/
+    // Only now can we declare N0 and M0 as constants
+ auto const_dst_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32));
+ auto const_dst_m0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_m0}}, ckw::DataType::Int32));
+ auto const_shift_back_dst_n0_i32 =
+ writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32));
+
+ /********************************************************************************
+ * 5 - Define the samplers for the input tensors
+ ********************************************************************************/
+ // LHS SAMPLER
+    // The assumption here is that M is a multiple of M0. This limitation will be removed once
+    // we have support for OverlappingMin as the address mode for the Y direction
+ ckw::TensorSampler sampler_lhs;
+ sampler_lhs.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+ sampler_lhs.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ sampler_lhs.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ sampler_lhs.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_lhs.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+ // RHS SAMPLER
+ ckw::TensorSampler sampler_rhs;
+ sampler_rhs.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+ sampler_rhs.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ sampler_rhs.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ sampler_rhs.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_rhs.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+ /********************************************************************************
+ * 6 - Extra operations required before writing the main code (optional)
+ ********************************************************************************/
+
+ // Not required
+
+ /********************************************************************************
+ * 7 - Get the coordinates of the destination tile
+ ********************************************************************************/
+ auto tile_gid_0 = writer->declare_tile("gid_0", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_1 = writer->declare_tile("gid_1", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_2 = writer->declare_tile("gid_2", ckw::TileInfo(ckw::DataType::Int32));
+
+ writer->op_get_global_id(tile_gid_0, 0);
+ writer->op_get_global_id(tile_gid_1, 1);
+ writer->op_get_global_id(tile_gid_2, 2);
+
+ auto tile_idx_n = writer->declare_tile("idx_n", ckw::TileInfo(ckw::DataType::Int32)); // N index
+ auto tile_idx_m = writer->declare_tile("idx_m", ckw::TileInfo(ckw::DataType::Int32)); // M index
+ auto tile_idx_b = writer->declare_tile("idx_b", ckw::TileInfo(ckw::DataType::Int32)); // BATCH index
+
+ // Calculate coordinates
+ get_coordinate_from_gws_overlapping_min(writer, tile_idx_n, tile_gid_0, const_dst_n0_i32,
+ const_shift_back_dst_n0_i32, const_0_i32);
+ get_coordinate_from_gws(writer, tile_idx_m, tile_gid_1, const_dst_m0_i32);
+ get_coordinate_from_gws(writer, tile_idx_b, tile_gid_2, const_pos_1_i32);
+
+ /********************************************************************************
+ * 8 - Write the rest of the code
+ ********************************************************************************/
+ auto tile_idx_k = writer->declare_tile("idx_k", ckw::TileInfo(ckw::DataType::Int32)); // K index
+
+ writer->op_assign(tile_idx_k, const_0_i32);
- auto &k_i = writer->declare_tile("k_i", ckw::DataType::Int32);
- auto &k_limit = writer->declare_tile("k_limit", k - k0);
+ // clang-format off
+ writer->op_for_loop(tile_idx_k, ckw::BinaryOp::LessEqual, const_k_minus_k0_i32, tile_idx_k, ckw::AssignmentOp::Increment, const_k0_i32,
+ [&]()
+ {
+ auto tile_lhs = writer->declare_tile("lhs", ckw::TileInfo(to_ckw(_lhs->data_type()), dst_m0, k0));
+ auto tile_rhs = writer->declare_tile("rhs", ckw::TileInfo(to_ckw(_rhs->data_type()), dst_n0, k0));
+ writer->op_assign(tile_lhs, const_0_fp);
+ writer->op_assign(tile_rhs, const_0_fp);
- auto &x_i = writer->declare_tile("x_i", ckw::DataType::Int32);
- writer->op_assign(x_i, const_0);
+ writer->op_load(tile_lhs, lhs->tensor(), sampler_lhs, tile_idx_k, tile_idx_m, tile_idx_b, const_0_i32);
+ writer->op_load(tile_rhs, rhs->tensor(), sampler_rhs, tile_idx_k, tile_idx_n, tile_idx_b, const_0_i32);
- writer->op_assign(k_i, const_0);
+ writer->op_binary(tile_dst, ckw::BinaryOp::MatMul_Nt_T, tile_lhs, tile_rhs);
- // *INDENT-OFF*
- // clang-format off
- writer->op_for_loop(k_i, BinaryOp::LessEqual, k_limit, k_i, AssignmentOp::Increment, k0_tile,
- [&]()
- {
- //Initialize tiles
- // lhs_tile
- auto &a = writer->declare_tile("a", ckw::TileInfo(to_ckw(_lhs->data_type()), m0, k0));
- // rhs_tile
- auto &b = writer->declare_tile("b", ckw::TileInfo(to_ckw(_rhs->data_type()), n0, k0));
- writer->op_assign(a, const_0);
- writer->op_assign(b, const_0);
-
- // Loading the tiles
- // LHS
- lhs_sampler.x(x_i);
- lhs_sampler.y(y);
- lhs_sampler.z(z);
- lhs_sampler.b(const_0);
- writer->op_load(a, lhs->tensor(), lhs_sampler);
-
- // RHS
- auto &y_i = writer->declare_tile("y_i", ckw::DataType::Int32);
- writer->op_binary_expression(y_i, x, BinaryOp::Add, rhs_z);
- rhs_sampler.x(k_i);
- rhs_sampler.y(y_i);
- rhs_sampler.z(const_0);
- rhs_sampler.b(const_0);
- writer->op_load(b, rhs->tensor(), rhs_sampler);
-
- // Perform Matmul
- writer->op_binary_expression(dst_tile, a, BinaryOp::MatMul_Nt_T, b);
- writer->op_binary_expression(x_i, x_i, BinaryOp::Add, k0_tile);
- });
-// *INDENT-ON*
- // clang-format on
+ });
- // Handling leftovers
- if (k % k0 != 0)
+ // Left-over accumulations for when K is not a multiple of k0
+ if(((k % k0) != 0))
{
- // *INDENT-OFF*
- // clang-format off
- writer->op_for_loop(k_i, BinaryOp::Less, k_tile, k_i, AssignmentOp::Increment, const_1,
- [&]()
- {
- //Initialize tiles
- // lhs_tile
- auto &a =
- writer->declare_tile("a_leftover", ckw::TileInfo(to_ckw(_lhs->data_type()), m0, 1));
- // rhs_tile
- auto &b =
- writer->declare_tile("b_leftover", ckw::TileInfo(to_ckw(_rhs->data_type()), n0, 1));
- writer->op_assign(a, const_0);
- writer->op_assign(b, const_0);
-
- // Loading the tiles
- // LHS
- lhs_sampler.x(x_i);
- lhs_sampler.y(y);
- lhs_sampler.z(z);
- lhs_sampler.b(const_0);
- writer->op_load(a, lhs->tensor(), lhs_sampler);
-
- // RHS
- auto &y_i = writer->declare_tile("y_i_leftover", ckw::DataType::Int32);
- writer->op_binary_expression(y_i, x, BinaryOp::Add, rhs_z);
- rhs_sampler.x(k_i);
- rhs_sampler.y(y_i);
- rhs_sampler.z(const_0);
- rhs_sampler.b(const_0);
- writer->op_load(b, rhs->tensor(), rhs_sampler);
-
- // Perform Matmul
- writer->op_binary_expression(dst_tile, a, BinaryOp::MatMul_Nt_T, b);
- writer->op_binary_expression(x_i, x_i, BinaryOp::Add, const_1);
- });
-// *INDENT-ON*
- // clang-format on
+ writer->op_for_loop(tile_idx_k, ckw::BinaryOp::Less, const_k_i32, tile_idx_k, ckw::AssignmentOp::Increment, const_pos_1_i32, [&]()
+ {
+ auto tile_lhs = writer->declare_tile("lhs", ckw::TileInfo(to_ckw(_lhs->data_type()), dst_m0, 1));
+ auto tile_rhs = writer->declare_tile("rhs", ckw::TileInfo(to_ckw(_rhs->data_type()), dst_n0, 1));
+ writer->op_assign(tile_lhs, const_0_fp);
+ writer->op_assign(tile_rhs, const_0_fp);
+
+ writer->op_load(tile_lhs, lhs->tensor(), sampler_lhs, tile_idx_k, tile_idx_m, tile_idx_b, const_0_i32);
+ writer->op_load(tile_rhs, rhs->tensor(), sampler_rhs, tile_idx_k, tile_idx_n, tile_idx_b, const_0_i32);
+
+ writer->op_binary(tile_dst, ckw::BinaryOp::MatMul_Nt_T, tile_lhs, tile_rhs);
+ });
}
+ // clang-format on
}
Window GpuCkwMatMul::get_window() const
{
ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
- const int m = _dst->dimension(1);
- const int n = _dst->dimension(0);
- const bool adj_lhs = _attributes.adj_lhs();
+ const int32_t m = _dst->dimension(1);
+ const int32_t n = _dst->dimension(0);
+ const bool adj_lhs = _attributes.adj_lhs();
- int m0 = adj_lhs ? adjust_vec_size(_settings.m0(), m) : std::min(_settings.m0(), m);
- int n0 = adjust_vec_size(_settings.n0(), n);
+ const int32_t m0 = adj_lhs ? adjust_vec_size(_settings.m0(), m) : std::min(_settings.m0(), m);
+ const int32_t n0 = adjust_vec_size(_settings.n0(), n);
// Configure kernel window
Window win = calculate_max_window(_dst->tensor_shape(), Steps(n0, m0));
@@ -256,9 +258,9 @@ std::string GpuCkwMatMul::get_name(const ComponentGroup &comp_group) const
std::string kernel_name("mat_mul_native");
- const int m = _dst->dimension(1);
- const int n = _dst->dimension(0);
- const int k = _attributes.adj_lhs() ? _lhs->tensor_shape().y() : _lhs->tensor_shape().x();
+ const int32_t m = _dst->dimension(1);
+ const int32_t n = _dst->dimension(0);
+ const int32_t k = _attributes.adj_lhs() ? _lhs->tensor_shape().y() : _lhs->tensor_shape().x();
kernel_name += _attributes.adj_lhs() ? "_t" : "_nt";
kernel_name += _attributes.adj_rhs() ? "_t" : "_nt";
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.h
index ae2ea09f05..790418bf50 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,6 +30,7 @@
#include "src/core/common/Macros.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h"
#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
namespace arm_compute
{
@@ -75,8 +76,8 @@ private:
const ITensorInfo *_rhs;
const ITensorInfo *_dst;
- const Attributes _attributes;
- const Settings _settings;
+ Attributes _attributes;
+ Settings _settings;
};
} // namespace dynamic_fusion
} // namespace experimental
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp
index 8ab3ec3a55..d027f348ef 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,18 +26,17 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/Validate.h"
-#include "ckw/TensorTileSampler.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
-using namespace ckw;
+#include "compute_kernel_writer/include/ckw/KernelWriter.h"
+#include <cstdint>
namespace arm_compute
{
@@ -61,272 +60,324 @@ void GpuCkwPool2d::write_component_code(const ComponentGroup &comp_group,
GpuCkwVariableTable &vtable,
GpuCkwScopedKernelWriter writer) const
{
- const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
- const unsigned int n0 = root_window.x().step();
- const unsigned int m0 = root_window.y().step();
-
- GpuCkwComponentArgument *src =
- vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src");
- GpuCkwComponentArgument *dst =
- vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
-
- TileOperand &gid_0 = writer->declare_tile("gid_0", ckw::DataType::Int32);
- TileOperand &gid_1 = writer->declare_tile("gid_1", ckw::DataType::Int32);
- TileOperand &gid_2 = writer->declare_tile("gid_2", ckw::DataType::Int32);
-
- writer->op_get_global_id(gid_0, 0);
- writer->op_get_global_id(gid_1, 1);
- writer->op_get_global_id(gid_2, 2);
-
- // Data Layout is NHWC
- constexpr int width_idx = 1;
- constexpr int height_idx = 2;
-
- const int32_t pool_size_x = static_cast<int32_t>(_attributes.pool_size().x());
- const int32_t pool_size_y = static_cast<int32_t>(_attributes.pool_size().y());
- const int32_t pad_x = static_cast<int32_t>(_attributes.pad().left);
- const int32_t pad_y = static_cast<int32_t>(_attributes.pad().top);
- const int32_t src_width = static_cast<int32_t>(_src->dimension(width_idx));
- const int32_t src_height = static_cast<int32_t>(_src->dimension(height_idx));
- const auto src_data_type = _src->data_type();
-
- // Check if this is global pooling path
- const bool is_global_pooling =
- (pool_size_x == src_width) && (pool_size_y == src_height) && (pad_x == 0) && (pad_y == 0);
- // Check if this a case of FP_MIXED_PRECISION
- const bool use_fp_mixed_precision =
- (src_data_type == DataType::F16) && _settings.mixed_precision() && _attributes.pool_type() != PoolingType::MAX;
- const auto acc_data_type = (use_fp_mixed_precision) ? (DataType::F32) : (src_data_type);
-
- TileOperand &const_0 = writer->declare_tile("0", 0);
- const TileOperand &const_1 = writer->declare_tile("1", 1);
- const TileOperand &const_lowest_value = writer->declare_tile("LOWEST_VALUE", std::numeric_limits<float>::lowest());
- const TileOperand &pool_size_x_tile = writer->declare_tile("POOL_SIZE_X", pool_size_x);
- const TileOperand &pool_size_y_tile = writer->declare_tile("POOL_SIZE_Y", pool_size_y);
- const TileOperand &stride_x_tile = writer->declare_tile("STRIDE_X", static_cast<int32_t>(_attributes.stride().x()));
- const TileOperand &stride_y_tile = writer->declare_tile("STRIDE_Y", static_cast<int32_t>(_attributes.stride().y()));
- const TileOperand &pad_x_tile = writer->declare_tile("PAD_X", pad_x);
- const TileOperand &pad_y_tile = writer->declare_tile("PAD_Y", pad_y);
- const TileOperand &dst_height_tile =
- writer->declare_tile("DST_HEIGHT", static_cast<int32_t>(_dst->dimension(height_idx)));
- const TileOperand &src_height_tile = writer->declare_tile("SRC_HEIGHT", src_height);
- const TileOperand &src_width_tile = writer->declare_tile("SRC_WIDTH", src_width);
-
- TileOperand &idx_out_n = writer->declare_tile("idx_out_n", ckw::DataType::Int32);
- TileOperand &idx_out_h = writer->declare_tile("idx_out_h", ckw::DataType::Int32);
- TileOperand &idx_out_w = writer->declare_tile("idx_out_w", ckw::DataType::Int32);
- TileOperand &idx_out_c = writer->declare_tile("idx_out_c", ckw::DataType::Int32);
-
- const int32_t dst_partial_n0_v = _dst->tensor_shape()[0] % n0;
-
- get_coord(writer, idx_out_c, gid_0, n0, dst_partial_n0_v, "dst_x_", const_0);
- get_coord(writer, idx_out_w, gid_1, 1, 0, "dst_y_", const_0);
-
- writer->op_binary_expression(idx_out_h, gid_2, BinaryOp::Mod, dst_height_tile); // gid_2 % h
- writer->op_binary_expression(idx_out_n, gid_2, BinaryOp::Div, dst_height_tile); // gid_2 / h
-
- TensorTileSampler src_sampler;
- src_sampler.width(n0);
- src_sampler.height(m0);
- src_sampler.format(TensorSamplerFormat::C_W_H);
- src_sampler.address_mode_x(TensorSamplerAddressModeX::None);
- src_sampler.address_mode_y(TensorSamplerAddressModeY::None);
- src_sampler.address_mode_z(TensorSamplerAddressModeZ::None);
- src_sampler.x(idx_out_c);
- src_sampler.b(idx_out_n);
-
- TensorTileSampler dst_sampler;
- dst_sampler.width(n0);
- dst_sampler.height(m0);
- dst_sampler.format(TensorSamplerFormat::C_W_H);
- dst_sampler.address_mode_x(TensorSamplerAddressModeX::OverlappingMin);
- dst_sampler.address_mode_y(TensorSamplerAddressModeY::None);
- dst_sampler.address_mode_z(TensorSamplerAddressModeZ::None);
- dst_sampler.x(idx_out_c);
- dst_sampler.y(idx_out_w);
- dst_sampler.z(idx_out_h);
- dst_sampler.b(idx_out_n);
-
- // Prepare dst tensor and tile
- TileInfo dst_tile_info = TileInfo(to_ckw(src_data_type), m0, n0);
- if (!dst->has_tile())
+ const uint32_t width_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::WIDTH);
+ const uint32_t height_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::HEIGHT);
+
+ /********************************************************************************
+ * 1 - Define tensors
+ ********************************************************************************/
+ GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src");
+ GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
+
+ /********************************************************************************
+ * 2 - Define CKW constants
+ ********************************************************************************/
+ const auto dst_dt = to_ckw(_dst->data_type());
+ const auto pool_sz_x = static_cast<int32_t>(_attributes.pool_size().x());
+ const auto pool_sz_y = static_cast<int32_t>(_attributes.pool_size().y());
+ const auto pad_x = static_cast<int32_t>(_attributes.pad().left);
+ const auto pad_y = static_cast<int32_t>(_attributes.pad().top);
+ const auto stride_x = static_cast<int32_t>(_attributes.stride().x());
+ const auto stride_y = static_cast<int32_t>(_attributes.stride().y());
+ const auto src_w = static_cast<int32_t>(_src->dimension(width_idx));
+ const auto src_h = static_cast<int32_t>(_src->dimension(height_idx));
+ const auto dst_h = static_cast<int32_t>(_dst->dimension(height_idx));
+
+ // CKW constants
+ auto const_pool_sz_x_i32 = writer->declare_constant_tile(ckw::ConstantData({{pool_sz_x}}, ckw::DataType::Int32));
+ auto const_pool_sz_y_i32 = writer->declare_constant_tile(ckw::ConstantData({{pool_sz_y}}, ckw::DataType::Int32));
+ auto const_pad_x_i32 = writer->declare_constant_tile(ckw::ConstantData({{pad_x}}, ckw::DataType::Int32));
+ auto const_pad_y_i32 = writer->declare_constant_tile(ckw::ConstantData({{pad_y}}, ckw::DataType::Int32));
+ auto const_stride_x_i32 = writer->declare_constant_tile(ckw::ConstantData({{stride_x}}, ckw::DataType::Int32));
+ auto const_stride_y_i32 = writer->declare_constant_tile(ckw::ConstantData({{stride_y}}, ckw::DataType::Int32));
+ auto const_src_w_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_w}}, ckw::DataType::Int32));
+ auto const_src_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_h}}, ckw::DataType::Int32));
+ auto const_dst_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_h}}, ckw::DataType::Int32));
+ auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32));
+ auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32));
+ auto const_0_fp = writer->declare_constant_tile(ckw::ConstantData({{0.0f}}, dst_dt));
+ auto const_lowest_val_fp =
+ writer->declare_constant_tile(ckw::ConstantData({{std::numeric_limits<float>::lowest()}}, ckw::DataType::Fp32));
+ auto const_neg_inf_val_fp = writer->declare_constant_tile(ckw::ConstantData({{-1.0f / 0.0f}}, ckw::DataType::Fp32));
+
+ /********************************************************************************
+ * 3 - Define the compute block parameters and destination tile (if not root component)
+ * Bind the tile to the tensor to share it among different components and
+ * initialize the compute block parameters
+ ********************************************************************************/
+    // The n0 and m0 parameters from root_window only refer to the output
+ const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
+
+ // Destination compute block size
+ const int32_t dst_n0 = root_window.x().step();
+ const int32_t dst_m0 = root_window.y().step();
+
+ // Destination compute block size left-over
+ const int32_t dst_n0_partial = _dst->dimension(0) % dst_n0;
+ const int32_t dst_m0_partial = _dst->dimension(1) % dst_m0;
+
+ // Shift-back for the overlapping-min strategy
+ const int32_t dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0;
+
+ ckw::TensorSampler sampler_dst;
+ sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+ if (dst_n0_partial == 0)
{
- TileOperand &dst_tile = writer->declare_tile("dst_tile", dst_tile_info);
- dst->init_virtual_tensor(dst_tile, dst_sampler);
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin);
+ }
+
+ if (dst_m0_partial == 0)
+ {
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::ClampToBorderMaxOnly);
}
- const TileOperand &dst_tile = dst->tile();
+ sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+ // Declare destination tile
+ auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, dst_m0, dst_n0));
+
+ // Initialize destination tile
+ writer->op_assign(tile_dst, const_0_fp);
+
+ // Bind tile to the tensor
+ dst->init_virtual_tensor(tile_dst, sampler_dst);
+
+ /********************************************************************************
+     * 4 - Define the compute block parameters as CKW constants
+ ********************************************************************************/
+    // Only now can we declare N0 and M0 as constants
+ auto const_dst_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32));
+ auto const_dst_m0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_m0}}, ckw::DataType::Int32));
+ auto const_shift_back_dst_n0_i32 =
+ writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32));
+
+ /********************************************************************************
+ * 5 - Define the sampler for the input tensor
+ ********************************************************************************/
+ ckw::TensorSampler sampler_src;
+ sampler_src.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+ sampler_src.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ sampler_src.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ sampler_src.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+
+ /********************************************************************************
+ * 6 - Extra operations required before writing the main code
+ ********************************************************************************/
+ // Check if it is global pooling
+ const bool is_global_pooling = (pool_sz_x == src_w) && (pool_sz_y == src_h) && (pad_x == 0) && (pad_y == 0);
+
+ // Accumulate always in F32 if the pool type is not MAX
+ const bool acc_f32 = (dst_dt == ckw::DataType::Fp32) ||
+ ((dst_dt == ckw::DataType::Fp16) && _attributes.pool_type() != PoolingType::MAX);
+
+ const auto acc_dt = acc_f32 ? ckw::DataType::Fp32 : ckw::DataType::Fp16;
+
+ const bool is_wider_acc = dst_dt != acc_dt;
+
+ /********************************************************************************
+ * 7 - Get the coordinates of the destination tile
+ ********************************************************************************/
+ auto tile_gid_0 = writer->declare_tile("gid_0", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_1 = writer->declare_tile("gid_1", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_2 = writer->declare_tile("gid_2", ckw::TileInfo(ckw::DataType::Int32));
+
+ writer->op_get_global_id(tile_gid_0, 0);
+ writer->op_get_global_id(tile_gid_1, 1);
+ writer->op_get_global_id(tile_gid_2, 2);
+
+ auto tile_cout0 = writer->declare_tile("cout0", ckw::TileInfo(ckw::DataType::Int32)); // OFM
+ auto tile_mout0 = writer->declare_tile("mout0", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH
+ auto tile_mout1 = writer->declare_tile("mout1", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT
+ auto tile_bout0 = writer->declare_tile("bout0", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX
+
+ // Calculate coordinates
+ get_coordinate_from_gws_overlapping_min(writer, tile_cout0, tile_gid_0, const_dst_n0_i32,
+ const_shift_back_dst_n0_i32, const_0_i32);
+ get_coordinate_from_gws(writer, tile_mout0, tile_gid_1, const_dst_m0_i32);
+ writer->op_binary(tile_mout1, ckw::BinaryOp::Mod, tile_gid_2, const_dst_h_i32);
+ writer->op_binary(tile_bout0, ckw::BinaryOp::Div, tile_gid_2, const_dst_h_i32);
+
+ /********************************************************************************
+ * 8 - Write the rest of the code
+ ********************************************************************************/
// A tile used to temporarily store results or as an accumulator in case of AVG and L2 pooling.
- const TileOperand &res_tile = writer->declare_tile("res_tile", TileInfo(to_ckw(acc_data_type), m0, n0));
+ auto tile_res = writer->declare_tile("tile_res", ckw::TileInfo(acc_dt, dst_m0, dst_n0));
// Initialise result tile with appropriate value
if (_attributes.pool_type() == PoolingType::MAX)
{
if (_settings.use_inf_as_limit())
{
- TileContainer minus_inf_tile_container;
- std::vector<std::string> value = std::vector<std::string>(n0, "(-INFINITY)");
- minus_inf_tile_container.push_back({value});
- const TileOperand &minus_inf =
- writer->declare_tile("minus_inf_const", minus_inf_tile_container, to_ckw(acc_data_type));
- writer->op_assign(res_tile, minus_inf);
+ writer->op_cast(tile_res, const_neg_inf_val_fp, ckw::ConvertPolicy::None);
}
else
{
- writer->op_assign(res_tile, const_lowest_value);
+ writer->op_cast(tile_res, const_lowest_val_fp, ckw::ConvertPolicy::None);
}
}
else
{
- writer->op_assign(res_tile, const_0);
+ writer->op_cast(tile_res, const_0_fp, ckw::ConvertPolicy::None);
}
- // idx_in_w = idx_out_w * STRIDE_X - PAD_X
- TileOperand &idx_in_w = writer->declare_tile("idx_in_w", ckw::DataType::Int32);
- writer->op_binary_expression(idx_in_w, idx_out_w, BinaryOp::Mul, stride_x_tile);
- writer->op_binary_expression(idx_in_w, idx_in_w, BinaryOp::Sub, pad_x_tile);
-
- // idx_in_h = idx_out_h * STRIDE_Y - PAD_Y
- TileOperand &idx_in_h = writer->declare_tile("idx_in_h", ckw::DataType::Int32);
- writer->op_binary_expression(idx_in_h, idx_out_h, BinaryOp::Mul, stride_y_tile);
- writer->op_binary_expression(idx_in_h, idx_in_h, BinaryOp::Sub, pad_y_tile);
-
- TileOperand &minus_idx_in_w = writer->declare_tile("minus_idx_in_w", ckw::DataType::Int32);
- TileOperand &minus_idx_in_h = writer->declare_tile("minus_idx_in_h", ckw::DataType::Int32);
-
- writer->op_unary_expression(minus_idx_in_w, UnaryOp::Negate, idx_in_w);
- writer->op_unary_expression(minus_idx_in_h, UnaryOp::Negate, idx_in_h);
-
- // Pooling starting/ending offsets for X dim
- TileOperand &pool_x_s = writer->declare_tile("pool_x_s", ckw::DataType::Int32);
- TileOperand &pool_x_e = writer->declare_tile("pool_x_e", ckw::DataType::Int32);
-
- writer->op_binary_elementwise_function(pool_x_s, BinaryFunction::Max, const_0, minus_idx_in_w);
- writer->op_binary_expression(pool_x_e, src_width_tile, BinaryOp::Add, minus_idx_in_w);
- writer->op_binary_elementwise_function(pool_x_e, BinaryFunction::Min, pool_size_x_tile, pool_x_e);
-
- // Pooling starting/ending offsets for Y dim
- TileOperand &pool_y_s = writer->declare_tile("pool_y_s", ckw::DataType::Int32);
- TileOperand &pool_y_e = writer->declare_tile("pool_y_e", ckw::DataType::Int32);
-
- writer->op_binary_elementwise_function(pool_y_s, BinaryFunction::Max, const_0, minus_idx_in_h);
- writer->op_binary_expression(pool_y_e, src_height_tile, BinaryOp::Add, minus_idx_in_h);
- writer->op_binary_elementwise_function(pool_y_e, BinaryFunction::Min, pool_size_y_tile, pool_y_e);
-
- const TileOperand &filter_size = writer->declare_tile("filter_size", ckw::DataType::Int32);
+ // tile_idx_in_w = tile_mout0 * STRIDE_X - PAD_X
+ auto tile_src_coord_x_start = writer->declare_tile("idx_in_w", ckw::DataType::Int32);
+ writer->op_binary(tile_src_coord_x_start, ckw::BinaryOp::Mul, tile_mout0, const_stride_x_i32);
+ writer->op_binary(tile_src_coord_x_start, ckw::BinaryOp::Sub, tile_src_coord_x_start, const_pad_x_i32);
+
+ // tile_idx_in_h = tile_mout1 * STRIDE_Y - PAD_Y
+ auto tile_src_coord_y_start = writer->declare_tile("idx_in_h", ckw::DataType::Int32);
+ writer->op_binary(tile_src_coord_y_start, ckw::BinaryOp::Mul, tile_mout1, const_stride_y_i32);
+ writer->op_binary(tile_src_coord_y_start, ckw::BinaryOp::Sub, tile_src_coord_y_start, const_pad_y_i32);
+
+ auto tile_neg_src_coord_x_start = writer->declare_tile("neg_src_coord_x_start", ckw::DataType::Int32);
+ auto tile_neg_src_coord_y_start = writer->declare_tile("neg_src_coord_y_start", ckw::DataType::Int32);
+
+ writer->op_binary(tile_neg_src_coord_x_start, ckw::BinaryOp::Sub, const_0_i32, tile_src_coord_x_start);
+ writer->op_binary(tile_neg_src_coord_y_start, ckw::BinaryOp::Sub, const_0_i32, tile_src_coord_y_start);
+
+ // int pool_x_s = max((int)0, -idx_in_w);
+ // int pool_x_e = min((int)POOL_SIZE_X, (int)SRC_WIDTH - idx_in_w);
+ // int pool_y_s = max((int)0, -idx_in_h);
+ // int pool_y_e = min((int)POOL_SIZE_Y, (int)SRC_HEIGHT - idx_in_h);
+ auto tile_pool_x_s = writer->declare_tile("pool_x_s", ckw::DataType::Int32);
+ auto tile_pool_y_s = writer->declare_tile("pool_y_s", ckw::DataType::Int32);
+ auto tile_pool_x_e = writer->declare_tile("pool_x_e", ckw::DataType::Int32);
+ auto tile_pool_y_e = writer->declare_tile("pool_y_e", ckw::DataType::Int32);
+
+ writer->op_binary(tile_pool_x_s, ckw::BinaryOp::Max, const_0_i32, tile_neg_src_coord_x_start);
+ writer->op_binary(tile_pool_x_e, ckw::BinaryOp::Add, const_src_w_i32, tile_neg_src_coord_x_start);
+ writer->op_binary(tile_pool_x_e, ckw::BinaryOp::Min, const_pool_sz_x_i32, tile_pool_x_e);
+ writer->op_binary(tile_pool_y_s, ckw::BinaryOp::Max, const_0_i32, tile_neg_src_coord_y_start);
+ writer->op_binary(tile_pool_y_e, ckw::BinaryOp::Add, const_src_h_i32, tile_neg_src_coord_y_start);
+ writer->op_binary(tile_pool_y_e, ckw::BinaryOp::Min, const_pool_sz_y_i32, tile_pool_y_e);
+
+ // #if defined(EXCLUDE_PADDING)
+ // int filter_size = (pool_y_e - pool_y_s) * (pool_x_e - pool_x_s);
+ // #else // defined(EXCLUDE_PADDING)
+ // int filter_size = POOL_SIZE_X * POOL_SIZE_Y;
+ // #endif // defined(EXCLUDE_PADDING)
+ auto tile_filter_size = writer->declare_tile("filter_size", ckw::DataType::Int32);
if (_attributes.exclude_padding())
{
- const TileOperand &y_diff = writer->declare_tile("y_diff", ckw::DataType::Int32);
- const TileOperand &x_diff = writer->declare_tile("x_diff", ckw::DataType::Int32);
-
- writer->op_binary_expression(y_diff, pool_y_e, BinaryOp::Sub, pool_y_s);
- writer->op_binary_expression(x_diff, pool_x_e, BinaryOp::Sub, pool_x_s);
+ auto tile_x_diff = writer->declare_tile("x_diff", ckw::DataType::Int32);
+ auto tile_y_diff = writer->declare_tile("y_diff", ckw::DataType::Int32);
- writer->op_binary_expression(filter_size, y_diff, BinaryOp::Mul, x_diff);
+ writer->op_binary(tile_x_diff, ckw::BinaryOp::Sub, tile_pool_x_e, tile_pool_x_s);
+ writer->op_binary(tile_y_diff, ckw::BinaryOp::Sub, tile_pool_y_e, tile_pool_y_s);
+ writer->op_binary(tile_filter_size, ckw::BinaryOp::Mul, tile_x_diff, tile_y_diff);
}
else
{
- writer->op_binary_expression(filter_size, pool_size_x_tile, BinaryOp::Mul, pool_size_y_tile);
+ writer->op_binary(tile_filter_size, ckw::BinaryOp::Mul, const_pool_sz_x_i32, const_pool_sz_y_i32);
}
- const TileOperand &x = writer->declare_tile("x", ckw::DataType::Int32);
- const TileOperand &y = writer->declare_tile("y", ckw::DataType::Int32);
+ auto tile_x = writer->declare_tile("x", ckw::DataType::Int32);
+ auto tile_y = writer->declare_tile("y", ckw::DataType::Int32);
if (is_global_pooling)
{
- writer->op_assign(x, const_0);
- writer->op_assign(y, const_0);
-
- writer->op_assign(pool_y_e, pool_size_y_tile);
- writer->op_assign(pool_x_e, pool_size_x_tile);
+ writer->op_assign(tile_y, const_0_i32);
+ writer->op_assign(tile_pool_y_e, const_pool_sz_y_i32);
}
else
{
- writer->op_assign(x, pool_x_s);
- writer->op_assign(y, pool_y_s);
+ writer->op_assign(tile_y, tile_pool_y_s);
}
// Y dim for-loop
writer->op_for_loop(
- y, BinaryOp::Less, pool_y_e, y, AssignmentOp::Increment, const_1,
+ tile_y, ckw::BinaryOp::Less, tile_pool_y_e, tile_y, ckw::AssignmentOp::Increment, const_pos_1_i32,
[&]()
{
// Reset the iterator for the inner loop
if (is_global_pooling)
{
- writer->op_assign(x, const_0);
+ writer->op_assign(tile_x, const_0_i32);
+ writer->op_assign(tile_pool_x_e, const_pool_sz_x_i32);
}
else
{
- writer->op_assign(x, pool_x_s);
+ writer->op_assign(tile_x, tile_pool_x_s);
}
- TileOperand &a_y = writer->declare_tile("a_y", ckw::DataType::Int32);
- writer->op_binary_expression(a_y, idx_in_h, BinaryOp::Add, y);
+ auto tile_src_coord_y = writer->declare_tile("src_coord_y", ckw::DataType::Int32);
+ writer->op_binary(tile_src_coord_y, ckw::BinaryOp::Add, tile_src_coord_y_start, tile_y);
// X dim for-loop
writer->op_for_loop(
- x, BinaryOp::Less, pool_x_e, x, AssignmentOp::Increment, const_1,
+ tile_x, ckw::BinaryOp::Less, tile_pool_x_e, tile_x, ckw::AssignmentOp::Increment, const_pos_1_i32,
[&]()
{
- TileOperand &a_x = writer->declare_tile("a_x", ckw::DataType::Int32);
- writer->op_binary_expression(a_x, idx_in_w, BinaryOp::Add, x);
-
- TileOperand &src_tile = writer->declare_tile("src_tile", TileInfo(to_ckw(acc_data_type), m0, n0));
+ auto tile_src_coord_x = writer->declare_tile("src_coord_x", ckw::DataType::Int32);
+ writer->op_binary(tile_src_coord_x, ckw::BinaryOp::Add, tile_src_coord_x_start, tile_x);
- src_sampler.y(a_x);
- src_sampler.z(a_y);
+ ckw::DataType src_dt = to_ckw(_src->data_type());
+ auto tile_src = writer->declare_tile("tile_src", ckw::TileInfo(acc_dt, dst_m0, dst_n0));
// Load src tile
- if (use_fp_mixed_precision)
+ if (is_wider_acc)
{
- TileOperand &src_uncasted_tile = writer->declare_tile("uncasted_src_tile", dst_tile_info);
- writer->op_load(src_uncasted_tile, src->tensor(), src_sampler);
- writer->op_cast_expression(src_tile, src_uncasted_tile, ckw::ConvertPolicy::None);
+ auto tile_src0 = writer->declare_tile("src_tile0", ckw::TileInfo(src_dt, dst_m0, dst_n0));
+ writer->op_load(tile_src0, src->tensor(), sampler_src, tile_cout0, tile_src_coord_x,
+ tile_src_coord_y, tile_bout0);
+ writer->op_cast(tile_src, tile_src0, ckw::ConvertPolicy::None);
}
else
{
- writer->op_load(src_tile, src->tensor(), src_sampler);
+ writer->op_load(tile_src, src->tensor(), sampler_src, tile_cout0, tile_src_coord_x,
+ tile_src_coord_y, tile_bout0);
}
// Take the square of the input, for L2 Pooling
if (_attributes.pool_type() == PoolingType::L2)
{
- writer->op_binary_expression(src_tile, src_tile, BinaryOp::Mul, src_tile);
+ writer->op_binary(tile_src, ckw::BinaryOp::Mul, tile_src, tile_src);
}
// Perfom Pooling op
if (_attributes.pool_type() == PoolingType::MAX)
{
- writer->op_binary_elementwise_function(res_tile, BinaryFunction::Max, res_tile, src_tile);
+ writer->op_binary(tile_res, ckw::BinaryOp::Max, tile_res, tile_src);
}
else
{
- writer->op_binary_expression(res_tile, res_tile, BinaryOp::Add, src_tile);
+ writer->op_binary(tile_res, ckw::BinaryOp::Add, tile_res, tile_src);
}
});
});
if ((_attributes.pool_type() == PoolingType::AVG) || (_attributes.pool_type() == PoolingType::L2))
{
- // filter_size is automatically broadcasted in the operation
- writer->op_binary_expression(res_tile, res_tile, BinaryOp::Div, filter_size);
+ // Filter_size is automatically broadcasted in the operation
+ auto tile_filter_size_fp = writer->declare_tile("filter_size_fp", ckw::TileInfo(acc_dt));
+ writer->op_cast(tile_filter_size_fp, tile_filter_size, ckw::ConvertPolicy::None);
+ writer->op_binary(tile_res, ckw::BinaryOp::Div, tile_res, tile_filter_size_fp);
}
// Take square root of the result in L2 pooling
if (_attributes.pool_type() == PoolingType::L2)
{
- writer->op_unary_elementwise_function(res_tile, UnaryFunction::Sqrt, res_tile);
+ writer->op_unary(tile_res, ckw::UnaryOp::Sqrt, tile_res);
}
- // Store the results and do casting if FP_MIXED_PRECISION
- if (use_fp_mixed_precision)
+ // Store the results and do casting if mixed precision
+ if (is_wider_acc)
{
- writer->op_cast_expression(dst_tile, res_tile, ckw::ConvertPolicy::None);
+ writer->op_cast(tile_dst, tile_res, ckw::ConvertPolicy::None);
}
else
{
- writer->op_assign(dst_tile, res_tile);
+ writer->op_assign(tile_dst, tile_res);
}
}
@@ -334,8 +385,8 @@ Window GpuCkwPool2d::get_window() const
{
ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
- TensorShape output_shape = _dst->tensor_shape();
- const unsigned int vec_size = adjust_vec_size(((_dst->data_type() == DataType::F32) ? 2 : 4), _dst->dimension(0));
+ TensorShape output_shape = _dst->tensor_shape();
+ const uint32_t vec_size = adjust_vec_size(((_dst->data_type() == DataType::F32) ? 2 : 4), _dst->dimension(0));
// Create and configure kernel window
auto win = calculate_max_window(output_shape, Steps(vec_size));
win = win.collapse_if_possible(win, Window::DimZ); // collapse window on batch size.
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.cpp
index f2a7d41afd..edd7ea9a38 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,19 +24,22 @@
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.h"
+#include "arm_compute/core/Error.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/Validate.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/core/utils/ScaleUtils.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
#include "support/StringSupport.h"
+#include <cstdint>
+
namespace arm_compute
{
namespace experimental
@@ -45,7 +48,7 @@ namespace dynamic_fusion
{
namespace
{
-constexpr unsigned int opencl_vector_size_in_bytes = 16;
+constexpr uint32_t opencl_vector_size_in_bytes = 16;
} // namespace
GpuCkwResize::GpuCkwResize(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes)
@@ -60,65 +63,141 @@ void GpuCkwResize::do_nearest_neighbor_resize(const ComponentGroup &comp_grou
GpuCkwVariableTable &vtable,
GpuCkwScopedKernelWriter writer) const
{
- const size_t width_idx = get_data_layout_dimension_index(_dst->data_layout(), DataLayoutDimension::WIDTH);
- const size_t height_idx = get_data_layout_dimension_index(_dst->data_layout(), DataLayoutDimension::HEIGHT);
-
- const Window root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
- const int32_t n0 = root_window.x().step();
- const int32_t m0 = root_window.y().step();
- const int32_t partial_n0 = _dst->dimension(0) % n0;
-
- GpuCkwComponentArgument *src =
- vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src");
- GpuCkwComponentArgument *dst =
- vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
-
- // Constants
+ const uint32_t width_idx = get_data_layout_dimension_index(_dst->data_layout(), DataLayoutDimension::WIDTH);
+ const uint32_t height_idx = get_data_layout_dimension_index(_dst->data_layout(), DataLayoutDimension::HEIGHT);
+
+ /********************************************************************************
+ * 1 - Define tensors
+ ********************************************************************************/
+ GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src");
+ GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
+
+ /********************************************************************************
+ * 2 - Define CKW constants
+ ********************************************************************************/
+ const auto dst_dt = to_ckw(_dst->data_type());
const float scale_x = scale_utils::calculate_resize_ratio(_src->dimension(width_idx), _dst->dimension(width_idx),
_attributes.align_corners());
const float scale_y = scale_utils::calculate_resize_ratio(_src->dimension(height_idx), _dst->dimension(height_idx),
_attributes.align_corners());
- const auto &tile_scale_x = writer->declare_tile("scale_x", scale_x);
- const auto &tile_scale_y = writer->declare_tile("scale_y", scale_y);
- const auto &tile_0 = writer->declare_tile("0", 0);
- const auto &tile_half = writer->declare_tile("half", 0.5f);
- const auto &tile_1 = writer->declare_tile("1", 1);
- const auto &tile_src_w = writer->declare_tile("src_w", static_cast<int32_t>(_src->dimension(width_idx)));
- const auto &tile_src_h = writer->declare_tile("src_h", static_cast<int32_t>(_src->dimension(height_idx)));
- const auto &tile_dst_h = writer->declare_tile("dst_h", static_cast<int32_t>(_dst->dimension(height_idx)));
-
- const auto &tile_gid_0 = writer->declare_tile("gid_0", ckw::DataType::Int32);
- const auto &tile_gid_1 = writer->declare_tile("gid_1", ckw::DataType::Int32);
- const auto &tile_gid_2 = writer->declare_tile("gid_2", ckw::DataType::Int32);
+ const auto src_w = static_cast<int32_t>(_src->dimension(width_idx));
+ const auto src_h = static_cast<int32_t>(_src->dimension(height_idx));
+ const auto dst_h = static_cast<int32_t>(_dst->dimension(height_idx));
+
+ // CKW constants
+ auto const_src_w_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_w}}, ckw::DataType::Int32));
+ auto const_src_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_h}}, ckw::DataType::Int32));
+ auto const_dst_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_h}}, ckw::DataType::Int32));
+ auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32));
+ auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32));
+ auto const_0_fp = writer->declare_constant_tile(ckw::ConstantData({{0.0f}}, dst_dt));
+ auto const_pos_0_5_fp = writer->declare_constant_tile(ckw::ConstantData({{0.5f}}, ckw::DataType::Fp32));
+ auto const_scale_x_fp = writer->declare_constant_tile(ckw::ConstantData({{scale_x}}, ckw::DataType::Fp32));
+ auto const_scale_y_fp = writer->declare_constant_tile(ckw::ConstantData({{scale_y}}, ckw::DataType::Fp32));
+
+ /********************************************************************************
+ * 3 - Define the compute block parameters and destination tile (if not root component)
+ * Bind the tile to the tensor to share it among different components and
+ * initialize the compute block parameters
+ ********************************************************************************/
+ // The n0 and m0 parameters from root_window only refers to the output
+ const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
+
+ // Destination compute block size
+ const int32_t dst_n0 = root_window.x().step();
+
+ // dst_m0 must be 1
+ ARM_COMPUTE_ERROR_ON(root_window.y().step() != 1);
+
+ // Destination compute block size left-over
+ const int32_t dst_n0_partial = _dst->dimension(0) % dst_n0;
+
+ // Shift-back for the overlapping-min strategy
+ const int32_t dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0;
+
+ ckw::TensorSampler sampler_dst;
+ sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+ if (dst_n0_partial == 0)
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin);
+ }
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+ // Declare destination tile
+ auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, 1, dst_n0));
+
+ // Initialize destination tile
+ writer->op_assign(tile_dst, const_0_fp);
+
+ // Bind tile to the tensor
+ dst->init_virtual_tensor(tile_dst, sampler_dst);
+
+ /********************************************************************************
+ * 4 - Define the compute block parameters CKW constants
+ ********************************************************************************/
+ auto const_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32));
+ auto const_shift_back_n0_i32 =
+ writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32));
+
+ /********************************************************************************
+ * 5 - Define the samplers for the input tensor
+ ********************************************************************************/
+ ckw::TensorSampler sampler_src;
+ sampler_src.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+ sampler_src.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ sampler_src.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ sampler_src.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+
+ /********************************************************************************
+ * 6 - Extra operations required before writing the main code
+ ********************************************************************************/
+
+ // ....
+
+ /********************************************************************************
+ * 7 - Get the coordinates of the destination tile
+ ********************************************************************************/
+ auto tile_gid_0 = writer->declare_tile("gid_0", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_1 = writer->declare_tile("gid_1", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_2 = writer->declare_tile("gid_2", ckw::TileInfo(ckw::DataType::Int32));
writer->op_get_global_id(tile_gid_0, 0);
writer->op_get_global_id(tile_gid_1, 1);
writer->op_get_global_id(tile_gid_2, 2);
- auto &tile_co = writer->declare_tile("co", ckw::DataType::Int32); // OFM
- auto &tile_xo = writer->declare_tile("xo", ckw::DataType::Int32); // WIDTH
- auto &tile_yo = writer->declare_tile("yo", ckw::DataType::Int32); // HEIGHT
- auto &tile_bo = writer->declare_tile("bo", ckw::DataType::Int32); // BATCH SIZE IDX
+ auto tile_co = writer->declare_tile("co", ckw::TileInfo(ckw::DataType::Int32)); // OFM
+ auto tile_xo = writer->declare_tile("xo", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH
+ auto tile_yo = writer->declare_tile("yo", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT
+ auto tile_bo = writer->declare_tile("bo", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX
- // Get the boundary aware coordinates at each global dimension index
- get_coord(writer, tile_co, tile_gid_0, n0, partial_n0, tile_co.name() + "_dim0_", tile_0);
- get_coord(writer, tile_xo, tile_gid_1, 1, 0, tile_xo.name() + "_dim1_", tile_0);
- get_coord(writer, tile_yo, tile_gid_2, 1, 0, tile_yo.name() + "_dim2_", tile_0);
- get_coord(writer, tile_bo, tile_gid_2, 1, 0, tile_yo.name() + "_dim3_", tile_0);
+ // Calculate coordinates
+ get_coordinate_from_gws_overlapping_min(writer, tile_co, tile_gid_0, const_n0_i32, const_shift_back_n0_i32,
+ const_0_i32);
+ writer->op_assign(tile_xo, tile_gid_1);
+ writer->op_binary(tile_yo, ckw::BinaryOp::Mod, tile_gid_2, const_dst_h_i32);
+ writer->op_binary(tile_bo, ckw::BinaryOp::Div, tile_gid_2, const_dst_h_i32);
- writer->op_binary_expression(tile_yo, tile_yo, BinaryOp::Mod, tile_dst_h);
- writer->op_binary_expression(tile_bo, tile_bo, BinaryOp::Div, tile_dst_h);
-
- const auto &tile_xi_f = writer->declare_tile("xi_f", ckw::DataType::Fp32);
- const auto &tile_yi_f = writer->declare_tile("yi_f", ckw::DataType::Fp32);
+ /********************************************************************************
+ * 8 - Write the rest of the code
+ ********************************************************************************/
+ auto tile_xi_f = writer->declare_tile("xi_f", ckw::DataType::Fp32);
+ auto tile_yi_f = writer->declare_tile("yi_f", ckw::DataType::Fp32);
switch (_attributes.sampling_policy())
{
case SamplingPolicy::TOP_LEFT:
// xi_f = (xo * scale_x)
// yi_f = (yo * scale_y)
- writer->op_binary_expression(tile_xi_f, tile_xo, BinaryOp::Mul, tile_scale_x);
- writer->op_binary_expression(tile_yi_f, tile_yo, BinaryOp::Mul, tile_scale_y);
+ writer->op_cast(tile_xi_f, tile_xo, ckw::ConvertPolicy::None);
+ writer->op_cast(tile_yi_f, tile_yo, ckw::ConvertPolicy::None);
+ writer->op_binary(tile_xi_f, ckw::BinaryOp::Mul, tile_xi_f, const_scale_x_fp);
+ writer->op_binary(tile_yi_f, ckw::BinaryOp::Mul, tile_yi_f, const_scale_y_fp);
break;
case SamplingPolicy::CENTER:
{
@@ -127,11 +206,12 @@ void GpuCkwResize::do_nearest_neighbor_resize(const ComponentGroup &comp_grou
const auto &tile_xo_plus_half = writer->declare_tile("xo_plus_half", ckw::DataType::Fp32);
const auto &tile_yo_plus_half = writer->declare_tile("yo_plus_half", ckw::DataType::Fp32);
- writer->op_binary_expression(tile_xo_plus_half, tile_xo, BinaryOp::Add, tile_half);
- writer->op_binary_expression(tile_yo_plus_half, tile_yo, BinaryOp::Add, tile_half);
-
- writer->op_binary_expression(tile_xi_f, tile_xo_plus_half, BinaryOp::Mul, tile_scale_x);
- writer->op_binary_expression(tile_yi_f, tile_yo_plus_half, BinaryOp::Mul, tile_scale_y);
+ writer->op_cast(tile_xo_plus_half, tile_xo, ckw::ConvertPolicy::None);
+ writer->op_cast(tile_yo_plus_half, tile_yo, ckw::ConvertPolicy::None);
+ writer->op_binary(tile_xo_plus_half, ckw::BinaryOp::Add, tile_xo_plus_half, const_pos_0_5_fp);
+ writer->op_binary(tile_yo_plus_half, ckw::BinaryOp::Add, tile_yo_plus_half, const_pos_0_5_fp);
+ writer->op_binary(tile_xi_f, ckw::BinaryOp::Mul, tile_xo_plus_half, const_scale_x_fp);
+ writer->op_binary(tile_yi_f, ckw::BinaryOp::Mul, tile_yo_plus_half, const_scale_y_fp);
}
break;
default:
@@ -140,63 +220,32 @@ void GpuCkwResize::do_nearest_neighbor_resize(const ComponentGroup &comp_grou
if (_attributes.align_corners())
{
- writer->op_unary_elementwise_function(tile_xi_f, UnaryFunction::Round, tile_xi_f);
- writer->op_unary_elementwise_function(tile_yi_f, UnaryFunction::Round, tile_yi_f);
+ writer->op_unary(tile_xi_f, ckw::UnaryOp::Round, tile_xi_f);
+ writer->op_unary(tile_yi_f, ckw::UnaryOp::Round, tile_yi_f);
}
// xi0 = clamp((int)xi_f, 0, (int)src_w - 1)
// yi0 = clamp((int)yi_f, 0, (int)src_h - 1)
- const auto &tile_xi_f_int = writer->declare_tile("xi_f_int", ckw::DataType::Int32);
- const auto &tile_yi_f_int = writer->declare_tile("yi_f_int", ckw::DataType::Int32);
-
- writer->op_cast_expression(tile_xi_f_int, tile_xi_f, ckw::ConvertPolicy::None);
- writer->op_cast_expression(tile_yi_f_int, tile_yi_f, ckw::ConvertPolicy::None);
-
- const auto &tile_src_w_minus_1 = writer->declare_tile("src_w_minus_1", ckw::DataType::Int32);
- const auto &tile_src_h_minus_1 = writer->declare_tile("src_h_minus_1", ckw::DataType::Int32);
-
- writer->op_binary_expression(tile_src_w_minus_1, tile_src_w, BinaryOp::Sub, tile_1);
- writer->op_binary_expression(tile_src_h_minus_1, tile_src_h, BinaryOp::Sub, tile_1);
-
- auto &tile_xi0 = writer->declare_tile("xi0", ckw::DataType::Int32);
- auto &tile_yi0 = writer->declare_tile("yi0", ckw::DataType::Int32);
-
- writer->op_ternary_elementwise_function(tile_xi0, TernaryFunction::Clamp, tile_xi_f_int, tile_0,
- tile_src_w_minus_1);
- writer->op_ternary_elementwise_function(tile_yi0, TernaryFunction::Clamp, tile_yi_f_int, tile_0,
- tile_src_h_minus_1);
-
- TensorTileSampler src_sampler;
- src_sampler.x(tile_co);
- src_sampler.y(tile_xi0);
- src_sampler.z(tile_yi0);
- src_sampler.b(tile_bo);
- src_sampler.height(m0);
- src_sampler.width(n0);
- // We guarantee to not have out-of-bounds accesses
- src_sampler.format(TensorSamplerFormat::C_W_H);
- src_sampler.address_mode_x(TensorSamplerAddressModeX::None);
- src_sampler.address_mode_y(TensorSamplerAddressModeY::None);
- src_sampler.address_mode_z(TensorSamplerAddressModeZ::None);
-
- writer->op_load_once(src, src_sampler);
- auto &tile_src = src->tile();
-
- TensorTileSampler dst_sampler;
- dst_sampler.x(tile_co);
- dst_sampler.y(tile_xo);
- dst_sampler.z(tile_yo);
- dst_sampler.b(tile_bo);
- dst_sampler.height(m0);
- dst_sampler.width(n0);
- dst_sampler.format(TensorSamplerFormat::C_W_H);
- // Do not write to the same memory location with multiple threads
- dst_sampler.address_mode_x(TensorSamplerAddressModeX::OverlappingMin);
- dst_sampler.address_mode_y(TensorSamplerAddressModeY::None);
- dst_sampler.address_mode_z(TensorSamplerAddressModeZ::None);
-
- auto &tile_dst = writer->declare_tile("dst", TileInfo(to_ckw(_dst->data_type()), m0, n0));
- dst->init_virtual_tensor(tile_dst, dst_sampler);
+ auto tile_xi_f_int = writer->declare_tile("xi_f_int", ckw::DataType::Int32);
+ auto tile_yi_f_int = writer->declare_tile("yi_f_int", ckw::DataType::Int32);
+
+ writer->op_cast(tile_xi_f_int, tile_xi_f, ckw::ConvertPolicy::None);
+ writer->op_cast(tile_yi_f_int, tile_yi_f, ckw::ConvertPolicy::None);
+
+ auto tile_src_w_minus_1 = writer->declare_tile("src_w_minus_1", ckw::DataType::Int32);
+ auto tile_src_h_minus_1 = writer->declare_tile("src_h_minus_1", ckw::DataType::Int32);
+
+ writer->op_binary(tile_src_w_minus_1, ckw::BinaryOp::Sub, const_src_w_i32, const_pos_1_i32);
+ writer->op_binary(tile_src_h_minus_1, ckw::BinaryOp::Sub, const_src_h_i32, const_pos_1_i32);
+
+ auto tile_xi0 = writer->declare_tile("xi0", ckw::DataType::Int32);
+ auto tile_yi0 = writer->declare_tile("yi0", ckw::DataType::Int32);
+
+ writer->op_ternary(tile_xi0, ckw::TernaryOp::Clamp, tile_xi_f_int, const_0_i32, tile_src_w_minus_1);
+ writer->op_ternary(tile_yi0, ckw::TernaryOp::Clamp, tile_yi_f_int, const_0_i32, tile_src_h_minus_1);
+
+ auto tile_src = writer->declare_tile("src_tile", ckw::TileInfo(dst_dt, 1, dst_n0));
+ writer->op_load(tile_src, src->tensor(), sampler_src, tile_co, tile_xi0, tile_yi0, tile_bo);
writer->op_assign(tile_dst, tile_src);
}
@@ -208,64 +257,139 @@ void GpuCkwResize::do_bilinear_resize(const ComponentGroup &comp_group,
const size_t width_idx = get_data_layout_dimension_index(_dst->data_layout(), DataLayoutDimension::WIDTH);
const size_t height_idx = get_data_layout_dimension_index(_dst->data_layout(), DataLayoutDimension::HEIGHT);
- const Window root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
- const int32_t n0 = root_window.x().step();
- const int32_t m0 = root_window.y().step();
- const int32_t partial_n0 = _dst->dimension(0) % n0;
+ /********************************************************************************
+ * 1 - Define tensors
+ ********************************************************************************/
+ GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src");
+ GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
- GpuCkwComponentArgument *src =
- vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src");
- GpuCkwComponentArgument *dst =
- vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
-
- // Constants
+ /********************************************************************************
+ * 2 - Define CKW constants
+ ********************************************************************************/
+ const auto dst_dt = to_ckw(_dst->data_type());
const float scale_x = scale_utils::calculate_resize_ratio(_src->dimension(width_idx), _dst->dimension(width_idx),
_attributes.align_corners());
const float scale_y = scale_utils::calculate_resize_ratio(_src->dimension(height_idx), _dst->dimension(height_idx),
_attributes.align_corners());
- const auto &tile_scale_x = writer->declare_tile("scale_x", scale_x);
- const auto &tile_scale_y = writer->declare_tile("scale_y", scale_y);
- const auto &tile_0 = writer->declare_tile("0", 0);
- const auto &tile_half = writer->declare_tile("half", 0.5f);
- const auto &tile_1 = writer->declare_tile("1", 1);
- const auto &tile_src_w = writer->declare_tile("src_w", static_cast<int32_t>(_src->dimension(width_idx)));
- const auto &tile_src_h = writer->declare_tile("src_h", static_cast<int32_t>(_src->dimension(height_idx)));
- const auto &tile_dst_h = writer->declare_tile("dst_h", static_cast<int32_t>(_dst->dimension(height_idx)));
-
- const auto &tile_gid_0 = writer->declare_tile("gid_0", ckw::DataType::Int32);
- const auto &tile_gid_1 = writer->declare_tile("gid_1", ckw::DataType::Int32);
- const auto &tile_gid_2 = writer->declare_tile("gid_2", ckw::DataType::Int32);
+ const auto src_w = static_cast<int32_t>(_src->dimension(width_idx));
+ const auto src_h = static_cast<int32_t>(_src->dimension(height_idx));
+ const auto dst_h = static_cast<int32_t>(_dst->dimension(height_idx));
+
+ // CKW constants
+ auto const_src_w_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_w}}, ckw::DataType::Int32));
+ auto const_src_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_h}}, ckw::DataType::Int32));
+ auto const_dst_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_h}}, ckw::DataType::Int32));
+ auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32));
+ auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32));
+ auto const_0_fp = writer->declare_constant_tile(ckw::ConstantData({{0.0f}}, dst_dt));
+ auto const_pos_1_fp = writer->declare_constant_tile(ckw::ConstantData({{1.0f}}, ckw::DataType::Fp32));
+ auto const_pos_0_5_fp = writer->declare_constant_tile(ckw::ConstantData({{0.5f}}, ckw::DataType::Fp32));
+ auto const_scale_x_fp = writer->declare_constant_tile(ckw::ConstantData({{scale_x}}, ckw::DataType::Fp32));
+ auto const_scale_y_fp = writer->declare_constant_tile(ckw::ConstantData({{scale_y}}, ckw::DataType::Fp32));
+
+ /********************************************************************************
+ * 3 - Define the compute block parameters and destination tile (if not root component)
+ * Bind the tile to the tensor to share it among different components and
+ * initialize the compute block parameters
+ ********************************************************************************/
+ // The n0 and m0 parameters from root_window only refers to the output
+ const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
+
+ // Destination compute block size
+ const int32_t dst_n0 = root_window.x().step();
+
+ // dst_m0 must be 1
+ ARM_COMPUTE_ERROR_ON(root_window.y().step() != 1);
+
+ // Destination compute block size left-over
+ const int32_t dst_n0_partial = _dst->dimension(0) % dst_n0;
+
+ // Shift-back for the overlapping-min strategy
+ const int32_t dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0;
+
+ ckw::TensorSampler sampler_dst;
+ sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+ if (dst_n0_partial == 0)
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin);
+ }
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+ // Declare destination tile
+ auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, 1, dst_n0));
+
+ // Initialize destination tile
+ writer->op_assign(tile_dst, const_0_fp);
+
+ // Bind tile to the tensor
+ dst->init_virtual_tensor(tile_dst, sampler_dst);
+
+ /********************************************************************************
+ * 4 - Define the compute block parameters CKW constants
+ ********************************************************************************/
+ auto const_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32));
+ auto const_shift_back_n0_i32 =
+ writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32));
+
+ /********************************************************************************
+ * 5 - Define the sampler for the input tensor
+ ********************************************************************************/
+ ckw::TensorSampler sampler_src;
+ sampler_src.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+ sampler_src.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ sampler_src.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ sampler_src.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+
+ /********************************************************************************
+ * 6 - Extra operations required before writing the main code
+ ********************************************************************************/
+
+ // ....
+
+ /********************************************************************************
+ * 7 - Get the coordinates of the destination tile
+ ********************************************************************************/
+ auto tile_gid_0 = writer->declare_tile("gid_0", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_1 = writer->declare_tile("gid_1", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_2 = writer->declare_tile("gid_2", ckw::TileInfo(ckw::DataType::Int32));
writer->op_get_global_id(tile_gid_0, 0);
writer->op_get_global_id(tile_gid_1, 1);
writer->op_get_global_id(tile_gid_2, 2);
- auto &tile_co = writer->declare_tile("co", ckw::DataType::Int32); // OFM
- auto &tile_xo = writer->declare_tile("xo", ckw::DataType::Int32); // WIDTH
- auto &tile_yo = writer->declare_tile("yo", ckw::DataType::Int32); // HEIGHT
- auto &tile_bo = writer->declare_tile("bo", ckw::DataType::Int32); // BATCH SIZE IDX
+ auto tile_co = writer->declare_tile("co", ckw::TileInfo(ckw::DataType::Int32)); // OFM
+ auto tile_xo = writer->declare_tile("xo", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH
+ auto tile_yo = writer->declare_tile("yo", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT
+ auto tile_bo = writer->declare_tile("bo", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX
- // Get the boundary aware coordinates at each global dimension index
- get_coord(writer, tile_co, tile_gid_0, n0, partial_n0, tile_co.name() + "_dim0_", tile_0);
- get_coord(writer, tile_xo, tile_gid_1, 1, 0, tile_xo.name() + "_dim1_", tile_0);
- get_coord(writer, tile_yo, tile_gid_2, 1, 0, tile_yo.name() + "_dim2_", tile_0);
- get_coord(writer, tile_bo, tile_gid_2, 1, 0, tile_yo.name() + "_dim3_", tile_0);
+ // Calculate coordinates
+ get_coordinate_from_gws_overlapping_min(writer, tile_co, tile_gid_0, const_n0_i32, const_shift_back_n0_i32,
+ const_0_i32);
+ writer->op_assign(tile_xo, tile_gid_1);
+ writer->op_binary(tile_yo, ckw::BinaryOp::Mod, tile_gid_2, const_dst_h_i32);
+ writer->op_binary(tile_bo, ckw::BinaryOp::Div, tile_gid_2, const_dst_h_i32);
- // yo = coord_dim2 % dst_h
- // bo = coord_dim2 / dst_h
- writer->op_binary_expression(tile_yo, tile_yo, BinaryOp::Mod, tile_dst_h);
- writer->op_binary_expression(tile_bo, tile_bo, BinaryOp::Div, tile_dst_h);
-
- const auto &tile_xi_f = writer->declare_tile("xi_f", ckw::DataType::Fp32);
- const auto &tile_yi_f = writer->declare_tile("yi_f", ckw::DataType::Fp32);
+ /********************************************************************************
+ * 8 - Write the rest of the code
+ ********************************************************************************/
+ auto tile_xi_f = writer->declare_tile("xi_f", ckw::DataType::Fp32);
+ auto tile_yi_f = writer->declare_tile("yi_f", ckw::DataType::Fp32);
switch (_attributes.sampling_policy())
{
case SamplingPolicy::TOP_LEFT:
// xi_f = (xo * scale_x)
// yi_f = (yo * scale_y)
- writer->op_binary_expression(tile_xi_f, tile_xo, BinaryOp::Mul, tile_scale_x);
- writer->op_binary_expression(tile_yi_f, tile_yo, BinaryOp::Mul, tile_scale_y);
+ writer->op_cast(tile_xi_f, tile_xo, ckw::ConvertPolicy::None);
+ writer->op_cast(tile_yi_f, tile_yo, ckw::ConvertPolicy::None);
+ writer->op_binary(tile_xi_f, ckw::BinaryOp::Mul, tile_xi_f, const_scale_x_fp);
+ writer->op_binary(tile_yi_f, ckw::BinaryOp::Mul, tile_yi_f, const_scale_y_fp);
break;
case SamplingPolicy::CENTER:
{
@@ -273,14 +397,16 @@ void GpuCkwResize::do_bilinear_resize(const ComponentGroup &comp_group,
// yi_f = ((yo + 0.5f) * scale_y - 0.5f)
const auto &tile_xo_plus_half = writer->declare_tile("xo_plus_half", ckw::DataType::Fp32);
const auto &tile_yo_plus_half = writer->declare_tile("yo_plus_half", ckw::DataType::Fp32);
- writer->op_binary_expression(tile_xo_plus_half, tile_xo, BinaryOp::Add, tile_half);
- writer->op_binary_expression(tile_yo_plus_half, tile_yo, BinaryOp::Add, tile_half);
- writer->op_binary_expression(tile_xi_f, tile_xo_plus_half, BinaryOp::Mul, tile_scale_x);
- writer->op_binary_expression(tile_yi_f, tile_yo_plus_half, BinaryOp::Mul, tile_scale_y);
+ writer->op_cast(tile_xo_plus_half, tile_xo, ckw::ConvertPolicy::None);
+ writer->op_cast(tile_yo_plus_half, tile_yo, ckw::ConvertPolicy::None);
+ writer->op_binary(tile_xo_plus_half, ckw::BinaryOp::Add, tile_xo_plus_half, const_pos_0_5_fp);
+ writer->op_binary(tile_yo_plus_half, ckw::BinaryOp::Add, tile_yo_plus_half, const_pos_0_5_fp);
+ writer->op_binary(tile_xi_f, ckw::BinaryOp::Mul, tile_xo_plus_half, const_scale_x_fp);
+ writer->op_binary(tile_yi_f, ckw::BinaryOp::Mul, tile_yo_plus_half, const_scale_y_fp);
- writer->op_binary_expression(tile_xi_f, tile_xi_f, BinaryOp::Sub, tile_half);
- writer->op_binary_expression(tile_yi_f, tile_yi_f, BinaryOp::Sub, tile_half);
+ writer->op_binary(tile_xi_f, ckw::BinaryOp::Sub, tile_xi_f, const_pos_0_5_fp);
+ writer->op_binary(tile_yi_f, ckw::BinaryOp::Sub, tile_yi_f, const_pos_0_5_fp);
}
break;
default:
@@ -289,186 +415,102 @@ void GpuCkwResize::do_bilinear_resize(const ComponentGroup &comp_group,
// xi = (int)floor(xi_f);
// yi = (int)floor(yi_f);
- const auto &tile_xi_f_floor = writer->declare_tile("xi_f_floor", ckw::DataType::Fp32);
- const auto &tile_yi_f_floor = writer->declare_tile("yi_f_floor", ckw::DataType::Fp32);
- writer->op_unary_elementwise_function(tile_xi_f_floor, UnaryFunction::Floor, tile_xi_f);
- writer->op_unary_elementwise_function(tile_yi_f_floor, UnaryFunction::Floor, tile_yi_f);
+ auto tile_xi_f_floor = writer->declare_tile("xi_f_floor", ckw::DataType::Fp32);
+ auto tile_yi_f_floor = writer->declare_tile("yi_f_floor", ckw::DataType::Fp32);
+ writer->op_unary(tile_xi_f_floor, ckw::UnaryOp::Floor, tile_xi_f);
+ writer->op_unary(tile_yi_f_floor, ckw::UnaryOp::Floor, tile_yi_f);
- const auto &tile_xi = writer->declare_tile("xi", ckw::DataType::Int32);
- const auto &tile_yi = writer->declare_tile("yi", ckw::DataType::Int32);
- writer->op_cast_expression(tile_xi, tile_xi_f_floor, ckw::ConvertPolicy::None);
- writer->op_cast_expression(tile_yi, tile_yi_f_floor, ckw::ConvertPolicy::None);
+ auto tile_xi = writer->declare_tile("xi", ckw::DataType::Int32);
+ auto tile_yi = writer->declare_tile("yi", ckw::DataType::Int32);
+ writer->op_cast(tile_xi, tile_xi_f_floor, ckw::ConvertPolicy::None);
+ writer->op_cast(tile_yi, tile_yi_f_floor, ckw::ConvertPolicy::None);
// xi0 = clamp(xi, 0, (int)src_w - 1);
// yi0 = clamp(yi, 0, (int)src_h - 1);
// xi1 = clamp(xi + 1, 0, (int)src_w - 1);
// yi1 = clamp(yi + 1, 0, (int)src_h - 1);
- const auto &tile_src_w_minus_1 = writer->declare_tile("src_w_minus_1", ckw::DataType::Int32);
- const auto &tile_src_h_minus_1 = writer->declare_tile("src_h_minus_1", ckw::DataType::Int32);
- writer->op_binary_expression(tile_src_w_minus_1, tile_src_w, BinaryOp::Sub, tile_1);
- writer->op_binary_expression(tile_src_h_minus_1, tile_src_h, BinaryOp::Sub, tile_1);
-
- const auto &tile_xi_plus_1 = writer->declare_tile("xi_plus_1", ckw::DataType::Int32);
- const auto &tile_yi_plus_1 = writer->declare_tile("yi_plus_1", ckw::DataType::Int32);
- writer->op_binary_expression(tile_xi_plus_1, tile_xi, BinaryOp::Add, tile_1);
- writer->op_binary_expression(tile_yi_plus_1, tile_yi, BinaryOp::Add, tile_1);
-
- auto &tile_xi0 = writer->declare_tile("xi0", ckw::DataType::Int32);
- auto &tile_yi0 = writer->declare_tile("yi0", ckw::DataType::Int32);
- auto &tile_xi1 = writer->declare_tile("xi1", ckw::DataType::Int32);
- auto &tile_yi1 = writer->declare_tile("yi1", ckw::DataType::Int32);
-
- writer->op_ternary_elementwise_function(tile_xi0, TernaryFunction::Clamp, tile_xi, tile_0, tile_src_w_minus_1);
- writer->op_ternary_elementwise_function(tile_yi0, TernaryFunction::Clamp, tile_yi, tile_0, tile_src_h_minus_1);
- writer->op_ternary_elementwise_function(tile_xi1, TernaryFunction::Clamp, tile_xi_plus_1, tile_0,
- tile_src_w_minus_1);
- writer->op_ternary_elementwise_function(tile_yi1, TernaryFunction::Clamp, tile_yi_plus_1, tile_0,
- tile_src_h_minus_1);
-
- TensorTileSampler in_sampler;
- in_sampler.x(tile_co);
- in_sampler.b(tile_bo);
- in_sampler.height(1);
- in_sampler.width(n0);
- // We guarantee to not have out-of-bounds accesses
- in_sampler.format(TensorSamplerFormat::C_W_H);
- in_sampler.address_mode_x(TensorSamplerAddressModeX::None);
- in_sampler.address_mode_y(TensorSamplerAddressModeY::None);
- in_sampler.address_mode_z(TensorSamplerAddressModeZ::None);
-
- TensorTileSampler in00_sampler = in_sampler;
- in00_sampler.y(tile_xi0);
- in00_sampler.z(tile_yi0);
-
- TensorTileSampler in01_sampler = in_sampler;
- in01_sampler.y(tile_xi1);
- in01_sampler.z(tile_yi0);
-
- TensorTileSampler in10_sampler = in_sampler;
- in10_sampler.y(tile_xi0);
- in10_sampler.z(tile_yi1);
-
- TensorTileSampler in11_sampler = in_sampler;
- in11_sampler.y(tile_xi1);
- in11_sampler.z(tile_yi1);
-
- auto &tile_in00 = writer->declare_tile("in00", TileInfo(to_ckw(_src->data_type()), 1, n0));
- auto &tile_in01 = writer->declare_tile("in01", TileInfo(to_ckw(_src->data_type()), 1, n0));
- auto &tile_in10 = writer->declare_tile("in10", TileInfo(to_ckw(_src->data_type()), 1, n0));
- auto &tile_in11 = writer->declare_tile("in11", TileInfo(to_ckw(_src->data_type()), 1, n0));
-
- writer->op_load(tile_in00, src->tensor(), in00_sampler);
- writer->op_load(tile_in01, src->tensor(), in01_sampler);
- writer->op_load(tile_in10, src->tensor(), in10_sampler);
- writer->op_load(tile_in11, src->tensor(), in11_sampler);
-
- TensorTileSampler dst_sampler;
- dst_sampler.x(tile_co);
- dst_sampler.y(tile_xo);
- dst_sampler.z(tile_yo);
- dst_sampler.b(tile_bo);
- dst_sampler.height(m0);
- dst_sampler.width(n0);
- dst_sampler.format(TensorSamplerFormat::C_W_H);
- // Do not write to the same memory location with multiple threads
- dst_sampler.address_mode_x(TensorSamplerAddressModeX::OverlappingMin);
- dst_sampler.address_mode_y(TensorSamplerAddressModeY::None);
- dst_sampler.address_mode_z(TensorSamplerAddressModeZ::None);
-
- auto &tile_dst = writer->declare_tile("dst", TileInfo(to_ckw(_dst->data_type()), m0, n0));
- dst->init_virtual_tensor(tile_dst, dst_sampler);
+ auto tile_src_w_minus_1 = writer->declare_tile("src_w_minus_1", ckw::DataType::Int32);
+ auto tile_src_h_minus_1 = writer->declare_tile("src_h_minus_1", ckw::DataType::Int32);
+ writer->op_binary(tile_src_w_minus_1, ckw::BinaryOp::Sub, const_src_w_i32, const_pos_1_i32);
+ writer->op_binary(tile_src_h_minus_1, ckw::BinaryOp::Sub, const_src_h_i32, const_pos_1_i32);
+
+ auto tile_xi_plus_1 = writer->declare_tile("xi_plus_1", ckw::DataType::Int32);
+ auto tile_yi_plus_1 = writer->declare_tile("yi_plus_1", ckw::DataType::Int32);
+ writer->op_binary(tile_xi_plus_1, ckw::BinaryOp::Add, tile_xi, const_pos_1_i32);
+ writer->op_binary(tile_yi_plus_1, ckw::BinaryOp::Add, tile_yi, const_pos_1_i32);
+
+ auto tile_xi0 = writer->declare_tile("xi0", ckw::DataType::Int32);
+ auto tile_yi0 = writer->declare_tile("yi0", ckw::DataType::Int32);
+ auto tile_xi1 = writer->declare_tile("xi1", ckw::DataType::Int32);
+ auto tile_yi1 = writer->declare_tile("yi1", ckw::DataType::Int32);
+
+ writer->op_ternary(tile_xi0, ckw::TernaryOp::Clamp, tile_xi, const_0_i32, tile_src_w_minus_1);
+ writer->op_ternary(tile_yi0, ckw::TernaryOp::Clamp, tile_yi, const_0_i32, tile_src_h_minus_1);
+ writer->op_ternary(tile_xi1, ckw::TernaryOp::Clamp, tile_xi_plus_1, const_0_i32, tile_src_w_minus_1);
+ writer->op_ternary(tile_yi1, ckw::TernaryOp::Clamp, tile_yi_plus_1, const_0_i32, tile_src_h_minus_1);
+
+ auto tile_in00 = writer->declare_tile("in00", ckw::TileInfo(dst_dt, 1, dst_n0));
+ auto tile_in01 = writer->declare_tile("in01", ckw::TileInfo(dst_dt, 1, dst_n0));
+ auto tile_in10 = writer->declare_tile("in10", ckw::TileInfo(dst_dt, 1, dst_n0));
+ auto tile_in11 = writer->declare_tile("in11", ckw::TileInfo(dst_dt, 1, dst_n0));
+
+ writer->op_load(tile_in00, src->tensor(), sampler_src, tile_co, tile_xi0, tile_yi0, tile_bo);
+ writer->op_load(tile_in01, src->tensor(), sampler_src, tile_co, tile_xi1, tile_yi0, tile_bo);
+ writer->op_load(tile_in10, src->tensor(), sampler_src, tile_co, tile_xi0, tile_yi1, tile_bo);
+ writer->op_load(tile_in11, src->tensor(), sampler_src, tile_co, tile_xi1, tile_yi1, tile_bo);
// Weights of each nearest pixel
- const auto &tile_a = writer->declare_tile("a", ckw::DataType::Fp32);
- const auto &tile_b = writer->declare_tile("b", ckw::DataType::Fp32);
- const auto &tile_a1 = writer->declare_tile("a1", ckw::DataType::Fp32);
- const auto &tile_b1 = writer->declare_tile("b1", ckw::DataType::Fp32);
+ auto tile_a = writer->declare_tile("a", ckw::DataType::Fp32);
+ auto tile_b = writer->declare_tile("b", ckw::DataType::Fp32);
+ auto tile_a1 = writer->declare_tile("a1", ckw::DataType::Fp32);
+ auto tile_b1 = writer->declare_tile("b1", ckw::DataType::Fp32);
// a = (xi_f - (float)xi)
// b = (1.f - a)
// a1 = (yi_f - (float)yi)
// b1 = (1.f - a1)
- const auto &tile_xi_float = writer->declare_tile("xi_float", ckw::DataType::Fp32);
- const auto &tile_yi_float = writer->declare_tile("yi_float", ckw::DataType::Fp32);
- writer->op_cast_expression(tile_xi_float, tile_xi, ckw::ConvertPolicy::None);
- writer->op_cast_expression(tile_yi_float, tile_yi, ckw::ConvertPolicy::None);
-
- writer->op_binary_expression(tile_a, tile_xi_f, BinaryOp::Sub, tile_xi_float);
- writer->op_binary_expression(tile_b, tile_1, BinaryOp::Sub, tile_a);
- writer->op_binary_expression(tile_a1, tile_yi_f, BinaryOp::Sub, tile_yi_float);
- writer->op_binary_expression(tile_b1, tile_1, BinaryOp::Sub, tile_a1);
-
- if (is_data_type_float(_src->data_type()))
- {
- // Cast weights to source type
- const auto &tile_a_src_type = writer->declare_tile("a_src_t", to_ckw(_src->data_type()));
- const auto &tile_b_src_type = writer->declare_tile("b_src_t", to_ckw(_src->data_type()));
- const auto &tile_a1_src_type = writer->declare_tile("a1_src_t", to_ckw(_src->data_type()));
- const auto &tile_b1_src_type = writer->declare_tile("b1_src_t", to_ckw(_src->data_type()));
-
- writer->op_cast_expression(tile_a_src_type, tile_a, ckw::ConvertPolicy::None);
- writer->op_cast_expression(tile_b_src_type, tile_b, ckw::ConvertPolicy::None);
- writer->op_cast_expression(tile_a1_src_type, tile_a1, ckw::ConvertPolicy::None);
- writer->op_cast_expression(tile_b1_src_type, tile_b1, ckw::ConvertPolicy::None);
-
- // in00 * b * b1
- writer->op_binary_expression(tile_in00, tile_in00, BinaryOp::Mul, tile_b_src_type);
- writer->op_binary_expression(tile_in00, tile_in00, BinaryOp::Mul, tile_b1_src_type);
-
- // in01 * a * b1
- writer->op_binary_expression(tile_in01, tile_in01, BinaryOp::Mul, tile_a_src_type);
- writer->op_binary_expression(tile_in01, tile_in01, BinaryOp::Mul, tile_b1_src_type);
-
- // in10 * b * a1
- writer->op_binary_expression(tile_in10, tile_in10, BinaryOp::Mul, tile_b_src_type);
- writer->op_binary_expression(tile_in10, tile_in10, BinaryOp::Mul, tile_a1_src_type);
-
- // in11 * a * a1
- writer->op_binary_expression(tile_in11, tile_in11, BinaryOp::Mul, tile_a_src_type);
- writer->op_binary_expression(tile_in11, tile_in11, BinaryOp::Mul, tile_a1_src_type);
-
- // Summation of above terms
- writer->op_assign(tile_dst, tile_in00);
- writer->op_binary_expression(tile_dst, tile_dst, BinaryOp::Add, tile_in01);
- writer->op_binary_expression(tile_dst, tile_dst, BinaryOp::Add, tile_in10);
- writer->op_binary_expression(tile_dst, tile_dst, BinaryOp::Add, tile_in11);
- }
- else
- {
- // Cast to float
- const auto &tile_in00_f = writer->declare_tile("in00_f", TileInfo(ckw::DataType::Fp32, 1, n0));
- const auto &tile_in01_f = writer->declare_tile("in01_f", TileInfo(ckw::DataType::Fp32, 1, n0));
- const auto &tile_in10_f = writer->declare_tile("in10_f", TileInfo(ckw::DataType::Fp32, 1, n0));
- const auto &tile_in11_f = writer->declare_tile("in11_f", TileInfo(ckw::DataType::Fp32, 1, n0));
- writer->op_cast_expression(tile_in00_f, tile_in00, ckw::ConvertPolicy::None);
- writer->op_cast_expression(tile_in01_f, tile_in01, ckw::ConvertPolicy::None);
- writer->op_cast_expression(tile_in10_f, tile_in10, ckw::ConvertPolicy::None);
- writer->op_cast_expression(tile_in11_f, tile_in11, ckw::ConvertPolicy::None);
-
- // in00 * b * b1
- writer->op_binary_expression(tile_in00_f, tile_in00_f, BinaryOp::Mul, tile_b);
- writer->op_binary_expression(tile_in00_f, tile_in00_f, BinaryOp::Mul, tile_b1);
-
- // in01 * a * b1
- writer->op_binary_expression(tile_in01_f, tile_in01_f, BinaryOp::Mul, tile_a);
- writer->op_binary_expression(tile_in01_f, tile_in01_f, BinaryOp::Mul, tile_b1);
-
- // in10 * b * a1
- writer->op_binary_expression(tile_in10_f, tile_in10_f, BinaryOp::Mul, tile_b);
- writer->op_binary_expression(tile_in10_f, tile_in10_f, BinaryOp::Mul, tile_a1);
-
- // in11 * a * a1
- writer->op_binary_expression(tile_in11_f, tile_in11_f, BinaryOp::Mul, tile_a);
- writer->op_binary_expression(tile_in11_f, tile_in11_f, BinaryOp::Mul, tile_a1);
-
- // Summation of above terms
- writer->op_binary_expression(tile_in00_f, tile_in00_f, BinaryOp::Add, tile_in01_f);
- writer->op_binary_expression(tile_in00_f, tile_in00_f, BinaryOp::Add, tile_in10_f);
- writer->op_binary_expression(tile_in00_f, tile_in00_f, BinaryOp::Add, tile_in11_f);
-
- // Cast to destination type with saturation
- writer->op_cast_expression(tile_dst, tile_in00_f, ckw::ConvertPolicy::Saturate);
- }
+ auto tile_xi_float = writer->declare_tile("xi_float", ckw::DataType::Fp32);
+ auto tile_yi_float = writer->declare_tile("yi_float", ckw::DataType::Fp32);
+ writer->op_cast(tile_xi_float, tile_xi, ckw::ConvertPolicy::None);
+ writer->op_cast(tile_yi_float, tile_yi, ckw::ConvertPolicy::None);
+
+ writer->op_binary(tile_a, ckw::BinaryOp::Sub, tile_xi_f, tile_xi_float);
+ writer->op_binary(tile_b, ckw::BinaryOp::Sub, const_pos_1_fp, tile_a);
+ writer->op_binary(tile_a1, ckw::BinaryOp::Sub, tile_yi_f, tile_yi_float);
+ writer->op_binary(tile_b1, ckw::BinaryOp::Sub, const_pos_1_fp, tile_a1);
+
+ // Cast weights to source type
+ const auto &tile_a_src_type = writer->declare_tile("a_src_t", to_ckw(_src->data_type()));
+ const auto &tile_b_src_type = writer->declare_tile("b_src_t", to_ckw(_src->data_type()));
+ const auto &tile_a1_src_type = writer->declare_tile("a1_src_t", to_ckw(_src->data_type()));
+ const auto &tile_b1_src_type = writer->declare_tile("b1_src_t", to_ckw(_src->data_type()));
+
+ writer->op_cast(tile_a_src_type, tile_a, ckw::ConvertPolicy::None);
+ writer->op_cast(tile_b_src_type, tile_b, ckw::ConvertPolicy::None);
+ writer->op_cast(tile_a1_src_type, tile_a1, ckw::ConvertPolicy::None);
+ writer->op_cast(tile_b1_src_type, tile_b1, ckw::ConvertPolicy::None);
+
+ // in00 * b * b1
+ writer->op_binary(tile_in00, ckw::BinaryOp::Mul, tile_in00, tile_b_src_type);
+ writer->op_binary(tile_in00, ckw::BinaryOp::Mul, tile_in00, tile_b1_src_type);
+
+ // in01 * a * b1
+ writer->op_binary(tile_in01, ckw::BinaryOp::Mul, tile_in01, tile_a_src_type);
+ writer->op_binary(tile_in01, ckw::BinaryOp::Mul, tile_in01, tile_b1_src_type);
+
+ // in10 * b * a1
+ writer->op_binary(tile_in10, ckw::BinaryOp::Mul, tile_in10, tile_b_src_type);
+ writer->op_binary(tile_in10, ckw::BinaryOp::Mul, tile_in10, tile_a1_src_type);
+
+ // in11 * a * a1
+ writer->op_binary(tile_in11, ckw::BinaryOp::Mul, tile_in11, tile_a_src_type);
+ writer->op_binary(tile_in11, ckw::BinaryOp::Mul, tile_in11, tile_a1_src_type);
+
+ // Summation of above terms
+ writer->op_assign(tile_dst, tile_in00);
+ writer->op_binary(tile_dst, ckw::BinaryOp::Add, tile_dst, tile_in01);
+ writer->op_binary(tile_dst, ckw::BinaryOp::Add, tile_dst, tile_in10);
+ writer->op_binary(tile_dst, ckw::BinaryOp::Add, tile_dst, tile_in11);
}
void GpuCkwResize::write_component_code(const ComponentGroup &comp_group,
@@ -492,8 +534,8 @@ Window GpuCkwResize::get_window() const
{
ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
- const unsigned int n0 = adjust_vec_size(opencl_vector_size_in_bytes / _src->element_size(), _src->dimension(0));
- Window win = calculate_max_window(*_dst, Steps(n0));
+ const uint32_t n0 = adjust_vec_size(opencl_vector_size_in_bytes / _src->element_size(), _src->dimension(0));
+ Window win = calculate_max_window(*_dst, Steps(n0));
return win.collapse(win, Window::DimZ);
}
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp
index 889706b0c0..d9d741fea5 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,11 +25,11 @@
#include "arm_compute/core/Error.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include <cstdint>
#include <string>
namespace arm_compute
@@ -48,14 +48,90 @@ void GpuCkwStore::write_component_code(const ComponentGroup &comp_group,
GpuCkwVariableTable &vtable,
GpuCkwScopedKernelWriter writer) const
{
- auto src = vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src");
- auto dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
+ /********************************************************************************
+ * 1 - Define tensors
+ ********************************************************************************/
+ GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src");
+ GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
- auto &src_tile = src->tile();
- const auto &sampler = src->tile_sampler();
- auto &dst_tensor = dst->tensor();
+ /********************************************************************************
+ * 2 - Define CKW constants
+ ********************************************************************************/
+ const auto dst_h = static_cast<int32_t>(_dst->dimension(2));
- writer->op_store(dst_tensor, src_tile, sampler);
+ auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32));
+ auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32));
+ auto const_dst_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_h}}, ckw::DataType::Int32));
+
+ /********************************************************************************
+ * 3 - Define the compute block parameters and destination tile (if not root component)
+ * Bind the tile to the tensor to share it among different components and
+ * initialize the compute block parameters
+ ********************************************************************************/
+ const auto &tile_src = src->tile();
+ auto &sampler_src = src->tensor_sampler();
+
+ const auto dst_n0 = static_cast<int32_t>(tile_src.tile_info().width());
+ const auto dst_m0 = static_cast<int32_t>(tile_src.tile_info().height());
+ const int32_t dst_n0_partial = _dst->dimension(0) % dst_n0;
+ const int32_t dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0;
+
+ /********************************************************************************
+ * 4 - Define the compute block parameters CKW constants
+ ********************************************************************************/
+ auto const_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32));
+ auto const_m0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_m0}}, ckw::DataType::Int32));
+ auto const_shift_back_n0_i32 =
+ writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32));
+
+ /********************************************************************************
+ * 5 - Define the samplers for the input tensor
+ ********************************************************************************/
+ // Not required
+
+ /********************************************************************************
+ * 6 - Extra operations required before writing the main code
+ ********************************************************************************/
+ // Not required
+
+ /********************************************************************************
+ * 7 - Get the coordinates of the destination tile
+ ********************************************************************************/
+ auto tile_gid_0 = writer->declare_tile("gid_0", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_1 = writer->declare_tile("gid_1", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_2 = writer->declare_tile("gid_2", ckw::TileInfo(ckw::DataType::Int32));
+
+ writer->op_get_global_id(tile_gid_0, 0);
+ writer->op_get_global_id(tile_gid_1, 1);
+ writer->op_get_global_id(tile_gid_2, 2);
+
+ auto tile_nout0 = writer->declare_tile("cout0", ckw::TileInfo(ckw::DataType::Int32)); // OFM
+ auto tile_mout0 = writer->declare_tile("mout0", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH or WIDTH x HEIGHT
+ auto tile_mout1 = writer->declare_tile("mout1", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT or 0
+ auto tile_bout0 = writer->declare_tile("bout0", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX
+
+ // Calculate coordinates
+ get_coordinate_from_gws_overlapping_min(writer, tile_nout0, tile_gid_0, const_n0_i32, const_shift_back_n0_i32,
+ const_0_i32);
+ get_coordinate_from_gws(writer, tile_mout0, tile_gid_1, const_m0_i32);
+
+ // Get the boundary aware coordinates at each global dimension index
+ if (sampler_src.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1)
+ {
+ writer->op_assign(tile_mout1, const_0_i32);
+ get_coordinate_from_gws(writer, tile_bout0, tile_gid_2, const_pos_1_i32);
+ }
+ else if (sampler_src.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2)
+ {
+ // For tile_mout1 and tile_bout0 the step can only be 1
+ writer->op_binary(tile_mout1, ckw::BinaryOp::Mod, tile_gid_2, const_dst_h_i32);
+ writer->op_binary(tile_bout0, ckw::BinaryOp::Div, tile_gid_2, const_dst_h_i32);
+ }
+
+ /********************************************************************************
+ * 8 - Write the rest of the code
+ ********************************************************************************/
+ writer->op_store(dst->tensor(), tile_src, sampler_src, tile_nout0, tile_mout0, tile_mout1, tile_bout0);
}
std::string GpuCkwStore::get_name(const ComponentGroup &comp_group) const
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.cpp
index 5f8ce919e3..1e6f0841ad 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,13 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
-
-#include "ckw/Error.h"
-#include "ckw/TileInfo.h"
-
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h"
+#include "CkwHelper.h"
namespace arm_compute
{
@@ -35,29 +29,28 @@ namespace experimental
{
namespace dynamic_fusion
{
-
-GpuCkwKernelWriter::GpuCkwKernelWriter(ckw::Kernel &kernel) : KernelWriter(kernel)
+void get_coordinate_from_gws(GpuCkwScopedKernelWriter writer,
+ ckw::TileOperand &coord,
+ const ckw::TileOperand &gid,
+ ckw::TileOperand &step)
{
+ writer->op_binary(coord, ckw::BinaryOp::Mul, gid, step);
}
-void GpuCkwKernelWriter::op_load_once(GpuCkwComponentArgument *tensor_or_tile, const ckw::TensorTileSampler &sampler)
+void get_coordinate_from_gws_overlapping_min(GpuCkwScopedKernelWriter writer,
+ ckw::TileOperand &coord,
+ const ckw::TileOperand &gid,
+ ckw::TileOperand &step,
+ ckw::TileOperand &shift_back,
+ ckw::TileOperand &const_0)
{
- if (!tensor_or_tile->has_tile())
- {
- CKW_ASSERT(tensor_or_tile->has_tensor());
+ // Applied formula: max((gid * step) - shift_back, 0)
+ // where the shift_back operand is: (step - leftover_step) % step
- auto &tensor = tensor_or_tile->tensor();
-
- const auto tile_name = tensor.name() + "_tile";
- auto &tile =
- declare_tile(tile_name.c_str(), ckw::TileInfo(tensor.data_type(), sampler.height(), sampler.width()));
-
- op_load(tile, tensor, sampler);
-
- tensor_or_tile->init_virtual_tensor(tile, sampler);
- }
+ writer->op_binary(coord, ckw::BinaryOp::Mul, gid, step);
+ writer->op_binary(coord, ckw::BinaryOp::Sub, coord, shift_back);
+ writer->op_binary(coord, ckw::BinaryOp::Max, coord, const_0);
}
-
} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h
new file mode 100644
index 0000000000..956e7c8ecb
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_CKWHELPER_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_CKWHELPER_H
+
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Get coordinate along one axis.
+ *
+ * @param[in,out] writer Writer
+ * @param[out] coord Resultant coordinate
+ * @param[in] gid Global work item id
+ * @param[in] step Step size / vector size
+ */
+void get_coordinate_from_gws(GpuCkwScopedKernelWriter writer,
+ ckw::TileOperand &coord,
+ const ckw::TileOperand &gid,
+ ckw::TileOperand &step);
+
+/** Get boundary aware coordinate along one axis.
+ *
+ * @param[in,out] writer Writer
+ * @param[out] coord Resultant coordinate
+ * @param[in] gid Global work item id
+ * @param[in] step Step size / vector size
+ * @param[in] shift_back It is (step - leftover_step) % step
+ * @param[in] const_0 Constant tile of value 0
+ */
+void get_coordinate_from_gws_overlapping_min(GpuCkwScopedKernelWriter writer,
+ ckw::TileOperand &coord,
+ const ckw::TileOperand &gid,
+ ckw::TileOperand &step,
+ ckw::TileOperand &shift_back,
+ ckw::TileOperand &const_0);
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_CKWHELPER_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h
deleted file mode 100644
index 6ba2b2f651..0000000000
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h
+++ /dev/null
@@ -1,181 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_WRITERHELPER_H
-#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_WRITERHELPER_H
-
-#include "arm_compute/core/utils/misc/Utility.h"
-#include "ckw/TensorTileSampler.h"
-
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
-
-#include <algorithm>
-#include <functional>
-
-using namespace ckw;
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-using SamplerCreator = std::function<TensorTileSampler(GpuCkwScopedKernelWriter &, int32_t /* m0 */, int32_t /* n0 */)>;
-
-/** Load src and dst tiles of dimension [m0, n0] only when not loaded and prepare the sampler
- */
-inline void load_src_dst_tiles_and_prepare_sampler(GpuCkwScopedKernelWriter &writer,
- GpuCkwComponentArgument *src,
- GpuCkwComponentArgument *dst,
- int32_t m0,
- int32_t n0,
- SamplerCreator create_sampler)
-{
- if (!src->has_tile())
- {
- const auto sampler = create_sampler(writer, m0, n0);
- writer->op_load_once(src, sampler);
- }
- else
- {
- const auto &sampler = src->tile_sampler();
- writer->op_load_once(src, sampler);
- }
-
- auto &src_tile = src->tile();
- const auto &sampler = src->tile_sampler();
-
- // Prepare the output tile.
- if (!dst->has_tile())
- {
- auto &tile = writer->declare_tile("dst_tile", src_tile.tile_info());
- dst->init_virtual_tensor(tile, sampler);
- }
-}
-
-/** Get boundary aware coordinate along one axis. Load and store of size step_v at the coordinate will not be out of bound
- *
- * @param[in,out] writer Writer
- * @param[out] coord Resultant coordinate
- * @param[in] gid Global work item id
- * @param[in] step_v Step size / vector size
- * @param[in] leftover_step_v Leftover step size at the boundary
- * @param[in] prefix Prefix to all the tiles declared within this function
- * @param[in] const_0 Constant tile of value 0
- */
-inline void get_coord(GpuCkwScopedKernelWriter writer,
- TileOperand &coord,
- const TileOperand &gid,
- int32_t step_v,
- int32_t leftover_step_v,
- const std::string &prefix,
- const TileOperand &const_0)
-{
- auto &step = writer->declare_tile(prefix + "step", step_v);
- auto &leftover_step = writer->declare_tile(prefix + "leftover_step", leftover_step_v);
-
- // step - leftover_step
- auto &step_minus_leftover = writer->declare_tile(prefix + "step_minus_leftover", ckw::DataType::Int32);
- writer->op_binary_expression(step_minus_leftover, step, ckw::BinaryOp::Sub, leftover_step);
-
- // (step - leftover_step) % step
- auto &coord_correction = writer->declare_tile(prefix + "coord_correction", ckw::DataType::Int32);
- writer->op_binary_expression(coord_correction, step_minus_leftover, ckw::BinaryOp::Mod, step);
-
- // (gid * step)
- auto &raw_coord = writer->declare_tile(prefix + "raw_coord", ckw::DataType::Int32);
- writer->op_binary_expression(raw_coord, gid, ckw::BinaryOp::Mul, step);
-
- // (gid * step) - (step - leftover_step) % step
- auto &corrected_coord = writer->declare_tile(prefix + "corrected_coord", ckw::DataType::Int32);
- writer->op_binary_expression(corrected_coord, raw_coord, ckw::BinaryOp::Sub, coord_correction);
-
- // max((gid * step) - (step - leftover_step) % step, 0)
- writer->op_binary_elementwise_function(coord, ckw::BinaryFunction::Max, corrected_coord, const_0);
-}
-
-/** Declare coordinate tiles "{prefix}_dim0_coord" and "{prefix}_dim1_coord", and create a boundary-aware sampler from tile of size [n0, m0], against the overall dimensions [dim0, dim1]
- * The load and store of tile [n0, m0] will never be out of bound of [dim0, dim1]
- */
-
-/** Declare coordinate tiles "{prefix}_dim0_coord" and "{prefix}_dim1_coord", and create a boundary-aware sampler from tile of size [n0, m0], against the overall dimensions [dim0, dim1]
- * The load and store of tile [n0, m0] will never be out of bound of [dim0, dim1]
- *
- * @param[in,out] writer Writer
- * @param[in] gid_0 Global work item id 0
- * @param[in] gid_1 Global work item id 1
- * @param[in] dim0_v Dimension 0
- * @param[in] dim1_v Dimension 1
- * @param[in] n0_v Tile size dimension 0
- * @param[in] m0_v Tile size dimension 1
- * @param[in] prefix Prefix to all the tiles declared within this function
- * @param[in] const_0 Constant tile of value 0
- *
- * @return TensorTileSampler
- */
-inline TensorTileSampler create_boundary_aware_2d_sampler(GpuCkwScopedKernelWriter writer,
- TileOperand &gid_0,
- TileOperand &gid_1,
- int32_t dim0_v,
- int32_t dim1_v,
- int32_t n0_v,
- int32_t m0_v,
- const std::string prefix,
- TileOperand &const_0)
-{
- // Clamp tile size [n0, m0] against dimension [dim0, dim1]
- // This is needed to:
- // * Guard against tile sizes are bigger than the tensor dimensions
- // * Handle broadcasting tiles (e.g. src tensor is of size 1 in one of the dimensions)
- n0_v = utility::clamp(n0_v, 1, dim0_v);
- m0_v = utility::clamp(m0_v, 1, dim1_v);
- const int32_t partial_n0_v = dim0_v % n0_v;
- const int32_t partial_m0_v = dim1_v % m0_v;
-
- // Declare #prefix_dim0_coord and #prefix_dim1_coord
- auto &dim0_coord = writer->declare_tile(prefix + "dim0_coord", ckw::DataType::Int32);
- get_coord(writer, dim0_coord, gid_0, n0_v, partial_n0_v, prefix + "dim0_", const_0);
- auto &dim1_coord = writer->declare_tile(prefix + "dim1_coord", ckw::DataType::Int32);
- get_coord(writer, dim1_coord, gid_1, m0_v, partial_m0_v, prefix + "dim1_", const_0);
-
- // Set sampler
- // Only set fields related to boundary aware loading/storing. Other info (e.g. format) is not responsibility of this function
- TensorTileSampler sampler;
-
- sampler.x(dim0_coord);
- sampler.y(dim1_coord);
-
- sampler.width(n0_v);
- sampler.height(m0_v);
-
- sampler.address_mode_x(TensorSamplerAddressModeX::None);
- sampler.address_mode_y(TensorSamplerAddressModeY::None);
- sampler.address_mode_z(TensorSamplerAddressModeZ::None);
-
- return sampler;
-}
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_WRITERHELPER_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.cpp
new file mode 100644
index 0000000000..ad31b06362
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.cpp
@@ -0,0 +1,162 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Common.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+ckw::DataType to_ckw(DataType dt)
+{
+ switch (dt)
+ {
+ case DataType::F32:
+ return ckw::DataType::Fp32;
+ case DataType::F16:
+ return ckw::DataType::Fp16;
+ case DataType::S32:
+ return ckw::DataType::Int32;
+ case DataType::S16:
+ return ckw::DataType::Int16;
+ case DataType::S8:
+ case DataType::QASYMM8_SIGNED:
+ return ckw::DataType::Int8;
+ case DataType::U32:
+ return ckw::DataType::Uint32;
+ case DataType::U16:
+ return ckw::DataType::Uint16;
+ case DataType::U8:
+ case DataType::QASYMM8:
+ return ckw::DataType::Uint8;
+ default:
+ return ckw::DataType::Unknown;
+ }
+}
+
+ckw::TensorShape to_ckw(const TensorShape &shape)
+{
+ ARM_COMPUTE_ERROR_ON(shape.num_max_dimensions < std::tuple_size<ckw::TensorShape>{});
+ ARM_COMPUTE_ERROR_ON(std::tuple_size<ckw::TensorShape>{} != 5);
+ /// NOTE: Overflow danger. Use size_t?
+ return ckw::TensorShape{static_cast<int32_t>(shape[0]), static_cast<int32_t>(shape[1]),
+ static_cast<int32_t>(shape[2]), static_cast<int32_t>(shape[3]),
+ static_cast<int32_t>(shape[4])};
+}
+
+ckw::TensorDataLayout to_ckw(DataLayout dl)
+{
+ switch (dl)
+ {
+ case DataLayout::NHWC:
+ return ckw::TensorDataLayout::Nhwc;
+ case DataLayout::NDHWC:
+ return ckw::TensorDataLayout::Ndhwc;
+ default:
+ return ckw::TensorDataLayout::Unknown;
+ }
+}
+
+ckw::TensorInfo to_ckw(const ITensorInfo &tensor_info)
+{
+ return ckw::TensorInfo{to_ckw(tensor_info.data_type()), to_ckw(tensor_info.tensor_shape()),
+ to_ckw(tensor_info.data_layout()), tensor_info.id()};
+}
+
+ckw::TensorStorageType to_ckw(const TensorStorageType &storage)
+{
+ switch (storage)
+ {
+ case TensorStorageType::ClBufferUint8Ptr:
+ return ckw::TensorStorageType::BufferUint8Ptr;
+ case TensorStorageType::ClImage2dReadOnly:
+ return ckw::TensorStorageType::Texture2dReadOnly;
+ case TensorStorageType::ClImage2dWriteOnly:
+ return ckw::TensorStorageType::Texture2dWriteOnly;
+ case TensorStorageType::Unknown:
+ return ckw::TensorStorageType::Unknown;
+ default:
+ ARM_COMPUTE_ERROR("Unknown tensor storage type");
+ }
+}
+
+TensorComponentType from_ckw(const ckw::TensorComponentType &component)
+{
+ switch (component)
+ {
+ case ckw::TensorComponentType::OffsetFirstElement:
+ return TensorComponentType::OffsetFirstElement;
+ case ckw::TensorComponentType::Stride0:
+ return TensorComponentType::Stride0;
+ case ckw::TensorComponentType::Stride1:
+ return TensorComponentType::Stride1;
+ case ckw::TensorComponentType::Stride2:
+ return TensorComponentType::Stride2;
+ case ckw::TensorComponentType::Stride3:
+ return TensorComponentType::Stride3;
+ case ckw::TensorComponentType::Stride4:
+ return TensorComponentType::Stride4;
+ case ckw::TensorComponentType::Dim0:
+ return TensorComponentType::Dim0;
+ case ckw::TensorComponentType::Dim1:
+ return TensorComponentType::Dim1;
+ case ckw::TensorComponentType::Dim2:
+ return TensorComponentType::Dim2;
+ case ckw::TensorComponentType::Dim3:
+ return TensorComponentType::Dim3;
+ case ckw::TensorComponentType::Dim4:
+ return TensorComponentType::Dim4;
+ case ckw::TensorComponentType::Dim1xDim2:
+ return TensorComponentType::Dim1xDim2;
+ case ckw::TensorComponentType::Dim2xDim3:
+ return TensorComponentType::Dim2xDim3;
+ case ckw::TensorComponentType::Dim1xDim2xDim3:
+ return TensorComponentType::Dim1xDim2xDim3;
+ case ckw::TensorComponentType::Unknown:
+ return TensorComponentType::Unknown;
+ default:
+ ARM_COMPUTE_ERROR("Unknown CKW tensor component");
+ }
+}
+
+TensorStorageType from_ckw(const ckw::TensorStorageType &storage)
+{
+ switch (storage)
+ {
+ case ckw::TensorStorageType::BufferUint8Ptr:
+ return TensorStorageType::ClBufferUint8Ptr;
+ case ckw::TensorStorageType::Texture2dReadOnly:
+ return TensorStorageType::ClImage2dReadOnly;
+ case ckw::TensorStorageType::Texture2dWriteOnly:
+ return TensorStorageType::ClImage2dWriteOnly;
+ case ckw::TensorStorageType::Unknown:
+ return TensorStorageType::Unknown;
+ default:
+ ARM_COMPUTE_ERROR("Unknown CKW tensor storage type");
+ }
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h
index 5da317bf38..26740cdd04 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,151 +21,83 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_COMMON
-#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_COMMON
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_COMMON_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_COMMON_H
+#include "arm_compute/core/CoreTypes.h"
#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Types.h"
-#include "ckw/TensorInfo.h"
#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "compute_kernel_writer/include/ckw/TensorInfo.h"
+#include "compute_kernel_writer/include/ckw/types/DataType.h"
+#include "compute_kernel_writer/include/ckw/types/TensorComponentType.h"
+#include "compute_kernel_writer/include/ckw/types/TensorStorageType.h"
+
namespace arm_compute
{
namespace experimental
{
namespace dynamic_fusion
{
-inline ckw::DataType to_ckw(DataType dt)
-{
- switch (dt)
- {
- case DataType::F32:
- return ckw::DataType::Fp32;
- case DataType::F16:
- return ckw::DataType::Fp16;
- case DataType::S32:
- return ckw::DataType::Int32;
- case DataType::S16:
- return ckw::DataType::Int16;
- case DataType::S8:
- case DataType::QASYMM8_SIGNED:
- return ckw::DataType::Int8;
- case DataType::U32:
- return ckw::DataType::Uint32;
- case DataType::U16:
- return ckw::DataType::Uint16;
- case DataType::U8:
- case DataType::QASYMM8:
- return ckw::DataType::Uint8;
- default:
- return ckw::DataType::Unknown;
- }
-}
+/** Convert the Compute Library data type to Compute Kernel Writer data type
+ *
+ * @param[in] dt The Compute Library data type
+ *
+ * @return the Compute Kernel Writer data type (ckw::DataType)
+ */
+ckw::DataType to_ckw(DataType dt);
-inline ckw::TensorShape to_ckw(const TensorShape &shape)
-{
- ARM_COMPUTE_ERROR_ON(shape.num_max_dimensions < std::tuple_size<ckw::TensorShape>{});
- ARM_COMPUTE_ERROR_ON(std::tuple_size<ckw::TensorShape>{} != 5);
- /// NOTE: Overflow danger. Use size_t?
- return ckw::TensorShape{static_cast<int32_t>(shape[0]), static_cast<int32_t>(shape[1]),
- static_cast<int32_t>(shape[2]), static_cast<int32_t>(shape[3]),
- static_cast<int32_t>(shape[4])};
-}
-inline ckw::TensorDataLayout to_ckw(DataLayout dl)
-{
- switch (dl)
- {
- case DataLayout::NHWC:
- return ckw::TensorDataLayout::Nhwc;
- case DataLayout::NDHWC:
- return ckw::TensorDataLayout::Ndhwc;
- default:
- return ckw::TensorDataLayout::Unknown;
- }
-}
-inline ckw::TensorInfo to_ckw(const ITensorInfo &tensor_info)
-{
- return ckw::TensorInfo{to_ckw(tensor_info.data_type()), to_ckw(tensor_info.tensor_shape()),
- to_ckw(tensor_info.data_layout()), tensor_info.id()};
-}
+/** Convert the Compute Library tensor shape to Compute Kernel Writer tensor shape
+ *
+ * @param[in] shape The Compute Library tensor shape
+ *
+ * @return the Compute Kernel Writer tensor shape (ckw::TensorShape)
+ */
+ckw::TensorShape to_ckw(const TensorShape &shape);
-inline TensorComponentType from_ckw(const ckw::TensorComponentType &component)
-{
- switch (component)
- {
- case ckw::TensorComponentType::OffsetFirstElement:
- return TensorComponentType::OffsetFirstElement;
- case ckw::TensorComponentType::Stride0:
- return TensorComponentType::Stride0;
- case ckw::TensorComponentType::Stride1:
- return TensorComponentType::Stride1;
- case ckw::TensorComponentType::Stride2:
- return TensorComponentType::Stride2;
- case ckw::TensorComponentType::Stride3:
- return TensorComponentType::Stride3;
- case ckw::TensorComponentType::Stride4:
- return TensorComponentType::Stride4;
- case ckw::TensorComponentType::Dim0:
- return TensorComponentType::Dim0;
- case ckw::TensorComponentType::Dim1:
- return TensorComponentType::Dim1;
- case ckw::TensorComponentType::Dim2:
- return TensorComponentType::Dim2;
- case ckw::TensorComponentType::Dim3:
- return TensorComponentType::Dim3;
- case ckw::TensorComponentType::Dim4:
- return TensorComponentType::Dim4;
- case ckw::TensorComponentType::Dim1xDim2:
- return TensorComponentType::Dim1xDim2;
- case ckw::TensorComponentType::Dim2xDim3:
- return TensorComponentType::Dim2xDim3;
- case ckw::TensorComponentType::Dim1xDim2xDim3:
- return TensorComponentType::Dim1xDim2xDim3;
- case ckw::TensorComponentType::Unknown:
- return TensorComponentType::Unknown;
- default:
- ARM_COMPUTE_ERROR("Unknown CKW tensor component");
- return TensorComponentType::Unknown;
- }
-}
+/** Convert the Compute Library data layout to Compute Kernel Writer data layout
+ *
+ * @param[in] dl The Compute Library data layout
+ *
+ * @return the Compute Kernel Writer data layout (ckw::TensorDataLayout)
+ */
+ckw::TensorDataLayout to_ckw(DataLayout dl);
+
+/** Convert the Compute Library tensor info to Compute Kernel Writer tensor info
+ *
+ * @param[in] tensor_info The Compute Library tensor info
+ *
+ * @return the Compute Kernel Writer tensor info (ckw::TensorInfo)
+ */
+ckw::TensorInfo to_ckw(const ITensorInfo &tensor_info);
+
+/** Convert the Compute Library tensor storage to Compute Kernel Writer tensor storage
+ *
+ * @param[in] storage The Compute Library tensor storage
+ *
+ * @return the Compute Kernel Writer tensor storage (ckw::TensorStorageType)
+ */
+ckw::TensorStorageType to_ckw(const TensorStorageType &storage);
+
+/** Convert the Compute Kernel Writer tensor component to Compute Library tensor component
+ *
+ * @param[in] component The Compute Kernel Writer tensor component
+ *
+ * @return the Compute Library tensor component
+ */
+TensorComponentType from_ckw(const ckw::TensorComponentType &component);
+
+/** Convert the Compute Kernel Writer tensor storage to Compute Library tensor storage
+ *
+ * @param[in] storage The Compute Kernel Writer tensor storage
+ *
+ * @return the Compute Library tensor storage
+ */
+TensorStorageType from_ckw(const ckw::TensorStorageType &storage);
-inline ckw::TensorStorageType to_ckw(const TensorStorageType &storage)
-{
- switch (storage)
- {
- case TensorStorageType::ClBufferUint8Ptr:
- return ckw::TensorStorageType::BufferUint8Ptr;
- case TensorStorageType::ClImage2dReadOnly:
- return ckw::TensorStorageType::Texture2dReadOnly;
- case TensorStorageType::ClImage2dWriteOnly:
- return ckw::TensorStorageType::Texture2dWriteOnly;
- case TensorStorageType::Unknown:
- return ckw::TensorStorageType::Unknown;
- default:
- ARM_COMPUTE_ERROR("Unknown tensor storage type");
- return ckw::TensorStorageType::Unknown;
- }
-}
-inline TensorStorageType from_ckw(const ckw::TensorStorageType &storage)
-{
- switch (storage)
- {
- case ckw::TensorStorageType::BufferUint8Ptr:
- return TensorStorageType::ClBufferUint8Ptr;
- case ckw::TensorStorageType::Texture2dReadOnly:
- return TensorStorageType::ClImage2dReadOnly;
- case ckw::TensorStorageType::Texture2dWriteOnly:
- return TensorStorageType::ClImage2dWriteOnly;
- case ckw::TensorStorageType::Unknown:
- return TensorStorageType::Unknown;
- default:
- ARM_COMPUTE_ERROR("Unknown CKW tensor storage type");
- return TensorStorageType::Unknown;
- }
-}
} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
-#endif /* ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_COMMON */
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_COMMON_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.cpp
index b916e6b28b..5630e390d5 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,17 +21,9 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h"
-#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWKERNELWRITER_H
-#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWKERNELWRITER_H
-
-#include "ckw/KernelWriter.h"
-#include "ckw/TensorTileSampler.h"
-
-namespace ckw
-{
-class Kernel;
-} // namespace ckw
+#include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h"
namespace arm_compute
{
@@ -39,29 +31,27 @@ namespace experimental
{
namespace dynamic_fusion
{
-
-class GpuCkwComponentArgument;
-
-/** Extended implementation of kernel writer for dynamic fusion. */
-class GpuCkwKernelWriter : public ckw::KernelWriter
+ckw::BinaryOp to_ckw(const ElementwiseBinaryCommonAttributes &attributes)
{
-public:
- /** Initialize a new instance of @ref GpuCkwKernelWriter class.
- *
- * @param[in] kernel The kernel to be generated.
- */
- explicit GpuCkwKernelWriter(ckw::Kernel &kernel);
-
- /** Load the user tensor to the tile in the same component argument if it hasn't been loaded.
- *
- * @param[in] tensor_or_tile The component argument that is either a user tensor or a virtual tensor.
- * @param[in] sampler The tensor sampling information to load the tile.
- */
- void op_load_once(GpuCkwComponentArgument *tensor_or_tile, const ckw::TensorTileSampler &sampler);
-};
-
+ switch (attributes.operation())
+ {
+ case ElementwiseBinaryCommonAttributes::ElementwiseOp::Add:
+ return ckw::BinaryOp::Add;
+ case ElementwiseBinaryCommonAttributes::ElementwiseOp::Sub:
+ return ckw::BinaryOp::Sub;
+ case ElementwiseBinaryCommonAttributes::ElementwiseOp::Div:
+ return ckw::BinaryOp::Div;
+ case ElementwiseBinaryCommonAttributes::ElementwiseOp::Mul:
+ return ckw::BinaryOp::Mul;
+ case ElementwiseBinaryCommonAttributes::ElementwiseOp::Min:
+ case ElementwiseBinaryCommonAttributes::ElementwiseOp::Max:
+ case ElementwiseBinaryCommonAttributes::ElementwiseOp::Power:
+ case ElementwiseBinaryCommonAttributes::ElementwiseOp::Prelu:
+ case ElementwiseBinaryCommonAttributes::ElementwiseOp::SquaredDiff:
+ default:
+ ARM_COMPUTE_ERROR("Cannot convert ElementwiseBinaryCommonAttributes to corresponding ckw::BinaryOp");
+ }
+}
} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
-
-#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWKERNELWRITER_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h
index 0cba258940..644a407702 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,42 +21,22 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_ELEMENTWISEBINARY
-#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_ELEMENTWISEBINARY
-
-#include "ckw/types/Operators.h"
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_ELEMENTWISEBINARY_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_ELEMENTWISEBINARY_H
#include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h"
+#include "compute_kernel_writer/include/ckw/types/Operators.h"
+
namespace arm_compute
{
namespace experimental
{
namespace dynamic_fusion
{
-inline ckw::BinaryOp to_ckw(const ElementwiseBinaryCommonAttributes &attributes)
-{
- switch (attributes.operation())
- {
- case ElementwiseBinaryCommonAttributes::ElementwiseOp::Add:
- return ckw::BinaryOp::Add;
- case ElementwiseBinaryCommonAttributes::ElementwiseOp::Sub:
- return ckw::BinaryOp::Sub;
- case ElementwiseBinaryCommonAttributes::ElementwiseOp::Div:
- return ckw::BinaryOp::Div;
- case ElementwiseBinaryCommonAttributes::ElementwiseOp::Mul:
- return ckw::BinaryOp::Mul;
- case ElementwiseBinaryCommonAttributes::ElementwiseOp::Min:
- case ElementwiseBinaryCommonAttributes::ElementwiseOp::Max:
- case ElementwiseBinaryCommonAttributes::ElementwiseOp::Power:
- case ElementwiseBinaryCommonAttributes::ElementwiseOp::Prelu:
- case ElementwiseBinaryCommonAttributes::ElementwiseOp::SquaredDiff:
- default:
- ARM_COMPUTE_ERROR("Cannot convert ElementwiseBinaryCommonAttributes to corresponding ckw::BinaryOp");
- }
-}
+ckw::BinaryOp to_ckw(const ElementwiseBinaryCommonAttributes &attributes);
} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
-#endif /* ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_ELEMENTWISEBINARY */
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_ELEMENTWISEBINARY_H
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp
index 409b191df5..5544963b3f 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -46,7 +46,7 @@ Status ClComponentPool2d::validate(const Properties &properties,
const Attributes &attributes,
const Settings &settings)
{
- ARM_COMPUTE_UNUSED(properties);
+ ARM_COMPUTE_UNUSED(properties, settings);
const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
const auto dst = tensors.get_const_tensor(TensorType::ACL_DST_0);
@@ -57,7 +57,7 @@ Status ClComponentPool2d::validate(const Properties &properties,
// 1. Check validity
// Check if pooling is valid
ARM_COMPUTE_RETURN_ERROR_ON_MSG(
- is_pool_region_entirely_outside_input(convert_pool_attr_to_pool_info(attributes, settings.mixed_precision())),
+ is_pool_region_entirely_outside_input(convert_pool_attr_to_pool_info(attributes, true)),
"Pooling region that is entirely outside input tensor is unsupported");
// Matching data type
@@ -74,8 +74,8 @@ Status ClComponentPool2d::validate(const Properties &properties,
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
- dst->tensor_shape(), misc::shape_calculator::compute_pool_shape(
- *src, convert_pool_attr_to_pool_info(attributes, settings.mixed_precision())));
+ dst->tensor_shape(),
+ misc::shape_calculator::compute_pool_shape(*src, convert_pool_attr_to_pool_info(attributes, true)));
// 2. Check support level
// Data type
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp
index 2cec67dc65..201c9f243c 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -49,8 +49,7 @@ Status GpuAdd::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *l
Status GpuAdd::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *lhs, const ITensorInfo *rhs)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32, DataType::U8,
- DataType::S16, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->data_type() != rhs->data_type(), "Input tensors must be the same data type");
// Set the elementwise operation to Add then call the elementwise common is_supported_op
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuCast.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuCast.cpp
index 6f35e66ea8..d25a2a3153 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/GpuCast.cpp
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuCast.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -57,12 +57,8 @@ Status is_supported_op_helper(const GpuWorkloadContext &context,
// Check support level
// Data Type
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
- src, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL,
- DataType::S16, DataType::U16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst_info_to_validate_ptr, 1, DataType::U8, DataType::S8,
- DataType::QASYMM8, DataType::S16, DataType::U16, DataType::U32,
- DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst_info_to_validate_ptr, 1, DataType::F16, DataType::F32);
if (context.gpu_language() == GpuLanguage::OpenCL)
{
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuPool2d.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuPool2d.cpp
index 55c604aacc..2d04f75610 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/GpuPool2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuPool2d.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,6 +25,7 @@
#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h"
#include "arm_compute/core/CL/CLCompileContext.h"
+#include "arm_compute/core/Error.h"
#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/Validate.h"
@@ -52,10 +53,12 @@ void calculate_and_init_dst_if_empty(ITensorInfo *dst,
const Pool2dAttributes &attributes,
const GpuPool2dSettings &settings)
{
+ ARM_COMPUTE_UNUSED(settings);
+
if (dst->total_size() == 0U)
{
auto shape = misc::shape_calculator::compute_pool_shape(
- *src, convert_pool_attr_to_pool_info(attributes, settings.mixed_precision()));
+ *src, convert_pool_attr_to_pool_info(attributes, /* mixed_precision */ true));
auto_init_if_empty(*dst, src->clone()->set_tensor_shape(shape));
}
}
@@ -63,17 +66,6 @@ void calculate_and_init_dst_if_empty(ITensorInfo *dst,
constexpr GpuOperatorType operator_type = GpuOperatorType::Complex;
} // namespace
-GpuPool2dSettings &GpuPool2dSettings::mixed_precision(bool mixed_precision)
-{
- _mixed_precision = mixed_precision;
- return *this;
-}
-
-bool GpuPool2dSettings::mixed_precision() const
-{
- return _mixed_precision;
-}
-
GpuPool2dSettings GpuPool2dSettings::use_inf_as_limit(bool use_inf_as_limit)
{
_use_inf_as_limit = use_inf_as_limit;
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuResize.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuResize.cpp
index fb09875b33..8e794c88b2 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/GpuResize.cpp
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuResize.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023 Arm Limited.
+ * Copyright (c) 2022-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -60,7 +60,6 @@ Status is_supported_op_helper(const GpuWorkloadContext &context,
const ResizeAttributes &attributes)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
-
TensorInfo dst_info_to_validate;
const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate;
@@ -73,8 +72,7 @@ Status is_supported_op_helper(const GpuWorkloadContext &context,
// Check support level
// Data type
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
- DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
// Data layout
ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC);
// Interpolation policy
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuSub.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuSub.cpp
index e5d62c9930..c53453a15c 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/GpuSub.cpp
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuSub.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -36,8 +36,7 @@ namespace dynamic_fusion
Status GpuSub::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *lhs, const ITensorInfo *rhs)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32, DataType::U8,
- DataType::S16, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->data_type() != rhs->data_type(), "Input tensors must be the same data type");
// Set the elementwise operation to Sub then call the elementwise common validate_op
@@ -49,8 +48,7 @@ Status GpuSub::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *l
Status GpuSub::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *lhs, const ITensorInfo *rhs)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32, DataType::U8,
- DataType::S16, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->data_type() != rhs->data_type(), "Input tensors must be the same data type");
// Set the elementwise operation to Sub then call the elementwise common is_supported_op
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp
index ebb0374501..8936db6abe 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023 Arm Limited.
+ * Copyright (c) 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -78,9 +78,8 @@ std::string ClTemplatePool2d::get_component_code(const ComponentGroup &comp_grou
std::string ClTemplatePool2d::get_MxN_kernel_code() const
{
- const auto pool_type = _attributes.pool_type();
- const bool fp_mixed_precision =
- (_src->data_type() == DataType::F16) && _settings.mixed_precision() && pool_type != PoolingType::MAX;
+ const auto pool_type = _attributes.pool_type();
+ const bool fp_mixed_precision = (_src->data_type() == DataType::F16) && pool_type != PoolingType::MAX;
// Define pool op macro.
std::string pool_op = (pool_type == PoolingType::AVG) ? R"_(#define POOL_OP(x,y) ((x) + (y)))_"
@@ -226,11 +225,10 @@ std::string ClTemplatePool2d::get_MxN_kernel_code() const
std::string ClTemplatePool2d::get_2x2_kernel_code() const
{
- const auto pool_type = _attributes.pool_type();
- const bool fp_mixed_precision =
- (_src->data_type() == DataType::F16) && _settings.mixed_precision() && pool_type != PoolingType::MAX;
- std::string pool_op = (pool_type == PoolingType::AVG) ? R"_(#define POOL_OP(x,y) ((x) + (y)))_"
- : R"_(#define POOL_OP(x,y) (fmax((x), (y))) )_";
+ const auto pool_type = _attributes.pool_type();
+ const bool fp_mixed_precision = (_src->data_type() == DataType::F16) && pool_type != PoolingType::MAX;
+ std::string pool_op = (pool_type == PoolingType::AVG) ? R"_(#define POOL_OP(x,y) ((x) + (y)))_"
+ : R"_(#define POOL_OP(x,y) (fmax((x), (y))) )_";
std::string code = R"_(
//------------------ START KERNEL {{meta_kernel_id}} ---------------------
@@ -385,12 +383,12 @@ TagLUT ClTemplatePool2d::get_tag_lut(const GpuKernelVariableTable &vtable, const
lut["meta_kernel_id"] = id();
// Retrieve relevant data
- const auto padding = _attributes.pad();
- const auto stride = _attributes.stride();
- const auto pool_size = _attributes.pool_size();
- const auto data_type = _src->data_type();
- const auto use_fp_mixed_precision = (_src->data_type() == DataType::F16) && _settings.mixed_precision() &&
- _attributes.pool_type() != PoolingType::MAX;
+ const auto padding = _attributes.pad();
+ const auto stride = _attributes.stride();
+ const auto pool_size = _attributes.pool_size();
+ const auto data_type = _src->data_type();
+ const auto use_fp_mixed_precision =
+ (_src->data_type() == DataType::F16) && _attributes.pool_type() != PoolingType::MAX;
const std::string max_initial_value =
_settings.use_inf_as_limit() ? "(-INFINITY)"
: float_to_string_with_full_precision(std::numeric_limits<float>::lowest());