14 files changed, 618 insertions, 35 deletions
diff --git a/Android.bp b/Android.bp
index b7936e5671..14290b9e1c 100644
--- a/Android.bp
+++ b/Android.bp
@@ -647,6 +647,7 @@ cc_library_static {
         "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp",
         "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp",
         "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp",
+        "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp",
         "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp",
         "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp",
         "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.cpp",
diff --git a/compute_kernel_writer/CMakeLists.txt b/compute_kernel_writer/CMakeLists.txt
index 1e82f9c6b3..9a97563025 100644
--- a/compute_kernel_writer/CMakeLists.txt
+++ b/compute_kernel_writer/CMakeLists.txt
@@ -102,7 +102,7 @@ target_compile_options(ckw
     ${CKW_CXX_FLAGS}
     "$<$<CXX_COMPILER_ID:GNU>:${GNU_WARNINGS}>"
     "$<$<CONFIG:Debug>:${CKW_ASSERTS_OPTS}>"
-    "$<$<BOOL:${CKW_ASSERTS}>:${CKW_ASSERTS_OPTS}>"
+    "$<$<BOOL:${CKW_ENABLE_ASSERTS}>:${CKW_ASSERTS_OPTS}>"
     # Set CMAKE_CXX_FLAGS last so user can overwrite options
     ${CMAKE_CXX_FLAGS}
     PRIVATE
@@ -113,7 +113,7 @@ target_compile_options(ckw
 target_compile_definitions(ckw PUBLIC
     $<$<CONFIG:Debug>:COMPUTE_KERNEL_WRITER_DEBUG_ENABLED>
     $<$<CONFIG:Debug>:COMPUTE_KERNEL_WRITER_ASSERTS_ENABLED>
-    $<$<BOOL:${CKW_ASSERTS}>:COMPUTE_KERNEL_WRITER_ASSERTS_ENABLED>
+    $<$<BOOL:${CKW_ENABLE_ASSERTS}>:COMPUTE_KERNEL_WRITER_ASSERTS_ENABLED>
     $<$<BOOL:${CKW_ENABLE_OPENCL}>:COMPUTE_KERNEL_WRITER_OPENCL_ENABLED>
 )
 
diff --git a/compute_kernel_writer/prototype/include/ckw/KernelWriter.h b/compute_kernel_writer/prototype/include/ckw/KernelWriter.h
index c116e62650..72f85c78aa 100644
--- a/compute_kernel_writer/prototype/include/ckw/KernelWriter.h
+++ b/compute_kernel_writer/prototype/include/ckw/KernelWriter.h
@@ -129,11 +129,38 @@ public:
 
     /** Load the data from the tensor memory to the tile using the sampling information.
      *
+     * @param[out] tile       The tile to be loaded.
+     * @param[in]  tensor     The tensor to be read.
+     * @param[in]  sampler    The tensor sampling information.
+     * @param[in]  dilation_y Dilation in the Y dimension.
+     */
+    void op_load(TileOperand &tile, const TensorOperand &tensor, const TensorTileSampler &sampler, const TileOperand &dilation_y = TileOperand("dil_y", 1));
+
+    /** Load the data from the tensor memory to the tile using the indirect buffer approach and respective of the sampling information.
+     *
      * @param[out] tile    The tile to be loaded.
      * @param[in]  tensor  The tensor to be read.
      * @param[in]  sampler The tensor sampling information.
      */
-    void op_load(TileOperand &tile, TensorOperand &tensor, const TensorTileSampler &sampler);
+    void op_load_indirect(TileOperand &tile, const TensorOperand &tensor, const TensorTileSampler &sampler);
+
+    /** Construct an indirection buffer in @p tile containing the precalculated addresses of elements in the source tensor.
+     *
+     * @param[out] tile    The tile to be loaded.
+     * @param[in]  tensor  The tensor the be read.
+     * @param[in]  sampler The tensor sampling information.
+     * @param[in]  x       The X coordinate.
+     * @param[in]  y       The Y coordinate.
+     * @param[in]  x_off   Offset in the X dimension.
+     * @param[in]  y_off   Offset in the Y dimension.
+     */
+    void util_get_indirect_buffer(TileOperand             &tile,
+                                  const TensorOperand     &tensor,
+                                  const TensorTileSampler &sampler,
+                                  const TileOperand       &x,
+                                  const TileOperand       &y,
+                                  const TileOperand       &x_off,
+                                  const TileOperand       &y_off);
 
     /** Store the tile to the tensor using the specified sampling information.
      *
diff --git a/compute_kernel_writer/prototype/include/ckw/TileOperand.h b/compute_kernel_writer/prototype/include/ckw/TileOperand.h
index c071707a45..24ee373a24 100644
--- a/compute_kernel_writer/prototype/include/ckw/TileOperand.h
+++ b/compute_kernel_writer/prototype/include/ckw/TileOperand.h
@@ -37,6 +37,8 @@ namespace ckw
 
 class Kernel;
 
+using TileContainer = std::vector<std::vector<std::string>>;
+
 /** Tile operand which can be either scalar, vector or 2D tile. */
 class TileOperand : public OperandBase
 {
@@ -69,6 +71,13 @@ public:
      */
     TileOperand(const ::std::string &name, float value);
 
+    /** Initialize a new instance of @ref TileOperand for compile-time constant variable.
+     *
+     * @param[in] name  The name of the tile.
+     * @param[in] value The value of the tile.
+     */
+    TileOperand(const ::std::string &name, const ::std::vector<std::vector<std::string>> &value, DataType dt);
+
     /** Prohibit copy of tile operand. */
     TileOperand(const TileOperand &) = delete;
 
@@ -96,13 +105,21 @@ public:
     /** Get the scalar value of the tile.
      *
      * The tile must have the shape of 1, 1 (i.e. scalar).
+     *
+     * @return Scalar value as a string.
+     */
+    std::string scalar_value() const;
+
+    /** Get the values of the tile.
+     *
+     * @return 2D container of values.
      */
-    ScalarValue scalar_value() const;
+    const TileContainer &value() const;
 
 private:
-    TileInfo    _info;
-    ScalarValue _value{};
-    bool        _constant;
+    TileInfo      _info;
+    TileContainer _value{};
+    bool          _constant;
 };
 
 } // namespace ckw
diff --git a/compute_kernel_writer/prototype/src/KernelWriter.cpp b/compute_kernel_writer/prototype/src/KernelWriter.cpp
index 9122e518b4..f29cf12802 100644
--- a/compute_kernel_writer/prototype/src/KernelWriter.cpp
+++ b/compute_kernel_writer/prototype/src/KernelWriter.cpp
@@ -128,6 +128,10 @@ TileOperand &KernelWriter::declare_tile_operand(std::unique_ptr<TileOperand> ope
             name,
             prototype::TileInfo(info.data_type(), info.width(), info.height()));
     }
+    else
+    {
+        _impl->declare_const_tile(name, operand.value(), operand.data_type());
+    }
 
     return operand;
 }
@@ -136,7 +140,7 @@ TileOperand &KernelWriter::declare_tile_operand(std::unique_ptr<TileOperand> ope
 // Load and store
 // =================================================================================================
 
-void KernelWriter::op_load(TileOperand &tile, TensorOperand &tensor, const TensorTileSampler &sampler)
+void KernelWriter::op_load(TileOperand &tile, const TensorOperand &tensor, const TensorTileSampler &sampler, const TileOperand &dilation_y)
 {
     prototype::TensorOperand impl_tensor(
         tensor.name(),
@@ -152,9 +156,59 @@ void KernelWriter::op_load(TileOperand &tile, TensorOperand &tensor, const Tenso
     auto impl_z = sampler.z().create_impl_operand(_impl.get());
     auto impl_b = sampler.b().create_impl_operand(_impl.get());
 
+    auto impl_dilation_y = dilation_y.create_impl_operand(_impl.get());
+
+    auto impl_dst = tile.create_impl_operand(_impl.get());
+
+    _impl->op_load_immediate(impl_tensor, impl_dst, impl_x, impl_y, impl_z, impl_b, impl_dilation_y);
+}
+
+void KernelWriter::op_load_indirect(TileOperand &tile, const TensorOperand &tensor, const TensorTileSampler &sampler)
+{
+    prototype::TensorOperand impl_tensor(
+        tensor.name(),
+        prototype::GpuSampler{
+            sampler.format(),
+            prototype::to_gpu_tensor_storage(tensor.storage_type()),
+            sampler.address_mode_x(),
+            sampler.address_mode_y(),
+            sampler.address_mode_z() });
+
+    auto impl_x = sampler.x().create_impl_operand(_impl.get());
+    auto impl_y = sampler.y().create_impl_operand(_impl.get());
+    auto impl_z = sampler.z().create_impl_operand(_impl.get());
+    auto impl_b = sampler.b().create_impl_operand(_impl.get());
+
+    auto impl_dst = tile.create_impl_operand(_impl.get());
+
+    _impl->op_load_indirect(impl_tensor, impl_dst, impl_x, impl_y, impl_z, impl_b);
+}
+
+void KernelWriter::util_get_indirect_buffer(TileOperand             &tile,
+                                            const TensorOperand     &tensor,
+                                            const TensorTileSampler &sampler,
+                                            const TileOperand       &x,
+                                            const TileOperand       &y,
+                                            const TileOperand       &x_off,
+                                            const TileOperand       &y_off)
+{
+    prototype::TensorOperand impl_tensor(
+        tensor.name(),
+        prototype::GpuSampler{
+            sampler.format(),
+            prototype::to_gpu_tensor_storage(tensor.storage_type()),
+            sampler.address_mode_x(),
+            sampler.address_mode_y(),
+            sampler.address_mode_z() });
+
+    auto impl_x     = x.create_impl_operand(_impl.get());
+    auto impl_y     = y.create_impl_operand(_impl.get());
+    auto impl_x_off = x_off.create_impl_operand(_impl.get());
+    auto impl_y_off = y_off.create_impl_operand(_impl.get());
+
     auto impl_dst = tile.create_impl_operand(_impl.get());
 
-    _impl->op_load_immediate(impl_tensor, impl_dst, impl_x, impl_y, impl_z, impl_b);
+    _impl->util_get_indirect_buffer(impl_dst, impl_tensor, impl_x, impl_y, impl_x_off, impl_y_off);
 }
 
 void KernelWriter::op_store(TensorOperand &tensor, const TileOperand &tile, const TensorTileSampler &sampler)
diff --git a/compute_kernel_writer/prototype/src/Prototype.h b/compute_kernel_writer/prototype/src/Prototype.h
index a8dc7fbfdb..2b519471ac 100644
--- a/compute_kernel_writer/prototype/src/Prototype.h
+++ b/compute_kernel_writer/prototype/src/Prototype.h
@@ -3009,7 +3009,7 @@ private:
             address += " + (";
             address += x + ") * sizeof(" + dst_type + ")";
         }
-        if(y != "0" && (_mapper.is_one_component_y() != true))
+        if(y != "0")
         {
             const std::string stride_y = _mapper.tensor_component_stride_y();
             address += " + (";
@@ -3249,7 +3249,7 @@ private:
         std::string coord_x = "(" + x + ") >> 2";
         std::string coord_y = "(";
 
-        if(y != "0" && (_mapper.is_one_component_y() != true))
+        if(y != "0")
         {
             coord_y += y;
         }
@@ -4024,13 +4024,6 @@ public:
             _data->code += ", ";
             _data->code += x_s->scalar(0, i).str;
             _data->code += " >= 0);\n";
-            // mi_0 = select(wxh, mi_0, y_s >= 0);
-            _data->code += dst->scalar(0, i).str;
-            _data->code += " = select(-1, ";
-            _data->code += dst->scalar(0, i).str;
-            _data->code += ", ";
-            _data->code += y_s->scalar(0, i).str;
-            _data->code += " >= 0);\n";
             // mi_0 = select(wxh, mi_0, x_s < width);
             _data->code += dst->scalar(0, i).str;
             _data->code += " = select(-1, ";
@@ -4039,6 +4032,13 @@ public:
             _data->code += x_s->scalar(0, i).str;
             _data->code += " < ";
             _data->code += width + ");\n";
+            // mi_0 = select(wxh, mi_0, y_s >= 0);
+            _data->code += dst->scalar(0, i).str;
+            _data->code += " = select(-1, ";
+            _data->code += dst->scalar(0, i).str;
+            _data->code += ", ";
+            _data->code += y_s->scalar(0, i).str;
+            _data->code += " >= 0);\n";
             // mi_0 = select(wxh, mi_0, y_s < height);
             _data->code += dst->scalar(0, i).str;
             _data->code += " = select(-1, ";
diff --git a/compute_kernel_writer/prototype/src/TileOperand.cpp b/compute_kernel_writer/prototype/src/TileOperand.cpp
index fcb3cb6415..bf6a15b9df 100644
--- a/compute_kernel_writer/prototype/src/TileOperand.cpp
+++ b/compute_kernel_writer/prototype/src/TileOperand.cpp
@@ -30,22 +30,42 @@ namespace ckw
 {
 
 TileOperand::TileOperand(const std::string &name, const TileInfo &info)
-    : OperandBase(name), _info(info), _value{ 0 }, _constant(false)
+    : OperandBase(name),
+      _info(info),
+      _value{ std::vector<std::string>{ "0" } },
+      _constant(false)
 {
 }
 
 TileOperand::TileOperand(const std::string &name, DataType data_type)
-    : OperandBase(name), _info(TileInfo{ data_type }), _value(0), _constant(false)
+    : OperandBase(name),
+      _info(TileInfo{ data_type }),
+      _value{ std::vector<std::string>{ "0" } },
+      _constant(false)
 {
 }
 
 TileOperand::TileOperand(const std::string &name, int32_t value)
-    : OperandBase(name), _info(TileInfo{ DataType::Int32 }), _value(value), _constant(true)
+    : OperandBase(name),
+      _info(TileInfo{ DataType::Int32 }),
+      _value{ std::vector<std::string>{ std::to_string(value) } },
+      _constant(true)
 {
 }
 
 TileOperand::TileOperand(const std::string &name, float value)
-    : OperandBase(name), _info(TileInfo{ DataType::Fp32 }), _value(value), _constant(true)
+    : OperandBase(name),
+      _info(TileInfo{ DataType::Fp32 }),
+      _value{ std::vector<std::string>{ std::to_string(value) } },
+      _constant(true)
+{
+}
+
+TileOperand::TileOperand(const std::string &name, const TileContainer &vals, DataType dt)
+    : OperandBase(name),
+      _info(TileInfo{ dt, static_cast<int32_t>(vals.size()), static_cast<int32_t>(vals[0].size()) }),
+      _value(vals),
+      _constant(true)
 {
 }
 
@@ -55,17 +75,23 @@ prototype::Operand TileOperand::create_impl_operand(prototype::IGpuKernelWriter
 
     if(_constant)
     {
-        switch(_info.data_type())
+        if(is_scalar())
         {
-            case DataType::Int32:
-                return prototype::Operand(std::to_string(_value.get<int32_t>()),
-                                          prototype::OperandType::ScalarInt32);
+            switch(_info.data_type())
+            {
+                case DataType::Int32:
+                    return prototype::Operand(_value[0][0], prototype::OperandType::ScalarInt32);
 
-            case DataType::Fp32:
-                return prototype::Operand(std::to_string(_value.get<float>()), prototype::OperandType::ScalarFp32);
+                case DataType::Fp32:
+                    return prototype::Operand(_value[0][0], prototype::OperandType::ScalarFp32);
 
-            default:
-                CKW_ASSERT(false);
+                default:
+                    CKW_ASSERT(false);
+            }
+        }
+        else
+        {
+            return prototype::Operand(name());
         }
     }
     else
@@ -94,11 +120,16 @@ bool TileOperand::is_scalar() const
     return _info.width() == 1 && _info.height() == 1;
 }
 
-ScalarValue TileOperand::scalar_value() const
+std::string TileOperand::scalar_value() const
 {
     CKW_ASSERT(is_scalar());
     CKW_ASSERT(is_constant());
 
+    return _value[0][0];
+}
+
+const TileContainer &TileOperand::value() const
+{
     return _value;
 }
 
diff --git a/docs/user_guide/release_version_and_change_log.dox b/docs/user_guide/release_version_and_change_log.dox
index 801f1f0b0f..edc0c3b9e7 100644
--- a/docs/user_guide/release_version_and_change_log.dox
+++ b/docs/user_guide/release_version_and_change_log.dox
@@ -48,6 +48,13 @@ v23.08 Public major release
      - @ref opencl::kernels::ClMatMulNativeMMULKernel support for FP32 and FP16, with batch support
    - Enable transposed convolution with non-square kernels on CPU and GPU.
    - Added support for input data type U64/S64 in CLCast.
+   - Added new Compute Kernel Writer (CKW) subproject that offers a C++ interface to generate tile-based OpenCL code in just-in-time fashion.
+   - Port the following kernels in the experimental Dynamic Fusion interface to use the new Compute Kernel Writer interface with support for FP16/FP32 only:
+     - @ref experimental::dynamic_fusion::GpuCkwActivation
+     - @ref experimental::dynamic_fusion::GpuCkwCast
+     - @ref experimental::dynamic_fusion::GpuCkwDirectConv2d
+     - @ref experimental::dynamic_fusion::GpuCkwElementwiseBinary
+     - @ref experimental::dynamic_fusion::GpuCkwStore
  - Various optimizations and bug fixes.
 
 v23.05.1 Public patch release
diff --git a/filelist.json b/filelist.json
index b7845a760a..953b81de5a 100644
--- a/filelist.json
+++ b/filelist.json
@@ -2353,6 +2353,7 @@
         "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp",
         "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp",
         "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp",
+        "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp",
         "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp",
         "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp",
         "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.cpp",
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp
new file mode 100644
index 0000000000..3c906646a6
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp
@@ -0,0 +1,333 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "ckw/TensorTileSampler.h"
+#include "ckw/TileInfo.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+
+using TileContainer = std::vector<std::vector<std::string>>;
+
+GpuCkwDirectConv2d::GpuCkwDirectConv2d(ComponentId                      id,
+                                       const ArgumentPack<ITensorInfo> &tensors,
+                                       const Attributes                &attributes,
+                                       const Settings                  &settings)
+    : IGpuCkwComponentDriver{ id, tensors },
+      _src{},
+      _wei{},
+      _bia{},
+      _dst{},
+      _attributes{ attributes },
+      _settings{ settings }
+{
+    _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
+    _wei = this->tensors().get_const_tensor(TensorType::ACL_SRC_1);
+    _bia = this->tensors().get_const_tensor(TensorType::ACL_SRC_2);
+    _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _wei, _dst); // Bias can be null
+}
+
+void GpuCkwDirectConv2d::write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const
+{
+    const auto desc = _settings.direct_conv_descriptor();
+    ARM_COMPUTE_ERROR_ON_MSG(desc.export_input_to_cl_image || desc.export_output_to_cl_image,
+                             "Only the weights tensor can be exported to cl_image");
+
+    const unsigned int channel_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::CHANNEL);
+    const unsigned int width_idx   = get_data_layout_dimension_index(_wei->data_layout(), DataLayoutDimension::WIDTH);
+    const unsigned int height_idx  = get_data_layout_dimension_index(_wei->data_layout(), DataLayoutDimension::HEIGHT);
+
+    const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
+
+    // Tunable parameters
+    const int32_t m0         = root_window.y().step();
+    const int32_t n0         = root_window.x().step();
+    const int32_t k0         = adjust_vec_size(_settings.direct_conv_descriptor().k0, _src->dimension(channel_idx));
+    const int32_t partial_n0 = _dst->dimension(0) % n0;
+
+    const int32_t K = _src->dimension(channel_idx);
+
+    // Exporting the weights tensor to an OpenCL image object is currently only supported when:
+    //   a) k0 is equal to 4
+    // The current implementation expects to read a vector of 4 float values into the OpenCL image object.
+    //   b) K is a multiple of 4
+    // This is a limitation in the current interface due to the variable table being responsible for maintaining
+    // information about the TensorStorageType rather than the TensorTileSampler. As a result, TensorStorageType cannot
+    // be reassigned, and we cannot use a texture object for the weights tensor in cases where we expect to have an
+    // extra loop to compute the left-over elements.
+    const bool use_cl_image_for_weights = desc.export_weights_to_cl_image && (k0 == 4) && (K % 4 == 0);
+
+    GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src");
+    GpuCkwComponentArgument *wei = vtable.declare_variable(
+        comp_group, writer, _wei, use_cl_image_for_weights ? TensorStorageType::ClImage2dReadOnly : TensorStorageType::ClBufferUint8Ptr, "wei");
+    GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
+    GpuCkwComponentArgument *bia = nullptr;
+
+    const bool using_bias = _bia != nullptr;
+
+    if(using_bias)
+    {
+        bia = vtable.declare_variable(comp_group, writer, _bia, TensorStorageType::ClBufferUint8Ptr, "bia");
+    }
+
+    // Constants
+    const auto kernel_height    = static_cast<int32_t>(_wei->dimension(height_idx));
+    const auto kernel_width     = static_cast<int32_t>(_wei->dimension(width_idx));
+    const auto src_channels     = static_cast<int32_t>(_src->dimension(channel_idx));
+    auto      &tile_kernel_w    = writer->declare_tile("kernel_w", kernel_width);
+    auto      &tile_kernel_size = writer->declare_tile("kernel_size", kernel_width * kernel_height);
+    auto      &tile_src_c       = writer->declare_tile("src_c", static_cast<int32_t>(_src->dimension(channel_idx)));
+    auto      &tile_dst_w       = writer->declare_tile("dst_w", static_cast<int32_t>(_dst->dimension(width_idx)));
+    auto      &tile_stride_x    = writer->declare_tile("stride_x", static_cast<int32_t>(_attributes.stride().x()));
+    auto      &tile_stride_y    = writer->declare_tile("stride_y", static_cast<int32_t>(_attributes.stride().y()));
+    auto      &tile_pad_x       = writer->declare_tile("pad_x", static_cast<int32_t>(_attributes.pad().left));
+    auto      &tile_pad_y       = writer->declare_tile("pad_y", static_cast<int32_t>(_attributes.pad().top));
+    auto      &tile_k0          = writer->declare_tile("k0", k0);
+    auto      &tile_0           = writer->declare_tile("0", 0);
+    auto      &tile_1           = writer->declare_tile("1", 1);
+
+    auto &tile_gid_0 = writer->declare_tile("gid_0", ckw::DataType::Int32);
+    auto &tile_gid_1 = writer->declare_tile("gid_1", ckw::DataType::Int32);
+    auto &tile_gid_2 = writer->declare_tile("gid_2", ckw::DataType::Int32);
+
+    writer->op_get_global_id(tile_gid_0, 0);
+    writer->op_get_global_id(tile_gid_1, 1);
+    writer->op_get_global_id(tile_gid_2, 2);
+
+    auto &tile_cout = writer->declare_tile("cout", ckw::DataType::Int32); // OFM
+    auto &tile_mout = writer->declare_tile("mout", ckw::DataType::Int32); // WIDTH x HEIGHT
+    auto &tile_bout = writer->declare_tile("bout", ckw::DataType::Int32); // BATCH SIZE IDX
+
+    // Get the boundary aware coordinates at each global dimension index
+    get_coord(writer, tile_cout, tile_gid_0, n0, partial_n0, tile_cout.name() + "_dim0_", tile_0);
+    get_coord(writer, tile_mout, tile_gid_1, m0, 0, tile_mout.name() + "_dim1_", tile_0);
+    get_coord(writer, tile_bout, tile_gid_2, 1, 0, tile_bout.name() + "_dim2_", tile_0);
+
+    TensorTileSampler src_sampler;
+    src_sampler.width(k0);
+    src_sampler.height(m0);
+    src_sampler.format(TensorSamplerFormat::C_WH_1);
+    // We cannot have out-of-bounds reads in the X dimension (mapped to the IFMs) as we have an extra loop to
+    // compute left-over elements
+    src_sampler.address_mode_x(TensorSamplerAddressModeX::None);
+    // We cannot have out-of-bounds reads when the kernel height is equal to 1. Otherwise, we need to ensure the
+    // indirection buffer mi does not contain negative values representing out-of-bounds reads.
+    src_sampler.address_mode_y(kernel_height == 1 ? TensorSamplerAddressModeY::None : TensorSamplerAddressModeY::SkipMinEdgeOnly);
+    src_sampler.address_mode_z(TensorSamplerAddressModeZ::None);
+
+    TensorTileSampler wei_sampler;
+    wei_sampler.width(k0);
+    wei_sampler.height(n0);
+    wei_sampler.format(TensorSamplerFormat::C_WH_1);
+    // We cannot have out-of-bounds accesses for the weights
+    wei_sampler.address_mode_x(TensorSamplerAddressModeX::None);
+    wei_sampler.address_mode_y(TensorSamplerAddressModeY::None);
+    wei_sampler.address_mode_z(TensorSamplerAddressModeZ::None);
+
+    TensorTileSampler dst_sampler;
+    dst_sampler.width(n0);
+    dst_sampler.height(m0);
+    dst_sampler.format(TensorSamplerFormat::C_WH_1);
+    dst_sampler.address_mode_x(TensorSamplerAddressModeX::OverlappingMin);
+    dst_sampler.address_mode_y(TensorSamplerAddressModeY::ClampToMaxEdgeOnly);
+    dst_sampler.address_mode_z(TensorSamplerAddressModeZ::None);
+    dst_sampler.x(tile_cout);
+    dst_sampler.y(tile_mout);
+    dst_sampler.z(tile_0);
+    dst_sampler.b(tile_bout);
+
+    if(!dst->has_tile())
+    {
+        auto &tile = writer->declare_tile("dst", TileInfo(to_ckw(_dst->data_type()), m0, n0));
+        dst->init_virtual_tensor(tile, dst_sampler);
+    }
+    auto &tile_dst = dst->tile();
+
+    writer->op_assign(tile_dst, tile_0);
+
+    // We create a 2d container of size (M0, 1) to store the indices for iteration
+    TileContainer it;
+    for(int m = 0; m < m0; ++m)
+    {
+        std::vector<std::string> idx { std::to_string(m) };
+        it.push_back({ idx });
+    }
+    const auto &tile_it = writer->declare_tile("it", it, ckw::DataType::Int32);
+
+    auto &tile_xi = writer->declare_tile("xi", TileInfo(ckw::DataType::Int32, m0, 1));
+    auto &tile_yi = writer->declare_tile("yi", TileInfo(ckw::DataType::Int32, m0, 1));
+
+    // Convert the linear index to coordinate
+    // xi = ((mout + i) % dst_w) * stride_x - pad_x
+    // yi = ((mout + i) / dst_w) * stride_y - pad_y
+    writer->op_binary_expression(tile_xi, tile_mout, BinaryOp::Add, tile_it);
+    writer->op_binary_expression(tile_yi, tile_mout, BinaryOp::Add, tile_it);
+    writer->op_binary_expression(tile_xi, tile_xi, BinaryOp::Mod, tile_dst_w);
+    writer->op_binary_expression(tile_yi, tile_yi, BinaryOp::Div, tile_dst_w);
+    writer->op_binary_expression(tile_xi, tile_xi, BinaryOp::Mul, tile_stride_x);
+    writer->op_binary_expression(tile_yi, tile_yi, BinaryOp::Mul, tile_stride_y);
+    writer->op_binary_expression(tile_xi, tile_xi, BinaryOp::Sub, tile_pad_x);
+    writer->op_binary_expression(tile_yi, tile_yi, BinaryOp::Sub, tile_pad_y);
+
+    auto &tile_y_b = writer->declare_tile("y_b", ckw::DataType::Int32);
+    writer->op_binary_expression(tile_y_b, tile_cout, BinaryOp::Mul, tile_kernel_size);
+
+    auto &tile_i = writer->declare_tile("i", ckw::DataType::Int32);
+    writer->op_assign(tile_i, tile_0);
+
+    // clang-format off
+    writer->op_for_loop(tile_i, BinaryOp::Less, tile_kernel_size, tile_i, AssignmentOp::Increment, tile_1, [&]()
+    {
+        auto &tile_x_k = writer->declare_tile("x_k", ckw::DataType::Int32);
+        auto &tile_y_k = writer->declare_tile("y_k", ckw::DataType::Int32);
+
+        writer->op_binary_expression(tile_x_k, tile_i, BinaryOp::Mod, tile_kernel_w);
+        writer->op_binary_expression(tile_y_k, tile_i, BinaryOp::Div, tile_kernel_w);
+
+        auto &tile_ck = writer->declare_tile("ck", ckw::DataType::Int32);
+        writer->op_assign(tile_ck, tile_0);
+
+        auto &tile_mi = writer->declare_tile("mi", TileInfo(ckw::DataType::Int32, m0, 1));
+        // Construct an indirection buffer containing the precalculated addresses of elements in the source tensor
+        // x_s = xi + x_k
+        // y_s = yi + y_k
+        // mi = x_s + y_s * width;
+        // mi = select(-1, mi, x_s >= 0);
+        // mi = select(-1, mi, x_s < width);
+        // mi = select(-1, mi, y_s >= 0);
+        // mi = select(-1, mi, y_s < height);
+        writer->util_get_indirect_buffer(tile_mi, src->tensor(), src_sampler, tile_xi, tile_yi, tile_x_k, tile_y_k);
+
+        src_sampler.x(tile_ck);
+        src_sampler.y(tile_mi);
+        src_sampler.z(tile_0);
+        src_sampler.b(tile_bout);
+
+        wei_sampler.x(tile_ck);
+        wei_sampler.y(tile_y_b);
+        wei_sampler.z(tile_0);
+        wei_sampler.b(tile_0);
+
+        auto &tile_src_c_minus_k0 = writer->declare_tile("src_c_minus_k0", src_channels - k0);
+
+        writer->op_for_loop(tile_ck, BinaryOp::LessEqual, tile_src_c_minus_k0, tile_ck, AssignmentOp::Increment, tile_k0, [&]()
+        {
+            auto &tile_lhs = writer->declare_tile("lhs", TileInfo(to_ckw(_src->data_type()), m0, k0));
+            auto &tile_rhs = writer->declare_tile("rhs", TileInfo(to_ckw(_wei->data_type()), n0, k0));
+            writer->op_assign(tile_lhs, tile_0);
+            writer->op_assign(tile_rhs, tile_0);
+
+            writer->op_load_indirect(tile_lhs, src->tensor(), src_sampler);
+            writer->op_load(tile_rhs, wei->tensor(), wei_sampler, tile_kernel_size);
+
+            writer->op_binary_expression(tile_dst, tile_lhs, BinaryOp::MatMul_Nt_T, tile_rhs);
+        });
+
+        // Left-over accumulations for when K is not a multiple of k0
+        if(!(K % k0 == 0))
+        {
+            writer->op_for_loop(tile_ck, BinaryOp::Less, tile_src_c, tile_ck, AssignmentOp::Increment, tile_1, [&]()
+            {
+                auto &tile_lhs = writer->declare_tile("lhs_leftover", TileInfo(to_ckw(_src->data_type()), m0, 1));
+                auto &tile_rhs = writer->declare_tile("rhs_leftover", TileInfo(to_ckw(_wei->data_type()), n0, 1));
+                writer->op_assign(tile_lhs, tile_0);
+                writer->op_assign(tile_rhs, tile_0);
+
+                writer->op_load_indirect(tile_lhs, src->tensor(), src_sampler);
+                writer->op_load(tile_rhs, wei->tensor(), wei_sampler, tile_kernel_size);
+
+                writer->op_binary_expression(tile_dst, tile_lhs, BinaryOp::MatMul_Nt_T, tile_rhs);
+            });
+        }
+
+    writer->op_binary_expression(tile_y_b, tile_y_b, BinaryOp::Add, tile_1);
+    });
+    // clang-format on
+
+    // Bias addition
+    // NOTE: This operation will be removed from this kernel as the interface is standardized. The intended way of
+    // performing bias addition is to fuse this convolution kernel with a following elementwise addition kernel.
+    if(using_bias)
+    {
+        if(!bia->has_tile())
+        {
+            // Reuse the destination sampler for the bias
+            writer->op_load_once(bia, dst_sampler);
+        }
+        auto &tile_bia = bia->tile();
+
+        writer->op_binary_expression(tile_dst, tile_dst, BinaryOp::Add, tile_bia);
+    }
+}
+
+Window GpuCkwDirectConv2d::get_window() const
+{
+    ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
+
+    const auto dst_shape = _dst->tensor_shape();
+    const auto desc      = _settings.direct_conv_descriptor();
+
+    const unsigned int n0 = adjust_vec_size(desc.n0, dst_shape[0]);
+    const unsigned int m0 = adjust_vec_size(desc.m0, dst_shape[1] * dst_shape[2]);
+
+    Window win = calculate_max_window(dst_shape, Steps(n0, m0));
+
+    const size_t dim_y_collapsed = ceil_to_multiple(dst_shape[1] * dst_shape[2], m0);
+    win.set(Window::DimY, Window::Dimension(0, dim_y_collapsed, m0));
+    win.set(Window::DimZ, Window::Dimension(0, dst_shape.total_size_upper(3), 1));
+
+    return win;
+}
+
+std::string GpuCkwDirectConv2d::get_name(const ComponentGroup &comp_group) const
+{
+    ARM_COMPUTE_UNUSED(comp_group);
+
+    return "direct_conv2d";
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h
new file mode 100644
index 0000000000..ac32d2df32
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWDIRECTCONV2D
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWDIRECTCONV2D
+
+#include "arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h"
+
+#include "src/core/common/Macros.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class GpuCkwDirectConv2d final : public IGpuCkwComponentDriver
+{
+public:
+    using Attributes = ClComponentDirectConv2d::Attributes;
+    using Settings   = ClComponentDirectConv2d::Settings;
+
+public:
+    /** Constructor
+     *
+     * For supported configurations please refer to @ref ClComponentDirectConv2d::validate()
+     *
+     * @param[in] id         Component id
+     * @param[in] tensors    Tensor arguments to the component
+     * @param[in] attributes Component attributes. Attributes are a set of parameters that define what a component does
+     * @param[in] settings   Component settings. Settings are a set of parameters that influence the implementation of a component
+     */
+    GpuCkwDirectConv2d(ComponentId                      id,
+                       const ArgumentPack<ITensorInfo> &tensors,
+                       const Attributes                &attributes,
+                       const Settings                  &settings);
+
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwDirectConv2d);
+
+    /** Destructor */
+    ~GpuCkwDirectConv2d() override = default;
+
+    // Inherited methods overriden
+    virtual void write_component_code(const ComponentGroup    &comp_group,
+                                      GpuCkwVariableTable     &vtable,
+                                      GpuCkwScopedKernelWriter writer) const override;
+    Window       get_window() const override;
+    std::string  get_name(const ComponentGroup &comp_group) const override;
+
+private:
+    const ITensorInfo *_src;
+    const ITensorInfo *_wei;
+    const ITensorInfo *_bia;
+    const ITensorInfo *_dst;
+
+    const Attributes _attributes;
+    const Settings   _settings;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif /* ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWDIRECTCONV2D */
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp
index 3965deced1..a713c82003 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp
@@ -26,8 +26,14 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h"
+
 #include "src/core/CL/CLValidate.h"
+
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
 #include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h"
+#else // ACL_INTERNAL_TEST_CKW_IN_DF
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h"
+#endif // ACL_INTERNAL_TEST_CKW_IN_DF
 
 namespace arm_compute
 {
@@ -145,16 +151,27 @@ ClComponentDirectConv2d::ClComponentDirectConv2d(
     const Attributes                &attributes,
     const Settings                  &settings)
     : IGpuKernelComponent{ id, properties, tensors },
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
       _component_writer{ std::make_unique<ClTemplateDirectConv2d>(id, tensors, attributes, settings) }
+#else // ACL_INTERNAL_TEST_CKW_IN_DF
+      _component_writer{ std::make_unique<GpuCkwDirectConv2d>(id, tensors, attributes, settings) }
+#endif // ACL_INTERNAL_TEST_CKW_IN_DF
 {
 }
+
 ClComponentDirectConv2d::~ClComponentDirectConv2d()
 {
 }
+
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
 const IGpuTemplateComponentWriter *ClComponentDirectConv2d::template_writer() const
+#else // ACL_INTERNAL_TEST_CKW_IN_DF
+const IGpuCkwComponentDriver *ClComponentDirectConv2d::ckw_component_driver() const
+#endif // ACL_INTERNAL_TEST_CKW_IN_DF
 {
     return _component_writer.get();
 }
+
 } // namespace dynamic_fusion
 } // namespace experimental
 } // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h
index 8e555dce57..24acb1b2c1 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h
@@ -66,7 +66,11 @@ private:
 };
 
 /** Forward declaration */
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
 class ClTemplateDirectConv2d;
+#else  // ACL_INTERNAL_TEST_CKW_IN_DF
+class GpuCkwDirectConv2d;
+#endif // ACL_INTERNAL_TEST_CKW_IN_DF
 
 class ClComponentDirectConv2d final : public IGpuKernelComponent
 {
@@ -134,8 +138,12 @@ public:
     ClComponentDirectConv2d(ClComponentDirectConv2d &&component) = default;
     /** Allow instances of this class to be moved */
     ClComponentDirectConv2d &operator=(ClComponentDirectConv2d &&component) = default;
-    /** Get template writer for the component */
+    /** Get writer for the component */
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
     const IGpuTemplateComponentWriter *template_writer() const override;
+#else  // ACL_INTERNAL_TEST_CKW_IN_DF
+    const IGpuCkwComponentDriver *ckw_component_driver() const override;
+#endif // ACL_INTERNAL_TEST_CKW_IN_DF
     /** Get component type */
     GpuComponentType type() const override
     {
@@ -143,7 +151,11 @@ public:
     }
 
 private:
+#ifndef ACL_INTERNAL_TEST_CKW_IN_DF
     std::unique_ptr<ClTemplateDirectConv2d> _component_writer;
+#else  // ACL_INTERNAL_TEST_CKW_IN_DF
+    std::unique_ptr<GpuCkwDirectConv2d> _component_writer;
+#endif // ACL_INTERNAL_TEST_CKW_IN_DF
 };
 } // namespace dynamic_fusion
 } // namespace experimental
diff --git a/tests/validation/dynamic_fusion/gpu/cl/DirectConv2d.cpp b/tests/validation/dynamic_fusion/gpu/cl/DirectConv2d.cpp
index 5ab1fafe2f..f27a1796c9 100644
--- a/tests/validation/dynamic_fusion/gpu/cl/DirectConv2d.cpp
+++ b/tests/validation/dynamic_fusion/gpu/cl/DirectConv2d.cpp
@@ -22,7 +22,6 @@
  * SOFTWARE.
  */
 
-#ifndef ACL_INTERNAL_TEST_CKW_IN_DF // Do not include this test if ACL_INTERNAL_TEST_CKW_IN_DF and the op has not been ported to ckw
 #include "tests/AssetsLibrary.h"
 #include "tests/CL/CLAccessor.h"
 #include "tests/framework/Fixture.h"
@@ -250,5 +249,3 @@ TEST_SUITE_END() // CL
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
-
-#endif // ACL_INTERNAL_TEST_CKW_IN_DF