diff options
Diffstat (limited to 'src/dynamic_fusion/sketch/gpu/template_writer/cl')
6 files changed, 1100 insertions, 0 deletions
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp new file mode 100644 index 0000000000..870de64eb8 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp @@ -0,0 +1,400 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "ClTemplateDirectConv2d.h" + +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h" + +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/helpers/WindowHelpers.h" + +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +ClTemplateDirectConv2d::ClTemplateDirectConv2d(ComponentId id, + const ArgumentPack<ITensorInfo> &tensors, + const Attributes &attributes, + const Settings &settings) + : IGpuTemplateComponentWriter{ id, tensors }, + _src{}, + _weight{}, + _bias{}, + _dst{}, + _attributes{ attributes }, + _settings{ settings } +{ + _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); + _weight = this->tensors().get_const_tensor(TensorType::ACL_SRC_1); + if(this->tensors().get_const_tensor(TensorType::ACL_SRC_2)) + { + _bias = this->tensors().get_const_tensor(TensorType::ACL_SRC_2); + } + _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); + ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _weight, _dst); +} + +std::string ClTemplateDirectConv2d::get_name() const +{ + return "direct_conv2d"; +} + +std::string ClTemplateDirectConv2d::get_component_code(const ComponentGroup &comp_group) const +{ + ARM_COMPUTE_UNUSED(comp_group); + + const auto channel_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::CHANNEL); + const auto k0 = adjust_vec_size(is_data_type_quantized(_src->data_type()) ? 
16u : 8u, _src->dimension(channel_idx)); + const bool leftover_loop = (_src->dimension(channel_idx) % k0) != 0; + + std::string code = R"_( +//------------------ START KERNEL {{meta_kernel_id}} --------------------- +// IN_0(src) {{src}} +// IN_1(wei) {{weight}} +)_"; + if(_bias && _bias->has_valid_id()) + { + code += R"_( +// IN_1(bia) {{bias}} +)_"; + } + code += R"_( +// OUT(dst, accum) {{dst}} + +// Initialize the accumulators +TILE({{ACC_DATA_TYPE}}, M0, N0, {{dst}}); +{ + // All the tensor dimensions are passed at compile time. + // In case of dynamic tensor support, the following dimensions should be passed as function argument. +#define _IWEI_WIDTH {{WEI_WIDTH}} +#define _IWEI_HEIGHT {{WEI_HEIGHT}} +#define _ISRC_WIDTH {{src}}_w +#define _ISRC_HEIGHT {{src}}_h +#define _ISRC_CHANNELS {{src}}_c +#define _IDST_WIDTH {{arg_dst}}_w +#define _IDST_HEIGHT {{arg_dst}}_h +#define _IDST_CHANNELS {{arg_dst}}_c +#define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT) + + // .v = access the whole vector (OpenCL vector) + // .s[x] = access the vector element at position x (scalar access) + TILE(int, M0, 1, xi); + TILE(int, M0, 1, yi); + + // Convert the linear index to coordinate + LOOP_UNROLLING(int, i, 0, 1, M0, + { + xi[i].v = ((g_ind_1 + i) % _IDST_WIDTH) * {{STRIDE_X}}; + yi[i].v = ((g_ind_1 + i) / _IDST_WIDTH) * {{STRIDE_Y}}; + xi[i].v -= {{PAD_LEFT}}; + yi[i].v -= {{PAD_TOP}}; + }) + + LOOP_UNROLLING(int, i, 0, 1, M0, + { + {{dst}}[i].v = 0; + }) + + for(int i = 0; i < (_IWEI_WIDTH * _IWEI_HEIGHT); ++i) + { + int ck = 0; + int xk = i % _IWEI_WIDTH; + int yk = i / _IWEI_WIDTH; + + int k = 0; + for(; k <= (_ISRC_CHANNELS - K0); k += K0) + { + TILE({{SRC_DATA_TYPE}}, M0, K0, a); + TILE({{WEI_DATA_TYPE}}, N0, K0, b); + + // Initialize tiles + LOOP_UNROLLING(int, i, 0, 1, M0, + { + a[i].v = {{ZERO_VALUE}}; + }) + + LOOP_UNROLLING(int, i, 0, 1, N0, + { + b[i].v = {{ZERO_VALUE}}; + }) + + // Load tile from the src tensor + T_LOAD_NHWC_INDIRECT({{SRC_DATA_TYPE}}, M0, K0, 
{{SRC_TENSOR_TYPE}}, {{src}}, g_ind_2, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, {{src}}_stride_y, xi, yi, a); + + // Load tile from the weights tensor + T_LOAD({{WEI_DATA_TYPE}}, N0, K0, {{WEI_TENSOR_TYPE}}, {{weight}}, ck, g_ind_0 * _IY_MULTIPLIER + i, _IY_MULTIPLIER, {{weight}}_stride_y, b); + + // Compute the matrix multiplication between two tiles + T_MMUL({{SRC_DATA_TYPE}}, {{WEI_DATA_TYPE}}, {{ACC_DATA_TYPE}}, M0, N0, K0, NT, T, a, b, {{dst}}); + + ck += K0; + } + + // We voluntarily use SRC_CHANNELS rather than _DSRC_CHANNELS + // This #if directive should be removed in case of dynamic tensor support +)_"; + + if(leftover_loop) + { + code += R"_( + // Left-over accumulations + for(; k < _ISRC_CHANNELS; ++k) + { + TILE({{SRC_DATA_TYPE}}, M0, 1, a); + TILE({{WEI_DATA_TYPE}}, N0, 1, b); + + // Initialize tiles + LOOP_UNROLLING(int, i, 0, 1, M0, + { + a[i].v = {{ZERO_VALUE}}; + }) + + LOOP_UNROLLING(int, i, 0, 1, N0, + { + b[i].v = {{ZERO_VALUE}}; + }) + + // Load tile from the src tensor + T_LOAD_NHWC_INDIRECT({{SRC_DATA_TYPE}}, M0, 1, {{SRC_TENSOR_TYPE}}, {{src}}, g_ind_2, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, {{src}}_stride_y, xi, yi, a); + + // Load tile from the weights tensor + // The T_LOAD for the left-over elements can only use BUFFER because we load one element per iteration + T_LOAD({{WEI_DATA_TYPE}}, N0, 1, BUFFER, {{weight}}, ck, g_ind_0 * _IY_MULTIPLIER + i, _IY_MULTIPLIER, {{weight}}_stride_y, b); + + // Compute the matrix multiplication between two tiles + T_MMUL({{SRC_DATA_TYPE}}, {{WEI_DATA_TYPE}}, {{ACC_DATA_TYPE}}, M0, N0, 1, NT, T, a, b, {{dst}}); + + ++ck; + } + )_"; +} + +code += R"_( +#undef _IWEI_WIDTH +#undef _IWEI_HEIGHT +#undef _ISRC_WIDTH +#undef _ISRC_HEIGHT +#undef _ISRC_CHANNELS +#undef _IDST_WIDTH +#undef _IDST_HEIGHT +#undef _IDST_CHANNELS +#undef _IY_MULTIPLIER + + } +)_"; + + if(_bias && _bias->has_valid_id()) + { + code += R"_( + TILE({{BIA_DATA_TYPE}}, 1, N0, bias0); + + T_LOAD({{BIA_DATA_TYPE}}, 1, N0, BUFFER, {{bias}}, 
g_ind_0, 0, 1, 0, bias0); + + // c = c + bias[broadcasted] + T_ELTWISE_BROADCAST_ADD_X({{ACC_DATA_TYPE}}, M0, N0, {{dst}}, bias0, {{dst}}); + )_"; +} + +code += R"_( +} +//------------------ END KERNEL {{meta_kernel_id}} --------------------- +)_"; + return code; +} + +void ClTemplateDirectConv2d::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const +{ + vtable.declare_variable( + _src, + GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), + comp_group.is_intermediate_tensor(_src), + "src"); + + const GpuKernelArgumentInfo::Type weight_type = _settings.export_to_cl_image() ? GpuKernelArgumentInfo::Type::Tensor_4D_t_Image : GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer; + vtable.declare_variable( + _weight, + GpuKernelArgumentInfo(weight_type), + comp_group.is_intermediate_tensor(_weight), + "weight"); + + if(_bias && _bias->has_valid_id()) // optional bias + { + vtable.declare_variable( + _bias, + GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Vector), + comp_group.is_intermediate_tensor(_bias), + "bias"); + } + vtable.declare_variable( + _dst, + GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), + comp_group.is_intermediate_tensor(_dst), + "dst"); +} + +TagLUT ClTemplateDirectConv2d::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const +{ + TagLUT lut{}; + // Arguments and global shared variables + lut["src"] = vtable.get_variable(_src); + lut["weight"] = vtable.get_variable(_weight); + + if(_bias && _bias->has_valid_id()) // optional bias + { + lut["bias"] = vtable.get_variable(_bias); + lut["BIA_DATA_TYPE"] = get_cl_type_from_data_type(_bias->data_type()); + } + lut["dst"] = vtable.get_variable(_dst); + + const auto dst_argument = vtable.get_variable(comp_group.get_dst_tensors()[0]); + lut["arg_dst"] = dst_argument.uniq_name; + + // Local build options + lut["meta_kernel_id"] = id(); + lut["ACC_DATA_TYPE"] = _src->data_type(); + 
lut["SRC_DATA_TYPE"] = _src->data_type(); + lut["WEI_DATA_TYPE"] = _weight->data_type(); + + lut["SRC_TENSOR_TYPE"] = "BUFFER"; + switch(vtable.get_variable(_weight).kernel_argument_info.type) + { + case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D: + case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D: + case GpuKernelArgumentInfo::Type::Tensor_4D_t_Image: + { + lut["WEI_TENSOR_TYPE"] = "IMAGE"; + break; + } + default: + { + lut["WEI_TENSOR_TYPE"] = "BUFFER"; + break; + } + } + const auto width_idx = 1; + const auto height_idx = 2; + lut["WEI_WIDTH"] = _weight->dimension(width_idx); + lut["WEI_HEIGHT"] = _weight->dimension(height_idx); + + lut["STRIDE_X"] = _attributes.stride().x(); + lut["STRIDE_Y"] = _attributes.stride().y(); + + lut["PAD_LEFT"] = _attributes.pad().left; + lut["PAD_TOP"] = _attributes.pad().top; + + lut["ZERO_VALUE"] = 0; + + return lut; +} + +CLBuildOptions ClTemplateDirectConv2d::get_build_options(const ComponentGroup &comp_group) const +{ + const unsigned int channel_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::CHANNEL); + const DataType data_type = _src->data_type(); + + /// NOTE: For now tile sizes (n0, m0, n0) are set by the execution window. This may change in the future + const auto root_window = comp_group.get_root_component()->template_writer()->get_window(); + const unsigned int n0 = root_window.x().step(); + const unsigned int m0 = root_window.y().step(); + const unsigned int k0 = adjust_vec_size(is_data_type_quantized(data_type) ? 
16u : 8u, _src->dimension(channel_idx)); + const unsigned int partial_store_n0 = _dst->dimension(0) % n0; + + CLBuildOptions build_opts{}; + if(_settings.fast_relaxed_math()) + { + build_opts.add_option("-cl-fast-relaxed-math"); + } + else + { + // -cl-fast-relaxed-math also sets -cl-finite-math-only and -cl-unsafe-math-optimizations + // to disable -cl-finite-math-only, we only include -cl-unsafe-math-optimizations + build_opts.add_option("-cl-unsafe-math-optimizations"); + } + build_opts.add_option("-DIS_TILED"); + build_opts.add_option("-DN0=" + support::cpp11::to_string(n0)); + build_opts.add_option("-DM0=" + support::cpp11::to_string(m0)); + build_opts.add_option("-DK0=" + support::cpp11::to_string(k0)); + build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0)); + + return build_opts; +} + +std::string ClTemplateDirectConv2d::get_config_id() const +{ + const DataType data_type = _src->data_type(); + const DataLayout data_layout = _src->data_layout(); + + const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + + const unsigned int kernel_size = _weight->dimension(width_idx); + + std::string config_id{}; + config_id += lower_string(string_from_data_type(data_type)); + config_id += "_"; + config_id += support::cpp11::to_string(kernel_size); + config_id += "_"; + config_id += support::cpp11::to_string(_attributes.stride().x()); + config_id += "_"; + config_id += support::cpp11::to_string(_attributes.stride().y()); + config_id += "_"; + config_id += support::cpp11::to_string(_dst->dimension(width_idx)); + config_id += "_"; + config_id += support::cpp11::to_string(_dst->dimension(height_idx)); + config_id += "_"; + config_id += lower_string(string_from_data_layout(data_layout)); + return config_id; +} + +std::set<std::string> ClTemplateDirectConv2d::get_headers_list() const +{ + 
return std::set<std::string>{ "helpers.h", "tile_helpers.h" }; +} + +Window ClTemplateDirectConv2d::get_window() const +{ + ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized"); + + const auto output_shape = _dst->tensor_shape(); + + const unsigned int vec_size = std::min(static_cast<unsigned int>(output_shape[0]), 4u); + const unsigned int num_rows = (_dst->tensor_shape()[0] > 16) ? ((_src->data_type() == DataType::F32) ? 2U : 4U) : 1U; + + // Create and configure kernel window + Window win = calculate_max_window(output_shape, Steps(vec_size, num_rows)); + + const size_t dim_y_collapsed = ceil_to_multiple(output_shape[1] * output_shape[2], num_rows); + win.set(Window::DimY, Window::Dimension(0, dim_y_collapsed, num_rows)); + win.set(Window::DimZ, Window::Dimension(0, output_shape.total_size_upper(3), 1)); + + return win; +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h new file mode 100644 index 0000000000..48027a9b8d --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEDIRECTCONV2D +#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEDIRECTCONV2D + +#include "arm_compute/core/experimental/Types.h" +#include "arm_compute/dynamic_fusion/sketch/OperatorAttributes.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h" +#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h" +#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +class ClTemplateDirectConv2d final : public IGpuTemplateComponentWriter +{ +public: + using Attributes = ClComponentDirectConv2d::Attributes; + using Settings = ClComponentDirectConv2d::Settings; + /** Constructor + * + * Similar to @ref ClComponentDirectConv2d::validate() + * + * @param[in] id Component id + * @param[in] tensors Tensor arguments to the components + * @param[in] attributes Component attributes + * @param[in] settings Component settings + */ + ClTemplateDirectConv2d(ComponentId id, + const ArgumentPack<ITensorInfo> &tensors, + const Attributes &attributes, + const Settings &settings); + /** Prevent instances of this class from being copy constructed */ + ClTemplateDirectConv2d(const ClTemplateDirectConv2d &direct_conv2d) = delete; + /** Prevent instances of this class from being copied */ + ClTemplateDirectConv2d &operator=(const ClTemplateDirectConv2d 
&direct_conv2d) = delete; + /** Allow instances of this class to be move constructed */ + ClTemplateDirectConv2d(ClTemplateDirectConv2d &&direct_conv2d) = default; + /** Allow instances of this class to be moved */ + ClTemplateDirectConv2d &operator=(ClTemplateDirectConv2d &&direct_conv2d) = default; + /** Generate kernel component name */ + std::string get_name() const override; + /** Generate kernel component code template + * + * @param[in] comp_group Component group of which the component is a part of + * + * @return std::string Component code + */ + std::string get_component_code(const ComponentGroup &comp_group) const override; + /** Declare all variables used by the component in the @p vtable + * + * @param[out] vtable Variable table + * @param[in] comp_group Component group of which the component is a part of + */ + void declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override; + /** Generate the tag look-up table used to instantiate the component code. 
+ * + * @param[in] vtable Variable table + * @param[in] comp_group Component group of which the component is a part of + * + * @return TagLUT Tag lookup table + */ + TagLUT get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override; + /** Generate the build options used in the component + * + * @param[in] comp_group Component group of which the component is a part of + * + * @return CLBuildOptions Build options + */ + CLBuildOptions get_build_options(const ComponentGroup &comp_group) const override; + /** Generate the component config id string used for tuning */ + std::string get_config_id() const override; + /** Generate the header list used in the component */ + std::set<std::string> get_headers_list() const override; + /** Generate the execution window for the component */ + Window get_window() const override; + +private: + const ITensorInfo *_src; + const ITensorInfo *_weight; + const ITensorInfo *_bias; + const ITensorInfo *_dst; + Attributes _attributes; + Settings _settings; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEDIRECTCONV2D */ diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp new file mode 100644 index 0000000000..6c4b8f52f2 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "ClTemplateStore.h" + +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +ClTemplateStore::ClTemplateStore(ComponentId id, const ArgumentPack<ITensorInfo> &tensors) + : IGpuTemplateComponentWriter{ id, tensors }, _src{}, _dst{} +{ + _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); + _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); +} + +std::string ClTemplateStore::get_name() const +{ + return "store"; +} + +std::string ClTemplateStore::get_component_code(const ComponentGroup &comp_group) const +{ + ARM_COMPUTE_UNUSED(comp_group); + return R"_( +//------------------ START KERNEL {{meta_kernel_id}} STORE --------------------- +{ +// This also follows NHWC layout +// g_ind_0 maps to global_id(0) maps to Channel +// g_ind_1 maps to global_id(1) maps to Height and Weight (Collapsed Window) +// g_ind_2 maps to global_id(2) maps to N / Batch +#define _IDST_WIDTH {{dst}}_w +#define _IDST_HEIGHT {{dst}}_h + TILE(uint, M0, 1, dst_indirect_y); + + // Calculate the destination indirect Y + LOOP_UNROLLING(int, i, 0, 1, M0, + { + dst_indirect_y[i].v = (uint)min(g_ind_1 + i, (int)(_IDST_WIDTH * _IDST_HEIGHT) - 1); + dst_indirect_y[i].v += g_ind_2 * (int)(_IDST_WIDTH * _IDST_HEIGHT); + }) + + bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0; + + T_STORE_INDIRECT_WIDTH_SELECT({{DST_DATA_TYPE}}, M0, N0, PARTIAL_N0, {{DST_TENSOR_TYPE}}, {{dst}}, g_ind_0, {{dst}}_stride_y, x_cond, {{src}}, dst_indirect_y); + +#undef _IDST_WIDTH +#undef _IDST_HEIGHT + //------------------ END KERNEL {{meta_kernel_id}} STORE --------------------- +} + +)_"; +} + +void ClTemplateStore::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const +{ + // ARM_COMPUTE_UNUSED(comp_group) + vtable.declare_variable( + _src, + GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), + 
comp_group.is_intermediate_tensor(_src), + "src"); + vtable.declare_variable( + _dst, + GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), + comp_group.is_intermediate_tensor(_dst), + "dst"); +} + +TagLUT ClTemplateStore::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const +{ + TagLUT lut{}; + + // Arguments and global shared variables + lut["src"] = vtable.get_variable(_src); + lut["dst"] = vtable.get_variable(_dst); + + // Local build options + lut["meta_kernel_id"] = id(); + lut["DST_TENSOR_TYPE"] = "BUFFER"; + const auto dst_info = comp_group.get_dst_tensors()[0]; + lut["DST_DATA_TYPE"] = dst_info->data_type(); + + return lut; +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h new file mode 100644 index 0000000000..3f97a82204 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATESTORE +#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATESTORE + +#include "arm_compute/core/experimental/Types.h" +#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h" +#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +class ClTemplateStore final : public IGpuTemplateComponentWriter +{ +public: + /** Constructor + * + * @param[in] id Component id + * @param[in] tensors Tensor arguments to the components + */ + ClTemplateStore(ComponentId id, const ArgumentPack<ITensorInfo> &tensors); + /** Prevent instances of this class from being copy constructed */ + ClTemplateStore(const ClTemplateStore &store) = delete; + /** Prevent instances of this class from being copied */ + ClTemplateStore &operator=(const ClTemplateStore &store) = delete; + /** Allow instances of this class to be move constructed */ + ClTemplateStore(ClTemplateStore &&store) = default; + /** Allow instances of this class to be moved */ + ClTemplateStore &operator=(ClTemplateStore &&store) = default; + /** Generate kernel component name */ + std::string get_name() const override; + /** Generate kernel component code template + * + * @param[in] comp_group Component group of which the component is a part of + * + * @return std::string Component code + */ + std::string get_component_code(const ComponentGroup &comp_group) const override; + /** Declare all variables used by the component in the @p vtable + * + * @param[out] vtable Variable table + * @param[in] comp_group Component group of which the 
component is a part of + */ + void declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override; + /** Generate the tag look-up table used to instantiate the component code. + * + * @param[in] vtable Variable table + * @param[in] comp_group Component group of which the component is a part of + * + * @return TagLUT Tag lookup table + */ + TagLUT get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override; + +private: + const ITensorInfo *_src; + const ITensorInfo *_dst; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATESTORE */ diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp new file mode 100644 index 0000000000..cb643a741d --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp @@ -0,0 +1,297 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "ClTemplateWriter.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" +#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +/// @note: some tags can be unused since they could be used only for the macros, or only for the component code +std::string ClTemplateWriter::replace_tags(const std::string &code_template, const TagLUT &tags) +{ + std::string replaced_code = ""; + bool scanning_pattern = false; + std::string pattern_found = ""; + for(size_t i = 0; i < code_template.size() - 1; ++i) + { + if(!scanning_pattern) + { + if(code_template[i] == '{' && code_template[i + 1] == '{') + { + i += 1; + scanning_pattern = true; + pattern_found = ""; + } + else + { + replaced_code += code_template[i]; + } + } + else + { + if(code_template[i] == '}' && code_template[i + 1] == '}') + { + i += 1; + scanning_pattern = false; + std::string err = "Pattern " + pattern_found + " not found in tags"; + ARM_COMPUTE_ERROR_ON_MSG(tags.find(pattern_found) == tags.end(), err.c_str()); + replaced_code += tags.find(pattern_found)->second.value; + } + else + { + pattern_found += code_template[i]; + } + } + } + + return replaced_code; +} +ClTemplateWriter::~ClTemplateWriter() +{ +} +ClTemplateWriter::ClTemplateWriter(const GpuKernelComponentGroup &components) + : _components{ components } +{ +} +std::string ClTemplateWriter::get_name() +{ + return write_kernel_name(); +} +std::string ClTemplateWriter::get_code() +{ + return write_code(); +} +std::string ClTemplateWriter::get_config_id() +{ + std::string config_id = 
get_name(); + for(const auto &comp : _components) + { + config_id += "--" + comp->template_writer()->get_config_id() + "--"; + } + + return config_id; +} + +CLBuildOptions ClTemplateWriter::get_build_options() +{ + CLBuildOptions build_opts{}; + + for(const auto &comp : _components) + { + build_opts.add_options(comp->template_writer()->get_build_options(_components).options()); + } + + return build_opts; +} + +Window ClTemplateWriter::get_window() const +{ + const auto root_comp = _components.get_root_component(); + ARM_COMPUTE_ERROR_ON_MSG(root_comp == nullptr, "No root component found"); + return root_comp->template_writer()->get_window(); +} + +std::map<ITensorInfo::Id, GpuKernelArgument> ClTemplateWriter::get_tensors() +{ + // Assemble GpuKernelArguments + std::map<ITensorInfo::Id, GpuKernelArgument> tensors; + for(const auto t : _components.get_argument_tensors()) + { + tensors.emplace( + t->id(), + GpuKernelArgument{ *t, _vtable.get_variable(t).kernel_argument_info }); + } + return tensors; +} + +std::string ClTemplateWriter::write_code() +{ + ARM_COMPUTE_ERROR_ON_MSG(_components.empty(), "No components found"); + + // These data structures will hold the data from all the components in the blueprint + std::set<std::string> headers_list{}; + std::set<std::string> additional_macros{}; + std::vector<std::string> component_codes{}; // vector because order matters + + // Pass 1: Declare all kernel variables + for(auto &component : _components) + { + component->template_writer()->declare_variables(_vtable, _components); + } + // Pass 2: Generate component codes + for(auto &component : _components) + { + const auto component_writer = component->template_writer(); + auto curr_headers_list = component_writer->get_headers_list(); + auto curr_additional_macros = component_writer->get_additional_macros(); + auto curr_component_code = component_writer->get_component_code(_components); + const auto var_lut = component_writer->get_tag_lut(_vtable, _components); // Ideally 
can be merged with get_component_code once we have finer-grained code generation technique + component_codes.push_back(replace_tags(curr_component_code, var_lut)); + + headers_list.insert(curr_headers_list.begin(), curr_headers_list.end()); + if(!additional_macros.empty()) // Some components might not have any + { + additional_macros.insert(replace_tags(curr_additional_macros, var_lut)); + } + } + + // Step 3: Assemble the data gathered by traversing the graph into the string "code" + std::string code = ""; + + for(auto &header : headers_list) + { +#if defined(EMBEDDED_KERNELS) + code += CLKernelLibrary::get().get_program(header).first; +#else // defined(EMBEDDED_KERNELS) + code += "#include \"" + header + "\"\n"; +#endif // defined(EMBEDDED_KERNELS) + } + + for(auto ¯os : additional_macros) + { + code += macros; + } + + code += write_kernel_signature(_vtable.get_variable_list(_components.get_argument_tensors())); + + code += "\n{\n\n"; + + code += " //------------------ START KERNEL_BUILDER_COORDINATE ---------------------\n\n"; + code += write_global_section(); + code += " //------------------ END KERNEL_BUILDER_COORDINATE ---------------------\n"; + + for(const auto &component_code : component_codes) + { + code += component_code; + } + + code += "}\n"; + + return code; +} +std::string ClTemplateWriter::write_global_section() const +{ + const auto dst_tensors = _components.get_dst_tensors(); + ARM_COMPUTE_ERROR_ON_MSG(dst_tensors.size() != 1, "Only one destination tensor per kernel is allowed"); + const auto dst_info = dst_tensors[0]; + const auto dst_w = dst_info->dimension(0); + const auto tile_w = std::max(1, get_window().x().step()); + const auto tile_h = std::max(1, get_window().y().step()); + auto leftover_w = dst_w % tile_w; + + std::string code = ""; + code += std::string(" int g_ind_0 = GET_SPATIAL_IDX(0, ") + std::to_string(tile_w) + ", " + std::to_string(leftover_w) + ");\n"; + code += std::string(" int g_ind_1 = GET_SPATIAL_IDX(1, ") + 
std::to_string(tile_h) + ", " + "0);\n"; + code += std::string(" int g_ind_2 = GET_SPATIAL_IDX(2, 1, 0);\n\n"); + + code += " const bool g_cond_x = (g_ind_0 == 0);\n"; + code += " const bool g_cond_y = (g_ind_1 == 0);\n"; + + return code; +} +std::string ClTemplateWriter::write_argument_declaration(const GpuKernelVariableTable::TensorVariable &var) const +{ + std::string code; + switch(var.kernel_argument_info.type) + { + case GpuKernelArgumentInfo::Type::Vector: + { + code += "\n VECTOR_DECLARATION(" + var.uniq_name + ")"; + break; + } + case GpuKernelArgumentInfo::Type::Image: + { + code += "\n IMAGE_DECLARATION(" + var.uniq_name + ")"; + break; + } + case GpuKernelArgumentInfo::Type::Image_3D: + { + code += "\n IMAGE_DECLARATION(" + var.uniq_name + "),"; + code += "\n unsigned int " + var.uniq_name + "_stride_z"; + break; + } + case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D: + { + code += "\n __read_only image2d_t " + var.uniq_name + "_img,"; + code += "\n unsigned int " + var.uniq_name + "_stride_z"; + break; + } + case GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer: + { + code += "\n TENSOR4D_T(" + var.uniq_name + ", BUFFER)"; + break; + } + case GpuKernelArgumentInfo::Type::Tensor_4D_t_Image: + { + code += "\n TENSOR4D_T(" + var.uniq_name + ", IMAGE)"; + break; + } + default: + { + ARM_COMPUTE_ERROR("Unsupported declaration generation for GpuKernelArgumentInfo::Type"); + } + } + return code; +} +std::string ClTemplateWriter::write_kernel_signature(const GpuKernelVariableTable::VariableList &argument_list) const +{ + std::string code = "\n__kernel void " + write_kernel_name() + "("; + + for(int i = 0; i < static_cast<int>(argument_list.size()) - 1; ++i) + { + code += write_argument_declaration(argument_list[i]) + ","; + } + if(static_cast<int>(argument_list.size()) - 1 >= 0) + { + code += write_argument_declaration(argument_list[argument_list.size() - 1]); + } + + code += ')'; + + return code; +} +std::string 
ClTemplateWriter::write_kernel_name() const +{ + if(_components.empty()) + { + return "empty_kernel"; + } + std::string name = _components.empty() ? "" : _components[0]->template_writer()->get_name(); + for(size_t i = 1; i < _components.size(); ++i) + { + name += "___"; + name += _components[i]->template_writer()->get_name(); + } + + return name; +} +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.h new file mode 100644 index 0000000000..83f617b6c6 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.h @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
 */
#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEWRITER
#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEWRITER

#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
#include "src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h"
#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"

#include <map>

namespace arm_compute
{
namespace experimental
{
namespace dynamic_fusion
{
/** Use a templated-string-based method to write kernel code
 * It stitches the component code templates together based on the valid fusion configuration.
 * It then instantiates the actual kernel code from the template and the generated tag lookup table.
 */
class ClTemplateWriter : public IGpuKernelWriter
{
public:
    /** Instantiates a kernel code string from the kernel code template
     * @note: some tags can be unused since they could be used only for the macros, or only for the component code
     *
     * @param[in] code_template Kernel code template
     * @param[in] tags          Tag lookup table
     *
     * @return std::string Instantiated kernel string
     */
    static std::string replace_tags(const std::string &code_template, const TagLUT &tags);
    /** Default constructor */
    ClTemplateWriter() = default;
    /** Constructor
     *
     * @param[in] components Kernel component group from which the kernel will be generated
     */
    ClTemplateWriter(const GpuKernelComponentGroup &components);
    /** Destructor */
    ~ClTemplateWriter() override;
    /** Generate kernel name */
    std::string get_name() override;
    /** Generate kernel code */
    std::string get_code() override;
    /** Generate build options */
    CLBuildOptions get_build_options() override;
    /** Generate config id string of the entire kernel. This is used for tuning */
    std::string get_config_id() override;
    /** Generate execution window */
    Window get_window() const override;
    /** Get the kernel argument lists of the kernel*/
    std::map<ITensorInfo::Id, GpuKernelArgument> get_tensors() override;

private:
    // Name of the fused kernel (component names joined together)
    std::string write_kernel_name() const;
    // Full kernel source: headers, macros, signature, global section and component codes
    std::string write_code();
    // Shared preamble emitted at the top of every kernel body (global indices / conditions)
    std::string write_global_section() const;
    // One kernel-parameter declaration for a single tensor variable
    std::string write_argument_declaration(const GpuKernelVariableTable::TensorVariable &var) const;
    // __kernel signature built from the full argument list
    std::string write_kernel_signature(const GpuKernelVariableTable::VariableList &argument_list) const;

private:
    GpuKernelComponentGroup _components{}; // Components to be fused into one kernel
    GpuKernelVariableTable  _vtable{};     // Variable table populated while generating the kernel
};
} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEWRITER */