aboutsummaryrefslogtreecommitdiff
path: root/src/dynamic_fusion/sketch/gpu/ckw_driver
diff options
context:
space:
mode:
Diffstat (limited to 'src/dynamic_fusion/sketch/gpu/ckw_driver')
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp105
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h121
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp139
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h81
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp68
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h72
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp73
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h72
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h140
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp295
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.h68
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp256
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.h68
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.cpp361
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.h80
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp427
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h85
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp434
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h70
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.cpp287
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.h86
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp405
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h78
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.cpp576
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.h93
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp144
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h62
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.cpp56
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h65
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.cpp162
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h103
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.cpp57
-rw-r--r--src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h42
33 files changed, 5231 insertions, 0 deletions
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp
new file mode 100644
index 0000000000..a42b39700c
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h"
+
+#include "compute_kernel_writer/include/ckw/Error.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+
+GpuCkwComponentArgument::GpuCkwComponentArgument(ckw::TensorOperand tensor) : _tensor(tensor)
+{
+}
+
+GpuCkwComponentArgument &GpuCkwComponentArgument::init_virtual_tensor(ckw::TileOperand &tile,
+ const ckw::TensorSampler &sampler)
+{
+ CKW_ASSERT(_tile == nullptr);
+
+ _tile = tile;
+ _sampler = sampler;
+
+ return *this;
+}
+
+bool GpuCkwComponentArgument::has_tensor() const
+{
+ return _tensor.is_valid();
+}
+
+ckw::TensorOperand &GpuCkwComponentArgument::tensor()
+{
+ CKW_ASSERT(_tensor.is_valid());
+
+ return _tensor;
+}
+
+const ckw::TensorOperand &GpuCkwComponentArgument::tensor() const
+{
+ CKW_ASSERT(_tensor.is_valid());
+
+ return _tensor;
+}
+
+bool GpuCkwComponentArgument::has_tile() const
+{
+ return _tile.is_valid();
+}
+
+ckw::TileOperand &GpuCkwComponentArgument::tile()
+{
+ CKW_ASSERT(_tile.is_valid());
+
+ return _tile;
+}
+
+const ckw::TileOperand &GpuCkwComponentArgument::tile() const
+{
+ CKW_ASSERT(_tile.is_valid());
+
+ return _tile;
+}
+
+ckw::TensorSampler &GpuCkwComponentArgument::tensor_sampler()
+{
+ CKW_ASSERT(_tile.is_valid());
+
+ return _sampler;
+}
+
+const ckw::TensorSampler &GpuCkwComponentArgument::tensor_sampler() const
+{
+ CKW_ASSERT(_tile.is_valid());
+
+ return _sampler;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h
new file mode 100644
index 0000000000..7a57c81e5f
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWCOMPONENTARGUMENT_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWCOMPONENTARGUMENT_H
+
+#include "compute_kernel_writer/include/ckw/TensorOperand.h"
+#include "compute_kernel_writer/include/ckw/TensorSampler.h"
+#include "compute_kernel_writer/include/ckw/TileOperand.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+
+/** The argument of a dynamic fusion component which can be either user tensor or virtual tensor. */
+class GpuCkwComponentArgument
+{
+public:
+ /** Default constructor */
+ GpuCkwComponentArgument() = default;
+
+ /** Initialize a new instance of @ref GpuCkwComponentArgument class for user tensor.
+ *
+ * @param[in] tensor The user tensor.
+ */
+ explicit GpuCkwComponentArgument(ckw::TensorOperand tensor);
+
+ /** Bind the tile and sampler to the tensor argument.
+ *
+ * This method can be used to share a tile and sampler associated to a tensor
+ * among different kernel components. For example, when we create the destination
+ * tile and destination sampler for the first time (root component), this method can be
+ * used to bind these two information to the destination tensor so that the following
+ * simple components know the tile size and how to access the elements from memory.
+ *
+ * @param[in] tile The tile that has been loaded.
+ * @param[in] sampler The tensor sampling information that has been used to load the tile.
+ */
+ GpuCkwComponentArgument &init_virtual_tensor(ckw::TileOperand &tile, const ckw::TensorSampler &sampler);
+
+ /** Get whether the argument is a user tensor. */
+ bool has_tensor() const;
+
+ /** Get the tensor operand.
+ *
+ * If the tensor is not available, throw an error.
+ */
+ ckw::TensorOperand &tensor();
+
+ /** Get the tensor operand.
+ *
+ * If the tensor is not available, throw an error.
+ */
+ const ckw::TensorOperand &tensor() const;
+
+ /** Get whether the argument contains a tile.
+ *
+ * The argument can be either a user tensor that has been loaded,
+ * or a virtual tensor (i.e. a tile with tensor sampling information).
+ */
+ bool has_tile() const;
+
+ /** Get the tile operand.
+ *
+ * If the tile is not available, throw an error.
+ */
+ ckw::TileOperand &tile();
+
+ /** Get the tile operand.
+ *
+ * If the tile is not available, throw an error.
+ */
+ const ckw::TileOperand &tile() const;
+
+ /** Get the tensor sampling information for the tile.
+ *
+ * If the tile is not available, throw an error.
+ */
+ ckw::TensorSampler &tensor_sampler();
+
+ /** Get the tensor sampling information for the tile.
+ *
+ * If the tile is not available, throw an error.
+ */
+ const ckw::TensorSampler &tensor_sampler() const;
+
+private:
+ ckw::TensorOperand _tensor{};
+ ckw::TileOperand _tile{};
+ ckw::TensorSampler _sampler{};
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWCOMPONENTARGUMENT_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp
new file mode 100644
index 0000000000..a0e5e16aa0
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/common/utils/Log.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h"
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+
+#include "compute_kernel_writer/include/ckw/KernelWriter.h"
+#include "compute_kernel_writer/include/ckw/types/TargetArchitecture.h"
+#include "compute_kernel_writer/include/ckw/types/TargetLanguage.h"
+
+using namespace ckw;
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+GpuCkwDriver::GpuCkwDriver(const GpuKernelComponentGroup &components)
+{
+ _components = components;
+
+ // Generate kernel name
+ std::string kernel_name;
+ for (auto &comp : _components)
+ {
+ auto ckw_driver = comp->ckw_component_driver();
+ ARM_COMPUTE_ERROR_ON(ckw_driver == nullptr);
+ kernel_name += ckw_driver->get_name(_components) + "__";
+ }
+
+ // Generate kernel code
+ auto root_writer =
+ KernelWriter::create_instance(ckw::TargetArchitecture::GpuArmMaliValhall, ckw::TargetLanguage::OpenCL);
+ GpuCkwScopedKernelWriter writer(root_writer.get());
+ GpuCkwVariableTable vtable{};
+
+ for (auto &comp : _components)
+ {
+ auto ckw_driver = comp->ckw_component_driver();
+ ARM_COMPUTE_ERROR_ON(ckw_driver == nullptr);
+ ckw_driver->write_component_code(_components, vtable, writer);
+ }
+ auto kernel = root_writer->emit_kernel(kernel_name);
+
+ // Set the kernel name, kernel arguments and source code
+ _kernel_name = kernel_name;
+ _kernel_args = kernel->arguments();
+ _kernel_code = kernel->source_code();
+}
+
+std::string GpuCkwDriver::get_name()
+{
+ return _kernel_name;
+}
+
+std::string GpuCkwDriver::get_code()
+{
+ return _kernel_code;
+}
+
+std::string GpuCkwDriver::get_config_id()
+{
+ std::string id;
+ for (auto &comp : _components)
+ {
+ auto ckw_driver = comp->ckw_component_driver();
+ ARM_COMPUTE_ERROR_ON(ckw_driver == nullptr);
+ id = ckw_driver->get_tuner_id(_components) + "__";
+ }
+ return id;
+}
+
+Window GpuCkwDriver::get_window() const
+{
+ const auto root_comp = _components.get_root_component();
+ ARM_COMPUTE_ERROR_ON_MSG(root_comp == nullptr, "No root component found");
+ return root_comp->ckw_component_driver()->get_window();
+}
+
+GpuKernelArgumentList GpuCkwDriver::get_kernel_arguments()
+{
+ GpuKernelArgumentList args{};
+ for (const auto &arg : _kernel_args)
+ {
+ switch (arg.type())
+ {
+ case KernelArgument::Type::TensorStorage:
+ {
+ args.emplace_back(static_cast<ITensorInfo::Id>(arg.id()), from_ckw(arg.tensor_storage_type()));
+ break;
+ }
+ case KernelArgument::Type::TensorComponent:
+ {
+ args.emplace_back(static_cast<ITensorInfo::Id>(arg.id()), from_ckw(arg.tensor_component_type()));
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Unsupported KernelArgument Type");
+ break;
+ }
+ }
+ }
+ return args;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h
new file mode 100644
index 0000000000..f8770920b7
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWDRIVER_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWDRIVER_H
+
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h"
+
+#include "compute_kernel_writer/include/ckw/Kernel.h"
+#include "compute_kernel_writer/include/ckw/KernelArgument.h"
+#include <string>
+
+namespace arm_compute
+{
+/** Forward declarations */
+class Window;
+
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Use Kernel Writer to write kernel code
+ * Used by dynamic_fusion module
+ */
+class GpuCkwDriver : public IGpuKernelWriter
+{
+public:
+ /** Default constructor */
+ GpuCkwDriver() = delete;
+ /** Constructor
+ *
+ * @param[in] components Kernel component group from which the kernel will be generated
+ */
+ GpuCkwDriver(const GpuKernelComponentGroup &components);
+ /** Destructor */
+ ~GpuCkwDriver() override = default;
+ /** Generate kernel name */
+ std::string get_name() override;
+ /** Generate kernel code */
+ std::string get_code() override;
+ /** Generate config id string of the entire kernel. This is used for tuning */
+ std::string get_config_id() override;
+ /** Generate execution window */
+ Window get_window() const override;
+ /** Get the flat list of arguments of the kernel*/
+ GpuKernelArgumentList get_kernel_arguments() override;
+
+private:
+ GpuKernelComponentGroup _components{};
+ std::string _kernel_name{};
+ std::vector<ckw::KernelArgument> _kernel_args{};
+ std::string _kernel_code{};
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWDRIVER_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp
new file mode 100644
index 0000000000..ae12d13e5a
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+
+GpuCkwScopedKernelWriter::GpuCkwScopedKernelWriter(ckw::KernelWriter *writer)
+ : _writer(writer), _parent_id_space(writer->id_space())
+{
+ _writer->new_id_space();
+}
+
+GpuCkwScopedKernelWriter::GpuCkwScopedKernelWriter(const GpuCkwScopedKernelWriter &other)
+ : _writer(other._writer), _parent_id_space(other._writer->id_space())
+{
+ _writer->new_id_space();
+}
+
+ckw::KernelWriter *GpuCkwScopedKernelWriter::operator->()
+{
+ return _writer;
+}
+
+const ckw::KernelWriter *GpuCkwScopedKernelWriter::operator->() const
+{
+ return _writer;
+}
+
+ckw::KernelWriter *GpuCkwScopedKernelWriter::writer()
+{
+ return _writer;
+}
+
+const ckw::KernelWriter *GpuCkwScopedKernelWriter::writer() const
+{
+ return _writer;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h
new file mode 100644
index 0000000000..84dd706cd0
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWSCOPEDKERNELWRITER_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWSCOPEDKERNELWRITER_H
+
+#include "compute_kernel_writer/include/ckw/KernelWriter.h"
+#include <cstdint>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+
+/** Helper to automatically manage kernel writer ID space. */
+class GpuCkwScopedKernelWriter
+{
+public:
+ /** Initialize a new instance of @ref GpuCkwScopedKernelWriter class. */
+ explicit GpuCkwScopedKernelWriter(ckw::KernelWriter *writer);
+
+ /** Create a new scope from the specified scoped kernel writer. */
+ GpuCkwScopedKernelWriter(const GpuCkwScopedKernelWriter &other);
+
+ /** Assignment is disallowed. */
+ GpuCkwScopedKernelWriter &operator=(const GpuCkwScopedKernelWriter &) = delete;
+
+ /** Access the underlying kernel writer. */
+ ckw::KernelWriter *operator->();
+
+ /** Access the underlying kernel writer. */
+ const ckw::KernelWriter *operator->() const;
+
+ /** Get the kernel writer. */
+ ckw::KernelWriter *writer();
+
+ /** Get the kernel writer. */
+ const ckw::KernelWriter *writer() const;
+
+private:
+ ckw::KernelWriter *_writer;
+ int32_t _parent_id_space;
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWSCOPEDKERNELWRITER_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp
new file mode 100644
index 0000000000..66ccc1ac34
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
+
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+
+#include <sstream>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+GpuCkwComponentArgument *GpuCkwVariableTable::declare_variable(const GpuKernelComponentGroup &comp_group,
+ GpuCkwScopedKernelWriter &writer,
+ const ITensorInfo *tensor,
+ const std::string &alias)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(!tensor->has_valid_id(), "Tensor info with valid id expected");
+
+ // Do not re-declare if the variable associated with the tensor has already been declared
+ auto it = _vars.find(tensor->id());
+
+ if (it != _vars.end())
+ {
+ return &it->second;
+ }
+ if (comp_group.is_intermediate_tensor(tensor))
+ {
+ // Create a virtual tensor variable
+ GpuCkwComponentArgument var;
+ auto &&inserted = _vars.emplace(tensor->id(), var);
+ return &(inserted.first->second);
+ }
+ else
+ {
+ // Create a user tensor variable
+ std::stringstream ss;
+ ss << alias << "_t" << abs(tensor->id());
+ const auto uniq_name = ss.str();
+ GpuCkwComponentArgument var{writer->declare_tensor_argument(uniq_name, to_ckw(*tensor))};
+ auto &&inserted = _vars.emplace(tensor->id(), var);
+ return &(inserted.first->second);
+ }
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h
new file mode 100644
index 0000000000..fc8764c3e2
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWVARIABLETABLE_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWVARIABLETABLE_H
+
+#include "arm_compute/core/ITensorInfo.h"
+
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h"
+
+#include <map>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class GpuKernelComponentGroup;
+class GpuCkwScopedKernelWriter;
+
+/** A table of all the variables used in the kernel.
+ *
+ * It determines whether we create an virtual tensor var or a user tensor var
+ * It avoids duplicating variables for the same tensors (Tensors with the same id)
+ * Each kernel has exactly one variable table.
+ */
+class GpuCkwVariableTable
+{
+public:
+ /** Declare a kernel component variable(argument) for the corresponding tensor info.
+ *
+ * @param[in] comp_group Component group the tensor belongs to
+ * @param[in] writer Compute Kernel Writer
+ * @param[in] tensor Tensor info with which the new variable is associated
+ * @param[in] alias Alias for the variable. Will be used as part of the variable name
+ *
+ * @return GpuCkwComponentArgument*
+ */
+ GpuCkwComponentArgument *declare_variable(const GpuKernelComponentGroup &comp_group,
+ GpuCkwScopedKernelWriter &writer,
+ const ITensorInfo *tensor,
+ const std::string &alias = "unnamed");
+
+private:
+ std::map<ITensorInfo::Id, GpuCkwComponentArgument> _vars{};
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWVARIABLETABLE_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h
new file mode 100644
index 0000000000..52e56e2e35
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_IGPUCKWCOMPONENTDRIVER
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_IGPUCKWCOMPONENTDRIVER
+
+#include "arm_compute/core/Window.h"
+
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/components/Types.h"
+
+namespace arm_compute
+{
+class ITensorInfo;
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+class GpuKernelComponentGroup;
+class GpuCkwVariableTable;
+class GpuCkwScopedKernelWriter;
+
+/** An interface used by @ref GpuCkwDriver to write source code for a kernel component
+ *
+ * There are 3 main architecture layers for using Compute Kernel Writer (Ckw) inside ACL's dynamic fusion module
+ * From top level to bottom level:
+ * | Layer | Library
+ * ===========================
+ * | dynamic_fusion | acl
+ * | ckw_driver | acl
+ * | ckw | ckw
+ *
+ * ckw_driver is a glue layer that directs how fused code is produced using the ckw library
+ *
+ * There are two main groups within ckw_driver:
+ * - @ref GpuCkwDriver is a global driver that coordinates how the final fused code along with all the info necessary
+ * for run time execution is produced using ckw
+ * - Various classes implementing @ref IGpuCkwComponentDriver is a component driver that directs ckw to generate kernel component code (e.g. activation, store etc.)
+ *
+ * The overall flow goes like this:
+ * In dynamic_fusion module, @ref GpuLogicalKernel instantiates a @ref GpuCkwDriver from a @ref GpuKernelComponentGroup
+ * The logical kernel then uses the global driver's various interfaces to generate the code info.
+ * In particular, the @ref GpuCkwDriver::get_code() interface will call into each @ref IGpuCkwComponentDriver::write_component_code()
+ */
+class IGpuCkwComponentDriver
+{
+public:
+ using ComponentGroup = GpuKernelComponentGroup;
+
+public:
+ /** Constructor
+ *
+ * @param[in] id Component id
+ * @param[in] tensors Tensor arguments to the components
+ */
+ IGpuCkwComponentDriver(ComponentId id, const ArgumentPack<ITensorInfo> &tensors) : _id{id}, _tensors{tensors}
+ {
+ }
+ /** Destructor */
+ virtual ~IGpuCkwComponentDriver()
+ {
+ }
+ /** Generate kernel component code
+ *
+ * @param[in] comp_group Component group of which the component is a part of
+ * @param[in, out] vtable Table of variables declared by each component
+ * @param[in, out] writer CKW writer that writes code scoped to this kernel component.
+ *
+ * @note @p writer can only be passed via value since the new scope is created in the copy constructor
+ */
+ virtual void write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const = 0;
+ /** Get tensor arguments */
+ ArgumentPack<ITensorInfo> tensors() const
+ {
+ return _tensors;
+ }
+ /** Generate the execution window for the component */
+ virtual Window get_window() const
+ {
+ return Window{};
+ }
+ /** Generate the name of the component
+ *
+ * This will be concatenated with other components' names to form the name of the kernel
+ */
+ virtual std::string get_name(const ComponentGroup &comp_group) const
+ {
+ ARM_COMPUTE_UNUSED(comp_group);
+ return "unnamed";
+ }
+ /** Generate the tuner id of the component
+ * This id should capture all the parameters that distinguish one kernel's lws tuning from another.
+ * e.g. two components that are identical in every other way, but have output tensor dimensions should
+ * have different tuner ids, because the lws of one may not be optimal on the other.
+ *
+ * This will be concatenated with other components' tuner id to form the tuner id of the kernel
+ */
+ virtual std::string get_tuner_id(const ComponentGroup &comp_group) const
+ {
+ ARM_COMPUTE_UNUSED(comp_group);
+ return "";
+ }
+ /** Get component id */
+ ComponentId id() const
+ {
+ return _id;
+ }
+
+private:
+ ComponentId _id{-1};
+ ArgumentPack<ITensorInfo> _tensors{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif /* ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_IGPUCKWCOMPONENTDRIVER */
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp
new file mode 100644
index 0000000000..18fda5bd6b
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp
@@ -0,0 +1,295 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "GpuCkwActivation.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+
+#include "compute_kernel_writer/include/ckw/KernelWriter.h"
+#include <cstdint>
+#include <string>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+
+GpuCkwActivation::GpuCkwActivation(ComponentId id,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes) // NOLINT
+ : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{}, _attributes{attributes}
+{
+ _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
+ _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
+}
+
+void GpuCkwActivation::write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const
+{
+ /********************************************************************************
+ * 1 - Define tensors
+ ********************************************************************************/
+ GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src");
+ GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
+
+ /********************************************************************************
+ * 2 - Define CKW constants
+ ********************************************************************************/
+ const auto dst_h = static_cast<int32_t>(_dst->dimension(1));
+ const auto dst_dt = to_ckw(_dst->data_type());
+
+ // CKW constants
+ auto const_dst_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_h}}, ckw::DataType::Int32));
+ auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32));
+ auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32));
+ auto const_neg_1_fp = writer->declare_constant_tile(ckw::ConstantData({{-1.0f}}, dst_dt));
+ auto const_pos_1_fp = writer->declare_constant_tile(ckw::ConstantData({{1.0f}}, dst_dt));
+ auto const_0_fp = writer->declare_constant_tile(ckw::ConstantData({{0.0f}}, dst_dt));
+ auto const_A_fp = writer->declare_constant_tile(ckw::ConstantData({{_attributes.a()}}, dst_dt));
+ auto const_B_fp = writer->declare_constant_tile(ckw::ConstantData({{_attributes.b()}}, dst_dt));
+
+ /********************************************************************************
+ * 3 - Define the compute block parameters and destination tile (if not root component)
+ * Bind the tile to the tensor to share it among different components and
+ * initialize the compute block parameters
+ ********************************************************************************/
+ // The compute block parameters depend on the employed tensor format
+
+ // Destination compute block size
+ int32_t dst_n0 = -1;
+ int32_t dst_m0 = -1;
+
+ // Destination compute block size left-over
+ int32_t dst_n0_partial = -1;
+ int32_t dst_m0_partial = -1;
+
+ // Shift-back for the overlapping-min strategy
+ int32_t dst_shift_back = -1;
+
+ if (!dst->has_tile())
+ {
+ // If ROOT component, we use ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1
+ // as tensor format
+ const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
+
+ dst_n0 = root_window.x().step();
+ dst_m0 = root_window.y().step();
+ dst_n0_partial = _dst->dimension(0) % dst_n0;
+ dst_m0_partial = (_dst->dimension(1) * _dst->dimension(2)) % dst_m0;
+ dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0;
+
+ ckw::TensorSampler sampler_dst;
+ sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1);
+
+ if (dst_n0_partial == 0)
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin);
+ }
+
+ if (dst_m0_partial == 0)
+ {
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::ClampToBorderMaxOnly);
+ }
+
+ sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+ // Declare destination tile
+ auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, dst_m0, dst_n0));
+
+ // Bind tile to the tensor
+ dst->init_virtual_tensor(tile_dst, sampler_dst);
+ }
+ else
+ {
+ // dst_m0_partial depends on the TensorSamplerFormat
+ dst_n0 = dst->tile().tile_info().width();
+ dst_m0 = dst->tile().tile_info().height();
+ dst_n0_partial = _dst->dimension(0) % dst_n0;
+
+ ckw::TensorSampler sampler_dst = dst->tensor_sampler();
+
+ if (sampler_dst.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1)
+ {
+ dst_m0_partial = (_dst->dimension(1) * _dst->dimension(2)) % dst_m0;
+ }
+ else if (sampler_dst.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2)
+ {
+ dst_m0_partial = _dst->dimension(1) % dst_m0;
+ }
+
+ // Shift-back for the overlapping-min strategy
+ dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0;
+ }
+
+ const auto &tile_dst = dst->tile();
+
+ /********************************************************************************
+ * 4 - Define the compute block parameters CKW constants
+ ********************************************************************************/
+ // Only now we can declare the N0 and M0 as constant
+ auto const_dst_n0 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32));
+ auto const_dst_m0 = writer->declare_constant_tile(ckw::ConstantData({{dst_m0}}, ckw::DataType::Int32));
+ auto const_dst_shift_back_n0 =
+ writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32));
+
+ /********************************************************************************
+ * 5 - Define the sampler for the input tensor
+ ********************************************************************************/
+ if (!src->has_tile())
+ {
+ // Sampler
+ ckw::TensorSampler sampler_src = dst->tensor_sampler();
+
+ auto tile_gid_0 = writer->declare_tile("gid_0_src", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_1 = writer->declare_tile("gid_1_src", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_2 = writer->declare_tile("gid_2_src", ckw::TileInfo(ckw::DataType::Int32));
+
+ writer->op_get_global_id(tile_gid_0, 0);
+ writer->op_get_global_id(tile_gid_1, 1);
+ writer->op_get_global_id(tile_gid_2, 2);
+
+ auto tile_nout0 = writer->declare_tile("nout0_src", ckw::TileInfo(ckw::DataType::Int32)); // OFM
+ auto tile_mout0 =
+ writer->declare_tile("mout0_src", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH or WIDTH x HEIGHT
+ auto tile_mout1 = writer->declare_tile("mout1_src", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT or 0
+ auto tile_bout0 = writer->declare_tile("bout0_src", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX
+
+ get_coordinate_from_gws_overlapping_min(writer, tile_nout0, tile_gid_0, const_dst_n0, const_dst_shift_back_n0,
+ const_0_i32);
+ get_coordinate_from_gws(writer, tile_mout0, tile_gid_1, const_dst_m0);
+
+ // Get the boundary aware coordinates at each global dimension index
+ if (sampler_src.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1)
+ {
+ writer->op_assign(tile_mout1, const_0_i32);
+ get_coordinate_from_gws(writer, tile_bout0, tile_gid_2, const_pos_1_i32);
+ }
+ else if (sampler_src.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2)
+ {
+ writer->op_binary(tile_mout1, ckw::BinaryOp::Mod, tile_gid_2, const_dst_h_i32);
+ writer->op_binary(tile_bout0, ckw::BinaryOp::Div, tile_gid_2, const_dst_h_i32);
+ }
+
+ auto tile_src = writer->declare_tile("src", ckw::TileInfo(dst_dt, dst_m0, dst_n0));
+
+ writer->op_load(tile_src, src->tensor(), sampler_src, tile_nout0, tile_mout0, tile_mout1, tile_bout0);
+
+ // Here, init_virtual_tensor() it is used to bring the tile_src outside the compound statement
+ src->init_virtual_tensor(tile_src, sampler_src);
+ }
+
+ const auto &tile_src = src->tile();
+
+ /********************************************************************************
+ * 7 - Write the rest of the code
+ ********************************************************************************/
+ switch (_attributes.activation())
+ {
+ case ActivationLayerInfo::ActivationFunction::LOGISTIC:
+ {
+ // dst = src * -1
+ writer->op_binary(tile_dst, ckw::BinaryOp::Mul, tile_src, const_neg_1_fp);
+ // dst = exp(src * -1)
+ writer->op_unary(tile_dst, ckw::UnaryOp::Exp, tile_dst);
+ // dst = 1 + (exp(src * -1))
+ writer->op_binary(tile_dst, ckw::BinaryOp::Add, tile_dst, const_pos_1_fp);
+ // dst = 1 / 1 + (exp(src * -1))
+ writer->op_binary(tile_dst, ckw::BinaryOp::Div, const_pos_1_fp, tile_dst);
+ break;
+ }
+ case ActivationLayerInfo::ActivationFunction::TANH:
+ {
+ writer->op_unary(tile_dst, ckw::UnaryOp::Tanh, tile_src);
+ break;
+ }
+ case ActivationLayerInfo::ActivationFunction::RELU:
+ {
+ // dst = max(src, 0)
+ writer->op_binary(tile_dst, ckw::BinaryOp::Max, tile_src, const_0_fp);
+ break;
+ }
+ case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
+ {
+ //dst = max(src, 0)
+ writer->op_binary(tile_dst, ckw::BinaryOp::Max, tile_src, const_0_fp);
+ //dst = min(max(src, 0), A_VAL)
+ writer->op_binary(tile_dst, ckw::BinaryOp::Min, tile_dst, const_A_fp);
+ break;
+ }
+ case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
+ {
+ //dst = max(src, B_VAL)
+ writer->op_binary(tile_dst, ckw::BinaryOp::Max, tile_src, const_B_fp);
+ //dst = min(max(src, B_VAL), A_VAL)
+ writer->op_binary(tile_dst, ckw::BinaryOp::Min, tile_dst, const_A_fp);
+ break;
+ }
+ default:
+ CKW_ASSERT(false);
+ break;
+ }
+ ARM_COMPUTE_ERROR_ON_MSG(dst->has_tile() == false, "You must bind a tile before appending another component");
+}
+
+Window GpuCkwActivation::get_window() const
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
+
+ TensorShape output_shape = _dst->tensor_shape();
+ // Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) unchanged
+ // This is in line with the collapsing convention used by operators like Conv2d
+ output_shape.collapse(2U, 1U);
+ constexpr uint32_t vector_size_byte_opencl = 16;
+ const uint32_t num_elems_processed_per_iteration =
+ adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0));
+ Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration));
+
+ return win;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.h
new file mode 100644
index 0000000000..386e933a72
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWACTIVATION
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWACTIVATION
+
+#include "src/core/common/Macros.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class GpuCkwActivation : public IGpuCkwComponentDriver
+{
+public:
+ using Attributes = ClComponentActivation::Attributes;
+ /** Constructor
+ *
+ * For supported configurations please refer to @ref GpuCkwActivation::validate()
+ *
+ * @param[in] id Component id
+ * @param[in] tensors Tensor arguments to the component
+ * @param[in] attributes Component attributes
+ */
+ GpuCkwActivation(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes);
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwActivation);
+ /** Destructor */
+ ~GpuCkwActivation() override = default;
+ // Inherited methods overriden:
+ virtual void write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const override;
+ Window get_window() const override;
+
+private:
+ const ITensorInfo *_src;
+ const ITensorInfo *_dst;
+ Attributes _attributes;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif /* ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWACTIVATION */
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp
new file mode 100644
index 0000000000..d3e0dbafd4
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp
@@ -0,0 +1,256 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "GpuCkwCast.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+
+#include "compute_kernel_writer/include/ckw/KernelWriter.h"
+#include <cstdint>
+#include <string>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+
+GpuCkwCast::GpuCkwCast(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes)
+ : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{}, _attributes{attributes}
+{
+ _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
+ _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
+ ARM_COMPUTE_ERROR_ON_MSG(is_data_type_float(_src->data_type()) == false,
+ "The source data type must be a floating-point data type");
+}
+
+void GpuCkwCast::write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const
+{
+ /********************************************************************************
+ * 1 - Define tensors
+ ********************************************************************************/
+ GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src");
+ GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
+
+ /********************************************************************************
+ * 2 - Define CKW constants
+ ********************************************************************************/
+ const auto dst_h = static_cast<int32_t>(_dst->dimension(1));
+
+ // CKW constants
+ auto const_dst_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_h}}, ckw::DataType::Int32));
+ auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32));
+ auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32));
+
+ /********************************************************************************
+ * 3 - Define the compute block parameters and destination tile (if not root component)
+ * Bind the tile to the tensor to share it among different components and
+ * initialize the compute block parameters
+ ********************************************************************************/
+ // The compute block parameters depend on the employed tensor format
+
+ // Destination compute block size
+ int32_t dst_n0 = -1;
+ int32_t dst_m0 = -1;
+
+ // Destination compute block size left-over
+ int32_t dst_n0_partial = -1;
+ int32_t dst_m0_partial = -1;
+
+ // Shift-back for the overlapping-min strategy
+ int32_t dst_shift_back = -1;
+
+ if (!dst->has_tile())
+ {
+ // If ROOT component, we use ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1
+ // as tensor format
+ const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
+
+ dst_n0 = root_window.x().step();
+ dst_m0 = root_window.y().step();
+ dst_n0_partial = _dst->dimension(0) % dst_n0;
+ dst_m0_partial = (_dst->dimension(1) * _dst->dimension(2)) % dst_m0;
+ dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0;
+
+ ckw::TensorSampler sampler_dst;
+ sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1);
+ if (dst_n0_partial == 0)
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin);
+ }
+
+ if (dst_m0_partial == 0)
+ {
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::ClampToBorderMaxOnly);
+ }
+
+ sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+ // Declare destination tile
+ ckw::DataType dst_dt = to_ckw(_dst->data_type());
+ auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, dst_m0, dst_n0));
+
+ // Bind tile to the tensor
+ dst->init_virtual_tensor(tile_dst, sampler_dst);
+ }
+ else
+ {
+ // Change dst_n0 and dst_m0 if NOT root component!
+ // ATTENTION:
+ // dst_m0_partial depends on the TensorSamplerFormat
+ dst_n0 = dst->tile().tile_info().width();
+ dst_m0 = dst->tile().tile_info().height();
+ dst_n0_partial = _dst->dimension(0) % dst_n0;
+
+ ckw::TensorSampler sampler_dst = dst->tensor_sampler();
+
+ if (sampler_dst.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1)
+ {
+ dst_m0_partial = (_dst->dimension(1) * _dst->dimension(2)) % dst_m0;
+ }
+ else if (sampler_dst.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2)
+ {
+ dst_m0_partial = _dst->dimension(1) % dst_m0;
+ }
+
+ // Shift-back for the overlapping-min strategy
+ dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0;
+ }
+
+ const auto &tile_dst = dst->tile();
+
+ /********************************************************************************
+ * 4 - Define the compute block parameters CKW constants
+ ********************************************************************************/
+ // Only now we can declare the N0 and M0 as constant
+ auto const_dst_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32));
+ auto const_dst_m0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_m0}}, ckw::DataType::Int32));
+ auto const_dst_shift_back_n0_i32 =
+ writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32));
+
+ /********************************************************************************
+ * 5 - Define the sampler for the input tensor
+ ********************************************************************************/
+ if (!src->has_tile())
+ {
+ // Sampler
+ ckw::TensorSampler sampler_src = dst->tensor_sampler();
+
+ auto tile_gid_0 = writer->declare_tile("gid_0", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_1 = writer->declare_tile("gid_1", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_2 = writer->declare_tile("gid_2", ckw::TileInfo(ckw::DataType::Int32));
+
+ writer->op_get_global_id(tile_gid_0, 0);
+ writer->op_get_global_id(tile_gid_1, 1);
+ writer->op_get_global_id(tile_gid_2, 2);
+
+ auto tile_cout0 = writer->declare_tile("cout0", ckw::TileInfo(ckw::DataType::Int32)); // OFM
+ auto tile_mout0 = writer->declare_tile("mout0", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH or WIDTH x HEIGHT
+ auto tile_mout1 = writer->declare_tile("mout1", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT or 0
+ auto tile_bout0 = writer->declare_tile("bout0", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX
+
+ // Calculate coordinates
+ get_coordinate_from_gws_overlapping_min(writer, tile_cout0, tile_gid_0, const_dst_n0_i32,
+ const_dst_shift_back_n0_i32, const_0_i32);
+ get_coordinate_from_gws(writer, tile_mout0, tile_gid_1, const_dst_m0_i32);
+
+ // Get the boundary aware coordinates at each global dimension index
+ if (sampler_src.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1)
+ {
+ writer->op_assign(tile_mout1, const_0_i32);
+ get_coordinate_from_gws(writer, tile_bout0, tile_gid_2, const_pos_1_i32);
+ }
+ else if (sampler_src.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2)
+ {
+ writer->op_binary(tile_mout1, ckw::BinaryOp::Mod, tile_gid_2, const_dst_h_i32);
+ writer->op_binary(tile_bout0, ckw::BinaryOp::Div, tile_gid_2, const_dst_h_i32);
+ }
+ ckw::DataType src_dt = to_ckw(_src->data_type());
+ auto tile_src = writer->declare_tile("src", ckw::TileInfo(src_dt, dst_m0, dst_n0));
+
+ writer->op_load(tile_src, src->tensor(), sampler_src, tile_cout0, tile_mout0, tile_mout1, tile_bout0);
+
+ // Here, init_virtual_tensor() it is used to bring the tile_src outside the compound statement
+ src->init_virtual_tensor(tile_src, sampler_src);
+ }
+
+ auto tile_src = src->tile();
+
+ /********************************************************************************
+ * 6 - Extra operations required before writing the main code (optional)
+ ********************************************************************************/
+
+ // Not required
+
+ /********************************************************************************
+ * 7 - Write the rest of the code
+ ********************************************************************************/
+ // Only None ConvertPolicy is supported for floating-point data types
+ ckw::ConvertPolicy convert_policy = ckw::ConvertPolicy::None;
+
+ writer->op_cast(tile_dst, tile_src, convert_policy);
+ ARM_COMPUTE_ERROR_ON_MSG(dst->has_tile() == false, "You must bind a tile before appending another component");
+}
+
+Window GpuCkwCast::get_window() const
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
+
+ TensorShape output_shape = _dst->tensor_shape();
+ // Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) unchanged
+ // This is in line with the collapsing convention used by operators like Conv2d
+ output_shape.collapse(2U, 1U);
+ constexpr uint32_t vector_size_byte_opencl = 16;
+ const uint32_t num_elems_processed_per_iteration =
+ adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0));
+ Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration));
+
+ return win;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.h
new file mode 100644
index 0000000000..2389301196
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWCAST
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWCAST
+
+#include "src/core/common/Macros.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class GpuCkwCast : public IGpuCkwComponentDriver
+{
+public:
+ using Attributes = ClComponentCast::Attributes;
+ /** Constructor
+ *
+ * For supported configurations please refer to @ref ClComponentCast::validate()
+ *
+ * @param[in] id Component id
+ * @param[in] tensors Tensor arguments to the component
+ * @param[in] attributes Component attributes
+ */
+ GpuCkwCast(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes);
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwCast);
+ /** Destructor */
+ ~GpuCkwCast() override = default;
+ // Inherited methods overriden:
+ virtual void write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const override;
+ Window get_window() const override;
+
+private:
+ const ITensorInfo *_src;
+ const ITensorInfo *_dst;
+ Attributes _attributes;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif /* ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWCAST */
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.cpp
new file mode 100644
index 0000000000..cfccab186b
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.cpp
@@ -0,0 +1,361 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+
+#include "compute_kernel_writer/include/ckw/KernelWriter.h"
+#include <cstdint>
+#include <string>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+GpuCkwDepthwiseConv2d::GpuCkwDepthwiseConv2d(ComponentId id,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings)
+ : IGpuCkwComponentDriver{id, tensors}, _src{}, _wei{}, _bia{}, _dst{}, _attributes{attributes}, _settings{settings}
+{
+ _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
+ _wei = this->tensors().get_const_tensor(TensorType::ACL_SRC_1);
+ if (this->tensors().get_const_tensor(TensorType::ACL_SRC_2))
+ {
+ _bia = this->tensors().get_const_tensor(TensorType::ACL_SRC_2);
+ }
+ _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _wei, _bia, _dst);
+}
+
+void GpuCkwDepthwiseConv2d::write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const
+{
+ // Data Layout is NHWC
+ const uint32_t width_idx = get_data_layout_dimension_index(_wei->data_layout(), DataLayoutDimension::WIDTH);
+ const uint32_t height_idx = get_data_layout_dimension_index(_wei->data_layout(), DataLayoutDimension::HEIGHT);
+
+ /********************************************************************************
+ * 1 - Define tensors
+ ********************************************************************************/
+ GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src");
+ GpuCkwComponentArgument *wei = vtable.declare_variable(comp_group, writer, _wei, "wei");
+ GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
+ GpuCkwComponentArgument *bia = nullptr;
+
+ const bool using_bias = _bia != nullptr;
+
+ if (using_bias)
+ {
+ bia = vtable.declare_variable(comp_group, writer, _bia, "bia");
+ }
+
+ /********************************************************************************
+ * 2 - Define CKW constants
+ ********************************************************************************/
+ const auto dst_dt = to_ckw(_dst->data_type());
+ const auto kernel_height = static_cast<int32_t>(_wei->dimension(height_idx));
+ const auto kernel_width = static_cast<int32_t>(_wei->dimension(width_idx));
+ const auto src_w = static_cast<int32_t>(_src->dimension(width_idx));
+ const auto src_h = static_cast<int32_t>(_src->dimension(height_idx));
+ const auto dst_h = static_cast<int32_t>(_dst->dimension(height_idx));
+ const auto stride_x = static_cast<int32_t>(_attributes.stride().x());
+ const auto stride_y = static_cast<int32_t>(_attributes.stride().y());
+ const auto pad_x = static_cast<int32_t>(_attributes.pad().left);
+ const auto pad_y = static_cast<int32_t>(_attributes.pad().top);
+ const auto depth_multiplier = static_cast<int32_t>(_attributes.depth_multiplier());
+ const auto dilation_x = static_cast<int32_t>(_attributes.dilation().x());
+ const auto dilation_y = static_cast<int32_t>(_attributes.dilation().y());
+ const auto kernel_size = kernel_width * kernel_height;
+
+ // CKW constants
+ auto const_kernel_w_i32 = writer->declare_constant_tile(ckw::ConstantData({{kernel_width}}, ckw::DataType::Int32));
+ auto const_kernel_size_i32 =
+ writer->declare_constant_tile(ckw::ConstantData({{kernel_size}}, ckw::DataType::Int32));
+ auto const_dst_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_h}}, ckw::DataType::Int32));
+ auto const_src_w_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_w}}, ckw::DataType::Int32));
+ auto const_src_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_h}}, ckw::DataType::Int32));
+ auto const_stride_x_i32 = writer->declare_constant_tile(ckw::ConstantData({{stride_x}}, ckw::DataType::Int32));
+ auto const_stride_y_i32 = writer->declare_constant_tile(ckw::ConstantData({{stride_y}}, ckw::DataType::Int32));
+ auto const_pad_x_i32 = writer->declare_constant_tile(ckw::ConstantData({{pad_x}}, ckw::DataType::Int32));
+ auto const_pad_y_i32 = writer->declare_constant_tile(ckw::ConstantData({{pad_y}}, ckw::DataType::Int32));
+ auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32));
+ auto const_neg_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{-1}}, ckw::DataType::Int32));
+ auto const_depth_multiplier_i32 =
+ writer->declare_constant_tile(ckw::ConstantData({{depth_multiplier}}, ckw::DataType::Int32));
+ auto const_dilation_x_i32 = writer->declare_constant_tile(ckw::ConstantData({{dilation_x}}, ckw::DataType::Int32));
+ auto const_dilation_y_i32 = writer->declare_constant_tile(ckw::ConstantData({{dilation_y}}, ckw::DataType::Int32));
+ auto const_0_fp = writer->declare_constant_tile(ckw::ConstantData({{0.0f}}, dst_dt));
+
+ /********************************************************************************
+ * 3 - Define the compute block parameters and destination tile (if not root component)
+ * Bind the tile to the tensor to share it among different components and
+ * initialize the compute block parameters
+ ********************************************************************************/
+ // The compute block parameters depend on the employed tensor format
+ const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
+
+ // Destination compute block size
+ const int32_t dst_n0 = root_window.x().step();
+ const int32_t dst_m0 = root_window.y().step();
+
+ // Destination compute block size left-over
+ const int32_t dst_n0_partial = _dst->dimension(0) % dst_n0;
+ const int32_t dst_m0_partial = _dst->dimension(1) % dst_m0;
+
+ // Shift-back for the overlapping-min strategy
+ const int32_t dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0;
+
+ const int32_t src_m0 = kernel_width + (dst_m0 - 1);
+ const int32_t src_n0 = depth_multiplier > 1 ? 1 : dst_n0;
+ const int32_t wei_m0 = kernel_width;
+ const int32_t wei_n0 = dst_n0;
+
+ ckw::TensorSampler sampler_dst;
+ sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+ if (dst_n0_partial == 0)
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin);
+ }
+
+ if (dst_m0_partial == 0)
+ {
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::ClampToBorderMaxOnly);
+ }
+
+ sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+ // Declare destination tile
+ auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, dst_m0, dst_n0));
+
+ // Initialize the destination tile
+ writer->op_assign(tile_dst, const_0_fp);
+
+ // Bind tile to the tensor
+ dst->init_virtual_tensor(tile_dst, sampler_dst);
+
+ /********************************************************************************
+ * 4 - Define the compute block parameters CKW constants
+ ********************************************************************************/
+ // Only now we can declare the N0 and M0 as constant
+ auto const_dst_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32));
+ auto const_dst_m0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_m0}}, ckw::DataType::Int32));
+ auto const_shift_back_dst_n0_i32 =
+ writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32));
+
+ /********************************************************************************
+ * 5 - Define the sampler for the input tensors
+ ********************************************************************************/
+ // SOURCE SAMPLER
+ ckw::TensorSampler sampler_src;
+ sampler_src.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+ sampler_src.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ sampler_src.address_mode_y(ckw::TensorSamplerAddressModeY::SkipLessThanZero);
+ sampler_src.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_src.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+ // WEIGHTS SAMPLER
+ // We cannot have out-of-bounds accesses for the weights
+ ckw::TensorSampler sampler_wei;
+ sampler_wei.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+ sampler_wei.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ sampler_wei.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ sampler_wei.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ if (_settings.export_weights_to_cl_image())
+ {
+ sampler_wei.storage(ckw::TensorStorageType::Texture2dReadOnly);
+ }
+ else
+ {
+ sampler_wei.storage(ckw::TensorStorageType::BufferUint8Ptr);
+ }
+
+ // BIAS SAMPLER
+ ckw::TensorSampler sampler_bia;
+ sampler_bia.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+ sampler_bia.address_mode_x(sampler_dst.address_mode_x());
+ sampler_bia.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ sampler_bia.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_bia.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+ /********************************************************************************
+ * 6 - Extra operations required before writing the main code (Optional)
+ ********************************************************************************/
+ // Not required
+
+ /********************************************************************************
+ * 7 - Get the coordinates of the destination tile
+ ********************************************************************************/
+ auto tile_gid_0 = writer->declare_tile("gid_0", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_1 = writer->declare_tile("gid_1", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_2 = writer->declare_tile("gid_2", ckw::TileInfo(ckw::DataType::Int32));
+
+ writer->op_get_global_id(tile_gid_0, 0);
+ writer->op_get_global_id(tile_gid_1, 1);
+ writer->op_get_global_id(tile_gid_2, 2);
+
+ auto tile_cout0 = writer->declare_tile("cout0", ckw::TileInfo(ckw::DataType::Int32)); // OFM
+ auto tile_mout0 = writer->declare_tile("mout0", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH
+ auto tile_mout1 = writer->declare_tile("mout1", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT
+ auto tile_bout0 = writer->declare_tile("bout0", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX
+
+ // Calculate coordinates
+ get_coordinate_from_gws_overlapping_min(writer, tile_cout0, tile_gid_0, const_dst_n0_i32,
+ const_shift_back_dst_n0_i32, const_0_i32);
+ get_coordinate_from_gws(writer, tile_mout0, tile_gid_1, const_dst_m0_i32);
+ writer->op_binary(tile_mout1, ckw::BinaryOp::Mod, tile_gid_2, const_dst_h_i32);
+ writer->op_binary(tile_bout0, ckw::BinaryOp::Div, tile_gid_2, const_dst_h_i32);
+
+ auto tile_src_ci = writer->declare_tile("src_ci", ckw::DataType::Int32);
+ writer->op_binary(tile_src_ci, ckw::BinaryOp::Div, tile_cout0, const_depth_multiplier_i32);
+
+ auto tile_src_xi = writer->declare_tile("src_xi", ckw::DataType::Int32);
+ writer->op_binary(tile_src_xi, ckw::BinaryOp::Mul, tile_mout0, const_stride_x_i32);
+ writer->op_binary(tile_src_xi, ckw::BinaryOp::Sub, tile_src_xi, const_pad_x_i32);
+
+ auto tile_src_yi = writer->declare_tile("src_yi", ckw::DataType::Int32);
+ writer->op_binary(tile_src_yi, ckw::BinaryOp::Mul, tile_mout1, const_stride_y_i32);
+ writer->op_binary(tile_src_yi, ckw::BinaryOp::Sub, tile_src_yi, const_pad_y_i32);
+
+ // Loop variables
+ auto tile_yk = writer->declare_tile("yk", ckw::DataType::Int32);
+
+ writer->op_assign(tile_yk, const_0_i32);
+
+ // clang-format off
+ writer->op_for_loop(tile_yk, ckw::BinaryOp::Less, const_kernel_size_i32, tile_yk, ckw::AssignmentOp::Increment, const_kernel_w_i32,
+ [&]()
+ {
+ auto tile_src = writer->declare_tile("a", ckw::TileInfo(to_ckw(_src->data_type()), src_m0, src_n0));
+ auto tile_wei = writer->declare_tile("b", ckw::TileInfo(to_ckw(_wei->data_type()), wei_m0, wei_n0));
+
+ writer->op_assign(tile_src, const_0_fp);
+
+ auto tile_x_gte_0 = writer->declare_tile("x_gte_0", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_y_gte_0 = writer->declare_tile("y_gte_0", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_x_lt_w = writer->declare_tile("x_lt_w", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_y_lt_h = writer->declare_tile("y_lt_h", ckw::TileInfo(ckw::DataType::Int32));
+
+ // Check if yi + yk * DILATION_Y is out-of-bound
+ writer->op_binary(tile_y_gte_0, ckw::BinaryOp::GreaterEqual, tile_src_yi, const_0_i32);
+ writer->op_binary(tile_y_lt_h, ckw::BinaryOp::Less, tile_src_yi, const_src_h_i32);
+
+ auto tile_src_mi = writer->declare_tile("src_mi", ckw::TileInfo(ckw::DataType::Int32));
+
+ // Load src
+ for(int32_t xk = 0; xk < src_m0; ++xk)
+ {
+ auto const_xk_i32 = writer->declare_constant_tile(ckw::ConstantData({{xk}}, ckw::DataType::Int32));
+
+ // xi + xk * DILATION_X
+ writer->op_binary(tile_src_mi, ckw::BinaryOp::Mul, const_xk_i32, const_dilation_x_i32);
+ writer->op_binary(tile_src_mi, ckw::BinaryOp::Add, tile_src_mi, tile_src_xi);
+
+ // Check if xi + xk * DILATION_X is out-of-bound
+ writer->op_binary(tile_x_gte_0, ckw::BinaryOp::GreaterEqual, tile_src_mi, const_0_i32);
+ writer->op_binary(tile_x_lt_w, ckw::BinaryOp::Less, tile_src_mi, const_src_w_i32);
+
+ // Set mi to -1 if we have out-of-bound memory accesses
+ writer->op_ternary(tile_src_mi, ckw::TernaryOp::Select, const_neg_1_i32, tile_src_mi, tile_x_gte_0);
+ writer->op_ternary(tile_src_mi, ckw::TernaryOp::Select, const_neg_1_i32, tile_src_mi, tile_x_lt_w);
+ writer->op_ternary(tile_src_mi, ckw::TernaryOp::Select, const_neg_1_i32, tile_src_mi, tile_y_gte_0);
+ writer->op_ternary(tile_src_mi, ckw::TernaryOp::Select, const_neg_1_i32, tile_src_mi, tile_y_lt_h);
+
+ writer->op_load(tile_src.row(xk), src->tensor(), sampler_src, tile_src_ci, tile_src_mi, tile_src_yi, tile_bout0);
+ }
+
+ // Load wei
+ writer->op_load(tile_wei, wei->tensor(), sampler_wei, tile_cout0, tile_yk, const_0_i32, const_0_i32);
+
+ // Attention: MAC (Multiply-and-Accumulate) ternary operator is currently unsupported in CKW
+ // Therefore, this part should be replaced with the MAC ternary operator when availabe
+ auto tile_tmp = writer->declare_tile("tmp", ckw::TileInfo(to_ckw(_src->data_type()), 1, dst_n0));
+ for(int32_t m0 = 0; m0 < dst_m0; ++m0)
+ {
+ for(int32_t xk = 0; xk < kernel_width; ++xk)
+ {
+ auto tile_a = tile_src.row(m0 + xk);
+ auto tile_b = tile_wei.row(xk);
+ auto tile_c = tile_dst.row(m0);
+
+ writer->op_binary(tile_tmp, ckw::BinaryOp::Mul, tile_a, tile_b);
+ writer->op_binary(tile_c, ckw::BinaryOp::Add, tile_c, tile_tmp);
+ }
+ }
+ writer->op_binary(tile_src_yi, ckw::BinaryOp::Add, tile_src_yi, const_dilation_y_i32);
+ });
+ // clang-format on
+
+ // Bias addition
+ // NOTE: This operation will be removed from this kernel as the interface is standardized. The intended way of
+ // performing bias addition is to fuse this convolution kernel with a following elementwise addition kernel.
+ if (using_bias)
+ {
+ if (!bia->has_tile())
+ {
+ auto tile_bia = writer->declare_tile("bia", ckw::TileInfo(to_ckw(_src->data_type()), 1, dst_n0));
+ writer->op_load(tile_bia, bia->tensor(), sampler_bia, tile_cout0, const_0_i32, const_0_i32, const_0_i32);
+ bia->init_virtual_tensor(tile_bia, sampler_bia);
+ }
+ auto &tile_bia = bia->tile();
+
+ writer->op_binary(tile_dst, ckw::BinaryOp::Add, tile_dst, tile_bia);
+ }
+
+ ARM_COMPUTE_ERROR_ON_MSG(dst->has_tile() == false, "You must bind a tile before appending another component");
+}
+
+Window GpuCkwDepthwiseConv2d::get_window() const
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
+ TensorShape output_shape = _dst->tensor_shape();
+
+ Window win = calculate_max_window(output_shape, Steps(_settings.n0(), _settings.m0()));
+ return win.collapse(win, Window::DimZ);
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.h
new file mode 100644
index 0000000000..a15d3ee710
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWDEPTHWISECONV2D_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWDEPTHWISECONV2D_H
+
+#include "arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h"
+
+#include "src/core/common/Macros.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+
+class GpuCkwDepthwiseConv2d : public IGpuCkwComponentDriver
+{
+public:
+ using Attributes = ClComponentDepthwiseConv2d::Attributes;
+ using Settings = ClComponentDepthwiseConv2d::Settings;
+
+ /** Constructor
+ *
+ * For supported configurations please refer to @ref ClComponentDepthwiseConv2d::validate()
+ *
+ * @param[in] id Component id
+ * @param[in] tensors Tensor arguments to the component
+ * @param[in] attributes Component attributes
+ * @param[in] settings Component settings
+ */
+ GpuCkwDepthwiseConv2d(ComponentId id,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings);
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwDepthwiseConv2d);
+ /** Destructor */
+ ~GpuCkwDepthwiseConv2d() override = default;
+ // Inherited methods overriden:
+ virtual void write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const override;
+ Window get_window() const override;
+
+private:
+ const ITensorInfo *_src;
+ const ITensorInfo *_wei;
+ const ITensorInfo *_bia;
+ const ITensorInfo *_dst;
+ Attributes _attributes;
+ Settings _settings;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWDEPTHWISECONV2D_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp
new file mode 100644
index 0000000000..eb4f644eb6
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp
@@ -0,0 +1,427 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
+
+#include "compute_kernel_writer/include/ckw/KernelWriter.h"
+#include <cstdint>
+#include <string>
+#include <vector>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+
+using TileContainer = std::vector<std::vector<int32_t>>;
+
+GpuCkwDirectConv2d::GpuCkwDirectConv2d(ComponentId id,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings)
+ : IGpuCkwComponentDriver{id, tensors}, _src{}, _wei{}, _bia{}, _dst{}, _attributes{attributes}, _settings{settings}
+{
+ _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
+ _wei = this->tensors().get_const_tensor(TensorType::ACL_SRC_1);
+ _bia = this->tensors().get_const_tensor(TensorType::ACL_SRC_2);
+ _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _wei, _dst); // Bias can be null
+}
+
+void GpuCkwDirectConv2d::write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const
+{
+ const auto desc = _settings.direct_conv_descriptor();
+ ARM_COMPUTE_ERROR_ON_MSG(desc.export_input_to_cl_image || desc.export_output_to_cl_image,
+ "Only the weights tensor can be exported to cl_image");
+
+ const uint32_t channel_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::CHANNEL);
+ const uint32_t width_idx = get_data_layout_dimension_index(_wei->data_layout(), DataLayoutDimension::WIDTH);
+ const uint32_t height_idx = get_data_layout_dimension_index(_wei->data_layout(), DataLayoutDimension::HEIGHT);
+
+ /********************************************************************************
+ * 1 - Define tensors
+ ********************************************************************************/
+ GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src");
+ GpuCkwComponentArgument *wei = vtable.declare_variable(comp_group, writer, _wei, "wei");
+ GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
+ GpuCkwComponentArgument *bia = nullptr;
+
+ const bool using_bias = _bia != nullptr;
+
+ if (using_bias)
+ {
+ bia = vtable.declare_variable(comp_group, writer, _bia, "bia");
+ }
+
+ /********************************************************************************
+ * 2 - Define CKW constants
+ ********************************************************************************/
+ const auto dst_dt = to_ckw(_dst->data_type());
+ const auto kernel_height = static_cast<int32_t>(_wei->dimension(height_idx));
+ const auto kernel_width = static_cast<int32_t>(_wei->dimension(width_idx));
+ const auto src_c = static_cast<int32_t>(_src->dimension(channel_idx));
+ const auto src_w = static_cast<int32_t>(_src->dimension(width_idx));
+ const auto src_h = static_cast<int32_t>(_src->dimension(height_idx));
+ const auto dst_w = static_cast<int32_t>(_dst->dimension(width_idx));
+ const auto stride_x = static_cast<int32_t>(_attributes.stride().x());
+ const auto stride_y = static_cast<int32_t>(_attributes.stride().y());
+ const auto pad_x = static_cast<int32_t>(_attributes.pad().left);
+ const auto pad_y = static_cast<int32_t>(_attributes.pad().top);
+ const auto kernel_size = kernel_width * kernel_height;
+ const auto k0 =
+ static_cast<int32_t>(adjust_vec_size(_settings.direct_conv_descriptor().k0, _src->dimension(channel_idx)));
+
+ // CKW constants
+ auto const_kernel_w_i32 = writer->declare_constant_tile(ckw::ConstantData({{kernel_width}}, ckw::DataType::Int32));
+ auto const_kernel_size_i32 =
+ writer->declare_constant_tile(ckw::ConstantData({{kernel_size}}, ckw::DataType::Int32));
+ auto const_src_c_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_c}}, ckw::DataType::Int32));
+ auto const_src_w_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_w}}, ckw::DataType::Int32));
+ auto const_src_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_h}}, ckw::DataType::Int32));
+ auto const_dst_w_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_w}}, ckw::DataType::Int32));
+ auto const_stride_x_i32 = writer->declare_constant_tile(ckw::ConstantData({{stride_x}}, ckw::DataType::Int32));
+ auto const_stride_y_i32 = writer->declare_constant_tile(ckw::ConstantData({{stride_y}}, ckw::DataType::Int32));
+ auto const_pad_x_i32 = writer->declare_constant_tile(ckw::ConstantData({{pad_x}}, ckw::DataType::Int32));
+ auto const_pad_y_i32 = writer->declare_constant_tile(ckw::ConstantData({{pad_y}}, ckw::DataType::Int32));
+ auto const_k0_i32 = writer->declare_constant_tile(ckw::ConstantData({{k0}}, ckw::DataType::Int32));
+ auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32));
+ auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32));
+ auto const_neg_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{-1}}, ckw::DataType::Int32));
+ auto const_0_fp = writer->declare_constant_tile(ckw::ConstantData({{0.0f}}, dst_dt));
+ auto const_src_c_i32_minus_k0_i32 =
+ writer->declare_constant_tile(ckw::ConstantData({{src_c - k0}}, ckw::DataType::Int32));
+
+ /********************************************************************************
+ * 3 - Define the compute block parameters and destination tile (if not root component)
+ * Bind the tile to the tensor to share it among different components and
+ * initialize the compute block parameters
+ ********************************************************************************/
+ // The compute block parameters depend on the employed tensor format
+ const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
+
+ // Destination compute block size
+ const int32_t dst_n0 = root_window.x().step();
+ const int32_t dst_m0 = root_window.y().step();
+
+ // Destination compute block size left-over
+ const int32_t dst_n0_partial = _dst->dimension(0) % dst_n0;
+ const int32_t dst_m0_partial = (_dst->dimension(1) * _dst->dimension(2)) % dst_m0;
+
+ // Shift-back for the overlapping-min strategy
+ const int32_t dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0;
+
+ ckw::TensorSampler sampler_dst;
+ sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1);
+ if (dst_n0_partial == 0)
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin);
+ }
+
+ if (dst_m0_partial == 0)
+ {
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::ClampToBorderMaxOnly);
+ }
+
+ sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+ // Declare destination tile
+ auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, dst_m0, dst_n0));
+
+ // Initialize destination tile
+ writer->op_assign(tile_dst, const_0_fp);
+
+ // Bind tile to the tensor
+ dst->init_virtual_tensor(tile_dst, sampler_dst);
+
+ /********************************************************************************
+ * 4 - Define the compute block parameters CKW constants
+ ********************************************************************************/
+ // Only now we can declare the N0 and M0 as constant
+ auto const_dst_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32));
+ auto const_dst_m0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_m0}}, ckw::DataType::Int32));
+ auto const_shift_back_dst_n0_i32 =
+ writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32));
+
+ /********************************************************************************
+ * 5 - Define the samplers for the input tensors
+ ********************************************************************************/
+ // Exporting the weights tensor to an OpenCL image object is currently only supported when:
+ // a) k0 is equal to 4
+ // The current implementation expects to read a vector of 4 float values into the OpenCL image object.
+ // b) K is a multiple of 4
+ // This is a limitation in the current interface due to the variable table being responsible for maintaining
+ // information about the TensorStorageType rather than the TensorTileSampler. As a result, TensorStorageType cannot
+ // be reassigned, and we cannot use a texture object for the weights tensor in cases where we expect to have an
+ // extra loop to compute the left-over elements.
+ const bool use_cl_image_for_weights = desc.export_weights_to_cl_image && (k0 == 4) && (src_c % 4 == 0);
+
+ // SOURCE SAMPLER
+ // - We cannot have out-of-bounds reads in the X dimension (mapped to the IFMs) as we have an extra loop to
+ // compute left-over elements
+ // - We cannot have out-of-bounds reads when the kernel height is equal to 1. In all other cases, we need to ensure the
+ // indirection buffer mi does not contain negative values representing out-of-bounds reads.
+ auto address_mode_y_src =
+ kernel_height == 1 ? ckw::TensorSamplerAddressModeY::None : ckw::TensorSamplerAddressModeY::SkipLessThanZero;
+ ckw::TensorSampler sampler_src;
+ sampler_src.format(ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1); // 3rd dimension collapsed with 2nd dimension
+ sampler_src.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ sampler_src.address_mode_y(address_mode_y_src);
+ sampler_src.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_src.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+ // WEIGHTS SAMPLER
+ // We cannot have out-of-bounds accesses for the weights
+ ckw::TensorSampler sampler_wei;
+ sampler_wei.format(ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1); // 3rd dimension collapsed with 2nd dimension
+ sampler_wei.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ sampler_wei.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ sampler_wei.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ if (use_cl_image_for_weights)
+ {
+ sampler_wei.storage(ckw::TensorStorageType::Texture2dReadOnly);
+ }
+ else
+ {
+ sampler_wei.storage(ckw::TensorStorageType::BufferUint8Ptr);
+ }
+
+ // BIAS SAMPLER
+ ckw::TensorSampler sampler_bia;
+
+ if (using_bias)
+ {
+ sampler_bia.format(ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1);
+ sampler_bia.address_mode_x(sampler_dst.address_mode_x());
+ sampler_bia.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ sampler_bia.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_bia.storage(ckw::TensorStorageType::BufferUint8Ptr);
+ }
+
+ /********************************************************************************
+ * 6 - Extra operations required before writing the main code (optional)
+ ********************************************************************************/
+
+ // Not required
+
+ /********************************************************************************
+ * 7 - Get the coordinates of the destination tile
+ ********************************************************************************/
+ auto tile_gid_0 = writer->declare_tile("gid_0", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_1 = writer->declare_tile("gid_1", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_2 = writer->declare_tile("gid_2", ckw::TileInfo(ckw::DataType::Int32));
+
+ writer->op_get_global_id(tile_gid_0, 0);
+ writer->op_get_global_id(tile_gid_1, 1);
+ writer->op_get_global_id(tile_gid_2, 2);
+
+ auto tile_cout = writer->declare_tile("cout", ckw::TileInfo(ckw::DataType::Int32)); // OFM
+ auto tile_mout = writer->declare_tile("mout", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH x HEIGHT
+ auto tile_bout = writer->declare_tile("bout", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX
+
+ // Calculate coordinates
+ get_coordinate_from_gws_overlapping_min(writer, tile_cout, tile_gid_0, const_dst_n0_i32,
+ const_shift_back_dst_n0_i32, const_0_i32);
+ get_coordinate_from_gws(writer, tile_mout, tile_gid_1, const_dst_m0_i32);
+ get_coordinate_from_gws(writer, tile_bout, tile_gid_2, const_pos_1_i32);
+
+ /********************************************************************************
+ * 8 - Write the rest of the code
+ ********************************************************************************/
+ // We create a 2d container of size (dst_m0, 1) to store the indices for iteration
+ TileContainer it;
+ for (int32_t m = 0; m < dst_m0; ++m)
+ {
+ std::vector<int32_t> idx{m};
+ it.push_back({idx});
+ }
+
+ const auto &const_idxs = writer->declare_constant_tile(ckw::ConstantData(it, ckw::DataType::Int32));
+
+ auto tile_xi = writer->declare_tile("xi", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1));
+ auto tile_yi = writer->declare_tile("yi", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1));
+
+ // Convert the linear index to coordinate
+ // xi = ((mout + i) % dst_w) * stride_x - pad_x
+ // yi = ((mout + i) / dst_w) * stride_y - pad_y
+ writer->op_binary(tile_xi, ckw::BinaryOp::Add, tile_mout, const_idxs);
+ writer->op_binary(tile_yi, ckw::BinaryOp::Add, tile_mout, const_idxs);
+ writer->op_binary(tile_xi, ckw::BinaryOp::Mod, tile_xi, const_dst_w_i32);
+ writer->op_binary(tile_yi, ckw::BinaryOp::Div, tile_yi, const_dst_w_i32);
+ writer->op_binary(tile_xi, ckw::BinaryOp::Mul, tile_xi, const_stride_x_i32);
+ writer->op_binary(tile_yi, ckw::BinaryOp::Mul, tile_yi, const_stride_y_i32);
+ writer->op_binary(tile_xi, ckw::BinaryOp::Sub, tile_xi, const_pad_x_i32);
+ writer->op_binary(tile_yi, ckw::BinaryOp::Sub, tile_yi, const_pad_y_i32);
+
+ auto tile_y_b = writer->declare_tile("y_b", ckw::TileInfo(ckw::DataType::Int32));
+ writer->op_binary(tile_y_b, ckw::BinaryOp::Mul, tile_cout, const_kernel_size_i32);
+
+ auto tile_i = writer->declare_tile("i", ckw::TileInfo(ckw::DataType::Int32));
+ writer->op_assign(tile_i, const_0_i32);
+
+ // clang-format off
+ writer->op_for_loop(tile_i, ckw::BinaryOp::Less, const_kernel_size_i32, tile_i, ckw::AssignmentOp::Increment, const_pos_1_i32, [&]()
+ {
+ auto tile_x_k = writer->declare_tile("x_k", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_y_k = writer->declare_tile("y_k", ckw::TileInfo(ckw::DataType::Int32));
+
+ writer->op_binary(tile_x_k, ckw::BinaryOp::Mod, tile_i, const_kernel_w_i32);
+ writer->op_binary(tile_y_k, ckw::BinaryOp::Div, tile_i, const_kernel_w_i32);
+
+ auto tile_ck = writer->declare_tile("ck", ckw::TileInfo(ckw::DataType::Int32));
+ writer->op_assign(tile_ck, const_0_i32);
+
+ // Construct an indirection buffer containing the precalculated addresses of elements in the source tensor
+ // x_s = xi + x_k
+ // y_s = yi + y_k
+ // mi = x_s + y_s * width;
+ // mi = select(-1, mi, x_s >= 0);
+ // mi = select(-1, mi, x_s < width);
+ // mi = select(-1, mi, y_s >= 0);
+ // mi = select(-1, mi, y_s < height);
+ auto tile_xs = writer->declare_tile("xs", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1));
+ auto tile_ys = writer->declare_tile("ys", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1));
+ auto tile_mi = writer->declare_tile("mi", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1));
+
+ auto tile_xs_gte_0 = writer->declare_tile("xs_gte_0", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1));
+ auto tile_ys_gte_0 = writer->declare_tile("ys_gte_0", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1));
+ auto tile_xs_lt_w = writer->declare_tile("xs_lt_w", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1));
+ auto tile_ys_lt_h = writer->declare_tile("ys_lt_h", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1));
+
+ writer->op_binary(tile_xs, ckw::BinaryOp::Add, tile_xi, tile_x_k);
+ writer->op_binary(tile_ys, ckw::BinaryOp::Add, tile_yi, tile_y_k);
+ writer->op_binary(tile_mi, ckw::BinaryOp::Mul, tile_ys, const_src_w_i32);
+ writer->op_binary(tile_mi, ckw::BinaryOp::Add, tile_mi, tile_xs);
+ writer->op_binary(tile_xs_gte_0, ckw::BinaryOp::GreaterEqual, tile_xs, const_0_i32);
+ writer->op_binary(tile_ys_gte_0, ckw::BinaryOp::GreaterEqual, tile_ys, const_0_i32);
+ writer->op_binary(tile_xs_lt_w, ckw::BinaryOp::Less, tile_xs, const_src_w_i32);
+ writer->op_binary(tile_ys_lt_h, ckw::BinaryOp::Less, tile_ys, const_src_h_i32);
+ writer->op_ternary(tile_mi, ckw::TernaryOp::Select, const_neg_1_i32, tile_mi, tile_xs_gte_0);
+ writer->op_ternary(tile_mi, ckw::TernaryOp::Select, const_neg_1_i32, tile_mi, tile_ys_gte_0);
+ writer->op_ternary(tile_mi, ckw::TernaryOp::Select, const_neg_1_i32, tile_mi, tile_xs_lt_w);
+ writer->op_ternary(tile_mi, ckw::TernaryOp::Select, const_neg_1_i32, tile_mi, tile_ys_lt_h);
+
+ writer->op_for_loop(tile_ck, ckw::BinaryOp::LessEqual, const_src_c_i32_minus_k0_i32, tile_ck, ckw::AssignmentOp::Increment, const_k0_i32, [&]()
+ {
+ auto tile_lhs = writer->declare_tile("lhs", ckw::TileInfo(to_ckw(_src->data_type()), dst_m0, k0));
+ auto tile_rhs = writer->declare_tile("rhs", ckw::TileInfo(to_ckw(_wei->data_type()), dst_n0, k0));
+ writer->op_assign(tile_lhs, const_0_fp);
+ writer->op_assign(tile_rhs, const_0_fp);
+
+ writer->op_load_indirect(tile_lhs, src->tensor(), sampler_src, tile_ck, tile_mi, const_0_i32, tile_bout);
+ writer->op_load_dilated(tile_rhs, wei->tensor(), sampler_wei, tile_ck, tile_y_b, const_0_i32, const_0_i32, const_pos_1_i32, const_kernel_size_i32);
+
+ writer->op_binary(tile_dst, ckw::BinaryOp::MatMul_Nt_T, tile_lhs, tile_rhs);
+ });
+
+ // Left-over accumulations for when K is not a multiple of k0
+ if(((src_c % k0) != 0))
+ {
+ writer->op_for_loop(tile_ck, ckw::BinaryOp::Less, const_src_c_i32, tile_ck, ckw::AssignmentOp::Increment, const_pos_1_i32, [&]()
+ {
+ auto tile_lhs = writer->declare_tile("lhs_leftover", ckw::TileInfo(to_ckw(_src->data_type()), dst_m0, 1));
+ auto tile_rhs = writer->declare_tile("rhs_leftover", ckw::TileInfo(to_ckw(_wei->data_type()), dst_n0, 1));
+ writer->op_assign(tile_lhs, const_0_fp);
+ writer->op_assign(tile_rhs, const_0_fp);
+
+ writer->op_load_indirect(tile_lhs, src->tensor(), sampler_src, tile_ck, tile_mi, const_0_i32, tile_bout);
+ writer->op_load_dilated(tile_rhs, wei->tensor(), sampler_wei, tile_ck, tile_y_b, const_0_i32, const_0_i32, const_pos_1_i32, const_kernel_size_i32);
+
+ writer->op_binary(tile_dst, ckw::BinaryOp::MatMul_Nt_T, tile_lhs, tile_rhs);
+ });
+ }
+
+ writer->op_binary(tile_y_b, ckw::BinaryOp::Add, tile_y_b, const_pos_1_i32);
+ });
+ // clang-format on
+
+ // NOTE: The bias addition will be removed from this kernel as the interface is standardized. The intended way of
+ // performing bias addition is to fuse this convolution kernel with a following elementwise addition kernel.
+ if (using_bias)
+ {
+ if (!bia->has_tile())
+ {
+ auto tile_bia = writer->declare_tile("bia", ckw::TileInfo(to_ckw(_src->data_type()), 1, dst_n0));
+ writer->op_load(tile_bia, bia->tensor(), sampler_bia, tile_cout, const_0_i32, const_0_i32, const_0_i32);
+ bia->init_virtual_tensor(tile_bia, sampler_bia);
+ }
+ auto &tile_bia = bia->tile();
+
+ writer->op_binary(tile_dst, ckw::BinaryOp::Add, tile_dst, tile_bia);
+ }
+
+ ARM_COMPUTE_ERROR_ON_MSG(dst->has_tile() == false, "You must bind a tile before appending another component");
+}
+
+Window GpuCkwDirectConv2d::get_window() const
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
+
+ const auto dst_shape = _dst->tensor_shape();
+ const auto desc = _settings.direct_conv_descriptor();
+
+ const uint32_t dst_n0 = adjust_vec_size(desc.n0, dst_shape[0]);
+ const uint32_t dst_m0 = adjust_vec_size(desc.m0, dst_shape[1] * dst_shape[2]);
+
+ Window win = calculate_max_window(dst_shape, Steps(dst_n0, dst_m0));
+
+ const size_t dim_y_collapsed = ceil_to_multiple(dst_shape[1] * dst_shape[2], dst_m0);
+ win.set(Window::DimY, Window::Dimension(0, dim_y_collapsed, dst_m0));
+ win.set(Window::DimZ, Window::Dimension(0, dst_shape.total_size_upper(3), 1));
+
+ return win;
+}
+
+std::string GpuCkwDirectConv2d::get_name(const ComponentGroup &comp_group) const
+{
+ ARM_COMPUTE_UNUSED(comp_group);
+
+ return "direct_conv2d";
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h
new file mode 100644
index 0000000000..139cf620e2
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWDIRECTCONV2D_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWDIRECTCONV2D_H
+
+#include "arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h"
+
+#include "src/core/common/Macros.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class GpuCkwDirectConv2d : public IGpuCkwComponentDriver
+{
+public:
+ using Attributes = ClComponentDirectConv2d::Attributes;
+ using Settings = ClComponentDirectConv2d::Settings;
+
+public:
+ /** Constructor
+ *
+ * For supported configurations please refer to @ref ClComponentDirectConv2d::validate()
+ *
+ * @param[in] id Component id
+ * @param[in] tensors Tensor arguments to the component
+ * @param[in] attributes Component attributes. Attributes are a set of parameters that define what a component does
+ * @param[in] settings Component settings. Settings are a set of parameters that influence the implementation of a component
+ */
+ GpuCkwDirectConv2d(ComponentId id,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings);
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwDirectConv2d);
+ /** Destructor */
+ ~GpuCkwDirectConv2d() override = default;
+
+ // Inherited methods overriden
+ virtual void write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const override;
+ Window get_window() const override;
+ std::string get_name(const ComponentGroup &comp_group) const override;
+
+private:
+ const ITensorInfo *_src;
+ const ITensorInfo *_wei;
+ const ITensorInfo *_bia;
+ const ITensorInfo *_dst;
+
+ Attributes _attributes;
+ Settings _settings;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWDIRECTCONV2D_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp
new file mode 100644
index 0000000000..fb55acad53
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp
@@ -0,0 +1,434 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "GpuCkwElementwiseBinary.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
+#include "src/dynamic_fusion/sketch/gpu/components/utils/type_printer/ElementwiseBinary.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "support/StringSupport.h"
+
+#include "compute_kernel_writer/include/ckw/KernelWriter.h"
+#include "compute_kernel_writer/include/ckw/types/ConstantData.h"
+#include "compute_kernel_writer/include/ckw/types/TensorSamplerTypes.h"
+#include <cstdint>
+#include <string>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+GpuCkwElementwiseBinary::GpuCkwElementwiseBinary(ComponentId id,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes)
+ : IGpuCkwComponentDriver{id, tensors}, _lhs{}, _rhs{}, _dst{}, _attributes{attributes}
+{
+ _lhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
+ _rhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_1);
+ _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_lhs, _rhs, _dst);
+}
+
+void GpuCkwElementwiseBinary::write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const
+{
+ /********************************************************************************
+ * 1 - Define tensors
+ ********************************************************************************/
+ GpuCkwComponentArgument *lhs = vtable.declare_variable(comp_group, writer, _lhs, "lhs");
+ GpuCkwComponentArgument *rhs = vtable.declare_variable(comp_group, writer, _rhs, "rhs");
+ GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
+
+ /********************************************************************************
+ * 2 - Define CKW constants
+ ********************************************************************************/
+ const auto dst_h = static_cast<int32_t>(_dst->dimension(1));
+
+ // CKW constants
+ auto const_dst_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_h}}, ckw::DataType::Int32));
+ auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32));
+ auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32));
+
+ /********************************************************************************
+ * 3 - Define the compute block parameters and destination tile (if not root component)
+ * Bind the tile to the tensor to share it among different components and
+ * initialize the compute block parameters
+ ********************************************************************************/
+ // The compute block parameters depend on the employed tensor format
+
+ // Destination compute block size
+ int32_t dst_n0 = -1;
+ int32_t dst_m0 = -1;
+
+ // Destination compute block size left-over
+ int32_t dst_n0_partial = -1;
+ int32_t dst_m0_partial = -1;
+
+ if (!dst->has_tile())
+ {
+ // If ROOT component, we use ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1
+ // as tensor format
+ const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
+
+ dst_n0 = root_window.x().step();
+ dst_m0 = root_window.y().step();
+ dst_n0_partial = _dst->dimension(0) % dst_n0;
+ dst_m0_partial = (_dst->dimension(1) * _dst->dimension(2)) % dst_m0;
+
+ ckw::TensorSampler sampler_dst;
+ sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1);
+ if (dst_n0_partial == 0)
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin);
+ }
+
+ if (dst_m0_partial == 0)
+ {
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::ClampToBorderMaxOnly);
+ }
+ sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+ // Declare destination tile
+ ckw::DataType dst_dt = to_ckw(_dst->data_type());
+ auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, dst_m0, dst_n0));
+
+ // Bind tile to the tensor
+ dst->init_virtual_tensor(tile_dst, sampler_dst);
+ }
+ else
+ {
+ // Change dst_n0 and dst_m0 if NOT root component!
+ dst_n0 = dst->tile().tile_info().width();
+ dst_m0 = dst->tile().tile_info().height();
+
+ // Here, it is not required the calculation of dst_n0_partial and dst_m0_partial
+ // because if we enter this condition it means that the element-wise op is not the
+ // root component and the address modes have been already set.
+ }
+
+ const auto &tile_dst = dst->tile();
+
+ /********************************************************************************
+ * 4 - Define the compute block parameters CKW constants
+ ********************************************************************************/
+ // ...
+
+ /********************************************************************************
+ * 5 - Define the samplers for the input tensors
+ ********************************************************************************/
+ // Check whether the lhs tensor is a tile or tensor
+ // If it is a tile, create a sampler and load the content in a tile
+ if (!lhs->has_tile())
+ {
+ // Sampler
+ ckw::TensorSampler sampler_lhs = dst->tensor_sampler();
+
+ bool broadcast_x = false;
+ bool broadcast_y = false;
+
+ int32_t lhs_n0 = dst_n0;
+ int32_t lhs_m0 = dst_m0;
+
+ // Check whether we have broadcasting
+ // In case of broadcast, lhs can only be a vector or scalar.
+ // Broadcasting in other dimensions is not supported
+ if (_dst->dimension(0) != _lhs->dimension(0))
+ {
+ broadcast_x = true;
+ lhs_n0 = 1;
+ }
+
+ if (sampler_lhs.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1)
+ {
+ if (_dst->dimension(1) * _dst->dimension(2) != _lhs->dimension(1) * _lhs->dimension(2))
+ {
+ broadcast_y = true;
+ lhs_m0 = 1;
+ }
+ }
+ else if (sampler_lhs.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2)
+ {
+ if (_dst->dimension(1) != _lhs->dimension(1))
+ {
+ broadcast_y = true;
+ lhs_m0 = 1;
+ }
+ }
+
+ const int32_t lhs_partial_n0 = _lhs->dimension(0) % lhs_n0;
+ const int32_t lhs_shift_back = (lhs_n0 - lhs_partial_n0) % lhs_n0;
+
+ // Constants
+ auto const_lhs_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{lhs_n0}}, ckw::DataType::Int32));
+ auto const_lhs_m0_i32 = writer->declare_constant_tile(ckw::ConstantData({{lhs_m0}}, ckw::DataType::Int32));
+ auto const_lhs_shift_back_n0_i32 =
+ writer->declare_constant_tile(ckw::ConstantData({{lhs_shift_back}}, ckw::DataType::Int32));
+
+ auto tile_gid_0 = writer->declare_tile("gid_0_lhs", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_1 = writer->declare_tile("gid_1_lhs", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_2 = writer->declare_tile("gid_2_lhs", ckw::TileInfo(ckw::DataType::Int32));
+
+ writer->op_get_global_id(tile_gid_0, 0);
+ writer->op_get_global_id(tile_gid_1, 1);
+ writer->op_get_global_id(tile_gid_2, 2);
+
+ auto tile_cout0 = writer->declare_tile("cout0_lhs", ckw::TileInfo(ckw::DataType::Int32)); // OFM
+ auto tile_mout0 =
+ writer->declare_tile("mout0_lhs", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH or WIDTH x HEIGHT
+ auto tile_mout1 = writer->declare_tile("mout1_lhs", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT or 0
+ auto tile_bout0 = writer->declare_tile("bout0_lhs", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX
+
+ // Calculate coordinates
+ if (!broadcast_x)
+ {
+ get_coordinate_from_gws_overlapping_min(writer, tile_cout0, tile_gid_0, const_lhs_n0_i32,
+ const_lhs_shift_back_n0_i32, const_0_i32);
+ }
+ else
+ {
+ writer->op_assign(tile_cout0, const_0_i32);
+ }
+
+ if (!broadcast_y)
+ {
+ get_coordinate_from_gws(writer, tile_mout0, tile_gid_1, const_lhs_m0_i32);
+ }
+ else
+ {
+ writer->op_assign(tile_mout0, const_0_i32);
+ }
+
+ // Get the boundary aware coordinates at each global dimension index
+ if (sampler_lhs.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1)
+ {
+ writer->op_assign(tile_mout1, const_0_i32);
+ get_coordinate_from_gws(writer, tile_bout0, tile_gid_2, const_pos_1_i32);
+ }
+ else if (sampler_lhs.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2)
+ {
+ // For tile_mout1 and tile_bout0 the step can only be 1
+ if (!broadcast_y)
+ {
+ writer->op_binary(tile_mout1, ckw::BinaryOp::Mod, tile_gid_2, const_dst_h_i32);
+ }
+ else
+ {
+ // If broadcast_y == true, it means that we have either a scalar or vector
+ // because broadcasting in other dimensions is not supported
+ writer->op_assign(tile_mout1, const_0_i32);
+ }
+
+ writer->op_binary(tile_bout0, ckw::BinaryOp::Div, tile_gid_2, const_dst_h_i32);
+ }
+
+ ckw::DataType lhs_dt = to_ckw(_lhs->data_type());
+ auto tile_lhs = writer->declare_tile("lhs", ckw::TileInfo(lhs_dt, lhs_m0, lhs_n0));
+
+ writer->op_load(tile_lhs, lhs->tensor(), sampler_lhs, tile_cout0, tile_mout0, tile_mout1, tile_bout0);
+
+ // Here, init_virtual_tensor() is used to bring the tile_lhs outside the compound statement
+ lhs->init_virtual_tensor(tile_lhs, sampler_lhs);
+ }
+
+ // Check whether the rhs tensor is a tile or tensor
+ // If it is a tile, create a sampler and load the content in a tile
+ if (!rhs->has_tile())
+ {
+ // Sampler
+ ckw::TensorSampler sampler_rhs = dst->tensor_sampler();
+
+ bool broadcast_x = false;
+ bool broadcast_y = false;
+
+ int32_t rhs_n0 = dst_n0;
+ int32_t rhs_m0 = dst_m0;
+
+ // Check whether we have broadcasting
+ // In case of broadcast, rhs can only be a vector or scalar.
+ // Broadcasting in other dimensions is not supported
+ if (_dst->dimension(0) != _rhs->dimension(0))
+ {
+ broadcast_x = true;
+ rhs_n0 = 1;
+ }
+
+ if (sampler_rhs.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1)
+ {
+ if (_dst->dimension(1) * _dst->dimension(2) != _rhs->dimension(1) * _rhs->dimension(2))
+ {
+ broadcast_y = true;
+ rhs_m0 = 1;
+ }
+ }
+ else if (sampler_rhs.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2)
+ {
+ if (_dst->dimension(1) != _rhs->dimension(1))
+ {
+ broadcast_y = true;
+ rhs_m0 = 1;
+ }
+ }
+
+ const int32_t rhs_partial_n0 = _rhs->dimension(0) % rhs_n0;
+ const int32_t rhs_shift_back = (rhs_n0 - rhs_partial_n0) % rhs_n0;
+
+ // Constants
+ auto const_rhs_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{rhs_n0}}, ckw::DataType::Int32));
+ auto const_rhs_m0_i32 = writer->declare_constant_tile(ckw::ConstantData({{rhs_m0}}, ckw::DataType::Int32));
+ auto const_rhs_shift_back_n0_i32 =
+ writer->declare_constant_tile(ckw::ConstantData({{rhs_shift_back}}, ckw::DataType::Int32));
+
+ auto tile_gid_0 = writer->declare_tile("gid_0_rhs", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_1 = writer->declare_tile("gid_1_rhs", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_2 = writer->declare_tile("gid_2_rhs", ckw::TileInfo(ckw::DataType::Int32));
+
+ writer->op_get_global_id(tile_gid_0, 0);
+ writer->op_get_global_id(tile_gid_1, 1);
+ writer->op_get_global_id(tile_gid_2, 2);
+
+ auto tile_cout0 = writer->declare_tile("cout0_rhs", ckw::TileInfo(ckw::DataType::Int32)); // OFM
+ auto tile_mout0 =
+ writer->declare_tile("mout0_rhs", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH or WIDTH x HEIGHT
+ auto tile_mout1 = writer->declare_tile("mout1_rhs", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT or 0
+ auto tile_bout0 = writer->declare_tile("bout0_rhs", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX
+
+ // Calculate coordinates
+ if (!broadcast_x)
+ {
+ get_coordinate_from_gws_overlapping_min(writer, tile_cout0, tile_gid_0, const_rhs_n0_i32,
+ const_rhs_shift_back_n0_i32, const_0_i32);
+ }
+ else
+ {
+ writer->op_assign(tile_cout0, const_0_i32);
+ }
+
+ if (!broadcast_y)
+ {
+ get_coordinate_from_gws(writer, tile_mout0, tile_gid_1, const_rhs_m0_i32);
+ }
+ else
+ {
+ writer->op_assign(tile_mout0, const_0_i32);
+ }
+
+ // Get the boundary aware coordinates at each global dimension index
+ if (sampler_rhs.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1)
+ {
+ writer->op_assign(tile_mout1, const_0_i32);
+ get_coordinate_from_gws(writer, tile_bout0, tile_gid_2, const_pos_1_i32);
+ }
+ else if (sampler_rhs.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2)
+ {
+ // For tile_mout1 and tile_bout0 the step can only be 1
+ const auto src_w = static_cast<int32_t>(_rhs->dimension(1));
+ auto const_src_w = writer->declare_constant_tile(ckw::ConstantData({{src_w}}, ckw::DataType::Int32));
+ if (!broadcast_y)
+ {
+ writer->op_binary(tile_mout1, ckw::BinaryOp::Mod, tile_mout1, const_src_w);
+ }
+ else
+ {
+ // If broadcast_y == true, it means that we have either a scalar or vector
+ // because broadcasting in other dimensions is not supported
+ writer->op_assign(tile_mout1, const_0_i32);
+ }
+
+ writer->op_binary(tile_bout0, ckw::BinaryOp::Div, tile_mout1, const_src_w);
+ }
+
+ ckw::DataType rhs_dt = to_ckw(_rhs->data_type());
+ auto tile_rhs = writer->declare_tile("rhs", ckw::TileInfo(rhs_dt, rhs_m0, rhs_n0));
+
+ writer->op_load(tile_rhs, rhs->tensor(), sampler_rhs, tile_cout0, tile_mout0, tile_mout1, tile_bout0);
+
+ // Here, init_virtual_tensor() is used to bring the tile_rhs outside the compound statement
+ rhs->init_virtual_tensor(tile_rhs, sampler_rhs);
+ }
+
+ const auto &tile_lhs = lhs->tile();
+ const auto &tile_rhs = rhs->tile();
+
+ /********************************************************************************
+ * 7 - Write the rest of the code
+ ********************************************************************************/
+ // Perform the element-wise operation
+ writer->op_binary(tile_dst, to_ckw(_attributes), tile_lhs, tile_rhs);
+
+ ARM_COMPUTE_ERROR_ON_MSG(dst->has_tile() == false, "You must bind a tile before appending another component");
+}
+
+Window GpuCkwElementwiseBinary::get_window() const
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
+
+ TensorShape output_shape = _dst->tensor_shape();
+ // Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) unchanged
+ // This is in line with the collapsing convention used by operators like Conv2d
+ output_shape.collapse(2U, 1U);
+ constexpr uint32_t vector_size_byte_opencl = 16;
+ const uint32_t num_elems_processed_per_iteration =
+ adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0));
+ Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration));
+
+ return win;
+}
+
+std::string GpuCkwElementwiseBinary::get_name(const ComponentGroup &comp_group) const
+{
+ ARM_COMPUTE_UNUSED(comp_group);
+ const std::vector<std::string> build_params = {
+ "elementwise_binary",
+ "op",
+ to_string(_attributes.operation()),
+ "dt",
+ lower_string(string_from_data_type(_dst->data_type())),
+ };
+ return join(build_params, "_");
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h
new file mode 100644
index 0000000000..c6cbba28d3
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWELEMENTWISEBINARY_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWELEMENTWISEBINARY_H
+
+#include "src/core/common/Macros.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class GpuCkwElementwiseBinary : public IGpuCkwComponentDriver
+{
+public:
+ using Attributes = ClComponentElementwiseBinary::Attributes;
+ /** Constructor
+ *
+ * For supported configurations please refer to @ref ClComponentElementwiseBinary::validate()
+ *
+ * @param[in] id Component id
+ * @param[in] tensors Tensor arguments to the component
+ * @param[in] attributes Component attributes
+ */
+ GpuCkwElementwiseBinary(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes);
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwElementwiseBinary);
+ /** Destructor */
+ ~GpuCkwElementwiseBinary() override = default;
+ // Inherited methods overriden:
+ virtual void write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const override;
+ Window get_window() const override;
+ std::string get_name(const ComponentGroup &comp_group) const override;
+
+private:
+ const ITensorInfo *_lhs;
+ const ITensorInfo *_rhs;
+ const ITensorInfo *_dst;
+ Attributes _attributes;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWELEMENTWISEBINARY_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.cpp
new file mode 100644
index 0000000000..14ad3847fc
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.cpp
@@ -0,0 +1,287 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
+#include "support/StringSupport.h"
+
+#include "compute_kernel_writer/include/ckw/KernelWriter.h"
+#include <cstdint>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+
+GpuCkwMatMul::GpuCkwMatMul(ComponentId id,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings)
+ : IGpuCkwComponentDriver{id, tensors}, _lhs{}, _rhs{}, _dst{}, _attributes{attributes}, _settings{settings}
+{
+ _lhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
+ _rhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_1);
+ _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_lhs, _rhs, _dst);
+}
+
+void GpuCkwMatMul::write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const
+{
+ /********************************************************************************
+ * 1 - Define tensors
+ ********************************************************************************/
+ GpuCkwComponentArgument *lhs = vtable.declare_variable(comp_group, writer, _lhs, "lhs");
+ GpuCkwComponentArgument *rhs = vtable.declare_variable(comp_group, writer, _rhs, "rhs");
+ GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
+
+ /********************************************************************************
+ * 2 - Define CKW constants
+ ********************************************************************************/
+ const auto k =
+ _attributes.adj_lhs() ? static_cast<int32_t>(_lhs->dimension(1)) : static_cast<int32_t>(_lhs->dimension(0));
+ const auto k0 = static_cast<int32_t>(adjust_vec_size(_settings.k0(), k));
+ const auto dst_dt = to_ckw(_dst->data_type());
+
+ // CKW constants
+ auto const_k_i32 = writer->declare_constant_tile(ckw::ConstantData({{k}}, ckw::DataType::Int32));
+ auto const_k0_i32 = writer->declare_constant_tile(ckw::ConstantData({{k0}}, ckw::DataType::Int32));
+ auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32));
+ auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32));
+ auto const_0_fp = writer->declare_constant_tile(ckw::ConstantData({{0.0f}}, dst_dt));
+ auto const_k_minus_k0_i32 = writer->declare_constant_tile(ckw::ConstantData({{k - k0}}, ckw::DataType::Int32));
+
+ /********************************************************************************
+ * 3 - Define the compute block parameters and destination tile (if not root component)
+ * Bind the tile to the tensor to share it among different components and
+ * initialize the compute block parameters
+ ********************************************************************************/
+ // The n0 and m0 parameters from root_window only refers to the output
+ const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
+
+ // Destination compute block size
+ const int32_t dst_n0 = root_window.x().step();
+ const int32_t dst_m0 = root_window.y().step();
+
+ // Destination compute block size left-over
+ const int32_t dst_n0_partial = _dst->dimension(0) % dst_n0;
+ const int32_t dst_m0_partial = _dst->dimension(1) % dst_m0;
+
+ // Shift-back for the overlapping-min strategy
+ const int32_t dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0;
+
+ ckw::TensorSampler sampler_dst;
+ sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+ if (dst_n0_partial == 0)
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin);
+ }
+
+ if (dst_m0_partial == 0)
+ {
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::ClampToBorderMaxOnly);
+ }
+
+ sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+ // Declare destination tile
+ auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, dst_m0, dst_n0));
+
+ // Initialize destination tile
+ writer->op_assign(tile_dst, const_0_fp);
+
+ // Bind tile to the tensor
+ dst->init_virtual_tensor(tile_dst, sampler_dst);
+
+ /********************************************************************************
+ * 4 - Define the compute block parameters CKW constants
+ ********************************************************************************/
+ // Only now we can declare the N0 and M0 as constant
+ auto const_dst_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32));
+ auto const_dst_m0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_m0}}, ckw::DataType::Int32));
+ auto const_shift_back_dst_n0_i32 =
+ writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32));
+
+ /********************************************************************************
+ * 5 - Define the samplers for the input tensors
+ ********************************************************************************/
+ // LHS SAMPLER
+ // The assumption here is that M is multiple of M0. This limitation will be removed once
+ // we have the support for OverlappingMin as address mode for the Y direction
+ ckw::TensorSampler sampler_lhs;
+ sampler_lhs.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+ sampler_lhs.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ sampler_lhs.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ sampler_lhs.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_lhs.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+ // RHS SAMPLER
+ ckw::TensorSampler sampler_rhs;
+ sampler_rhs.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+ sampler_rhs.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ sampler_rhs.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ sampler_rhs.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_rhs.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+ /********************************************************************************
+ * 6 - Extra operations required before writing the main code (optional)
+ ********************************************************************************/
+
+ // Not required
+
+ /********************************************************************************
+ * 7 - Get the coordinates of the destination tile
+ ********************************************************************************/
+ auto tile_gid_0 = writer->declare_tile("gid_0", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_1 = writer->declare_tile("gid_1", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_2 = writer->declare_tile("gid_2", ckw::TileInfo(ckw::DataType::Int32));
+
+ writer->op_get_global_id(tile_gid_0, 0);
+ writer->op_get_global_id(tile_gid_1, 1);
+ writer->op_get_global_id(tile_gid_2, 2);
+
+ auto tile_idx_n = writer->declare_tile("idx_n", ckw::TileInfo(ckw::DataType::Int32)); // N index
+ auto tile_idx_m = writer->declare_tile("idx_m", ckw::TileInfo(ckw::DataType::Int32)); // M index
+ auto tile_idx_b = writer->declare_tile("idx_b", ckw::TileInfo(ckw::DataType::Int32)); // BATCH index
+
+ // Calculate coordinates
+ get_coordinate_from_gws_overlapping_min(writer, tile_idx_n, tile_gid_0, const_dst_n0_i32,
+ const_shift_back_dst_n0_i32, const_0_i32);
+ get_coordinate_from_gws(writer, tile_idx_m, tile_gid_1, const_dst_m0_i32);
+ get_coordinate_from_gws(writer, tile_idx_b, tile_gid_2, const_pos_1_i32);
+
+ /********************************************************************************
+ * 8 - Write the rest of the code
+ ********************************************************************************/
+ auto tile_idx_k = writer->declare_tile("idx_k", ckw::TileInfo(ckw::DataType::Int32)); // K index
+
+ writer->op_assign(tile_idx_k, const_0_i32);
+
+ // clang-format off
+ writer->op_for_loop(tile_idx_k, ckw::BinaryOp::LessEqual, const_k_minus_k0_i32, tile_idx_k, ckw::AssignmentOp::Increment, const_k0_i32,
+ [&]()
+ {
+ auto tile_lhs = writer->declare_tile("lhs", ckw::TileInfo(to_ckw(_lhs->data_type()), dst_m0, k0));
+ auto tile_rhs = writer->declare_tile("rhs", ckw::TileInfo(to_ckw(_rhs->data_type()), dst_n0, k0));
+ writer->op_assign(tile_lhs, const_0_fp);
+ writer->op_assign(tile_rhs, const_0_fp);
+
+ writer->op_load(tile_lhs, lhs->tensor(), sampler_lhs, tile_idx_k, tile_idx_m, tile_idx_b, const_0_i32);
+ writer->op_load(tile_rhs, rhs->tensor(), sampler_rhs, tile_idx_k, tile_idx_n, tile_idx_b, const_0_i32);
+
+ writer->op_binary(tile_dst, ckw::BinaryOp::MatMul_Nt_T, tile_lhs, tile_rhs);
+
+ });
+
+ // Left-over accumulations for when K is not a multiple of k0
+ if(((k % k0) != 0))
+ {
+ writer->op_for_loop(tile_idx_k, ckw::BinaryOp::Less, const_k_i32, tile_idx_k, ckw::AssignmentOp::Increment, const_pos_1_i32, [&]()
+ {
+ auto tile_lhs = writer->declare_tile("lhs", ckw::TileInfo(to_ckw(_lhs->data_type()), dst_m0, 1));
+ auto tile_rhs = writer->declare_tile("rhs", ckw::TileInfo(to_ckw(_rhs->data_type()), dst_n0, 1));
+ writer->op_assign(tile_lhs, const_0_fp);
+ writer->op_assign(tile_rhs, const_0_fp);
+
+ writer->op_load(tile_lhs, lhs->tensor(), sampler_lhs, tile_idx_k, tile_idx_m, tile_idx_b, const_0_i32);
+ writer->op_load(tile_rhs, rhs->tensor(), sampler_rhs, tile_idx_k, tile_idx_n, tile_idx_b, const_0_i32);
+
+ writer->op_binary(tile_dst, ckw::BinaryOp::MatMul_Nt_T, tile_lhs, tile_rhs);
+ });
+ }
+ // clang-format on
+}
+
+Window GpuCkwMatMul::get_window() const
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
+
+ const int32_t m = _dst->dimension(1);
+ const int32_t n = _dst->dimension(0);
+ const bool adj_lhs = _attributes.adj_lhs();
+
+ const int32_t m0 = adj_lhs ? adjust_vec_size(_settings.m0(), m) : std::min(_settings.m0(), m);
+ const int32_t n0 = adjust_vec_size(_settings.n0(), n);
+
+ // Configure kernel window
+ Window win = calculate_max_window(_dst->tensor_shape(), Steps(n0, m0));
+ win = win.collapse(win, Window::DimZ);
+
+ return win;
+}
+
+std::string GpuCkwMatMul::get_name(const ComponentGroup &comp_group) const
+{
+ ARM_COMPUTE_UNUSED(comp_group);
+
+ std::string kernel_name("mat_mul_native");
+
+ const int32_t m = _dst->dimension(1);
+ const int32_t n = _dst->dimension(0);
+ const int32_t k = _attributes.adj_lhs() ? _lhs->tensor_shape().y() : _lhs->tensor_shape().x();
+
+ kernel_name += _attributes.adj_lhs() ? "_t" : "_nt";
+ kernel_name += _attributes.adj_rhs() ? "_t" : "_nt";
+ kernel_name += "_";
+ kernel_name += support::cpp11::to_string(m);
+ kernel_name += "_";
+ kernel_name += support::cpp11::to_string(n);
+ kernel_name += "_";
+ kernel_name += support::cpp11::to_string(k);
+ kernel_name += "_";
+ kernel_name += support::cpp11::to_string(_dst->dimension(2));
+ kernel_name += "_";
+ kernel_name += support::cpp11::to_string(_settings.m0());
+ kernel_name += "_";
+ kernel_name += support::cpp11::to_string(_settings.n0());
+ kernel_name += "_";
+ kernel_name += support::cpp11::to_string(_settings.k0());
+
+ return kernel_name;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.h
new file mode 100644
index 0000000000..790418bf50
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWMATMUL_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWMATMUL_H
+
+#include "arm_compute/dynamic_fusion/sketch/attributes/MatMulAttributes.h"
+
+#include "src/core/common/Macros.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class GpuCkwMatMul final : public IGpuCkwComponentDriver
+{
+public:
+ using Attributes = ClComponentMatMul::Attributes;
+ using Settings = ClComponentMatMul::Settings;
+
+public:
+ /** Constructor
+ *
+ * For supported configurations please refer to @ref ClComponentMatMul::validate()
+ *
+ * @param[in] id Component id
+ * @param[in] tensors Tensor arguments to the component
+ * @param[in] attributes Component attributes. Attributes are a set of parameters that define what a component does
+ * @param[in] settings Component settings. Settings are a set of parameters that influence the implementation of a component
+ */
+ GpuCkwMatMul(ComponentId id,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings);
+
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwMatMul);
+
+ /** Destructor */
+ ~GpuCkwMatMul() override = default;
+
+ // Inherited methods overriden
+ virtual void write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const override;
+ Window get_window() const override;
+ std::string get_name(const ComponentGroup &comp_group) const override;
+
+private:
+ const ITensorInfo *_lhs;
+ const ITensorInfo *_rhs;
+ const ITensorInfo *_dst;
+
+ Attributes _attributes;
+ Settings _settings;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWMATMUL_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp
new file mode 100644
index 0000000000..d027f348ef
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp
@@ -0,0 +1,405 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+
+#include "compute_kernel_writer/include/ckw/KernelWriter.h"
+#include <cstdint>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+GpuCkwPool2d::GpuCkwPool2d(ComponentId id,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings)
+ : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{}, _attributes{attributes}, _settings{settings}
+
+{
+ _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
+ _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
+}
+
+void GpuCkwPool2d::write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const
+{
+ const uint32_t width_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::WIDTH);
+ const uint32_t height_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::HEIGHT);
+
+ /********************************************************************************
+ * 1 - Define tensors
+ ********************************************************************************/
+ GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src");
+ GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
+
+ /********************************************************************************
+ * 2 - Define CKW constants
+ ********************************************************************************/
+ const auto dst_dt = to_ckw(_dst->data_type());
+ const auto pool_sz_x = static_cast<int32_t>(_attributes.pool_size().x());
+ const auto pool_sz_y = static_cast<int32_t>(_attributes.pool_size().y());
+ const auto pad_x = static_cast<int32_t>(_attributes.pad().left);
+ const auto pad_y = static_cast<int32_t>(_attributes.pad().top);
+ const auto stride_x = static_cast<int32_t>(_attributes.stride().x());
+ const auto stride_y = static_cast<int32_t>(_attributes.stride().y());
+ const auto src_w = static_cast<int32_t>(_src->dimension(width_idx));
+ const auto src_h = static_cast<int32_t>(_src->dimension(height_idx));
+ const auto dst_h = static_cast<int32_t>(_dst->dimension(height_idx));
+
+ // CKW constants
+ auto const_pool_sz_x_i32 = writer->declare_constant_tile(ckw::ConstantData({{pool_sz_x}}, ckw::DataType::Int32));
+ auto const_pool_sz_y_i32 = writer->declare_constant_tile(ckw::ConstantData({{pool_sz_y}}, ckw::DataType::Int32));
+ auto const_pad_x_i32 = writer->declare_constant_tile(ckw::ConstantData({{pad_x}}, ckw::DataType::Int32));
+ auto const_pad_y_i32 = writer->declare_constant_tile(ckw::ConstantData({{pad_y}}, ckw::DataType::Int32));
+ auto const_stride_x_i32 = writer->declare_constant_tile(ckw::ConstantData({{stride_x}}, ckw::DataType::Int32));
+ auto const_stride_y_i32 = writer->declare_constant_tile(ckw::ConstantData({{stride_y}}, ckw::DataType::Int32));
+ auto const_src_w_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_w}}, ckw::DataType::Int32));
+ auto const_src_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_h}}, ckw::DataType::Int32));
+ auto const_dst_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_h}}, ckw::DataType::Int32));
+ auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32));
+ auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32));
+ auto const_0_fp = writer->declare_constant_tile(ckw::ConstantData({{0.0f}}, dst_dt));
+ auto const_lowest_val_fp =
+ writer->declare_constant_tile(ckw::ConstantData({{std::numeric_limits<float>::lowest()}}, ckw::DataType::Fp32));
+ auto const_neg_inf_val_fp = writer->declare_constant_tile(ckw::ConstantData({{-1.0f / 0.0f}}, ckw::DataType::Fp32));
+
+ /********************************************************************************
+ * 3 - Define the compute block parameters and destination tile (if not root component)
+ * Bind the tile to the tensor to share it among different components and
+ * initialize the compute block parameters
+ ********************************************************************************/
+ // The n0 and m0 parameters from root_window only refers to the output
+ const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
+
+ // Destination compute block size
+ const int32_t dst_n0 = root_window.x().step();
+ const int32_t dst_m0 = root_window.y().step();
+
+ // Destination compute block size left-over
+ const int32_t dst_n0_partial = _dst->dimension(0) % dst_n0;
+ const int32_t dst_m0_partial = _dst->dimension(1) % dst_m0;
+
+ // Shift-back for the overlapping-min strategy
+ const int32_t dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0;
+
+ ckw::TensorSampler sampler_dst;
+ sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+ if (dst_n0_partial == 0)
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin);
+ }
+
+ if (dst_m0_partial == 0)
+ {
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::ClampToBorderMaxOnly);
+ }
+
+ sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+ // Declare destination tile
+ auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, dst_m0, dst_n0));
+
+ // Initialize destination tile
+ writer->op_assign(tile_dst, const_0_fp);
+
+ // Bind tile to the tensor
+ dst->init_virtual_tensor(tile_dst, sampler_dst);
+
+ /********************************************************************************
+ * 4 - Define the compute block parameters CKW constants
+ ********************************************************************************/
+ // Only now we can declare the N0 and M0 as constant
+ auto const_dst_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32));
+ auto const_dst_m0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_m0}}, ckw::DataType::Int32));
+ auto const_shift_back_dst_n0_i32 =
+ writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32));
+
+ /********************************************************************************
+ * 5 - Define the sampler for the input tensor
+ ********************************************************************************/
+ ckw::TensorSampler sampler_src;
+ sampler_src.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+ sampler_src.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ sampler_src.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ sampler_src.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+
+ /********************************************************************************
+ * 6 - Extra operations required before writing the main code
+ ********************************************************************************/
+ // Check if it is global pooling
+ const bool is_global_pooling = (pool_sz_x == src_w) && (pool_sz_y == src_h) && (pad_x == 0) && (pad_y == 0);
+
+ // Accumulate always in F32 if the pool type is not MAX
+ const bool acc_f32 = (dst_dt == ckw::DataType::Fp32) ||
+ ((dst_dt == ckw::DataType::Fp16) && _attributes.pool_type() != PoolingType::MAX);
+
+ const auto acc_dt = acc_f32 ? ckw::DataType::Fp32 : ckw::DataType::Fp16;
+
+ const bool is_wider_acc = dst_dt != acc_dt;
+
+ /********************************************************************************
+ * 7 - Get the coordinates of the destination tile
+ ********************************************************************************/
+ auto tile_gid_0 = writer->declare_tile("gid_0", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_1 = writer->declare_tile("gid_1", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_2 = writer->declare_tile("gid_2", ckw::TileInfo(ckw::DataType::Int32));
+
+ writer->op_get_global_id(tile_gid_0, 0);
+ writer->op_get_global_id(tile_gid_1, 1);
+ writer->op_get_global_id(tile_gid_2, 2);
+
+ auto tile_cout0 = writer->declare_tile("cout0", ckw::TileInfo(ckw::DataType::Int32)); // OFM
+ auto tile_mout0 = writer->declare_tile("mout0", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH
+ auto tile_mout1 = writer->declare_tile("mout1", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT
+ auto tile_bout0 = writer->declare_tile("bout0", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX
+
+ // Calculate coordinates
+ get_coordinate_from_gws_overlapping_min(writer, tile_cout0, tile_gid_0, const_dst_n0_i32,
+ const_shift_back_dst_n0_i32, const_0_i32);
+ get_coordinate_from_gws(writer, tile_mout0, tile_gid_1, const_dst_m0_i32);
+ writer->op_binary(tile_mout1, ckw::BinaryOp::Mod, tile_gid_2, const_dst_h_i32);
+ writer->op_binary(tile_bout0, ckw::BinaryOp::Div, tile_gid_2, const_dst_h_i32);
+
+ /********************************************************************************
+ * 8 - Write the rest of the code
+ ********************************************************************************/
+ // A tile used to temporarily store results or as an accumulator in case of AVG and L2 pooling.
+ auto tile_res = writer->declare_tile("tile_res", ckw::TileInfo(acc_dt, dst_m0, dst_n0));
+
+ // Initialise result tile with appropriate value
+ if (_attributes.pool_type() == PoolingType::MAX)
+ {
+ if (_settings.use_inf_as_limit())
+ {
+ writer->op_cast(tile_res, const_neg_inf_val_fp, ckw::ConvertPolicy::None);
+ }
+ else
+ {
+ writer->op_cast(tile_res, const_lowest_val_fp, ckw::ConvertPolicy::None);
+ }
+ }
+ else
+ {
+ writer->op_cast(tile_res, const_0_fp, ckw::ConvertPolicy::None);
+ }
+
+ // tile_idx_in_w = tile_mout0 * STRIDE_X - PAD_X
+ auto tile_src_coord_x_start = writer->declare_tile("idx_in_w", ckw::DataType::Int32);
+ writer->op_binary(tile_src_coord_x_start, ckw::BinaryOp::Mul, tile_mout0, const_stride_x_i32);
+ writer->op_binary(tile_src_coord_x_start, ckw::BinaryOp::Sub, tile_src_coord_x_start, const_pad_x_i32);
+
+ // tile_idx_in_h = tile_mout1 * STRIDE_Y - PAD_Y
+ auto tile_src_coord_y_start = writer->declare_tile("idx_in_h", ckw::DataType::Int32);
+ writer->op_binary(tile_src_coord_y_start, ckw::BinaryOp::Mul, tile_mout1, const_stride_y_i32);
+ writer->op_binary(tile_src_coord_y_start, ckw::BinaryOp::Sub, tile_src_coord_y_start, const_pad_y_i32);
+
+ auto tile_neg_src_coord_x_start = writer->declare_tile("neg_src_coord_x_start", ckw::DataType::Int32);
+ auto tile_neg_src_coord_y_start = writer->declare_tile("neg_src_coord_y_start", ckw::DataType::Int32);
+
+ writer->op_binary(tile_neg_src_coord_x_start, ckw::BinaryOp::Sub, const_0_i32, tile_src_coord_x_start);
+ writer->op_binary(tile_neg_src_coord_y_start, ckw::BinaryOp::Sub, const_0_i32, tile_src_coord_y_start);
+
+ // int pool_x_s = max((int)0, -idx_in_w);
+ // int pool_x_e = min((int)POOL_SIZE_X, (int)SRC_WIDTH - idx_in_w);
+ // int pool_y_s = max((int)0, -idx_in_h);
+ // int pool_y_e = min((int)POOL_SIZE_Y, (int)SRC_HEIGHT - idx_in_h);
+ auto tile_pool_x_s = writer->declare_tile("pool_x_s", ckw::DataType::Int32);
+ auto tile_pool_y_s = writer->declare_tile("pool_y_s", ckw::DataType::Int32);
+ auto tile_pool_x_e = writer->declare_tile("pool_x_e", ckw::DataType::Int32);
+ auto tile_pool_y_e = writer->declare_tile("pool_y_e", ckw::DataType::Int32);
+
+ writer->op_binary(tile_pool_x_s, ckw::BinaryOp::Max, const_0_i32, tile_neg_src_coord_x_start);
+ writer->op_binary(tile_pool_x_e, ckw::BinaryOp::Add, const_src_w_i32, tile_neg_src_coord_x_start);
+ writer->op_binary(tile_pool_x_e, ckw::BinaryOp::Min, const_pool_sz_x_i32, tile_pool_x_e);
+ writer->op_binary(tile_pool_y_s, ckw::BinaryOp::Max, const_0_i32, tile_neg_src_coord_y_start);
+ writer->op_binary(tile_pool_y_e, ckw::BinaryOp::Add, const_src_h_i32, tile_neg_src_coord_y_start);
+ writer->op_binary(tile_pool_y_e, ckw::BinaryOp::Min, const_pool_sz_y_i32, tile_pool_y_e);
+
+ // #if defined(EXCLUDE_PADDING)
+ // int filter_size = (pool_y_e - pool_y_s) * (pool_x_e - pool_x_s);
+ // #else // defined(EXCLUDE_PADDING)
+ // int filter_size = POOL_SIZE_X * POOL_SIZE_Y;
+ // #endif // defined(EXCLUDE_PADDING)
+ auto tile_filter_size = writer->declare_tile("filter_size", ckw::DataType::Int32);
+ if (_attributes.exclude_padding())
+ {
+ auto tile_x_diff = writer->declare_tile("x_diff", ckw::DataType::Int32);
+ auto tile_y_diff = writer->declare_tile("y_diff", ckw::DataType::Int32);
+
+ writer->op_binary(tile_x_diff, ckw::BinaryOp::Sub, tile_pool_x_e, tile_pool_x_s);
+ writer->op_binary(tile_y_diff, ckw::BinaryOp::Sub, tile_pool_y_e, tile_pool_y_s);
+ writer->op_binary(tile_filter_size, ckw::BinaryOp::Mul, tile_x_diff, tile_y_diff);
+ }
+ else
+ {
+ writer->op_binary(tile_filter_size, ckw::BinaryOp::Mul, const_pool_sz_x_i32, const_pool_sz_y_i32);
+ }
+
+ auto tile_x = writer->declare_tile("x", ckw::DataType::Int32);
+ auto tile_y = writer->declare_tile("y", ckw::DataType::Int32);
+
+ if (is_global_pooling)
+ {
+ writer->op_assign(tile_y, const_0_i32);
+ writer->op_assign(tile_pool_y_e, const_pool_sz_y_i32);
+ }
+ else
+ {
+ writer->op_assign(tile_y, tile_pool_y_s);
+ }
+
+ // Y dim for-loop
+ writer->op_for_loop(
+ tile_y, ckw::BinaryOp::Less, tile_pool_y_e, tile_y, ckw::AssignmentOp::Increment, const_pos_1_i32,
+ [&]()
+ {
+ // Reset the iterator for the inner loop
+ if (is_global_pooling)
+ {
+ writer->op_assign(tile_x, const_0_i32);
+ writer->op_assign(tile_pool_x_e, const_pool_sz_x_i32);
+ }
+ else
+ {
+ writer->op_assign(tile_x, tile_pool_x_s);
+ }
+
+ auto tile_src_coord_y = writer->declare_tile("src_coord_y", ckw::DataType::Int32);
+ writer->op_binary(tile_src_coord_y, ckw::BinaryOp::Add, tile_src_coord_y_start, tile_y);
+
+ // X dim for-loop
+ writer->op_for_loop(
+ tile_x, ckw::BinaryOp::Less, tile_pool_x_e, tile_x, ckw::AssignmentOp::Increment, const_pos_1_i32,
+ [&]()
+ {
+ auto tile_src_coord_x = writer->declare_tile("src_coord_x", ckw::DataType::Int32);
+ writer->op_binary(tile_src_coord_x, ckw::BinaryOp::Add, tile_src_coord_x_start, tile_x);
+
+ ckw::DataType src_dt = to_ckw(_src->data_type());
+ auto tile_src = writer->declare_tile("tile_src", ckw::TileInfo(acc_dt, dst_m0, dst_n0));
+
+ // Load src tile
+ if (is_wider_acc)
+ {
+ auto tile_src0 = writer->declare_tile("src_tile0", ckw::TileInfo(src_dt, dst_m0, dst_n0));
+ writer->op_load(tile_src0, src->tensor(), sampler_src, tile_cout0, tile_src_coord_x,
+ tile_src_coord_y, tile_bout0);
+ writer->op_cast(tile_src, tile_src0, ckw::ConvertPolicy::None);
+ }
+ else
+ {
+ writer->op_load(tile_src, src->tensor(), sampler_src, tile_cout0, tile_src_coord_x,
+ tile_src_coord_y, tile_bout0);
+ }
+
+ // Take the square of the input, for L2 Pooling
+ if (_attributes.pool_type() == PoolingType::L2)
+ {
+ writer->op_binary(tile_src, ckw::BinaryOp::Mul, tile_src, tile_src);
+ }
+
+ // Perfom Pooling op
+ if (_attributes.pool_type() == PoolingType::MAX)
+ {
+ writer->op_binary(tile_res, ckw::BinaryOp::Max, tile_res, tile_src);
+ }
+ else
+ {
+ writer->op_binary(tile_res, ckw::BinaryOp::Add, tile_res, tile_src);
+ }
+ });
+ });
+
+ if ((_attributes.pool_type() == PoolingType::AVG) || (_attributes.pool_type() == PoolingType::L2))
+ {
+ // Filter_size is automatically broadcasted in the operation
+ auto tile_filter_size_fp = writer->declare_tile("filter_size_fp", ckw::TileInfo(acc_dt));
+ writer->op_cast(tile_filter_size_fp, tile_filter_size, ckw::ConvertPolicy::None);
+ writer->op_binary(tile_res, ckw::BinaryOp::Div, tile_res, tile_filter_size_fp);
+ }
+
+ // Take square root of the result in L2 pooling
+ if (_attributes.pool_type() == PoolingType::L2)
+ {
+ writer->op_unary(tile_res, ckw::UnaryOp::Sqrt, tile_res);
+ }
+
+ // Store the results and do casting if mixed precision
+ if (is_wider_acc)
+ {
+ writer->op_cast(tile_dst, tile_res, ckw::ConvertPolicy::None);
+ }
+ else
+ {
+ writer->op_assign(tile_dst, tile_res);
+ }
+}
+
+Window GpuCkwPool2d::get_window() const
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
+
+ TensorShape output_shape = _dst->tensor_shape();
+ const uint32_t vec_size = adjust_vec_size(((_dst->data_type() == DataType::F32) ? 2 : 4), _dst->dimension(0));
+ // Create and configure kernel window
+ auto win = calculate_max_window(output_shape, Steps(vec_size));
+ win = win.collapse_if_possible(win, Window::DimZ); // collapse window on batch size.
+ return win;
+}
+
+std::string GpuCkwPool2d::get_name(const ComponentGroup &comp_group) const
+{
+ ARM_COMPUTE_UNUSED(comp_group);
+
+ return "pool2dMxN";
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h
new file mode 100644
index 0000000000..822282a108
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWPOOL2D_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWPOOL2D_H
+
+#include "src/core/common/Macros.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h"
+
+#include <string>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class GpuCkwPool2d : public IGpuCkwComponentDriver
+{
+public:
+ using Attributes = ClComponentPool2d::Attributes;
+ using Settings = ClComponentPool2d::Settings;
+
+ /** Constructor
+ *
+ * For supported configurations please refer to @ref ClComponentCast::validate()
+ *
+ * @param[in] id Component id
+ * @param[in] tensors Tensor arguments to the component
+ * @param[in] attributes Component attributes
+ * @param[in] settings Component settings
+ */
+ GpuCkwPool2d(ComponentId id,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings);
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwPool2d);
+ /** Destructor */
+ ~GpuCkwPool2d() override = default;
+ // Inherited methods overriden:
+ virtual void write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const override;
+ Window get_window() const override;
+ std::string get_name(const ComponentGroup &comp_group) const override;
+
+private:
+ const ITensorInfo *_src;
+ const ITensorInfo *_dst;
+ Attributes _attributes;
+ Settings _settings;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWPOOL2D_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.cpp
new file mode 100644
index 0000000000..edd7ea9a38
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.cpp
@@ -0,0 +1,576 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/utils/ScaleUtils.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "support/StringSupport.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+constexpr uint32_t opencl_vector_size_in_bytes = 16;
+} // namespace
+
+GpuCkwResize::GpuCkwResize(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes)
+ : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{}, _attributes{attributes}
+{
+ _src = this->tensors().get_const_tensor(TensorType::ACL_SRC);
+ _dst = this->tensors().get_const_tensor(TensorType::ACL_DST);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
+}
+
+void GpuCkwResize::do_nearest_neighbor_resize(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const
+{
+ const uint32_t width_idx = get_data_layout_dimension_index(_dst->data_layout(), DataLayoutDimension::WIDTH);
+ const uint32_t height_idx = get_data_layout_dimension_index(_dst->data_layout(), DataLayoutDimension::HEIGHT);
+
+ /********************************************************************************
+ * 1 - Define tensors
+ ********************************************************************************/
+ GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src");
+ GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
+
+ /********************************************************************************
+ * 2 - Define CKW constants
+ ********************************************************************************/
+ const auto dst_dt = to_ckw(_dst->data_type());
+ const float scale_x = scale_utils::calculate_resize_ratio(_src->dimension(width_idx), _dst->dimension(width_idx),
+ _attributes.align_corners());
+ const float scale_y = scale_utils::calculate_resize_ratio(_src->dimension(height_idx), _dst->dimension(height_idx),
+ _attributes.align_corners());
+ const auto src_w = static_cast<int32_t>(_src->dimension(width_idx));
+ const auto src_h = static_cast<int32_t>(_src->dimension(height_idx));
+ const auto dst_h = static_cast<int32_t>(_dst->dimension(height_idx));
+
+ // CKW constants
+ auto const_src_w_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_w}}, ckw::DataType::Int32));
+ auto const_src_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_h}}, ckw::DataType::Int32));
+ auto const_dst_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_h}}, ckw::DataType::Int32));
+ auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32));
+ auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32));
+ auto const_0_fp = writer->declare_constant_tile(ckw::ConstantData({{0.0f}}, dst_dt));
+ auto const_pos_0_5_fp = writer->declare_constant_tile(ckw::ConstantData({{0.5f}}, ckw::DataType::Fp32));
+ auto const_scale_x_fp = writer->declare_constant_tile(ckw::ConstantData({{scale_x}}, ckw::DataType::Fp32));
+ auto const_scale_y_fp = writer->declare_constant_tile(ckw::ConstantData({{scale_y}}, ckw::DataType::Fp32));
+
+ /********************************************************************************
+ * 3 - Define the compute block parameters and destination tile (if not root component)
+ * Bind the tile to the tensor to share it among different components and
+ * initialize the compute block parameters
+ ********************************************************************************/
+ // The n0 and m0 parameters from root_window only refers to the output
+ const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
+
+ // Destination compute block size
+ const int32_t dst_n0 = root_window.x().step();
+
+ // dst_m0 must be 1
+ ARM_COMPUTE_ERROR_ON(root_window.y().step() != 1);
+
+ // Destination compute block size left-over
+ const int32_t dst_n0_partial = _dst->dimension(0) % dst_n0;
+
+ // Shift-back for the overlapping-min strategy
+ const int32_t dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0;
+
+ ckw::TensorSampler sampler_dst;
+ sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+ if (dst_n0_partial == 0)
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin);
+ }
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+ // Declare destination tile
+ auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, 1, dst_n0));
+
+ // Initialize destination tile
+ writer->op_assign(tile_dst, const_0_fp);
+
+ // Bind tile to the tensor
+ dst->init_virtual_tensor(tile_dst, sampler_dst);
+
+ /********************************************************************************
+ * 4 - Define the compute block parameters CKW constants
+ ********************************************************************************/
+ auto const_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32));
+ auto const_shift_back_n0_i32 =
+ writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32));
+
+ /********************************************************************************
+ * 5 - Define the samplers for the input tensor
+ ********************************************************************************/
+ ckw::TensorSampler sampler_src;
+ sampler_src.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+ sampler_src.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ sampler_src.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ sampler_src.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+
+ /********************************************************************************
+ * 6 - Extra operations required before writing the main code
+ ********************************************************************************/
+
+ // ....
+
+ /********************************************************************************
+ * 7 - Get the coordinates of the destination tile
+ ********************************************************************************/
+ auto tile_gid_0 = writer->declare_tile("gid_0", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_1 = writer->declare_tile("gid_1", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_2 = writer->declare_tile("gid_2", ckw::TileInfo(ckw::DataType::Int32));
+
+ writer->op_get_global_id(tile_gid_0, 0);
+ writer->op_get_global_id(tile_gid_1, 1);
+ writer->op_get_global_id(tile_gid_2, 2);
+
+ auto tile_co = writer->declare_tile("co", ckw::TileInfo(ckw::DataType::Int32)); // OFM
+ auto tile_xo = writer->declare_tile("xo", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH
+ auto tile_yo = writer->declare_tile("yo", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT
+ auto tile_bo = writer->declare_tile("bo", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX
+
+ // Calculate coordinates
+ get_coordinate_from_gws_overlapping_min(writer, tile_co, tile_gid_0, const_n0_i32, const_shift_back_n0_i32,
+ const_0_i32);
+ writer->op_assign(tile_xo, tile_gid_1);
+ writer->op_binary(tile_yo, ckw::BinaryOp::Mod, tile_gid_2, const_dst_h_i32);
+ writer->op_binary(tile_bo, ckw::BinaryOp::Div, tile_gid_2, const_dst_h_i32);
+
+ /********************************************************************************
+ * 8 - Write the rest of the code
+ ********************************************************************************/
+ auto tile_xi_f = writer->declare_tile("xi_f", ckw::DataType::Fp32);
+ auto tile_yi_f = writer->declare_tile("yi_f", ckw::DataType::Fp32);
+
+ switch (_attributes.sampling_policy())
+ {
+ case SamplingPolicy::TOP_LEFT:
+ // xi_f = (xo * scale_x)
+ // yi_f = (yo * scale_y)
+ writer->op_cast(tile_xi_f, tile_xo, ckw::ConvertPolicy::None);
+ writer->op_cast(tile_yi_f, tile_yo, ckw::ConvertPolicy::None);
+ writer->op_binary(tile_xi_f, ckw::BinaryOp::Mul, tile_xi_f, const_scale_x_fp);
+ writer->op_binary(tile_yi_f, ckw::BinaryOp::Mul, tile_yi_f, const_scale_y_fp);
+ break;
+ case SamplingPolicy::CENTER:
+ {
+ // xi_f = ((xo + 0.5f) * scale_x)
+ // yi_f = ((yo + 0.5f) * scale_y)
+ const auto &tile_xo_plus_half = writer->declare_tile("xo_plus_half", ckw::DataType::Fp32);
+ const auto &tile_yo_plus_half = writer->declare_tile("yo_plus_half", ckw::DataType::Fp32);
+
+ writer->op_cast(tile_xo_plus_half, tile_xo, ckw::ConvertPolicy::None);
+ writer->op_cast(tile_yo_plus_half, tile_yo, ckw::ConvertPolicy::None);
+ writer->op_binary(tile_xo_plus_half, ckw::BinaryOp::Add, tile_xo_plus_half, const_pos_0_5_fp);
+ writer->op_binary(tile_yo_plus_half, ckw::BinaryOp::Add, tile_yo_plus_half, const_pos_0_5_fp);
+ writer->op_binary(tile_xi_f, ckw::BinaryOp::Mul, tile_xo_plus_half, const_scale_x_fp);
+ writer->op_binary(tile_yi_f, ckw::BinaryOp::Mul, tile_yo_plus_half, const_scale_y_fp);
+ }
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported sampling policy");
+ }
+
+ if (_attributes.align_corners())
+ {
+ writer->op_unary(tile_xi_f, ckw::UnaryOp::Round, tile_xi_f);
+ writer->op_unary(tile_yi_f, ckw::UnaryOp::Round, tile_yi_f);
+ }
+
+ // xi0 = clamp((int)xi_f, 0, (int)src_w - 1)
+ // yi0 = clamp((int)yi_f, 0, (int)src_h - 1)
+ auto tile_xi_f_int = writer->declare_tile("xi_f_int", ckw::DataType::Int32);
+ auto tile_yi_f_int = writer->declare_tile("yi_f_int", ckw::DataType::Int32);
+
+ writer->op_cast(tile_xi_f_int, tile_xi_f, ckw::ConvertPolicy::None);
+ writer->op_cast(tile_yi_f_int, tile_yi_f, ckw::ConvertPolicy::None);
+
+ auto tile_src_w_minus_1 = writer->declare_tile("src_w_minus_1", ckw::DataType::Int32);
+ auto tile_src_h_minus_1 = writer->declare_tile("src_h_minus_1", ckw::DataType::Int32);
+
+ writer->op_binary(tile_src_w_minus_1, ckw::BinaryOp::Sub, const_src_w_i32, const_pos_1_i32);
+ writer->op_binary(tile_src_h_minus_1, ckw::BinaryOp::Sub, const_src_h_i32, const_pos_1_i32);
+
+ auto tile_xi0 = writer->declare_tile("xi0", ckw::DataType::Int32);
+ auto tile_yi0 = writer->declare_tile("yi0", ckw::DataType::Int32);
+
+ writer->op_ternary(tile_xi0, ckw::TernaryOp::Clamp, tile_xi_f_int, const_0_i32, tile_src_w_minus_1);
+ writer->op_ternary(tile_yi0, ckw::TernaryOp::Clamp, tile_yi_f_int, const_0_i32, tile_src_h_minus_1);
+
+ auto tile_src = writer->declare_tile("src_tile", ckw::TileInfo(dst_dt, 1, dst_n0));
+ writer->op_load(tile_src, src->tensor(), sampler_src, tile_co, tile_xi0, tile_yi0, tile_bo);
+
+ writer->op_assign(tile_dst, tile_src);
+}
+
+void GpuCkwResize::do_bilinear_resize(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const
+{
+ const size_t width_idx = get_data_layout_dimension_index(_dst->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t height_idx = get_data_layout_dimension_index(_dst->data_layout(), DataLayoutDimension::HEIGHT);
+
+ /********************************************************************************
+ * 1 - Define tensors
+ ********************************************************************************/
+ GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src");
+ GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
+
+ /********************************************************************************
+ * 2 - Define CKW constants
+ ********************************************************************************/
+ const auto dst_dt = to_ckw(_dst->data_type());
+ const float scale_x = scale_utils::calculate_resize_ratio(_src->dimension(width_idx), _dst->dimension(width_idx),
+ _attributes.align_corners());
+ const float scale_y = scale_utils::calculate_resize_ratio(_src->dimension(height_idx), _dst->dimension(height_idx),
+ _attributes.align_corners());
+ const auto src_w = static_cast<int32_t>(_src->dimension(width_idx));
+ const auto src_h = static_cast<int32_t>(_src->dimension(height_idx));
+ const auto dst_h = static_cast<int32_t>(_dst->dimension(height_idx));
+
+ // CKW constants
+ auto const_src_w_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_w}}, ckw::DataType::Int32));
+ auto const_src_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_h}}, ckw::DataType::Int32));
+ auto const_dst_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_h}}, ckw::DataType::Int32));
+ auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32));
+ auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32));
+ auto const_0_fp = writer->declare_constant_tile(ckw::ConstantData({{0.0f}}, dst_dt));
+ auto const_pos_1_fp = writer->declare_constant_tile(ckw::ConstantData({{1.0f}}, ckw::DataType::Fp32));
+ auto const_pos_0_5_fp = writer->declare_constant_tile(ckw::ConstantData({{0.5f}}, ckw::DataType::Fp32));
+ auto const_scale_x_fp = writer->declare_constant_tile(ckw::ConstantData({{scale_x}}, ckw::DataType::Fp32));
+ auto const_scale_y_fp = writer->declare_constant_tile(ckw::ConstantData({{scale_y}}, ckw::DataType::Fp32));
+
+ /********************************************************************************
+ * 3 - Define the compute block parameters and destination tile (if not root component)
+ * Bind the tile to the tensor to share it among different components and
+ * initialize the compute block parameters
+ ********************************************************************************/
+ // The n0 and m0 parameters from root_window only refers to the output
+ const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
+
+ // Destination compute block size
+ const int32_t dst_n0 = root_window.x().step();
+
+ // dst_m0 must be 1
+ ARM_COMPUTE_ERROR_ON(root_window.y().step() != 1);
+
+ // Destination compute block size left-over
+ const int32_t dst_n0_partial = _dst->dimension(0) % dst_n0;
+
+ // Shift-back for the overlapping-min strategy
+ const int32_t dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0;
+
+ ckw::TensorSampler sampler_dst;
+ sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+ if (dst_n0_partial == 0)
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin);
+ }
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+ // Declare destination tile
+ auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, 1, dst_n0));
+
+ // Initialize destination tile
+ writer->op_assign(tile_dst, const_0_fp);
+
+ // Bind tile to the tensor
+ dst->init_virtual_tensor(tile_dst, sampler_dst);
+
+ /********************************************************************************
+ * 4 - Define the compute block parameters CKW constants
+ ********************************************************************************/
+ auto const_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32));
+ auto const_shift_back_n0_i32 =
+ writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32));
+
+ /********************************************************************************
+ * 5 - Define the sampler for the input tensor
+ ********************************************************************************/
+ ckw::TensorSampler sampler_src;
+ sampler_src.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+ sampler_src.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ sampler_src.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ sampler_src.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+
+ /********************************************************************************
+ * 6 - Extra operations required before writing the main code
+ ********************************************************************************/
+
+ // ....
+
+ /********************************************************************************
+ * 7 - Get the coordinates of the destination tile
+ ********************************************************************************/
+ auto tile_gid_0 = writer->declare_tile("gid_0", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_1 = writer->declare_tile("gid_1", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_2 = writer->declare_tile("gid_2", ckw::TileInfo(ckw::DataType::Int32));
+
+ writer->op_get_global_id(tile_gid_0, 0);
+ writer->op_get_global_id(tile_gid_1, 1);
+ writer->op_get_global_id(tile_gid_2, 2);
+
+ auto tile_co = writer->declare_tile("co", ckw::TileInfo(ckw::DataType::Int32)); // OFM
+ auto tile_xo = writer->declare_tile("xo", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH
+ auto tile_yo = writer->declare_tile("yo", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT
+ auto tile_bo = writer->declare_tile("bo", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX
+
+ // Calculate coordinates
+ get_coordinate_from_gws_overlapping_min(writer, tile_co, tile_gid_0, const_n0_i32, const_shift_back_n0_i32,
+ const_0_i32);
+ writer->op_assign(tile_xo, tile_gid_1);
+ writer->op_binary(tile_yo, ckw::BinaryOp::Mod, tile_gid_2, const_dst_h_i32);
+ writer->op_binary(tile_bo, ckw::BinaryOp::Div, tile_gid_2, const_dst_h_i32);
+
+ /********************************************************************************
+ * 8 - Write the rest of the code
+ ********************************************************************************/
+ auto tile_xi_f = writer->declare_tile("xi_f", ckw::DataType::Fp32);
+ auto tile_yi_f = writer->declare_tile("yi_f", ckw::DataType::Fp32);
+
+ switch (_attributes.sampling_policy())
+ {
+ case SamplingPolicy::TOP_LEFT:
+ // xi_f = (xo * scale_x)
+ // yi_f = (yo * scale_y)
+ writer->op_cast(tile_xi_f, tile_xo, ckw::ConvertPolicy::None);
+ writer->op_cast(tile_yi_f, tile_yo, ckw::ConvertPolicy::None);
+ writer->op_binary(tile_xi_f, ckw::BinaryOp::Mul, tile_xi_f, const_scale_x_fp);
+ writer->op_binary(tile_yi_f, ckw::BinaryOp::Mul, tile_yi_f, const_scale_y_fp);
+ break;
+ case SamplingPolicy::CENTER:
+ {
+ // xi_f = ((xo + 0.5f) * scale_x - 0.5f)
+ // yi_f = ((yo + 0.5f) * scale_y - 0.5f)
+ const auto &tile_xo_plus_half = writer->declare_tile("xo_plus_half", ckw::DataType::Fp32);
+ const auto &tile_yo_plus_half = writer->declare_tile("yo_plus_half", ckw::DataType::Fp32);
+
+ writer->op_cast(tile_xo_plus_half, tile_xo, ckw::ConvertPolicy::None);
+ writer->op_cast(tile_yo_plus_half, tile_yo, ckw::ConvertPolicy::None);
+ writer->op_binary(tile_xo_plus_half, ckw::BinaryOp::Add, tile_xo_plus_half, const_pos_0_5_fp);
+ writer->op_binary(tile_yo_plus_half, ckw::BinaryOp::Add, tile_yo_plus_half, const_pos_0_5_fp);
+ writer->op_binary(tile_xi_f, ckw::BinaryOp::Mul, tile_xo_plus_half, const_scale_x_fp);
+ writer->op_binary(tile_yi_f, ckw::BinaryOp::Mul, tile_yo_plus_half, const_scale_y_fp);
+
+ writer->op_binary(tile_xi_f, ckw::BinaryOp::Sub, tile_xi_f, const_pos_0_5_fp);
+ writer->op_binary(tile_yi_f, ckw::BinaryOp::Sub, tile_yi_f, const_pos_0_5_fp);
+ }
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported sampling policy");
+ }
+
+ // xi = (int)floor(xi_f);
+ // yi = (int)floor(yi_f);
+ auto tile_xi_f_floor = writer->declare_tile("xi_f_floor", ckw::DataType::Fp32);
+ auto tile_yi_f_floor = writer->declare_tile("yi_f_floor", ckw::DataType::Fp32);
+ writer->op_unary(tile_xi_f_floor, ckw::UnaryOp::Floor, tile_xi_f);
+ writer->op_unary(tile_yi_f_floor, ckw::UnaryOp::Floor, tile_yi_f);
+
+ auto tile_xi = writer->declare_tile("xi", ckw::DataType::Int32);
+ auto tile_yi = writer->declare_tile("yi", ckw::DataType::Int32);
+ writer->op_cast(tile_xi, tile_xi_f_floor, ckw::ConvertPolicy::None);
+ writer->op_cast(tile_yi, tile_yi_f_floor, ckw::ConvertPolicy::None);
+
+ // xi0 = clamp(xi, 0, (int)src_w - 1);
+ // yi0 = clamp(yi, 0, (int)src_h - 1);
+ // xi1 = clamp(xi + 1, 0, (int)src_w - 1);
+ // yi1 = clamp(yi + 1, 0, (int)src_h - 1);
+ auto tile_src_w_minus_1 = writer->declare_tile("src_w_minus_1", ckw::DataType::Int32);
+ auto tile_src_h_minus_1 = writer->declare_tile("src_h_minus_1", ckw::DataType::Int32);
+ writer->op_binary(tile_src_w_minus_1, ckw::BinaryOp::Sub, const_src_w_i32, const_pos_1_i32);
+ writer->op_binary(tile_src_h_minus_1, ckw::BinaryOp::Sub, const_src_h_i32, const_pos_1_i32);
+
+ auto tile_xi_plus_1 = writer->declare_tile("xi_plus_1", ckw::DataType::Int32);
+ auto tile_yi_plus_1 = writer->declare_tile("yi_plus_1", ckw::DataType::Int32);
+ writer->op_binary(tile_xi_plus_1, ckw::BinaryOp::Add, tile_xi, const_pos_1_i32);
+ writer->op_binary(tile_yi_plus_1, ckw::BinaryOp::Add, tile_yi, const_pos_1_i32);
+
+ auto tile_xi0 = writer->declare_tile("xi0", ckw::DataType::Int32);
+ auto tile_yi0 = writer->declare_tile("yi0", ckw::DataType::Int32);
+ auto tile_xi1 = writer->declare_tile("xi1", ckw::DataType::Int32);
+ auto tile_yi1 = writer->declare_tile("yi1", ckw::DataType::Int32);
+
+ writer->op_ternary(tile_xi0, ckw::TernaryOp::Clamp, tile_xi, const_0_i32, tile_src_w_minus_1);
+ writer->op_ternary(tile_yi0, ckw::TernaryOp::Clamp, tile_yi, const_0_i32, tile_src_h_minus_1);
+ writer->op_ternary(tile_xi1, ckw::TernaryOp::Clamp, tile_xi_plus_1, const_0_i32, tile_src_w_minus_1);
+ writer->op_ternary(tile_yi1, ckw::TernaryOp::Clamp, tile_yi_plus_1, const_0_i32, tile_src_h_minus_1);
+
+ auto tile_in00 = writer->declare_tile("in00", ckw::TileInfo(dst_dt, 1, dst_n0));
+ auto tile_in01 = writer->declare_tile("in01", ckw::TileInfo(dst_dt, 1, dst_n0));
+ auto tile_in10 = writer->declare_tile("in10", ckw::TileInfo(dst_dt, 1, dst_n0));
+ auto tile_in11 = writer->declare_tile("in11", ckw::TileInfo(dst_dt, 1, dst_n0));
+
+ writer->op_load(tile_in00, src->tensor(), sampler_src, tile_co, tile_xi0, tile_yi0, tile_bo);
+ writer->op_load(tile_in01, src->tensor(), sampler_src, tile_co, tile_xi1, tile_yi0, tile_bo);
+ writer->op_load(tile_in10, src->tensor(), sampler_src, tile_co, tile_xi0, tile_yi1, tile_bo);
+ writer->op_load(tile_in11, src->tensor(), sampler_src, tile_co, tile_xi1, tile_yi1, tile_bo);
+
+ // Weights of each nearest pixel
+ auto tile_a = writer->declare_tile("a", ckw::DataType::Fp32);
+ auto tile_b = writer->declare_tile("b", ckw::DataType::Fp32);
+ auto tile_a1 = writer->declare_tile("a1", ckw::DataType::Fp32);
+ auto tile_b1 = writer->declare_tile("b1", ckw::DataType::Fp32);
+
+ // a = (xi_f - (float)xi)
+ // b = (1.f - a)
+ // a1 = (yi_f - (float)yi)
+ // b1 = (1.f - a1)
+ auto tile_xi_float = writer->declare_tile("xi_float", ckw::DataType::Fp32);
+ auto tile_yi_float = writer->declare_tile("yi_float", ckw::DataType::Fp32);
+ writer->op_cast(tile_xi_float, tile_xi, ckw::ConvertPolicy::None);
+ writer->op_cast(tile_yi_float, tile_yi, ckw::ConvertPolicy::None);
+
+ writer->op_binary(tile_a, ckw::BinaryOp::Sub, tile_xi_f, tile_xi_float);
+ writer->op_binary(tile_b, ckw::BinaryOp::Sub, const_pos_1_fp, tile_a);
+ writer->op_binary(tile_a1, ckw::BinaryOp::Sub, tile_yi_f, tile_yi_float);
+ writer->op_binary(tile_b1, ckw::BinaryOp::Sub, const_pos_1_fp, tile_a1);
+
+ // Cast weights to source type
+ const auto &tile_a_src_type = writer->declare_tile("a_src_t", to_ckw(_src->data_type()));
+ const auto &tile_b_src_type = writer->declare_tile("b_src_t", to_ckw(_src->data_type()));
+ const auto &tile_a1_src_type = writer->declare_tile("a1_src_t", to_ckw(_src->data_type()));
+ const auto &tile_b1_src_type = writer->declare_tile("b1_src_t", to_ckw(_src->data_type()));
+
+ writer->op_cast(tile_a_src_type, tile_a, ckw::ConvertPolicy::None);
+ writer->op_cast(tile_b_src_type, tile_b, ckw::ConvertPolicy::None);
+ writer->op_cast(tile_a1_src_type, tile_a1, ckw::ConvertPolicy::None);
+ writer->op_cast(tile_b1_src_type, tile_b1, ckw::ConvertPolicy::None);
+
+ // in00 * b * b1
+ writer->op_binary(tile_in00, ckw::BinaryOp::Mul, tile_in00, tile_b_src_type);
+ writer->op_binary(tile_in00, ckw::BinaryOp::Mul, tile_in00, tile_b1_src_type);
+
+ // in01 * a * b1
+ writer->op_binary(tile_in01, ckw::BinaryOp::Mul, tile_in01, tile_a_src_type);
+ writer->op_binary(tile_in01, ckw::BinaryOp::Mul, tile_in01, tile_b1_src_type);
+
+ // in10 * b * a1
+ writer->op_binary(tile_in10, ckw::BinaryOp::Mul, tile_in10, tile_b_src_type);
+ writer->op_binary(tile_in10, ckw::BinaryOp::Mul, tile_in10, tile_a1_src_type);
+
+ // in11 * a * a1
+ writer->op_binary(tile_in11, ckw::BinaryOp::Mul, tile_in11, tile_a_src_type);
+ writer->op_binary(tile_in11, ckw::BinaryOp::Mul, tile_in11, tile_a1_src_type);
+
+ // Summation of above terms
+ writer->op_assign(tile_dst, tile_in00);
+ writer->op_binary(tile_dst, ckw::BinaryOp::Add, tile_dst, tile_in01);
+ writer->op_binary(tile_dst, ckw::BinaryOp::Add, tile_dst, tile_in10);
+ writer->op_binary(tile_dst, ckw::BinaryOp::Add, tile_dst, tile_in11);
+}
+
+void GpuCkwResize::write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const
+{
+ switch (_attributes.interpolation_policy())
+ {
+ case InterpolationPolicy::NEAREST_NEIGHBOR:
+ do_nearest_neighbor_resize(comp_group, vtable, writer);
+ break;
+ case InterpolationPolicy::BILINEAR:
+ do_bilinear_resize(comp_group, vtable, writer);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported interpolation policy");
+ }
+}
+
+Window GpuCkwResize::get_window() const
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
+
+ const uint32_t n0 = adjust_vec_size(opencl_vector_size_in_bytes / _src->element_size(), _src->dimension(0));
+ Window win = calculate_max_window(*_dst, Steps(n0));
+ return win.collapse(win, Window::DimZ);
+}
+
+std::string GpuCkwResize::get_tuner_id(const ComponentGroup &comp_group) const
+{
+ ARM_COMPUTE_UNUSED(comp_group);
+
+ std::string tuner_id = "resize_";
+ tuner_id += _attributes.interpolation_policy() == InterpolationPolicy::NEAREST_NEIGHBOR ? "nearest_neighbor" : "";
+ tuner_id += _attributes.interpolation_policy() == InterpolationPolicy::BILINEAR ? "bilinear" : "";
+ tuner_id += "_";
+ tuner_id += _attributes.sampling_policy() == SamplingPolicy::CENTER ? "center" : "topleft";
+ tuner_id += "_";
+ tuner_id += support::cpp11::to_string(_dst->dimension(0));
+ tuner_id += "_";
+ tuner_id += support::cpp11::to_string(_dst->dimension(1));
+ tuner_id += "_";
+ tuner_id += support::cpp11::to_string(_dst->dimension(2));
+ tuner_id += "_";
+ tuner_id += support::cpp11::to_string(_dst->dimension(3));
+
+ return tuner_id;
+}
+
+std::string GpuCkwResize::get_name(const ComponentGroup &comp_group) const
+{
+ ARM_COMPUTE_UNUSED(comp_group);
+
+ std::string name = "resize_";
+ name += _attributes.interpolation_policy() == InterpolationPolicy::NEAREST_NEIGHBOR ? "nearest_neighbor" : "";
+ name += _attributes.interpolation_policy() == InterpolationPolicy::BILINEAR ? "bilinear" : "";
+
+ return name;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.h
new file mode 100644
index 0000000000..1266c05921
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWRESIZE_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWRESIZE_H
+
+#include "src/core/common/Macros.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class GpuCkwResize final : public IGpuCkwComponentDriver
+{
+public:
+ using Attributes = ClComponentResize::Attributes;
+
+public:
+ /** Constructor
+ *
+ * @param[in] id Component id
+ * @param[in] tensors Tensor arguments to the components
+ * @param[in] attributes Component attributes
+ */
+ GpuCkwResize(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes);
+
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwResize);
+
+ /** Destructor */
+ ~GpuCkwResize() override = default;
+
+ // Inherited methods overriden
+ virtual void write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const override;
+ Window get_window() const override;
+ std::string get_name(const ComponentGroup &comp_group) const override;
+ std::string get_tuner_id(const ComponentGroup &comp_group) const override;
+
+private:
+ /** Resize using nearest neighbor interpolation
+ *
+ * @param[in] comp_group Component group to which this component belongs to
+ * @param[in, out] vtable Table of variables declared by this component
+ * @param[in, out] writer CKW writer that writes code scoped to this kernel component
+ */
+ void do_nearest_neighbor_resize(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const;
+
+ /** Resize using bilinear interpolation
+ *
+ * @param[in] comp_group Component group to which this component belongs to
+ * @param[in, out] vtable Table of variables declared by this component
+ * @param[in, out] writer CKW writer that writes code scoped to this kernel component
+ */
+ void do_bilinear_resize(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const;
+
+ const ITensorInfo *_src;
+ const ITensorInfo *_dst;
+ Attributes _attributes;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWRESIZE_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp
new file mode 100644
index 0000000000..d9d741fea5
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "GpuCkwStore.h"
+
+#include "arm_compute/core/Error.h"
+
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
+
+#include <cstdint>
+#include <string>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+GpuCkwStore::GpuCkwStore(ComponentId id, const ArgumentPack<ITensorInfo> &tensors)
+ : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{}
+{
+ _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
+ _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
+}
+void GpuCkwStore::write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const
+{
+ /********************************************************************************
+ * 1 - Define tensors
+ ********************************************************************************/
+ GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src");
+ GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
+
+ /********************************************************************************
+ * 2 - Define CKW constants
+ ********************************************************************************/
+ const auto dst_h = static_cast<int32_t>(_dst->dimension(2));
+
+ auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32));
+ auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32));
+ auto const_dst_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_h}}, ckw::DataType::Int32));
+
+ /********************************************************************************
+ * 3 - Define the compute block parameters and destination tile (if not root component)
+ * Bind the tile to the tensor to share it among different components and
+ * initialize the compute block parameters
+ ********************************************************************************/
+ const auto &tile_src = src->tile();
+ auto &sampler_src = src->tensor_sampler();
+
+ const auto dst_n0 = static_cast<int32_t>(tile_src.tile_info().width());
+ const auto dst_m0 = static_cast<int32_t>(tile_src.tile_info().height());
+ const int32_t dst_n0_partial = _dst->dimension(0) % dst_n0;
+ const int32_t dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0;
+
+ /********************************************************************************
+ * 4 - Define the compute block parameters CKW constants
+ ********************************************************************************/
+ auto const_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32));
+ auto const_m0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_m0}}, ckw::DataType::Int32));
+ auto const_shift_back_n0_i32 =
+ writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32));
+
+ /********************************************************************************
+ * 5 - Define the samplers for the input tensor
+ ********************************************************************************/
+ // Not required
+
+ /********************************************************************************
+ * 6 - Extra operations required before writing the main code
+ ********************************************************************************/
+ // Not required
+
+ /********************************************************************************
+ * 7 - Get the coordinates of the destination tile
+ ********************************************************************************/
+ auto tile_gid_0 = writer->declare_tile("gid_0", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_1 = writer->declare_tile("gid_1", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_2 = writer->declare_tile("gid_2", ckw::TileInfo(ckw::DataType::Int32));
+
+ writer->op_get_global_id(tile_gid_0, 0);
+ writer->op_get_global_id(tile_gid_1, 1);
+ writer->op_get_global_id(tile_gid_2, 2);
+
+ auto tile_nout0 = writer->declare_tile("cout0", ckw::TileInfo(ckw::DataType::Int32)); // OFM
+ auto tile_mout0 = writer->declare_tile("mout0", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH or WIDTH x HEIGHT
+ auto tile_mout1 = writer->declare_tile("mout1", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT or 0
+ auto tile_bout0 = writer->declare_tile("bout0", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX
+
+ // Calculate coordinates
+ get_coordinate_from_gws_overlapping_min(writer, tile_nout0, tile_gid_0, const_n0_i32, const_shift_back_n0_i32,
+ const_0_i32);
+ get_coordinate_from_gws(writer, tile_mout0, tile_gid_1, const_m0_i32);
+
+ // Get the boundary aware coordinates at each global dimension index
+ if (sampler_src.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1)
+ {
+ writer->op_assign(tile_mout1, const_0_i32);
+ get_coordinate_from_gws(writer, tile_bout0, tile_gid_2, const_pos_1_i32);
+ }
+ else if (sampler_src.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2)
+ {
+ // For tile_mout1 and tile_bout0 the step can only be 1
+ writer->op_binary(tile_mout1, ckw::BinaryOp::Mod, tile_gid_2, const_dst_h_i32);
+ writer->op_binary(tile_bout0, ckw::BinaryOp::Div, tile_gid_2, const_dst_h_i32);
+ }
+
+ /********************************************************************************
+ * 8 - Write the rest of the code
+ ********************************************************************************/
+ writer->op_store(dst->tensor(), tile_src, sampler_src, tile_nout0, tile_mout0, tile_mout1, tile_bout0);
+}
+
+std::string GpuCkwStore::get_name(const ComponentGroup &comp_group) const
+{
+ ARM_COMPUTE_UNUSED(comp_group);
+ return "store";
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h
new file mode 100644
index 0000000000..c9ce7eb269
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWSTORE_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWSTORE_H
+
+#include "src/core/common/Macros.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class GpuCkwStore : public IGpuCkwComponentDriver
+{
+public:
+ /** Constructor
+ *
+ * @param[in] id Component id
+ * @param[in] tensors Tensor arguments to the component
+ */
+ GpuCkwStore(ComponentId id, const ArgumentPack<ITensorInfo> &tensors);
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwStore);
+ /** Destructor */
+ ~GpuCkwStore() override = default;
+ // Inherited methods overriden:
+ virtual void write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const override;
+ std::string get_name(const ComponentGroup &comp_group) const override;
+
+private:
+ const ITensorInfo *_src;
+ const ITensorInfo *_dst;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWSTORE_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.cpp
new file mode 100644
index 0000000000..1e6f0841ad
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.cpp
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "CkwHelper.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+void get_coordinate_from_gws(GpuCkwScopedKernelWriter writer,
+ ckw::TileOperand &coord,
+ const ckw::TileOperand &gid,
+ ckw::TileOperand &step)
+{
+ writer->op_binary(coord, ckw::BinaryOp::Mul, gid, step);
+}
+
+void get_coordinate_from_gws_overlapping_min(GpuCkwScopedKernelWriter writer,
+ ckw::TileOperand &coord,
+ const ckw::TileOperand &gid,
+ ckw::TileOperand &step,
+ ckw::TileOperand &shift_back,
+ ckw::TileOperand &const_0)
+{
+ // Applied formula: max((gid * step) - shift_back, 0)
+ // where the shift_back operand is: (step - leftover_step) % step
+
+ writer->op_binary(coord, ckw::BinaryOp::Mul, gid, step);
+ writer->op_binary(coord, ckw::BinaryOp::Sub, coord, shift_back);
+ writer->op_binary(coord, ckw::BinaryOp::Max, coord, const_0);
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h
new file mode 100644
index 0000000000..956e7c8ecb
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_CKWHELPER_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_CKWHELPER_H
+
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Get coordinate along one axis.
+ *
+ * @param[in,out] writer Writer
+ * @param[out] coord Resultant coordinate
+ * @param[in] gid Global work item id
+ * @param[in] step Step size / vector size
+ */
+void get_coordinate_from_gws(GpuCkwScopedKernelWriter writer,
+ ckw::TileOperand &coord,
+ const ckw::TileOperand &gid,
+ ckw::TileOperand &step);
+
+/** Get boundary aware coordinate along one axis.
+ *
+ * @param[in,out] writer Writer
+ * @param[out] coord Resultant coordinate
+ * @param[in] gid Global work item id
+ * @param[in] step Step size / vector size
+ * @param[in] shift_back It is (step - leftover_step) % step
+ * @param[in] const_0 Constant tile of value 0
+ */
+void get_coordinate_from_gws_overlapping_min(GpuCkwScopedKernelWriter writer,
+ ckw::TileOperand &coord,
+ const ckw::TileOperand &gid,
+ ckw::TileOperand &step,
+ ckw::TileOperand &shift_back,
+ ckw::TileOperand &const_0);
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_CKWHELPER_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.cpp
new file mode 100644
index 0000000000..ad31b06362
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.cpp
@@ -0,0 +1,162 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Common.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+ckw::DataType to_ckw(DataType dt)
+{
+ switch (dt)
+ {
+ case DataType::F32:
+ return ckw::DataType::Fp32;
+ case DataType::F16:
+ return ckw::DataType::Fp16;
+ case DataType::S32:
+ return ckw::DataType::Int32;
+ case DataType::S16:
+ return ckw::DataType::Int16;
+ case DataType::S8:
+ case DataType::QASYMM8_SIGNED:
+ return ckw::DataType::Int8;
+ case DataType::U32:
+ return ckw::DataType::Uint32;
+ case DataType::U16:
+ return ckw::DataType::Uint16;
+ case DataType::U8:
+ case DataType::QASYMM8:
+ return ckw::DataType::Uint8;
+ default:
+ return ckw::DataType::Unknown;
+ }
+}
+
+ckw::TensorShape to_ckw(const TensorShape &shape)
+{
+ ARM_COMPUTE_ERROR_ON(shape.num_max_dimensions < std::tuple_size<ckw::TensorShape>{});
+ ARM_COMPUTE_ERROR_ON(std::tuple_size<ckw::TensorShape>{} != 5);
+ /// NOTE: Overflow danger. Use size_t?
+ return ckw::TensorShape{static_cast<int32_t>(shape[0]), static_cast<int32_t>(shape[1]),
+ static_cast<int32_t>(shape[2]), static_cast<int32_t>(shape[3]),
+ static_cast<int32_t>(shape[4])};
+}
+
+ckw::TensorDataLayout to_ckw(DataLayout dl)
+{
+ switch (dl)
+ {
+ case DataLayout::NHWC:
+ return ckw::TensorDataLayout::Nhwc;
+ case DataLayout::NDHWC:
+ return ckw::TensorDataLayout::Ndhwc;
+ default:
+ return ckw::TensorDataLayout::Unknown;
+ }
+}
+
+ckw::TensorInfo to_ckw(const ITensorInfo &tensor_info)
+{
+ return ckw::TensorInfo{to_ckw(tensor_info.data_type()), to_ckw(tensor_info.tensor_shape()),
+ to_ckw(tensor_info.data_layout()), tensor_info.id()};
+}
+
+ckw::TensorStorageType to_ckw(const TensorStorageType &storage)
+{
+ switch (storage)
+ {
+ case TensorStorageType::ClBufferUint8Ptr:
+ return ckw::TensorStorageType::BufferUint8Ptr;
+ case TensorStorageType::ClImage2dReadOnly:
+ return ckw::TensorStorageType::Texture2dReadOnly;
+ case TensorStorageType::ClImage2dWriteOnly:
+ return ckw::TensorStorageType::Texture2dWriteOnly;
+ case TensorStorageType::Unknown:
+ return ckw::TensorStorageType::Unknown;
+ default:
+ ARM_COMPUTE_ERROR("Unknown tensor storage type");
+ }
+}
+
+TensorComponentType from_ckw(const ckw::TensorComponentType &component)
+{
+ switch (component)
+ {
+ case ckw::TensorComponentType::OffsetFirstElement:
+ return TensorComponentType::OffsetFirstElement;
+ case ckw::TensorComponentType::Stride0:
+ return TensorComponentType::Stride0;
+ case ckw::TensorComponentType::Stride1:
+ return TensorComponentType::Stride1;
+ case ckw::TensorComponentType::Stride2:
+ return TensorComponentType::Stride2;
+ case ckw::TensorComponentType::Stride3:
+ return TensorComponentType::Stride3;
+ case ckw::TensorComponentType::Stride4:
+ return TensorComponentType::Stride4;
+ case ckw::TensorComponentType::Dim0:
+ return TensorComponentType::Dim0;
+ case ckw::TensorComponentType::Dim1:
+ return TensorComponentType::Dim1;
+ case ckw::TensorComponentType::Dim2:
+ return TensorComponentType::Dim2;
+ case ckw::TensorComponentType::Dim3:
+ return TensorComponentType::Dim3;
+ case ckw::TensorComponentType::Dim4:
+ return TensorComponentType::Dim4;
+ case ckw::TensorComponentType::Dim1xDim2:
+ return TensorComponentType::Dim1xDim2;
+ case ckw::TensorComponentType::Dim2xDim3:
+ return TensorComponentType::Dim2xDim3;
+ case ckw::TensorComponentType::Dim1xDim2xDim3:
+ return TensorComponentType::Dim1xDim2xDim3;
+ case ckw::TensorComponentType::Unknown:
+ return TensorComponentType::Unknown;
+ default:
+ ARM_COMPUTE_ERROR("Unknown CKW tensor component");
+ }
+}
+
+TensorStorageType from_ckw(const ckw::TensorStorageType &storage)
+{
+ switch (storage)
+ {
+ case ckw::TensorStorageType::BufferUint8Ptr:
+ return TensorStorageType::ClBufferUint8Ptr;
+ case ckw::TensorStorageType::Texture2dReadOnly:
+ return TensorStorageType::ClImage2dReadOnly;
+ case ckw::TensorStorageType::Texture2dWriteOnly:
+ return TensorStorageType::ClImage2dWriteOnly;
+ case ckw::TensorStorageType::Unknown:
+ return TensorStorageType::Unknown;
+ default:
+ ARM_COMPUTE_ERROR("Unknown CKW tensor storage type");
+ }
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h
new file mode 100644
index 0000000000..26740cdd04
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_COMMON_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_COMMON_H
+
+#include "arm_compute/core/CoreTypes.h"
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/TensorShape.h"
+
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+
+#include "compute_kernel_writer/include/ckw/TensorInfo.h"
+#include "compute_kernel_writer/include/ckw/types/DataType.h"
+#include "compute_kernel_writer/include/ckw/types/TensorComponentType.h"
+#include "compute_kernel_writer/include/ckw/types/TensorStorageType.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Convert the Compute Library data type to Compute Kernel Writer data type
+ *
+ * @param[in] dt The Compute Library data type
+ *
+ * @return the Compute Kernel Writer data type (ckw::DataType)
+ */
+ckw::DataType to_ckw(DataType dt);
+
+/** Convert the Compute Library tensor shape to Compute Kernel Writer tensor shape
+ *
+ * @param[in] shape The Compute Library tensor shape
+ *
+ * @return the Compute Kernel Writer tensor shape (ckw::TensorShape)
+ */
+ckw::TensorShape to_ckw(const TensorShape &shape);
+
+/** Convert the Compute Library data layout to Compute Kernel Writer data layout
+ *
+ * @param[in] dl The Compute Library data layout
+ *
+ * @return the Compute Kernel Writer data layout (ckw::TensorDataLayout)
+ */
+ckw::TensorDataLayout to_ckw(DataLayout dl);
+
+/** Convert the Compute Library tensor info to Compute Kernel Writer tensor info
+ *
+ * @param[in] tensor_info The Compute Library tensor info
+ *
+ * @return the Compute Kernel Writer tensor info (ckw::TensorInfo)
+ */
+ckw::TensorInfo to_ckw(const ITensorInfo &tensor_info);
+
+/** Convert the Compute Library tensor storage to Compute Kernel Writer tensor storage
+ *
+ * @param[in] storage The Compute Library tensor storage
+ *
+ * @return the Compute Kernel Writer tensor storate (ckw::TensorStorageType)
+ */
+ckw::TensorStorageType to_ckw(const TensorStorageType &storage);
+
+/** Convert the Compute Kernel Writer tensor component to Compute Library tensor component
+ *
+ * @param[in] component The Compute Kernel Writer tensor component
+ *
+ * @return the Compute Library tensor component
+ */
+TensorComponentType from_ckw(const ckw::TensorComponentType &component);
+
+/** Convert the Compute Kernel Writer tensor storage to Compute Library tensor storage
+ *
+ * @param[in] storage The Compute Kernel Writer tensor storage
+ *
+ * @return the Compute Library tensor storage
+ */
+TensorStorageType from_ckw(const ckw::TensorStorageType &storage);
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_COMMON_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.cpp
new file mode 100644
index 0000000000..5630e390d5
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h"
+
+#include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+ckw::BinaryOp to_ckw(const ElementwiseBinaryCommonAttributes &attributes)
+{
+ switch (attributes.operation())
+ {
+ case ElementwiseBinaryCommonAttributes::ElementwiseOp::Add:
+ return ckw::BinaryOp::Add;
+ case ElementwiseBinaryCommonAttributes::ElementwiseOp::Sub:
+ return ckw::BinaryOp::Sub;
+ case ElementwiseBinaryCommonAttributes::ElementwiseOp::Div:
+ return ckw::BinaryOp::Div;
+ case ElementwiseBinaryCommonAttributes::ElementwiseOp::Mul:
+ return ckw::BinaryOp::Mul;
+ case ElementwiseBinaryCommonAttributes::ElementwiseOp::Min:
+ case ElementwiseBinaryCommonAttributes::ElementwiseOp::Max:
+ case ElementwiseBinaryCommonAttributes::ElementwiseOp::Power:
+ case ElementwiseBinaryCommonAttributes::ElementwiseOp::Prelu:
+ case ElementwiseBinaryCommonAttributes::ElementwiseOp::SquaredDiff:
+ default:
+ ARM_COMPUTE_ERROR("Cannot convert ElementwiseBinaryCommonAttributes to corresponding ckw::BinaryOp");
+ }
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h
new file mode 100644
index 0000000000..644a407702
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_ELEMENTWISEBINARY_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_ELEMENTWISEBINARY_H
+
+#include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h"
+
+#include "compute_kernel_writer/include/ckw/types/Operators.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+ckw::BinaryOp to_ckw(const ElementwiseBinaryCommonAttributes &attributes);
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_ELEMENTWISEBINARY_H