Diffstat (limited to 'src/dynamic_fusion')
-rw-r--r--  src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp | 148
-rw-r--r--  src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h | 81
-rw-r--r--  src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp | 380
-rw-r--r--  src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.cpp | 105
-rw-r--r--  src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h | 70
-rw-r--r--  src/dynamic_fusion/sketch/ArgumentPack.h | 237
-rw-r--r--  src/dynamic_fusion/sketch/attributes/CastAttributes.cpp | 56
-rw-r--r--  src/dynamic_fusion/sketch/attributes/ClampAttributes.cpp | 58
-rw-r--r--  src/dynamic_fusion/sketch/attributes/Conv2dAttributes.cpp | 62
-rw-r--r--  src/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.cpp | 86
-rw-r--r--  src/dynamic_fusion/sketch/attributes/MatMulAttributes.cpp | 52
-rw-r--r--  src/dynamic_fusion/sketch/attributes/Pool2dAttributes.cpp | 91
-rw-r--r--  src/dynamic_fusion/sketch/attributes/ReshapeAttributes.cpp | 44
-rw-r--r--  src/dynamic_fusion/sketch/attributes/ResizeAttributes.cpp | 90
-rw-r--r--  src/dynamic_fusion/sketch/attributes/SoftmaxAttributes.cpp | 68
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuComponentServices.h | 54
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h | 160
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp | 71
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h | 111
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp | 368
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h | 156
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp | 73
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h | 93
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h | 127
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp | 62
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.h | 74
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp | 168
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h | 115
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuOperatorProperties.h | 54
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp | 152
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h | 107
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp | 69
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h | 134
-rw-r--r--  src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h | 300
-rw-r--r--  src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h | 75
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp | 105
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h | 121
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp | 139
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h | 81
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp | 68
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h | 72
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp | 73
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h | 72
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h | 140
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp | 295
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.h | 68
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp | 256
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.h | 68
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.cpp | 361
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.h | 80
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp | 427
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h | 85
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp | 434
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h | 70
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.cpp | 287
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.h | 86
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp | 405
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h | 78
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.cpp | 576
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.h | 93
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp | 144
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h | 62
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.cpp | 56
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h | 65
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.cpp | 162
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h | 103
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.cpp | 57
-rw-r--r--  src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h | 42
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h | 65
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h | 118
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/Types.h | 52
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.cpp | 83
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h | 119
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.cpp | 87
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h | 133
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp | 224
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h | 174
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp | 166
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h | 151
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp | 127
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h | 117
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.cpp | 148
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.h | 123
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp | 107
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h | 131
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.cpp | 69
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h | 102
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.cpp | 91
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h | 126
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp | 57
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h | 100
-rw-r--r--  src/dynamic_fusion/sketch/gpu/components/utils/type_printer/ElementwiseBinary.h | 75
-rw-r--r--  src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp | 72
-rw-r--r--  src/dynamic_fusion/sketch/gpu/operators/GpuCast.cpp | 172
-rw-r--r--  src/dynamic_fusion/sketch/gpu/operators/GpuClamp.cpp | 172
-rw-r--r--  src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp | 270
-rw-r--r--  src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp | 278
-rw-r--r--  src/dynamic_fusion/sketch/gpu/operators/GpuMatMul.cpp | 245
-rw-r--r--  src/dynamic_fusion/sketch/gpu/operators/GpuMul.cpp | 70
-rw-r--r--  src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp | 134
-rw-r--r--  src/dynamic_fusion/sketch/gpu/operators/GpuPool2d.cpp | 208
-rw-r--r--  src/dynamic_fusion/sketch/gpu/operators/GpuReshape.cpp | 163
-rw-r--r--  src/dynamic_fusion/sketch/gpu/operators/GpuResize.cpp | 195
-rw-r--r--  src/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.cpp | 164
-rw-r--r--  src/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.cpp | 200
-rw-r--r--  src/dynamic_fusion/sketch/gpu/operators/GpuSub.cpp | 71
-rw-r--r--  src/dynamic_fusion/sketch/gpu/operators/GpuTanh.cpp | 163
-rw-r--r--  src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.cpp | 200
-rw-r--r--  src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h | 117
-rw-r--r--  src/dynamic_fusion/sketch/utils/DependencyGraph.h | 648
-rw-r--r--  src/dynamic_fusion/utils/Utils.h | 83
111 files changed, 15752 insertions, 0 deletions
diff --git a/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp
new file mode 100644
index 0000000000..eab5cddd07
--- /dev/null
+++ b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ClKernelRuntime.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+
+#include "src/core/CL/CLUtils.h"
+#include "src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h"
+#include "src/gpu/cl/ClKernelLibrary.h"
+#include "support/Cast.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+using namespace arm_compute::opencl;
+
+void ClKernelRuntime::configure(const ClCompileContext &compile_ctx, const GpuKernelSourceCode &code)
+{
+ // Create kernel from kernel source string
+ opencl::ClKernelLibrary &klib = opencl::ClKernelLibrary::get();
+ _kernel = static_cast<cl::Kernel>(compile_ctx.create_kernel(
+ code.name(),
+ code.name(), // program name has to be provided to differentiate between different unfusable components' kernels.
+ // Each program contains exactly one kernel
+ code.code(), klib.kernel_path() /* Kernel path: Used in cases of embedded kernels */,
+ code.build_options().options(), false /* Is source binary */));
+
+ // Configure execution window
+ IClKernel::configure_internal(code.window());
+
+ // Set config id for lws tuning
+ _config_id = code.config_id();
+
+ // Set kernel arguments
+ _arguments = code.arguments();
+}
+
+inline void ClKernelRuntime::add_kernel_argument(unsigned int &idx,
+ const GpuKernelArgumentBinding &arg,
+ const ICLTensor *tensor,
+ std::vector<cl::Image2D> &cl_images)
+{
+ switch (arg.type())
+ {
+ case GpuKernelArgumentBinding::Type::TensorStorage:
+ {
+ switch (arg.tensor_storage_type())
+ {
+ case TensorStorageType::ClBufferUint8Ptr:
+ {
+ cl_add_buffer_argument(_kernel, idx, tensor->cl_buffer());
+ break;
+ }
+ case TensorStorageType::ClImage2dReadOnly:
+ {
+ cl::Image2D tensor_image2d = create_image2d_from_tensor(tensor, CLImage2DType::ReadOnly);
+ cl_images.push_back(tensor_image2d);
+ cl_add_texture_argument(_kernel, idx, tensor_image2d);
+ break;
+ }
+ case TensorStorageType::ClImage2dWriteOnly:
+ {
+ cl::Image2D tensor_image2d = create_image2d_from_tensor(tensor, CLImage2DType::WriteOnly);
+ cl_images.push_back(tensor_image2d);
+ cl_add_texture_argument(_kernel, idx, tensor_image2d);
+ break;
+ }
+ default:
+ {
+                ARM_COMPUTE_ERROR("Unsupported TensorStorageType");
+ break;
+ }
+ }
+ break;
+ }
+ case GpuKernelArgumentBinding::Type::TensorComponent:
+ {
+ cl_add_tensor_component_argument(_kernel, idx, tensor, arg.tensor_component_type());
+ break;
+ }
+ default:
+ {
+            ARM_COMPUTE_ERROR("Unsupported kernel argument type");
+ break;
+ }
+ }
+}
+
+void ClKernelRuntime::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_3D();
+
+    /// NOTE: These parameters are taken from the older, non-fused kernels. So far they appear to be constant,
+    /// but they may need to become another configuration passed in from GpuWorkloadSourceCode in the future.
+ constexpr bool skip_sliding_window = false;
+ constexpr bool use_dummy_work_items = false;
+
+ unsigned int idx = 0;
+ do
+ {
+ // Set kernel arguments
+ // CLImages created from tensor arguments. Need to be retained until enqueue
+ std::vector<cl::Image2D> cl_images;
+
+ for (const auto &arg : _arguments)
+ {
+ auto tensor = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(arg.id()));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(tensor->info());
+ add_kernel_argument(idx, arg, tensor, cl_images);
+ }
+
+ // Dispatch kernel
+ enqueue(queue, *this, slice, lws_hint(), use_dummy_work_items);
+ } while (skip_sliding_window && window.slide_window_slice_3D(slice));
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h
new file mode 100644
index 0000000000..148e4db581
--- /dev/null
+++ b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_RUNTIME_GPU_CL_CLKERNELRUNTIME_H
+#define ACL_SRC_DYNAMIC_FUSION_RUNTIME_GPU_CL_CLKERNELRUNTIME_H
+
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+class GpuKernelSourceCode;
+
+/** OpenCL runtime to run a single kernel */
+class ClKernelRuntime final : public opencl::IClKernel
+{
+public:
+ /** Configure the kernel runtime
+ *
+ * @param[in] compile_ctx OpenCL compile context
+ * @param[in] code Kernel source code
+ */
+ void configure(const opencl::ClCompileContext &compile_ctx, const GpuKernelSourceCode &code);
+ /** Run the kernel
+ *
+ * @param[in,out] tensors @ref ITensorPack object containing run-time tensor memories
+ * @param[in] window Execution window
+ * @param[in] queue OpenCL command queue
+ */
+ virtual void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ /** Set a kernel argument as part of a tensor
+ *
+ * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
+ * @param[in] arg Kernel argument binding, as part of @p tensor
+ * @param[in] tensor Tensor of which the kernel argument @p arg is a part of
+ * @param[out] cl_images Extra cl images created from the tensor (will need to be retained until the kernel is enqueued)
+ */
+ inline void add_kernel_argument(unsigned int &idx,
+ const GpuKernelArgumentBinding &arg,
+ const ICLTensor *tensor,
+ std::vector<cl::Image2D> &cl_images);
+
+private:
+ GpuKernelArgumentList _arguments{};
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_SRC_DYNAMIC_FUSION_RUNTIME_GPU_CL_CLKERNELRUNTIME_H
diff --git a/src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp b/src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp
new file mode 100644
index 0000000000..3500a0e60d
--- /dev/null
+++ b/src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp
@@ -0,0 +1,380 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h"
+
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+#include "src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h"
+#include "support/Cast.h"
+
+#include <algorithm>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+/** Holder of any auxiliary @ref CLTensor required by a @ref GpuWorkloadSourceCode.
+ *
+ * @note The tensors are not allocated by default, and require the user to explicitly allocate them using the associated @ref TensorInfo and @ref AuxMemoryInfo
+ *
+ * @note This data holder must remain valid until the @ref ClWorkloadRuntime that uses it is out of scope
+ */
+class ClAuxTensors
+{
+public:
+    /** A view of a single auxiliary tensor and the associated @ref TensorInfo and @ref AuxMemoryInfo
+ */
+ struct DataView
+ {
+ DataView() = default;
+ DataView(CLTensor *tensor, const TensorInfo &tensor_info, const AuxMemoryInfo &memory_info)
+ : tensor{tensor}, tensor_info{tensor_info}, memory_info{memory_info}
+ {
+ }
+ ~DataView() = default;
+ DataView(const DataView &other) = default;
+ DataView &operator=(const DataView &other) = default;
+ DataView(DataView &&other) = default;
+ DataView &operator=(DataView &&other) = default;
+ CLTensor *tensor{}; /**< Pointer to the auxiliary tensor */
+ TensorInfo tensor_info{}; /**< Associated tensor info */
+ AuxMemoryInfo memory_info{}; /**< Memory requirement */
+ };
+
+ /** Get views of all auxiliary tensors. This is mainly used for allocating the auxiliary tensors. */
+ std::vector<DataView> get_tensors()
+ {
+ return _tensors;
+ }
+ std::vector<DataView> get_tensors() const
+ {
+ return _tensors;
+ }
+
+ friend Status create_aux_tensors(ClAuxTensors *aux_tensors, const GpuWorkloadSourceCode &code);
+
+private:
+ /** Add auxiliary tensor.
+ *
+ * @param[in] tensor_info @ref ITensorInfo of the auxiliary tensor
+ * @param[in] memory_info Memory requirements of the auxiliary tensor
+ *
+ * @return CLTensor* Corresponding tensor memory if successfully added, otherwise nullptr
+ */
+ CLTensor *add_aux_tensor(const ITensorInfo &tensor_info, const AuxMemoryInfo &aux_memory_info)
+ {
+ const auto t_id = tensor_info.id();
+ auto find_tensor_pair = _owned_tensors.find(t_id);
+ if (find_tensor_pair != _owned_tensors.end())
+ {
+ return find_tensor_pair->second.get();
+ }
+ else
+ {
+ auto tensor = std::make_unique<CLTensor>();
+ auto inserted_pair = _owned_tensors.emplace(t_id, std::move(tensor)).first;
+ auto new_tensor = inserted_pair->second.get();
+ _tensors.emplace_back(new_tensor, tensor_info, aux_memory_info);
+ return new_tensor;
+ }
+ }
+
+ std::map<ITensorInfo::Id, std::unique_ptr<CLTensor>> _owned_tensors{};
+ std::vector<DataView> _tensors{};
+};
+/** Construct auxiliary tensors required by @ref GpuWorkloadSourceCode
+ *
+ * @note This is the only recommended way for users to create a @ref ClAuxTensors object
+ *
+ * @param[out] aux_tensors Auxiliary tensors required by the workload code
+ * @param[in] code @ref GpuWorkloadSourceCode which all tensors bind to
+ *
+ * @return Status
+ */
+Status create_aux_tensors(ClAuxTensors *aux_tensors, const GpuWorkloadSourceCode &code)
+{
+ for (auto t_id : code.tensors())
+ {
+ // Get tensor object
+ const auto workload_arg = code.query_tensor(t_id);
+ ICLTensor *tensor_object = nullptr;
+ if (workload_arg->memory_descriptor()->memory_type == MemoryType::Auxiliary)
+ {
+ // Create aux tensor CLTensor object
+ const TensorInfo tensor_info = *workload_arg->tensor_info();
+ ARM_COMPUTE_ERROR_ON(tensor_info.id() != t_id);
+ const auto aux_memory_info = workload_arg->memory_descriptor()->aux_memory_info;
+ tensor_object = aux_tensors->add_aux_tensor(tensor_info, aux_memory_info);
+
+ if (tensor_object == nullptr)
+ {
+ return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Failed to construct an auxiliary tensor");
+ }
+ }
+ }
+ return Status{};
+}
+
+/** A fast lookup table for retrieving runtime tensor objects
+ */
+class ClTensorLUT
+{
+public:
+ /** Find a tensor pack associated with the @ref UnitWorkloadId @p uwk_id
+ *
+ * @param[in] uwk_id @ref UnitWorkloadId associated with the tensor pack
+ *
+ * @return ITensorPack*
+ */
+ ITensorPack *find_tensor_pack(UnitWorkloadId uwk_id)
+ {
+ auto tensor_pack = _tensor_packs.find(uwk_id);
+ if (tensor_pack != _tensor_packs.end())
+ {
+ return &(tensor_pack->second);
+ }
+ return nullptr;
+ }
+    /** Get the tensor pack associated with @p uwk_id. Throws an exception if it cannot be found.
+ *
+ * @param[in] uwk_id @ref UnitWorkloadId associated with the tensor pack
+ *
+ * @return ITensorPack*
+ */
+ ITensorPack &get_tensor_pack(UnitWorkloadId uwk_id)
+ {
+ return _tensor_packs.at(uwk_id);
+ }
+
+ friend Status create_tensor_lut(ClTensorLUT *tensor_lut,
+ const GpuWorkloadSourceCode &code,
+ const std::vector<CLTensor *> &user_tensors,
+ const ClAuxTensors &aux_tensors);
+
+private:
+ /** Add a tensor pack and associate it with @ref UnitWorkloadId @p uwk_id
+ *
+ * @param[in] uwk_id @ref UnitWorkloadId associated with the tensor pack
+ * @param[in] tensor_pack Tensor pack to be added
+ */
+ void add_tensor_pack(UnitWorkloadId uwk_id, const ITensorPack &tensor_pack)
+ {
+ _tensor_packs[uwk_id] = tensor_pack;
+ }
+ std::map<UnitWorkloadId, ITensorPack> _tensor_packs{};
+};
+
+/** Create a fast tensor lookup table for runtime tensor retrieval
+ *
+ * @param[out] tensor_lut @ref ClTensorLUT used by the runtime to feed tensor memories to underlying kernels
+ * @param[in] code @ref GpuWorkloadSourceCode which all tensors bind to
+ * @param[in] user_tensors User tensors
+ * @param[in] aux_tensors Auxiliary tensors required by the workload code
+ *
+ * @return Status
+ */
+Status create_tensor_lut(ClTensorLUT *tensor_lut,
+ const GpuWorkloadSourceCode &code,
+ const std::vector<CLTensor *> &user_tensors,
+ const ClAuxTensors &aux_tensors)
+{
+ // Combine user tensors and aux tensors
+ std::map<ITensorInfo::Id, CLTensor *> tensor_map;
+ for (auto tensor : user_tensors)
+ {
+ const auto t_id = tensor->info()->id();
+
+ if (tensor_map.find(t_id) != tensor_map.end())
+ {
+ // In case of elementwise in-place: give another Id to the In/Out tensor when passed again
+ std::vector<ITensorInfo::Id> ids;
+ for (auto &t : tensor_map)
+ {
+ ids.push_back(t.first);
+ }
+ ITensorInfo::Id new_id = *std::max_element(ids.begin(), ids.end()) + 1;
+ tensor_map[new_id] = tensor;
+ }
+ else
+ {
+ tensor_map[t_id] = tensor;
+ }
+ }
+ for (const auto &data : aux_tensors.get_tensors())
+ {
+ const auto t_id = data.tensor_info.id();
+ const auto tensor = data.tensor;
+ if (tensor_map.find(t_id) != tensor_map.end())
+ {
+ return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Clashing tensor ids");
+ }
+ tensor_map[t_id] = tensor;
+ }
+
+ // Add tensor objects into corresponding tensor packs
+ for (auto id_tensor : tensor_map)
+ {
+ const auto t_id = id_tensor.first;
+ const auto tensor_object = id_tensor.second;
+ if (tensor_object == nullptr)
+ {
+ return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Trying to add a nullptr into the tensor packs");
+ }
+ if (tensor_object->allocator()->info().total_size() == 0U)
+ {
+ return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "No allocated memory found in tensor");
+ }
+
+ for (auto uwk_id : code.get_unit_workloads_from_tensor(t_id))
+ {
+ ITensorPack *tensor_pack = tensor_lut->find_tensor_pack(uwk_id);
+ if (tensor_pack == nullptr)
+ {
+ tensor_lut->add_tensor_pack(uwk_id, ITensorPack{{t_id, tensor_object}});
+ }
+ else
+ {
+ tensor_pack->add_tensor(t_id, tensor_object);
+ }
+ }
+ }
+
+ return Status{};
+}
+
+} // namespace
+
+struct ClWorkloadRuntime::Implementation
+{
+ std::map<UnitWorkloadId, std::unique_ptr<ClKernelRuntime>> _kernels{};
+ std::map<UnitWorkloadId, std::unique_ptr<ClKernelRuntime>> _kernels_prep{};
+ bool _is_configured{false};
+ bool _is_prepared{false};
+ ClTensorLUT _tensor_lut{};
+ ClAuxTensors _aux_tensors{};
+ GpuWorkloadSourceCode _source_code{};
+};
+
+ClWorkloadRuntime::ClWorkloadRuntime() : _impl{std::make_unique<Implementation>()}
+{
+}
+
+ClWorkloadRuntime::~ClWorkloadRuntime() = default;
+
+ClWorkloadRuntime::ClWorkloadRuntime(ClWorkloadRuntime &&) = default;
+
+ClWorkloadRuntime &ClWorkloadRuntime::operator=(ClWorkloadRuntime &&) = default;
+
+Status ClWorkloadRuntime::configure(const GpuWorkloadSketch &sketch)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(_impl->_is_configured, "ClWorkloadRuntime cannot be re-configured");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(sketch.gpu_context()->gpu_language() != GpuLanguage::OpenCL,
+ "ClWorkloadRuntime cannot be configured with non-OpenCL workload sketch");
+ // Generate source code
+ _impl->_source_code = sketch.implementation().generate_source_code();
+ // Configure unit workload from source code
+ for (auto uwk_id : _impl->_source_code.unit_workloads())
+ {
+ const auto work = _impl->_source_code.query_unit_workload(uwk_id);
+ const auto stage = work.stage().stage;
+ auto k = std::make_unique<ClKernelRuntime>();
+ k->configure(*sketch.gpu_context()->cl_compile_context(), work.code());
+
+ switch (stage)
+ {
+ case UnitWorkloadStage::Stage::Run:
+ {
+ _impl->_kernels.emplace(work.id(), std::move(k));
+ break;
+ }
+ case UnitWorkloadStage::Stage::Prepare:
+ {
+ _impl->_kernels_prep.emplace(work.id(), std::move(k));
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Invalid unit workload stage");
+ }
+ }
+ }
+ // Create auxiliary tensor objects
+ create_aux_tensors(&_impl->_aux_tensors, _impl->_source_code);
+ _impl->_is_configured = true;
+ return Status{};
+}
+
+void ClWorkloadRuntime::prepare()
+{
+ if (!_impl->_is_prepared)
+ {
+ for (auto &id_kernel_pair : _impl->_kernels_prep)
+ {
+ const bool flush_queue = false;
+ const auto uwk_id = id_kernel_pair.first;
+ auto kernel = id_kernel_pair.second.get();
+ CLScheduler::get().enqueue_op(*kernel, _impl->_tensor_lut.get_tensor_pack(uwk_id), flush_queue);
+ }
+
+ _impl->_is_prepared = true;
+ }
+}
+
+Status ClWorkloadRuntime::run(const std::vector<CLTensor *> &tensors)
+{
+ // Need to create the tensor lut in every run, unless the user can guarantee the binding remains fixed,
+ // in which case the lut can be cached during prepare
+ const auto st = create_tensor_lut(&_impl->_tensor_lut, _impl->_source_code, tensors, _impl->_aux_tensors);
+ ARM_COMPUTE_RETURN_ON_ERROR(st);
+ prepare();
+ for (auto &id_kernel_pair : _impl->_kernels)
+ {
+        // The command queue is not flushed between kernels (flush_queue is kept false)
+ const bool flush_queue = false;
+ const auto uwk_id = id_kernel_pair.first;
+ auto kernel = id_kernel_pair.second.get();
+ CLScheduler::get().enqueue_op(*kernel, _impl->_tensor_lut.get_tensor_pack(uwk_id), flush_queue);
+ }
+ return Status{};
+}
+
+std::vector<std::tuple<CLTensor *, TensorInfo, AuxMemoryInfo>> ClWorkloadRuntime::get_auxiliary_tensors()
+{
+ std::vector<std::tuple<CLTensor *, TensorInfo, AuxMemoryInfo>> aux_tensors;
+ for (const auto &data : _impl->_aux_tensors.get_tensors())
+ {
+ aux_tensors.emplace_back(data.tensor, data.tensor_info, data.memory_info);
+ }
+ return aux_tensors;
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
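Putting the pieces above together: configure() lowers a GpuWorkloadSketch into per-stage ClKernelRuntime objects, get_auxiliary_tensors() exposes the auxiliary CLTensors that the caller must allocate, and run() builds the tensor lookup table, runs the Prepare-stage kernels once and then enqueues the Run-stage kernels. Below is a minimal calling sketch, assuming the sketch has already been built through the operator API; the helper name run_workload and the two user tensors are illustrative and not part of this patch.

    #include "arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h"
    #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"

    using namespace arm_compute;
    using namespace arm_compute::experimental::dynamic_fusion;

    void run_workload(const GpuWorkloadSketch &sketch, CLTensor &src, CLTensor &dst)
    {
        ClWorkloadRuntime runtime;
        runtime.configure(sketch); // lower the sketch into Prepare/Run stage kernels

        // Auxiliary tensors are reported, but not allocated, by the runtime.
        for (auto &data : runtime.get_auxiliary_tensors())
        {
            CLTensor     *tensor      = std::get<0>(data);
            TensorInfo    tensor_info = std::get<1>(data);
            AuxMemoryInfo aux_mem_req = std::get<2>(data);
            tensor->allocator()->init(tensor_info, aux_mem_req.alignment);
            tensor->allocator()->allocate();
        }

        // User tensors are passed in the order their infos were added to the sketch;
        // run() rebuilds the tensor LUT, prepares once, then enqueues the Run-stage kernels.
        runtime.run({&src, &dst});
        CLScheduler::get().sync();
    }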
diff --git a/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.cpp b/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.cpp
new file mode 100644
index 0000000000..7044b0ea66
--- /dev/null
+++ b/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.cpp
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "GpuCkwKernelArgumentsHelpers.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+void cl_add_tensor_component_argument(cl::Kernel &kernel,
+ unsigned int &idx,
+ const ICLTensor *tensor,
+ TensorComponentType component)
+{
+ ARM_COMPUTE_ERROR_ON(tensor == nullptr);
+
+ const auto *info = tensor->info();
+ const auto &strides = info->strides_in_bytes();
+
+ switch (component)
+ {
+ case TensorComponentType::OffsetFirstElement:
+ kernel.setArg<cl_uint>(idx++, info->offset_first_element_in_bytes());
+ break;
+ case TensorComponentType::Stride0:
+ kernel.setArg<cl_uint>(idx++, strides[0]);
+ break;
+ case TensorComponentType::Stride1:
+ kernel.setArg<cl_uint>(idx++, strides[1]);
+ break;
+ case TensorComponentType::Stride2:
+ kernel.setArg<cl_uint>(idx++, strides[2]);
+ break;
+ case TensorComponentType::Stride3:
+ kernel.setArg<cl_uint>(idx++, strides[3]);
+ break;
+ case TensorComponentType::Stride4:
+ kernel.setArg<cl_uint>(idx++, strides[4]);
+ break;
+ case TensorComponentType::Dim0:
+ kernel.setArg<cl_uint>(idx++, info->dimension(0));
+ break;
+ case TensorComponentType::Dim1:
+ kernel.setArg<cl_uint>(idx++, info->dimension(1));
+ break;
+ case TensorComponentType::Dim2:
+ kernel.setArg<cl_uint>(idx++, info->dimension(2));
+ break;
+ case TensorComponentType::Dim3:
+ kernel.setArg<cl_uint>(idx++, info->dimension(3));
+ break;
+ case TensorComponentType::Dim4:
+ kernel.setArg<cl_uint>(idx++, info->dimension(4));
+ break;
+ case TensorComponentType::Dim1xDim2:
+ kernel.setArg<cl_uint>(idx++, info->dimension(1) * info->dimension(2));
+ break;
+ case TensorComponentType::Dim2xDim3:
+ kernel.setArg<cl_uint>(idx++, info->dimension(2) * info->dimension(3));
+ break;
+ case TensorComponentType::Dim1xDim2xDim3:
+ kernel.setArg<cl_uint>(idx++, info->dimension(1) * info->dimension(2) * info->dimension(3));
+ break;
+ case TensorComponentType::Unknown:
+ default:
+ ARM_COMPUTE_ERROR("Unknown tensor component");
+ }
+}
+
+void cl_add_buffer_argument(cl::Kernel &kernel, unsigned int &idx, const cl::Buffer &buffer)
+{
+ kernel.setArg(idx++, buffer);
+}
+
+void cl_add_texture_argument(cl::Kernel &kernel, unsigned int &idx, const cl::Image &image)
+{
+ kernel.setArg(idx++, image);
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h b/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h
new file mode 100644
index 0000000000..306d547acb
--- /dev/null
+++ b/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_SRC_DYNAMIC_FUSION_RUNTIME_GPU_CL_CKW_DRIVER_GPUCKWKERNELARGUMENTSHELPERS
+#define ACL_SRC_DYNAMIC_FUSION_RUNTIME_GPU_CL_CKW_DRIVER_GPUCKWKERNELARGUMENTSHELPERS
+
+#include "arm_compute/core/CL/ICLTensor.h"
+
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Select a Compute Kernel Writer tensor component from a tensor and add it to the kernel's arguments at the specified index @p idx.
+ *
+ * @param[in,out] kernel OpenCL kernel to configure with the provided argument.
+ * @param[in,out] idx Index at which to add the argument.
+ * @param[in] tensor Tensor from which to access the tensor component.
+ * @param[in] component Tensor component to select such as tensor dimensions, strides, etc.
+ */
+void cl_add_tensor_component_argument(cl::Kernel &kernel,
+ unsigned int &idx,
+ const ICLTensor *tensor,
+ TensorComponentType component);
+
+/** Add an OpenCL buffer object to the kernel's arguments at the specified index @p idx.
+ *
+ * @param[in,out] kernel OpenCL kernel to configure with the provided argument.
+ * @param[in,out] idx Index at which to add the argument.
+ * @param[in] buffer OpenCL buffer containing the tensor's data.
+ */
+void cl_add_buffer_argument(cl::Kernel &kernel, unsigned int &idx, const cl::Buffer &buffer);
+
+/** Add an OpenCL image object to the kernel's arguments at the specified index @p idx.
+ *
+ * @param[in,out] kernel OpenCL kernel to configure with the provided argument.
+ * @param[in,out] idx Index at which to add the argument.
+ * @param[in]     image  OpenCL image object containing the tensor's data.
+ */
+void cl_add_texture_argument(cl::Kernel &kernel, unsigned int &idx, const cl::Image &image);
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif /* ACL_SRC_DYNAMIC_FUSION_RUNTIME_GPU_CL_CKW_DRIVER_GPUCKWKERNELARGUMENTSHELPERS */
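These helpers are the bridge used by ClKernelRuntime::add_kernel_argument (earlier in this patch) to turn a GpuKernelArgumentBinding into concrete clSetKernelArg calls: the buffer and texture overloads pass the tensor storage, while the component overload passes individual strides and dimensions as cl_uint values. A small illustrative sketch follows; the argument order here is arbitrary, since in practice it is dictated by the generated kernel's GpuKernelArgumentList.

    #include "src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h"

    using namespace arm_compute;
    using namespace arm_compute::experimental::dynamic_fusion;

    // Bind one tensor as a buffer followed by two of its components (illustrative order only).
    void bind_tensor_example(cl::Kernel &kernel, unsigned int &idx, const ICLTensor *tensor)
    {
        cl_add_buffer_argument(kernel, idx, tensor->cl_buffer());
        cl_add_tensor_component_argument(kernel, idx, tensor, TensorComponentType::Dim0);
        cl_add_tensor_component_argument(kernel, idx, tensor, TensorComponentType::Stride1);
    }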
diff --git a/src/dynamic_fusion/sketch/ArgumentPack.h b/src/dynamic_fusion/sketch/ArgumentPack.h
new file mode 100644
index 0000000000..d030bc3d45
--- /dev/null
+++ b/src/dynamic_fusion/sketch/ArgumentPack.h
@@ -0,0 +1,237 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_ARGUMENTPACK_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_ARGUMENTPACK_H
+
+#include "arm_compute/core/experimental/Types.h"
+
+#include <unordered_map>
+#include <vector>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** This is a generic class that packs the arguments of an operator. For now, it is only used for tensor-related types.
+ * Examples of "tensor-related types": @ref ITensorInfo, @ref ITensor, @ref ICLTensor
+ *
+ * The argument id is the position of the argument within the pack, and is represented by @ref TensorType
+ *
+ * @tparam T Tensor-related type
+ */
+template <typename T>
+class ArgumentPack
+{
+public:
+ /** @ref arm_compute::TensorType encodes the position of a tensor argument within the pack */
+ using Id = TensorType;
+ /** A single argument element within the pack
+     * It contains either a const pointer or a non-const pointer to the Tensor-related type T, but never both at the same time
+ */
+ struct PackElement
+ {
+ PackElement() = default;
+ PackElement(const PackElement &elem) = default;
+ PackElement &operator=(const PackElement &elem) = default;
+ PackElement(PackElement &&elem) = default;
+ PackElement &operator=(PackElement &&elem) = default;
+ PackElement(Id id, T *tensor) : id(id), tensor(tensor), ctensor(nullptr)
+ {
+ }
+ PackElement(Id id, const T *ctensor) : id(id), tensor(nullptr), ctensor(ctensor)
+ {
+ }
+
+ Id id{ACL_UNKNOWN}; /**< Argument id within the pack */
+ T *tensor{nullptr}; /**< Non-const pointer to tensor-related object */
+ const T *ctensor{nullptr}; /**< Const pointer to tensor-related object */
+ };
+
+public:
+ /** Default constructor */
+ ArgumentPack() = default;
+ /** Destructor */
+ ~ArgumentPack() = default;
+ /** Allow instances of this class to be copy constructed */
+ ArgumentPack<T>(const ArgumentPack<T> &other) = default;
+ /** Allow instances of this class to be copied */
+ ArgumentPack<T> &operator=(const ArgumentPack<T> &other) = default;
+ /** Allow instances of this class to be move constructed */
+ ArgumentPack<T>(ArgumentPack<T> &&other) = default;
+ /** Allow instances of this class to be moved */
+ ArgumentPack<T> &operator=(ArgumentPack<T> &&other) = default;
+ /** Initializer list Constructor */
+ ArgumentPack(const std::initializer_list<PackElement> &l) : _pack{}
+ {
+ for (const auto &e : l)
+ {
+ _pack[e.id] = e;
+ }
+ }
+ /** Add tensor to the pack
+ *
+ * @param[in] id ID of the tensor to add
+ * @param[in] tensor Tensor to add
+ */
+ void add_tensor(Id id, T *tensor)
+ {
+ _pack[id] = PackElement(id, tensor);
+ }
+ /** Add const tensor to the pack
+ *
+ * @param[in] id ID of the tensor to add
+ * @param[in] tensor Tensor to add
+ */
+ void add_const_tensor(Id id, const T *tensor)
+ {
+ _pack[id] = PackElement(id, tensor);
+ }
+ /** Get tensor of a given id from the pack
+ *
+ * @param[in] id ID of tensor to extract
+ *
+     * @return Pointer to the tensor if it exists and is non-const, otherwise nullptr
+ */
+ T *get_tensor(Id id)
+ {
+ auto it = _pack.find(id);
+ return it != _pack.end() ? it->second.tensor : nullptr;
+ }
+ /** Get constant tensor of a given id
+ *
+ * @param[in] id ID of tensor to extract
+ *
+     * @return Pointer to the tensor (const or non-const) if it exists, otherwise nullptr
+ */
+ const T *get_const_tensor(Id id) const
+ {
+ auto it = _pack.find(id);
+ if (it != _pack.end())
+ {
+ return it->second.ctensor != nullptr ? it->second.ctensor : it->second.tensor;
+ }
+ return nullptr;
+ }
+ /** Remove the tensor stored with the given id
+ *
+ * @param[in] id ID of tensor to remove
+ */
+ void remove_tensor(Id id)
+ {
+ _pack.erase(id);
+ }
+ /** Pack size accessor
+ *
+ * @return Number of tensors registered to the pack
+ */
+ size_t size() const
+ {
+ return _pack.size();
+ }
+ /** Checks if pack is empty
+ *
+ * @return True if empty else false
+ */
+ bool empty() const
+ {
+ return _pack.empty();
+ }
+ /** Get the ACL_SRC_* tensors
+ *
+ * @return std::vector<T *>
+ */
+ std::vector<T *> get_src_tensors()
+ {
+ std::vector<T *> src_tensors{};
+ for (int id = static_cast<int>(TensorType::ACL_SRC); id <= static_cast<int>(TensorType::ACL_SRC_END); ++id)
+ {
+ auto tensor = get_tensor(static_cast<TensorType>(id));
+ if (tensor != nullptr)
+ {
+ src_tensors.push_back(tensor);
+ }
+ }
+ return src_tensors;
+ }
+ /** Get the const ACL_SRC_* tensors
+ *
+ * @return std::vector<const T *>
+ */
+ std::vector<const T *> get_const_src_tensors() const
+ {
+ std::vector<const T *> src_tensors{};
+ for (int id = static_cast<int>(TensorType::ACL_SRC); id <= static_cast<int>(TensorType::ACL_SRC_END); ++id)
+ {
+ auto tensor = get_const_tensor(static_cast<TensorType>(id));
+ if (tensor != nullptr)
+ {
+ src_tensors.push_back(tensor);
+ }
+ }
+ return src_tensors;
+ }
+ /** Get the ACL_DST_* tensors
+ *
+ * @return std::vector<T *>
+ */
+ std::vector<T *> get_dst_tensors()
+ {
+ std::vector<T *> dst_tensors{};
+ for (int id = static_cast<int>(TensorType::ACL_DST); id <= static_cast<int>(TensorType::ACL_DST_END); ++id)
+ {
+ auto tensor = get_tensor(static_cast<TensorType>(id));
+ if (tensor != nullptr)
+ {
+ dst_tensors.push_back(tensor);
+ }
+ }
+ return dst_tensors;
+ }
+ /** Get the const ACL_DST_* tensors
+ *
+ * @return std::vector<const T *>
+ */
+ std::vector<const T *> get_const_dst_tensors() const
+ {
+ std::vector<const T *> dst_tensors{};
+ for (int id = static_cast<int>(TensorType::ACL_DST); id <= static_cast<int>(TensorType::ACL_DST_END); ++id)
+ {
+ auto tensor = get_const_tensor(static_cast<TensorType>(id));
+ if (tensor != nullptr)
+ {
+ dst_tensors.push_back(tensor);
+ }
+ }
+ return dst_tensors;
+ }
+
+private:
+ std::unordered_map<int, PackElement> _pack{}; /**< Container with the packed tensors */
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_ARGUMENTPACK_H
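As an illustration of how an operator might populate and query the pack, consider the sketch below; the tensor infos and the ids assigned to them are made up for this example, while in the real code the packs are built when operators are added to a sketch.

    #include "arm_compute/core/TensorInfo.h"

    #include "src/dynamic_fusion/sketch/ArgumentPack.h"

    using namespace arm_compute;
    using namespace arm_compute::experimental::dynamic_fusion;

    void argument_pack_example()
    {
        TensorInfo src_info{}, wei_info{}, dst_info{}; // contents are irrelevant for this sketch

        // The argument id is the TensorType position within the pack.
        ArgumentPack<ITensorInfo> pack{{ACL_SRC_0, &src_info}, {ACL_SRC_1, &wei_info}, {ACL_DST_0, &dst_info}};

        const ITensorInfo *src = pack.get_const_tensor(ACL_SRC_0); // falls back to the non-const pointer
        std::vector<ITensorInfo *> dsts = pack.get_dst_tensors();  // every ACL_DST_* entry added so far
        (void)src;
        (void)dsts;
    }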
diff --git a/src/dynamic_fusion/sketch/attributes/CastAttributes.cpp b/src/dynamic_fusion/sketch/attributes/CastAttributes.cpp
new file mode 100644
index 0000000000..4ad94268f4
--- /dev/null
+++ b/src/dynamic_fusion/sketch/attributes/CastAttributes.cpp
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/dynamic_fusion/sketch/attributes/CastAttributes.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+CastAttributes &CastAttributes::data_type(const DataType &data_type)
+{
+ _data_type = data_type;
+ return *this;
+}
+
+DataType CastAttributes::data_type() const
+{
+ return _data_type;
+}
+
+CastAttributes &CastAttributes::convert_policy(const ConvertPolicy &convert_policy)
+{
+ _convert_policy = convert_policy;
+ return *this;
+}
+
+ConvertPolicy CastAttributes::convert_policy() const
+{
+ return _convert_policy;
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/attributes/ClampAttributes.cpp b/src/dynamic_fusion/sketch/attributes/ClampAttributes.cpp
new file mode 100644
index 0000000000..b177f760df
--- /dev/null
+++ b/src/dynamic_fusion/sketch/attributes/ClampAttributes.cpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/dynamic_fusion/sketch/attributes/ClampAttributes.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+
+ClampAttributes &ClampAttributes::min_val(const float &min_val)
+{
+ _min_val = min_val;
+ return *this;
+}
+
+float ClampAttributes::min_val() const
+{
+ return _min_val;
+}
+
+ClampAttributes &ClampAttributes::max_val(const float &max_val)
+{
+ _max_val = max_val;
+ return *this;
+}
+
+float ClampAttributes::max_val() const
+{
+ return _max_val;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/attributes/Conv2dAttributes.cpp b/src/dynamic_fusion/sketch/attributes/Conv2dAttributes.cpp
new file mode 100644
index 0000000000..97e74f742d
--- /dev/null
+++ b/src/dynamic_fusion/sketch/attributes/Conv2dAttributes.cpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+Conv2dAttributes &Conv2dAttributes::pad(const Padding2D &pad)
+{
+ _pad = pad;
+ return *this;
+}
+Padding2D Conv2dAttributes::pad() const
+{
+ return _pad;
+}
+Conv2dAttributes &Conv2dAttributes::stride(const Size2D &stride)
+{
+ _stride = stride;
+ return *this;
+}
+Size2D Conv2dAttributes::stride() const
+{
+ return _stride;
+}
+Conv2dAttributes &Conv2dAttributes::dilation(const Size2D &dilation)
+{
+ _dilation = dilation;
+ return *this;
+}
+Size2D Conv2dAttributes::dilation() const
+{
+ return _dilation;
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
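All of these attribute classes follow the same fluent pattern: each setter stores the value and returns the object, so an operator's attributes can be built up in a single chained expression. A short sketch using Conv2dAttributes is shown below; the padding, stride and dilation values are arbitrary example values.

    #include "arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h"

    using namespace arm_compute;
    using namespace arm_compute::experimental::dynamic_fusion;

    Conv2dAttributes make_example_conv2d_attributes()
    {
        // Setters return *this, so the attributes can be chained in one expression.
        return Conv2dAttributes{}
            .pad(Padding2D{1, 1, 1, 1}) // left, right, top, bottom (example values)
            .stride(Size2D{1, 1})
            .dilation(Size2D{1, 1});
    }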
diff --git a/src/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.cpp b/src/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.cpp
new file mode 100644
index 0000000000..6f3816568c
--- /dev/null
+++ b/src/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.cpp
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+DepthwiseConv2dAttributes &DepthwiseConv2dAttributes::pad(const Padding2D &pad)
+{
+ _pad = pad;
+ return *this;
+}
+Padding2D DepthwiseConv2dAttributes::pad() const
+{
+ return _pad;
+}
+DepthwiseConv2dAttributes &DepthwiseConv2dAttributes::stride(const Size2D &stride)
+{
+ _stride = stride;
+ return *this;
+}
+Size2D DepthwiseConv2dAttributes::stride() const
+{
+ return _stride;
+}
+DepthwiseConv2dAttributes &DepthwiseConv2dAttributes::dilation(const Size2D &dilation)
+{
+ _dilation = dilation;
+ return *this;
+}
+Size2D DepthwiseConv2dAttributes::dilation() const
+{
+ return _dilation;
+}
+
+DepthwiseConv2dAttributes &DepthwiseConv2dAttributes::depth_multiplier(const uint32_t &depth_multiplier)
+{
+ _depth_multiplier = depth_multiplier;
+ return *this;
+}
+
+uint32_t DepthwiseConv2dAttributes::depth_multiplier() const
+{
+ return _depth_multiplier;
+}
+
+DepthwiseConv2dAttributes &
+DepthwiseConv2dAttributes::dimension_rounding_type(const DimensionRoundingType &dimension_rounding_type)
+{
+ _dimension_rounding_type = dimension_rounding_type;
+ return *this;
+}
+
+DimensionRoundingType DepthwiseConv2dAttributes::dimension_rounding_type() const
+{
+ return _dimension_rounding_type;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/attributes/MatMulAttributes.cpp b/src/dynamic_fusion/sketch/attributes/MatMulAttributes.cpp
new file mode 100644
index 0000000000..027b550377
--- /dev/null
+++ b/src/dynamic_fusion/sketch/attributes/MatMulAttributes.cpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/dynamic_fusion/sketch/attributes/MatMulAttributes.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+MatMulAttributes MatMulAttributes::adj_lhs(bool adj_lhs)
+{
+ _adj_lhs = adj_lhs;
+ return *this;
+}
+MatMulAttributes MatMulAttributes::adj_rhs(bool adj_rhs)
+{
+ _adj_rhs = adj_rhs;
+ return *this;
+}
+bool MatMulAttributes::adj_lhs() const
+{
+ return _adj_lhs;
+}
+bool MatMulAttributes::adj_rhs() const
+{
+ return _adj_rhs;
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/attributes/Pool2dAttributes.cpp b/src/dynamic_fusion/sketch/attributes/Pool2dAttributes.cpp
new file mode 100644
index 0000000000..80f65f926a
--- /dev/null
+++ b/src/dynamic_fusion/sketch/attributes/Pool2dAttributes.cpp
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h"
+
+#include "arm_compute/core/Size2D.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+PoolingType Pool2dAttributes::pool_type() const
+{
+ return _pool_type;
+}
+
+Pool2dAttributes Pool2dAttributes::pool_type(PoolingType pool_type)
+{
+ _pool_type = pool_type;
+ return *this;
+}
+
+Padding2D Pool2dAttributes::pad() const
+{
+ return _pad;
+}
+
+Pool2dAttributes Pool2dAttributes::pad(const Padding2D &pad)
+{
+ _pad = pad;
+ return *this;
+}
+
+Size2D Pool2dAttributes::pool_size() const
+{
+ return _pool_size;
+}
+
+Pool2dAttributes Pool2dAttributes::pool_size(const Size2D &pool_size)
+{
+ _pool_size = pool_size;
+ return *this;
+}
+
+Size2D Pool2dAttributes::stride() const
+{
+ return _stride;
+}
+
+Pool2dAttributes Pool2dAttributes::stride(const Size2D &stride)
+{
+ _stride = stride;
+ return *this;
+}
+
+bool Pool2dAttributes::exclude_padding() const
+{
+ return _exclude_padding;
+}
+
+Pool2dAttributes Pool2dAttributes::exclude_padding(bool exclude_padding)
+{
+ _exclude_padding = exclude_padding;
+ return *this;
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/attributes/ReshapeAttributes.cpp b/src/dynamic_fusion/sketch/attributes/ReshapeAttributes.cpp
new file mode 100644
index 0000000000..0938c0df84
--- /dev/null
+++ b/src/dynamic_fusion/sketch/attributes/ReshapeAttributes.cpp
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/dynamic_fusion/sketch/attributes/ReshapeAttributes.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+ReshapeAttributes &ReshapeAttributes::shape(const TensorShape &shape)
+{
+ _shape = shape;
+ return *this;
+}
+TensorShape ReshapeAttributes::shape() const
+{
+ return _shape;
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/attributes/ResizeAttributes.cpp b/src/dynamic_fusion/sketch/attributes/ResizeAttributes.cpp
new file mode 100644
index 0000000000..1919dbc72d
--- /dev/null
+++ b/src/dynamic_fusion/sketch/attributes/ResizeAttributes.cpp
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/dynamic_fusion/sketch/attributes/ResizeAttributes.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+ResizeAttributes &ResizeAttributes::output_width(int32_t output_width)
+{
+ _output_width = output_width;
+ return *this;
+}
+
+int32_t ResizeAttributes::output_width() const
+{
+ return _output_width;
+}
+
+ResizeAttributes &ResizeAttributes::output_height(int32_t output_height)
+{
+ _output_height = output_height;
+ return *this;
+}
+
+int32_t ResizeAttributes::output_height() const
+{
+ return _output_height;
+}
+
+ResizeAttributes &ResizeAttributes::interpolation_policy(InterpolationPolicy interpolation_policy)
+{
+ _interpolation_policy = interpolation_policy;
+ return *this;
+}
+
+InterpolationPolicy ResizeAttributes::interpolation_policy() const
+{
+ return _interpolation_policy;
+}
+
+ResizeAttributes &ResizeAttributes::sampling_policy(SamplingPolicy sampling_policy)
+{
+ _sampling_policy = sampling_policy;
+ return *this;
+}
+
+SamplingPolicy ResizeAttributes::sampling_policy() const
+{
+ return _sampling_policy;
+}
+
+ResizeAttributes &ResizeAttributes::align_corners(bool align_corners)
+{
+ _align_corners = align_corners;
+ return *this;
+}
+
+bool ResizeAttributes::align_corners() const
+{
+ return _align_corners;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/attributes/SoftmaxAttributes.cpp b/src/dynamic_fusion/sketch/attributes/SoftmaxAttributes.cpp
new file mode 100644
index 0000000000..5d4d666263
--- /dev/null
+++ b/src/dynamic_fusion/sketch/attributes/SoftmaxAttributes.cpp
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/dynamic_fusion/sketch/attributes/SoftmaxAttributes.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+SoftmaxAttributes &SoftmaxAttributes::beta(float beta)
+{
+ _beta = beta;
+ return *this;
+}
+
+float SoftmaxAttributes::beta() const
+{
+ return _beta;
+}
+
+SoftmaxAttributes &SoftmaxAttributes::is_log_softmax(bool is_log_softmax)
+{
+ _is_log_softmax = is_log_softmax;
+ return *this;
+}
+
+bool SoftmaxAttributes::is_log_softmax() const
+{
+ return _is_log_softmax;
+}
+
+SoftmaxAttributes &SoftmaxAttributes::axis(int axis)
+{
+ _axis = axis;
+ return *this;
+}
+
+int SoftmaxAttributes::axis() const
+{
+ return _axis;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/GpuComponentServices.h b/src/dynamic_fusion/sketch/gpu/GpuComponentServices.h
new file mode 100644
index 0000000000..93881508bb
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuComponentServices.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTSERVICES
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTSERVICES
+
+#include "src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Services that are used throughout the creation phase of workload code
+ */
+class GpuComponentServices
+{
+public:
+ /** Default constructor */
+ GpuComponentServices() = default;
+ /** Get reference to component factory */
+ GpuKernelComponentFactory &component_factory()
+ {
+ return _comp_factory;
+ }
+
+private:
+ GpuKernelComponentFactory _comp_factory{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTSERVICES */
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h b/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h
new file mode 100644
index 0000000000..c923bf9c16
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELARGUMENT_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELARGUMENT_H
+
+#include "arm_compute/core/TensorInfo.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Describe how the tensor runtime memory can be accessed
+ *
+ * Please see documentation under @ref GpuKernelArgumentBinding
+ */
+enum class TensorStorageType
+{
+ Unknown,
+ ClBufferUint8Ptr,
+ ClImage2dReadOnly,
+ ClImage2dWriteOnly,
+};
+
+/** Describe additional runtime information about the tensor
+ *
+ * Please see documentation under @ref GpuKernelArgumentBinding
+ */
+enum class TensorComponentType
+{
+ Unknown,
+ OffsetFirstElement,
+ Stride0,
+ Stride1,
+ Stride2,
+ Stride3,
+ Stride4,
+ Dim0,
+ Dim1,
+ Dim2,
+ Dim3,
+ Dim4,
+ Dim1xDim2,
+ Dim2xDim3,
+ Dim1xDim2xDim3,
+};
+
+/** Describe how to extract information from a runtime Gpu tensor, and set it as an argument to a gpu kernel at runtime
+ *
+ * A kernel argument is just an argument to the gpu kernel as shown in the argument list below. This contrasts with a "workload argument" which is a tensor (@ref GpuWorkloadArgument)
+ * void kernel(arg0, arg1, ... argN)
+ *
+ * In a kernel generated using dynamic fusion (@ref GpuKernelSourceCode), every kernel argument describes part of a tensor.
+ * A tensor is described as: **storages** followed by **components**
+ *
+ * A storage (@ref TensorStorageType) describes how the tensor runtime memory can be accessed (e.g. via a global uint8 pointer to a CL buffer)
+ * A component (@ref TensorComponentType) describes additional runtime information about the tensor (e.g. the dimensions of the tensor)
+ *
+ * The arguments are arranged in the order of use in the generated kernel code:
+ *
+ *   arg0   , arg1      , arg2      , ...            , argN
+ *   storage, component0, component1, ..., componentX, storage, component0, component1, ..., componentY
+ *   |                   tensor0                     |                    tensor1                      |
+ *
+ * An example argument list:
+ *
+ * void kernel(
+ * image2d_t t0_image, // TensorStorageType::ClImage2dReadOnly
+ * uint8_t* t0_ptr, // TensorStorageType::ClBufferUint8Ptr
+ * uint t0_dim0, // TensorComponentType::Dim0
+ * uint t0_stride1, // TensorComponentType::Stride1
+ * image2d_t t1_ptr, // TensorStorageType::ClImage2dReadOnly
+ * uint t1_dim1xdim2, // TensorComponentType::Dim1xDim2
+ * uint t1_stride1, // TensorComponentType::Stride1
+ * uint t1_stride2, // TensorComponentType::Stride2
+ * )
+ *
+ */
+class GpuKernelArgumentBinding
+{
+public:
+ enum class Type : int32_t
+ {
+ TensorStorage, /** @ref TensorStorageType */
+ TensorComponent /** @ref TensorComponentType */
+ };
+ GpuKernelArgumentBinding(ITensorInfo::Id id, TensorStorageType storage)
+ : _type{Type::TensorStorage}, _id{id}, _value{}
+ {
+ _value.tensor_storage_type = storage;
+ }
+ GpuKernelArgumentBinding(ITensorInfo::Id id, TensorComponentType component)
+ : _type{Type::TensorComponent}, _id{id}, _value{}
+ {
+ _value.tensor_component_type = component;
+ }
+ /** Storage type of the tensor
+ */
+ TensorStorageType tensor_storage_type() const
+ {
+ ARM_COMPUTE_ERROR_ON(_type != Type::TensorStorage);
+ return _value.tensor_storage_type;
+ }
+ /** Component of the tensor
+ */
+ TensorComponentType tensor_component_type() const
+ {
+ ARM_COMPUTE_ERROR_ON(_type != Type::TensorComponent);
+ return _value.tensor_component_type;
+ }
+ /** Id of the tensor this kernel argument belongs to
+ */
+ ITensorInfo::Id id() const
+ {
+ return _id;
+ }
+ /** Type of the kernel argument
+ */
+ Type type() const
+ {
+ return _type;
+ }
+
+private:
+ Type _type;
+ ITensorInfo::Id _id;
+ union Value
+ {
+ TensorStorageType tensor_storage_type;
+ TensorComponentType tensor_component_type;
+ };
+ Value _value;
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELARGUMENT_H
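The argument ordering documented above (one storage binding followed by that tensor's component bindings, repeated per tensor) maps directly onto a sequence of GpuKernelArgumentBinding objects. A minimal sketch, assuming tensor ids 0 and 1 and an arbitrary pick of components; the GpuKernelArgumentList alias over std::deque used for this purpose is introduced later in GpuKernelSourceCode.h:

    std::deque<GpuKernelArgumentBinding> args{};
    // Tensor 0: its storage binding comes first, then its runtime components.
    args.emplace_back(0, TensorStorageType::ClBufferUint8Ptr);
    args.emplace_back(0, TensorComponentType::Dim0);
    args.emplace_back(0, TensorComponentType::Stride1);
    // Tensor 1: same layout, different storage and components.
    args.emplace_back(1, TensorStorageType::ClImage2dReadOnly);
    args.emplace_back(1, TensorComponentType::Dim1xDim2);
    args.emplace_back(1, TensorComponentType::Stride1);
    args.emplace_back(1, TensorComponentType::Stride2);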
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp
new file mode 100644
index 0000000000..1a458c9862
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "GpuKernelComponentGraph.h"
+
+#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+std::vector<DependencyGraph::TensorId>
+GpuKernelComponentGraph::get_tensor_ids(const std::vector<const ITensorInfo *> tensors)
+{
+ std::vector<DependencyGraph::TensorId> tensor_ids{};
+ std::transform(std::begin(tensors), std::end(tensors), std::back_inserter(tensor_ids),
+ [](const auto &t) { return t->id(); });
+ return tensor_ids;
+}
+
+GpuKernelComponentGraph::GpuKernelComponentGraph(GpuWorkloadContext *context, GpuComponentServices *services)
+ : _context{context}, _services{services}, _components{}, _tensors{}, _dependency_graph{}
+{
+}
+
+GpuKernelComponentStream GpuKernelComponentGraph::fuse(const MemoryDescriptorMap &mem_map) const
+{
+ GpuKernelComponentStream stream{_context, _services, mem_map};
+ const auto op_seq = _dependency_graph.build_operators_sequence();
+
+ stream.new_component_group();
+ for (auto op : op_seq)
+ {
+ const auto component = _components.at(op.op).get();
+ const auto success = stream.add_component(component);
+ if (!success) // Assume first failure was because the root component is unfusable
+ {
+ stream.new_component_group();
+ const auto success = stream.add_component(component);
+ ARM_COMPUTE_ERROR_ON(!success);
+ ARM_COMPUTE_UNUSED(success);
+ }
+ }
+
+ return stream;
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h
new file mode 100644
index 0000000000..6f871a3c90
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGRAPH
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGRAPH
+
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuComponentServices.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h"
+#include "src/dynamic_fusion/sketch/utils/DependencyGraph.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class IGpuKernelComponent;
+
+/** A multi-input (tensors), multi-output (tensors) acyclic directed graph of gpu kernel components
+ * Its main purposes are:
+ * - Perform "graph-level" optimizations like fusion of kernel components (not the fusion of operators)
+ * - Automatically assign memory descriptions @ref MemoryDescriptor of all tensors based on graph topology
+ */
+class GpuKernelComponentGraph
+{
+public:
+ /** Constructor
+ *
+ * @param[in] context @ref GpuWorkloadContext to be used by the graph
+ * @param[in] services @ref GpuComponentServices to be used by the graph
+ */
+ GpuKernelComponentGraph(GpuWorkloadContext *context, GpuComponentServices *services);
+ /** Prevent instances of this class from being copy constructed */
+ GpuKernelComponentGraph(const GpuKernelComponentGraph &graph) = delete;
+ /** Prevent instances of this class from being copied */
+ GpuKernelComponentGraph &operator=(const GpuKernelComponentGraph &graph) = delete;
+ /** Allow instances of this class to be move constructed */
+ GpuKernelComponentGraph(GpuKernelComponentGraph &&graph) = default;
+ /** Allow instances of this class to be moved */
+ GpuKernelComponentGraph &operator=(GpuKernelComponentGraph &&graph) = default;
+ /** Create a new component and add it to the component graph
+ * Component id is automatically allocated
+ *
+ * @tparam T Component type
+ * @tparam Args Component argument types
+ *
+ * @param[in] args Component arguments except for component id, which is auto-allocated
+ */
+ template <typename T, typename... Args>
+ void add_new_component(Args &&...args)
+ {
+ auto comp = _services->component_factory().create<T>(std::forward<Args>(args)...);
+ ArgumentPack<ITensorInfo> tensors = comp->tensors();
+ const auto src_tensor_ids = get_tensor_ids(tensors.get_const_src_tensors());
+ const auto dst_tensor_ids = get_tensor_ids(tensors.get_const_dst_tensors());
+ bool success = _dependency_graph.add_operator(comp->id(), src_tensor_ids, dst_tensor_ids);
+ ARM_COMPUTE_UNUSED(success);
+ ARM_COMPUTE_ERROR_ON(!success);
+ _components[comp->id()] = std::move(comp);
+ for (auto t : tensors.get_const_src_tensors())
+ {
+ _tensors[t->id()] = t;
+ }
+ for (auto t : tensors.get_const_dst_tensors())
+ {
+ _tensors[t->id()] = t;
+ }
+ }
+ /** Perform component fusion and serialize the graph into a stream of component groups
+ *
+ * @param[in] mem_map MemoryDescriptorMap for all the tensors in the component graph
+ *
+ * @return GpuKernelComponentStream
+ */
+ GpuKernelComponentStream fuse(const MemoryDescriptorMap &mem_map) const;
+
+private:
+ static std::vector<DependencyGraph::TensorId> get_tensor_ids(const std::vector<const ITensorInfo *> tensors);
+ GpuWorkloadContext *_context;
+ GpuComponentServices *_services;
+ std::map<ComponentId, std::unique_ptr<IGpuKernelComponent>> _components;
+ std::map<ITensorInfo::Id, const ITensorInfo *> _tensors;
+ DependencyGraph _dependency_graph{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGRAPH */
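add_new_component() and fuse() together define the intended call sequence for this class. A sketch of that flow, under the assumption that a GpuWorkloadContext, a GpuComponentServices instance, suitable component construction arguments and a MemoryDescriptorMap already exist (ClComponentStore is only one possible component type):

    GpuKernelComponentGraph graph(&context, &services);
    graph.add_new_component<ClComponentStore>(properties, tensors); // component id is allocated by the factory
    GpuKernelComponentStream stream  = graph.fuse(mem_map);         // fusion + serialization into component groups
    GpuWorkloadSourceCode  src_code  = stream.write_workload_code();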
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp
new file mode 100644
index 0000000000..5a6d125d96
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp
@@ -0,0 +1,368 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "GpuKernelComponentGroup.h"
+
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+
+#include <algorithm>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+bool GpuKernelComponentGroup::add_component(ComponentPtr component)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_finalized, "The component group has been finalized and cannot be altered.");
+
+ // note: Constraint 1 is guaranteed as a precondition
+ // Constraint 2
+ if (component->type() != GpuComponentType::Output && _components.size() >= max_fused_components)
+ {
+ return false;
+ }
+ // Constraint 3.1: Pattern: (Unfusable + Output)
+ if (!_components.empty() && get_root_component()->type() == GpuComponentType::Unfusable &&
+ component->type() != GpuComponentType::Output)
+ {
+ return false;
+ }
+ // Constraint 3.2
+ if (!_components.empty() &&
+ (component->type() != GpuComponentType::Simple && component->type() != GpuComponentType::Output))
+ {
+ return false;
+ }
+ // Constraint 4
+ if (component->type() != GpuComponentType::Unfusable && component->tensors().get_const_dst_tensors().size() != 1U)
+ {
+ return false;
+ }
+ // Constraint 5
+ if (!_components.empty() && !(get_root_component()->properties() == component->properties()))
+ {
+ return false;
+ }
+ // Constraint 7
+ if (!_components.empty())
+ {
+ const auto root_dst_tensors = get_root_component()->tensors().get_const_dst_tensors();
+ ARM_COMPUTE_ERROR_ON(root_dst_tensors.empty());
+ const auto first_dst_tensor = root_dst_tensors[0];
+ const auto dst_tensors = component->tensors().get_const_dst_tensors();
+ for (const auto &t : root_dst_tensors)
+ {
+ if (detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0))
+ {
+ return false;
+ }
+ }
+ for (const auto &t : dst_tensors)
+ {
+ if (detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0))
+ {
+ return false;
+ }
+ }
+ }
+ // Constraint 8
+ if (!_components.empty())
+ {
+ const auto root_dst_tensors = get_root_component()->tensors().get_const_dst_tensors();
+ ARM_COMPUTE_ERROR_ON(root_dst_tensors.empty());
+ const auto first_dst_tensor_layout = root_dst_tensors[0]->data_layout();
+ const auto dst_tensors = component->tensors().get_const_dst_tensors();
+ for (const auto &t : root_dst_tensors)
+ {
+ if (t->data_layout() != first_dst_tensor_layout)
+ {
+ return false;
+ }
+ }
+ for (const auto &t : dst_tensors)
+ {
+ if (t->data_layout() != first_dst_tensor_layout)
+ {
+ return false;
+ }
+ }
+ }
+ // Constraint 9
+ if (component->tensors().get_const_dst_tensors().size() >= max_dst_tensors)
+ {
+ return false;
+ }
+ // Constraint 9 corollary
+ if (component->type() == GpuComponentType::Output && _components.size() >= max_fused_components + max_dst_tensors)
+ {
+ return false;
+ }
+ _components.push_back(component);
+ return true;
+}
+
+void GpuKernelComponentGroup::finalize()
+{
+ if (_finalized)
+ {
+ return;
+ }
+
+ _finalized = true;
+
+ std::set<const ITensorInfo *> output_tensors;
+ std::map<const ITensorInfo *, std::vector<const ITensorInfo *>> possible_tile_map;
+ std::map<const ITensorInfo *, int32_t> tile_usages;
+
+ for (auto component : _components)
+ {
+ const auto tensors = component->tensors();
+ const auto src_tensors = tensors.get_const_src_tensors();
+ const auto dst_tensors = tensors.get_const_dst_tensors();
+
+ // Detect input, output and intermediate tensors.
+ for (auto tensor : src_tensors)
+ {
+ const auto output_tensors_it = output_tensors.find(tensor);
+
+ if (output_tensors_it != output_tensors.end())
+ {
+ // This tensor is the output of another operator.
+ // It must be marked as intermediate tensor.
+ output_tensors.erase(output_tensors_it);
+ _interm_tensors.insert(tensor);
+ }
+ else if (_interm_tensors.find(tensor) == _interm_tensors.end())
+ {
+ _input_tensors.insert(tensor);
+
+ tile_usages[tensor] = 0;
+ possible_tile_map.emplace(tensor, std::vector<const ITensorInfo *>());
+ }
+ }
+
+ for (auto tensor : dst_tensors)
+ {
+ ARM_COMPUTE_ERROR_ON(_input_tensors.find(tensor) != _input_tensors.end());
+ ARM_COMPUTE_ERROR_ON(output_tensors.find(tensor) != output_tensors.end());
+ ARM_COMPUTE_ERROR_ON(_interm_tensors.find(tensor) != _interm_tensors.end());
+ output_tensors.insert(tensor);
+
+ tile_usages[tensor] = 0;
+ possible_tile_map.emplace(tensor, std::vector<const ITensorInfo *>());
+ }
+
+ // Check if the output can overwrite the input tile.
+ const auto component_type = component->type();
+ if (component_type == GpuComponentType::Simple || component_type == GpuComponentType::Output)
+ {
+ ARM_COMPUTE_ERROR_ON(dst_tensors.size() != 1);
+
+ const auto dst_tensor = dst_tensors[0];
+ const auto &dst_shape = dst_tensor->tensor_shape();
+ const auto &dst_type = dst_tensor->data_type();
+
+ tile_usages[dst_tensor] = 0;
+
+ for (auto src_tensor : src_tensors)
+ {
+ const auto &src_shape = src_tensor->tensor_shape();
+ const auto &src_type = src_tensor->data_type();
+
+ if (src_shape == dst_shape && src_type == dst_type)
+ {
+ const auto tile_usages_it = tile_usages.find(src_tensor);
+ ARM_COMPUTE_ERROR_ON(tile_usages_it == tile_usages.end());
+
+ if (component_type == GpuComponentType::Simple || tile_usages_it->second > 0)
+ {
+ // Increase the number of tile usages unless this component is an output
+ // and the tile has not been shared with any component.
+ // (Reason: output component doesn't change the content of the tile)
+ ++tile_usages_it->second;
+ }
+
+ possible_tile_map[dst_tensor].push_back(src_tensor);
+ }
+ }
+ }
+ else
+ {
+ // Outputs of complex and unfusable components need dedicated tile.
+ for (auto tensor : dst_tensors)
+ {
+ tile_usages[tensor] = 0;
+ }
+ }
+ }
+
+ // Find the smallest list of tiles that the intermediate tensors need to write to.
+ for (auto tensor : _input_tensors)
+ {
+ _tile_map[tensor] = tensor;
+ }
+
+ for (auto component : _components)
+ {
+ const auto dst_tensors = component->tensors().get_const_dst_tensors();
+
+ for (auto tensor : dst_tensors)
+ {
+ const auto target_tiles = possible_tile_map.at(tensor);
+ _tile_map[tensor] = tensor;
+
+ for (auto target : target_tiles)
+ {
+ const auto num_usage = tile_usages[target];
+
+ if (num_usage <= 1)
+ {
+ // The target tile is consumed by only this operator, so we can reuse it
+ // for the destination tensor data.
+ _tile_map[tensor] = _tile_map.at(target);
+ break;
+ }
+ }
+ }
+ }
+
+ for (auto tensor : output_tensors)
+ {
+ _tile_map[tensor] = tensor;
+ }
+
+ // All intermediate tensors that cannot be shared with any previous tensor
+ // will need to be declared as tile variables.
+ for (auto tensor_tile : _tile_map)
+ {
+ if (tensor_tile.first == tensor_tile.second && _interm_tensors.find(tensor_tile.first) != _interm_tensors.end())
+ {
+ _tiles.push_back(tensor_tile.first);
+ }
+ }
+
+ std::set_union(_input_tensors.begin(), _input_tensors.end(), output_tensors.begin(), output_tensors.end(),
+ std::back_inserter(_argument_tensors));
+ _any_output_tensor = *output_tensors.begin();
+}
+
+std::vector<const ITensorInfo *> GpuKernelComponentGroup::get_tiles() const
+{
+ ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized.");
+ return _tiles;
+}
+
+const ITensorInfo *GpuKernelComponentGroup::get_tile_for_tensor(const ITensorInfo *tensor) const
+{
+ ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized.");
+
+ if (_tile_map.find(tensor) != _tile_map.end())
+ {
+ return _tile_map.at(tensor);
+ }
+
+ return tensor;
+}
+
+const ITensorInfo *GpuKernelComponentGroup::get_any_dst_tensor() const
+{
+ ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized.");
+ return _any_output_tensor;
+}
+
+std::vector<const ITensorInfo *> GpuKernelComponentGroup::get_argument_tensors() const
+{
+ ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized.");
+ return _argument_tensors;
+}
+
+GpuKernelComponentGroup::ComponentPtr GpuKernelComponentGroup::get_root_component() const
+{
+ if (empty())
+ {
+ return nullptr;
+ }
+ return _components[0];
+}
+
+bool GpuKernelComponentGroup::is_intermediate_tensor(const ITensorInfo *tensor) const
+{
+ ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized.");
+ return _interm_tensors.find(tensor) != _interm_tensors.end();
+}
+
+bool GpuKernelComponentGroup::is_input_tensor(const ITensorInfo *tensor) const
+{
+ ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized.");
+ return _input_tensors.find(tensor) != _input_tensors.end();
+}
+
+size_t GpuKernelComponentGroup::size() const
+{
+ return _components.size();
+}
+bool GpuKernelComponentGroup::empty() const
+{
+ return _components.empty();
+}
+GpuKernelComponentGroup::ComponentPtr &GpuKernelComponentGroup::operator[](size_t index)
+{
+ return _components[index];
+}
+const GpuKernelComponentGroup::ComponentPtr &GpuKernelComponentGroup::operator[](size_t index) const
+{
+ return _components[index];
+}
+typename std::vector<GpuKernelComponentGroup::ComponentPtr>::iterator GpuKernelComponentGroup::begin()
+{
+ return _components.begin();
+}
+typename std::vector<GpuKernelComponentGroup::ComponentPtr>::iterator GpuKernelComponentGroup::end()
+{
+ return _components.end();
+}
+typename std::vector<GpuKernelComponentGroup::ComponentPtr>::const_iterator GpuKernelComponentGroup::begin() const
+{
+ return _components.cbegin();
+}
+typename std::vector<GpuKernelComponentGroup::ComponentPtr>::const_iterator GpuKernelComponentGroup::end() const
+{
+ return _components.cend();
+}
+typename std::vector<GpuKernelComponentGroup::ComponentPtr>::const_iterator GpuKernelComponentGroup::cbegin() const
+{
+ return _components.cbegin();
+}
+typename std::vector<GpuKernelComponentGroup::ComponentPtr>::const_iterator GpuKernelComponentGroup::cend() const
+{
+ return _components.cend();
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h
new file mode 100644
index 0000000000..6ad71abb39
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h
@@ -0,0 +1,156 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGROUP
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGROUP
+
+#include "components/Types.h"
+#include <cstdint>
+#include <cstdlib>
+#include <map>
+#include <set>
+#include <vector>
+
+namespace arm_compute
+{
+/** Forward declaration */
+class ITensorInfo;
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class IGpuKernelComponent;
+/** A group of gpu kernel components to be fused together
+ * PRECONDITIONS:
+ * 1. Fusion is limited to a linear sequence of kernel components
+ * INVARIANTS:
+ * @note These preconditions and invariants are exactly the same as fusion constraints for kernel components
+ * 2. Max number of components that can be fused is @ref GpuKernelComponentGroup::max_fused_components (
+ * excluding any output or input (if any) components.
+ * The max number of output components is bounded by the maximum number of dst tensors allowed for a component / component group
+ * )
+ * 3. The fusion is subject to the pattern: (Complex + Simple * | Simple + Simple * | Un-fusable) + Output?
+ * 4. All components but unfusable ones have exactly 1 dst tensor
+ * 5. All fused components share the same @ref IGpuKernelComponent::Properties ( @ref UnitWorkloadStage etc. )
+ * 6. All fused components share the same tunable parameters like tile size
+ * 7. All fused components share the same dst tensor shape
+ * 8. All fused components' tensors share the same @ref DataLayout
+ * 9. Maximum number of dst tensors allowed for a component (including unfusable) / component group is @ref GpuKernelComponentGroup::max_dst_tensors
+ * This has an impact on the total number of components supported, which is max_fused_components + max_dst_tensors
+ */
+class GpuKernelComponentGroup
+{
+public:
+ using ComponentPtr = IGpuKernelComponent *;
+ /** Maximum number of components that can be fused into the same component group
+ */
+ static constexpr size_t max_fused_components = 64;
+ /** Maximum number of dst tensors allowed for a component / component group
+ */
+ static constexpr size_t max_dst_tensors = 8;
+
+public:
+ /** Default constructor */
+ GpuKernelComponentGroup() = default;
+ /** Allow instances of this class to be copy constructed */
+ GpuKernelComponentGroup(const GpuKernelComponentGroup &) = default;
+ /** Allow instances of this class to be copied */
+ GpuKernelComponentGroup &operator=(const GpuKernelComponentGroup &) = default;
+ /** Allow instances of this class to be move constructed */
+ GpuKernelComponentGroup(GpuKernelComponentGroup &&) = default;
+ /** Allow instances of this class to be moved */
+ GpuKernelComponentGroup &operator=(GpuKernelComponentGroup &&) = default;
+ /** Add a component pointer into the group
+ * If the operation fails, then no change is made to the group
+ *
+ * @param[in] component Pointer to the component to be added
+ *
+ * @return true If the operation is successful
+ * @return false If the operation fails
+ */
+ bool add_component(ComponentPtr component);
+ /** Optimize and pre-compute information about the component group */
+ void finalize();
+ /** Get one of the destination tensors of this group */
+ const ITensorInfo *get_any_dst_tensor() const;
+ /** Get the tensor arguments of this group
+ * A tensor is an argument if it is a source or destination tensor of the group
+ */
+ std::vector<const ITensorInfo *> get_argument_tensors() const;
+ /** Get the root (first) component of this group */
+ ComponentPtr get_root_component() const;
+ /** Check if a @ref ITensorInfo is an "intermediate" tensor of the group
+ *
+ * An intermediate tensor is any tensor that is not an argument.
+ *
+ * @param[in] tensor @ref ITensorInfo to be looked up
+ *
+ * @return true If @p tensor is an intermediate tensor
+ * @return false Otherwise
+ */
+ bool is_intermediate_tensor(const ITensorInfo *tensor) const;
+ /** Check if an @ref ITensorInfo is an input tensor of the group.
+ *
+ * @param[in] tensor @ref ITensorInfo to be looked up.
+ *
+ * @return true if @p tensor is an input tensor of the group, otherwise false.
+ */
+ bool is_input_tensor(const ITensorInfo *tensor) const;
+ /** Get the list of temporary tiles that need to be declared */
+ std::vector<const ITensorInfo *> get_tiles() const;
+ /** Get the shared tile that can be used to store temporary data of the specified tensor.
+ *
+ * @param[in] tensor @ref ITensorInfo to be looked up.
+ *
+ * @return @ref ITensorInfo that is used to store temporary data of @p tensor.
+ **/
+ const ITensorInfo *get_tile_for_tensor(const ITensorInfo *tensor) const;
+ /** Get the number of components within the group */
+ size_t size() const;
+ /** Check if the component group is empty */
+ bool empty() const;
+ ComponentPtr &operator[](size_t index);
+ const ComponentPtr &operator[](size_t index) const;
+ typename std::vector<ComponentPtr>::iterator begin();
+ typename std::vector<ComponentPtr>::iterator end();
+ typename std::vector<ComponentPtr>::const_iterator begin() const;
+ typename std::vector<ComponentPtr>::const_iterator end() const;
+ typename std::vector<ComponentPtr>::const_iterator cbegin() const;
+ typename std::vector<ComponentPtr>::const_iterator cend() const;
+
+private:
+ std::vector<ComponentPtr> _components{};
+
+ bool _finalized{false};
+
+ std::vector<const ITensorInfo *> _argument_tensors{};
+ std::set<const ITensorInfo *> _input_tensors{};
+ std::set<const ITensorInfo *> _interm_tensors{};
+ const ITensorInfo *_any_output_tensor{nullptr};
+ std::vector<const ITensorInfo *> _tiles{};
+ std::map<const ITensorInfo *, const ITensorInfo *> _tile_map{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGROUP */
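The numbered constraints in the class comment are checked one by one inside add_component(), which is why callers test its return value before finalizing, as GpuKernelComponentGraph::fuse() does. A small usage sketch, with the component pointers assumed to come from an already-built component graph:

    GpuKernelComponentGroup group{};
    if (group.add_component(root_component) && group.add_component(activation_component))
    {
        group.finalize(); // precompute argument/intermediate tensor sets and tile reuse
        const std::vector<const ITensorInfo *> kernel_args = group.get_argument_tensors();
        const std::vector<const ITensorInfo *> tiles       = group.get_tiles();
    }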
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp
new file mode 100644
index 0000000000..8042e3dd08
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "GpuKernelComponentStream.h"
+
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+GpuKernelComponentStream::GpuKernelComponentStream(GpuWorkloadContext *context,
+ GpuComponentServices *services,
+ const MemoryDescriptorMap &mem_map)
+ : _context{context}, _services{services}, _component_groups{}, _mem_map{mem_map}
+{
+}
+
+GpuWorkloadSourceCode GpuKernelComponentStream::write_workload_code()
+{
+ GpuWorkloadSourceCode source_code;
+ // Traverse through component groups and assemble workload together
+ for (auto &&group : _component_groups)
+ {
+ group.finalize();
+
+ // Write kernel code
+ GpuLogicalKernel logical_kernel(_services, group);
+ const GpuKernelSourceCode kernel_code = logical_kernel.write_kernel_code();
+ // The whole unit workload stage is determined by the root component
+ const auto unit_workload_stage = group.get_root_component()->properties().stage();
+ source_code.add_unit_workload(kernel_code, unit_workload_stage, _mem_map, _context);
+ }
+ return source_code;
+}
+
+void GpuKernelComponentStream::new_component_group()
+{
+ _component_groups.emplace_back();
+}
+
+bool GpuKernelComponentStream::add_component(IGpuKernelComponent *component)
+{
+ ARM_COMPUTE_ERROR_ON(_component_groups.empty());
+ return _component_groups.back().add_component(component);
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h
new file mode 100644
index 0000000000..ef8a8a15b0
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTSTREAM
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTSTREAM
+
+#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h"
+
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class GpuComponentServices;
+class IGpuKernelComponent;
+
+/** A linear sequence of component groups serialized from the @ref GpuKernelComponentGraph
+ * Each component group in the stream denotes a complete kernel that may consist of multiple components
+ *
+ * The main purposes of this class are:
+ * - Facilitate component fusion algorithm by allowing insertions of new component groups into the stream
+ * - Invoke kernel writer and assemble the final @ref GpuWorkloadSourceCode
+ */
+class GpuKernelComponentStream
+{
+public:
+ /** Constructor
+ *
+ * @param[in] context @ref GpuWorkloadContext to be used throughout the stream
+ * @param[in] services @ref GpuComponentServices to be used throughout the stream
+ * @param[in] mem_map @ref MemoryDescriptor map used to assemble the @ref GpuWorkloadSourceCode
+ */
+ GpuKernelComponentStream(GpuWorkloadContext *context,
+ GpuComponentServices *services,
+ const MemoryDescriptorMap &mem_map);
+ /** Allow instances of this class to be copy constructed */
+ GpuKernelComponentStream(const GpuKernelComponentStream &stream) = default;
+ /** Allow instances of this class to be copied */
+ GpuKernelComponentStream &operator=(const GpuKernelComponentStream &stream) = default;
+ /** Allow instances of this class to be move constructed */
+ GpuKernelComponentStream(GpuKernelComponentStream &&stream) = default;
+ /** Allow instances of this class to be moved */
+ GpuKernelComponentStream &operator=(GpuKernelComponentStream &&stream) = default;
+ /** Generate and assemble @ref GpuWorkloadSourceCode from the stream */
+ GpuWorkloadSourceCode write_workload_code();
+ /** Insert a new component group in the stream.
+ * Subsequent components are added to this group until the end of the stream or until new_component_group() is called again
+ */
+ void new_component_group();
+ /** Add a component to the previously created component group
+ * Throw an error if no component group is present in the stream
+ *
+ * @param[in] component Component to be inserted
+ *
+ * @return true If the operation is successful
+ * @return false Otherwise
+ */
+ bool add_component(IGpuKernelComponent *component);
+
+private:
+ GpuWorkloadContext *_context;
+ GpuComponentServices *_services;
+ std::vector<GpuKernelComponentGroup> _component_groups{};
+ MemoryDescriptorMap _mem_map{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTSTREAM */
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h b/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h
new file mode 100644
index 0000000000..11d916eec9
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELSOURCECODE_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELSOURCECODE_H
+
+#include "arm_compute/core/CL/CLCompileContext.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+
+#include <deque>
+#include <string>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** The argument list of a @ref GpuKernelSourceCode */
+using GpuKernelArgumentList = std::deque<GpuKernelArgumentBinding>;
+
+/** Container of kernel code to be compiled and run in a @ref GpuUnitWorkload
+ */
+class GpuKernelSourceCode
+{
+public:
+ /** Set kernel name */
+ GpuKernelSourceCode &name(const std::string &n)
+ {
+ _name = n;
+ return *this;
+ }
+ /** Set kernel code */
+ GpuKernelSourceCode &code(const std::string &c)
+ {
+ _code = c;
+ return *this;
+ }
+ /** Set kernel config id string */
+ GpuKernelSourceCode &config_id(const std::string &c_id)
+ {
+ _config_id = c_id;
+ return *this;
+ }
+ /** Set kernel build options */
+ GpuKernelSourceCode &build_options(const CLBuildOptions &b_options)
+ {
+ _build_options = b_options;
+ return *this;
+ }
+ /** Set kernel execution window */
+ GpuKernelSourceCode &window(const Window &window)
+ {
+ _window = window;
+ return *this;
+ }
+ /** Set kernel argument list */
+ GpuKernelSourceCode &arguments(const GpuKernelArgumentList &arguments)
+ {
+ _arguments = arguments;
+ return *this;
+ }
+ /** Get kernel name */
+ std::string name() const
+ {
+ return _name;
+ }
+ /** Get kernel code */
+ std::string code() const
+ {
+ return _code;
+ }
+ /** Get kernel config id string */
+ std::string config_id() const
+ {
+ return _config_id;
+ }
+ /** Get kernel build options */
+ const CLBuildOptions &build_options() const
+ {
+ return _build_options;
+ }
+ /** Get kernel execution window */
+ const Window &window() const
+ {
+ return _window;
+ }
+ /** Get kernel argument list */
+ const GpuKernelArgumentList &arguments() const
+ {
+ return _arguments;
+ }
+
+private:
+ std::string _name{};
+ std::string _code{};
+ std::string _config_id{};
+ CLBuildOptions _build_options{};
+ Window _window{};
+ GpuKernelArgumentList _arguments{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELSOURCECODE_H
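Since every setter returns *this, a kernel writer can populate a GpuKernelSourceCode in a single chained expression. A sketch with placeholder values; in practice the values are produced by a kernel writer, as GpuLogicalKernel::write_kernel_code() in the next file shows:

    GpuKernelSourceCode code{};
    code.name("example_fused_kernel")            // placeholder kernel name
        .code("/* generated OpenCL C source */") // placeholder source string
        .config_id("example_config_id")
        .build_options(CLBuildOptions{})
        .window(Window{})
        .arguments(GpuKernelArgumentList{});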
diff --git a/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp b/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp
new file mode 100644
index 0000000000..725a46e91c
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "GpuLogicalKernel.h"
+
+#include "arm_compute/core/experimental/Types.h"
+
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h"
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuComponentServices.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+GpuLogicalKernel::GpuLogicalKernel(GpuComponentServices *services, GpuKernelComponentGroup components) // NOLINT
+ : _comp_group{std::move(components)}, _store_components{}
+{
+ ARM_COMPUTE_UNUSED(services);
+}
+
+GpuKernelSourceCode GpuLogicalKernel::write_kernel_code()
+{
+ GpuKernelSourceCode code;
+ GpuCkwDriver writer{_comp_group};
+
+ code.name(writer.get_name());
+ code.code(writer.get_code());
+ code.arguments(writer.get_kernel_arguments());
+ code.build_options(writer.get_build_options());
+ code.config_id(writer.get_config_id());
+ code.window(writer.get_window());
+
+ return code;
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.h b/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.h
new file mode 100644
index 0000000000..e2bc83b286
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2022, 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPULOGICALKERNEL_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPULOGICALKERNEL_H
+
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h"
+
+#include <memory>
+#include <vector>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declarations */
+class GpuComponentServices;
+class IGpuKernelComponent;
+
+/** A wrapper-processor of a @ref GpuKernelComponentGroup
+ * It adds the load (if any) and store components to the component group
+ * The @ref GpuLogicalKernel represents a complete kernel, and can proceed to invoke any kernel writer to generate the full kernel code
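+ *
+ * Illustrative usage (sketch only; assumes a prepared @ref GpuComponentServices and @ref GpuKernelComponentGroup):
+ * @code
+ * GpuLogicalKernel logical_kernel(&services, component_group);
+ * GpuKernelSourceCode kernel_code = logical_kernel.write_kernel_code();
+ * @endcode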
+ */
+class GpuLogicalKernel
+{
+public:
+ /** Constructor
+ *
+ * @param[in] services @ref GpuComponentServices to be used
+ * @param[in] components Component group from which this logical kernel is initialized
+ */
+ explicit GpuLogicalKernel(GpuComponentServices *services, GpuKernelComponentGroup components); // NOLINT
+ /** Allow instances of this class to be copy constructed */
+ GpuLogicalKernel(const GpuLogicalKernel &) = default;
+ /** Allow instances of this class to be copied */
+ GpuLogicalKernel &operator=(const GpuLogicalKernel &) = default;
+ /** Allow instances of this class to be move constructed */
+ GpuLogicalKernel(GpuLogicalKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ GpuLogicalKernel &operator=(GpuLogicalKernel &&) = default;
+ /** Generate a @ref GpuKernelSourceCode */
+ GpuKernelSourceCode write_kernel_code();
+
+private:
+ GpuKernelComponentGroup _comp_group{};
+ std::vector<std::unique_ptr<IGpuKernelComponent>> _store_components{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPULOGICALKERNEL_H
diff --git a/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp
new file mode 100644
index 0000000000..aec8b9db4f
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h"
+
+#include "arm_compute/core/Validate.h"
+
+#include <algorithm>
+#include <iterator>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+std::vector<DependencyGraph::TensorId> get_tensor_ids(const std::vector<const ITensorInfo *> &tensors)
+{
+ std::vector<DependencyGraph::TensorId> tensor_ids{};
+ std::transform(std::begin(tensors), std::end(tensors), std::back_inserter(tensor_ids),
+ [](const auto &t) { return t->id(); });
+ return tensor_ids;
+}
+
+} // namespace
+
+Operator::Operator(OperatorId id, GpuOperatorType operator_type, const ArgumentPack<ITensorInfo> &tensors)
+ : _id{id}, _operator_type{operator_type}, _tensors{tensors}
+{
+}
+
+OperatorId Operator::id() const
+{
+ return _id;
+}
+
+GpuOperatorType Operator::operator_type() const
+{
+ return _operator_type;
+}
+
+ArgumentPack<ITensorInfo> Operator::tensors() const
+{
+ return _tensors;
+}
+
+bool GpuOperatorGroup::try_add_operator(const Operator &op, bool is_output) const
+{
+ const auto src_tensor_ids = get_tensor_ids(op.tensors().get_const_src_tensors());
+ const auto dst_tensor_ids = get_tensor_ids(op.tensors().get_const_dst_tensors());
+    // Constraint 1: the fused operators must form a linear sequence in the dependency graph
+ if (!_graph.try_add_operator_as_linear(op.id(), src_tensor_ids, dst_tensor_ids, is_output))
+ {
+ return false;
+ }
+    // Constraint 2: the number of fused operators must not exceed max_fused_operators
+ if (_operators.size() >= max_fused_operators)
+ {
+ return false;
+ }
+    // Constraint 3.1: an Unfusable root operator cannot be fused with any other operator
+ if (_operators.size() > 0 && get_root_operator()->operator_type() == GpuOperatorType::Unfusable)
+ {
+ return false;
+ }
+    // Constraint 3.2: only Simple operators can be fused onto an existing root (pattern: Complex + Simple* | Simple + Simple*)
+ if (_operators.size() > 0 && (op.operator_type() != GpuOperatorType::Simple))
+ {
+ return false;
+ }
+    // Constraint 4: all operators except Unfusable ones must have exactly 1 dst tensor
+ if (op.operator_type() != GpuOperatorType::Unfusable && op.tensors().get_const_dst_tensors().size() != 1U)
+ {
+ return false;
+ }
+    // Constraint 5: all fused operators must share the same dst tensor shape
+ if (_operators.size() > 0)
+ {
+ const auto root_dst_tensors = get_root_operator()->tensors().get_const_dst_tensors();
+ ARM_COMPUTE_ERROR_ON(root_dst_tensors.empty());
+ const auto first_dst_tensor = root_dst_tensors[0];
+ const auto dst_tensors = op.tensors().get_const_dst_tensors();
+ for (const auto &t : root_dst_tensors)
+ {
+ if (detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0))
+ {
+ return false;
+ }
+ }
+ for (const auto &t : dst_tensors)
+ {
+ if (detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0))
+ {
+ return false;
+ }
+ }
+ }
+    // Constraint 6: all fused operators' dst tensors must share the same data layout
+ if (_operators.size() > 0)
+ {
+ const auto root_dst_tensors = get_root_operator()->tensors().get_const_dst_tensors();
+ ARM_COMPUTE_ERROR_ON(root_dst_tensors.empty());
+ const auto first_dst_tensor_layout = root_dst_tensors[0]->data_layout();
+ const auto dst_tensors = op.tensors().get_const_dst_tensors();
+ for (const auto &t : root_dst_tensors)
+ {
+ if (t->data_layout() != first_dst_tensor_layout)
+ {
+ return false;
+ }
+ }
+ for (const auto &t : dst_tensors)
+ {
+ if (t->data_layout() != first_dst_tensor_layout)
+ {
+ return false;
+ }
+ }
+ }
+ return true;
+}
+void GpuOperatorGroup::add_operator(const Operator &op, bool is_output)
+{
+ ARM_COMPUTE_ERROR_ON(!try_add_operator(op, is_output));
+ const auto src_tensor_ids = get_tensor_ids(op.tensors().get_const_src_tensors());
+ const auto dst_tensor_ids = get_tensor_ids(op.tensors().get_const_dst_tensors());
+ _graph.add_operator_as_linear(op.id(), src_tensor_ids, dst_tensor_ids, is_output);
+ _operators[op.id()] = op;
+}
+Operator GpuOperatorGroup::new_operator(const GpuOperatorType &operator_type,
+ const ArgumentPack<ITensorInfo> &tensors) const
+{
+ auto new_id = static_cast<OperatorId>(_operators.size());
+ return Operator{new_id, operator_type, tensors};
+}
+const Operator *GpuOperatorGroup::get_root_operator() const
+{
+ const auto roots = _graph.get_root_ops();
+ ARM_COMPUTE_ERROR_ON(roots.size() > 1);
+ if (roots.empty())
+ {
+ return nullptr;
+ }
+ return &_operators.at(roots[0]);
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h
new file mode 100644
index 0000000000..0a2369d357
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUOPERATORGROUP
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUOPERATORGROUP
+
+#include "arm_compute/core/ITensorInfo.h"
+
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuOperatorProperties.h"
+#include "src/dynamic_fusion/sketch/utils/DependencyGraph.h"
+
+#include <map>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+using OperatorId = DependencyGraph::OperatorId;
+
+/** An operator for the sole purpose of validating fusion
+ */
+class Operator
+{
+public:
+ /** Default constructor */
+ Operator() = default;
+ /** Get Operator Id */
+ OperatorId id() const;
+ /** Get operator type */
+ GpuOperatorType operator_type() const;
+ /** Get tensor arguments */
+ ArgumentPack<ITensorInfo> tensors() const;
+ friend class GpuOperatorGroup;
+
+private:
+ Operator(OperatorId id, GpuOperatorType operator_type, const ArgumentPack<ITensorInfo> &tensors);
+ OperatorId _id{};
+ GpuOperatorType _operator_type{};
+ ArgumentPack<ITensorInfo> _tensors{};
+};
+
+/** A linear sequence of operators to be fused in a workload
+ * For the time being, this class is only used for validating operator fusion
+ * INVARIANTS:
+ * @note These invariants are exactly the same as operator fusion constraints
+ * 1. Fusion is limited to a linear sequence of operators
+ * 2. Max number of operators that can be fused is @ref GpuOperatorGroup::max_fused_operators
+ * 3. The fusion is subject to the pattern: Complex + Simple* | Simple + Simple* | Unfusable
+ * 4. All operators, except Unfusable ones, have exactly 1 dst tensor
+ * 5. All fused operators share the same dst tensor shape
+ * 6. All fused operators' tensors share the same @ref DataLayout
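+ *
+ * Illustrative usage (sketch only; the @ref ArgumentPack objects conv_tensors and act_tensors are assumed to be prepared by the caller):
+ * @code
+ * GpuOperatorGroup group{};
+ * const auto conv = group.new_operator(GpuOperatorType::Complex, conv_tensors);
+ * if (group.try_add_operator(conv))
+ * {
+ *     group.add_operator(conv);
+ * }
+ * const auto act = group.new_operator(GpuOperatorType::Simple, act_tensors);
+ * if (group.try_add_operator(act, true))
+ * {
+ *     group.add_operator(act, true);
+ * }
+ * @endcode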
+ */
+class GpuOperatorGroup
+{
+public:
+ static constexpr size_t max_fused_operators = 32;
+    /** Check whether an operator can be added to the group, without actually adding it
+ *
+ * @param[in] op Operator to be added
+ * @param[in] is_output Whether this operator is the output operator.
+ *
+ * @return true If @p op can be added while maintaining the invariants
+ * @return false Otherwise
+ */
+ bool try_add_operator(const Operator &op, bool is_output = false) const;
+ /** Add an operator to the group
+ *
+ * @param[in] op Operator to be added
+ * @param[in] is_output Whether this operator is the output operator.
+ */
+ void add_operator(const Operator &op, bool is_output = false);
+ /** Create a new operator
+ *
+ * @param[in] operator_type @ref GpuOperatorType of the new operator
+ * @param[in] tensors Tensor arguments to the new operator
+ *
+ * @return Operator
+ */
+ Operator new_operator(const GpuOperatorType &operator_type, const ArgumentPack<ITensorInfo> &tensors) const;
+ /** Get the "root operator" of the group, which is the first operator in a linear sequence
+ * @return const Operator* Pointer to the root operator
+ */
+ const Operator *get_root_operator() const;
+
+private:
+ DependencyGraph _graph{};
+ std::map<OperatorId, Operator> _operators{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUOPERATORGROUP */
diff --git a/src/dynamic_fusion/sketch/gpu/GpuOperatorProperties.h b/src/dynamic_fusion/sketch/gpu/GpuOperatorProperties.h
new file mode 100644
index 0000000000..c77697c343
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuOperatorProperties.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUOPERATORPROPERTIES
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUOPERATORPROPERTIES
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Contain properties common to all operator types */
+
+/** Operator type in the context of fusion
+ */
+enum class GpuOperatorType
+{
+ /** Simple operators are operators that:
+ * 1. Have a 1-to-1 mapping between the input elements and output elements, like elementwise
+ * 2. Have exactly 1 output
+ */
+ Simple,
+ /** Complex operators are operators that are not simple but are still fusable with simple ones
+ */
+ Complex,
+ /** Unfusable operators are operators that cannot be fused with any other types of operators
+ */
+ Unfusable
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUOPERATORPROPERTIES */
diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp b/src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp
new file mode 100644
index 0000000000..fab18aabb4
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h"
+
+#include "arm_compute/core/CL/CLCompileContext.h"
+
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+GpuWorkloadContext::GpuWorkloadContext(CLCompileContext *cl_compile_ctx)
+ : _impl{std::make_unique<Impl>(GpuLanguage::OpenCL, cl_compile_ctx)}
+{
+}
+
+GpuWorkloadContext::~GpuWorkloadContext() = default;
+
+GpuWorkloadContext::GpuWorkloadContext(GpuWorkloadContext &&other) = default;
+
+GpuWorkloadContext &GpuWorkloadContext::operator=(GpuWorkloadContext &&other) = default;
+
+GpuTarget GpuWorkloadContext::gpu_target() const
+{
+ return _impl->cl_compile_context()->get_gpu_target();
+}
+
+GpuLanguage GpuWorkloadContext::gpu_language() const
+{
+ return _impl->gpu_language();
+}
+
+const CLCompileContext *GpuWorkloadContext::cl_compile_context() const
+{
+ return _impl->cl_compile_context();
+}
+
+void GpuWorkloadContext::register_user_tensor(std::unique_ptr<TensorInfo> &&tensor_info)
+{
+ _impl->register_user_tensor(std::move(tensor_info));
+}
+
+GpuWorkloadContext::Impl &GpuWorkloadContext::implementation()
+{
+ return *_impl;
+}
+
+const GpuWorkloadContext::Impl &GpuWorkloadContext::implementation() const
+{
+ return *_impl;
+}
+
+GpuWorkloadContext::Impl::Impl(GpuLanguage gpu_language, CLCompileContext *cl_compile_ctx)
+ : _gpu_language(gpu_language),
+ _cl_compile_ctx(cl_compile_ctx),
+ _next_tensor_id(1),
+ _mem_map(),
+ _managed_tensor_info()
+{
+}
+
+GpuLanguage GpuWorkloadContext::Impl::gpu_language() const
+{
+ return _gpu_language;
+}
+
+const CLCompileContext *GpuWorkloadContext::Impl::cl_compile_context() const
+{
+ return _cl_compile_ctx;
+}
+
+const MemoryDescriptorMap &GpuWorkloadContext::Impl::mem_map() const
+{
+ return _mem_map;
+}
+
+void GpuWorkloadContext::Impl::register_user_tensor(std::unique_ptr<TensorInfo> &&tensor_info)
+{
+ ARM_COMPUTE_ERROR_ON(tensor_info->has_valid_id());
+
+ const auto tensor_id = next_tensor_id();
+
+ tensor_info->set_id(tensor_id);
+ _mem_map[tensor_id] = MemoryDescriptor{MemoryType::User};
+    // Store the tensor info in the workload context for future reference
+    // Note that the context takes ownership of @p tensor_info, as documented on register_user_tensor()
+ _managed_tensor_info.emplace(tensor_info->id(), std::move(tensor_info));
+}
+
+ITensorInfo *GpuWorkloadContext::Impl::create_virtual_tensor()
+{
+ auto tensor_info = std::make_unique<TensorInfo>();
+ const auto tensor_id = -next_tensor_id();
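+    // Virtual tensors are assigned negative ids so they can never collide with user/auxiliary tensors, which use positive ids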
+ tensor_info->set_id(tensor_id);
+ _mem_map[tensor_id] = MemoryDescriptor{MemoryType::Virtual};
+ auto inserted = _managed_tensor_info.emplace(tensor_info->id(), std::move(tensor_info));
+ return inserted.first->second.get();
+}
+
+ITensorInfo *GpuWorkloadContext::Impl::create_auxiliary_tensor(const ITensorInfo &itensor_info)
+{
+ auto tensor_info = std::make_unique<TensorInfo>(itensor_info);
+ const auto tensor_id = next_tensor_id();
+ tensor_info->set_id(tensor_id);
+ _mem_map[tensor_id] = MemoryDescriptor{MemoryType::Auxiliary, AuxMemoryInfo{tensor_info->total_size()}};
+ auto inserted = _managed_tensor_info.emplace(tensor_info->id(), std::move(tensor_info));
+ return inserted.first->second.get();
+}
+
+ITensorInfo *GpuWorkloadContext::Impl::get_tensor_info(ITensorInfo::Id id)
+{
+ return _managed_tensor_info.at(id).get();
+}
+
+const ITensorInfo *GpuWorkloadContext::Impl::get_tensor_info(ITensorInfo::Id id) const
+{
+ return _managed_tensor_info.at(id).get();
+}
+
+ITensorInfo::Id GpuWorkloadContext::Impl::next_tensor_id()
+{
+ return _next_tensor_id++;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h b/src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h
new file mode 100644
index 0000000000..b3571a6480
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADCONTEXTIMPL_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADCONTEXTIMPL_H
+
+#include "arm_compute/core/CL/CLCompileContext.h"
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h"
+#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Internal implementation of workload context. */
+class GpuWorkloadContext::Impl
+{
+public:
+ /** Constructor
+ *
+ * @param[in] gpu_language Target GPU language.
+ * @param[in] cl_compile_ctx CL compile context.
+ */
+ Impl(GpuLanguage gpu_language, CLCompileContext *cl_compile_ctx);
+
+ /** Copy constructor */
+ Impl(Impl &) = default;
+
+ /** Assignment */
+ Impl &operator=(Impl &) = default;
+
+ /** Get target GPU language. */
+ GpuLanguage gpu_language() const;
+
+ /** Get CL compile context. */
+ const CLCompileContext *cl_compile_context() const;
+
+ /** Get memory descriptor registry. */
+ const MemoryDescriptorMap &mem_map() const;
+
+ /** Set a new ID and register the user tensor info.
+ *
+     * The ownership of the tensor info object will be transferred to this context object.
+ *
+ * @param[in] tensor_info The tensor info to be registered.
+ */
+ void register_user_tensor(std::unique_ptr<TensorInfo> &&tensor_info);
+
+ /** Create a virtual (see @ref MemoryType) tensor info and save it
+ *
+ * @return ITensorInfo* The created virtual tensor info object pointer
+ */
+ ITensorInfo *create_virtual_tensor();
+ /** Create an auxiliary (see @ref MemoryType) tensor info and save it
+ *
+ * @param[in] tensor_info @ref ITensorInfo to copy from
+ *
+ * @return ITensorInfo* The created auxiliary tensor info object pointer
+ */
+ ITensorInfo *create_auxiliary_tensor(const ITensorInfo &tensor_info);
+
+ /** Get tensor info created by this context, from id */
+ ITensorInfo *get_tensor_info(ITensorInfo::Id id);
+
+ /** Get tensor info created by this context, from id */
+ const ITensorInfo *get_tensor_info(ITensorInfo::Id id) const;
+
+private:
+ ITensorInfo::Id next_tensor_id();
+
+ GpuLanguage _gpu_language;
+ CLCompileContext *_cl_compile_ctx;
+
+ ITensorInfo::Id _next_tensor_id;
+ MemoryDescriptorMap _mem_map;
+ std::map<ITensorInfo::Id, std::unique_ptr<TensorInfo>> _managed_tensor_info;
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADCONTEXTIMPL_H
diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp
new file mode 100644
index 0000000000..357cb48a84
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+
+GpuWorkloadSketch::GpuWorkloadSketch(Context *context) : _impl{std::make_unique<Implementation>(context)}
+{
+}
+
+GpuWorkloadSketch::~GpuWorkloadSketch()
+{
+}
+
+GpuWorkloadSketch::GpuWorkloadSketch(GpuWorkloadSketch &&) = default;
+
+GpuWorkloadSketch &GpuWorkloadSketch::operator=(GpuWorkloadSketch &&) = default;
+
+const GpuWorkloadSketch::Context *GpuWorkloadSketch::gpu_context() const
+{
+ return _impl->context();
+}
+
+GpuWorkloadSketch::Context *GpuWorkloadSketch::gpu_context()
+{
+ return _impl->context();
+}
+
+GpuWorkloadSketch::Implementation &GpuWorkloadSketch::implementation()
+{
+ return *_impl;
+}
+
+const GpuWorkloadSketch::Implementation &GpuWorkloadSketch::implementation() const
+{
+ return *_impl;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h
new file mode 100644
index 0000000000..04e294eacc
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSKETCHIMPL_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSKETCHIMPL_H
+
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h"
+
+#include "src/dynamic_fusion/sketch/gpu/GpuComponentServices.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Internal implementation of @ref GpuWorkloadSketch */
+class GpuWorkloadSketch::Implementation
+{
+public:
+ /** Constructor
+ *
+ * @param[in] context global workload creation context
+ */
+ explicit Implementation(Context *context)
+ : _context{context}, _comp_services{}, _component_graph{_context, &_comp_services}, _operator_group{}
+ {
+ }
+ /** Prevent instances of this class from being copy constructed */
+ Implementation(const Implementation &impl) = delete;
+ /** Prevent instances of this class from being copied */
+ Implementation &operator=(const Implementation &impl) = delete;
+ /** Allow instances of this class to be move constructed */
+ Implementation(Implementation &&impl) = default;
+ /** Allow instances of this class to be moved */
+ Implementation &operator=(Implementation &&impl) = default;
+ /** Get workload context */
+ const Context *context() const
+ {
+ return _context;
+ }
+ /** Get workload context */
+ Context *context()
+ {
+ return _context;
+ }
+ /** Get component graph */
+ const GpuKernelComponentGraph &component_graph() const
+ {
+ return _component_graph;
+ }
+ /** Get component graph */
+ GpuKernelComponentGraph &component_graph()
+ {
+ return _component_graph;
+ }
+ /** Get operator group */
+ const GpuOperatorGroup &operator_group() const
+ {
+ return _operator_group;
+ }
+ /** Get operator group */
+ GpuOperatorGroup &operator_group()
+ {
+ return _operator_group;
+ }
+ /** Generate @ref GpuWorkloadSourceCode from the workload sketch
+     * @note The sketch must be valid. Any error encountered during code generation will be thrown as an exception.
+ *
+ * @return GpuWorkloadSourceCode The generated workload code
+ */
+ GpuWorkloadSourceCode generate_source_code() const
+ {
+ const auto mem_map = _context->implementation().mem_map();
+ return component_graph().fuse(mem_map).write_workload_code();
+ }
+ /** Create a virtual (see @ref MemoryType) tensor info and save it
+ *
+ * @return ITensorInfo* The created virtual tensor info object pointer
+ */
+ ITensorInfo *create_virtual_tensor()
+ {
+ return _context->implementation().create_virtual_tensor();
+ }
+ /** Create an auxiliary (see @ref MemoryType) tensor info and save it
+ *
+ * @param[in] tensor_info @ref ITensorInfo to copy from
+ *
+ * @return ITensorInfo* The created auxiliary tensor info object pointer
+ */
+ ITensorInfo *create_auxiliary_tensor(const ITensorInfo &tensor_info)
+ {
+ return _context->implementation().create_auxiliary_tensor(tensor_info);
+ }
+
+ ITensorInfo *get_tensor_info(ITensorInfo::Id id)
+ {
+ return _context->implementation().get_tensor_info(id);
+ }
+
+private:
+ Context *_context;
+ GpuComponentServices _comp_services;
+ GpuKernelComponentGraph _component_graph;
+ GpuOperatorGroup _operator_group;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSKETCHIMPL_H
diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h
new file mode 100644
index 0000000000..5d75bcaaa0
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h
@@ -0,0 +1,300 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSOURCECODE_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSOURCECODE_H
+
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h"
+
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+/** Extract the kernel arguments of a single tensor from a flat list of kernel arguments.
+ *
+ * The arguments belonging to the first tensor in the list are removed from @p flat_kernel_args and returned.
+ *
+ * @param[in,out] flat_kernel_args Flat list of kernel arguments; the extracted arguments are popped from its front
+ *
+ * @return GpuKernelArgumentList Kernel arguments of the first tensor in @p flat_kernel_args
+ */
+GpuKernelArgumentList extract_kernel_args_for_one_tensor(GpuKernelArgumentList &flat_kernel_args)
+{
+ if (flat_kernel_args.empty())
+ {
+ return {};
+ }
+ GpuKernelArgumentList tensor_kargs{};
+
+ const GpuKernelArgumentBinding &karg_head = flat_kernel_args.front();
+ tensor_kargs.push_back(karg_head);
+ flat_kernel_args.pop_front();
+ const auto tensor_id = karg_head.id();
+
+ while (!flat_kernel_args.empty())
+ {
+ const GpuKernelArgumentBinding &karg = flat_kernel_args.front();
+ if (karg.id() != tensor_id) // Encounter the next tensor, return the current tensor's kernel arguments
+ {
+ return tensor_kargs;
+ }
+ tensor_kargs.push_back(karg);
+ flat_kernel_args.pop_front();
+ }
+ return tensor_kargs;
+}
+} // namespace
+/** Uniquely identifies a @ref GpuUnitWorkload within a @ref GpuWorkloadSourceCode */
+using UnitWorkloadId = int32_t;
+
+/** Describes all the info related to a **workload argument** (tensor) in order to:
+ * - be used by runtime to configure gpu kernel argument
+ * - be used by memory managers to allocate required memory
+ */
+class GpuWorkloadArgument
+{
+public:
+ /** Default constructor */
+ GpuWorkloadArgument() = default;
+ /** Constructor
+ *
+ * @param[in] tensor_info @ref ITensorInfo of the workload argument
+ * @param[in] mem_desc @ref MemoryDescriptor of the workload argument
+ * @param[in] kernel_args @ref GpuKernelArgumentList of the workload argument
+ */
+ GpuWorkloadArgument(const ITensorInfo &tensor_info,
+ const MemoryDescriptor &mem_desc,
+ const GpuKernelArgumentList &kernel_args)
+ : _tensor_info{tensor_info}, _mem_desc{mem_desc}, _kernel_args{kernel_args}
+ {
+ }
+ /** Get tensor id within workload */
+ ITensorInfo::Id id() const
+ {
+ return _tensor_info.id();
+ }
+ /** Get @ref ITensorInfo of the argument */
+ ITensorInfo *tensor_info()
+ {
+ return &_tensor_info;
+ }
+ /** Get @ref ITensorInfo of the argument */
+ const ITensorInfo *tensor_info() const
+ {
+ return &_tensor_info;
+ }
+ /** Get @ref MemoryDescriptor of the argument */
+ MemoryDescriptor *memory_descriptor()
+ {
+ return &_mem_desc;
+ }
+ /** Get @ref MemoryDescriptor of the argument */
+ const MemoryDescriptor *memory_descriptor() const
+ {
+ return &_mem_desc;
+ }
+ /** Get @ref GpuKernelArgumentList of the workload tensor */
+ GpuKernelArgumentList *kernel_argument_list()
+ {
+ return &_kernel_args;
+ }
+ /** Get @ref GpuKernelArgumentList of the workload tensor */
+ const GpuKernelArgumentList *kernel_argument_list() const
+ {
+ return &_kernel_args;
+ }
+ /** Check if the workload argument has valid id
+ *
+ * @return true If has valid id
+ * @return false Otherwise
+ */
+ bool has_valid_id() const
+ {
+ return _tensor_info.has_valid_id();
+ }
+
+private:
+ TensorInfo _tensor_info{};
+ MemoryDescriptor _mem_desc{};
+ GpuKernelArgumentList _kernel_args{};
+};
+
+/** Describes when a unit workload is run.
+ */
+struct UnitWorkloadStage
+{
+ enum class Stage
+ {
+ Prepare, /**< Only run once at the beginning. */
+ Run, /**< Run every time after the first time. */
+ };
+ Stage stage{Stage::Run};
+};
+
+inline bool operator==(const UnitWorkloadStage &stage0, const UnitWorkloadStage &stage1)
+{
+ return stage0.stage == stage1.stage;
+}
+
+/** The atomic unit in a Gpu workload. It contains exactly one kernel to run.
+ */
+class GpuUnitWorkload
+{
+public:
+ /** Default constructor */
+ GpuUnitWorkload() = default;
+ /** Constructor
+ *
+ * @param[in] id Id that uniquely identifies this unit workload in a workload
+ * @param[in] kernel_code @ref GpuKernelSourceCode contained within
+ * @param[in] stage Stage of the unit workload
+ */
+ GpuUnitWorkload(UnitWorkloadId id, const GpuKernelSourceCode &kernel_code, const UnitWorkloadStage &stage)
+ : _id{id}, _kernel_code{kernel_code}, _stage{stage}
+ {
+ }
+ /** Get the id of the unit workload */
+ UnitWorkloadId id() const
+ {
+ return _id;
+ }
+ /** Get reference to the underlying @ref GpuKernelSourceCode */
+ const GpuKernelSourceCode &code() const
+ {
+ return _kernel_code;
+ }
+ /** Get the stage of the unit workload */
+ UnitWorkloadStage stage() const
+ {
+ return _stage;
+ }
+
+private:
+ UnitWorkloadId _id{};
+ GpuKernelSourceCode _kernel_code{};
+ UnitWorkloadStage _stage{};
+};
+
+/** Hold the generated kernel source code and other information required to compile and run the workload.
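+ *
+ * Illustrative flow (sketch only; kernel_code, mem_map and context are assumed to come from kernel generation):
+ * @code
+ * GpuWorkloadSourceCode workload_code{};
+ * const UnitWorkloadId id = workload_code.add_unit_workload(kernel_code, UnitWorkloadStage{}, mem_map, context);
+ * const GpuUnitWorkload &unit = workload_code.query_unit_workload(id);
+ * @endcode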
+ */
+class GpuWorkloadSourceCode
+{
+public:
+ /** Default constructor */
+ GpuWorkloadSourceCode() = default;
+ /** Add a unit workload to the workload code
+ *
+ * @param[in] kernel_code @ref GpuKernelSourceCode to be contained within the unit workload
+ * @param[in] stage Stage of the unit workload
+ * @param[in] mem_map @ref MemoryDescriptor map for all tensors within the unit workload
+ * @param[in] context @ref GpuWorkloadContext associated with the unit workload
+ *
+ * @return UnitWorkloadId Allocated unit workload id
+ */
+ UnitWorkloadId add_unit_workload(const GpuKernelSourceCode &kernel_code,
+ const UnitWorkloadStage &stage,
+ const MemoryDescriptorMap &mem_map,
+ const GpuWorkloadContext *context)
+ {
+        // Use the current number of unit workloads as the Id
+ const auto uwk_id = static_cast<UnitWorkloadId>(_unit_workloads.size());
+ const auto unit_work = GpuUnitWorkload(uwk_id, kernel_code, stage);
+ _unit_workloads.push_back(unit_work);
+
+ GpuKernelArgumentList flat_kernel_args = kernel_code.arguments();
+ GpuKernelArgumentList tensor_kargs{};
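+        // Group the flat kernel argument list per tensor, and record which unit workloads reference each tensor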
+ while (true)
+ {
+ tensor_kargs = extract_kernel_args_for_one_tensor(flat_kernel_args);
+ if (tensor_kargs.empty())
+ {
+ break;
+ }
+ else
+ {
+ const auto tensor_id = tensor_kargs.at(0).id();
+ _workload_arguments[tensor_id] = GpuWorkloadArgument{
+ *context->implementation().get_tensor_info(tensor_id), mem_map.at(tensor_id), tensor_kargs};
+ if (_tensor_uwork_map.find(tensor_id) == _tensor_uwork_map.end())
+ {
+ _tensor_uwork_map[tensor_id] = std::set<UnitWorkloadId>();
+ }
+ _tensor_uwork_map[tensor_id].insert(uwk_id);
+ }
+ }
+
+ return uwk_id;
+ }
+ /** Get a unit workload from its id */
+ const GpuUnitWorkload &query_unit_workload(UnitWorkloadId id) const
+ {
+ ARM_COMPUTE_ERROR_ON(id < 0);
+ return _unit_workloads.at(id);
+ }
+ /** Get all unit workloads sorted in topological order */
+ std::vector<UnitWorkloadId> unit_workloads() const
+ {
+ std::vector<UnitWorkloadId> ids{};
+
+ for (const auto &uwk : _unit_workloads)
+ {
+ ids.push_back(uwk.id());
+ }
+ return ids;
+ }
+ /** Get a @ref GpuWorkloadArgument from its associated tensor id */
+ const GpuWorkloadArgument *query_tensor(ITensorInfo::Id t_id) const
+ {
+ return &_workload_arguments.at(t_id);
+ }
+ /** Get all tensors in the entire workload */
+ std::vector<ITensorInfo::Id> tensors() const
+ {
+ std::vector<ITensorInfo::Id> ids{};
+ for (const auto &id_tensor : _workload_arguments)
+ {
+ ids.push_back(id_tensor.first);
+ }
+ return ids;
+ }
+ /** Get all unit workloads connected to the tensor with @p t_id */
+ std::vector<UnitWorkloadId> get_unit_workloads_from_tensor(ITensorInfo::Id t_id) const
+ {
+ const auto unit_work_set = _tensor_uwork_map.at(t_id);
+ return std::vector<UnitWorkloadId>(unit_work_set.begin(), unit_work_set.end());
+ }
+
+private:
+ std::vector<GpuUnitWorkload> _unit_workloads{};
+ std::map<ITensorInfo::Id, GpuWorkloadArgument> _workload_arguments{};
+ std::map<ITensorInfo::Id, std::set<UnitWorkloadId>> _tensor_uwork_map{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSOURCECODE_H
diff --git a/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h b/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h
new file mode 100644
index 0000000000..84972501de
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_IGPUKERNELWRITER_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_IGPUKERNELWRITER_H
+
+#include "arm_compute/core/CL/CLCompileContext.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h"
+
+#include <map>
+#include <string>
+#include <vector>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** An interface that can write a gpu kernel
+ */
+class IGpuKernelWriter
+{
+public:
+ /** Destructor */
+ virtual ~IGpuKernelWriter()
+ {
+ }
+ /** Generate kernel name */
+ virtual std::string get_name() = 0;
+ /** Generate kernel code */
+ virtual std::string get_code() = 0;
+ /** Generate build options */
+ virtual CLBuildOptions get_build_options()
+ {
+ return {};
+ }
+ /** Generate config id string of the entire kernel. This is used for tuning */
+ virtual std::string get_config_id() = 0;
+ /** Generate execution window */
+ virtual Window get_window() const = 0;
+    /** Get the flat list of arguments of the kernel */
+ virtual GpuKernelArgumentList get_kernel_arguments()
+ {
+ return {};
+ }
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_IGPUKERNELWRITER_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp
new file mode 100644
index 0000000000..a42b39700c
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h"
+
+#include "compute_kernel_writer/include/ckw/Error.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+
+GpuCkwComponentArgument::GpuCkwComponentArgument(ckw::TensorOperand tensor) : _tensor(tensor)
+{
+}
+
+GpuCkwComponentArgument &GpuCkwComponentArgument::init_virtual_tensor(ckw::TileOperand &tile,
+ const ckw::TensorSampler &sampler)
+{
+    CKW_ASSERT(!_tile.is_valid());
+
+ _tile = tile;
+ _sampler = sampler;
+
+ return *this;
+}
+
+bool GpuCkwComponentArgument::has_tensor() const
+{
+ return _tensor.is_valid();
+}
+
+ckw::TensorOperand &GpuCkwComponentArgument::tensor()
+{
+ CKW_ASSERT(_tensor.is_valid());
+
+ return _tensor;
+}
+
+const ckw::TensorOperand &GpuCkwComponentArgument::tensor() const
+{
+ CKW_ASSERT(_tensor.is_valid());
+
+ return _tensor;
+}
+
+bool GpuCkwComponentArgument::has_tile() const
+{
+ return _tile.is_valid();
+}
+
+ckw::TileOperand &GpuCkwComponentArgument::tile()
+{
+ CKW_ASSERT(_tile.is_valid());
+
+ return _tile;
+}
+
+const ckw::TileOperand &GpuCkwComponentArgument::tile() const
+{
+ CKW_ASSERT(_tile.is_valid());
+
+ return _tile;
+}
+
+ckw::TensorSampler &GpuCkwComponentArgument::tensor_sampler()
+{
+ CKW_ASSERT(_tile.is_valid());
+
+ return _sampler;
+}
+
+const ckw::TensorSampler &GpuCkwComponentArgument::tensor_sampler() const
+{
+ CKW_ASSERT(_tile.is_valid());
+
+ return _sampler;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h
new file mode 100644
index 0000000000..7a57c81e5f
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWCOMPONENTARGUMENT_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWCOMPONENTARGUMENT_H
+
+#include "compute_kernel_writer/include/ckw/TensorOperand.h"
+#include "compute_kernel_writer/include/ckw/TensorSampler.h"
+#include "compute_kernel_writer/include/ckw/TileOperand.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+
+/** The argument of a dynamic fusion component which can be either user tensor or virtual tensor. */
+class GpuCkwComponentArgument
+{
+public:
+ /** Default constructor */
+ GpuCkwComponentArgument() = default;
+
+ /** Initialize a new instance of @ref GpuCkwComponentArgument class for user tensor.
+ *
+ * @param[in] tensor The user tensor.
+ */
+ explicit GpuCkwComponentArgument(ckw::TensorOperand tensor);
+
+ /** Bind the tile and sampler to the tensor argument.
+ *
+ * This method can be used to share a tile and sampler associated to a tensor
+ * among different kernel components. For example, when we create the destination
+ * tile and destination sampler for the first time (root component), this method can be
+ * used to bind these two information to the destination tensor so that the following
+ * simple components know the tile size and how to access the elements from memory.
+ *
+ * @param[in] tile The tile that has been loaded.
+ * @param[in] sampler The tensor sampling information that has been used to load the tile.
+ */
+ GpuCkwComponentArgument &init_virtual_tensor(ckw::TileOperand &tile, const ckw::TensorSampler &sampler);
+
+ /** Get whether the argument is a user tensor. */
+ bool has_tensor() const;
+
+ /** Get the tensor operand.
+ *
+ * If the tensor is not available, throw an error.
+ */
+ ckw::TensorOperand &tensor();
+
+ /** Get the tensor operand.
+ *
+ * If the tensor is not available, throw an error.
+ */
+ const ckw::TensorOperand &tensor() const;
+
+ /** Get whether the argument contains a tile.
+ *
+ * The argument can be either a user tensor that has been loaded,
+ * or a virtual tensor (i.e. a tile with tensor sampling information).
+ */
+ bool has_tile() const;
+
+ /** Get the tile operand.
+ *
+ * If the tile is not available, throw an error.
+ */
+ ckw::TileOperand &tile();
+
+ /** Get the tile operand.
+ *
+ * If the tile is not available, throw an error.
+ */
+ const ckw::TileOperand &tile() const;
+
+ /** Get the tensor sampling information for the tile.
+ *
+ * If the tile is not available, throw an error.
+ */
+ ckw::TensorSampler &tensor_sampler();
+
+ /** Get the tensor sampling information for the tile.
+ *
+ * If the tile is not available, throw an error.
+ */
+ const ckw::TensorSampler &tensor_sampler() const;
+
+private:
+ ckw::TensorOperand _tensor{};
+ ckw::TileOperand _tile{};
+ ckw::TensorSampler _sampler{};
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWCOMPONENTARGUMENT_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp
new file mode 100644
index 0000000000..a0e5e16aa0
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/common/utils/Log.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h"
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+
+#include "compute_kernel_writer/include/ckw/KernelWriter.h"
+#include "compute_kernel_writer/include/ckw/types/TargetArchitecture.h"
+#include "compute_kernel_writer/include/ckw/types/TargetLanguage.h"
+
+using namespace ckw;
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+GpuCkwDriver::GpuCkwDriver(const GpuKernelComponentGroup &components)
+{
+ _components = components;
+
+ // Generate kernel name
+ std::string kernel_name;
+ for (auto &comp : _components)
+ {
+ auto ckw_driver = comp->ckw_component_driver();
+ ARM_COMPUTE_ERROR_ON(ckw_driver == nullptr);
+ kernel_name += ckw_driver->get_name(_components) + "__";
+ }
+
+ // Generate kernel code
+ auto root_writer =
+ KernelWriter::create_instance(ckw::TargetArchitecture::GpuArmMaliValhall, ckw::TargetLanguage::OpenCL);
+ GpuCkwScopedKernelWriter writer(root_writer.get());
+ GpuCkwVariableTable vtable{};
+
+ for (auto &comp : _components)
+ {
+ auto ckw_driver = comp->ckw_component_driver();
+ ARM_COMPUTE_ERROR_ON(ckw_driver == nullptr);
+ ckw_driver->write_component_code(_components, vtable, writer);
+ }
+ auto kernel = root_writer->emit_kernel(kernel_name);
+
+ // Set the kernel name, kernel arguments and source code
+ _kernel_name = kernel_name;
+ _kernel_args = kernel->arguments();
+ _kernel_code = kernel->source_code();
+}
+
+std::string GpuCkwDriver::get_name()
+{
+ return _kernel_name;
+}
+
+std::string GpuCkwDriver::get_code()
+{
+ return _kernel_code;
+}
+
+std::string GpuCkwDriver::get_config_id()
+{
+ std::string id;
+ for (auto &comp : _components)
+ {
+ auto ckw_driver = comp->ckw_component_driver();
+ ARM_COMPUTE_ERROR_ON(ckw_driver == nullptr);
+        id += ckw_driver->get_tuner_id(_components) + "__";
+ }
+ return id;
+}
+
+Window GpuCkwDriver::get_window() const
+{
+ const auto root_comp = _components.get_root_component();
+ ARM_COMPUTE_ERROR_ON_MSG(root_comp == nullptr, "No root component found");
+ return root_comp->ckw_component_driver()->get_window();
+}
+
+GpuKernelArgumentList GpuCkwDriver::get_kernel_arguments()
+{
+ GpuKernelArgumentList args{};
+ for (const auto &arg : _kernel_args)
+ {
+ switch (arg.type())
+ {
+ case KernelArgument::Type::TensorStorage:
+ {
+ args.emplace_back(static_cast<ITensorInfo::Id>(arg.id()), from_ckw(arg.tensor_storage_type()));
+ break;
+ }
+ case KernelArgument::Type::TensorComponent:
+ {
+ args.emplace_back(static_cast<ITensorInfo::Id>(arg.id()), from_ckw(arg.tensor_component_type()));
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Unsupported KernelArgument Type");
+ break;
+ }
+ }
+ }
+ return args;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h
new file mode 100644
index 0000000000..f8770920b7
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWDRIVER_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWDRIVER_H
+
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h"
+
+#include "compute_kernel_writer/include/ckw/Kernel.h"
+#include "compute_kernel_writer/include/ckw/KernelArgument.h"
+#include <string>
+
+namespace arm_compute
+{
+/** Forward declarations */
+class Window;
+
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Use the Compute Kernel Writer (CKW) to write the kernel code
+ * Used by the dynamic_fusion module
+ */
+class GpuCkwDriver : public IGpuKernelWriter
+{
+public:
+ /** Deleted default constructor */
+ GpuCkwDriver() = delete;
+ /** Constructor
+ *
+ * @param[in] components Kernel component group from which the kernel will be generated
+ */
+ GpuCkwDriver(const GpuKernelComponentGroup &components);
+ /** Destructor */
+ ~GpuCkwDriver() override = default;
+ /** Generate kernel name */
+ std::string get_name() override;
+ /** Generate kernel code */
+ std::string get_code() override;
+ /** Generate config id string of the entire kernel. This is used for tuning */
+ std::string get_config_id() override;
+ /** Generate execution window */
+ Window get_window() const override;
+ /** Get the flat list of arguments of the kernel */
+ GpuKernelArgumentList get_kernel_arguments() override;
+
+private:
+ GpuKernelComponentGroup _components{};
+ std::string _kernel_name{};
+ std::vector<ckw::KernelArgument> _kernel_args{};
+ std::string _kernel_code{};
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWDRIVER_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp
new file mode 100644
index 0000000000..ae12d13e5a
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+
+GpuCkwScopedKernelWriter::GpuCkwScopedKernelWriter(ckw::KernelWriter *writer)
+ : _writer(writer), _parent_id_space(writer->id_space())
+{
+ _writer->new_id_space();
+}
+
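+// Note: copying the scoped writer opens a fresh id space on the shared ckw::KernelWriter. This is why
+// component drivers receive the writer by value: each copy gives the component its own naming scope
+// for the tiles and variables it declares.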
+GpuCkwScopedKernelWriter::GpuCkwScopedKernelWriter(const GpuCkwScopedKernelWriter &other)
+ : _writer(other._writer), _parent_id_space(other._writer->id_space())
+{
+ _writer->new_id_space();
+}
+
+ckw::KernelWriter *GpuCkwScopedKernelWriter::operator->()
+{
+ return _writer;
+}
+
+const ckw::KernelWriter *GpuCkwScopedKernelWriter::operator->() const
+{
+ return _writer;
+}
+
+ckw::KernelWriter *GpuCkwScopedKernelWriter::writer()
+{
+ return _writer;
+}
+
+const ckw::KernelWriter *GpuCkwScopedKernelWriter::writer() const
+{
+ return _writer;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h
new file mode 100644
index 0000000000..84dd706cd0
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWSCOPEDKERNELWRITER_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWSCOPEDKERNELWRITER_H
+
+#include "compute_kernel_writer/include/ckw/KernelWriter.h"
+#include <cstdint>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+
+/** Helper to automatically manage kernel writer ID space. */
+class GpuCkwScopedKernelWriter
+{
+public:
+ /** Initialize a new instance of the @ref GpuCkwScopedKernelWriter class. */
+ explicit GpuCkwScopedKernelWriter(ckw::KernelWriter *writer);
+
+ /** Create a new scope from the specified scoped kernel writer. */
+ GpuCkwScopedKernelWriter(const GpuCkwScopedKernelWriter &other);
+
+ /** Assignment is disallowed. */
+ GpuCkwScopedKernelWriter &operator=(const GpuCkwScopedKernelWriter &) = delete;
+
+ /** Access the underlying kernel writer. */
+ ckw::KernelWriter *operator->();
+
+ /** Access the underlying kernel writer. */
+ const ckw::KernelWriter *operator->() const;
+
+ /** Get the kernel writer. */
+ ckw::KernelWriter *writer();
+
+ /** Get the kernel writer. */
+ const ckw::KernelWriter *writer() const;
+
+private:
+ ckw::KernelWriter *_writer;
+ int32_t _parent_id_space;
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWSCOPEDKERNELWRITER_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp
new file mode 100644
index 0000000000..66ccc1ac34
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
+
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+
+#include <sstream>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+GpuCkwComponentArgument *GpuCkwVariableTable::declare_variable(const GpuKernelComponentGroup &comp_group,
+ GpuCkwScopedKernelWriter &writer,
+ const ITensorInfo *tensor,
+ const std::string &alias)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(!tensor->has_valid_id(), "Tensor info with valid id expected");
+
+ // Do not re-declare if the variable associated with the tensor has already been declared
+ auto it = _vars.find(tensor->id());
+
+ if (it != _vars.end())
+ {
+ return &it->second;
+ }
+ if (comp_group.is_intermediate_tensor(tensor))
+ {
+ // Create a virtual tensor variable
+ GpuCkwComponentArgument var;
+ auto &&inserted = _vars.emplace(tensor->id(), var);
+ return &(inserted.first->second);
+ }
+ else
+ {
+ // Create a user tensor variable
+ std::stringstream ss;
+ ss << alias << "_t" << abs(tensor->id());
+ const auto uniq_name = ss.str();
+ GpuCkwComponentArgument var{writer->declare_tensor_argument(uniq_name, to_ckw(*tensor))};
+ auto &&inserted = _vars.emplace(tensor->id(), var);
+ return &(inserted.first->second);
+ }
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h
new file mode 100644
index 0000000000..fc8764c3e2
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWVARIABLETABLE_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWVARIABLETABLE_H
+
+#include "arm_compute/core/ITensorInfo.h"
+
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h"
+
+#include <map>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class GpuKernelComponentGroup;
+class GpuCkwScopedKernelWriter;
+
+/** A table of all the variables used in the kernel.
+ *
+ * It determines whether to create a virtual tensor variable or a user tensor variable.
+ * It avoids duplicating variables for the same tensors (tensors with the same id).
+ * Each kernel has exactly one variable table.
+ */
+class GpuCkwVariableTable
+{
+public:
+ /** Declare a kernel component variable (argument) for the corresponding tensor info.
+ *
+ * @param[in] comp_group Component group the tensor belongs to
+ * @param[in] writer Compute Kernel Writer
+ * @param[in] tensor Tensor info with which the new variable is associated
+ * @param[in] alias Alias for the variable. Will be used as part of the variable name
+ *
+ * @return GpuCkwComponentArgument*
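+ *
+ * Typical use inside a component driver's write_component_code() (sketch only; _src and _dst stand for
+ * the component's own tensor infos):
+ *
+ *   GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src");
+ *   GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");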
+ */
+ GpuCkwComponentArgument *declare_variable(const GpuKernelComponentGroup &comp_group,
+ GpuCkwScopedKernelWriter &writer,
+ const ITensorInfo *tensor,
+ const std::string &alias = "unnamed");
+
+private:
+ std::map<ITensorInfo::Id, GpuCkwComponentArgument> _vars{};
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWVARIABLETABLE_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h
new file mode 100644
index 0000000000..52e56e2e35
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_IGPUCKWCOMPONENTDRIVER
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_IGPUCKWCOMPONENTDRIVER
+
+#include "arm_compute/core/Window.h"
+
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/components/Types.h"
+
+namespace arm_compute
+{
+class ITensorInfo;
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+class GpuKernelComponentGroup;
+class GpuCkwVariableTable;
+class GpuCkwScopedKernelWriter;
+
+/** An interface used by @ref GpuCkwDriver to write source code for a kernel component
+ *
+ * There are 3 main architecture layers for using Compute Kernel Writer (Ckw) inside ACL's dynamic fusion module
+ * From top level to bottom level:
+ * | Layer | Library
+ * ===========================
+ * | dynamic_fusion | acl
+ * | ckw_driver | acl
+ * | ckw | ckw
+ *
+ * ckw_driver is a glue layer that directs how fused code is produced using the ckw library
+ *
+ * There are two main groups within ckw_driver:
+ * - @ref GpuCkwDriver is a global driver that coordinates how the final fused code along with all the info necessary
+ * for run time execution is produced using ckw
+ * - Various classes implementing @ref IGpuCkwComponentDriver are component drivers that direct ckw to generate kernel component code (e.g. activation, store, etc.)
+ *
+ * The overall flow goes like this:
+ * In dynamic_fusion module, @ref GpuLogicalKernel instantiates a @ref GpuCkwDriver from a @ref GpuKernelComponentGroup
+ * The logical kernel then uses the global driver's various interfaces to retrieve the generated code and execution info.
+ * In particular, while generating the fused kernel, @ref GpuCkwDriver calls into each @ref IGpuCkwComponentDriver::write_component_code()
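+ *
+ * A minimal sketch of that flow (illustrative only; "components" stands for an already-populated @ref GpuKernelComponentGroup):
+ *
+ *   GpuCkwDriver driver(components);                             // each component's code is written via write_component_code()
+ *   const std::string     name = driver.get_name();              // concatenated component names
+ *   const std::string     code = driver.get_code();              // fused kernel source code
+ *   GpuKernelArgumentList args = driver.get_kernel_arguments();  // flat argument list for runtime binding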
+ */
+class IGpuCkwComponentDriver
+{
+public:
+ using ComponentGroup = GpuKernelComponentGroup;
+
+public:
+ /** Constructor
+ *
+ * @param[in] id Component id
+ * @param[in] tensors Tensor arguments to the components
+ */
+ IGpuCkwComponentDriver(ComponentId id, const ArgumentPack<ITensorInfo> &tensors) : _id{id}, _tensors{tensors}
+ {
+ }
+ /** Destructor */
+ virtual ~IGpuCkwComponentDriver()
+ {
+ }
+ /** Generate kernel component code
+ *
+ * @param[in] comp_group Component group of which the component is a part
+ * @param[in, out] vtable Table of variables declared by each component
+ * @param[in, out] writer CKW writer that writes code scoped to this kernel component.
+ *
+ * @note @p writer can only be passed by value since the new scope is created in the copy constructor
+ */
+ virtual void write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const = 0;
+ /** Get tensor arguments */
+ ArgumentPack<ITensorInfo> tensors() const
+ {
+ return _tensors;
+ }
+ /** Generate the execution window for the component */
+ virtual Window get_window() const
+ {
+ return Window{};
+ }
+ /** Generate the name of the component
+ *
+ * This will be concatenated with other components' names to form the name of the kernel
+ */
+ virtual std::string get_name(const ComponentGroup &comp_group) const
+ {
+ ARM_COMPUTE_UNUSED(comp_group);
+ return "unnamed";
+ }
+ /** Generate the tuner id of the component
+ * This id should capture all the parameters that distinguish one kernel's lws tuning from another.
+ * e.g. two components that are identical in every other way, but have different output tensor dimensions, should
+ * have different tuner ids, because the lws of one may not be optimal on the other.
+ *
+ * This will be concatenated with other components' tuner id to form the tuner id of the kernel
+ */
+ virtual std::string get_tuner_id(const ComponentGroup &comp_group) const
+ {
+ ARM_COMPUTE_UNUSED(comp_group);
+ return "";
+ }
+ /** Get component id */
+ ComponentId id() const
+ {
+ return _id;
+ }
+
+private:
+ ComponentId _id{-1};
+ ArgumentPack<ITensorInfo> _tensors{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif /* ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_IGPUCKWCOMPONENTDRIVER */
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp
new file mode 100644
index 0000000000..18fda5bd6b
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp
@@ -0,0 +1,295 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "GpuCkwActivation.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+
+#include "compute_kernel_writer/include/ckw/KernelWriter.h"
+#include <cstdint>
+#include <string>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+
+GpuCkwActivation::GpuCkwActivation(ComponentId id,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes) // NOLINT
+ : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{}, _attributes{attributes}
+{
+ _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
+ _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
+}
+
+void GpuCkwActivation::write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const
+{
+ /********************************************************************************
+ * 1 - Define tensors
+ ********************************************************************************/
+ GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src");
+ GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
+
+ /********************************************************************************
+ * 2 - Define CKW constants
+ ********************************************************************************/
+ const auto dst_h = static_cast<int32_t>(_dst->dimension(1));
+ const auto dst_dt = to_ckw(_dst->data_type());
+
+ // CKW constants
+ auto const_dst_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_h}}, ckw::DataType::Int32));
+ auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32));
+ auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32));
+ auto const_neg_1_fp = writer->declare_constant_tile(ckw::ConstantData({{-1.0f}}, dst_dt));
+ auto const_pos_1_fp = writer->declare_constant_tile(ckw::ConstantData({{1.0f}}, dst_dt));
+ auto const_0_fp = writer->declare_constant_tile(ckw::ConstantData({{0.0f}}, dst_dt));
+ auto const_A_fp = writer->declare_constant_tile(ckw::ConstantData({{_attributes.a()}}, dst_dt));
+ auto const_B_fp = writer->declare_constant_tile(ckw::ConstantData({{_attributes.b()}}, dst_dt));
+
+ /********************************************************************************
+ * 3 - Define the compute block parameters and destination tile (if not root component)
+ * Bind the tile to the tensor to share it among different components and
+ * initialize the compute block parameters
+ ********************************************************************************/
+ // The compute block parameters depend on the employed tensor format
+
+ // Destination compute block size
+ int32_t dst_n0 = -1;
+ int32_t dst_m0 = -1;
+
+ // Destination compute block size left-over
+ int32_t dst_n0_partial = -1;
+ int32_t dst_m0_partial = -1;
+
+ // Shift-back for the overlapping-min strategy
+ int32_t dst_shift_back = -1;
+
+ if (!dst->has_tile())
+ {
+ // If ROOT component, we use ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1
+ // as tensor format
+ const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
+
+ dst_n0 = root_window.x().step();
+ dst_m0 = root_window.y().step();
+ dst_n0_partial = _dst->dimension(0) % dst_n0;
+ dst_m0_partial = (_dst->dimension(1) * _dst->dimension(2)) % dst_m0;
+ dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0;
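+ // Illustrative example: with dimension(0) = 10 and dst_n0 = 4, dst_n0_partial = 2 and dst_shift_back = 2,
+ // i.e. the last compute block is shifted back by two columns so that it overlaps the previous block
+ // instead of accessing out-of-bounds elements (overlapping-min strategy)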
+
+ ckw::TensorSampler sampler_dst;
+ sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1);
+
+ if (dst_n0_partial == 0)
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin);
+ }
+
+ if (dst_m0_partial == 0)
+ {
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::ClampToBorderMaxOnly);
+ }
+
+ sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+ // Declare destination tile
+ auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, dst_m0, dst_n0));
+
+ // Bind tile to the tensor
+ dst->init_virtual_tensor(tile_dst, sampler_dst);
+ }
+ else
+ {
+ // dst_m0_partial depends on the TensorSamplerFormat
+ dst_n0 = dst->tile().tile_info().width();
+ dst_m0 = dst->tile().tile_info().height();
+ dst_n0_partial = _dst->dimension(0) % dst_n0;
+
+ ckw::TensorSampler sampler_dst = dst->tensor_sampler();
+
+ if (sampler_dst.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1)
+ {
+ dst_m0_partial = (_dst->dimension(1) * _dst->dimension(2)) % dst_m0;
+ }
+ else if (sampler_dst.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2)
+ {
+ dst_m0_partial = _dst->dimension(1) % dst_m0;
+ }
+
+ // Shift-back for the overlapping-min strategy
+ dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0;
+ }
+
+ const auto &tile_dst = dst->tile();
+
+ /********************************************************************************
+ * 4 - Define the compute block parameters CKW constants
+ ********************************************************************************/
+ // Only now can we declare N0 and M0 as constants
+ auto const_dst_n0 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32));
+ auto const_dst_m0 = writer->declare_constant_tile(ckw::ConstantData({{dst_m0}}, ckw::DataType::Int32));
+ auto const_dst_shift_back_n0 =
+ writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32));
+
+ /********************************************************************************
+ * 5 - Define the sampler for the input tensor
+ ********************************************************************************/
+ if (!src->has_tile())
+ {
+ // Sampler
+ ckw::TensorSampler sampler_src = dst->tensor_sampler();
+
+ auto tile_gid_0 = writer->declare_tile("gid_0_src", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_1 = writer->declare_tile("gid_1_src", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_2 = writer->declare_tile("gid_2_src", ckw::TileInfo(ckw::DataType::Int32));
+
+ writer->op_get_global_id(tile_gid_0, 0);
+ writer->op_get_global_id(tile_gid_1, 1);
+ writer->op_get_global_id(tile_gid_2, 2);
+
+ auto tile_nout0 = writer->declare_tile("nout0_src", ckw::TileInfo(ckw::DataType::Int32)); // OFM
+ auto tile_mout0 =
+ writer->declare_tile("mout0_src", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH or WIDTH x HEIGHT
+ auto tile_mout1 = writer->declare_tile("mout1_src", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT or 0
+ auto tile_bout0 = writer->declare_tile("bout0_src", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX
+
+ get_coordinate_from_gws_overlapping_min(writer, tile_nout0, tile_gid_0, const_dst_n0, const_dst_shift_back_n0,
+ const_0_i32);
+ get_coordinate_from_gws(writer, tile_mout0, tile_gid_1, const_dst_m0);
+
+ // Get the boundary aware coordinates at each global dimension index
+ if (sampler_src.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1)
+ {
+ writer->op_assign(tile_mout1, const_0_i32);
+ get_coordinate_from_gws(writer, tile_bout0, tile_gid_2, const_pos_1_i32);
+ }
+ else if (sampler_src.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2)
+ {
+ writer->op_binary(tile_mout1, ckw::BinaryOp::Mod, tile_gid_2, const_dst_h_i32);
+ writer->op_binary(tile_bout0, ckw::BinaryOp::Div, tile_gid_2, const_dst_h_i32);
+ }
+
+ auto tile_src = writer->declare_tile("src", ckw::TileInfo(dst_dt, dst_m0, dst_n0));
+
+ writer->op_load(tile_src, src->tensor(), sampler_src, tile_nout0, tile_mout0, tile_mout1, tile_bout0);
+
+ // Here, init_virtual_tensor() is used to bring tile_src outside the compound statement
+ src->init_virtual_tensor(tile_src, sampler_src);
+ }
+
+ const auto &tile_src = src->tile();
+
+ /********************************************************************************
+ * 7 - Write the rest of the code
+ ********************************************************************************/
+ switch (_attributes.activation())
+ {
+ case ActivationLayerInfo::ActivationFunction::LOGISTIC:
+ {
+ // dst = src * -1
+ writer->op_binary(tile_dst, ckw::BinaryOp::Mul, tile_src, const_neg_1_fp);
+ // dst = exp(src * -1)
+ writer->op_unary(tile_dst, ckw::UnaryOp::Exp, tile_dst);
+ // dst = 1 + (exp(src * -1))
+ writer->op_binary(tile_dst, ckw::BinaryOp::Add, tile_dst, const_pos_1_fp);
+ // dst = 1 / (1 + exp(src * -1))
+ writer->op_binary(tile_dst, ckw::BinaryOp::Div, const_pos_1_fp, tile_dst);
+ break;
+ }
+ case ActivationLayerInfo::ActivationFunction::TANH:
+ {
+ writer->op_unary(tile_dst, ckw::UnaryOp::Tanh, tile_src);
+ break;
+ }
+ case ActivationLayerInfo::ActivationFunction::RELU:
+ {
+ // dst = max(src, 0)
+ writer->op_binary(tile_dst, ckw::BinaryOp::Max, tile_src, const_0_fp);
+ break;
+ }
+ case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
+ {
+ // dst = max(src, 0)
+ writer->op_binary(tile_dst, ckw::BinaryOp::Max, tile_src, const_0_fp);
+ // dst = min(max(src, 0), A_VAL)
+ writer->op_binary(tile_dst, ckw::BinaryOp::Min, tile_dst, const_A_fp);
+ break;
+ }
+ case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
+ {
+ // dst = max(src, B_VAL)
+ writer->op_binary(tile_dst, ckw::BinaryOp::Max, tile_src, const_B_fp);
+ // dst = min(max(src, B_VAL), A_VAL)
+ writer->op_binary(tile_dst, ckw::BinaryOp::Min, tile_dst, const_A_fp);
+ break;
+ }
+ default:
+ CKW_ASSERT(false);
+ break;
+ }
+ ARM_COMPUTE_ERROR_ON_MSG(dst->has_tile() == false, "You must bind a tile before appending another component");
+}
+
+Window GpuCkwActivation::get_window() const
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
+
+ TensorShape output_shape = _dst->tensor_shape();
+ // Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) unchanged
+ // This is in line with the collapsing convention used by operators like Conv2d
+ output_shape.collapse(2U, 1U);
+ constexpr uint32_t vector_size_byte_opencl = 16;
+ const uint32_t num_elems_processed_per_iteration =
+ adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0));
+ Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration));
+
+ return win;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.h
new file mode 100644
index 0000000000..386e933a72
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWACTIVATION
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWACTIVATION
+
+#include "src/core/common/Macros.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class GpuCkwActivation : public IGpuCkwComponentDriver
+{
+public:
+ using Attributes = ClComponentActivation::Attributes;
+ /** Constructor
+ *
+ * For supported configurations please refer to @ref ClComponentActivation::validate()
+ *
+ * @param[in] id Component id
+ * @param[in] tensors Tensor arguments to the component
+ * @param[in] attributes Component attributes
+ */
+ GpuCkwActivation(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes);
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwActivation);
+ /** Destructor */
+ ~GpuCkwActivation() override = default;
+ // Inherited methods overridden:
+ virtual void write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const override;
+ Window get_window() const override;
+
+private:
+ const ITensorInfo *_src;
+ const ITensorInfo *_dst;
+ Attributes _attributes;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif /* ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWACTIVATION */
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp
new file mode 100644
index 0000000000..d3e0dbafd4
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp
@@ -0,0 +1,256 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "GpuCkwCast.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+
+#include "compute_kernel_writer/include/ckw/KernelWriter.h"
+#include <cstdint>
+#include <string>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+
+GpuCkwCast::GpuCkwCast(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes)
+ : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{}, _attributes{attributes}
+{
+ _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
+ _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
+ ARM_COMPUTE_ERROR_ON_MSG(is_data_type_float(_src->data_type()) == false,
+ "The source data type must be a floating-point data type");
+}
+
+void GpuCkwCast::write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const
+{
+ /********************************************************************************
+ * 1 - Define tensors
+ ********************************************************************************/
+ GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src");
+ GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
+
+ /********************************************************************************
+ * 2 - Define CKW constants
+ ********************************************************************************/
+ const auto dst_h = static_cast<int32_t>(_dst->dimension(1));
+
+ // CKW constants
+ auto const_dst_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_h}}, ckw::DataType::Int32));
+ auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32));
+ auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32));
+
+ /********************************************************************************
+ * 3 - Define the compute block parameters and destination tile (if not root component)
+ * Bind the tile to the tensor to share it among different components and
+ * initialize the compute block parameters
+ ********************************************************************************/
+ // The compute block parameters depend on the employed tensor format
+
+ // Destination compute block size
+ int32_t dst_n0 = -1;
+ int32_t dst_m0 = -1;
+
+ // Destination compute block size left-over
+ int32_t dst_n0_partial = -1;
+ int32_t dst_m0_partial = -1;
+
+ // Shift-back for the overlapping-min strategy
+ int32_t dst_shift_back = -1;
+
+ if (!dst->has_tile())
+ {
+ // If ROOT component, we use ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1
+ // as tensor format
+ const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
+
+ dst_n0 = root_window.x().step();
+ dst_m0 = root_window.y().step();
+ dst_n0_partial = _dst->dimension(0) % dst_n0;
+ dst_m0_partial = (_dst->dimension(1) * _dst->dimension(2)) % dst_m0;
+ dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0;
+
+ ckw::TensorSampler sampler_dst;
+ sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1);
+ if (dst_n0_partial == 0)
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin);
+ }
+
+ if (dst_m0_partial == 0)
+ {
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::ClampToBorderMaxOnly);
+ }
+
+ sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+ // Declare destination tile
+ ckw::DataType dst_dt = to_ckw(_dst->data_type());
+ auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, dst_m0, dst_n0));
+
+ // Bind tile to the tensor
+ dst->init_virtual_tensor(tile_dst, sampler_dst);
+ }
+ else
+ {
+ // Change dst_n0 and dst_m0 if NOT root component!
+ // ATTENTION:
+ // dst_m0_partial depends on the TensorSamplerFormat
+ dst_n0 = dst->tile().tile_info().width();
+ dst_m0 = dst->tile().tile_info().height();
+ dst_n0_partial = _dst->dimension(0) % dst_n0;
+
+ ckw::TensorSampler sampler_dst = dst->tensor_sampler();
+
+ if (sampler_dst.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1)
+ {
+ dst_m0_partial = (_dst->dimension(1) * _dst->dimension(2)) % dst_m0;
+ }
+ else if (sampler_dst.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2)
+ {
+ dst_m0_partial = _dst->dimension(1) % dst_m0;
+ }
+
+ // Shift-back for the overlapping-min strategy
+ dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0;
+ }
+
+ const auto &tile_dst = dst->tile();
+
+ /********************************************************************************
+ * 4 - Define the compute block parameters CKW constants
+ ********************************************************************************/
+ // Only now can we declare N0 and M0 as constants
+ auto const_dst_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32));
+ auto const_dst_m0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_m0}}, ckw::DataType::Int32));
+ auto const_dst_shift_back_n0_i32 =
+ writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32));
+
+ /********************************************************************************
+ * 5 - Define the sampler for the input tensor
+ ********************************************************************************/
+ if (!src->has_tile())
+ {
+ // Sampler
+ ckw::TensorSampler sampler_src = dst->tensor_sampler();
+
+ auto tile_gid_0 = writer->declare_tile("gid_0", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_1 = writer->declare_tile("gid_1", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_2 = writer->declare_tile("gid_2", ckw::TileInfo(ckw::DataType::Int32));
+
+ writer->op_get_global_id(tile_gid_0, 0);
+ writer->op_get_global_id(tile_gid_1, 1);
+ writer->op_get_global_id(tile_gid_2, 2);
+
+ auto tile_cout0 = writer->declare_tile("cout0", ckw::TileInfo(ckw::DataType::Int32)); // OFM
+ auto tile_mout0 = writer->declare_tile("mout0", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH or WIDTH x HEIGHT
+ auto tile_mout1 = writer->declare_tile("mout1", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT or 0
+ auto tile_bout0 = writer->declare_tile("bout0", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX
+
+ // Calculate coordinates
+ get_coordinate_from_gws_overlapping_min(writer, tile_cout0, tile_gid_0, const_dst_n0_i32,
+ const_dst_shift_back_n0_i32, const_0_i32);
+ get_coordinate_from_gws(writer, tile_mout0, tile_gid_1, const_dst_m0_i32);
+
+ // Get the boundary aware coordinates at each global dimension index
+ if (sampler_src.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1)
+ {
+ writer->op_assign(tile_mout1, const_0_i32);
+ get_coordinate_from_gws(writer, tile_bout0, tile_gid_2, const_pos_1_i32);
+ }
+ else if (sampler_src.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2)
+ {
+ writer->op_binary(tile_mout1, ckw::BinaryOp::Mod, tile_gid_2, const_dst_h_i32);
+ writer->op_binary(tile_bout0, ckw::BinaryOp::Div, tile_gid_2, const_dst_h_i32);
+ }
+ ckw::DataType src_dt = to_ckw(_src->data_type());
+ auto tile_src = writer->declare_tile("src", ckw::TileInfo(src_dt, dst_m0, dst_n0));
+
+ writer->op_load(tile_src, src->tensor(), sampler_src, tile_cout0, tile_mout0, tile_mout1, tile_bout0);
+
+ // Here, init_virtual_tensor() is used to bring tile_src outside the compound statement
+ src->init_virtual_tensor(tile_src, sampler_src);
+ }
+
+ auto tile_src = src->tile();
+
+ /********************************************************************************
+ * 6 - Extra operations required before writing the main code (optional)
+ ********************************************************************************/
+
+ // Not required
+
+ /********************************************************************************
+ * 7 - Write the rest of the code
+ ********************************************************************************/
+ // Only None ConvertPolicy is supported for floating-point data types
+ ckw::ConvertPolicy convert_policy = ckw::ConvertPolicy::None;
+
+ writer->op_cast(tile_dst, tile_src, convert_policy);
+ ARM_COMPUTE_ERROR_ON_MSG(dst->has_tile() == false, "You must bind a tile before appending another component");
+}
+
+Window GpuCkwCast::get_window() const
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
+
+ TensorShape output_shape = _dst->tensor_shape();
+ // Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) unchanged
+ // This is in line with the collapsing convention used by operators like Conv2d
+ output_shape.collapse(2U, 1U);
+ constexpr uint32_t vector_size_byte_opencl = 16;
+ const uint32_t num_elems_processed_per_iteration =
+ adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0));
+ Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration));
+
+ return win;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.h
new file mode 100644
index 0000000000..2389301196
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWCAST
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWCAST
+
+#include "src/core/common/Macros.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class GpuCkwCast : public IGpuCkwComponentDriver
+{
+public:
+ using Attributes = ClComponentCast::Attributes;
+ /** Constructor
+ *
+ * For supported configurations please refer to @ref ClComponentCast::validate()
+ *
+ * @param[in] id Component id
+ * @param[in] tensors Tensor arguments to the component
+ * @param[in] attributes Component attributes
+ */
+ GpuCkwCast(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes);
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwCast);
+ /** Destructor */
+ ~GpuCkwCast() override = default;
+ // Inherited methods overridden:
+ virtual void write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const override;
+ Window get_window() const override;
+
+private:
+ const ITensorInfo *_src;
+ const ITensorInfo *_dst;
+ Attributes _attributes;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif /* ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWCAST */
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.cpp
new file mode 100644
index 0000000000..cfccab186b
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.cpp
@@ -0,0 +1,361 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+
+#include "compute_kernel_writer/include/ckw/KernelWriter.h"
+#include <cstdint>
+#include <string>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+GpuCkwDepthwiseConv2d::GpuCkwDepthwiseConv2d(ComponentId id,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings)
+ : IGpuCkwComponentDriver{id, tensors}, _src{}, _wei{}, _bia{}, _dst{}, _attributes{attributes}, _settings{settings}
+{
+ _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
+ _wei = this->tensors().get_const_tensor(TensorType::ACL_SRC_1);
+ if (this->tensors().get_const_tensor(TensorType::ACL_SRC_2))
+ {
+ _bia = this->tensors().get_const_tensor(TensorType::ACL_SRC_2);
+ }
+ _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _wei, _bia, _dst);
+}
+
+void GpuCkwDepthwiseConv2d::write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const
+{
+ // Data Layout is NHWC
+ const uint32_t width_idx = get_data_layout_dimension_index(_wei->data_layout(), DataLayoutDimension::WIDTH);
+ const uint32_t height_idx = get_data_layout_dimension_index(_wei->data_layout(), DataLayoutDimension::HEIGHT);
+
+ /********************************************************************************
+ * 1 - Define tensors
+ ********************************************************************************/
+ GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src");
+ GpuCkwComponentArgument *wei = vtable.declare_variable(comp_group, writer, _wei, "wei");
+ GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
+ GpuCkwComponentArgument *bia = nullptr;
+
+ const bool using_bias = _bia != nullptr;
+
+ if (using_bias)
+ {
+ bia = vtable.declare_variable(comp_group, writer, _bia, "bia");
+ }
+
+ /********************************************************************************
+ * 2 - Define CKW constants
+ ********************************************************************************/
+ const auto dst_dt = to_ckw(_dst->data_type());
+ const auto kernel_height = static_cast<int32_t>(_wei->dimension(height_idx));
+ const auto kernel_width = static_cast<int32_t>(_wei->dimension(width_idx));
+ const auto src_w = static_cast<int32_t>(_src->dimension(width_idx));
+ const auto src_h = static_cast<int32_t>(_src->dimension(height_idx));
+ const auto dst_h = static_cast<int32_t>(_dst->dimension(height_idx));
+ const auto stride_x = static_cast<int32_t>(_attributes.stride().x());
+ const auto stride_y = static_cast<int32_t>(_attributes.stride().y());
+ const auto pad_x = static_cast<int32_t>(_attributes.pad().left);
+ const auto pad_y = static_cast<int32_t>(_attributes.pad().top);
+ const auto depth_multiplier = static_cast<int32_t>(_attributes.depth_multiplier());
+ const auto dilation_x = static_cast<int32_t>(_attributes.dilation().x());
+ const auto dilation_y = static_cast<int32_t>(_attributes.dilation().y());
+ const auto kernel_size = kernel_width * kernel_height;
+
+ // CKW constants
+ auto const_kernel_w_i32 = writer->declare_constant_tile(ckw::ConstantData({{kernel_width}}, ckw::DataType::Int32));
+ auto const_kernel_size_i32 =
+ writer->declare_constant_tile(ckw::ConstantData({{kernel_size}}, ckw::DataType::Int32));
+ auto const_dst_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_h}}, ckw::DataType::Int32));
+ auto const_src_w_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_w}}, ckw::DataType::Int32));
+ auto const_src_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_h}}, ckw::DataType::Int32));
+ auto const_stride_x_i32 = writer->declare_constant_tile(ckw::ConstantData({{stride_x}}, ckw::DataType::Int32));
+ auto const_stride_y_i32 = writer->declare_constant_tile(ckw::ConstantData({{stride_y}}, ckw::DataType::Int32));
+ auto const_pad_x_i32 = writer->declare_constant_tile(ckw::ConstantData({{pad_x}}, ckw::DataType::Int32));
+ auto const_pad_y_i32 = writer->declare_constant_tile(ckw::ConstantData({{pad_y}}, ckw::DataType::Int32));
+ auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32));
+ auto const_neg_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{-1}}, ckw::DataType::Int32));
+ auto const_depth_multiplier_i32 =
+ writer->declare_constant_tile(ckw::ConstantData({{depth_multiplier}}, ckw::DataType::Int32));
+ auto const_dilation_x_i32 = writer->declare_constant_tile(ckw::ConstantData({{dilation_x}}, ckw::DataType::Int32));
+ auto const_dilation_y_i32 = writer->declare_constant_tile(ckw::ConstantData({{dilation_y}}, ckw::DataType::Int32));
+ auto const_0_fp = writer->declare_constant_tile(ckw::ConstantData({{0.0f}}, dst_dt));
+
+ /********************************************************************************
+ * 3 - Define the compute block parameters and destination tile (if not root component)
+ * Bind the tile to the tensor to share it among different components and
+ * initialize the compute block parameters
+ ********************************************************************************/
+ // The compute block parameters depend on the employed tensor format
+ const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
+
+ // Destination compute block size
+ const int32_t dst_n0 = root_window.x().step();
+ const int32_t dst_m0 = root_window.y().step();
+
+ // Destination compute block size left-over
+ const int32_t dst_n0_partial = _dst->dimension(0) % dst_n0;
+ const int32_t dst_m0_partial = _dst->dimension(1) % dst_m0;
+
+ // Shift-back for the overlapping-min strategy
+ const int32_t dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0;
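+ // e.g. with dst_n0 = 4 and a destination channel dimension of 10, dst_n0_partial = 2 and
+ // dst_shift_back = (4 - 2) % 4 = 2: the last compute block starts 2 elements earlier and
+ // overlaps the previous one instead of stepping out of bounds.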
+
+ const int32_t src_m0 = kernel_width + (dst_m0 - 1);
+ const int32_t src_n0 = depth_multiplier > 1 ? 1 : dst_n0;
+ const int32_t wei_m0 = kernel_width;
+ const int32_t wei_n0 = dst_n0;
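+ // e.g. kernel_width = 3 and dst_m0 = 2 give src_m0 = 4: the two output positions along the
+ // width read overlapping windows of 3 input values each, i.e. 4 distinct values per row.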
+
+ ckw::TensorSampler sampler_dst;
+ sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+ if (dst_n0_partial == 0)
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin);
+ }
+
+ if (dst_m0_partial == 0)
+ {
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::ClampToBorderMaxOnly);
+ }
+
+ sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+ // Declare destination tile
+ auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, dst_m0, dst_n0));
+
+ // Initialize the destination tile
+ writer->op_assign(tile_dst, const_0_fp);
+
+ // Bind tile to the tensor
+ dst->init_virtual_tensor(tile_dst, sampler_dst);
+
+ /********************************************************************************
+ * 4 - Define the compute block parameters CKW constants
+ ********************************************************************************/
+ // Only now can we declare N0 and M0 as constants
+ auto const_dst_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32));
+ auto const_dst_m0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_m0}}, ckw::DataType::Int32));
+ auto const_shift_back_dst_n0_i32 =
+ writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32));
+
+ /********************************************************************************
+ * 5 - Define the sampler for the input tensors
+ ********************************************************************************/
+ // SOURCE SAMPLER
+ ckw::TensorSampler sampler_src;
+ sampler_src.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+ sampler_src.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ sampler_src.address_mode_y(ckw::TensorSamplerAddressModeY::SkipLessThanZero);
+ sampler_src.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_src.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+ // WEIGHTS SAMPLER
+ // We cannot have out-of-bounds accesses for the weights
+ ckw::TensorSampler sampler_wei;
+ sampler_wei.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+ sampler_wei.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ sampler_wei.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ sampler_wei.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ if (_settings.export_weights_to_cl_image())
+ {
+ sampler_wei.storage(ckw::TensorStorageType::Texture2dReadOnly);
+ }
+ else
+ {
+ sampler_wei.storage(ckw::TensorStorageType::BufferUint8Ptr);
+ }
+
+ // BIAS SAMPLER
+ ckw::TensorSampler sampler_bia;
+ sampler_bia.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+ sampler_bia.address_mode_x(sampler_dst.address_mode_x());
+ sampler_bia.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ sampler_bia.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_bia.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+ /********************************************************************************
+ * 6 - Extra operations required before writing the main code (Optional)
+ ********************************************************************************/
+ // Not required
+
+ /********************************************************************************
+ * 7 - Get the coordinates of the destination tile
+ ********************************************************************************/
+ auto tile_gid_0 = writer->declare_tile("gid_0", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_1 = writer->declare_tile("gid_1", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_2 = writer->declare_tile("gid_2", ckw::TileInfo(ckw::DataType::Int32));
+
+ writer->op_get_global_id(tile_gid_0, 0);
+ writer->op_get_global_id(tile_gid_1, 1);
+ writer->op_get_global_id(tile_gid_2, 2);
+
+ auto tile_cout0 = writer->declare_tile("cout0", ckw::TileInfo(ckw::DataType::Int32)); // OFM
+ auto tile_mout0 = writer->declare_tile("mout0", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH
+ auto tile_mout1 = writer->declare_tile("mout1", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT
+ auto tile_bout0 = writer->declare_tile("bout0", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX
+
+ // Calculate coordinates
+ get_coordinate_from_gws_overlapping_min(writer, tile_cout0, tile_gid_0, const_dst_n0_i32,
+ const_shift_back_dst_n0_i32, const_0_i32);
+ get_coordinate_from_gws(writer, tile_mout0, tile_gid_1, const_dst_m0_i32);
+ writer->op_binary(tile_mout1, ckw::BinaryOp::Mod, tile_gid_2, const_dst_h_i32);
+ writer->op_binary(tile_bout0, ckw::BinaryOp::Div, tile_gid_2, const_dst_h_i32);
+
+ auto tile_src_ci = writer->declare_tile("src_ci", ckw::DataType::Int32);
+ writer->op_binary(tile_src_ci, ckw::BinaryOp::Div, tile_cout0, const_depth_multiplier_i32);
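+ // e.g. with depth_multiplier = 2, output channels 2k and 2k + 1 both read input channel k
+ // (src_ci = cout0 / 2); src_n0 was set to 1 above so each input channel is loaded separately.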
+
+ auto tile_src_xi = writer->declare_tile("src_xi", ckw::DataType::Int32);
+ writer->op_binary(tile_src_xi, ckw::BinaryOp::Mul, tile_mout0, const_stride_x_i32);
+ writer->op_binary(tile_src_xi, ckw::BinaryOp::Sub, tile_src_xi, const_pad_x_i32);
+
+ auto tile_src_yi = writer->declare_tile("src_yi", ckw::DataType::Int32);
+ writer->op_binary(tile_src_yi, ckw::BinaryOp::Mul, tile_mout1, const_stride_y_i32);
+ writer->op_binary(tile_src_yi, ckw::BinaryOp::Sub, tile_src_yi, const_pad_y_i32);
+
+ // Loop variables
+ auto tile_yk = writer->declare_tile("yk", ckw::DataType::Int32);
+
+ writer->op_assign(tile_yk, const_0_i32);
+
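+ // The loop below walks the kernel rows: yk goes from 0 to kernel_size - 1 in steps of
+ // kernel_width, e.g. for a 3x3 kernel (kernel_size = 9) yk takes the values 0, 3 and 6.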
+ // clang-format off
+ writer->op_for_loop(tile_yk, ckw::BinaryOp::Less, const_kernel_size_i32, tile_yk, ckw::AssignmentOp::Increment, const_kernel_w_i32,
+ [&]()
+ {
+ auto tile_src = writer->declare_tile("a", ckw::TileInfo(to_ckw(_src->data_type()), src_m0, src_n0));
+ auto tile_wei = writer->declare_tile("b", ckw::TileInfo(to_ckw(_wei->data_type()), wei_m0, wei_n0));
+
+ writer->op_assign(tile_src, const_0_fp);
+
+ auto tile_x_gte_0 = writer->declare_tile("x_gte_0", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_y_gte_0 = writer->declare_tile("y_gte_0", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_x_lt_w = writer->declare_tile("x_lt_w", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_y_lt_h = writer->declare_tile("y_lt_h", ckw::TileInfo(ckw::DataType::Int32));
+
+ // Check if yi + yk * DILATION_Y is out-of-bounds
+ writer->op_binary(tile_y_gte_0, ckw::BinaryOp::GreaterEqual, tile_src_yi, const_0_i32);
+ writer->op_binary(tile_y_lt_h, ckw::BinaryOp::Less, tile_src_yi, const_src_h_i32);
+
+ auto tile_src_mi = writer->declare_tile("src_mi", ckw::TileInfo(ckw::DataType::Int32));
+
+ // Load src
+ for(int32_t xk = 0; xk < src_m0; ++xk)
+ {
+ auto const_xk_i32 = writer->declare_constant_tile(ckw::ConstantData({{xk}}, ckw::DataType::Int32));
+
+ // xi + xk * DILATION_X
+ writer->op_binary(tile_src_mi, ckw::BinaryOp::Mul, const_xk_i32, const_dilation_x_i32);
+ writer->op_binary(tile_src_mi, ckw::BinaryOp::Add, tile_src_mi, tile_src_xi);
+
+ // Check if xi + xk * DILATION_X is out-of-bounds
+ writer->op_binary(tile_x_gte_0, ckw::BinaryOp::GreaterEqual, tile_src_mi, const_0_i32);
+ writer->op_binary(tile_x_lt_w, ckw::BinaryOp::Less, tile_src_mi, const_src_w_i32);
+
+ // Set mi to -1 if we have out-of-bounds memory accesses
+ writer->op_ternary(tile_src_mi, ckw::TernaryOp::Select, const_neg_1_i32, tile_src_mi, tile_x_gte_0);
+ writer->op_ternary(tile_src_mi, ckw::TernaryOp::Select, const_neg_1_i32, tile_src_mi, tile_x_lt_w);
+ writer->op_ternary(tile_src_mi, ckw::TernaryOp::Select, const_neg_1_i32, tile_src_mi, tile_y_gte_0);
+ writer->op_ternary(tile_src_mi, ckw::TernaryOp::Select, const_neg_1_i32, tile_src_mi, tile_y_lt_h);
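+ // e.g. with src_w = 8: xi + xk * DILATION_X = 3 keeps mi = 3, whereas -2 or 9 collapses mi
+ // to -1, so the row is skipped by the source sampler's SkipLessThanZero address mode.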
+
+ writer->op_load(tile_src.row(xk), src->tensor(), sampler_src, tile_src_ci, tile_src_mi, tile_src_yi, tile_bout0);
+ }
+
+ // Load wei
+ writer->op_load(tile_wei, wei->tensor(), sampler_wei, tile_cout0, tile_yk, const_0_i32, const_0_i32);
+
+ // Attention: the MAC (Multiply-and-Accumulate) ternary operator is currently unsupported in CKW.
+ // Therefore, this part should be replaced with the MAC ternary operator when available.
+ auto tile_tmp = writer->declare_tile("tmp", ckw::TileInfo(to_ckw(_src->data_type()), 1, dst_n0));
+ for(int32_t m0 = 0; m0 < dst_m0; ++m0)
+ {
+ for(int32_t xk = 0; xk < kernel_width; ++xk)
+ {
+ auto tile_a = tile_src.row(m0 + xk);
+ auto tile_b = tile_wei.row(xk);
+ auto tile_c = tile_dst.row(m0);
+
+ writer->op_binary(tile_tmp, ckw::BinaryOp::Mul, tile_a, tile_b);
+ writer->op_binary(tile_c, ckw::BinaryOp::Add, tile_c, tile_tmp);
+ }
+ }
+ writer->op_binary(tile_src_yi, ckw::BinaryOp::Add, tile_src_yi, const_dilation_y_i32);
+ });
+ // clang-format on
+
+ // Bias addition
+ // NOTE: This operation will be removed from this kernel as the interface is standardized. The intended way of
+ // performing bias addition is to fuse this convolution kernel with a following elementwise addition kernel.
+ if (using_bias)
+ {
+ if (!bia->has_tile())
+ {
+ auto tile_bia = writer->declare_tile("bia", ckw::TileInfo(to_ckw(_src->data_type()), 1, dst_n0));
+ writer->op_load(tile_bia, bia->tensor(), sampler_bia, tile_cout0, const_0_i32, const_0_i32, const_0_i32);
+ bia->init_virtual_tensor(tile_bia, sampler_bia);
+ }
+ auto &tile_bia = bia->tile();
+
+ writer->op_binary(tile_dst, ckw::BinaryOp::Add, tile_dst, tile_bia);
+ }
+
+ ARM_COMPUTE_ERROR_ON_MSG(dst->has_tile() == false, "You must bind a tile before appending another component");
+}
+
+Window GpuCkwDepthwiseConv2d::get_window() const
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
+ TensorShape output_shape = _dst->tensor_shape();
+
+ Window win = calculate_max_window(output_shape, Steps(_settings.n0(), _settings.m0()));
+ return win.collapse(win, Window::DimZ);
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.h
new file mode 100644
index 0000000000..a15d3ee710
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWDEPTHWISECONV2D_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWDEPTHWISECONV2D_H
+
+#include "arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h"
+
+#include "src/core/common/Macros.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+
+class GpuCkwDepthwiseConv2d : public IGpuCkwComponentDriver
+{
+public:
+ using Attributes = ClComponentDepthwiseConv2d::Attributes;
+ using Settings = ClComponentDepthwiseConv2d::Settings;
+
+ /** Constructor
+ *
+ * For supported configurations please refer to @ref ClComponentDepthwiseConv2d::validate()
+ *
+ * @param[in] id Component id
+ * @param[in] tensors Tensor arguments to the component
+ * @param[in] attributes Component attributes
+ * @param[in] settings Component settings
+ */
+ GpuCkwDepthwiseConv2d(ComponentId id,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings);
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwDepthwiseConv2d);
+ /** Destructor */
+ ~GpuCkwDepthwiseConv2d() override = default;
+ // Inherited methods overridden:
+ virtual void write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const override;
+ Window get_window() const override;
+
+private:
+ const ITensorInfo *_src;
+ const ITensorInfo *_wei;
+ const ITensorInfo *_bia;
+ const ITensorInfo *_dst;
+ Attributes _attributes;
+ Settings _settings;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWDEPTHWISECONV2D_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp
new file mode 100644
index 0000000000..eb4f644eb6
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp
@@ -0,0 +1,427 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
+
+#include "compute_kernel_writer/include/ckw/KernelWriter.h"
+#include <cstdint>
+#include <string>
+#include <vector>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+
+using TileContainer = std::vector<std::vector<int32_t>>;
+
+GpuCkwDirectConv2d::GpuCkwDirectConv2d(ComponentId id,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings)
+ : IGpuCkwComponentDriver{id, tensors}, _src{}, _wei{}, _bia{}, _dst{}, _attributes{attributes}, _settings{settings}
+{
+ _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
+ _wei = this->tensors().get_const_tensor(TensorType::ACL_SRC_1);
+ _bia = this->tensors().get_const_tensor(TensorType::ACL_SRC_2);
+ _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _wei, _dst); // Bias can be null
+}
+
+void GpuCkwDirectConv2d::write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const
+{
+ const auto desc = _settings.direct_conv_descriptor();
+ ARM_COMPUTE_ERROR_ON_MSG(desc.export_input_to_cl_image || desc.export_output_to_cl_image,
+ "Only the weights tensor can be exported to cl_image");
+
+ const uint32_t channel_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::CHANNEL);
+ const uint32_t width_idx = get_data_layout_dimension_index(_wei->data_layout(), DataLayoutDimension::WIDTH);
+ const uint32_t height_idx = get_data_layout_dimension_index(_wei->data_layout(), DataLayoutDimension::HEIGHT);
+
+ /********************************************************************************
+ * 1 - Define tensors
+ ********************************************************************************/
+ GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src");
+ GpuCkwComponentArgument *wei = vtable.declare_variable(comp_group, writer, _wei, "wei");
+ GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
+ GpuCkwComponentArgument *bia = nullptr;
+
+ const bool using_bias = _bia != nullptr;
+
+ if (using_bias)
+ {
+ bia = vtable.declare_variable(comp_group, writer, _bia, "bia");
+ }
+
+ /********************************************************************************
+ * 2 - Define CKW constants
+ ********************************************************************************/
+ const auto dst_dt = to_ckw(_dst->data_type());
+ const auto kernel_height = static_cast<int32_t>(_wei->dimension(height_idx));
+ const auto kernel_width = static_cast<int32_t>(_wei->dimension(width_idx));
+ const auto src_c = static_cast<int32_t>(_src->dimension(channel_idx));
+ const auto src_w = static_cast<int32_t>(_src->dimension(width_idx));
+ const auto src_h = static_cast<int32_t>(_src->dimension(height_idx));
+ const auto dst_w = static_cast<int32_t>(_dst->dimension(width_idx));
+ const auto stride_x = static_cast<int32_t>(_attributes.stride().x());
+ const auto stride_y = static_cast<int32_t>(_attributes.stride().y());
+ const auto pad_x = static_cast<int32_t>(_attributes.pad().left);
+ const auto pad_y = static_cast<int32_t>(_attributes.pad().top);
+ const auto kernel_size = kernel_width * kernel_height;
+ const auto k0 =
+ static_cast<int32_t>(adjust_vec_size(_settings.direct_conv_descriptor().k0, _src->dimension(channel_idx)));
+
+ // CKW constants
+ auto const_kernel_w_i32 = writer->declare_constant_tile(ckw::ConstantData({{kernel_width}}, ckw::DataType::Int32));
+ auto const_kernel_size_i32 =
+ writer->declare_constant_tile(ckw::ConstantData({{kernel_size}}, ckw::DataType::Int32));
+ auto const_src_c_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_c}}, ckw::DataType::Int32));
+ auto const_src_w_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_w}}, ckw::DataType::Int32));
+ auto const_src_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_h}}, ckw::DataType::Int32));
+ auto const_dst_w_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_w}}, ckw::DataType::Int32));
+ auto const_stride_x_i32 = writer->declare_constant_tile(ckw::ConstantData({{stride_x}}, ckw::DataType::Int32));
+ auto const_stride_y_i32 = writer->declare_constant_tile(ckw::ConstantData({{stride_y}}, ckw::DataType::Int32));
+ auto const_pad_x_i32 = writer->declare_constant_tile(ckw::ConstantData({{pad_x}}, ckw::DataType::Int32));
+ auto const_pad_y_i32 = writer->declare_constant_tile(ckw::ConstantData({{pad_y}}, ckw::DataType::Int32));
+ auto const_k0_i32 = writer->declare_constant_tile(ckw::ConstantData({{k0}}, ckw::DataType::Int32));
+ auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32));
+ auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32));
+ auto const_neg_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{-1}}, ckw::DataType::Int32));
+ auto const_0_fp = writer->declare_constant_tile(ckw::ConstantData({{0.0f}}, dst_dt));
+ auto const_src_c_i32_minus_k0_i32 =
+ writer->declare_constant_tile(ckw::ConstantData({{src_c - k0}}, ckw::DataType::Int32));
+
+ /********************************************************************************
+ * 3 - Define the compute block parameters and destination tile (if not root component)
+ * Bind the tile to the tensor to share it among different components and
+ * initialize the compute block parameters
+ ********************************************************************************/
+ // The compute block parameters depend on the employed tensor format
+ const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
+
+ // Destination compute block size
+ const int32_t dst_n0 = root_window.x().step();
+ const int32_t dst_m0 = root_window.y().step();
+
+ // Destination compute block size left-over
+ const int32_t dst_n0_partial = _dst->dimension(0) % dst_n0;
+ const int32_t dst_m0_partial = (_dst->dimension(1) * _dst->dimension(2)) % dst_m0;
+
+ // Shift-back for the overlapping-min strategy
+ const int32_t dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0;
+
+ ckw::TensorSampler sampler_dst;
+ sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1);
+ if (dst_n0_partial == 0)
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin);
+ }
+
+ if (dst_m0_partial == 0)
+ {
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::ClampToBorderMaxOnly);
+ }
+
+ sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+ // Declare destination tile
+ auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, dst_m0, dst_n0));
+
+ // Initialize destination tile
+ writer->op_assign(tile_dst, const_0_fp);
+
+ // Bind tile to the tensor
+ dst->init_virtual_tensor(tile_dst, sampler_dst);
+
+ /********************************************************************************
+ * 4 - Define the compute block parameters CKW constants
+ ********************************************************************************/
+ // Only now can we declare N0 and M0 as constants
+ auto const_dst_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32));
+ auto const_dst_m0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_m0}}, ckw::DataType::Int32));
+ auto const_shift_back_dst_n0_i32 =
+ writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32));
+
+ /********************************************************************************
+ * 5 - Define the samplers for the input tensors
+ ********************************************************************************/
+ // Exporting the weights tensor to an OpenCL image object is currently only supported when:
+ // a) k0 is equal to 4
+ // The current implementation expects to read a vector of 4 float values into the OpenCL image object.
+ // b) K is a multiple of 4
+ // This is a limitation in the current interface due to the variable table being responsible for maintaining
+ // information about the TensorStorageType rather than the TensorTileSampler. As a result, TensorStorageType cannot
+ // be reassigned, and we cannot use a texture object for the weights tensor in cases where we expect to have an
+ // extra loop to compute the left-over elements.
+ const bool use_cl_image_for_weights = desc.export_weights_to_cl_image && (k0 == 4) && (src_c % 4 == 0);
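+ // e.g. with the export requested, k0 = 4 and src_c = 24 the weights are read through a cl_image;
+ // with src_c = 10 the left-over loop is required, so a plain buffer is used instead.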
+
+ // SOURCE SAMPLER
+ // - We cannot have out-of-bounds reads in the X dimension (mapped to the IFMs) as we have an extra loop to
+ // compute left-over elements
+ // - We cannot have out-of-bounds reads in the Y dimension when the kernel height is equal to 1. In all other
+ // cases, out-of-bounds rows are marked with -1 in the indirection buffer mi and skipped by the
+ // SkipLessThanZero address mode.
+ auto address_mode_y_src =
+ kernel_height == 1 ? ckw::TensorSamplerAddressModeY::None : ckw::TensorSamplerAddressModeY::SkipLessThanZero;
+ ckw::TensorSampler sampler_src;
+ sampler_src.format(ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1); // 3rd dimension collapsed with 2nd dimension
+ sampler_src.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ sampler_src.address_mode_y(address_mode_y_src);
+ sampler_src.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_src.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+ // WEIGHTS SAMPLER
+ // We cannot have out-of-bounds accesses for the weights
+ ckw::TensorSampler sampler_wei;
+ sampler_wei.format(ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1); // 3rd dimension collapsed with 2nd dimension
+ sampler_wei.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ sampler_wei.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ sampler_wei.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ if (use_cl_image_for_weights)
+ {
+ sampler_wei.storage(ckw::TensorStorageType::Texture2dReadOnly);
+ }
+ else
+ {
+ sampler_wei.storage(ckw::TensorStorageType::BufferUint8Ptr);
+ }
+
+ // BIAS SAMPLER
+ ckw::TensorSampler sampler_bia;
+
+ if (using_bias)
+ {
+ sampler_bia.format(ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1);
+ sampler_bia.address_mode_x(sampler_dst.address_mode_x());
+ sampler_bia.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ sampler_bia.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_bia.storage(ckw::TensorStorageType::BufferUint8Ptr);
+ }
+
+ /********************************************************************************
+ * 6 - Extra operations required before writing the main code (optional)
+ ********************************************************************************/
+
+ // Not required
+
+ /********************************************************************************
+ * 7 - Get the coordinates of the destination tile
+ ********************************************************************************/
+ auto tile_gid_0 = writer->declare_tile("gid_0", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_1 = writer->declare_tile("gid_1", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_2 = writer->declare_tile("gid_2", ckw::TileInfo(ckw::DataType::Int32));
+
+ writer->op_get_global_id(tile_gid_0, 0);
+ writer->op_get_global_id(tile_gid_1, 1);
+ writer->op_get_global_id(tile_gid_2, 2);
+
+ auto tile_cout = writer->declare_tile("cout", ckw::TileInfo(ckw::DataType::Int32)); // OFM
+ auto tile_mout = writer->declare_tile("mout", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH x HEIGHT
+ auto tile_bout = writer->declare_tile("bout", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX
+
+ // Calculate coordinates
+ get_coordinate_from_gws_overlapping_min(writer, tile_cout, tile_gid_0, const_dst_n0_i32,
+ const_shift_back_dst_n0_i32, const_0_i32);
+ get_coordinate_from_gws(writer, tile_mout, tile_gid_1, const_dst_m0_i32);
+ get_coordinate_from_gws(writer, tile_bout, tile_gid_2, const_pos_1_i32);
+
+ /********************************************************************************
+ * 8 - Write the rest of the code
+ ********************************************************************************/
+ // We create a 2d container of size (dst_m0, 1) to store the indices for iteration
+ TileContainer it;
+ for (int32_t m = 0; m < dst_m0; ++m)
+ {
+ std::vector<int32_t> idx{m};
+ it.push_back({idx});
+ }
+
+ const auto &const_idxs = writer->declare_constant_tile(ckw::ConstantData(it, ckw::DataType::Int32));
+
+ auto tile_xi = writer->declare_tile("xi", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1));
+ auto tile_yi = writer->declare_tile("yi", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1));
+
+ // Convert the linear index to coordinate
+ // xi = ((mout + i) % dst_w) * stride_x - pad_x
+ // yi = ((mout + i) / dst_w) * stride_y - pad_y
+ writer->op_binary(tile_xi, ckw::BinaryOp::Add, tile_mout, const_idxs);
+ writer->op_binary(tile_yi, ckw::BinaryOp::Add, tile_mout, const_idxs);
+ writer->op_binary(tile_xi, ckw::BinaryOp::Mod, tile_xi, const_dst_w_i32);
+ writer->op_binary(tile_yi, ckw::BinaryOp::Div, tile_yi, const_dst_w_i32);
+ writer->op_binary(tile_xi, ckw::BinaryOp::Mul, tile_xi, const_stride_x_i32);
+ writer->op_binary(tile_yi, ckw::BinaryOp::Mul, tile_yi, const_stride_y_i32);
+ writer->op_binary(tile_xi, ckw::BinaryOp::Sub, tile_xi, const_pad_x_i32);
+ writer->op_binary(tile_yi, ckw::BinaryOp::Sub, tile_yi, const_pad_y_i32);
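+ // e.g. with dst_w = 8, stride = (2, 2), pad = (1, 1) and mout + i = 11:
+ // xi = (11 % 8) * 2 - 1 = 5 and yi = (11 / 8) * 2 - 1 = 1.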
+
+ auto tile_y_b = writer->declare_tile("y_b", ckw::TileInfo(ckw::DataType::Int32));
+ writer->op_binary(tile_y_b, ckw::BinaryOp::Mul, tile_cout, const_kernel_size_i32);
+
+ auto tile_i = writer->declare_tile("i", ckw::TileInfo(ckw::DataType::Int32));
+ writer->op_assign(tile_i, const_0_i32);
+
+ // clang-format off
+ writer->op_for_loop(tile_i, ckw::BinaryOp::Less, const_kernel_size_i32, tile_i, ckw::AssignmentOp::Increment, const_pos_1_i32, [&]()
+ {
+ auto tile_x_k = writer->declare_tile("x_k", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_y_k = writer->declare_tile("y_k", ckw::TileInfo(ckw::DataType::Int32));
+
+ writer->op_binary(tile_x_k, ckw::BinaryOp::Mod, tile_i, const_kernel_w_i32);
+ writer->op_binary(tile_y_k, ckw::BinaryOp::Div, tile_i, const_kernel_w_i32);
+
+ auto tile_ck = writer->declare_tile("ck", ckw::TileInfo(ckw::DataType::Int32));
+ writer->op_assign(tile_ck, const_0_i32);
+
+ // Construct an indirection buffer containing the precalculated addresses of elements in the source tensor
+ // x_s = xi + x_k
+ // y_s = yi + y_k
+ // mi = x_s + y_s * width;
+ // mi = select(-1, mi, x_s >= 0);
+ // mi = select(-1, mi, x_s < width);
+ // mi = select(-1, mi, y_s >= 0);
+ // mi = select(-1, mi, y_s < height);
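+ // e.g. with src_w = 8 and src_h = 6: (x_s, y_s) = (3, 2) gives mi = 3 + 2 * 8 = 19, while
+ // (x_s, y_s) = (-1, 2) or (3, 6) collapses mi to -1 to mark the row as out-of-bounds.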
+ auto tile_xs = writer->declare_tile("xs", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1));
+ auto tile_ys = writer->declare_tile("ys", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1));
+ auto tile_mi = writer->declare_tile("mi", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1));
+
+ auto tile_xs_gte_0 = writer->declare_tile("xs_gte_0", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1));
+ auto tile_ys_gte_0 = writer->declare_tile("ys_gte_0", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1));
+ auto tile_xs_lt_w = writer->declare_tile("xs_lt_w", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1));
+ auto tile_ys_lt_h = writer->declare_tile("ys_lt_h", ckw::TileInfo(ckw::DataType::Int32, dst_m0, 1));
+
+ writer->op_binary(tile_xs, ckw::BinaryOp::Add, tile_xi, tile_x_k);
+ writer->op_binary(tile_ys, ckw::BinaryOp::Add, tile_yi, tile_y_k);
+ writer->op_binary(tile_mi, ckw::BinaryOp::Mul, tile_ys, const_src_w_i32);
+ writer->op_binary(tile_mi, ckw::BinaryOp::Add, tile_mi, tile_xs);
+ writer->op_binary(tile_xs_gte_0, ckw::BinaryOp::GreaterEqual, tile_xs, const_0_i32);
+ writer->op_binary(tile_ys_gte_0, ckw::BinaryOp::GreaterEqual, tile_ys, const_0_i32);
+ writer->op_binary(tile_xs_lt_w, ckw::BinaryOp::Less, tile_xs, const_src_w_i32);
+ writer->op_binary(tile_ys_lt_h, ckw::BinaryOp::Less, tile_ys, const_src_h_i32);
+ writer->op_ternary(tile_mi, ckw::TernaryOp::Select, const_neg_1_i32, tile_mi, tile_xs_gte_0);
+ writer->op_ternary(tile_mi, ckw::TernaryOp::Select, const_neg_1_i32, tile_mi, tile_ys_gte_0);
+ writer->op_ternary(tile_mi, ckw::TernaryOp::Select, const_neg_1_i32, tile_mi, tile_xs_lt_w);
+ writer->op_ternary(tile_mi, ckw::TernaryOp::Select, const_neg_1_i32, tile_mi, tile_ys_lt_h);
+
+ writer->op_for_loop(tile_ck, ckw::BinaryOp::LessEqual, const_src_c_i32_minus_k0_i32, tile_ck, ckw::AssignmentOp::Increment, const_k0_i32, [&]()
+ {
+ auto tile_lhs = writer->declare_tile("lhs", ckw::TileInfo(to_ckw(_src->data_type()), dst_m0, k0));
+ auto tile_rhs = writer->declare_tile("rhs", ckw::TileInfo(to_ckw(_wei->data_type()), dst_n0, k0));
+ writer->op_assign(tile_lhs, const_0_fp);
+ writer->op_assign(tile_rhs, const_0_fp);
+
+ writer->op_load_indirect(tile_lhs, src->tensor(), sampler_src, tile_ck, tile_mi, const_0_i32, tile_bout);
+ writer->op_load_dilated(tile_rhs, wei->tensor(), sampler_wei, tile_ck, tile_y_b, const_0_i32, const_0_i32, const_pos_1_i32, const_kernel_size_i32);
+
+ writer->op_binary(tile_dst, ckw::BinaryOp::MatMul_Nt_T, tile_lhs, tile_rhs);
+ });
+
+ // Left-over accumulations for when K is not a multiple of k0
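+ // e.g. with src_c = 10 and k0 = 4, the main loop above covers channels 0-7 in two steps of 4
+ // and this loop then accumulates channels 8 and 9 one at a time.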
+ if((src_c % k0) != 0)
+ {
+ writer->op_for_loop(tile_ck, ckw::BinaryOp::Less, const_src_c_i32, tile_ck, ckw::AssignmentOp::Increment, const_pos_1_i32, [&]()
+ {
+ auto tile_lhs = writer->declare_tile("lhs_leftover", ckw::TileInfo(to_ckw(_src->data_type()), dst_m0, 1));
+ auto tile_rhs = writer->declare_tile("rhs_leftover", ckw::TileInfo(to_ckw(_wei->data_type()), dst_n0, 1));
+ writer->op_assign(tile_lhs, const_0_fp);
+ writer->op_assign(tile_rhs, const_0_fp);
+
+ writer->op_load_indirect(tile_lhs, src->tensor(), sampler_src, tile_ck, tile_mi, const_0_i32, tile_bout);
+ writer->op_load_dilated(tile_rhs, wei->tensor(), sampler_wei, tile_ck, tile_y_b, const_0_i32, const_0_i32, const_pos_1_i32, const_kernel_size_i32);
+
+ writer->op_binary(tile_dst, ckw::BinaryOp::MatMul_Nt_T, tile_lhs, tile_rhs);
+ });
+ }
+
+ writer->op_binary(tile_y_b, ckw::BinaryOp::Add, tile_y_b, const_pos_1_i32);
+ });
+ // clang-format on
+
+ // NOTE: The bias addition will be removed from this kernel as the interface is standardized. The intended way of
+ // performing bias addition is to fuse this convolution kernel with a following elementwise addition kernel.
+ if (using_bias)
+ {
+ if (!bia->has_tile())
+ {
+ auto tile_bia = writer->declare_tile("bia", ckw::TileInfo(to_ckw(_src->data_type()), 1, dst_n0));
+ writer->op_load(tile_bia, bia->tensor(), sampler_bia, tile_cout, const_0_i32, const_0_i32, const_0_i32);
+ bia->init_virtual_tensor(tile_bia, sampler_bia);
+ }
+ auto &tile_bia = bia->tile();
+
+ writer->op_binary(tile_dst, ckw::BinaryOp::Add, tile_dst, tile_bia);
+ }
+
+ ARM_COMPUTE_ERROR_ON_MSG(dst->has_tile() == false, "You must bind a tile before appending another component");
+}
+
+Window GpuCkwDirectConv2d::get_window() const
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
+
+ const auto dst_shape = _dst->tensor_shape();
+ const auto desc = _settings.direct_conv_descriptor();
+
+ const uint32_t dst_n0 = adjust_vec_size(desc.n0, dst_shape[0]);
+ const uint32_t dst_m0 = adjust_vec_size(desc.m0, dst_shape[1] * dst_shape[2]);
+
+ Window win = calculate_max_window(dst_shape, Steps(dst_n0, dst_m0));
+
+ const size_t dim_y_collapsed = ceil_to_multiple(dst_shape[1] * dst_shape[2], dst_m0);
+ win.set(Window::DimY, Window::Dimension(0, dim_y_collapsed, dst_m0));
+ win.set(Window::DimZ, Window::Dimension(0, dst_shape.total_size_upper(3), 1));
+
+ return win;
+}
+
+std::string GpuCkwDirectConv2d::get_name(const ComponentGroup &comp_group) const
+{
+ ARM_COMPUTE_UNUSED(comp_group);
+
+ return "direct_conv2d";
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h
new file mode 100644
index 0000000000..139cf620e2
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWDIRECTCONV2D_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWDIRECTCONV2D_H
+
+#include "arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h"
+
+#include "src/core/common/Macros.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class GpuCkwDirectConv2d : public IGpuCkwComponentDriver
+{
+public:
+ using Attributes = ClComponentDirectConv2d::Attributes;
+ using Settings = ClComponentDirectConv2d::Settings;
+
+public:
+ /** Constructor
+ *
+ * For supported configurations please refer to @ref ClComponentDirectConv2d::validate()
+ *
+ * @param[in] id Component id
+ * @param[in] tensors Tensor arguments to the component
+ * @param[in] attributes Component attributes. Attributes are a set of parameters that define what a component does
+ * @param[in] settings Component settings. Settings are a set of parameters that influence the implementation of a component
+ */
+ GpuCkwDirectConv2d(ComponentId id,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings);
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwDirectConv2d);
+ /** Destructor */
+ ~GpuCkwDirectConv2d() override = default;
+
+ // Inherited methods overridden
+ virtual void write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const override;
+ Window get_window() const override;
+ std::string get_name(const ComponentGroup &comp_group) const override;
+
+private:
+ const ITensorInfo *_src;
+ const ITensorInfo *_wei;
+ const ITensorInfo *_bia;
+ const ITensorInfo *_dst;
+
+ Attributes _attributes;
+ Settings _settings;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWDIRECTCONV2D_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp
new file mode 100644
index 0000000000..fb55acad53
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp
@@ -0,0 +1,434 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "GpuCkwElementwiseBinary.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
+#include "src/dynamic_fusion/sketch/gpu/components/utils/type_printer/ElementwiseBinary.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "support/StringSupport.h"
+
+#include "compute_kernel_writer/include/ckw/KernelWriter.h"
+#include "compute_kernel_writer/include/ckw/types/ConstantData.h"
+#include "compute_kernel_writer/include/ckw/types/TensorSamplerTypes.h"
+#include <cstdint>
+#include <string>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+GpuCkwElementwiseBinary::GpuCkwElementwiseBinary(ComponentId id,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes)
+ : IGpuCkwComponentDriver{id, tensors}, _lhs{}, _rhs{}, _dst{}, _attributes{attributes}
+{
+ _lhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
+ _rhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_1);
+ _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_lhs, _rhs, _dst);
+}
+
+void GpuCkwElementwiseBinary::write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const
+{
+ /********************************************************************************
+ * 1 - Define tensors
+ ********************************************************************************/
+ GpuCkwComponentArgument *lhs = vtable.declare_variable(comp_group, writer, _lhs, "lhs");
+ GpuCkwComponentArgument *rhs = vtable.declare_variable(comp_group, writer, _rhs, "rhs");
+ GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
+
+ /********************************************************************************
+ * 2 - Define CKW constants
+ ********************************************************************************/
+ const auto dst_h = static_cast<int32_t>(_dst->dimension(1));
+
+ // CKW constants
+ auto const_dst_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_h}}, ckw::DataType::Int32));
+ auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32));
+ auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32));
+
+ /********************************************************************************
+ * 3 - Define the compute block parameters and destination tile (if not root component)
+ * Bind the tile to the tensor to share it among different components and
+ * initialize the compute block parameters
+ ********************************************************************************/
+ // The compute block parameters depend on the employed tensor format
+
+ // Destination compute block size
+ int32_t dst_n0 = -1;
+ int32_t dst_m0 = -1;
+
+ // Destination compute block size left-over
+ int32_t dst_n0_partial = -1;
+ int32_t dst_m0_partial = -1;
+
+ if (!dst->has_tile())
+ {
+ // If ROOT component, we use ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1
+ // as tensor format
+ const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
+
+ dst_n0 = root_window.x().step();
+ dst_m0 = root_window.y().step();
+ dst_n0_partial = _dst->dimension(0) % dst_n0;
+ dst_m0_partial = (_dst->dimension(1) * _dst->dimension(2)) % dst_m0;
+
+ ckw::TensorSampler sampler_dst;
+ sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1);
+ if (dst_n0_partial == 0)
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin);
+ }
+
+ if (dst_m0_partial == 0)
+ {
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::ClampToBorderMaxOnly);
+ }
+ sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+ // Declare destination tile
+ ckw::DataType dst_dt = to_ckw(_dst->data_type());
+ auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, dst_m0, dst_n0));
+
+ // Bind tile to the tensor
+ dst->init_virtual_tensor(tile_dst, sampler_dst);
+ }
+ else
+ {
+ // Change dst_n0 and dst_m0 if NOT root component!
+ dst_n0 = dst->tile().tile_info().width();
+ dst_m0 = dst->tile().tile_info().height();
+
+ // Here, dst_n0_partial and dst_m0_partial do not need to be calculated because,
+ // if we reach this branch, the element-wise op is not the root component and the
+ // address modes have already been set.
+
+ const auto &tile_dst = dst->tile();
+
+ /********************************************************************************
+ * 4 - Define the compute block parameters CKW constants
+ ********************************************************************************/
+ // Not required
+
+ /********************************************************************************
+ * 5 - Define the samplers for the input tensors
+ ********************************************************************************/
+ // Check whether the lhs operand already has a tile or is still a tensor
+ // If it is still a tensor, create a sampler and load its content into a tile
+ if (!lhs->has_tile())
+ {
+ // Sampler
+ ckw::TensorSampler sampler_lhs = dst->tensor_sampler();
+
+ bool broadcast_x = false;
+ bool broadcast_y = false;
+
+ int32_t lhs_n0 = dst_n0;
+ int32_t lhs_m0 = dst_m0;
+
+ // Check whether we have broadcasting
+ // In case of broadcast, lhs can only be a vector or scalar.
+ // Broadcasting in other dimensions is not supported
+ if (_dst->dimension(0) != _lhs->dimension(0))
+ {
+ broadcast_x = true;
+ lhs_n0 = 1;
+ }
+
+ if (sampler_lhs.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1)
+ {
+ if (_dst->dimension(1) * _dst->dimension(2) != _lhs->dimension(1) * _lhs->dimension(2))
+ {
+ broadcast_y = true;
+ lhs_m0 = 1;
+ }
+ }
+ else if (sampler_lhs.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2)
+ {
+ if (_dst->dimension(1) != _lhs->dimension(1))
+ {
+ broadcast_y = true;
+ lhs_m0 = 1;
+ }
+ }
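+ // e.g. an lhs of shape (32, 1, 1) combined with a dst of shape (32, 16, 4) sets broadcast_y
+ // to true and lhs_m0 to 1, so a single row is loaded and reused for the whole destination tile.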
+
+ const int32_t lhs_partial_n0 = _lhs->dimension(0) % lhs_n0;
+ const int32_t lhs_shift_back = (lhs_n0 - lhs_partial_n0) % lhs_n0;
+
+ // Constants
+ auto const_lhs_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{lhs_n0}}, ckw::DataType::Int32));
+ auto const_lhs_m0_i32 = writer->declare_constant_tile(ckw::ConstantData({{lhs_m0}}, ckw::DataType::Int32));
+ auto const_lhs_shift_back_n0_i32 =
+ writer->declare_constant_tile(ckw::ConstantData({{lhs_shift_back}}, ckw::DataType::Int32));
+
+ auto tile_gid_0 = writer->declare_tile("gid_0_lhs", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_1 = writer->declare_tile("gid_1_lhs", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_2 = writer->declare_tile("gid_2_lhs", ckw::TileInfo(ckw::DataType::Int32));
+
+ writer->op_get_global_id(tile_gid_0, 0);
+ writer->op_get_global_id(tile_gid_1, 1);
+ writer->op_get_global_id(tile_gid_2, 2);
+
+ auto tile_cout0 = writer->declare_tile("cout0_lhs", ckw::TileInfo(ckw::DataType::Int32)); // OFM
+ auto tile_mout0 =
+ writer->declare_tile("mout0_lhs", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH or WIDTH x HEIGHT
+ auto tile_mout1 = writer->declare_tile("mout1_lhs", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT or 0
+ auto tile_bout0 = writer->declare_tile("bout0_lhs", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX
+
+ // Calculate coordinates
+ if (!broadcast_x)
+ {
+ get_coordinate_from_gws_overlapping_min(writer, tile_cout0, tile_gid_0, const_lhs_n0_i32,
+ const_lhs_shift_back_n0_i32, const_0_i32);
+ }
+ else
+ {
+ writer->op_assign(tile_cout0, const_0_i32);
+ }
+
+ if (!broadcast_y)
+ {
+ get_coordinate_from_gws(writer, tile_mout0, tile_gid_1, const_lhs_m0_i32);
+ }
+ else
+ {
+ writer->op_assign(tile_mout0, const_0_i32);
+ }
+
+ // Get the boundary aware coordinates at each global dimension index
+ if (sampler_lhs.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1)
+ {
+ writer->op_assign(tile_mout1, const_0_i32);
+ get_coordinate_from_gws(writer, tile_bout0, tile_gid_2, const_pos_1_i32);
+ }
+ else if (sampler_lhs.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2)
+ {
+ // For tile_mout1 and tile_bout0 the step can only be 1
+ if (!broadcast_y)
+ {
+ writer->op_binary(tile_mout1, ckw::BinaryOp::Mod, tile_gid_2, const_dst_h_i32);
+ }
+ else
+ {
+ // If broadcast_y == true, it means that we have either a scalar or vector
+ // because broadcasting in other dimensions is not supported
+ writer->op_assign(tile_mout1, const_0_i32);
+ }
+
+ writer->op_binary(tile_bout0, ckw::BinaryOp::Div, tile_gid_2, const_dst_h_i32);
+ }
+
+ ckw::DataType lhs_dt = to_ckw(_lhs->data_type());
+ auto tile_lhs = writer->declare_tile("lhs", ckw::TileInfo(lhs_dt, lhs_m0, lhs_n0));
+
+ writer->op_load(tile_lhs, lhs->tensor(), sampler_lhs, tile_cout0, tile_mout0, tile_mout1, tile_bout0);
+
+ // Here, init_virtual_tensor() is used to bring the tile_lhs outside the compound statement
+ lhs->init_virtual_tensor(tile_lhs, sampler_lhs);
+ }
+
+ // Check whether the rhs operand already has a tile or is still a tensor
+ // If it is still a tensor, create a sampler and load its content into a tile
+ if (!rhs->has_tile())
+ {
+ // Sampler
+ ckw::TensorSampler sampler_rhs = dst->tensor_sampler();
+
+ bool broadcast_x = false;
+ bool broadcast_y = false;
+
+ int32_t rhs_n0 = dst_n0;
+ int32_t rhs_m0 = dst_m0;
+
+ // Check whether we have broadcasting
+ // In case of broadcast, rhs can only be a vector or scalar.
+ // Broadcasting in other dimensions is not supported
+ if (_dst->dimension(0) != _rhs->dimension(0))
+ {
+ broadcast_x = true;
+ rhs_n0 = 1;
+ }
+
+ if (sampler_rhs.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1)
+ {
+ if (_dst->dimension(1) * _dst->dimension(2) != _rhs->dimension(1) * _rhs->dimension(2))
+ {
+ broadcast_y = true;
+ rhs_m0 = 1;
+ }
+ }
+ else if (sampler_rhs.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2)
+ {
+ if (_dst->dimension(1) != _rhs->dimension(1))
+ {
+ broadcast_y = true;
+ rhs_m0 = 1;
+ }
+ }
+
+ const int32_t rhs_partial_n0 = _rhs->dimension(0) % rhs_n0;
+ const int32_t rhs_shift_back = (rhs_n0 - rhs_partial_n0) % rhs_n0;
+
+ // Constants
+ auto const_rhs_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{rhs_n0}}, ckw::DataType::Int32));
+ auto const_rhs_m0_i32 = writer->declare_constant_tile(ckw::ConstantData({{rhs_m0}}, ckw::DataType::Int32));
+ auto const_rhs_shift_back_n0_i32 =
+ writer->declare_constant_tile(ckw::ConstantData({{rhs_shift_back}}, ckw::DataType::Int32));
+
+ auto tile_gid_0 = writer->declare_tile("gid_0_rhs", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_1 = writer->declare_tile("gid_1_rhs", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_2 = writer->declare_tile("gid_2_rhs", ckw::TileInfo(ckw::DataType::Int32));
+
+ writer->op_get_global_id(tile_gid_0, 0);
+ writer->op_get_global_id(tile_gid_1, 1);
+ writer->op_get_global_id(tile_gid_2, 2);
+
+ auto tile_cout0 = writer->declare_tile("cout0_rhs", ckw::TileInfo(ckw::DataType::Int32)); // OFM
+ auto tile_mout0 =
+ writer->declare_tile("mout0_rhs", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH or WIDTH x HEIGHT
+ auto tile_mout1 = writer->declare_tile("mout1_rhs", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT or 0
+ auto tile_bout0 = writer->declare_tile("bout0_rhs", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX
+
+ // Calculate coordinates
+ if (!broadcast_x)
+ {
+ get_coordinate_from_gws_overlapping_min(writer, tile_cout0, tile_gid_0, const_rhs_n0_i32,
+ const_rhs_shift_back_n0_i32, const_0_i32);
+ }
+ else
+ {
+ writer->op_assign(tile_cout0, const_0_i32);
+ }
+
+ if (!broadcast_y)
+ {
+ get_coordinate_from_gws(writer, tile_mout0, tile_gid_1, const_rhs_m0_i32);
+ }
+ else
+ {
+ writer->op_assign(tile_mout0, const_0_i32);
+ }
+
+ // Get the boundary aware coordinates at each global dimension index
+ if (sampler_rhs.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1)
+ {
+ writer->op_assign(tile_mout1, const_0_i32);
+ get_coordinate_from_gws(writer, tile_bout0, tile_gid_2, const_pos_1_i32);
+ }
+ else if (sampler_rhs.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2)
+ {
+ // For tile_mout1 and tile_bout0 the step can only be 1
+ const auto src_w = static_cast<int32_t>(_rhs->dimension(1));
+ auto const_src_w = writer->declare_constant_tile(ckw::ConstantData({{src_w}}, ckw::DataType::Int32));
+ if (!broadcast_y)
+ {
+ writer->op_binary(tile_mout1, ckw::BinaryOp::Mod, tile_gid_2, const_src_w);
+ }
+ else
+ {
+ // If broadcast_y == true, it means that we have either a scalar or vector
+ // because broadcasting in other dimensions is not supported
+ writer->op_assign(tile_mout1, const_0_i32);
+ }
+
+ writer->op_binary(tile_bout0, ckw::BinaryOp::Div, tile_gid_2, const_src_w);
+ }
+
+ ckw::DataType rhs_dt = to_ckw(_rhs->data_type());
+ auto tile_rhs = writer->declare_tile("rhs", ckw::TileInfo(rhs_dt, rhs_m0, rhs_n0));
+
+ writer->op_load(tile_rhs, rhs->tensor(), sampler_rhs, tile_cout0, tile_mout0, tile_mout1, tile_bout0);
+
+ // Here, init_virtual_tensor() is used to bring the tile_rhs outside the compound statement
+ rhs->init_virtual_tensor(tile_rhs, sampler_rhs);
+ }
+
+ const auto &tile_lhs = lhs->tile();
+ const auto &tile_rhs = rhs->tile();
+
+ /********************************************************************************
+ * 7 - Write the rest of the code
+ ********************************************************************************/
+ // Perform the element-wise operation
+ writer->op_binary(tile_dst, to_ckw(_attributes), tile_lhs, tile_rhs);
+
+ ARM_COMPUTE_ERROR_ON_MSG(dst->has_tile() == false, "You must bind a tile before appending another component");
+}
+
+Window GpuCkwElementwiseBinary::get_window() const
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
+
+ TensorShape output_shape = _dst->tensor_shape();
+ // Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) unchanged
+ // This is in line with the collapsing convention used by operators like Conv2d
+ output_shape.collapse(2U, 1U);
+ constexpr uint32_t vector_size_byte_opencl = 16;
+ const uint32_t num_elems_processed_per_iteration =
+ adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0));
+ Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration));
+
+ return win;
+}
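+// Sizing example (illustrative only): for an F32 destination, element_size() is 4 bytes, so up to
+// 16 / 4 = 4 elements are processed per iteration; adjust_vec_size() lowers this further when
+// dimension 0 of the destination is smaller than 4.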
+
+std::string GpuCkwElementwiseBinary::get_name(const ComponentGroup &comp_group) const
+{
+ ARM_COMPUTE_UNUSED(comp_group);
+ const std::vector<std::string> build_params = {
+ "elementwise_binary",
+ "op",
+ to_string(_attributes.operation()),
+ "dt",
+ lower_string(string_from_data_type(_dst->data_type())),
+ };
+ return join(build_params, "_");
+}
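+// Naming example (illustrative only): for an addition on an F32 destination the generated name has
+// the form "elementwise_binary_op_<Op>_dt_f32", where the <Op> token is whatever
+// to_string(_attributes.operation()) returns for the configured operation.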
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h
new file mode 100644
index 0000000000..c6cbba28d3
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWELEMENTWISEBINARY_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWELEMENTWISEBINARY_H
+
+#include "src/core/common/Macros.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class GpuCkwElementwiseBinary : public IGpuCkwComponentDriver
+{
+public:
+ using Attributes = ClComponentElementwiseBinary::Attributes;
+ /** Constructor
+ *
+ * For supported configurations please refer to @ref ClComponentElementwiseBinary::validate()
+ *
+ * @param[in] id Component id
+ * @param[in] tensors Tensor arguments to the component
+ * @param[in] attributes Component attributes
+ */
+ GpuCkwElementwiseBinary(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes);
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwElementwiseBinary);
+ /** Destructor */
+ ~GpuCkwElementwiseBinary() override = default;
+    // Inherited methods overridden:
+ virtual void write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const override;
+ Window get_window() const override;
+ std::string get_name(const ComponentGroup &comp_group) const override;
+
+private:
+ const ITensorInfo *_lhs;
+ const ITensorInfo *_rhs;
+ const ITensorInfo *_dst;
+ Attributes _attributes;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWELEMENTWISEBINARY_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.cpp
new file mode 100644
index 0000000000..14ad3847fc
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.cpp
@@ -0,0 +1,287 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
+#include "support/StringSupport.h"
+
+#include "compute_kernel_writer/include/ckw/KernelWriter.h"
+#include <cstdint>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+
+GpuCkwMatMul::GpuCkwMatMul(ComponentId id,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings)
+ : IGpuCkwComponentDriver{id, tensors}, _lhs{}, _rhs{}, _dst{}, _attributes{attributes}, _settings{settings}
+{
+ _lhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
+ _rhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_1);
+ _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_lhs, _rhs, _dst);
+}
+
+void GpuCkwMatMul::write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const
+{
+ /********************************************************************************
+ * 1 - Define tensors
+ ********************************************************************************/
+ GpuCkwComponentArgument *lhs = vtable.declare_variable(comp_group, writer, _lhs, "lhs");
+ GpuCkwComponentArgument *rhs = vtable.declare_variable(comp_group, writer, _rhs, "rhs");
+ GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
+
+ /********************************************************************************
+ * 2 - Define CKW constants
+ ********************************************************************************/
+ const auto k =
+ _attributes.adj_lhs() ? static_cast<int32_t>(_lhs->dimension(1)) : static_cast<int32_t>(_lhs->dimension(0));
+ const auto k0 = static_cast<int32_t>(adjust_vec_size(_settings.k0(), k));
+ const auto dst_dt = to_ckw(_dst->data_type());
+
+ // CKW constants
+ auto const_k_i32 = writer->declare_constant_tile(ckw::ConstantData({{k}}, ckw::DataType::Int32));
+ auto const_k0_i32 = writer->declare_constant_tile(ckw::ConstantData({{k0}}, ckw::DataType::Int32));
+ auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32));
+ auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32));
+ auto const_0_fp = writer->declare_constant_tile(ckw::ConstantData({{0.0f}}, dst_dt));
+ auto const_k_minus_k0_i32 = writer->declare_constant_tile(ckw::ConstantData({{k - k0}}, ckw::DataType::Int32));
+
+ /********************************************************************************
+ * 3 - Define the compute block parameters and destination tile (if not root component)
+ * Bind the tile to the tensor to share it among different components and
+ * initialize the compute block parameters
+ ********************************************************************************/
+    // The n0 and m0 parameters from root_window only refer to the output
+ const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
+
+ // Destination compute block size
+ const int32_t dst_n0 = root_window.x().step();
+ const int32_t dst_m0 = root_window.y().step();
+
+ // Destination compute block size left-over
+ const int32_t dst_n0_partial = _dst->dimension(0) % dst_n0;
+ const int32_t dst_m0_partial = _dst->dimension(1) % dst_m0;
+
+ // Shift-back for the overlapping-min strategy
+ const int32_t dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0;
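+    // Worked example (illustrative only, assuming the helper emits the usual
+    // coord = max(gid * n0 - shift_back, 0) pattern): with N = 10 and dst_n0 = 4 we get
+    // dst_n0_partial = 2 and dst_shift_back = 2, so the compute blocks start at columns 0, 2 and 6;
+    // every 4-wide block stays inside [0, 10) at the cost of recomputing two overlapping columns.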
+
+ ckw::TensorSampler sampler_dst;
+ sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+ if (dst_n0_partial == 0)
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin);
+ }
+
+ if (dst_m0_partial == 0)
+ {
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::ClampToBorderMaxOnly);
+ }
+
+ sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+ // Declare destination tile
+ auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, dst_m0, dst_n0));
+
+ // Initialize destination tile
+ writer->op_assign(tile_dst, const_0_fp);
+
+ // Bind tile to the tensor
+ dst->init_virtual_tensor(tile_dst, sampler_dst);
+
+ /********************************************************************************
+ * 4 - Define the compute block parameters CKW constants
+ ********************************************************************************/
+    // Only now can we declare N0 and M0 as constants
+ auto const_dst_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32));
+ auto const_dst_m0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_m0}}, ckw::DataType::Int32));
+ auto const_shift_back_dst_n0_i32 =
+ writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32));
+
+ /********************************************************************************
+ * 5 - Define the samplers for the input tensors
+ ********************************************************************************/
+ // LHS SAMPLER
+    // The assumption here is that M is a multiple of M0. This limitation will be removed once
+    // OverlappingMin is supported as the address mode for the Y direction
+ ckw::TensorSampler sampler_lhs;
+ sampler_lhs.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+ sampler_lhs.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ sampler_lhs.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ sampler_lhs.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_lhs.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+ // RHS SAMPLER
+ ckw::TensorSampler sampler_rhs;
+ sampler_rhs.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+ sampler_rhs.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ sampler_rhs.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ sampler_rhs.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_rhs.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+ /********************************************************************************
+ * 6 - Extra operations required before writing the main code (optional)
+ ********************************************************************************/
+
+ // Not required
+
+ /********************************************************************************
+ * 7 - Get the coordinates of the destination tile
+ ********************************************************************************/
+ auto tile_gid_0 = writer->declare_tile("gid_0", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_1 = writer->declare_tile("gid_1", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_2 = writer->declare_tile("gid_2", ckw::TileInfo(ckw::DataType::Int32));
+
+ writer->op_get_global_id(tile_gid_0, 0);
+ writer->op_get_global_id(tile_gid_1, 1);
+ writer->op_get_global_id(tile_gid_2, 2);
+
+ auto tile_idx_n = writer->declare_tile("idx_n", ckw::TileInfo(ckw::DataType::Int32)); // N index
+ auto tile_idx_m = writer->declare_tile("idx_m", ckw::TileInfo(ckw::DataType::Int32)); // M index
+ auto tile_idx_b = writer->declare_tile("idx_b", ckw::TileInfo(ckw::DataType::Int32)); // BATCH index
+
+ // Calculate coordinates
+ get_coordinate_from_gws_overlapping_min(writer, tile_idx_n, tile_gid_0, const_dst_n0_i32,
+ const_shift_back_dst_n0_i32, const_0_i32);
+ get_coordinate_from_gws(writer, tile_idx_m, tile_gid_1, const_dst_m0_i32);
+ get_coordinate_from_gws(writer, tile_idx_b, tile_gid_2, const_pos_1_i32);
+
+ /********************************************************************************
+ * 8 - Write the rest of the code
+ ********************************************************************************/
+ auto tile_idx_k = writer->declare_tile("idx_k", ckw::TileInfo(ckw::DataType::Int32)); // K index
+
+ writer->op_assign(tile_idx_k, const_0_i32);
+
+ // clang-format off
+ writer->op_for_loop(tile_idx_k, ckw::BinaryOp::LessEqual, const_k_minus_k0_i32, tile_idx_k, ckw::AssignmentOp::Increment, const_k0_i32,
+ [&]()
+ {
+ auto tile_lhs = writer->declare_tile("lhs", ckw::TileInfo(to_ckw(_lhs->data_type()), dst_m0, k0));
+ auto tile_rhs = writer->declare_tile("rhs", ckw::TileInfo(to_ckw(_rhs->data_type()), dst_n0, k0));
+ writer->op_assign(tile_lhs, const_0_fp);
+ writer->op_assign(tile_rhs, const_0_fp);
+
+ writer->op_load(tile_lhs, lhs->tensor(), sampler_lhs, tile_idx_k, tile_idx_m, tile_idx_b, const_0_i32);
+ writer->op_load(tile_rhs, rhs->tensor(), sampler_rhs, tile_idx_k, tile_idx_n, tile_idx_b, const_0_i32);
+
+ writer->op_binary(tile_dst, ckw::BinaryOp::MatMul_Nt_T, tile_lhs, tile_rhs);
+
+ });
+
+ // Left-over accumulations for when K is not a multiple of k0
+    if ((k % k0) != 0)
+ {
+ writer->op_for_loop(tile_idx_k, ckw::BinaryOp::Less, const_k_i32, tile_idx_k, ckw::AssignmentOp::Increment, const_pos_1_i32, [&]()
+ {
+ auto tile_lhs = writer->declare_tile("lhs", ckw::TileInfo(to_ckw(_lhs->data_type()), dst_m0, 1));
+ auto tile_rhs = writer->declare_tile("rhs", ckw::TileInfo(to_ckw(_rhs->data_type()), dst_n0, 1));
+ writer->op_assign(tile_lhs, const_0_fp);
+ writer->op_assign(tile_rhs, const_0_fp);
+
+ writer->op_load(tile_lhs, lhs->tensor(), sampler_lhs, tile_idx_k, tile_idx_m, tile_idx_b, const_0_i32);
+ writer->op_load(tile_rhs, rhs->tensor(), sampler_rhs, tile_idx_k, tile_idx_n, tile_idx_b, const_0_i32);
+
+ writer->op_binary(tile_dst, ckw::BinaryOp::MatMul_Nt_T, tile_lhs, tile_rhs);
+ });
+ }
+ // clang-format on
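+    // Loop-bound example (illustrative only): with K = 10 and k0 = 4 the main loop runs for
+    // k = 0 and k = 4 (condition k <= K - k0 = 6), accumulating 8 values, and the left-over loop
+    // then covers k = 8 and k = 9 with a step of 1.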
+}
+
+Window GpuCkwMatMul::get_window() const
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
+
+ const int32_t m = _dst->dimension(1);
+ const int32_t n = _dst->dimension(0);
+ const bool adj_lhs = _attributes.adj_lhs();
+
+ const int32_t m0 = adj_lhs ? adjust_vec_size(_settings.m0(), m) : std::min(_settings.m0(), m);
+ const int32_t n0 = adjust_vec_size(_settings.n0(), n);
+
+ // Configure kernel window
+ Window win = calculate_max_window(_dst->tensor_shape(), Steps(n0, m0));
+ win = win.collapse(win, Window::DimZ);
+
+ return win;
+}
+
+std::string GpuCkwMatMul::get_name(const ComponentGroup &comp_group) const
+{
+ ARM_COMPUTE_UNUSED(comp_group);
+
+ std::string kernel_name("mat_mul_native");
+
+ const int32_t m = _dst->dimension(1);
+ const int32_t n = _dst->dimension(0);
+ const int32_t k = _attributes.adj_lhs() ? _lhs->tensor_shape().y() : _lhs->tensor_shape().x();
+
+ kernel_name += _attributes.adj_lhs() ? "_t" : "_nt";
+ kernel_name += _attributes.adj_rhs() ? "_t" : "_nt";
+ kernel_name += "_";
+ kernel_name += support::cpp11::to_string(m);
+ kernel_name += "_";
+ kernel_name += support::cpp11::to_string(n);
+ kernel_name += "_";
+ kernel_name += support::cpp11::to_string(k);
+ kernel_name += "_";
+ kernel_name += support::cpp11::to_string(_dst->dimension(2));
+ kernel_name += "_";
+ kernel_name += support::cpp11::to_string(_settings.m0());
+ kernel_name += "_";
+ kernel_name += support::cpp11::to_string(_settings.n0());
+ kernel_name += "_";
+ kernel_name += support::cpp11::to_string(_settings.k0());
+
+ return kernel_name;
+}
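+// Naming example (illustrative only, hypothetical sizes): a non-transposed LHS / transposed RHS case
+// with M = 64, N = 32, K = 128, batch = 1 and m0 = n0 = k0 = 4 produces
+// "mat_mul_native_nt_t_64_32_128_1_4_4_4".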
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.h
new file mode 100644
index 0000000000..790418bf50
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWMATMUL_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWMATMUL_H
+
+#include "arm_compute/dynamic_fusion/sketch/attributes/MatMulAttributes.h"
+
+#include "src/core/common/Macros.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class GpuCkwMatMul final : public IGpuCkwComponentDriver
+{
+public:
+ using Attributes = ClComponentMatMul::Attributes;
+ using Settings = ClComponentMatMul::Settings;
+
+public:
+ /** Constructor
+ *
+ * For supported configurations please refer to @ref ClComponentMatMul::validate()
+ *
+ * @param[in] id Component id
+ * @param[in] tensors Tensor arguments to the component
+ * @param[in] attributes Component attributes. Attributes are a set of parameters that define what a component does
+ * @param[in] settings Component settings. Settings are a set of parameters that influence the implementation of a component
+ */
+ GpuCkwMatMul(ComponentId id,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings);
+
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwMatMul);
+
+ /** Destructor */
+ ~GpuCkwMatMul() override = default;
+
+    // Inherited methods overridden
+ virtual void write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const override;
+ Window get_window() const override;
+ std::string get_name(const ComponentGroup &comp_group) const override;
+
+private:
+ const ITensorInfo *_lhs;
+ const ITensorInfo *_rhs;
+ const ITensorInfo *_dst;
+
+ Attributes _attributes;
+ Settings _settings;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWMATMUL_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp
new file mode 100644
index 0000000000..d027f348ef
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp
@@ -0,0 +1,405 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+
+#include "compute_kernel_writer/include/ckw/KernelWriter.h"
+#include <cstdint>
+#include <limits>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+GpuCkwPool2d::GpuCkwPool2d(ComponentId id,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings)
+ : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{}, _attributes{attributes}, _settings{settings}
+{
+ _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
+ _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
+}
+
+void GpuCkwPool2d::write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const
+{
+ const uint32_t width_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::WIDTH);
+ const uint32_t height_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::HEIGHT);
+
+ /********************************************************************************
+ * 1 - Define tensors
+ ********************************************************************************/
+ GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src");
+ GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
+
+ /********************************************************************************
+ * 2 - Define CKW constants
+ ********************************************************************************/
+ const auto dst_dt = to_ckw(_dst->data_type());
+ const auto pool_sz_x = static_cast<int32_t>(_attributes.pool_size().x());
+ const auto pool_sz_y = static_cast<int32_t>(_attributes.pool_size().y());
+ const auto pad_x = static_cast<int32_t>(_attributes.pad().left);
+ const auto pad_y = static_cast<int32_t>(_attributes.pad().top);
+ const auto stride_x = static_cast<int32_t>(_attributes.stride().x());
+ const auto stride_y = static_cast<int32_t>(_attributes.stride().y());
+ const auto src_w = static_cast<int32_t>(_src->dimension(width_idx));
+ const auto src_h = static_cast<int32_t>(_src->dimension(height_idx));
+ const auto dst_h = static_cast<int32_t>(_dst->dimension(height_idx));
+
+ // CKW constants
+ auto const_pool_sz_x_i32 = writer->declare_constant_tile(ckw::ConstantData({{pool_sz_x}}, ckw::DataType::Int32));
+ auto const_pool_sz_y_i32 = writer->declare_constant_tile(ckw::ConstantData({{pool_sz_y}}, ckw::DataType::Int32));
+ auto const_pad_x_i32 = writer->declare_constant_tile(ckw::ConstantData({{pad_x}}, ckw::DataType::Int32));
+ auto const_pad_y_i32 = writer->declare_constant_tile(ckw::ConstantData({{pad_y}}, ckw::DataType::Int32));
+ auto const_stride_x_i32 = writer->declare_constant_tile(ckw::ConstantData({{stride_x}}, ckw::DataType::Int32));
+ auto const_stride_y_i32 = writer->declare_constant_tile(ckw::ConstantData({{stride_y}}, ckw::DataType::Int32));
+ auto const_src_w_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_w}}, ckw::DataType::Int32));
+ auto const_src_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_h}}, ckw::DataType::Int32));
+ auto const_dst_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_h}}, ckw::DataType::Int32));
+ auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32));
+ auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32));
+ auto const_0_fp = writer->declare_constant_tile(ckw::ConstantData({{0.0f}}, dst_dt));
+ auto const_lowest_val_fp =
+ writer->declare_constant_tile(ckw::ConstantData({{std::numeric_limits<float>::lowest()}}, ckw::DataType::Fp32));
+    auto const_neg_inf_val_fp = writer->declare_constant_tile(
+        ckw::ConstantData({{-std::numeric_limits<float>::infinity()}}, ckw::DataType::Fp32));
+
+ /********************************************************************************
+ * 3 - Define the compute block parameters and destination tile (if not root component)
+ * Bind the tile to the tensor to share it among different components and
+ * initialize the compute block parameters
+ ********************************************************************************/
+    // The n0 and m0 parameters from root_window only refer to the output
+ const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
+
+ // Destination compute block size
+ const int32_t dst_n0 = root_window.x().step();
+ const int32_t dst_m0 = root_window.y().step();
+
+ // Destination compute block size left-over
+ const int32_t dst_n0_partial = _dst->dimension(0) % dst_n0;
+ const int32_t dst_m0_partial = _dst->dimension(1) % dst_m0;
+
+ // Shift-back for the overlapping-min strategy
+ const int32_t dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0;
+
+ ckw::TensorSampler sampler_dst;
+ sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+ if (dst_n0_partial == 0)
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin);
+ }
+
+ if (dst_m0_partial == 0)
+ {
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::ClampToBorderMaxOnly);
+ }
+
+ sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+ // Declare destination tile
+ auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, dst_m0, dst_n0));
+
+ // Initialize destination tile
+ writer->op_assign(tile_dst, const_0_fp);
+
+ // Bind tile to the tensor
+ dst->init_virtual_tensor(tile_dst, sampler_dst);
+
+ /********************************************************************************
+ * 4 - Define the compute block parameters CKW constants
+ ********************************************************************************/
+    // Only now can we declare N0 and M0 as constants
+ auto const_dst_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32));
+ auto const_dst_m0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_m0}}, ckw::DataType::Int32));
+ auto const_shift_back_dst_n0_i32 =
+ writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32));
+
+ /********************************************************************************
+ * 5 - Define the sampler for the input tensor
+ ********************************************************************************/
+ ckw::TensorSampler sampler_src;
+ sampler_src.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+ sampler_src.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ sampler_src.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ sampler_src.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+
+ /********************************************************************************
+ * 6 - Extra operations required before writing the main code
+ ********************************************************************************/
+ // Check if it is global pooling
+ const bool is_global_pooling = (pool_sz_x == src_w) && (pool_sz_y == src_h) && (pad_x == 0) && (pad_y == 0);
+
+    // Accumulate in F32 unless the destination is F16 and the pooling type is MAX
+ const bool acc_f32 = (dst_dt == ckw::DataType::Fp32) ||
+ ((dst_dt == ckw::DataType::Fp16) && _attributes.pool_type() != PoolingType::MAX);
+
+ const auto acc_dt = acc_f32 ? ckw::DataType::Fp32 : ckw::DataType::Fp16;
+
+ const bool is_wider_acc = dst_dt != acc_dt;
+
+ /********************************************************************************
+ * 7 - Get the coordinates of the destination tile
+ ********************************************************************************/
+ auto tile_gid_0 = writer->declare_tile("gid_0", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_1 = writer->declare_tile("gid_1", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_2 = writer->declare_tile("gid_2", ckw::TileInfo(ckw::DataType::Int32));
+
+ writer->op_get_global_id(tile_gid_0, 0);
+ writer->op_get_global_id(tile_gid_1, 1);
+ writer->op_get_global_id(tile_gid_2, 2);
+
+ auto tile_cout0 = writer->declare_tile("cout0", ckw::TileInfo(ckw::DataType::Int32)); // OFM
+ auto tile_mout0 = writer->declare_tile("mout0", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH
+ auto tile_mout1 = writer->declare_tile("mout1", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT
+ auto tile_bout0 = writer->declare_tile("bout0", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX
+
+ // Calculate coordinates
+ get_coordinate_from_gws_overlapping_min(writer, tile_cout0, tile_gid_0, const_dst_n0_i32,
+ const_shift_back_dst_n0_i32, const_0_i32);
+ get_coordinate_from_gws(writer, tile_mout0, tile_gid_1, const_dst_m0_i32);
+ writer->op_binary(tile_mout1, ckw::BinaryOp::Mod, tile_gid_2, const_dst_h_i32);
+ writer->op_binary(tile_bout0, ckw::BinaryOp::Div, tile_gid_2, const_dst_h_i32);
+
+ /********************************************************************************
+ * 8 - Write the rest of the code
+ ********************************************************************************/
+ // A tile used to temporarily store results or as an accumulator in case of AVG and L2 pooling.
+ auto tile_res = writer->declare_tile("tile_res", ckw::TileInfo(acc_dt, dst_m0, dst_n0));
+
+    // Initialize the result tile with the appropriate value
+ if (_attributes.pool_type() == PoolingType::MAX)
+ {
+ if (_settings.use_inf_as_limit())
+ {
+ writer->op_cast(tile_res, const_neg_inf_val_fp, ckw::ConvertPolicy::None);
+ }
+ else
+ {
+ writer->op_cast(tile_res, const_lowest_val_fp, ckw::ConvertPolicy::None);
+ }
+ }
+ else
+ {
+ writer->op_cast(tile_res, const_0_fp, ckw::ConvertPolicy::None);
+ }
+
+ // tile_idx_in_w = tile_mout0 * STRIDE_X - PAD_X
+ auto tile_src_coord_x_start = writer->declare_tile("idx_in_w", ckw::DataType::Int32);
+ writer->op_binary(tile_src_coord_x_start, ckw::BinaryOp::Mul, tile_mout0, const_stride_x_i32);
+ writer->op_binary(tile_src_coord_x_start, ckw::BinaryOp::Sub, tile_src_coord_x_start, const_pad_x_i32);
+
+ // tile_idx_in_h = tile_mout1 * STRIDE_Y - PAD_Y
+ auto tile_src_coord_y_start = writer->declare_tile("idx_in_h", ckw::DataType::Int32);
+ writer->op_binary(tile_src_coord_y_start, ckw::BinaryOp::Mul, tile_mout1, const_stride_y_i32);
+ writer->op_binary(tile_src_coord_y_start, ckw::BinaryOp::Sub, tile_src_coord_y_start, const_pad_y_i32);
+
+ auto tile_neg_src_coord_x_start = writer->declare_tile("neg_src_coord_x_start", ckw::DataType::Int32);
+ auto tile_neg_src_coord_y_start = writer->declare_tile("neg_src_coord_y_start", ckw::DataType::Int32);
+
+ writer->op_binary(tile_neg_src_coord_x_start, ckw::BinaryOp::Sub, const_0_i32, tile_src_coord_x_start);
+ writer->op_binary(tile_neg_src_coord_y_start, ckw::BinaryOp::Sub, const_0_i32, tile_src_coord_y_start);
+
+ // int pool_x_s = max((int)0, -idx_in_w);
+ // int pool_x_e = min((int)POOL_SIZE_X, (int)SRC_WIDTH - idx_in_w);
+ // int pool_y_s = max((int)0, -idx_in_h);
+ // int pool_y_e = min((int)POOL_SIZE_Y, (int)SRC_HEIGHT - idx_in_h);
+ auto tile_pool_x_s = writer->declare_tile("pool_x_s", ckw::DataType::Int32);
+ auto tile_pool_y_s = writer->declare_tile("pool_y_s", ckw::DataType::Int32);
+ auto tile_pool_x_e = writer->declare_tile("pool_x_e", ckw::DataType::Int32);
+ auto tile_pool_y_e = writer->declare_tile("pool_y_e", ckw::DataType::Int32);
+
+ writer->op_binary(tile_pool_x_s, ckw::BinaryOp::Max, const_0_i32, tile_neg_src_coord_x_start);
+ writer->op_binary(tile_pool_x_e, ckw::BinaryOp::Add, const_src_w_i32, tile_neg_src_coord_x_start);
+ writer->op_binary(tile_pool_x_e, ckw::BinaryOp::Min, const_pool_sz_x_i32, tile_pool_x_e);
+ writer->op_binary(tile_pool_y_s, ckw::BinaryOp::Max, const_0_i32, tile_neg_src_coord_y_start);
+ writer->op_binary(tile_pool_y_e, ckw::BinaryOp::Add, const_src_h_i32, tile_neg_src_coord_y_start);
+ writer->op_binary(tile_pool_y_e, ckw::BinaryOp::Min, const_pool_sz_y_i32, tile_pool_y_e);
+
+ // #if defined(EXCLUDE_PADDING)
+ // int filter_size = (pool_y_e - pool_y_s) * (pool_x_e - pool_x_s);
+ // #else // defined(EXCLUDE_PADDING)
+ // int filter_size = POOL_SIZE_X * POOL_SIZE_Y;
+ // #endif // defined(EXCLUDE_PADDING)
+ auto tile_filter_size = writer->declare_tile("filter_size", ckw::DataType::Int32);
+ if (_attributes.exclude_padding())
+ {
+ auto tile_x_diff = writer->declare_tile("x_diff", ckw::DataType::Int32);
+ auto tile_y_diff = writer->declare_tile("y_diff", ckw::DataType::Int32);
+
+ writer->op_binary(tile_x_diff, ckw::BinaryOp::Sub, tile_pool_x_e, tile_pool_x_s);
+ writer->op_binary(tile_y_diff, ckw::BinaryOp::Sub, tile_pool_y_e, tile_pool_y_s);
+ writer->op_binary(tile_filter_size, ckw::BinaryOp::Mul, tile_x_diff, tile_y_diff);
+ }
+ else
+ {
+ writer->op_binary(tile_filter_size, ckw::BinaryOp::Mul, const_pool_sz_x_i32, const_pool_sz_y_i32);
+ }
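+    // Worked example (illustrative only, hypothetical values): with SRC_WIDTH = 10, POOL_SIZE_X = 3,
+    // PAD_X = 1, STRIDE_X = 2 and output column mout0 = 0, idx_in_w = -1, so pool_x_s = max(0, 1) = 1
+    // and pool_x_e = min(3, 10 + 1) = 3; the loops therefore visit source columns 0 and 1 only, and
+    // with exclude_padding the x-extent contributing to filter_size is 3 - 1 = 2.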
+
+ auto tile_x = writer->declare_tile("x", ckw::DataType::Int32);
+ auto tile_y = writer->declare_tile("y", ckw::DataType::Int32);
+
+ if (is_global_pooling)
+ {
+ writer->op_assign(tile_y, const_0_i32);
+ writer->op_assign(tile_pool_y_e, const_pool_sz_y_i32);
+ }
+ else
+ {
+ writer->op_assign(tile_y, tile_pool_y_s);
+ }
+
+ // Y dim for-loop
+ writer->op_for_loop(
+ tile_y, ckw::BinaryOp::Less, tile_pool_y_e, tile_y, ckw::AssignmentOp::Increment, const_pos_1_i32,
+ [&]()
+ {
+ // Reset the iterator for the inner loop
+ if (is_global_pooling)
+ {
+ writer->op_assign(tile_x, const_0_i32);
+ writer->op_assign(tile_pool_x_e, const_pool_sz_x_i32);
+ }
+ else
+ {
+ writer->op_assign(tile_x, tile_pool_x_s);
+ }
+
+ auto tile_src_coord_y = writer->declare_tile("src_coord_y", ckw::DataType::Int32);
+ writer->op_binary(tile_src_coord_y, ckw::BinaryOp::Add, tile_src_coord_y_start, tile_y);
+
+ // X dim for-loop
+ writer->op_for_loop(
+ tile_x, ckw::BinaryOp::Less, tile_pool_x_e, tile_x, ckw::AssignmentOp::Increment, const_pos_1_i32,
+ [&]()
+ {
+ auto tile_src_coord_x = writer->declare_tile("src_coord_x", ckw::DataType::Int32);
+ writer->op_binary(tile_src_coord_x, ckw::BinaryOp::Add, tile_src_coord_x_start, tile_x);
+
+ ckw::DataType src_dt = to_ckw(_src->data_type());
+ auto tile_src = writer->declare_tile("tile_src", ckw::TileInfo(acc_dt, dst_m0, dst_n0));
+
+ // Load src tile
+ if (is_wider_acc)
+ {
+ auto tile_src0 = writer->declare_tile("src_tile0", ckw::TileInfo(src_dt, dst_m0, dst_n0));
+ writer->op_load(tile_src0, src->tensor(), sampler_src, tile_cout0, tile_src_coord_x,
+ tile_src_coord_y, tile_bout0);
+ writer->op_cast(tile_src, tile_src0, ckw::ConvertPolicy::None);
+ }
+ else
+ {
+ writer->op_load(tile_src, src->tensor(), sampler_src, tile_cout0, tile_src_coord_x,
+ tile_src_coord_y, tile_bout0);
+ }
+
+ // Take the square of the input, for L2 Pooling
+ if (_attributes.pool_type() == PoolingType::L2)
+ {
+ writer->op_binary(tile_src, ckw::BinaryOp::Mul, tile_src, tile_src);
+ }
+
+                // Perform the pooling op
+ if (_attributes.pool_type() == PoolingType::MAX)
+ {
+ writer->op_binary(tile_res, ckw::BinaryOp::Max, tile_res, tile_src);
+ }
+ else
+ {
+ writer->op_binary(tile_res, ckw::BinaryOp::Add, tile_res, tile_src);
+ }
+ });
+ });
+
+ if ((_attributes.pool_type() == PoolingType::AVG) || (_attributes.pool_type() == PoolingType::L2))
+ {
+        // filter_size is automatically broadcast in the operation
+ auto tile_filter_size_fp = writer->declare_tile("filter_size_fp", ckw::TileInfo(acc_dt));
+ writer->op_cast(tile_filter_size_fp, tile_filter_size, ckw::ConvertPolicy::None);
+ writer->op_binary(tile_res, ckw::BinaryOp::Div, tile_res, tile_filter_size_fp);
+ }
+
+ // Take square root of the result in L2 pooling
+ if (_attributes.pool_type() == PoolingType::L2)
+ {
+ writer->op_unary(tile_res, ckw::UnaryOp::Sqrt, tile_res);
+ }
+
+ // Store the results and do casting if mixed precision
+ if (is_wider_acc)
+ {
+ writer->op_cast(tile_dst, tile_res, ckw::ConvertPolicy::None);
+ }
+ else
+ {
+ writer->op_assign(tile_dst, tile_res);
+ }
+}
+
+Window GpuCkwPool2d::get_window() const
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
+
+ TensorShape output_shape = _dst->tensor_shape();
+ const uint32_t vec_size = adjust_vec_size(((_dst->data_type() == DataType::F32) ? 2 : 4), _dst->dimension(0));
+ // Create and configure kernel window
+ auto win = calculate_max_window(output_shape, Steps(vec_size));
+ win = win.collapse_if_possible(win, Window::DimZ); // collapse window on batch size.
+ return win;
+}
+
+std::string GpuCkwPool2d::get_name(const ComponentGroup &comp_group) const
+{
+ ARM_COMPUTE_UNUSED(comp_group);
+
+ return "pool2dMxN";
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h
new file mode 100644
index 0000000000..822282a108
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWPOOL2D_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWPOOL2D_H
+
+#include "src/core/common/Macros.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h"
+
+#include <string>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class GpuCkwPool2d : public IGpuCkwComponentDriver
+{
+public:
+ using Attributes = ClComponentPool2d::Attributes;
+ using Settings = ClComponentPool2d::Settings;
+
+ /** Constructor
+ *
+     * For supported configurations please refer to @ref ClComponentPool2d::validate()
+ *
+ * @param[in] id Component id
+ * @param[in] tensors Tensor arguments to the component
+ * @param[in] attributes Component attributes
+ * @param[in] settings Component settings
+ */
+ GpuCkwPool2d(ComponentId id,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings);
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwPool2d);
+ /** Destructor */
+ ~GpuCkwPool2d() override = default;
+    // Inherited methods overridden:
+ virtual void write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const override;
+ Window get_window() const override;
+ std::string get_name(const ComponentGroup &comp_group) const override;
+
+private:
+ const ITensorInfo *_src;
+ const ITensorInfo *_dst;
+ Attributes _attributes;
+ Settings _settings;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWPOOL2D_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.cpp
new file mode 100644
index 0000000000..edd7ea9a38
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.cpp
@@ -0,0 +1,576 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/utils/ScaleUtils.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "support/StringSupport.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+constexpr uint32_t opencl_vector_size_in_bytes = 16;
+} // namespace
+
+GpuCkwResize::GpuCkwResize(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes)
+ : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{}, _attributes{attributes}
+{
+ _src = this->tensors().get_const_tensor(TensorType::ACL_SRC);
+ _dst = this->tensors().get_const_tensor(TensorType::ACL_DST);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
+}
+
+void GpuCkwResize::do_nearest_neighbor_resize(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const
+{
+ const uint32_t width_idx = get_data_layout_dimension_index(_dst->data_layout(), DataLayoutDimension::WIDTH);
+ const uint32_t height_idx = get_data_layout_dimension_index(_dst->data_layout(), DataLayoutDimension::HEIGHT);
+
+ /********************************************************************************
+ * 1 - Define tensors
+ ********************************************************************************/
+ GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src");
+ GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
+
+ /********************************************************************************
+ * 2 - Define CKW constants
+ ********************************************************************************/
+ const auto dst_dt = to_ckw(_dst->data_type());
+ const float scale_x = scale_utils::calculate_resize_ratio(_src->dimension(width_idx), _dst->dimension(width_idx),
+ _attributes.align_corners());
+ const float scale_y = scale_utils::calculate_resize_ratio(_src->dimension(height_idx), _dst->dimension(height_idx),
+ _attributes.align_corners());
+ const auto src_w = static_cast<int32_t>(_src->dimension(width_idx));
+ const auto src_h = static_cast<int32_t>(_src->dimension(height_idx));
+ const auto dst_h = static_cast<int32_t>(_dst->dimension(height_idx));
+
+ // CKW constants
+ auto const_src_w_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_w}}, ckw::DataType::Int32));
+ auto const_src_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_h}}, ckw::DataType::Int32));
+ auto const_dst_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_h}}, ckw::DataType::Int32));
+ auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32));
+ auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32));
+ auto const_0_fp = writer->declare_constant_tile(ckw::ConstantData({{0.0f}}, dst_dt));
+ auto const_pos_0_5_fp = writer->declare_constant_tile(ckw::ConstantData({{0.5f}}, ckw::DataType::Fp32));
+ auto const_scale_x_fp = writer->declare_constant_tile(ckw::ConstantData({{scale_x}}, ckw::DataType::Fp32));
+ auto const_scale_y_fp = writer->declare_constant_tile(ckw::ConstantData({{scale_y}}, ckw::DataType::Fp32));
+
+ /********************************************************************************
+ * 3 - Define the compute block parameters and destination tile (if not root component)
+ * Bind the tile to the tensor to share it among different components and
+ * initialize the compute block parameters
+ ********************************************************************************/
+    // The n0 and m0 parameters from root_window only refer to the output
+ const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
+
+ // Destination compute block size
+ const int32_t dst_n0 = root_window.x().step();
+
+ // dst_m0 must be 1
+ ARM_COMPUTE_ERROR_ON(root_window.y().step() != 1);
+
+ // Destination compute block size left-over
+ const int32_t dst_n0_partial = _dst->dimension(0) % dst_n0;
+
+ // Shift-back for the overlapping-min strategy
+ const int32_t dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0;
+
+ ckw::TensorSampler sampler_dst;
+ sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+ if (dst_n0_partial == 0)
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin);
+ }
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+ // Declare destination tile
+ auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, 1, dst_n0));
+
+ // Initialize destination tile
+ writer->op_assign(tile_dst, const_0_fp);
+
+ // Bind tile to the tensor
+ dst->init_virtual_tensor(tile_dst, sampler_dst);
+
+ /********************************************************************************
+ * 4 - Define the compute block parameters CKW constants
+ ********************************************************************************/
+ auto const_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32));
+ auto const_shift_back_n0_i32 =
+ writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32));
+
+ /********************************************************************************
+ * 5 - Define the samplers for the input tensor
+ ********************************************************************************/
+ ckw::TensorSampler sampler_src;
+ sampler_src.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+ sampler_src.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ sampler_src.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ sampler_src.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+
+ /********************************************************************************
+ * 6 - Extra operations required before writing the main code
+ ********************************************************************************/
+
+ // ....
+
+ /********************************************************************************
+ * 7 - Get the coordinates of the destination tile
+ ********************************************************************************/
+ auto tile_gid_0 = writer->declare_tile("gid_0", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_1 = writer->declare_tile("gid_1", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_2 = writer->declare_tile("gid_2", ckw::TileInfo(ckw::DataType::Int32));
+
+ writer->op_get_global_id(tile_gid_0, 0);
+ writer->op_get_global_id(tile_gid_1, 1);
+ writer->op_get_global_id(tile_gid_2, 2);
+
+ auto tile_co = writer->declare_tile("co", ckw::TileInfo(ckw::DataType::Int32)); // OFM
+ auto tile_xo = writer->declare_tile("xo", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH
+ auto tile_yo = writer->declare_tile("yo", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT
+ auto tile_bo = writer->declare_tile("bo", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX
+
+ // Calculate coordinates
+ get_coordinate_from_gws_overlapping_min(writer, tile_co, tile_gid_0, const_n0_i32, const_shift_back_n0_i32,
+ const_0_i32);
+ writer->op_assign(tile_xo, tile_gid_1);
+ writer->op_binary(tile_yo, ckw::BinaryOp::Mod, tile_gid_2, const_dst_h_i32);
+ writer->op_binary(tile_bo, ckw::BinaryOp::Div, tile_gid_2, const_dst_h_i32);
+
+ /********************************************************************************
+ * 8 - Write the rest of the code
+ ********************************************************************************/
+ auto tile_xi_f = writer->declare_tile("xi_f", ckw::DataType::Fp32);
+ auto tile_yi_f = writer->declare_tile("yi_f", ckw::DataType::Fp32);
+
+ switch (_attributes.sampling_policy())
+ {
+ case SamplingPolicy::TOP_LEFT:
+ // xi_f = (xo * scale_x)
+ // yi_f = (yo * scale_y)
+ writer->op_cast(tile_xi_f, tile_xo, ckw::ConvertPolicy::None);
+ writer->op_cast(tile_yi_f, tile_yo, ckw::ConvertPolicy::None);
+ writer->op_binary(tile_xi_f, ckw::BinaryOp::Mul, tile_xi_f, const_scale_x_fp);
+ writer->op_binary(tile_yi_f, ckw::BinaryOp::Mul, tile_yi_f, const_scale_y_fp);
+ break;
+ case SamplingPolicy::CENTER:
+ {
+ // xi_f = ((xo + 0.5f) * scale_x)
+ // yi_f = ((yo + 0.5f) * scale_y)
+ const auto &tile_xo_plus_half = writer->declare_tile("xo_plus_half", ckw::DataType::Fp32);
+ const auto &tile_yo_plus_half = writer->declare_tile("yo_plus_half", ckw::DataType::Fp32);
+
+ writer->op_cast(tile_xo_plus_half, tile_xo, ckw::ConvertPolicy::None);
+ writer->op_cast(tile_yo_plus_half, tile_yo, ckw::ConvertPolicy::None);
+ writer->op_binary(tile_xo_plus_half, ckw::BinaryOp::Add, tile_xo_plus_half, const_pos_0_5_fp);
+ writer->op_binary(tile_yo_plus_half, ckw::BinaryOp::Add, tile_yo_plus_half, const_pos_0_5_fp);
+ writer->op_binary(tile_xi_f, ckw::BinaryOp::Mul, tile_xo_plus_half, const_scale_x_fp);
+ writer->op_binary(tile_yi_f, ckw::BinaryOp::Mul, tile_yo_plus_half, const_scale_y_fp);
+ }
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported sampling policy");
+ }
+
+ if (_attributes.align_corners())
+ {
+ writer->op_unary(tile_xi_f, ckw::UnaryOp::Round, tile_xi_f);
+ writer->op_unary(tile_yi_f, ckw::UnaryOp::Round, tile_yi_f);
+ }
+
+ // xi0 = clamp((int)xi_f, 0, (int)src_w - 1)
+ // yi0 = clamp((int)yi_f, 0, (int)src_h - 1)
+ auto tile_xi_f_int = writer->declare_tile("xi_f_int", ckw::DataType::Int32);
+ auto tile_yi_f_int = writer->declare_tile("yi_f_int", ckw::DataType::Int32);
+
+ writer->op_cast(tile_xi_f_int, tile_xi_f, ckw::ConvertPolicy::None);
+ writer->op_cast(tile_yi_f_int, tile_yi_f, ckw::ConvertPolicy::None);
+
+ auto tile_src_w_minus_1 = writer->declare_tile("src_w_minus_1", ckw::DataType::Int32);
+ auto tile_src_h_minus_1 = writer->declare_tile("src_h_minus_1", ckw::DataType::Int32);
+
+ writer->op_binary(tile_src_w_minus_1, ckw::BinaryOp::Sub, const_src_w_i32, const_pos_1_i32);
+ writer->op_binary(tile_src_h_minus_1, ckw::BinaryOp::Sub, const_src_h_i32, const_pos_1_i32);
+
+ auto tile_xi0 = writer->declare_tile("xi0", ckw::DataType::Int32);
+ auto tile_yi0 = writer->declare_tile("yi0", ckw::DataType::Int32);
+
+ writer->op_ternary(tile_xi0, ckw::TernaryOp::Clamp, tile_xi_f_int, const_0_i32, tile_src_w_minus_1);
+ writer->op_ternary(tile_yi0, ckw::TernaryOp::Clamp, tile_yi_f_int, const_0_i32, tile_src_h_minus_1);
+
+ auto tile_src = writer->declare_tile("src_tile", ckw::TileInfo(dst_dt, 1, dst_n0));
+ writer->op_load(tile_src, src->tensor(), sampler_src, tile_co, tile_xi0, tile_yi0, tile_bo);
+
+ writer->op_assign(tile_dst, tile_src);
+}
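+// Coordinate example (illustrative only, assuming the Fp32 -> Int32 cast truncates towards zero):
+// resizing a 4x4 source to 8x8 with SamplingPolicy::CENTER and align_corners = false gives
+// scale_x = 4 / 8 = 0.5; for xo = 5, xi_f = (5 + 0.5) * 0.5 = 2.75, which truncates to xi0 = 2,
+// already inside the clamp range [0, src_w - 1] = [0, 3].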
+
+void GpuCkwResize::do_bilinear_resize(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const
+{
+ const size_t width_idx = get_data_layout_dimension_index(_dst->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t height_idx = get_data_layout_dimension_index(_dst->data_layout(), DataLayoutDimension::HEIGHT);
+
+ /********************************************************************************
+ * 1 - Define tensors
+ ********************************************************************************/
+ GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src");
+ GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
+
+ /********************************************************************************
+ * 2 - Define CKW constants
+ ********************************************************************************/
+ const auto dst_dt = to_ckw(_dst->data_type());
+ const float scale_x = scale_utils::calculate_resize_ratio(_src->dimension(width_idx), _dst->dimension(width_idx),
+ _attributes.align_corners());
+ const float scale_y = scale_utils::calculate_resize_ratio(_src->dimension(height_idx), _dst->dimension(height_idx),
+ _attributes.align_corners());
+ const auto src_w = static_cast<int32_t>(_src->dimension(width_idx));
+ const auto src_h = static_cast<int32_t>(_src->dimension(height_idx));
+ const auto dst_h = static_cast<int32_t>(_dst->dimension(height_idx));
+
+ // CKW constants
+ auto const_src_w_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_w}}, ckw::DataType::Int32));
+ auto const_src_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{src_h}}, ckw::DataType::Int32));
+ auto const_dst_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_h}}, ckw::DataType::Int32));
+ auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32));
+ auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32));
+ auto const_0_fp = writer->declare_constant_tile(ckw::ConstantData({{0.0f}}, dst_dt));
+ auto const_pos_1_fp = writer->declare_constant_tile(ckw::ConstantData({{1.0f}}, ckw::DataType::Fp32));
+ auto const_pos_0_5_fp = writer->declare_constant_tile(ckw::ConstantData({{0.5f}}, ckw::DataType::Fp32));
+ auto const_scale_x_fp = writer->declare_constant_tile(ckw::ConstantData({{scale_x}}, ckw::DataType::Fp32));
+ auto const_scale_y_fp = writer->declare_constant_tile(ckw::ConstantData({{scale_y}}, ckw::DataType::Fp32));
+
+ /********************************************************************************
+ * 3 - Define the compute block parameters and destination tile (if not root component)
+ * Bind the tile to the tensor to share it among different components and
+ * initialize the compute block parameters
+ ********************************************************************************/
+    // The n0 and m0 parameters from root_window refer only to the output
+ const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
+
+ // Destination compute block size
+ const int32_t dst_n0 = root_window.x().step();
+
+ // dst_m0 must be 1
+ ARM_COMPUTE_ERROR_ON(root_window.y().step() != 1);
+
+ // Destination compute block size left-over
+ const int32_t dst_n0_partial = _dst->dimension(0) % dst_n0;
+
+ // Shift-back for the overlapping-min strategy
+ const int32_t dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0;
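+    // For example, a destination dimension 0 of 10 with dst_n0 = 4 leaves a partial block of 2, so dst_shift_back = 2
+    // and the overlapping-min addressing re-processes two elements instead of stepping past the tensor edge.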
+
+ ckw::TensorSampler sampler_dst;
+ sampler_dst.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+ if (dst_n0_partial == 0)
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ }
+ else
+ {
+ sampler_dst.address_mode_x(ckw::TensorSamplerAddressModeX::OverlappingMin);
+ }
+ sampler_dst.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ sampler_dst.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+ sampler_dst.storage(ckw::TensorStorageType::BufferUint8Ptr);
+
+ // Declare destination tile
+ auto tile_dst = writer->declare_tile("dst", ckw::TileInfo(dst_dt, 1, dst_n0));
+
+ // Initialize destination tile
+ writer->op_assign(tile_dst, const_0_fp);
+
+ // Bind tile to the tensor
+ dst->init_virtual_tensor(tile_dst, sampler_dst);
+
+ /********************************************************************************
+ * 4 - Define the compute block parameters CKW constants
+ ********************************************************************************/
+ auto const_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32));
+ auto const_shift_back_n0_i32 =
+ writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32));
+
+ /********************************************************************************
+ * 5 - Define the sampler for the input tensor
+ ********************************************************************************/
+ ckw::TensorSampler sampler_src;
+ sampler_src.format(ckw::TensorSamplerFormat::Dim0_Dim1_Dim2);
+ sampler_src.address_mode_x(ckw::TensorSamplerAddressModeX::None);
+ sampler_src.address_mode_y(ckw::TensorSamplerAddressModeY::None);
+ sampler_src.address_mode_z(ckw::TensorSamplerAddressModeZ::None);
+
+ /********************************************************************************
+ * 6 - Extra operations required before writing the main code
+ ********************************************************************************/
+
+    // Not required
+
+ /********************************************************************************
+ * 7 - Get the coordinates of the destination tile
+ ********************************************************************************/
+ auto tile_gid_0 = writer->declare_tile("gid_0", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_1 = writer->declare_tile("gid_1", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_2 = writer->declare_tile("gid_2", ckw::TileInfo(ckw::DataType::Int32));
+
+ writer->op_get_global_id(tile_gid_0, 0);
+ writer->op_get_global_id(tile_gid_1, 1);
+ writer->op_get_global_id(tile_gid_2, 2);
+
+ auto tile_co = writer->declare_tile("co", ckw::TileInfo(ckw::DataType::Int32)); // OFM
+ auto tile_xo = writer->declare_tile("xo", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH
+ auto tile_yo = writer->declare_tile("yo", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT
+ auto tile_bo = writer->declare_tile("bo", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX
+
+ // Calculate coordinates
+ get_coordinate_from_gws_overlapping_min(writer, tile_co, tile_gid_0, const_n0_i32, const_shift_back_n0_i32,
+ const_0_i32);
+ writer->op_assign(tile_xo, tile_gid_1);
+ writer->op_binary(tile_yo, ckw::BinaryOp::Mod, tile_gid_2, const_dst_h_i32);
+ writer->op_binary(tile_bo, ckw::BinaryOp::Div, tile_gid_2, const_dst_h_i32);
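+    // gid_2 iterates over HEIGHT x BATCH, hence the Mod/Div split above to recover yo and bo.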
+
+ /********************************************************************************
+ * 8 - Write the rest of the code
+ ********************************************************************************/
+ auto tile_xi_f = writer->declare_tile("xi_f", ckw::DataType::Fp32);
+ auto tile_yi_f = writer->declare_tile("yi_f", ckw::DataType::Fp32);
+
+ switch (_attributes.sampling_policy())
+ {
+ case SamplingPolicy::TOP_LEFT:
+ // xi_f = (xo * scale_x)
+ // yi_f = (yo * scale_y)
+ writer->op_cast(tile_xi_f, tile_xo, ckw::ConvertPolicy::None);
+ writer->op_cast(tile_yi_f, tile_yo, ckw::ConvertPolicy::None);
+ writer->op_binary(tile_xi_f, ckw::BinaryOp::Mul, tile_xi_f, const_scale_x_fp);
+ writer->op_binary(tile_yi_f, ckw::BinaryOp::Mul, tile_yi_f, const_scale_y_fp);
+ break;
+ case SamplingPolicy::CENTER:
+ {
+ // xi_f = ((xo + 0.5f) * scale_x - 0.5f)
+ // yi_f = ((yo + 0.5f) * scale_y - 0.5f)
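+            // Subtracting 0.5f after scaling maps the destination pixel centre onto the source grid, so the 2x2
+            // neighbourhood loaded below is centred on the true sampling position.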
+ const auto &tile_xo_plus_half = writer->declare_tile("xo_plus_half", ckw::DataType::Fp32);
+ const auto &tile_yo_plus_half = writer->declare_tile("yo_plus_half", ckw::DataType::Fp32);
+
+ writer->op_cast(tile_xo_plus_half, tile_xo, ckw::ConvertPolicy::None);
+ writer->op_cast(tile_yo_plus_half, tile_yo, ckw::ConvertPolicy::None);
+ writer->op_binary(tile_xo_plus_half, ckw::BinaryOp::Add, tile_xo_plus_half, const_pos_0_5_fp);
+ writer->op_binary(tile_yo_plus_half, ckw::BinaryOp::Add, tile_yo_plus_half, const_pos_0_5_fp);
+ writer->op_binary(tile_xi_f, ckw::BinaryOp::Mul, tile_xo_plus_half, const_scale_x_fp);
+ writer->op_binary(tile_yi_f, ckw::BinaryOp::Mul, tile_yo_plus_half, const_scale_y_fp);
+
+ writer->op_binary(tile_xi_f, ckw::BinaryOp::Sub, tile_xi_f, const_pos_0_5_fp);
+ writer->op_binary(tile_yi_f, ckw::BinaryOp::Sub, tile_yi_f, const_pos_0_5_fp);
+ }
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported sampling policy");
+ }
+
+ // xi = (int)floor(xi_f);
+ // yi = (int)floor(yi_f);
+ auto tile_xi_f_floor = writer->declare_tile("xi_f_floor", ckw::DataType::Fp32);
+ auto tile_yi_f_floor = writer->declare_tile("yi_f_floor", ckw::DataType::Fp32);
+ writer->op_unary(tile_xi_f_floor, ckw::UnaryOp::Floor, tile_xi_f);
+ writer->op_unary(tile_yi_f_floor, ckw::UnaryOp::Floor, tile_yi_f);
+
+ auto tile_xi = writer->declare_tile("xi", ckw::DataType::Int32);
+ auto tile_yi = writer->declare_tile("yi", ckw::DataType::Int32);
+ writer->op_cast(tile_xi, tile_xi_f_floor, ckw::ConvertPolicy::None);
+ writer->op_cast(tile_yi, tile_yi_f_floor, ckw::ConvertPolicy::None);
+
+ // xi0 = clamp(xi, 0, (int)src_w - 1);
+ // yi0 = clamp(yi, 0, (int)src_h - 1);
+ // xi1 = clamp(xi + 1, 0, (int)src_w - 1);
+ // yi1 = clamp(yi + 1, 0, (int)src_h - 1);
+ auto tile_src_w_minus_1 = writer->declare_tile("src_w_minus_1", ckw::DataType::Int32);
+ auto tile_src_h_minus_1 = writer->declare_tile("src_h_minus_1", ckw::DataType::Int32);
+ writer->op_binary(tile_src_w_minus_1, ckw::BinaryOp::Sub, const_src_w_i32, const_pos_1_i32);
+ writer->op_binary(tile_src_h_minus_1, ckw::BinaryOp::Sub, const_src_h_i32, const_pos_1_i32);
+
+ auto tile_xi_plus_1 = writer->declare_tile("xi_plus_1", ckw::DataType::Int32);
+ auto tile_yi_plus_1 = writer->declare_tile("yi_plus_1", ckw::DataType::Int32);
+ writer->op_binary(tile_xi_plus_1, ckw::BinaryOp::Add, tile_xi, const_pos_1_i32);
+ writer->op_binary(tile_yi_plus_1, ckw::BinaryOp::Add, tile_yi, const_pos_1_i32);
+
+ auto tile_xi0 = writer->declare_tile("xi0", ckw::DataType::Int32);
+ auto tile_yi0 = writer->declare_tile("yi0", ckw::DataType::Int32);
+ auto tile_xi1 = writer->declare_tile("xi1", ckw::DataType::Int32);
+ auto tile_yi1 = writer->declare_tile("yi1", ckw::DataType::Int32);
+
+ writer->op_ternary(tile_xi0, ckw::TernaryOp::Clamp, tile_xi, const_0_i32, tile_src_w_minus_1);
+ writer->op_ternary(tile_yi0, ckw::TernaryOp::Clamp, tile_yi, const_0_i32, tile_src_h_minus_1);
+ writer->op_ternary(tile_xi1, ckw::TernaryOp::Clamp, tile_xi_plus_1, const_0_i32, tile_src_w_minus_1);
+ writer->op_ternary(tile_yi1, ckw::TernaryOp::Clamp, tile_yi_plus_1, const_0_i32, tile_src_h_minus_1);
+
+ auto tile_in00 = writer->declare_tile("in00", ckw::TileInfo(dst_dt, 1, dst_n0));
+ auto tile_in01 = writer->declare_tile("in01", ckw::TileInfo(dst_dt, 1, dst_n0));
+ auto tile_in10 = writer->declare_tile("in10", ckw::TileInfo(dst_dt, 1, dst_n0));
+ auto tile_in11 = writer->declare_tile("in11", ckw::TileInfo(dst_dt, 1, dst_n0));
+
+ writer->op_load(tile_in00, src->tensor(), sampler_src, tile_co, tile_xi0, tile_yi0, tile_bo);
+ writer->op_load(tile_in01, src->tensor(), sampler_src, tile_co, tile_xi1, tile_yi0, tile_bo);
+ writer->op_load(tile_in10, src->tensor(), sampler_src, tile_co, tile_xi0, tile_yi1, tile_bo);
+ writer->op_load(tile_in11, src->tensor(), sampler_src, tile_co, tile_xi1, tile_yi1, tile_bo);
+
+ // Weights of each nearest pixel
+ auto tile_a = writer->declare_tile("a", ckw::DataType::Fp32);
+ auto tile_b = writer->declare_tile("b", ckw::DataType::Fp32);
+ auto tile_a1 = writer->declare_tile("a1", ckw::DataType::Fp32);
+ auto tile_b1 = writer->declare_tile("b1", ckw::DataType::Fp32);
+
+ // a = (xi_f - (float)xi)
+ // b = (1.f - a)
+ // a1 = (yi_f - (float)yi)
+ // b1 = (1.f - a1)
+ auto tile_xi_float = writer->declare_tile("xi_float", ckw::DataType::Fp32);
+ auto tile_yi_float = writer->declare_tile("yi_float", ckw::DataType::Fp32);
+ writer->op_cast(tile_xi_float, tile_xi, ckw::ConvertPolicy::None);
+ writer->op_cast(tile_yi_float, tile_yi, ckw::ConvertPolicy::None);
+
+ writer->op_binary(tile_a, ckw::BinaryOp::Sub, tile_xi_f, tile_xi_float);
+ writer->op_binary(tile_b, ckw::BinaryOp::Sub, const_pos_1_fp, tile_a);
+ writer->op_binary(tile_a1, ckw::BinaryOp::Sub, tile_yi_f, tile_yi_float);
+ writer->op_binary(tile_b1, ckw::BinaryOp::Sub, const_pos_1_fp, tile_a1);
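+    // The blend below computes dst = b * b1 * in00 + a * b1 * in01 + b * a1 * in10 + a * a1 * in11; the four weights sum to 1.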
+
+ // Cast weights to source type
+ const auto &tile_a_src_type = writer->declare_tile("a_src_t", to_ckw(_src->data_type()));
+ const auto &tile_b_src_type = writer->declare_tile("b_src_t", to_ckw(_src->data_type()));
+ const auto &tile_a1_src_type = writer->declare_tile("a1_src_t", to_ckw(_src->data_type()));
+ const auto &tile_b1_src_type = writer->declare_tile("b1_src_t", to_ckw(_src->data_type()));
+
+ writer->op_cast(tile_a_src_type, tile_a, ckw::ConvertPolicy::None);
+ writer->op_cast(tile_b_src_type, tile_b, ckw::ConvertPolicy::None);
+ writer->op_cast(tile_a1_src_type, tile_a1, ckw::ConvertPolicy::None);
+ writer->op_cast(tile_b1_src_type, tile_b1, ckw::ConvertPolicy::None);
+
+ // in00 * b * b1
+ writer->op_binary(tile_in00, ckw::BinaryOp::Mul, tile_in00, tile_b_src_type);
+ writer->op_binary(tile_in00, ckw::BinaryOp::Mul, tile_in00, tile_b1_src_type);
+
+ // in01 * a * b1
+ writer->op_binary(tile_in01, ckw::BinaryOp::Mul, tile_in01, tile_a_src_type);
+ writer->op_binary(tile_in01, ckw::BinaryOp::Mul, tile_in01, tile_b1_src_type);
+
+ // in10 * b * a1
+ writer->op_binary(tile_in10, ckw::BinaryOp::Mul, tile_in10, tile_b_src_type);
+ writer->op_binary(tile_in10, ckw::BinaryOp::Mul, tile_in10, tile_a1_src_type);
+
+ // in11 * a * a1
+ writer->op_binary(tile_in11, ckw::BinaryOp::Mul, tile_in11, tile_a_src_type);
+ writer->op_binary(tile_in11, ckw::BinaryOp::Mul, tile_in11, tile_a1_src_type);
+
+ // Summation of above terms
+ writer->op_assign(tile_dst, tile_in00);
+ writer->op_binary(tile_dst, ckw::BinaryOp::Add, tile_dst, tile_in01);
+ writer->op_binary(tile_dst, ckw::BinaryOp::Add, tile_dst, tile_in10);
+ writer->op_binary(tile_dst, ckw::BinaryOp::Add, tile_dst, tile_in11);
+}
+
+void GpuCkwResize::write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const
+{
+ switch (_attributes.interpolation_policy())
+ {
+ case InterpolationPolicy::NEAREST_NEIGHBOR:
+ do_nearest_neighbor_resize(comp_group, vtable, writer);
+ break;
+ case InterpolationPolicy::BILINEAR:
+ do_bilinear_resize(comp_group, vtable, writer);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported interpolation policy");
+ }
+}
+
+Window GpuCkwResize::get_window() const
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
+
+ const uint32_t n0 = adjust_vec_size(opencl_vector_size_in_bytes / _src->element_size(), _src->dimension(0));
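+    // As an illustration, assuming the usual 16-byte OpenCL vector size, an F16 source gives n0 = 8 unless dimension 0 is smaller.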
+ Window win = calculate_max_window(*_dst, Steps(n0));
+ return win.collapse(win, Window::DimZ);
+}
+
+std::string GpuCkwResize::get_tuner_id(const ComponentGroup &comp_group) const
+{
+ ARM_COMPUTE_UNUSED(comp_group);
+
+ std::string tuner_id = "resize_";
+ tuner_id += _attributes.interpolation_policy() == InterpolationPolicy::NEAREST_NEIGHBOR ? "nearest_neighbor" : "";
+ tuner_id += _attributes.interpolation_policy() == InterpolationPolicy::BILINEAR ? "bilinear" : "";
+ tuner_id += "_";
+ tuner_id += _attributes.sampling_policy() == SamplingPolicy::CENTER ? "center" : "topleft";
+ tuner_id += "_";
+ tuner_id += support::cpp11::to_string(_dst->dimension(0));
+ tuner_id += "_";
+ tuner_id += support::cpp11::to_string(_dst->dimension(1));
+ tuner_id += "_";
+ tuner_id += support::cpp11::to_string(_dst->dimension(2));
+ tuner_id += "_";
+ tuner_id += support::cpp11::to_string(_dst->dimension(3));
+
+ return tuner_id;
+}
+
+std::string GpuCkwResize::get_name(const ComponentGroup &comp_group) const
+{
+ ARM_COMPUTE_UNUSED(comp_group);
+
+ std::string name = "resize_";
+ name += _attributes.interpolation_policy() == InterpolationPolicy::NEAREST_NEIGHBOR ? "nearest_neighbor" : "";
+ name += _attributes.interpolation_policy() == InterpolationPolicy::BILINEAR ? "bilinear" : "";
+
+ return name;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.h
new file mode 100644
index 0000000000..1266c05921
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWRESIZE_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWRESIZE_H
+
+#include "src/core/common/Macros.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class GpuCkwResize final : public IGpuCkwComponentDriver
+{
+public:
+ using Attributes = ClComponentResize::Attributes;
+
+public:
+ /** Constructor
+ *
+ * @param[in] id Component id
+     * @param[in] tensors    Tensor arguments to the component
+ * @param[in] attributes Component attributes
+ */
+ GpuCkwResize(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes);
+
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwResize);
+
+ /** Destructor */
+ ~GpuCkwResize() override = default;
+
+    // Inherited methods overridden
+ virtual void write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const override;
+ Window get_window() const override;
+ std::string get_name(const ComponentGroup &comp_group) const override;
+ std::string get_tuner_id(const ComponentGroup &comp_group) const override;
+
+private:
+ /** Resize using nearest neighbor interpolation
+ *
+     * @param[in]      comp_group Component group to which this component belongs
+ * @param[in, out] vtable Table of variables declared by this component
+ * @param[in, out] writer CKW writer that writes code scoped to this kernel component
+ */
+ void do_nearest_neighbor_resize(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const;
+
+ /** Resize using bilinear interpolation
+ *
+     * @param[in]      comp_group Component group to which this component belongs
+ * @param[in, out] vtable Table of variables declared by this component
+ * @param[in, out] writer CKW writer that writes code scoped to this kernel component
+ */
+ void do_bilinear_resize(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const;
+
+ const ITensorInfo *_src;
+ const ITensorInfo *_dst;
+ Attributes _attributes;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWRESIZE_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp
new file mode 100644
index 0000000000..d9d741fea5
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "GpuCkwStore.h"
+
+#include "arm_compute/core/Error.h"
+
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
+
+#include <cstdint>
+#include <string>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+GpuCkwStore::GpuCkwStore(ComponentId id, const ArgumentPack<ITensorInfo> &tensors)
+ : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{}
+{
+ _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
+ _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
+}
+void GpuCkwStore::write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const
+{
+ /********************************************************************************
+ * 1 - Define tensors
+ ********************************************************************************/
+ GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, "src");
+ GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, "dst");
+
+ /********************************************************************************
+ * 2 - Define CKW constants
+ ********************************************************************************/
+ const auto dst_h = static_cast<int32_t>(_dst->dimension(2));
+
+ auto const_0_i32 = writer->declare_constant_tile(ckw::ConstantData({{0}}, ckw::DataType::Int32));
+ auto const_pos_1_i32 = writer->declare_constant_tile(ckw::ConstantData({{1}}, ckw::DataType::Int32));
+ auto const_dst_h_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_h}}, ckw::DataType::Int32));
+
+ /********************************************************************************
+ * 3 - Define the compute block parameters and destination tile (if not root component)
+ * Bind the tile to the tensor to share it among different components and
+ * initialize the compute block parameters
+ ********************************************************************************/
+ const auto &tile_src = src->tile();
+ auto &sampler_src = src->tensor_sampler();
+
+ const auto dst_n0 = static_cast<int32_t>(tile_src.tile_info().width());
+ const auto dst_m0 = static_cast<int32_t>(tile_src.tile_info().height());
+ const int32_t dst_n0_partial = _dst->dimension(0) % dst_n0;
+ const int32_t dst_shift_back = (dst_n0 - dst_n0_partial) % dst_n0;
+
+ /********************************************************************************
+ * 4 - Define the compute block parameters CKW constants
+ ********************************************************************************/
+ auto const_n0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_n0}}, ckw::DataType::Int32));
+ auto const_m0_i32 = writer->declare_constant_tile(ckw::ConstantData({{dst_m0}}, ckw::DataType::Int32));
+ auto const_shift_back_n0_i32 =
+ writer->declare_constant_tile(ckw::ConstantData({{dst_shift_back}}, ckw::DataType::Int32));
+
+ /********************************************************************************
+ * 5 - Define the samplers for the input tensor
+ ********************************************************************************/
+ // Not required
+
+ /********************************************************************************
+ * 6 - Extra operations required before writing the main code
+ ********************************************************************************/
+ // Not required
+
+ /********************************************************************************
+ * 7 - Get the coordinates of the destination tile
+ ********************************************************************************/
+ auto tile_gid_0 = writer->declare_tile("gid_0", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_1 = writer->declare_tile("gid_1", ckw::TileInfo(ckw::DataType::Int32));
+ auto tile_gid_2 = writer->declare_tile("gid_2", ckw::TileInfo(ckw::DataType::Int32));
+
+ writer->op_get_global_id(tile_gid_0, 0);
+ writer->op_get_global_id(tile_gid_1, 1);
+ writer->op_get_global_id(tile_gid_2, 2);
+
+ auto tile_nout0 = writer->declare_tile("cout0", ckw::TileInfo(ckw::DataType::Int32)); // OFM
+ auto tile_mout0 = writer->declare_tile("mout0", ckw::TileInfo(ckw::DataType::Int32)); // WIDTH or WIDTH x HEIGHT
+ auto tile_mout1 = writer->declare_tile("mout1", ckw::TileInfo(ckw::DataType::Int32)); // HEIGHT or 0
+ auto tile_bout0 = writer->declare_tile("bout0", ckw::TileInfo(ckw::DataType::Int32)); // BATCH SIZE IDX
+
+ // Calculate coordinates
+ get_coordinate_from_gws_overlapping_min(writer, tile_nout0, tile_gid_0, const_n0_i32, const_shift_back_n0_i32,
+ const_0_i32);
+ get_coordinate_from_gws(writer, tile_mout0, tile_gid_1, const_m0_i32);
+
+ // Get the boundary aware coordinates at each global dimension index
+ if (sampler_src.format() == ckw::TensorSamplerFormat::Dim0_Dim1xDim2_1)
+ {
+ writer->op_assign(tile_mout1, const_0_i32);
+ get_coordinate_from_gws(writer, tile_bout0, tile_gid_2, const_pos_1_i32);
+ }
+ else if (sampler_src.format() == ckw::TensorSamplerFormat::Dim0_Dim1_Dim2)
+ {
+        // For tile_mout1 and tile_bout0 the step can only be 1.
+        // gid_2 packs HEIGHT and BATCH together, so the row index is gid_2 % dst_h and the batch index is gid_2 / dst_h.
+ writer->op_binary(tile_mout1, ckw::BinaryOp::Mod, tile_gid_2, const_dst_h_i32);
+ writer->op_binary(tile_bout0, ckw::BinaryOp::Div, tile_gid_2, const_dst_h_i32);
+ }
+
+ /********************************************************************************
+ * 8 - Write the rest of the code
+ ********************************************************************************/
+ writer->op_store(dst->tensor(), tile_src, sampler_src, tile_nout0, tile_mout0, tile_mout1, tile_bout0);
+}
+
+std::string GpuCkwStore::get_name(const ComponentGroup &comp_group) const
+{
+ ARM_COMPUTE_UNUSED(comp_group);
+ return "store";
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h
new file mode 100644
index 0000000000..c9ce7eb269
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWSTORE_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWSTORE_H
+
+#include "src/core/common/Macros.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class GpuCkwStore : public IGpuCkwComponentDriver
+{
+public:
+ /** Constructor
+ *
+ * @param[in] id Component id
+ * @param[in] tensors Tensor arguments to the component
+ */
+ GpuCkwStore(ComponentId id, const ArgumentPack<ITensorInfo> &tensors);
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwStore);
+ /** Destructor */
+ ~GpuCkwStore() override = default;
+    // Inherited methods overridden:
+ virtual void write_component_code(const ComponentGroup &comp_group,
+ GpuCkwVariableTable &vtable,
+ GpuCkwScopedKernelWriter writer) const override;
+ std::string get_name(const ComponentGroup &comp_group) const override;
+
+private:
+ const ITensorInfo *_src;
+ const ITensorInfo *_dst;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_GPUCKWSTORE_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.cpp
new file mode 100644
index 0000000000..1e6f0841ad
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.cpp
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "CkwHelper.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+void get_coordinate_from_gws(GpuCkwScopedKernelWriter writer,
+ ckw::TileOperand &coord,
+ const ckw::TileOperand &gid,
+ ckw::TileOperand &step)
+{
+ writer->op_binary(coord, ckw::BinaryOp::Mul, gid, step);
+}
+
+void get_coordinate_from_gws_overlapping_min(GpuCkwScopedKernelWriter writer,
+ ckw::TileOperand &coord,
+ const ckw::TileOperand &gid,
+ ckw::TileOperand &step,
+ ckw::TileOperand &shift_back,
+ ckw::TileOperand &const_0)
+{
+ // Applied formula: max((gid * step) - shift_back, 0)
+ // where the shift_back operand is: (step - leftover_step) % step
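+    // Example: 10 elements processed with a step of 4 give leftover_step = 2 and shift_back = 2, so gids 0, 1, 2 map to
+    // coordinates 0, 2 and 6; the last block overlaps the previous one instead of running out of bounds.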
+
+ writer->op_binary(coord, ckw::BinaryOp::Mul, gid, step);
+ writer->op_binary(coord, ckw::BinaryOp::Sub, coord, shift_back);
+ writer->op_binary(coord, ckw::BinaryOp::Max, coord, const_0);
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h
new file mode 100644
index 0000000000..956e7c8ecb
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/CkwHelper.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_CKWHELPER_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_CKWHELPER_H
+
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Get coordinate along one axis.
+ *
+ * @param[in,out] writer Writer
+ * @param[out] coord Resultant coordinate
+ * @param[in] gid Global work item id
+ * @param[in] step Step size / vector size
+ */
+void get_coordinate_from_gws(GpuCkwScopedKernelWriter writer,
+ ckw::TileOperand &coord,
+ const ckw::TileOperand &gid,
+ ckw::TileOperand &step);
+
+/** Get boundary aware coordinate along one axis.
+ *
+ * @param[in,out] writer Writer
+ * @param[out] coord Resultant coordinate
+ * @param[in] gid Global work item id
+ * @param[in] step Step size / vector size
+ * @param[in]     shift_back Shift-back amount, equal to (step - leftover_step) % step
+ * @param[in] const_0 Constant tile of value 0
+ */
+void get_coordinate_from_gws_overlapping_min(GpuCkwScopedKernelWriter writer,
+ ckw::TileOperand &coord,
+ const ckw::TileOperand &gid,
+ ckw::TileOperand &step,
+ ckw::TileOperand &shift_back,
+ ckw::TileOperand &const_0);
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_CKWHELPER_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.cpp
new file mode 100644
index 0000000000..ad31b06362
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.cpp
@@ -0,0 +1,162 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Common.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+ckw::DataType to_ckw(DataType dt)
+{
+ switch (dt)
+ {
+ case DataType::F32:
+ return ckw::DataType::Fp32;
+ case DataType::F16:
+ return ckw::DataType::Fp16;
+ case DataType::S32:
+ return ckw::DataType::Int32;
+ case DataType::S16:
+ return ckw::DataType::Int16;
+ case DataType::S8:
+ case DataType::QASYMM8_SIGNED:
+ return ckw::DataType::Int8;
+ case DataType::U32:
+ return ckw::DataType::Uint32;
+ case DataType::U16:
+ return ckw::DataType::Uint16;
+ case DataType::U8:
+ case DataType::QASYMM8:
+ return ckw::DataType::Uint8;
+ default:
+ return ckw::DataType::Unknown;
+ }
+}
+
+ckw::TensorShape to_ckw(const TensorShape &shape)
+{
+ ARM_COMPUTE_ERROR_ON(shape.num_max_dimensions < std::tuple_size<ckw::TensorShape>{});
+ ARM_COMPUTE_ERROR_ON(std::tuple_size<ckw::TensorShape>{} != 5);
+ /// NOTE: Overflow danger. Use size_t?
+ return ckw::TensorShape{static_cast<int32_t>(shape[0]), static_cast<int32_t>(shape[1]),
+ static_cast<int32_t>(shape[2]), static_cast<int32_t>(shape[3]),
+ static_cast<int32_t>(shape[4])};
+}
+
+ckw::TensorDataLayout to_ckw(DataLayout dl)
+{
+ switch (dl)
+ {
+ case DataLayout::NHWC:
+ return ckw::TensorDataLayout::Nhwc;
+ case DataLayout::NDHWC:
+ return ckw::TensorDataLayout::Ndhwc;
+ default:
+ return ckw::TensorDataLayout::Unknown;
+ }
+}
+
+ckw::TensorInfo to_ckw(const ITensorInfo &tensor_info)
+{
+ return ckw::TensorInfo{to_ckw(tensor_info.data_type()), to_ckw(tensor_info.tensor_shape()),
+ to_ckw(tensor_info.data_layout()), tensor_info.id()};
+}
+
+ckw::TensorStorageType to_ckw(const TensorStorageType &storage)
+{
+ switch (storage)
+ {
+ case TensorStorageType::ClBufferUint8Ptr:
+ return ckw::TensorStorageType::BufferUint8Ptr;
+ case TensorStorageType::ClImage2dReadOnly:
+ return ckw::TensorStorageType::Texture2dReadOnly;
+ case TensorStorageType::ClImage2dWriteOnly:
+ return ckw::TensorStorageType::Texture2dWriteOnly;
+ case TensorStorageType::Unknown:
+ return ckw::TensorStorageType::Unknown;
+ default:
+ ARM_COMPUTE_ERROR("Unknown tensor storage type");
+ }
+}
+
+TensorComponentType from_ckw(const ckw::TensorComponentType &component)
+{
+ switch (component)
+ {
+ case ckw::TensorComponentType::OffsetFirstElement:
+ return TensorComponentType::OffsetFirstElement;
+ case ckw::TensorComponentType::Stride0:
+ return TensorComponentType::Stride0;
+ case ckw::TensorComponentType::Stride1:
+ return TensorComponentType::Stride1;
+ case ckw::TensorComponentType::Stride2:
+ return TensorComponentType::Stride2;
+ case ckw::TensorComponentType::Stride3:
+ return TensorComponentType::Stride3;
+ case ckw::TensorComponentType::Stride4:
+ return TensorComponentType::Stride4;
+ case ckw::TensorComponentType::Dim0:
+ return TensorComponentType::Dim0;
+ case ckw::TensorComponentType::Dim1:
+ return TensorComponentType::Dim1;
+ case ckw::TensorComponentType::Dim2:
+ return TensorComponentType::Dim2;
+ case ckw::TensorComponentType::Dim3:
+ return TensorComponentType::Dim3;
+ case ckw::TensorComponentType::Dim4:
+ return TensorComponentType::Dim4;
+ case ckw::TensorComponentType::Dim1xDim2:
+ return TensorComponentType::Dim1xDim2;
+ case ckw::TensorComponentType::Dim2xDim3:
+ return TensorComponentType::Dim2xDim3;
+ case ckw::TensorComponentType::Dim1xDim2xDim3:
+ return TensorComponentType::Dim1xDim2xDim3;
+ case ckw::TensorComponentType::Unknown:
+ return TensorComponentType::Unknown;
+ default:
+ ARM_COMPUTE_ERROR("Unknown CKW tensor component");
+ }
+}
+
+TensorStorageType from_ckw(const ckw::TensorStorageType &storage)
+{
+ switch (storage)
+ {
+ case ckw::TensorStorageType::BufferUint8Ptr:
+ return TensorStorageType::ClBufferUint8Ptr;
+ case ckw::TensorStorageType::Texture2dReadOnly:
+ return TensorStorageType::ClImage2dReadOnly;
+ case ckw::TensorStorageType::Texture2dWriteOnly:
+ return TensorStorageType::ClImage2dWriteOnly;
+ case ckw::TensorStorageType::Unknown:
+ return TensorStorageType::Unknown;
+ default:
+ ARM_COMPUTE_ERROR("Unknown CKW tensor storage type");
+ }
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h
new file mode 100644
index 0000000000..26740cdd04
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_COMMON_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_COMMON_H
+
+#include "arm_compute/core/CoreTypes.h"
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/TensorShape.h"
+
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+
+#include "compute_kernel_writer/include/ckw/TensorInfo.h"
+#include "compute_kernel_writer/include/ckw/types/DataType.h"
+#include "compute_kernel_writer/include/ckw/types/TensorComponentType.h"
+#include "compute_kernel_writer/include/ckw/types/TensorStorageType.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Convert the Compute Library data type to Compute Kernel Writer data type
+ *
+ * @param[in] dt The Compute Library data type
+ *
+ * @return the Compute Kernel Writer data type (ckw::DataType)
+ */
+ckw::DataType to_ckw(DataType dt);
+
+/** Convert the Compute Library tensor shape to Compute Kernel Writer tensor shape
+ *
+ * @param[in] shape The Compute Library tensor shape
+ *
+ * @return the Compute Kernel Writer tensor shape (ckw::TensorShape)
+ */
+ckw::TensorShape to_ckw(const TensorShape &shape);
+
+/** Convert the Compute Library data layout to Compute Kernel Writer data layout
+ *
+ * @param[in] dl The Compute Library data layout
+ *
+ * @return the Compute Kernel Writer data layout (ckw::TensorDataLayout)
+ */
+ckw::TensorDataLayout to_ckw(DataLayout dl);
+
+/** Convert the Compute Library tensor info to Compute Kernel Writer tensor info
+ *
+ * @param[in] tensor_info The Compute Library tensor info
+ *
+ * @return the Compute Kernel Writer tensor info (ckw::TensorInfo)
+ */
+ckw::TensorInfo to_ckw(const ITensorInfo &tensor_info);
+
+/** Convert the Compute Library tensor storage to Compute Kernel Writer tensor storage
+ *
+ * @param[in] storage The Compute Library tensor storage
+ *
+ * @return the Compute Kernel Writer tensor storage (ckw::TensorStorageType)
+ */
+ckw::TensorStorageType to_ckw(const TensorStorageType &storage);
+
+/** Convert the Compute Kernel Writer tensor component to Compute Library tensor component
+ *
+ * @param[in] component The Compute Kernel Writer tensor component
+ *
+ * @return the Compute Library tensor component
+ */
+TensorComponentType from_ckw(const ckw::TensorComponentType &component);
+
+/** Convert the Compute Kernel Writer tensor storage to Compute Library tensor storage
+ *
+ * @param[in] storage The Compute Kernel Writer tensor storage
+ *
+ * @return the Compute Library tensor storage
+ */
+TensorStorageType from_ckw(const ckw::TensorStorageType &storage);
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_COMMON_H
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.cpp
new file mode 100644
index 0000000000..5630e390d5
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h"
+
+#include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+ckw::BinaryOp to_ckw(const ElementwiseBinaryCommonAttributes &attributes)
+{
+ switch (attributes.operation())
+ {
+ case ElementwiseBinaryCommonAttributes::ElementwiseOp::Add:
+ return ckw::BinaryOp::Add;
+ case ElementwiseBinaryCommonAttributes::ElementwiseOp::Sub:
+ return ckw::BinaryOp::Sub;
+ case ElementwiseBinaryCommonAttributes::ElementwiseOp::Div:
+ return ckw::BinaryOp::Div;
+ case ElementwiseBinaryCommonAttributes::ElementwiseOp::Mul:
+ return ckw::BinaryOp::Mul;
+ case ElementwiseBinaryCommonAttributes::ElementwiseOp::Min:
+ case ElementwiseBinaryCommonAttributes::ElementwiseOp::Max:
+ case ElementwiseBinaryCommonAttributes::ElementwiseOp::Power:
+ case ElementwiseBinaryCommonAttributes::ElementwiseOp::Prelu:
+ case ElementwiseBinaryCommonAttributes::ElementwiseOp::SquaredDiff:
+ default:
+ ARM_COMPUTE_ERROR("Cannot convert ElementwiseBinaryCommonAttributes to corresponding ckw::BinaryOp");
+ }
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h
new file mode 100644
index 0000000000..644a407702
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_ELEMENTWISEBINARY_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_ELEMENTWISEBINARY_H
+
+#include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h"
+
+#include "compute_kernel_writer/include/ckw/types/Operators.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+ckw::BinaryOp to_ckw(const ElementwiseBinaryCommonAttributes &attributes);
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_ELEMENTWISEBINARY_H
diff --git a/src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h b/src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h
new file mode 100644
index 0000000000..ee109a7e2b
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_GPUKERNELCOMPONENTFACTORY
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_GPUKERNELCOMPONENTFACTORY
+
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+
+#include "Types.h"
+#include <memory>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Factory class that creates new instances of @ref IGpuKernelComponent by assigning new component ids
+ */
+class GpuKernelComponentFactory
+{
+public:
+ /** Create a new kernel component
+ *
+ * @tparam T Any polymorphic type descending from @ref IGpuKernelComponent
+ * @tparam Args Argument types to construct the kernel component
+ *
+ * @param[in] args Arguments to construct the kernel component
+ *
+ * @return std::unique_ptr<IGpuKernelComponent>
+ */
+ template <typename T, typename... Args>
+ std::unique_ptr<IGpuKernelComponent> create(Args &&...args)
+ {
+ return std::make_unique<T>(_count++, std::forward<Args>(args)...);
+ }
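+
+    // Usage sketch (illustrative only; MyComponent stands for any concrete IGpuKernelComponent):
+    //   GpuKernelComponentFactory factory;
+    //   auto comp0 = factory.create<MyComponent>(/* ctor arguments after the id */); // assigned id 0
+    //   auto comp1 = factory.create<MyComponent>(/* ctor arguments after the id */); // assigned id 1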
+
+private:
+ ComponentId _count{0};
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_GPUKERNELCOMPONENTFACTORY */
diff --git a/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h b/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h
new file mode 100644
index 0000000000..6678c929e9
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_IGPUKERNELCOMPONENT_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_IGPUKERNELCOMPONENT_H
+
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h"
+
+#include "Types.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Properties common to all kernel component types */
+class KernelProperties
+{
+public:
+ KernelProperties &stage(const UnitWorkloadStage &stage)
+ {
+ _stage = stage;
+ return *this;
+ }
+ UnitWorkloadStage stage() const
+ {
+ return _stage;
+ }
+
+private:
+ UnitWorkloadStage _stage{};
+};
+
+inline bool operator==(const KernelProperties &config0, const KernelProperties &config1)
+{
+ return config0.stage() == config1.stage();
+}
+
+/** Forward declaration */
+class IGpuTemplateComponentWriter;
+class IGpuCkwComponentDriver;
+
+/** An abstract interface of a kernel component. It allows the component graph to manipulate components, for example when performing fusion.
+ */
+class IGpuKernelComponent
+{
+public:
+ using Properties = KernelProperties;
+
+public:
+ /** Constructor
+ *
+ * @param[in] id Component id
+ * @param[in] properties Kernel component properties
+     * @param[in] tensors    Tensor arguments to the component
+ */
+ IGpuKernelComponent(ComponentId id, const Properties &properties, const ArgumentPack<ITensorInfo> &tensors)
+ : _id{id}, _properties{properties}, _tensors{tensors}
+ {
+ }
+ /** Destructor */
+ virtual ~IGpuKernelComponent()
+ {
+ }
+ /** Get component id */
+ ComponentId id() const
+ {
+ return _id;
+ }
+ /** Get tensor arguments */
+ ArgumentPack<ITensorInfo> tensors() const
+ {
+ return _tensors;
+ }
+ /** Get properties */
+ Properties properties() const
+ {
+ return _properties;
+ }
+ /** Get writer for the component */
+ virtual const IGpuCkwComponentDriver *ckw_component_driver() const
+ {
+ return nullptr;
+ }
+ /** Get component type */
+ virtual GpuComponentType type() const = 0;
+
+private:
+ ComponentId _id{-1};
+ Properties _properties{};
+ ArgumentPack<ITensorInfo> _tensors{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_IGPUKERNELCOMPONENT_H
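A minimal usage sketch of the KernelProperties fluent setter and the stage-based operator== defined above (not part of this patch; it assumes the headers introduced here are on the include path and that UnitWorkloadStage, pulled in from GpuWorkloadSourceCode.h, is default-constructible and equality-comparable):

#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"

using namespace arm_compute::experimental::dynamic_fusion;

void kernel_properties_sketch()
{
    // Build properties with the fluent setter; the stage defaults to UnitWorkloadStage{}.
    const KernelProperties p0 = KernelProperties().stage(UnitWorkloadStage{});
    const KernelProperties p1{};

    // operator== above compares the stage only, so these two compare equal.
    const bool same = (p0 == p1);
    (void)same;
}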
diff --git a/src/dynamic_fusion/sketch/gpu/components/Types.h b/src/dynamic_fusion/sketch/gpu/components/Types.h
new file mode 100644
index 0000000000..54b3a69057
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/Types.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_TYPES
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_TYPES
+
+#include <cstdint>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Uniquely identifies a kernel component within a workload
+ */
+using ComponentId = int32_t;
+
+/** Component type in the context of fusion
+ * Its main purpose is to inform the optimizer how to perform fusion.
+ */
+enum class GpuComponentType
+{
+ Complex,
+ Simple,
+ Unfusable,
+ Output
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_TYPES */
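GpuComponentType is the only fusion-related classification a component carries; the actual fusion rules live elsewhere in the component group logic. The helper below is a purely illustrative sketch (not part of this patch) that simply maps each enumerator to a printable name:

#include "src/dynamic_fusion/sketch/gpu/components/Types.h"

// Illustrative helper only: name each GpuComponentType enumerator.
inline const char *to_string(arm_compute::experimental::dynamic_fusion::GpuComponentType type)
{
    using arm_compute::experimental::dynamic_fusion::GpuComponentType;
    switch (type)
    {
        case GpuComponentType::Complex:
            return "Complex";
        case GpuComponentType::Simple:
            return "Simple";
        case GpuComponentType::Unfusable:
            return "Unfusable";
        case GpuComponentType::Output:
            return "Output";
        default:
            return "Unknown";
    }
}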
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.cpp
new file mode 100644
index 0000000000..e316bdf46d
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.cpp
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ClComponentActivation.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+Status ClComponentActivation::validate(const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes)
+{
+ ARM_COMPUTE_UNUSED(properties, attributes);
+
+ const ITensorInfo *const src = tensors.get_const_tensor(TensorType::ACL_SRC);
+ const ITensorInfo *const dst = tensors.get_const_tensor(TensorType::ACL_DST);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
+
+ // All tensor infos are initialized
+ ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
+
+ // Device requirements are met
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(dst);
+
+ return Status{};
+}
+
+ClComponentActivation::ClComponentActivation(ComponentId id,
+ const IGpuKernelComponent::Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes)
+ : IGpuKernelComponent{id, properties, tensors},
+ _component_writer{std::make_unique<GpuCkwActivation>(id, tensors, attributes)}
+{
+}
+
+ClComponentActivation::~ClComponentActivation()
+{
+}
+
+const IGpuCkwComponentDriver *ClComponentActivation::ckw_component_driver() const
+{
+ return _component_writer.get();
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
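A minimal sketch of how a caller might drive ClComponentActivation::validate() (not part of this patch). It assumes ArgumentPack<ITensorInfo>, declared in ArgumentPack.h of this series, exposes an add_const_tensor(TensorType, const ITensorInfo *) method; the tensor shapes and the RELU attribute are illustrative:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"

#include "src/dynamic_fusion/sketch/ArgumentPack.h"
#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h"

using namespace arm_compute;
using namespace arm_compute::experimental::dynamic_fusion;

Status validate_relu_sketch()
{
    // Activation requires matching shape, data type and layout between src and dst.
    const TensorInfo src(TensorShape(16U, 8U, 4U), 1, DataType::F32);
    const TensorInfo dst(TensorShape(16U, 8U, 4U), 1, DataType::F32);

    ArgumentPack<ITensorInfo> tensors{};
    tensors.add_const_tensor(TensorType::ACL_SRC, &src); // assumed ArgumentPack API
    tensors.add_const_tensor(TensorType::ACL_DST, &dst);

    const ClComponentActivation::Attributes attributes =
        ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU);

    return ClComponentActivation::validate(KernelProperties{}, tensors, attributes);
}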
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h
new file mode 100644
index 0000000000..b8185158f3
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTACTIVATION_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTACTIVATION_H
+
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+
+namespace arm_compute
+{
+/** Forward declaration */
+class ITensorInfo;
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+template <typename T>
+class ArgumentPack;
+
+/** Forward declaration */
+class GpuCkwActivation;
+
+class ClComponentActivation final : public IGpuKernelComponent
+{
+public:
+ /** Attributes are a set of backend-agnostic parameters that define what a component does */
+ using Attributes = ActivationLayerInfo;
+
+ /** Validate the component
+ *
+ * @param[in] properties Component properties @ref Properties
+ * @param[in,out] tensors Tensor arguments to the component
+ * @param[in] attributes Component attributes @ref Attributes
+ *
+ * @return Status Validation results
+ *
+ * Tensor argument names:
+ * - ACL_SRC: Input
+ * - ACL_DST: Output
+ *
+ * Tensor argument constness:
+ * - ACL_SRC: Const
+ * - ACL_DST: Const
+ *
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |ACL_SRC |ACL_DST |
+ * |:--------------|:--------------|
+ * |F16 |F16 |
+ * |F32 |F32 |
+ */
+ static Status
+ validate(const Properties &properties, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes);
+
+ /** Constructor
+ *
+ * Similar to @ref ClComponentActivation::validate()
+ */
+ ClComponentActivation(ComponentId id,
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes);
+
+ /** Destructor */
+ ~ClComponentActivation() override;
+
+ /** Prevent instances of this class from being copy constructed */
+ ClComponentActivation(const ClComponentActivation &component) = delete;
+
+ /** Prevent instances of this class from being copied */
+ ClComponentActivation &operator=(const ClComponentActivation &component) = delete;
+
+ /** Allow instances of this class to be move constructed */
+ ClComponentActivation(ClComponentActivation &&component) = default;
+
+ /** Allow instances of this class to be moved */
+ ClComponentActivation &operator=(ClComponentActivation &&component) = default;
+
+ /** Get writer for the component */
+ const IGpuCkwComponentDriver *ckw_component_driver() const override;
+
+ /** Get component type */
+ GpuComponentType type() const override
+ {
+ return GpuComponentType::Simple;
+ }
+
+private:
+ std::unique_ptr<GpuCkwActivation> _component_writer;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTACTIVATION_H
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.cpp
new file mode 100644
index 0000000000..e1850d78c4
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.cpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ClComponentCast.h"
+
+#include "arm_compute/core/Error.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+Status ClComponentCast::validate(const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings)
+{
+ ARM_COMPUTE_UNUSED(properties, attributes, settings);
+
+ const ITensorInfo *src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ const ITensorInfo *dst = tensors.get_const_tensor(TensorType::ACL_DST_0);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON(src == dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == attributes.data_type(),
+ "input and target data types should be different");
+
+ // Validate in case of configured dst
+ if (dst->total_size() > 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != attributes.data_type(),
+ "dst and target data types should be same");
+ }
+
+ return Status{};
+}
+ClComponentCast::ClComponentCast(ComponentId id,
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings)
+ : IGpuKernelComponent{id, properties, tensors},
+ _component_writer{std::make_unique<GpuCkwCast>(id, tensors, attributes)}
+{
+ ARM_COMPUTE_UNUSED(attributes, settings);
+}
+
+ClComponentCast::~ClComponentCast()
+{
+}
+
+const IGpuCkwComponentDriver *ClComponentCast::ckw_component_driver() const
+{
+ return _component_writer.get();
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
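The type-related part of the validation above boils down to two rules: the source type must differ from the requested target type, and a pre-configured destination must already carry the target type. A standalone paraphrase (illustrative only, using arm_compute::DataType):

#include "arm_compute/core/Types.h"

// Illustrative restatement of the type rules enforced by ClComponentCast::validate above.
// dst_is_configured corresponds to dst->total_size() > 0 in the component code.
inline bool cast_type_rules_ok(arm_compute::DataType src_type,
                               arm_compute::DataType dst_type,
                               arm_compute::DataType target_type,
                               bool                  dst_is_configured)
{
    if (src_type == target_type)
    {
        return false; // input and target data types must be different
    }
    if (dst_is_configured && dst_type != target_type)
    {
        return false; // a configured dst must already have the target data type
    }
    return true;
}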
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h
new file mode 100644
index 0000000000..201dacc288
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTCAST_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTCAST_H
+
+#include "arm_compute/dynamic_fusion/sketch/attributes/CastAttributes.h"
+
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+
+namespace arm_compute
+{
+/** Forward declaration */
+class ITensorInfo;
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+template <typename T>
+class ArgumentPack;
+
+/** Component specific settings
+ */
+class ClComponentCastSettings
+{
+public:
+private:
+};
+
+/** Forward declaration */
+class GpuCkwCast;
+
+class ClComponentCast final : public IGpuKernelComponent
+{
+public:
+ /** Attributes are a set of backend-agnostic parameters that define what a component does */
+ using Attributes = CastAttributes;
+ /** Settings are a set of backend-specific parameters that influence the implementation of a component */
+ using Settings = ClComponentCastSettings;
+
+ /** Validate the component
+ *
+ * @param[in] properties Component properties @ref Properties
+ * @param[in,out] tensors Tensor arguments to the component
+ * @param[in] attributes Component attributes @ref Attributes
+ * @param[in] settings Component settings @ref Settings
+ *
+ * @return Status Validation results
+ *
+ * Tensor argument names:
+ * - ACL_SRC_0: Input
+ * - ACL_DST_0: Output
+ *
+ * Tensor argument constness:
+ * - ACL_SRC_0: Const
+ * - ACL_DST_0: Const
+ *
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |ACL_SRC_0 |ACL_DST_0 |
+ * |:--------------|:--------------------------------------|
+ * |U8 | S8, U16, S16, U32, S32, F16, F32 |
+ * |U16 | U8, S8, S16, U32, S32, F16, F32 |
+ * |S16 | U8, S8, U16, U32, S32, F16, F32 |
+ * |U32 | U8, S8, U16, S16, S32, F16, F32 |
+ * |S32 | U8, S8, U16, S16, U32, F16, F32 |
+ * |F16 | U8, S8, U16, S16, U32, S32, F32 |
+ * |F32 | U8, S8, U16, S16, U32, S32, F16 |
+ */
+ static Status validate(const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings);
+
+ /** Constructor
+ *
+ * Similar to @ref ClComponentCast::validate()
+ */
+ ClComponentCast(ComponentId id,
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings);
+
+ /** Destructor */
+ ~ClComponentCast() override;
+ /** Prevent instances of this class from being copy constructed */
+ ClComponentCast(const ClComponentCast &component) = delete;
+ /** Prevent instances of this class from being copied */
+ ClComponentCast &operator=(const ClComponentCast &component) = delete;
+ /** Allow instances of this class to be move constructed */
+ ClComponentCast(ClComponentCast &&component) = default;
+ /** Allow instances of this class to be moved */
+ ClComponentCast &operator=(ClComponentCast &&component) = default;
+ /** Get writer for the component */
+ const IGpuCkwComponentDriver *ckw_component_driver() const override;
+ /** Get component type */
+ GpuComponentType type() const override
+ {
+ return GpuComponentType::Simple;
+ }
+
+private:
+ std::unique_ptr<GpuCkwCast> _component_writer;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTCAST_H
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp
new file mode 100644
index 0000000000..7cd23d6115
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ClComponentDepthwiseConv2d.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDepthwiseConv2d.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+using Settings = ClComponentDepthwiseConv2dSettings;
+
+Settings &Settings::export_input_to_cl_image(bool cl_image)
+{
+ _export_input_to_cl_image = cl_image;
+ return *this;
+}
+
+bool Settings::export_input_to_cl_image() const
+{
+ return _export_input_to_cl_image;
+}
+
+Settings &Settings::export_weights_to_cl_image(bool cl_image)
+{
+ _export_weights_to_cl_image = cl_image;
+ return *this;
+}
+
+bool Settings::export_weights_to_cl_image() const
+{
+ return _export_weights_to_cl_image;
+}
+
+Settings &Settings::fast_relaxed_math(bool fast_relaxed_math)
+{
+ _fast_relaxed_math = fast_relaxed_math;
+ return *this;
+}
+
+bool Settings::fast_relaxed_math() const
+{
+ return _fast_relaxed_math;
+}
+
+Settings &Settings::is_fma_available(bool is_fma_available)
+{
+ _is_fma_available = is_fma_available;
+ return *this;
+}
+
+bool Settings::is_fma_available() const
+{
+ return _is_fma_available;
+}
+
+Settings &Settings::n0(unsigned int n0)
+{
+ _n0 = n0;
+ return *this;
+}
+
+unsigned int Settings::n0() const
+{
+ return _n0;
+}
+
+Settings &Settings::m0(unsigned int m0)
+{
+ _m0 = m0;
+ return *this;
+}
+
+unsigned int Settings::m0() const
+{
+ return _m0;
+}
+
+Status ClComponentDepthwiseConv2d::validate(const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings)
+{
+ ARM_COMPUTE_UNUSED(properties, settings);
+ const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ const auto wei = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ const auto bia = tensors.get_const_tensor(TensorType::ACL_SRC_2);
+ const auto dst = tensors.get_const_tensor(TensorType::ACL_DST_0);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, wei, dst);
+
+ // 1. Check validity
+ // Matching data type
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, wei);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ if (bia != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bia);
+ }
+
+ // Matching data layout
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, wei);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
+ if (bia != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, bia);
+ }
+
+ // All tensor infos are initialized
+ ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(wei->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
+ if (bia != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(bia->tensor_shape().total_size() == 0);
+ }
+ // Device requirements are met
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+ // wei shape is correct
+ const DataLayout data_layout = src->data_layout();
+ const size_t channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+ ARM_COMPUTE_RETURN_ERROR_ON(wei->dimension(channel_idx) !=
+ (src->dimension(channel_idx) * attributes.depth_multiplier()));
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(wei->num_dimensions() > 3, "Weights can be at most 3 dimensional");
+
+ // dst shape is correct
+ const PadStrideInfo pad_stride_info =
+ PadStrideInfo(attributes.stride().x(), attributes.stride().y(), attributes.pad().left, attributes.pad().right,
+ attributes.pad().top, attributes.pad().bottom, attributes.dimension_rounding_type());
+ const ConvolutionInfo conv_info{pad_stride_info, attributes.depth_multiplier(), ActivationLayerInfo(),
+ attributes.dilation()};
+ const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *wei, conv_info);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), output_shape);
+
+ // Check strides and dilation
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().first < 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().second < 1);
+ ARM_COMPUTE_RETURN_ERROR_ON((conv_info.dilation.x() < 1) || (conv_info.dilation.y() < 1));
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().first > 1 && settings.m0() != 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.dilation.x() > 1 && settings.m0() != 1);
+
+ if (conv_info.depth_multiplier > 1 && settings.n0() > 1)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON((conv_info.depth_multiplier % settings.n0()) != 0);
+ }
+
+ // Check export weights to cl image
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((settings.export_weights_to_cl_image() == true) &&
+ (export_to_cl_image(wei) == false),
+ "Weights cannot be exported to cl_image!");
+ ARM_COMPUTE_RETURN_ERROR_ON((settings.export_weights_to_cl_image() == true) && ((settings.n0() % 4) != 0));
+
+ ARM_COMPUTE_RETURN_ERROR_ON(wei->dimension(channel_idx) !=
+ (src->dimension(channel_idx) * conv_info.depth_multiplier));
+
+ // bia shape is correct
+ if (bia != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(bia->dimension(0) != output_shape[channel_idx],
+ "Biases size and number of dst feature maps should match");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(bia->num_dimensions() > 1, "Biases should be one dimensional");
+ }
+
+ // 2. Check support level
+ // Data type
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+ // Data layout
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC);
+ // Texture in the input tensor
+ ARM_COMPUTE_RETURN_ERROR_ON((settings.export_input_to_cl_image() == true));
+
+ return Status{};
+}
+
+ClComponentDepthwiseConv2d::ClComponentDepthwiseConv2d(ComponentId id,
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings)
+ : IGpuKernelComponent{id, properties, tensors},
+ _component_writer{std::make_unique<GpuCkwDepthwiseConv2d>(id, tensors, attributes, settings)}
+{
+ ARM_COMPUTE_UNUSED(attributes, settings);
+}
+ClComponentDepthwiseConv2d::~ClComponentDepthwiseConv2d()
+{
+}
+const IGpuCkwComponentDriver *ClComponentDepthwiseConv2d::ckw_component_driver() const
+{
+ return _component_writer.get();
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
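Several of the checks above reduce to simple arithmetic relations between tensor shapes and the settings. The sketch below restates them in isolation (illustrative only; the component expresses them via the ARM_COMPUTE_RETURN_ERROR_ON macros):

#include <cstddef>

// Illustrative restatement of the shape/settings relations checked above.
struct DepthwiseConv2dChecks
{
    // Weight channels must equal src channels times the depth multiplier.
    static bool channels_ok(std::size_t src_channels, std::size_t wei_channels, std::size_t depth_multiplier)
    {
        return wei_channels == src_channels * depth_multiplier;
    }

    // M0 must stay at 1 whenever stride_x or dilation_x exceeds 1.
    static bool m0_ok(unsigned int m0, unsigned int stride_x, unsigned int dilation_x)
    {
        return (stride_x <= 1 && dilation_x <= 1) || m0 == 1;
    }

    // With a depth multiplier > 1, N0 > 1 is only allowed if it divides the multiplier.
    static bool n0_ok(unsigned int n0, unsigned int depth_multiplier)
    {
        return depth_multiplier <= 1 || n0 <= 1 || (depth_multiplier % n0) == 0;
    }
};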
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h
new file mode 100644
index 0000000000..7526361f1c
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDEPTHWISECONV2D_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDEPTHWISECONV2D_H
+
+#include "arm_compute/core/Error.h"
+
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+/** Forward declaration */
+class ITensorInfo;
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+template <typename T>
+class ArgumentPack;
+class DepthwiseConv2dAttributes;
+
+/** Forward declaration */
+class GpuCkwDepthwiseConv2d;
+
+/** Component specific settings
+ */
+class ClComponentDepthwiseConv2dSettings
+{
+public:
+ /** Set export_input_to_cl_image flag */
+ ClComponentDepthwiseConv2dSettings &export_input_to_cl_image(bool cl_image);
+ /** Get export_input_to_cl_image flag */
+ bool export_input_to_cl_image() const;
+
+ /** Set export_weights_to_cl_image flag */
+ ClComponentDepthwiseConv2dSettings &export_weights_to_cl_image(bool cl_image);
+ /** Get export_weights_to_cl_image flag */
+ bool export_weights_to_cl_image() const;
+
+ /** Set fast_relaxed_math flag */
+ ClComponentDepthwiseConv2dSettings &fast_relaxed_math(bool fast_relaxed_math);
+ /** Get fast_relaxed_math flag */
+ bool fast_relaxed_math() const;
+
+ /** Set is_fma_available flag */
+ ClComponentDepthwiseConv2dSettings &is_fma_available(bool is_fma_available);
+ /** Get is_fma_available flag */
+ bool is_fma_available() const;
+
+ /** Set N0: number of columns processed by each thread */
+ ClComponentDepthwiseConv2dSettings &n0(unsigned int n0);
+ /** Get N0: number of columns processed by each thread */
+ unsigned int n0() const;
+
+ /** Set M0: number of rows processed by each thread */
+ ClComponentDepthwiseConv2dSettings &m0(unsigned int m0);
+ /** Get M0: number of rows processed by each thread */
+ unsigned int m0() const;
+
+private:
+ bool _export_input_to_cl_image{false}; /**< Export input to cl_image */
+ bool _export_weights_to_cl_image{false}; /**< Export the weights to cl_image */
+ bool _fast_relaxed_math{true}; /**< Enable/disable -cl-fast-relaxed-math flag */
+ bool _is_fma_available{false}; /**< Is fma instruction available */
+ unsigned int _n0{0}; /**< Number of columns processed by each thread */
+ unsigned int _m0{0}; /**< Number of rows processed by each thread */
+};
+
+/** Forward declaration */
+class ClTemplateDepthwiseConv2d;
+
+class ClComponentDepthwiseConv2d final : public IGpuKernelComponent
+{
+public:
+ /** Attributes are a set of backend-agnostic parameters that define what a component does */
+ using Attributes = DepthwiseConv2dAttributes;
+ /** Settings are a set of backend-specific parameters that influence the implementation of a component */
+ using Settings = ClComponentDepthwiseConv2dSettings;
+
+public:
+ /** Validate the component
+ *
+ * @param[in] properties Component properties @ref Properties
+ * @param[in,out] tensors Tensor arguments to the component
+ * @param[in] attributes Component attributes @ref Attributes
+ * @param[in] settings Component settings @ref Settings
+ *
+ * @return Status Validation results
+ *
+ * Tensor argument names:
+ * - ACL_SRC_0: Input
+ * - ACL_SRC_1: Weight
+ * - ACL_SRC_2: Bias (Optional)
+ * - ACL_DST_0: Output
+ *
+ * Tensor argument constness:
+ * - ACL_SRC_0: Const
+ * - ACL_SRC_1: Const
+ * - ACL_SRC_2: Const
+ * - ACL_DST_0: Const
+ *
+ * Valid data layouts:
+ * - NHWC
+ *
+ * Valid data type configurations:
+ * |ACL_SRC_0 |ACL_SRC_1 |ACL_SRC_2 |ACL_DST_0 |
+ * |:--------------|:--------------|:--------------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ */
+ static Status validate(const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings);
+
+ /** Constructor
+ *
+ * Similar to @ref ClComponentDepthwiseConv2d::validate()
+ */
+ ClComponentDepthwiseConv2d(ComponentId id,
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings);
+
+ /** Destructor */
+ ~ClComponentDepthwiseConv2d() override;
+ /** Prevent instances of this class from being copy constructed */
+ ClComponentDepthwiseConv2d(const ClComponentDepthwiseConv2d &component) = delete;
+ /** Prevent instances of this class from being copied */
+ ClComponentDepthwiseConv2d &operator=(const ClComponentDepthwiseConv2d &component) = delete;
+ /** Allow instances of this class to be move constructed */
+ ClComponentDepthwiseConv2d(ClComponentDepthwiseConv2d &&component) = default;
+ /** Allow instances of this class to be moved */
+ ClComponentDepthwiseConv2d &operator=(ClComponentDepthwiseConv2d &&component) = default;
+ /** Get writer for the component */
+ const IGpuCkwComponentDriver *ckw_component_driver() const override;
+ /** Get component type */
+ GpuComponentType type() const override
+ {
+ return GpuComponentType::Complex;
+ }
+
+private:
+ std::unique_ptr<GpuCkwDepthwiseConv2d> _component_writer;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDEPTHWISECONV2D_H
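ClComponentDepthwiseConv2dSettings follows the same fluent-setter pattern as KernelProperties. A minimal configuration sketch (not part of this patch; the values are illustrative rather than tuned defaults):

#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h"

using arm_compute::experimental::dynamic_fusion::ClComponentDepthwiseConv2dSettings;

ClComponentDepthwiseConv2dSettings make_depthwise_settings_sketch()
{
    // Chain the setters declared above; unset fields keep their defaults.
    return ClComponentDepthwiseConv2dSettings()
        .export_weights_to_cl_image(false)
        .is_fma_available(true)
        .n0(4U)
        .m0(1U);
}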
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp
new file mode 100644
index 0000000000..783a17df30
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ClComponentDirectConv2d.h"
+
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+bool ClComponentDirectConv2dSettings::export_to_cl_image() const
+{
+ return _desc.export_weights_to_cl_image;
+}
+
+ClComponentDirectConv2dSettings &ClComponentDirectConv2dSettings::fast_relaxed_math(bool fast_relaxed_math)
+{
+ _fast_relaxed_math = fast_relaxed_math;
+ return *this;
+}
+
+bool ClComponentDirectConv2dSettings::fast_relaxed_math() const
+{
+ return _fast_relaxed_math;
+}
+
+ClComponentDirectConv2dSettings &
+ClComponentDirectConv2dSettings::direct_conv_descriptor(const DirectConvComputeKernelInfo &desc)
+{
+ _desc = desc;
+ return *this;
+}
+
+DirectConvComputeKernelInfo ClComponentDirectConv2dSettings::direct_conv_descriptor() const
+{
+ return _desc;
+}
+
+Status ClComponentDirectConv2d::validate(const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings)
+{
+ ARM_COMPUTE_UNUSED(properties);
+ const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ const auto wei = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ const auto bia = tensors.get_const_tensor(TensorType::ACL_SRC_2);
+ const auto dst = tensors.get_const_tensor(TensorType::ACL_DST_0);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, wei, dst);
+
+ // 1. Check validity
+ // Matching data type
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, wei);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ if (bia != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bia);
+ }
+
+ // Matching data layout
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, wei);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
+ if (bia != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, bia);
+ }
+
+ // All tensor infos are initialized
+ ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(wei->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
+ if (bia != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(bia->tensor_shape().total_size() == 0);
+ }
+ // Device requirements are met
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+ // wei shape is correct
+ const DataLayout data_layout = src->data_layout();
+ const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(wei->dimension(channel_idx) != src->dimension(channel_idx),
+ "Weights feature map dimension should match the respective src's one");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(wei->num_dimensions() > 4, "Weights can be at most 4 dimensional");
+
+ // dst shape is correct
+ PadStrideInfo legacy_pad_stride(attributes.stride().x(), attributes.stride().y(), attributes.pad().left,
+ attributes.pad().right, attributes.pad().top, attributes.pad().bottom,
+ DimensionRoundingType{});
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+ dst->tensor_shape(), misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, legacy_pad_stride));
+
+ // bia shape is correct
+ if (bia != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(bia->dimension(0) != wei->dimension(3),
+ "Biases size and number of dst feature maps should match");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(bia->num_dimensions() > 1, "Biases should be one dimensional");
+ }
+
+ // 2. Check support level
+ // Data type
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+ // Data layout
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC);
+
+ const auto desc = settings.direct_conv_descriptor();
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.n0 != 1 && desc.n0 != 2 && desc.n0 != 3 && desc.n0 != 4 && desc.n0 != 8 &&
+ desc.n0 != 16,
+ "N0 can only be: 1, 2, 3, 4, 8, and 16");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 1 && desc.k0 != 2 && desc.k0 != 3 && desc.k0 != 4 && desc.k0 != 8 &&
+ desc.k0 != 16,
+ "K0 can only be: 1, 2, 3, 4, 8, and 16");
+ return Status{};
+}
+
+ClComponentDirectConv2d::ClComponentDirectConv2d(ComponentId id,
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings)
+ : IGpuKernelComponent{id, properties, tensors},
+ _component_writer{std::make_unique<GpuCkwDirectConv2d>(id, tensors, attributes, settings)}
+{
+}
+
+ClComponentDirectConv2d::~ClComponentDirectConv2d()
+{
+}
+
+const IGpuCkwComponentDriver *ClComponentDirectConv2d::ckw_component_driver() const
+{
+ return _component_writer.get();
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
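The N0/K0 constraint at the end of the validation above is membership in {1, 2, 3, 4, 8, 16}, i.e. either 3 or a power of two no greater than 16; the MatMul component later encodes the same set with the bit trick ((v & (v - 1)) && (v != 3)). A standalone sketch of the predicate:

// Illustrative check: the block sizes accepted above are exactly {1, 2, 3, 4, 8, 16}.
inline bool direct_conv_block_size_ok(int v)
{
    if (v < 1 || v > 16)
    {
        return false;
    }
    const bool is_power_of_two = (v & (v - 1)) == 0;
    return is_power_of_two || v == 3;
}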
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h
new file mode 100644
index 0000000000..c50b0fa0ce
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDIRECTCONV2D_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDIRECTCONV2D_H
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/KernelDescriptors.h"
+
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+/** Forward declaration */
+class ITensorInfo;
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+template <typename T>
+class ArgumentPack;
+class Conv2dAttributes;
+
+/** Component specific settings
+ */
+class ClComponentDirectConv2dSettings
+{
+public:
+ /** Get export_to_cl_image flag */
+ bool export_to_cl_image() const;
+
+ /** Set fast_relaxed_math flag */
+ ClComponentDirectConv2dSettings &fast_relaxed_math(bool fast_relaxed_math);
+ /** Get fast_relaxed_math flag */
+ bool fast_relaxed_math() const;
+
+ /** Set direct convolution descriptor */
+ ClComponentDirectConv2dSettings &direct_conv_descriptor(const DirectConvComputeKernelInfo &desc);
+ /** Get direct convolution descriptor */
+ DirectConvComputeKernelInfo direct_conv_descriptor() const;
+
+private:
+ bool _fast_relaxed_math{true};
+ DirectConvComputeKernelInfo _desc{}; // Direct convolution descriptor
+};
+
+/** Forward declaration */
+class GpuCkwDirectConv2d;
+
+class ClComponentDirectConv2d final : public IGpuKernelComponent
+{
+public:
+ /** Attributes are a set of backend-agnostic parameters that define what a component does */
+ using Attributes = Conv2dAttributes;
+ /** Settings are a set of backend-specific parameters that influence the implementation of a component */
+ using Settings = ClComponentDirectConv2dSettings;
+
+public:
+ /** Validate the component
+ *
+ * @param[in] properties Component properties
+ * @param[in,out] tensors Tensor arguments to the component
+ * @param[in] attributes Component attributes
+ * @param[in] settings Component settings
+ *
+ * @return Status Validation results
+ *
+ * Tensor argument names:
+ * - ACL_SRC_0: Input
+ * - ACL_SRC_1: Weight
+ * - ACL_SRC_2: Bias (Optional)
+ * - ACL_DST_0: Output
+ *
+ * Tensor argument constness:
+ * - ACL_SRC_0: Const
+ * - ACL_SRC_1: Const
+ * - ACL_SRC_2: Const
+ * - ACL_DST_0: Const
+ *
+ * Valid data layouts:
+ * - NHWC
+ *
+ * Valid data type configurations:
+ * |ACL_SRC_0 |ACL_SRC_1 |ACL_SRC_2 |ACL_DST_0 |
+ * |:--------------|:--------------|:--------------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ */
+ static Status validate(const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings);
+
+ /** Constructor
+ *
+ * Similar to @ref ClComponentDirectConv2d::validate()
+ */
+ ClComponentDirectConv2d(ComponentId id,
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings);
+
+ /** Destructor */
+ ~ClComponentDirectConv2d() override;
+ /** Prevent instances of this class from being copy constructed */
+ ClComponentDirectConv2d(const ClComponentDirectConv2d &component) = delete;
+ /** Prevent instances of this class from being copied */
+ ClComponentDirectConv2d &operator=(const ClComponentDirectConv2d &component) = delete;
+ /** Allow instances of this class to be move constructed */
+ ClComponentDirectConv2d(ClComponentDirectConv2d &&component) = default;
+ /** Allow instances of this class to be moved */
+ ClComponentDirectConv2d &operator=(ClComponentDirectConv2d &&component) = default;
+ /** Get writer for the component */
+ const IGpuCkwComponentDriver *ckw_component_driver() const override;
+ /** Get component type */
+ GpuComponentType type() const override
+ {
+ return GpuComponentType::Complex;
+ }
+
+private:
+ std::unique_ptr<GpuCkwDirectConv2d> _component_writer;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDIRECTCONV2D_H
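A minimal sketch of configuring ClComponentDirectConv2dSettings (not part of this patch). Only the DirectConvComputeKernelInfo fields referenced by the validation above are set, and the values are illustrative:

#include "arm_compute/core/KernelDescriptors.h"

#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"

using namespace arm_compute;
using namespace arm_compute::experimental::dynamic_fusion;

ClComponentDirectConv2dSettings make_direct_conv_settings_sketch()
{
    DirectConvComputeKernelInfo desc{};
    desc.n0                         = 4; // must be one of 1, 2, 3, 4, 8, 16
    desc.k0                         = 4; // must be one of 1, 2, 3, 4, 8, 16
    desc.export_weights_to_cl_image = false;

    return ClComponentDirectConv2dSettings().fast_relaxed_math(true).direct_conv_descriptor(desc);
}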
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp
new file mode 100644
index 0000000000..209c73dbee
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ClComponentElementwiseBinary.h"
+
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+std::set<ElementwiseBinaryCommonAttributes::ElementwiseOp> supported_ops{
+ ElementwiseBinaryCommonAttributes::ElementwiseOp::Add, ElementwiseBinaryCommonAttributes::ElementwiseOp::Sub,
+ ElementwiseBinaryCommonAttributes::ElementwiseOp::Mul};
+}
+
+Status ClComponentElementwiseBinary::validate(const ArgumentPack<ITensorInfo> &tensors,
+ const ElementwiseBinaryCommonAttributes &attributes)
+{
+ const auto lhs = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ const auto rhs = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ const auto dst = tensors.get_const_tensor(TensorType::ACL_DST_0);
+
+ // Check operator type
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(supported_ops.find(attributes.operation()) == supported_ops.end(),
+ "Provided Elementwise operation not supported.");
+
+ // Check validity
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst);
+
+ // Check data type for different elementwise operators
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16, DataType::S32,
+ DataType::S16, DataType::U8);
+
+ // dst shape is correct
+ const TensorShape out_shape = TensorShape::broadcast_shape(lhs->tensor_shape(), rhs->tensor_shape());
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0),
+ "Wrong shape for dst.");
+
+ const auto &lhs_shape = lhs->tensor_shape();
+ const auto &rhs_shape = rhs->tensor_shape();
+ const auto &dst_shape = dst->tensor_shape();
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(lhs_shape, dst_shape, 0) &&
+ detail::have_different_dimensions(rhs_shape, dst_shape, 0),
+ "Only LHS or RHS can be broadcasting, not both.");
+
+ // Dimensions Y and Z are collapsed together in the current kernel implementation,
+ // hence they cannot be independently broadcast or non-broadcast.
+ // See: ClTemplateElementwiseBinary::get_window
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((lhs_shape[1] != dst_shape[1] || rhs_shape[1] != dst_shape[1]) !=
+ (lhs_shape[2] != dst_shape[2] || rhs_shape[2] != dst_shape[2]),
+ "Dimension Y and Z must both be either broadcast or non-broadcast.");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(lhs_shape, dst_shape, 3),
+ "LHS broadcast in dimension 3 or higher is not supported.");
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(rhs_shape, dst_shape, 3),
+ "RHS broadcast in dimension 3 or higher is not supported.");
+
+ // Matching data type
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst);
+
+ // Matching data layout
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(lhs, rhs);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(lhs, dst);
+
+ // All tensor infos are initialized
+ ARM_COMPUTE_RETURN_ERROR_ON(lhs->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(rhs->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
+
+ // Device requirements are met
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(lhs);
+
+ return Status{};
+}
+
+ClComponentElementwiseBinary::~ClComponentElementwiseBinary()
+{
+}
+ClComponentElementwiseBinary::ClComponentElementwiseBinary(ComponentId id,
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes)
+ : IGpuKernelComponent{id, properties, tensors},
+ _component_writer{std::make_unique<GpuCkwElementwiseBinary>(id, tensors, attributes)}
+{
+}
+
+const IGpuCkwComponentDriver *ClComponentElementwiseBinary::ckw_component_driver() const
+{
+ return _component_writer.get();
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
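The least obvious check above is the Y/Z constraint: because the kernel collapses dimensions 1 and 2 into a single window, either both must be broadcast or neither may be. A standalone restatement over raw dimension sizes (illustrative only):

#include <cstddef>

// Illustrative restatement of the collapsed Y/Z broadcast rule checked above.
// A dimension counts as broadcast if either input differs from dst in that dimension.
inline bool yz_broadcast_rule_ok(const std::size_t lhs[3], const std::size_t rhs[3], const std::size_t dst[3])
{
    const bool y_broadcast = (lhs[1] != dst[1]) || (rhs[1] != dst[1]);
    const bool z_broadcast = (lhs[2] != dst[2]) || (rhs[2] != dst[2]);
    return y_broadcast == z_broadcast; // both or neither, never just one
}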
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h
new file mode 100644
index 0000000000..a4395a6219
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTELEMENTWISEBINARY_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTELEMENTWISEBINARY_H
+
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+#include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h"
+
+namespace arm_compute
+{
+/** Forward declaration */
+class ITensorInfo;
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+template <typename T>
+class ArgumentPack;
+
+/** Forward declaration */
+class GpuCkwElementwiseBinary;
+
+class ClComponentElementwiseBinary final : public IGpuKernelComponent
+{
+public:
+ /** Attributes are a set of backend-agnostic parameters that define what a component does */
+ using Attributes = ElementwiseBinaryCommonAttributes;
+
+public:
+ /** Validate the component
+ *
+ * @param[in,out] tensors Tensor arguments to the component
+ * @param[in] attributes Component attributes
+ *
+ * @return Status Validation results
+ *
+ * Tensor argument names:
+ * - ACL_SRC_0: lhs
+ * - ACL_SRC_1: rhs
+ * - ACL_DST_0: dst
+ *
+ * Tensor argument constness:
+ * - ACL_SRC_0: Const
+ * - ACL_SRC_1: Const
+ * - ACL_DST_0: Const
+ *
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations (DIV supports F32/F16/S32; POWER supports only F32/F16):
+ * |ACL_SRC_0 |ACL_SRC_1 |ACL_DST_0 |
+ * |:--------------|:--------------|:--------------|
+ * |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |
+ * |S32 |S32 |S32 |
+ * |S16 |S16 |S16 |
+ * |U8 |U8 |U8 |
+ */
+ static Status validate(const ArgumentPack<ITensorInfo> &tensors,
+ const ElementwiseBinaryCommonAttributes &attributes);
+
+ /** Constructor
+ *
+ * Similar to @ref ClComponentElementwiseBinary::validate()
+ */
+ ClComponentElementwiseBinary(ComponentId id,
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes);
+
+ /** Destructor */
+ ~ClComponentElementwiseBinary() override;
+ /** Prevent instances of this class from being copy constructed */
+ ClComponentElementwiseBinary(const ClComponentElementwiseBinary &component) = delete;
+ /** Prevent instances of this class from being copied */
+ ClComponentElementwiseBinary &operator=(const ClComponentElementwiseBinary &component) = delete;
+ /** Allow instances of this class to be move constructed */
+ ClComponentElementwiseBinary(ClComponentElementwiseBinary &&component) = default;
+ /** Allow instances of this class to be moved */
+ ClComponentElementwiseBinary &operator=(ClComponentElementwiseBinary &&component) = default;
+ /** Get writer for the component */
+ const IGpuCkwComponentDriver *ckw_component_driver() const override;
+ /** Get component type */
+ GpuComponentType type() const override
+ {
+ return GpuComponentType::Simple;
+ }
+
+private:
+ std::unique_ptr<GpuCkwElementwiseBinary> _component_writer;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTELEMENTWISEBINARY_H
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.cpp
new file mode 100644
index 0000000000..53ac8da41f
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.cpp
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.h"
+
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/dynamic_fusion/sketch/attributes/MatMulAttributes.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwMatMul.h"
+#include "src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+using Attributes = MatMulAttributes;
+using Settings = GpuMatMulSettings;
+
+Status validate_matmul_kernel_info(Attributes attributes, Settings settings)
+{
+ const bool adj_lhs = attributes.adj_lhs();
+ const bool adj_rhs = attributes.adj_rhs();
+ const int m0 = settings.m0();
+ const int n0 = settings.n0();
+ const int k0 = settings.k0();
+
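+    // Note: "(x & (x - 1)) == 0" is the usual power-of-two test, so the block-size checks
+    // below accept values in {1, 2, 4, 8, 16}, with 3 allowed as an explicit exception.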
+ // Validate M0
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(m0 < 1, "Only positive integers are supported for M0");
+
+ if (adj_lhs)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((m0 & (m0 - 1)) && (m0 != 3)) || (m0 > 16),
+ "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed");
+ }
+
+ // Validate N0
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(n0 < 1, "Only positive integers are supported for N0");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((n0 & (n0 - 1)) && (n0 != 3)) || (n0 > 16),
+ "Only 1,2,3,4,8,16 are supported for N0");
+
+ // Validate K0
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(k0 < 1, "Only positive integers are supported for K0");
+ if (!adj_lhs || adj_rhs)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(((k0 & (k0 - 1)) && (k0 != 3)) || (k0 > 16),
+ "Only 1,2,3,4,8,16 are supported for K0");
+ }
+
+ return Status{};
+}
+
+} // namespace
+
+Status ClComponentMatMul::validate(const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings)
+{
+ ARM_COMPUTE_UNUSED(properties);
+ ARM_COMPUTE_UNUSED(attributes);
+
+ const auto lhs = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ const auto rhs = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ const auto dst = tensors.get_const_tensor(TensorType::ACL_DST_0);
+
+ // Currently, the only supported case is when adj_lhs = false and adj_rhs = true
+    ARM_COMPUTE_RETURN_ERROR_ON((attributes.adj_lhs() != false) || (attributes.adj_rhs() != true));
+
+    // Check that the data types match
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst);
+
+ // Data type
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32);
+
+ // All tensor infos are initialized
+ ARM_COMPUTE_RETURN_ERROR_ON(lhs->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(rhs->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
+
+ // Device requirements are met
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(lhs);
+
+ // Check if block sizes are supported
+ MatMulKernelInfo matmul_kernel_info =
+ MatMulKernelInfo(attributes.adj_lhs(), attributes.adj_rhs(), settings.m0(), settings.n0(), settings.k0());
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_kernel_info(attributes, settings));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ opencl::kernels::validate_matmul_input_shapes(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info));
+
+ // Check if dst shape is correct
+ const auto expected_dst_shape =
+ misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), expected_dst_shape);
+
+ return Status{};
+}
+
+ClComponentMatMul::ClComponentMatMul(ComponentId id,
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings)
+ : IGpuKernelComponent{id, properties, tensors},
+ _component_writer{std::make_unique<GpuCkwMatMul>(id, tensors, attributes, settings)}
+{
+}
+
+ClComponentMatMul::~ClComponentMatMul()
+{
+}
+
+const IGpuCkwComponentDriver *ClComponentMatMul::ckw_component_driver() const
+{
+ return _component_writer.get();
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.h
new file mode 100644
index 0000000000..41833e4adb
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTMATMUL_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTMATMUL_H
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMatMul.h"
+
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+/** Forward declaration */
+class ITensorInfo;
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+template <typename T>
+class ArgumentPack;
+class MatMulAttributes;
+class GpuCkwMatMul;
+
+class ClComponentMatMul final : public IGpuKernelComponent
+{
+public:
+ /** Attributes are a set of backend-agnostic parameters that define what a component does */
+ using Attributes = MatMulAttributes;
+ /** Settings are a set of backend-specific parameters that influence the implementation of a component */
+ using Settings = GpuMatMulSettings;
+
+ /** Validate the component
+ *
+ * @param[in] properties Component properties
+ * @param[in,out] tensors Tensor arguments to the component
+ * @param[in] attributes Component attributes
+ * @param[in] settings Component settings
+ *
+ * @return Status Validation results
+ *
+ * Tensor argument names:
+ * - ACL_SRC_0: LHS
+ * - ACL_SRC_1: RHS
+ * - ACL_DST_0: Output
+ *
+ * Tensor argument constness:
+ * - ACL_SRC_0: Const
+ * - ACL_SRC_1: Const
+ * - ACL_DST_0: Const
+ *
+ * Valid data layouts:
+ * - NHWC
+ *
+ * Valid data type configurations:
+ * |ACL_SRC_0 |ACL_SRC_1 |ACL_DST_0 |
+ * |:--------------|:--------------|:--------------|
+ * |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |
+ */
+ static Status validate(const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings);
+
+ /** Constructor
+ *
+ * Similar to @ref ClComponentMatMul::validate()
+ */
+ ClComponentMatMul(ComponentId id,
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings);
+ /** Destructor */
+ ~ClComponentMatMul() override;
+ /** Prevent instances of this class from being copy constructed */
+ ClComponentMatMul(const ClComponentMatMul &component) = delete;
+ /** Prevent instances of this class from being copied */
+ ClComponentMatMul &operator=(const ClComponentMatMul &component) = delete;
+ /** Allow instances of this class to be move constructed */
+ ClComponentMatMul(ClComponentMatMul &&component) = default;
+ /** Allow instances of this class to be moved */
+ ClComponentMatMul &operator=(ClComponentMatMul &&component) = default;
+ /** Get writer for the component */
+ const IGpuCkwComponentDriver *ckw_component_driver() const override;
+ /** Get component type */
+ GpuComponentType type() const override
+ {
+ return GpuComponentType::Complex;
+ }
+
+private:
+ std::unique_ptr<GpuCkwMatMul> _component_writer;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTMATMUL_H
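As with the elementwise component, validation takes an ArgumentPack plus an Attributes/Settings pair. A hedged configuration sketch that satisfies the constraints enforced in ClComponentMatMul.cpp above (adj_lhs = false, adj_rhs = true, block sizes from {1, 2, 3, 4, 8, 16}); the fluent setters on MatMulAttributes and GpuMatMulSettings are assumed from their headers, which are outside this excerpt:

    // Hypothetical MatMul configuration: lhs not transposed, rhs transposed.
    const MatMulAttributes  attributes = MatMulAttributes().adj_lhs(false).adj_rhs(true);
    const GpuMatMulSettings settings   = GpuMatMulSettings().m0(4).n0(4).k0(4);

    const auto properties =
        IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});

    // lhs/rhs/dst tensor infos are packed as ACL_SRC_0 / ACL_SRC_1 / ACL_DST_0 exactly as in
    // the elementwise example above, and then passed to:
    //   ClComponentMatMul::validate(properties, tensors, attributes, settings);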
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp
new file mode 100644
index 0000000000..6e7243dc04
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ClComponentPool2d.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h"
+#include "src/dynamic_fusion/utils/Utils.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+Status ClComponentPool2d::validate(const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings)
+{
+ ARM_COMPUTE_UNUSED(properties, settings);
+ const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ const auto dst = tensors.get_const_tensor(TensorType::ACL_DST_0);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        (attributes.pool_type() != PoolingType::AVG && attributes.pool_type() != PoolingType::MAX),
+        "Unsupported Pooling type");
+
+ // 1. Check validity
+ // Check if pooling is valid
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ is_pool_region_entirely_outside_input(convert_pool_attr_to_pool_info(attributes, true)),
+ "Pooling region that is entirely outside input tensor is unsupported");
+
+ // Matching data type
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+
+ // Matching data layout
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
+
+ // All tensor infos are initialized
+ ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
+
+ // Device requirements are met
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+ dst->tensor_shape(),
+ misc::shape_calculator::compute_pool_shape(*src, convert_pool_attr_to_pool_info(attributes, true)));
+
+ // 2. Check support level
+ // Data type
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+
+ // Data layout
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC);
+
+ return Status{};
+}
+
+ClComponentPool2d::ClComponentPool2d(ComponentId id,
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings)
+ : IGpuKernelComponent{id, properties, tensors},
+ _component_writer{std::make_unique<GpuCkwPool2d>(id, tensors, attributes, settings)}
+{
+}
+ClComponentPool2d::~ClComponentPool2d()
+{
+}
+const IGpuCkwComponentDriver *ClComponentPool2d::ckw_component_driver() const
+{
+ return _component_writer.get();
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h
new file mode 100644
index 0000000000..d33e601f18
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTPOOL2D_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTPOOL2D_H
+
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h"
+
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+
+namespace arm_compute
+{
+/** Forward declaration */
+class ITensorInfo;
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+template <typename T>
+class ArgumentPack;
+class Pool2dAttributes;
+
+/** Forward declaration */
+class GpuCkwPool2d;
+
+class ClComponentPool2d final : public IGpuKernelComponent
+{
+public:
+ /** Attributes are a set of backend-agnostic parameters that define what a component does */
+ using Attributes = Pool2dAttributes;
+ /** Settings are a set of backend-specific parameters that influence the implementation of a component */
+ using Settings = GpuPool2dSettings;
+
+public:
+ /** Validate the component
+ *
+ * @param[in] properties Component properties
+ * @param[in,out] tensors Tensor arguments to the component
+ * @param[in] attributes Component attributes
+ * @param[in] settings Component settings
+ *
+ * @return Status Validation results
+ *
+ * Tensor argument names:
+ * - ACL_SRC_0: Input
+ * - ACL_DST_0: Output
+ *
+ * Tensor argument constness:
+ * - ACL_SRC_0: Const
+ * - ACL_DST_0: Const
+ *
+ * Valid data layouts:
+ * - NHWC
+ *
+ * Valid data type configurations:
+ * |ACL_SRC_0 |ACL_DST_0 |
+ * |:--------------|:--------------|
+ * |F16 |F16 |
+ * |F32 |F32 |
+ */
+ static Status validate(const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings);
+
+ /** Constructor
+ *
+ * @param[in] id Unique Component Identifier within a workload
+ * @param[in] properties Component properties
+ * @param[in,out] tensors Tensor arguments to the component
+ * @param[in] attributes Component attributes
+ * @param[in] settings Component settings
+ */
+ ClComponentPool2d(ComponentId id,
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings);
+
+ /** Destructor */
+ ~ClComponentPool2d() override;
+
+ /** Prevent instances of this class from being copy constructed */
+ ClComponentPool2d(const ClComponentPool2d &component) = delete;
+
+ /** Prevent instances of this class from being copied */
+ ClComponentPool2d &operator=(const ClComponentPool2d &component) = delete;
+
+ /** Allow instances of this class to be move constructed */
+ ClComponentPool2d(ClComponentPool2d &&component) = default;
+
+ /** Allow instances of this class to be moved */
+ ClComponentPool2d &operator=(ClComponentPool2d &&component) = default;
+
+ /** Get GPU kernel writer for the component */
+ const IGpuCkwComponentDriver *ckw_component_driver() const override;
+
+ /** Get component type */
+ GpuComponentType type() const override
+ {
+ return GpuComponentType::Complex;
+ }
+
+private:
+ std::unique_ptr<GpuCkwPool2d> _component_writer;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTPOOL2D_H
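A short sketch of how the pooling attributes documented above might be assembled for validation; the Pool2dAttributes setters and the Size2D/Padding2D helpers are assumed from headers outside this excerpt, and the values are purely illustrative:

    // Hypothetical 3x3 MAX pooling, stride 2, 1-pixel padding, on NHWC F32 tensors.
    const Pool2dAttributes attributes = Pool2dAttributes()
                                            .pool_type(PoolingType::MAX)
                                            .pool_size(Size2D(3, 3))
                                            .stride(Size2D(2, 2))
                                            .pad(Padding2D(1, 1, 1, 1));

    const auto properties =
        IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
    const auto settings = ClComponentPool2d::Settings();

    // src/dst tensor infos are packed as ACL_SRC_0 / ACL_DST_0 and passed to
    //   ClComponentPool2d::validate(properties, tensors, attributes, settings);
    // note that the dst shape must equal compute_pool_shape() for the same attributes.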
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.cpp
new file mode 100644
index 0000000000..dce85c424e
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.cpp
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ClComponentReshape.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/CL/CLValidate.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+Status ClComponentReshape::validate(const ArgumentPack<ITensorInfo> &tensors)
+{
+ const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ const auto dst = tensors.get_const_tensor(TensorType::ACL_DST_0);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() != dst->tensor_shape().total_size());
+
+ return Status{};
+}
+
+ClComponentReshape::ClComponentReshape(ComponentId id,
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors)
+ : IGpuKernelComponent{id, properties, tensors}
+{
+}
+ClComponentReshape::~ClComponentReshape()
+{
+}
+const IGpuCkwComponentDriver *ClComponentReshape::ckw_component_driver() const
+{
+ /* NOT IMPLEMENTED */
+ return nullptr;
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h
new file mode 100644
index 0000000000..fd0f966da1
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTRESHAPE_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTRESHAPE_H
+
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+
+namespace arm_compute
+{
+/** Forward declaration */
+class ITensorInfo;
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+template <typename T>
+class ArgumentPack;
+
+/** Forward declaration */
+class ClTemplateReshape;
+
+class ClComponentReshape final : public IGpuKernelComponent
+{
+public:
+ /** Validate the component
+ *
+ * @param[in,out] tensors Tensor arguments to the component
+ *
+ * @return Status Validation results
+ *
+ * Tensor argument names:
+ * - ACL_SRC_0: src
+ * - ACL_DST_0: dst
+ *
+ * Tensor argument constness:
+ * - ACL_SRC_0: Const
+ * - ACL_DST_0: Const
+ *
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * - All
+ */
+ static Status validate(const ArgumentPack<ITensorInfo> &tensors);
+
+ /** Constructor
+ *
+ * @param[in] id Component id
+ * @param[in] properties Component properties @ref Properties
+ * @param[in] tensors Tensor arguments to the component
+ */
+ ClComponentReshape(ComponentId id, const Properties &properties, const ArgumentPack<ITensorInfo> &tensors);
+
+ /** Destructor */
+ ~ClComponentReshape() override;
+ /** Prevent instances of this class from being copy constructed */
+ ClComponentReshape(const ClComponentReshape &component) = delete;
+ /** Prevent instances of this class from being copied */
+ ClComponentReshape &operator=(const ClComponentReshape &component) = delete;
+ /** Allow instances of this class to be move constructed */
+ ClComponentReshape(ClComponentReshape &&component) = default;
+ /** Allow instances of this class to be moved */
+ ClComponentReshape &operator=(ClComponentReshape &&component) = default;
+ /** Get writer for the component */
+ const IGpuCkwComponentDriver *ckw_component_driver() const override;
+ /** Get component type */
+ GpuComponentType type() const override
+ {
+ return GpuComponentType::Complex;
+ }
+
+private:
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTRESHAPE_H
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.cpp
new file mode 100644
index 0000000000..411eeca802
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.cpp
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ClComponentResize.h"
+
+#include "arm_compute/core/Error.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/utils/ScaleUtils.h"
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+class GpuCkwResize;
+
+Status ClComponentResize::validate(const IGpuKernelComponent::Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const ClComponentResize::Attributes &attributes)
+{
+ ARM_COMPUTE_UNUSED(properties);
+
+ const ITensorInfo *src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ const ITensorInfo *dst = tensors.get_const_tensor(TensorType::ACL_DST_0);
+
+ // Mismatching data types and quantization info
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
+
+ // Device requirements met
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+
+ // Align corners and sampling policy conformance
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ attributes.align_corners() &&
+ !arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(attributes.sampling_policy()));
+
+ // All tensor infos are initialized
+ ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
+
+ return Status();
+}
+
+ClComponentResize::ClComponentResize(ComponentId id,
+ const IGpuKernelComponent::Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const ClComponentResize::Attributes &attributes)
+ : IGpuKernelComponent{id, properties, tensors},
+ _component_writer{std::make_unique<GpuCkwResize>(id, tensors, attributes)}
+{
+}
+
+ClComponentResize::~ClComponentResize()
+{
+}
+
+const IGpuCkwComponentDriver *ClComponentResize::ckw_component_driver() const
+{
+ return _component_writer.get();
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h
new file mode 100644
index 0000000000..9a1169c45f
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTRESIZE_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTRESIZE_H
+
+#include "arm_compute/dynamic_fusion/sketch/attributes/ResizeAttributes.h"
+
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+
+namespace arm_compute
+{
+/** Forward declaration */
+class ITensorInfo;
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+template <typename T>
+class ArgumentPack;
+
+/** Forward declaration */
+class GpuCkwResize;
+
+class ClComponentResize final : public IGpuKernelComponent
+{
+public:
+ /** Attributes are a set of backend-agnostic parameters that define what a component does */
+ using Attributes = ResizeAttributes;
+
+ /** Validate the component
+ *
+ * @param[in] properties Component properties @ref Properties
+ * @param[in,out] tensors Tensor arguments to the component
+ * @param[in] attributes Component attributes @ref Attributes
+ *
+ * @return Status Validation results
+ *
+ * Tensor argument names:
+ * - ACL_SRC_0: Input
+ * - ACL_DST_0: Output
+ *
+ * Tensor argument constness:
+ * - ACL_SRC_0: Const
+ * - ACL_DST_0: Const
+ *
+ * Valid data layouts:
+ * - NHWC
+ *
+     * Valid data type configurations:
+ * |ACL_SRC_0 |ACL_DST_0 |
+ * |:--------------|:--------------|
+ * |QASYMM8 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |
+ * |F16 |F16 |
+ * |F32 |F32 |
+ * |U8 |U8 |
+ * |S16 |S16 |
+ */
+ static Status
+ validate(const Properties &properties, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes);
+
+ /** Constructor
+ *
+ * Similar to @ref ClComponentResize::validate()
+ */
+ ClComponentResize(ComponentId id,
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes);
+
+ /** Destructor */
+ ~ClComponentResize() override;
+
+ /** Prevent instances of this class from being copy constructed */
+ ClComponentResize(const ClComponentResize &component) = delete;
+
+ /** Prevent instances of this class from being copied */
+ ClComponentResize &operator=(const ClComponentResize &component) = delete;
+
+ /** Allow instances of this class to be move constructed */
+ ClComponentResize(ClComponentResize &&component) = default;
+
+ /** Allow instances of this class to be moved */
+ ClComponentResize &operator=(ClComponentResize &&component) = default;
+
+ /** Get writer for the component */
+ const IGpuCkwComponentDriver *ckw_component_driver() const override;
+
+ /** Get component type */
+ GpuComponentType type() const override
+ {
+ return GpuComponentType::Complex;
+ }
+
+private:
+ std::unique_ptr<GpuCkwResize> _component_writer;
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTRESIZE_H
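The resize component mostly validates the attribute combination, in particular that align_corners is only paired with a sampling policy accepted by scale_utils::is_align_corners_allowed_sampling_policy() (see ClComponentResize.cpp above). A hedged sketch, with the ResizeAttributes setters assumed from its header:

    // Hypothetical bilinear resize to 128x128.
    ResizeAttributes attributes{};
    attributes.output_width(128)
        .output_height(128)
        .interpolation_policy(InterpolationPolicy::BILINEAR)
        .sampling_policy(SamplingPolicy::TOP_LEFT)
        .align_corners(false);

    // src/dst are packed as ACL_SRC_0 / ACL_DST_0, then:
    //   ClComponentResize::validate(properties, tensors, attributes);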
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp
new file mode 100644
index 0000000000..3db6c5cd2d
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ClComponentStore.h"
+
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+Status ClComponentStore::validate(const Properties &properties, const ArgumentPack<ITensorInfo> &tensors)
+{
+ ARM_COMPUTE_UNUSED(properties, tensors);
+ return Status{};
+}
+ClComponentStore::ClComponentStore(ComponentId id,
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors)
+ : IGpuKernelComponent{id, properties, tensors}, _component_writer{std::make_unique<GpuCkwStore>(id, tensors)}
+{
+}
+ClComponentStore::~ClComponentStore()
+{
+}
+const IGpuCkwComponentDriver *ClComponentStore::ckw_component_driver() const
+{
+ return _component_writer.get();
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h
new file mode 100644
index 0000000000..2c1dd0f6fc
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTSTORE_H
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTSTORE_H
+
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+/** Forward declaration */
+class ITensorInfo;
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+template <typename T>
+class ArgumentPack;
+class GpuCkwStore;
+
+class ClComponentStore final : public IGpuKernelComponent
+{
+public:
+ /** Validate the component
+ *
+ * @param[in] properties Component properties
+     * @param[in] tensors    Tensor arguments to the component
+ *
+ * @return Status Validation results
+ *
+ * Tensor argument names:
+ * - ACL_SRC_0: Input
+ * - ACL_DST_0: Output
+ *
+ * Tensor argument constness:
+ * - ACL_SRC_0: Const
+ * - ACL_DST_0: Const
+ *
+ * Valid data layouts:
+ * - NHWC
+ *
+ * Valid data type configurations:
+ * |ACL_SRC_0 |ACL_DST_0 |
+ * |:--------------|:--------------|
+ * |All |All |
+ */
+ static Status validate(const Properties &properties, const ArgumentPack<ITensorInfo> &tensors);
+ /** Constructor
+ *
+ * Similar to @ref ClComponentStore::validate()
+ */
+ ClComponentStore(ComponentId id, const Properties &properties, const ArgumentPack<ITensorInfo> &tensors);
+ /** Destructor */
+ ~ClComponentStore() override;
+ /** Prevent instances of this class from being copy constructed */
+ ClComponentStore(const ClComponentStore &component) = delete;
+ /** Prevent instances of this class from being copied */
+ ClComponentStore &operator=(const ClComponentStore &component) = delete;
+ /** Allow instances of this class to be move constructed */
+ ClComponentStore(ClComponentStore &&component) = default;
+ /** Allow instances of this class to be moved */
+ ClComponentStore &operator=(ClComponentStore &&component) = default;
+ /** Get writer for the component */
+ const IGpuCkwComponentDriver *ckw_component_driver() const override;
+ /** Get component type */
+ GpuComponentType type() const override
+ {
+ return GpuComponentType::Output;
+ }
+
+private:
+ std::unique_ptr<GpuCkwStore> _component_writer;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTSTORE_H
diff --git a/src/dynamic_fusion/sketch/gpu/components/utils/type_printer/ElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/components/utils/type_printer/ElementwiseBinary.h
new file mode 100644
index 0000000000..4c3e84e59d
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/utils/type_printer/ElementwiseBinary.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_UTILS_TYPE_PRINTER_ELEMENTWISEBINARY
+#define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_UTILS_TYPE_PRINTER_ELEMENTWISEBINARY
+
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h"
+
+#include <map>
+#include <ostream>
+#include <sstream>
+#include <string>
+
+namespace arm_compute
+{
+/** Type printers for all types related to the component @ref ClComponentElementwiseBinary
+ */
+
+using namespace experimental::dynamic_fusion;
+
+/** Formatted output of the arm_compute::experimental::dynamic_fusion::ClComponentElementwiseBinary::Attributes::ElementwiseOp type.
+ *
+ * @param[out] os Output stream.
+ * @param[in] op arm_compute::experimental::dynamic_fusion::ClComponentElementwiseBinary::Attributes::ElementwiseOp type to output.
+ *
+ * @return Modified output stream.
+ */
+inline ::std::ostream &operator<<(::std::ostream &os, const ClComponentElementwiseBinary::Attributes::ElementwiseOp &op)
+{
+ const std::map<ClComponentElementwiseBinary::Attributes::ElementwiseOp, std::string> op_name = {
+ {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Add, "add"},
+ {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Div, "div"},
+ {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Max, "max"},
+ {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Min, "min"},
+ {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Mul, "mul"},
+ {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Power, "power"},
+ {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Prelu, "prelu"},
+ {ClComponentElementwiseBinary::Attributes::ElementwiseOp::SquaredDiff, "squareddiff"},
+ {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Sub, "sub"}};
+ os << op_name.at(op);
+ return os;
+}
+/** Formatted output of the arm_compute::experimental::dynamic_fusion::ClComponentElementwiseBinary::Attributes::ElementwiseOp type.
+ *
+ * @param[in] op arm_compute::experimental::dynamic_fusion::ClComponentElementwiseBinary::Attributes::ElementwiseOp type to output.
+ *
+ * @return Formatted string.
+ */
+inline std::string to_string(const ClComponentElementwiseBinary::Attributes::ElementwiseOp &op)
+{
+ std::stringstream str;
+ str << op;
+ return str.str();
+}
+} // namespace arm_compute
+#endif /* ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_UTILS_TYPE_PRINTER_ELEMENTWISEBINARY */
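Usage follows directly from the definitions above; for example (std::cout additionally requires <iostream>):

    const std::string name = to_string(ClComponentElementwiseBinary::Attributes::ElementwiseOp::Add); // "add"
    std::cout << ClComponentElementwiseBinary::Attributes::ElementwiseOp::Mul << std::endl;           // "mul"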
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp
new file mode 100644
index 0000000000..201c9f243c
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h"
+
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+
+#include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+Status GpuAdd::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *lhs, const ITensorInfo *rhs)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32, DataType::U8,
+ DataType::S16, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->data_type() != rhs->data_type(), "Input tensors must be the same data type");
+
+ // Set the elementwise operation to Add then call the elementwise common validate_op
+ ElementwiseBinaryCommonAttributes common_attributes{};
+ common_attributes.operation(ElementwiseBinaryCommonAttributes::ElementwiseOp::Add);
+ return GpuElementwiseBinaryCommon::validate_op(sketch, lhs, rhs, common_attributes);
+}
+
+Status GpuAdd::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *lhs, const ITensorInfo *rhs)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32, DataType::U8,
+                                                         DataType::S16, DataType::S32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->data_type() != rhs->data_type(), "Input tensors must be the same data type");
+
+ // Set the elementwise operation to Add then call the elementwise common is_supported_op
+ ElementwiseBinaryCommonAttributes common_attributes{};
+ common_attributes.operation(ElementwiseBinaryCommonAttributes::ElementwiseOp::Add);
+ return GpuElementwiseBinaryCommon::is_supported_op(context, lhs, rhs, common_attributes);
+}
+
+ITensorInfo *GpuAdd::create_op(GpuWorkloadSketch &sketch, ITensorInfo *lhs, ITensorInfo *rhs)
+{
+ // No need to log or validate as they'll be handled inside GpuElementwiseBinaryCommon::create_op()
+ // Set the elementwise operation to Add then call the elementwise common create_op
+ ElementwiseBinaryCommonAttributes common_attributes{};
+ common_attributes.operation(ElementwiseBinaryCommonAttributes::ElementwiseOp::Add);
+ return GpuElementwiseBinaryCommon::create_op(sketch, lhs, rhs, common_attributes);
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuCast.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuCast.cpp
new file mode 100644
index 0000000000..d25a2a3153
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuCast.cpp
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuCast.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+Status is_supported_op_helper(const GpuWorkloadContext &context,
+ const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const CastAttributes &attributes)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON(src == dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
+
+ TensorInfo dst_info_to_validate;
+ const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate;
+
+ if (dst != nullptr)
+ {
+ dst_info_to_validate_ptr = dst;
+ }
+
+ auto_init_if_empty(dst_info_to_validate, src->clone()->set_data_type(attributes.data_type()));
+
+ // Check support level
+ // Data Type
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst_info_to_validate_ptr, 1, DataType::F16, DataType::F32);
+
+ if (context.gpu_language() == GpuLanguage::OpenCL)
+ {
+ const auto cl_compile_ctx = context.cl_compile_context();
+ ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr);
+ // Validate Cast Component
+ {
+ const auto properties =
+ IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+ auto settings = ClComponentCast::Settings();
+
+ ArgumentPack<ITensorInfo> arguments;
+ arguments.add_const_tensor(ACL_SRC_0, src);
+ arguments.add_const_tensor(ACL_DST_0, dst_info_to_validate_ptr);
+ ARM_COMPUTE_RETURN_ON_ERROR(ClComponentCast::validate(properties, arguments, attributes, settings));
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_MSG("Unimplemented Gpu language");
+ }
+
+ return Status{};
+}
+constexpr GpuOperatorType operator_type = GpuOperatorType::Simple;
+} // namespace
+
+Status
+GpuCast::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const CastAttributes &attributes)
+{
+ return is_supported_op_helper(context, src, nullptr, attributes);
+}
+
+Status GpuCast::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const CastAttributes &attributes)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
+ ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id());
+
+ // Refer to GpuConv2d::validate_op() for id-validness of this TensorInfo object
+ TensorInfo dst_info_to_validate;
+
+ // Auto initialize dst tensor info
+ auto_init_if_empty(dst_info_to_validate, src->clone()->set_data_type(attributes.data_type()));
+
+ // Perform fusion test
+ // Pack tensor infos
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC_0, src);
+ tensors.add_const_tensor(ACL_DST_0, &dst_info_to_validate);
+ const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!sketch.implementation().operator_group().try_add_operator(op),
+ "Operator fusion test failed. This operator cannot be fused into the workload");
+
+ // Check if configuration is supported
+ return is_supported_op_helper(*sketch.gpu_context(), src, &dst_info_to_validate, attributes);
+}
+
+ITensorInfo *GpuCast::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, const CastAttributes &attributes)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src);
+ ARM_COMPUTE_LOG_PARAMS(src, attributes);
+ ARM_COMPUTE_ERROR_THROW_ON(GpuCast::validate_op(sketch, src, attributes));
+
+ ITensorInfo *dst = sketch.implementation().create_virtual_tensor();
+ ARM_COMPUTE_ERROR_ON_NULLPTR(dst);
+
+ // Auto initialize dst tensor info if empty
+ auto_init_if_empty(*dst, src->clone()->set_data_type(attributes.data_type()));
+
+ // Translate into components and add to component graph
+ GpuKernelComponentGraph &comp_graph = sketch.implementation().component_graph();
+ const auto *sketch_ctx = sketch.implementation().context();
+
+ if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+ {
+ ARM_COMPUTE_ERROR_ON(sketch_ctx->cl_compile_context() == nullptr);
+
+        // Add Cast Component
+ {
+ const auto properties =
+ IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+ auto settings = ClComponentCast::Settings();
+
+ ArgumentPack<ITensorInfo> arguments;
+ arguments.add_const_tensor(ACL_SRC_0, src);
+ arguments.add_const_tensor(ACL_DST_0, dst);
+ comp_graph.add_new_component<ClComponentCast>(properties, arguments, attributes, settings);
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unimplemented Gpu language");
+ }
+
+ // Set up fusion test by adding to the Operator Group
+ // Note this has to be performed after all the components have been successfully added to the component graph
+
+ // Pack tensor infos
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC_0, src);
+ tensors.add_const_tensor(ACL_DST_0, dst);
+
+ const Operator op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+ sketch.implementation().operator_group().add_operator(op);
+
+ return dst;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuClamp.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuClamp.cpp
new file mode 100644
index 0000000000..4d6e7f81bb
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuClamp.cpp
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuClamp.h"
+
+#include "arm_compute/core/experimental/Types.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+Status is_supported_op_helper(const GpuWorkloadContext &context,
+ const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const ClampAttributes &attributes)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(attributes.max_val() < attributes.min_val(),
+ "Maximum clamp value cannot be lower than minimum value");
+
+ TensorInfo dst_info_to_validate;
+ const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate;
+
+ if (dst != nullptr)
+ {
+ dst_info_to_validate_ptr = dst;
+ }
+
+ auto_init_if_empty(dst_info_to_validate, *src->clone());
+
+ // CLAMP operator is implemented as LU_BOUNDED_RELU with the alpha and beta variables swapped
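+    // (LU_BOUNDED_RELU computes min(a, max(b, x)), so a = max_val and b = min_val yields
+    // the [min_val, max_val] clamp.)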
+ const ClComponentActivation::Attributes act_info{ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ attributes.max_val(), attributes.min_val()};
+
+ // Check components
+ if (context.gpu_language() == GpuLanguage::OpenCL)
+ {
+ // Validate Activation Component
+ const auto properties =
+ IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+
+ ArgumentPack<ITensorInfo> arguments;
+ arguments.add_const_tensor(ACL_SRC, src);
+ arguments.add_const_tensor(ACL_DST, dst_info_to_validate_ptr);
+ ARM_COMPUTE_RETURN_ON_ERROR(ClComponentActivation::validate(properties, arguments, act_info));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_MSG("Unimplemented Gpu language");
+ }
+ return Status{};
+}
+
+constexpr GpuOperatorType operator_type = GpuOperatorType::Simple;
+} // namespace
+
+Status
+GpuClamp::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const ClampAttributes &attributes)
+{
+ return is_supported_op_helper(context, src, nullptr, attributes);
+}
+
+Status GpuClamp::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const ClampAttributes &attributes)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
+
+    // Check if tensors have valid ids, i.e. they are created from a sketch
+ ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id());
+
+ // Refer to GpuConv2d::validate_op() for id-validness of this TensorInfo object
+ TensorInfo dst_info_to_validate;
+
+ // Auto initialize dst tensor info
+ auto_init_if_empty(dst_info_to_validate, *src->clone());
+
+ // Perform fusion test to check if the operator meets fusion constraints
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC, src);
+ tensors.add_const_tensor(ACL_DST, &dst_info_to_validate);
+ const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!sketch.implementation().operator_group().try_add_operator(op),
+ "Operator fusion test failed. This operator cannot be fused into the workload");
+
+ // Check if configuration is supported
+ return is_supported_op_helper(*sketch.gpu_context(), src, &dst_info_to_validate, attributes);
+}
+
+ITensorInfo *GpuClamp::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, const ClampAttributes &attributes)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src);
+ ARM_COMPUTE_LOG_PARAMS(src, attributes);
+ ARM_COMPUTE_ERROR_THROW_ON(GpuClamp::validate_op(sketch, src, attributes));
+
+ ITensorInfo *dst = sketch.implementation().create_virtual_tensor();
+ ARM_COMPUTE_ERROR_ON_NULLPTR(dst);
+
+ // Auto initialize dst tensor
+ auto_init_if_empty(*dst, *src->clone());
+
+ // Translate into components and add to component graph
+ GpuKernelComponentGraph &comp_graph = sketch.implementation().component_graph();
+
+    // The CLAMP operator is implemented as LU_BOUNDED_RELU, with alpha set to the clamp's maximum value and beta to its minimum value
+ const ClComponentActivation::Attributes act_info{ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ attributes.max_val(), attributes.min_val()};
+
+ const auto *const sketch_ctx = sketch.implementation().context();
+
+ if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+ {
+ // Add Activation Component
+ auto properties = IGpuKernelComponent::Properties();
+ properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+
+ ArgumentPack<ITensorInfo> arguments;
+ arguments.add_const_tensor(ACL_SRC, src);
+ arguments.add_const_tensor(ACL_DST, dst);
+ comp_graph.add_new_component<ClComponentActivation>(properties, arguments, act_info);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unimplemented Gpu language");
+ }
+
+ // Set up fusion test by adding to the Operator Group
+ // Note this has to be performed after all the components have been successfully added to the component graph
+
+ // Pack tensor infos
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC, src);
+ tensors.add_const_tensor(ACL_DST, dst);
+
+ const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+ sketch.implementation().operator_group().add_operator(op);
+
+ return dst;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp
new file mode 100644
index 0000000000..aaeec543f8
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp
@@ -0,0 +1,270 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h"
+
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
+#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+#include "src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h"
+#include "src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+DirectConvComputeKernelInfo
+config_direct_convolution_nhwc(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info)
+{
+ // Get GPU target
+ GPUTarget gpu_target = CLScheduler::get().target();
+
+ std::unique_ptr<arm_compute::cl_direct_conv::IClDirectConvKernelConfig> t =
+ arm_compute::cl_direct_conv::ClDirectConvKernelConfigurationFactory::create(gpu_target);
+
+ return t->configure(src, weights, conv_info);
+}
+
+void calculate_and_init_dst_if_empty(ITensorInfo *dst,
+ const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const Conv2dAttributes &attributes)
+{
+ if (dst->total_size() == 0U)
+ {
+ const auto shape = misc::shape_calculator::compute_deep_convolution_shape(
+ src->tensor_shape(), src->data_layout(), wei->tensor_shape(),
+ PadStrideInfo(attributes.stride().x(), attributes.stride().y(), attributes.pad().left,
+ attributes.pad().right, attributes.pad().top, attributes.pad().bottom,
+ DimensionRoundingType::FLOOR)); // use the default DimensionRoundingType
+
+ auto_init_if_empty(*dst, src->clone()->set_tensor_shape(shape));
+ }
+}
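+
+// Worked example (illustrative only): for an NHWC input of 32x32 pixels with 16 channels, 64
+// filters of size 3x3, stride 1 and padding 1 on every side, compute_deep_convolution_shape()
+// yields a 32x32 output with 64 channels ((32 + 2 * 1 - 3) / 1 + 1 = 32 per spatial dimension),
+// and the batch dimension is carried over unchanged. The dst info is then auto-initialised with
+// that shape and the input's data type and layout.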
+
+/* A helper method to reduce the duplication in dst tensor initialization
+ * when calling validate()
+ */
+Status is_supported_op_helper(const GpuWorkloadContext &context,
+ const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const ITensorInfo *bia,
+ const ITensorInfo *dst,
+ const Conv2dAttributes &attributes)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, wei);
+
+ TensorInfo dst_info_to_validate;
+ const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate;
+
+ if (dst != nullptr)
+ {
+ dst_info_to_validate_ptr = dst;
+ }
+
+ calculate_and_init_dst_if_empty(&dst_info_to_validate, src, wei, attributes);
+
+ // Check support level
+ // Data type
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+ // Data layout
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC);
+
+ // Check components
+ const auto gpu_target = context.gpu_target();
+ if (context.gpu_language() == GpuLanguage::OpenCL)
+ {
+ const auto cl_compile_ctx = context.cl_compile_context();
+ ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr);
+ // Validate Direct Conv2d Component
+ {
+ const auto properties =
+ IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+ auto settings = ClComponentDirectConv2d::Settings();
+
+ settings.fast_relaxed_math(
+ (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) &&
+ (dst_info_to_validate_ptr->data_type() == DataType::F32 ||
+ dst_info_to_validate_ptr->data_type() == DataType::F16));
+
+ ArgumentPack<ITensorInfo> arguments;
+ arguments.add_const_tensor(ACL_SRC_0, src);
+ arguments.add_const_tensor(ACL_SRC_1, wei);
+ arguments.add_const_tensor(ACL_SRC_2, bia);
+ arguments.add_const_tensor(ACL_DST_0, dst_info_to_validate_ptr);
+ ARM_COMPUTE_RETURN_ON_ERROR(ClComponentDirectConv2d::validate(properties, arguments, attributes, settings));
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_MSG("Unimplemented Gpu language");
+ }
+ return Status{};
+}
+
+constexpr GpuOperatorType operator_type = GpuOperatorType::Complex;
+} // namespace
+
+Status GpuConv2d::is_supported_op(const GpuWorkloadContext &context,
+ const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const ITensorInfo *bia,
+ const Conv2dAttributes &attributes)
+{
+ return is_supported_op_helper(context, src, wei, bia, nullptr, attributes);
+}
+
+Status GpuConv2d::validate_op(const GpuWorkloadSketch &sketch,
+ const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const ITensorInfo *bia,
+ const Conv2dAttributes &attributes)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, wei);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!wei->are_values_constant(), "Dynamic weights are not supported");
+
+    // Check if tensors have valid ids, i.e. they are created from a sketch
+ ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id() || !wei->has_valid_id());
+ if (bia != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(!bia->has_valid_id());
+ }
+
+    // This tensor info will have an invalid id, but because all the existing tensors in the
+    // sketch have valid ids and the DependencyGraph implementation has no notion of id
+    // validness, it is treated as just another tensor id and will pass validation.
+    // Additionally, a new dst id is created on every create_op call, so there is no need to validate it here
+ TensorInfo dst_info_to_validate;
+
+ // Auto initialize dst tensor info
+ calculate_and_init_dst_if_empty(&dst_info_to_validate, src, wei, attributes);
+
+ // Perform fusion test
+ // Check if operator meets fusion constraints
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC_0, src);
+ tensors.add_const_tensor(ACL_SRC_1, wei);
+ tensors.add_const_tensor(ACL_SRC_2, bia);
+ tensors.add_const_tensor(ACL_DST_0, &dst_info_to_validate);
+ const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!sketch.implementation().operator_group().try_add_operator(op),
+ "Operator fusion test failed. This operator cannot be fused into the workload");
+
+ // Check if configuration is supported
+ return is_supported_op_helper(*sketch.gpu_context(), src, wei, bia, &dst_info_to_validate, attributes);
+}
+
+ITensorInfo *GpuConv2d::create_op(
+ GpuWorkloadSketch &sketch, ITensorInfo *src, ITensorInfo *wei, ITensorInfo *bia, const Conv2dAttributes &attributes)
+{
+ ARM_COMPUTE_LOG_PARAMS(src, wei, bia, attributes);
+ PadStrideInfo conv_info(attributes.stride().x(), attributes.stride().y(), attributes.pad().left,
+ attributes.pad().right, attributes.pad().top, attributes.pad().bottom,
+ DimensionRoundingType::FLOOR);
+ // Initialize the direct convolution descriptor
+ const DirectConvComputeKernelInfo desc = config_direct_convolution_nhwc(src, wei, conv_info);
+
+ ITensorInfo *dst = sketch.implementation().create_virtual_tensor();
+
+ // Assert validation
+ ARM_COMPUTE_ERROR_THROW_ON(GpuConv2d::validate_op(sketch, src, wei, bia, attributes));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, wei, dst);
+
+ // Auto initialize dst tensor
+ calculate_and_init_dst_if_empty(dst, src, wei, attributes);
+
+ // Translate into components and add to component graph
+ auto &comp_graph = sketch.implementation().component_graph();
+
+ const auto sketch_ctx = sketch.implementation().context();
+
+ const auto gpu_target = sketch_ctx->gpu_target();
+
+ if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+ {
+ const auto cl_compile_ctx = sketch_ctx->cl_compile_context();
+ ARM_COMPUTE_ERROR_ON(cl_compile_ctx == nullptr);
+ ARM_COMPUTE_UNUSED(cl_compile_ctx);
+
+ // Add Direct Conv2d Component
+ {
+ auto properties = IGpuKernelComponent::Properties();
+ properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+
+ auto settings = ClComponentDirectConv2d::Settings();
+
+ settings.fast_relaxed_math(
+ (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) &&
+ (dst->data_type() == DataType::F32 || dst->data_type() == DataType::F16));
+
+ settings.direct_conv_descriptor(desc);
+
+ if (settings.export_to_cl_image())
+ {
+ arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(wei);
+ }
+
+ ArgumentPack<ITensorInfo> arguments;
+ arguments.add_const_tensor(ACL_SRC_0, src);
+ arguments.add_const_tensor(ACL_SRC_1, wei);
+ arguments.add_const_tensor(ACL_SRC_2, bia);
+ arguments.add_const_tensor(ACL_DST_0, dst);
+ comp_graph.add_new_component<ClComponentDirectConv2d>(properties, arguments, attributes, settings);
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unimplemented Gpu language");
+ }
+
+ // Set up fusion test by adding to the Operator Group
+ // Note this has to be performed after all the components have been successfully added to the component graph
+
+ // Pack tensor infos
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC_0, src);
+ tensors.add_const_tensor(ACL_SRC_1, wei);
+ tensors.add_const_tensor(ACL_SRC_2, bia);
+ tensors.add_const_tensor(ACL_DST_0, dst);
+
+ const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+ sketch.implementation().operator_group().add_operator(op);
+
+ return dst;
+}
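+
+// Usage sketch (illustrative only; `context`, `input_info`, `weights_info`, `bias_info`, `conv_attr`
+// and `clamp_attr` are assumed to exist and are not defined in this file):
+//
+//   GpuWorkloadSketch sketch{&context};
+//   ITensorInfo *conv_out = GpuConv2d::create_op(sketch, input_info, weights_info, bias_info, conv_attr);
+//   // conv_out is a virtual (sketch-internal) tensor info; it can feed further fused operators,
+//   // e.g. a clamp, before being written to a real tensor with GpuOutput::create_op().
+//   ITensorInfo *act_out = GpuClamp::create_op(sketch, conv_out, clamp_attr);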
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp
new file mode 100644
index 0000000000..e2b673bd43
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp
@@ -0,0 +1,278 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.h"
+
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
+#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+#include "src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h"
+#include "src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+void calculate_and_init_dst_if_empty(ITensorInfo *dst,
+ const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const DepthwiseConv2dAttributes &attributes)
+{
+ if (dst->total_size() == 0U)
+ {
+ const PadStrideInfo pad_stride_info(attributes.stride().x(), attributes.stride().y(), attributes.pad().left,
+ attributes.pad().right, attributes.pad().top, attributes.pad().bottom,
+ attributes.dimension_rounding_type());
+
+ const ConvolutionInfo conv_info{pad_stride_info, attributes.depth_multiplier(), ActivationLayerInfo(),
+ attributes.dilation()};
+ const TensorShape shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *wei, conv_info);
+
+ auto_init_if_empty(*dst, src->clone()->set_tensor_shape(shape));
+ }
+}
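+
+// Worked example (illustrative only): a depthwise convolution does not mix channels, so the output
+// channel count is input_channels * depth_multiplier, e.g. 16 input channels with a depth
+// multiplier of 2 give 32 output channels, while the spatial dimensions follow the usual
+// stride/padding/dilation arithmetic captured in pad_stride_info above.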
+
+/* A helper method to reduce the duplication in dst tensor initialization
+ * when calling validate()
+ */
+Status is_supported_op_helper(const GpuWorkloadContext &context,
+ const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const ITensorInfo *bia,
+ const ITensorInfo *dst,
+ const DepthwiseConv2dAttributes &attributes)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, wei);
+
+ TensorInfo dst_info_to_validate;
+ const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate;
+
+ if (dst != nullptr)
+ {
+ dst_info_to_validate_ptr = dst;
+ }
+
+ calculate_and_init_dst_if_empty(&dst_info_to_validate, src, wei, attributes);
+
+ // Check support level
+ // Data type
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+ // Data layout
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC);
+
+ const GpuTarget gpu_target = context.gpu_target();
+
+ if (context.gpu_language() == GpuLanguage::OpenCL)
+ {
+ const CLCompileContext *cl_compile_ctx = context.cl_compile_context();
+ ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr);
+
+ // Validate Depthwise Conv2d Component
+ {
+ const auto properties =
+ IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+ auto settings = ClComponentDepthwiseConv2d::Settings();
+
+ const PadStrideInfo legacy_conv_info(attributes.stride().x(), attributes.stride().y(),
+ attributes.pad().left, attributes.pad().right, attributes.pad().top,
+ attributes.pad().bottom, DimensionRoundingType::FLOOR);
+
+ // Get the depthwise convolution compute parameters
+ auto t = arm_compute::cl_dwc::ClDWCNativeKernelConfigurationFactory::create(gpu_target);
+ const DWCComputeKernelInfo dwc_info =
+ t->configure(src, wei, legacy_conv_info, attributes.dilation(), attributes.depth_multiplier());
+
+ settings.fast_relaxed_math(
+ (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) &&
+ (dst_info_to_validate_ptr->data_type() == DataType::F32 ||
+ dst_info_to_validate_ptr->data_type() == DataType::F16));
+
+            settings.is_fma_available(get_arch_from_target(gpu_target) != GPUTarget::MIDGARD)
+ .m0(dwc_info.m0)
+ .n0(dwc_info.n0)
+ .export_input_to_cl_image(dwc_info.export_input_to_cl_image)
+ .export_weights_to_cl_image(dwc_info.export_weights_to_cl_image);
+
+ ArgumentPack<ITensorInfo> arguments;
+ arguments.add_const_tensor(ACL_SRC_0, src);
+ arguments.add_const_tensor(ACL_SRC_1, wei);
+ arguments.add_const_tensor(ACL_SRC_2, bia);
+ arguments.add_const_tensor(ACL_DST_0, dst_info_to_validate_ptr);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ ClComponentDepthwiseConv2d::validate(properties, arguments, attributes, settings));
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_MSG("Unimplemented Gpu language");
+ }
+
+ return Status{};
+}
+
+constexpr GpuOperatorType operator_type = GpuOperatorType::Complex;
+} // namespace
+
+Status GpuDepthwiseConv2d::is_supported_op(const GpuWorkloadContext &context,
+ const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const ITensorInfo *bia,
+ const DepthwiseConv2dAttributes &attributes)
+{
+ return is_supported_op_helper(context, src, wei, bia, nullptr, attributes);
+}
+
+Status GpuDepthwiseConv2d::validate_op(const GpuWorkloadSketch &sketch,
+ const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const ITensorInfo *bia,
+ const DepthwiseConv2dAttributes &attributes)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, wei);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!wei->are_values_constant(), "Dynamic weights are not supported");
+
+ ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id() || !wei->has_valid_id());
+
+ if (bia != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(!bia->has_valid_id());
+ }
+
+ // Refer to GpuConv2d::validate_op() for id-validness of this TensorInfo object
+ TensorInfo dst_info_to_validate;
+
+ // Auto initialize dst tensor info
+ calculate_and_init_dst_if_empty(&dst_info_to_validate, src, wei, attributes);
+
+ // Perform fusion test
+ // Pack tensor infos
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC_0, src);
+ tensors.add_const_tensor(ACL_SRC_1, wei);
+ tensors.add_const_tensor(ACL_SRC_2, bia);
+ tensors.add_const_tensor(ACL_DST_0, &dst_info_to_validate);
+ const Operator op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!sketch.implementation().operator_group().try_add_operator(op),
+ "Operator fusion test failed. This operator cannot be fused into the workload");
+
+ // Check if configuration is supported
+ return is_supported_op_helper(*sketch.gpu_context(), src, wei, bia, &dst_info_to_validate, attributes);
+}
+
+ITensorInfo *GpuDepthwiseConv2d::create_op(GpuWorkloadSketch &sketch,
+ ITensorInfo *src,
+ ITensorInfo *wei,
+ ITensorInfo *bia,
+ const DepthwiseConv2dAttributes &attributes)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, wei);
+ ARM_COMPUTE_LOG_PARAMS(src, wei, bia, attributes);
+ ARM_COMPUTE_ERROR_THROW_ON(GpuDepthwiseConv2d::validate_op(sketch, src, wei, bia, attributes));
+
+ ITensorInfo *dst = sketch.implementation().create_virtual_tensor();
+ ARM_COMPUTE_ERROR_ON_NULLPTR(dst);
+
+ calculate_and_init_dst_if_empty(dst, src, wei, attributes);
+
+ // Translate into components and add to component graph
+ GpuKernelComponentGraph &comp_graph = sketch.implementation().component_graph();
+ const auto *sketch_ctx = sketch.implementation().context();
+ const GpuTarget gpu_target = sketch_ctx->gpu_target();
+
+ if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR(sketch_ctx->cl_compile_context());
+
+ // Add Depthwise Conv2d Component
+ {
+ const auto properties =
+ IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+ auto settings = ClComponentDepthwiseConv2d::Settings();
+
+ const PadStrideInfo legacy_conv_info(attributes.stride().x(), attributes.stride().y(),
+ attributes.pad().left, attributes.pad().right, attributes.pad().top,
+ attributes.pad().bottom, DimensionRoundingType::FLOOR);
+
+ // Get the depthwise convolution compute parameters
+ auto t = arm_compute::cl_dwc::ClDWCNativeKernelConfigurationFactory::create(gpu_target);
+ const DWCComputeKernelInfo dwc_info =
+ t->configure(src, wei, legacy_conv_info, attributes.dilation(), attributes.depth_multiplier());
+
+ settings.is_fma_available(get_arch_from_target(gpu_target) != GPUTarget::MIDGARD)
+ .m0(dwc_info.m0)
+ .n0(dwc_info.n0)
+ .export_input_to_cl_image(dwc_info.export_input_to_cl_image)
+ .export_weights_to_cl_image(dwc_info.export_weights_to_cl_image);
+
+ if (settings.export_input_to_cl_image())
+ {
+ arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(src);
+ }
+
+ if (settings.export_weights_to_cl_image())
+ {
+ arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(wei);
+ }
+
+ ArgumentPack<ITensorInfo> arguments;
+ arguments.add_const_tensor(ACL_SRC_0, src);
+ arguments.add_const_tensor(ACL_SRC_1, wei);
+ arguments.add_const_tensor(ACL_SRC_2, bia);
+ arguments.add_const_tensor(ACL_DST_0, dst);
+ comp_graph.add_new_component<ClComponentDepthwiseConv2d>(properties, arguments, attributes, settings);
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unimplemented Gpu language");
+ }
+
+ // Set up fusion test by adding to the Operator Group
+ // Note this has to be performed after all the components have been successfully added to the component graph
+
+ // Pack tensor infos
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC_0, src);
+ tensors.add_const_tensor(ACL_SRC_1, wei);
+ tensors.add_const_tensor(ACL_SRC_2, bia);
+ tensors.add_const_tensor(ACL_DST_0, dst);
+
+ const Operator op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+ sketch.implementation().operator_group().add_operator(op);
+
+ return dst;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuMatMul.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuMatMul.cpp
new file mode 100644
index 0000000000..2997b28ec1
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuMatMul.cpp
@@ -0,0 +1,245 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMatMul.h"
+
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentMatMul.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+void calculate_and_init_dst_if_empty(ITensorInfo *dst,
+ const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const MatMulAttributes &attributes,
+ const GpuMatMulSettings &settings)
+{
+ if (dst->total_size() == 0U)
+ {
+ const auto dst_shape = misc::shape_calculator::compute_matmul_shape(
+ lhs->tensor_shape(), rhs->tensor_shape(),
+ MatMulKernelInfo(attributes.adj_lhs(), attributes.adj_rhs(), settings.m0(), settings.n0(), settings.k0()));
+
+ auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(dst_shape));
+ }
+}
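+
+// Worked example (illustrative only): with adj_lhs = adj_rhs = false, an lhs of shape [M, K] and an
+// rhs of shape [K, N] produce a dst of shape [M, N]; any batch dimensions of lhs are carried over.
+// The m0/n0/k0 values in GpuMatMulSettings only tune the kernel tiling and do not affect the
+// output shape.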
+
+/* A helper method to reduce the duplication in dst tensor initialization
+ * when calling validate()
+ */
+Status is_supported_op_helper(const GpuWorkloadContext &context,
+ const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *dst,
+ const MatMulAttributes &attributes,
+ const GpuMatMulSettings &settings)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs);
+
+ TensorInfo dst_info_to_validate;
+ const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate;
+
+ if (dst != nullptr)
+ {
+ dst_info_to_validate_ptr = dst;
+ }
+
+ calculate_and_init_dst_if_empty(&dst_info_to_validate, lhs, rhs, attributes, settings);
+
+ // Check support level
+ // Data type
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32);
+
+ // Check components
+ if (context.gpu_language() == GpuLanguage::OpenCL)
+ {
+ const auto cl_compile_ctx = context.cl_compile_context();
+ ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr);
+ // Validate MatMul Component
+ {
+ const auto properties =
+ IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+
+ ArgumentPack<ITensorInfo> arguments;
+ arguments.add_const_tensor(ACL_SRC_0, lhs);
+ arguments.add_const_tensor(ACL_SRC_1, rhs);
+ arguments.add_const_tensor(ACL_DST_0, dst_info_to_validate_ptr);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(ClComponentMatMul::validate(properties, arguments, attributes, settings));
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_MSG("Unimplemented Gpu language");
+ }
+ return Status{};
+}
+
+constexpr GpuOperatorType operator_type = GpuOperatorType::Complex;
+} // namespace
+
+int GpuMatMulSettings::n0() const
+{
+ return _n0;
+}
+
+GpuMatMulSettings &GpuMatMulSettings::n0(int n0)
+{
+ _n0 = n0;
+ return *this;
+}
+
+int GpuMatMulSettings::m0() const
+{
+ return _m0;
+}
+
+GpuMatMulSettings &GpuMatMulSettings::m0(int m0)
+{
+ _m0 = m0;
+ return *this;
+}
+
+int GpuMatMulSettings::k0() const
+{
+ return _k0;
+}
+
+GpuMatMulSettings &GpuMatMulSettings::k0(int k0)
+{
+ _k0 = k0;
+ return *this;
+}
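+
+// The setters above return *this, so the settings can be built fluently. A minimal sketch
+// (the tile sizes below are purely illustrative, not recommended values):
+//
+//   GpuMatMulSettings settings{};
+//   settings.m0(4).n0(4).k0(4);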
+
+Status GpuMatMul::is_supported_op(const GpuWorkloadContext &context,
+ const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const MatMulAttributes &attributes,
+ const GpuMatMulSettings &settings)
+{
+ return is_supported_op_helper(context, lhs, rhs, nullptr, attributes, settings);
+}
+
+Status GpuMatMul::validate_op(const GpuWorkloadSketch &sketch,
+ const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const MatMulAttributes &attributes,
+ const GpuMatMulSettings &settings)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs);
+
+    // Check if tensors have valid ids, i.e. they are created from a sketch
+ ARM_COMPUTE_RETURN_ERROR_ON(!lhs->has_valid_id() || !rhs->has_valid_id());
+
+    // Refer to GpuConv2d::validate_op() for id-validness of this TensorInfo object
+ TensorInfo dst_info_to_validate;
+
+ // Auto initialize dst tensor info
+ calculate_and_init_dst_if_empty(&dst_info_to_validate, lhs, rhs, attributes, settings);
+
+ // Perform fusion test
+ // Check if operator meets fusion constraints
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC_0, lhs);
+ tensors.add_const_tensor(ACL_SRC_1, rhs);
+ tensors.add_const_tensor(ACL_DST_0, &dst_info_to_validate);
+ const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!sketch.implementation().operator_group().try_add_operator(op),
+ "Operator fusion test failed. This operator cannot be fused into the workload");
+
+ // Check if configuration is supported
+ return is_supported_op_helper(*sketch.gpu_context(), lhs, rhs, &dst_info_to_validate, attributes, settings);
+}
+
+ITensorInfo *GpuMatMul::create_op(GpuWorkloadSketch &sketch,
+ ITensorInfo *lhs,
+ ITensorInfo *rhs,
+ const Attributes &attributes,
+ const Settings &settings)
+{
+ ARM_COMPUTE_LOG_PARAMS(lhs, rhs, attributes, settings);
+
+ ITensorInfo *dst = sketch.implementation().create_virtual_tensor();
+
+ // Assert validation
+ ARM_COMPUTE_ERROR_THROW_ON(GpuMatMul::validate_op(sketch, lhs, rhs, attributes, settings));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
+
+ // Auto initialize dst tensor
+ calculate_and_init_dst_if_empty(dst, lhs, rhs, attributes, settings);
+
+ // Translate into components and add to component graph
+ auto &comp_graph = sketch.implementation().component_graph();
+ const auto sketch_ctx = sketch.implementation().context();
+
+ if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+ {
+ auto properties = IGpuKernelComponent::Properties();
+ properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+
+ ArgumentPack<ITensorInfo> arguments;
+ arguments.add_const_tensor(ACL_SRC_0, lhs);
+ arguments.add_const_tensor(ACL_SRC_1, rhs);
+ arguments.add_const_tensor(ACL_DST_0, dst);
+ comp_graph.add_new_component<ClComponentMatMul>(properties, arguments, attributes, settings);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unimplemented Gpu language");
+ }
+
+ // Set up fusion test by adding to the Operator Group
+ // Note this has to be performed after all the components have been successfully added to the component graph
+
+ // Pack tensor infos
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC_0, lhs);
+ tensors.add_const_tensor(ACL_SRC_1, rhs);
+ tensors.add_const_tensor(ACL_DST_0, dst);
+
+ const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+ sketch.implementation().operator_group().add_operator(op);
+
+ return dst;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuMul.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuMul.cpp
new file mode 100644
index 0000000000..b871171e8d
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuMul.cpp
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMul.h"
+
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+
+#include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+Status GpuMul::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *lhs, const ITensorInfo *rhs)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->data_type() != rhs->data_type(), "Input tensors must have the same data type");
+
+ // Set the elementwise operation to Mul then call the elementwise common validate_op
+ ElementwiseBinaryCommonAttributes common_attributes{};
+ common_attributes.operation(ElementwiseBinaryCommonAttributes::ElementwiseOp::Mul);
+ return GpuElementwiseBinaryCommon::validate_op(sketch, lhs, rhs, common_attributes);
+}
+
+Status GpuMul::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *lhs, const ITensorInfo *rhs)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->data_type() != rhs->data_type(), "Input tensors must have the same data type");
+
+ // Set the elementwise operation to Mul then call the elementwise common is_supported_op
+ ElementwiseBinaryCommonAttributes common_attributes{};
+ common_attributes.operation(ElementwiseBinaryCommonAttributes::ElementwiseOp::Mul);
+ return GpuElementwiseBinaryCommon::is_supported_op(context, lhs, rhs, common_attributes);
+}
+
+ITensorInfo *GpuMul::create_op(GpuWorkloadSketch &sketch, ITensorInfo *lhs, ITensorInfo *rhs)
+{
+ // Set the elementwise operation to Mul then call the elementwise common create_op
+ ElementwiseBinaryCommonAttributes common_attributes{};
+ common_attributes.operation(ElementwiseBinaryCommonAttributes::ElementwiseOp::Mul);
+ return GpuElementwiseBinaryCommon::create_op(sketch, lhs, rhs, common_attributes);
+}
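+
+// GpuMul is a thin wrapper over GpuElementwiseBinaryCommon: it only selects ElementwiseOp::Mul and
+// forwards the call. Other binary operators follow the same pattern with a different enum value,
+// e.g. (illustrative sketch, assuming an Add entry exists in ElementwiseOp):
+//
+//   ElementwiseBinaryCommonAttributes attr{};
+//   attr.operation(ElementwiseBinaryCommonAttributes::ElementwiseOp::Add);
+//   return GpuElementwiseBinaryCommon::create_op(sketch, lhs, rhs, attr);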
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp
new file mode 100644
index 0000000000..f0d368d757
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
+#include "src/dynamic_fusion/utils/Utils.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+constexpr GpuOperatorType operator_type = GpuOperatorType::Simple;
+} // namespace
+
+Status GpuOutput::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+
+ // Initialize the destination tensor info.
+ TensorInfo dst_to_validate = *dst;
+ auto_init_if_empty(dst_to_validate, *src);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, &dst_to_validate);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, &dst_to_validate);
+
+ ARM_COMPUTE_UNUSED(context);
+ return Status{};
+}
+
+Status GpuOutput::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id());
+ ARM_COMPUTE_RETURN_ERROR_ON(!is_alloc_tensor(dst));
+
+ // Initialize the destination tensor info.
+ TensorInfo dst_to_validate = *dst;
+ auto_init_if_empty(dst_to_validate, *src);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, &dst_to_validate);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, &dst_to_validate);
+
+ // Perform fusion test.
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC_0, src);
+ tensors.add_const_tensor(ACL_DST_0, &dst_to_validate);
+
+ const auto group = sketch.implementation().operator_group();
+ const auto op = group.new_operator(operator_type, tensors);
+ const auto success = group.try_add_operator(op, true);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!success, "This operator cannot be fused into the workload.");
+
+ const auto status = is_supported_op(*sketch.gpu_context(), src, dst);
+ return status;
+}
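+
+// Note on the is_alloc_tensor() check above: unlike other operators, GpuOutput writes to a
+// user-provided (allocatable) tensor info rather than a sketch-internal virtual tensor, so dst
+// must not come from create_virtual_tensor(). A minimal sketch (create_tensor_info() is assumed
+// from the public GpuWorkloadContext API and shown for illustration only):
+//
+//   ITensorInfo *dst_info = context.create_tensor_info(); // user-visible tensor, auto-initialised by create_op if empty
+//   GpuOutput::create_op(sketch, conv_out, dst_info);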
+
+void GpuOutput::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, ITensorInfo *dst)
+{
+ ARM_COMPUTE_LOG_PARAMS(src, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(GpuOutput::validate_op(sketch, src, dst));
+
+ // Auto initialize dst tensor info if empty
+ auto_init_if_empty(*dst, *src);
+
+ // Translate into components and add to component graph
+ auto &comp_graph = sketch.implementation().component_graph();
+ const auto sketch_ctx = sketch.implementation().context();
+
+ if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+ {
+ ARM_COMPUTE_ERROR_ON(sketch_ctx->cl_compile_context() == nullptr);
+
+ // Add store component
+ {
+ IGpuKernelComponent::Properties properties;
+ properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+
+ ArgumentPack<ITensorInfo> arguments;
+ arguments.add_const_tensor(ACL_SRC_0, src);
+ arguments.add_const_tensor(ACL_DST_0, dst);
+ comp_graph.add_new_component<ClComponentStore>(properties, arguments);
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unimplemented Gpu language");
+ }
+
+ // Set up fusion test by adding to the Operator Group
+ // Note this has to be performed after all the components have been successfully added to the component graph
+
+ // Pack tensor infos
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC_0, src);
+ tensors.add_const_tensor(ACL_DST_0, dst);
+
+ const Operator op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+ sketch.implementation().operator_group().add_operator(op, true);
+}
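+
+// End-to-end usage sketch for a fused workload (names are illustrative; the runtime calls are
+// assumptions based on the ClWorkloadRuntime API and are not defined in this file):
+//
+//   GpuWorkloadSketch sketch{&context};
+//   ITensorInfo *ans = GpuConv2d::create_op(sketch, input_info, weights_info, bias_info, conv_attr);
+//   GpuOutput::create_op(sketch, ans, output_info); // ends the fused sequence at a real tensor
+//
+//   ClWorkloadRuntime runtime;
+//   runtime.configure(sketch);
+//   runtime.run({&input, &weights, &bias, &output});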
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuPool2d.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuPool2d.cpp
new file mode 100644
index 0000000000..2d04f75610
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuPool2d.cpp
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h"
+
+#include "arm_compute/core/CL/CLCompileContext.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h"
+#include "src/dynamic_fusion/utils/Utils.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+void calculate_and_init_dst_if_empty(ITensorInfo *dst,
+ const ITensorInfo *src,
+ const Pool2dAttributes &attributes,
+ const GpuPool2dSettings &settings)
+{
+ ARM_COMPUTE_UNUSED(settings);
+
+ if (dst->total_size() == 0U)
+ {
+ auto shape = misc::shape_calculator::compute_pool_shape(
+ *src, convert_pool_attr_to_pool_info(attributes, /* mixed_precision */ true));
+ auto_init_if_empty(*dst, src->clone()->set_tensor_shape(shape));
+ }
+}
+
+constexpr GpuOperatorType operator_type = GpuOperatorType::Complex;
+} // namespace
+
+GpuPool2dSettings GpuPool2dSettings::use_inf_as_limit(bool use_inf_as_limit)
+{
+ _use_inf_as_limit = use_inf_as_limit;
+ return *this;
+}
+
+bool GpuPool2dSettings::use_inf_as_limit() const
+{
+ return _use_inf_as_limit;
+}
+
+Status GpuPool2d::validate_op(const GpuWorkloadSketch &sketch,
+ const ITensorInfo *src,
+ const Pool2dAttributes &attributes,
+ const GpuPool2dSettings &settings)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
+ ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id());
+
+ // Auto initialize dst tensor info
+ TensorInfo dst_info_to_validate;
+
+ calculate_and_init_dst_if_empty(&dst_info_to_validate, src, attributes, settings);
+
+ // Perform fusion test
+ // Pack tensor infos
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC_0, src);
+ tensors.add_const_tensor(ACL_DST_0, &dst_info_to_validate);
+
+ const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!sketch.implementation().operator_group().try_add_operator(op),
+ "Operator fusion test failed. This operator cannot be fused into the workload");
+
+ // Check if configuration is supported
+ return is_supported_op(*sketch.gpu_context(), src, attributes, settings);
+}
+
+Status GpuPool2d::is_supported_op(const GpuWorkloadContext &context,
+ const ITensorInfo *src,
+ const Pool2dAttributes &attributes,
+ const GpuPool2dSettings &settings)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
+ // Data type
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+ // Data layout
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC);
+    // Exclude padding must be enabled
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!attributes.exclude_padding(),
+ "Exclude padding must be set to true in Attributes!");
+
+ // Auto initialize dst tensor info
+ TensorInfo dst_info_to_validate;
+
+ calculate_and_init_dst_if_empty(&dst_info_to_validate, src, attributes, settings);
+
+ // Check components
+ if (context.gpu_language() == GpuLanguage::OpenCL)
+ {
+ const auto cl_compile_ctx = context.cl_compile_context();
+ ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr);
+
+ // Validate Component
+ {
+ const KernelProperties properties =
+ IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+
+ ArgumentPack<ITensorInfo> arguments;
+ arguments.add_const_tensor(ACL_SRC_0, src);
+ arguments.add_const_tensor(ACL_DST_0, &dst_info_to_validate);
+ ARM_COMPUTE_RETURN_ON_ERROR(ClComponentPool2d::validate(properties, arguments, attributes, settings));
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_MSG("Unimplemented Gpu language");
+ }
+ return Status{};
+}
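+
+// A minimal attribute sketch for the checks above (the fluent setters on Pool2dAttributes are
+// assumed from the attributes header; the values are illustrative):
+//
+//   Pool2dAttributes pool_attr{};
+//   pool_attr.pool_type(PoolingType::MAX).pool_size(Size2D(2, 2)).stride(Size2D(2, 2));
+//   pool_attr.exclude_padding(true); // required: is_supported_op() rejects exclude_padding == false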
+
+ITensorInfo *GpuPool2d::create_op(GpuWorkloadSketch &sketch,
+ ITensorInfo *src,
+ const Pool2dAttributes &attributes,
+ const GpuPool2dSettings &settings)
+{
+ // Assert validation
+ ARM_COMPUTE_ERROR_THROW_ON(GpuPool2d::validate_op(sketch, src, attributes, settings));
+ ARM_COMPUTE_LOG_PARAMS(src, attributes, settings);
+
+ ITensorInfo *dst = sketch.implementation().create_virtual_tensor();
+ ARM_COMPUTE_ERROR_ON_NULLPTR(dst);
+
+ // Auto initialize dst tensor
+ calculate_and_init_dst_if_empty(dst, src, attributes, settings);
+
+ // Translate into components and add to component graph
+ auto &comp_graph = sketch.implementation().component_graph();
+
+ const auto sketch_ctx = sketch.implementation().context();
+
+ if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+ {
+ const auto cl_compile_ctx = sketch_ctx->cl_compile_context();
+ ARM_COMPUTE_UNUSED(cl_compile_ctx);
+ ARM_COMPUTE_ERROR_ON(cl_compile_ctx == nullptr);
+
+ // Add Component
+ {
+ auto properties = IGpuKernelComponent::Properties();
+ properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+
+ ArgumentPack<ITensorInfo> arguments;
+ arguments.add_const_tensor(ACL_SRC_0, src);
+ arguments.add_const_tensor(ACL_DST_0, dst);
+ comp_graph.add_new_component<ClComponentPool2d>(properties, arguments, attributes, settings);
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unimplemented Gpu language");
+ }
+
+ // Set up fusion test by adding to the Operator Group
+ // Note this has to be performed after all the components have been successfully added to the component graph
+
+ // Pack tensor infos
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC_0, src);
+ tensors.add_tensor(ACL_DST_0, dst);
+
+ const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+ sketch.implementation().operator_group().add_operator(op);
+
+ return dst;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuReshape.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuReshape.cpp
new file mode 100644
index 0000000000..0e1f16e8ff
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuReshape.cpp
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuReshape.h"
+
+#include "arm_compute/core/Error.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+Status is_supported_op_helper(const GpuWorkloadContext &context,
+ const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const ReshapeAttributes &attributes)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
+
+ TensorInfo dst_info_to_validate;
+ const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate;
+
+ if (dst != nullptr)
+ {
+ dst_info_to_validate_ptr = dst;
+ }
+
+ auto_init_if_empty(dst_info_to_validate, src->clone()->set_tensor_shape(attributes.shape()));
+
+ // Check components
+ if (context.gpu_language() == GpuLanguage::OpenCL)
+ {
+ const auto cl_compile_ctx = context.cl_compile_context();
+ ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr);
+
+ // Validate GpuReshape Component
+ ArgumentPack<ITensorInfo> arguments;
+ arguments.add_const_tensor(ACL_SRC_0, src);
+ arguments.add_const_tensor(ACL_DST_0, dst_info_to_validate_ptr);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(ClComponentReshape::validate(arguments));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_MSG("Unimplemented Gpu language");
+ }
+
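+    // Note: regardless of the component validation above, this helper currently reports GpuReshape
+    // as unsupported by returning an error status, so is_supported_op() and validate_op() always fail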
+ return Status{ErrorCode::RUNTIME_ERROR, "GpuReshape is not Supported"};
+}
+
+constexpr GpuOperatorType operator_type = GpuOperatorType::Complex;
+} // namespace
+
+Status
+GpuReshape::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const Attributes &attributes)
+{
+ return is_supported_op_helper(context, src, nullptr, attributes);
+}
+
+Status GpuReshape::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const Attributes &attributes)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
+ ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id());
+
+ // Refer to GpuConv2d::validate_op() for id-validness of this TensorInfo object
+ TensorInfo dst_info_to_validate;
+
+ // Auto initialize dst tensor info
+ auto_init_if_empty(dst_info_to_validate, src->clone()->set_tensor_shape(attributes.shape()));
+
+ // Perform fusion test
+ // Pack tensor infos
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC_0, src);
+ tensors.add_const_tensor(ACL_DST_0, &dst_info_to_validate);
+ const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!sketch.implementation().operator_group().try_add_operator(op),
+ "Operator fusion test failed. This operator cannot be fused into the workload");
+
+ // Check if configuration is supported
+ return is_supported_op_helper(*sketch.gpu_context(), src, &dst_info_to_validate, attributes);
+}
+
+ITensorInfo *GpuReshape::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, const Attributes &attributes)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src);
+ ARM_COMPUTE_LOG_PARAMS(src, attributes.shape());
+ ARM_COMPUTE_ERROR_THROW_ON(GpuReshape::validate_op(sketch, src, attributes));
+
+ ITensorInfo *dst = sketch.implementation().create_virtual_tensor();
+ ARM_COMPUTE_ERROR_ON_NULLPTR(dst);
+
+ auto_init_if_empty(*dst, src->clone()->set_tensor_shape(attributes.shape()));
+
+ // Translate into components and add to component graph
+ auto &comp_graph = sketch.implementation().component_graph();
+ const auto sketch_ctx = sketch.implementation().context();
+ if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+ {
+ const auto cl_compile_ctx = sketch_ctx->cl_compile_context();
+ ARM_COMPUTE_UNUSED(cl_compile_ctx);
+ ARM_COMPUTE_ERROR_ON(cl_compile_ctx == nullptr);
+
+        // Add Reshape Component
+ {
+ auto properties = IGpuKernelComponent::Properties();
+ properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+
+ ArgumentPack<ITensorInfo> arguments;
+ arguments.add_const_tensor(ACL_SRC_0, src);
+ arguments.add_const_tensor(ACL_DST_0, dst);
+ comp_graph.add_new_component<ClComponentReshape>(properties, arguments);
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unimplemented Gpu language");
+ }
+ // Set up fusion test by adding to the Operator Group
+ // Note this has to be performed after all the components have been successfully added to the component graph
+
+ // Pack tensor infos
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC_0, src);
+ tensors.add_tensor(ACL_DST_0, dst);
+
+ const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+ sketch.implementation().operator_group().add_operator(op);
+
+ return dst;
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuResize.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuResize.cpp
new file mode 100644
index 0000000000..8e794c88b2
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuResize.cpp
@@ -0,0 +1,195 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuResize.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+void calculate_and_init_dst_if_empty(ITensorInfo *dst, const ITensorInfo *src, const ResizeAttributes &attributes)
+{
+ if (dst->total_size() == 0U)
+ {
+ TensorShape out_shape = src->tensor_shape();
+
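+ // With the NHWC layout required by this operator, index 1 is the width and index 2 the height dimension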
+ out_shape.set(1, attributes.output_width());
+ out_shape.set(2, attributes.output_height());
+
+ auto_init_if_empty(*dst, src->clone()->set_tensor_shape(out_shape));
+ }
+}
+
+Status is_supported_op_helper(const GpuWorkloadContext &context,
+ const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const ResizeAttributes &attributes)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
+ TensorInfo dst_info_to_validate;
+ const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate;
+
+ if (dst != nullptr)
+ {
+ dst_info_to_validate_ptr = dst;
+ }
+
+ calculate_and_init_dst_if_empty(&dst_info_to_validate, src, attributes);
+
+ // Check support level
+ // Data type
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+ // Data layout
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC);
+ // Interpolation policy
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(attributes.interpolation_policy() != InterpolationPolicy::NEAREST_NEIGHBOR &&
+ attributes.interpolation_policy() != InterpolationPolicy::BILINEAR,
+ "Interpolation policy must be NEAREST_NEIGHBOR or BILINEAR");
+
+ // Check components
+ if (context.gpu_language() == GpuLanguage::OpenCL)
+ {
+ const auto cl_compile_ctx = context.cl_compile_context();
+ ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr);
+
+ // Validate Resize Component
+ {
+ const KernelProperties properties =
+ IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+
+ ArgumentPack<ITensorInfo> arguments;
+ arguments.add_const_tensor(ACL_SRC_0, src);
+ arguments.add_const_tensor(ACL_DST_0, dst_info_to_validate_ptr);
+ ARM_COMPUTE_RETURN_ON_ERROR(ClComponentResize::validate(properties, arguments, attributes));
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_MSG("Unimplemented Gpu language");
+ }
+
+ return Status{};
+}
+
+constexpr GpuOperatorType operator_type = GpuOperatorType::Complex;
+} // namespace
+
+Status
+GpuResize::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const Attributes &attributes)
+{
+ return is_supported_op_helper(context, src, nullptr, attributes);
+}
+
+Status
+GpuResize::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const GpuResize::Attributes &attributes)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
+ ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id());
+
+ // Refer to GpuConv2d::validate_op() for id-validness of this TensorInfo object
+ TensorInfo dst_info_to_validate;
+
+ // Auto initialize dst tensor info
+ calculate_and_init_dst_if_empty(&dst_info_to_validate, src, attributes);
+
+ // Perform fusion test
+ // Pack tensor infos
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC_0, src);
+ tensors.add_const_tensor(ACL_DST_0, &dst_info_to_validate);
+ const Operator op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!sketch.implementation().operator_group().try_add_operator(op),
+ "Operator fusion test failed. This operator cannot be fused into the workload");
+
+ // Check if configuration is supported
+ return is_supported_op_helper(*sketch.gpu_context(), src, &dst_info_to_validate, attributes);
+}
+
+ITensorInfo *GpuResize::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, const GpuResize::Attributes &attributes)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src);
+ ARM_COMPUTE_LOG_PARAMS(src, attributes);
+ ARM_COMPUTE_ERROR_THROW_ON(GpuResize::validate_op(sketch, src, attributes));
+
+ ITensorInfo *dst = sketch.implementation().create_virtual_tensor();
+ ARM_COMPUTE_ERROR_ON_NULLPTR(dst);
+
+ // Auto initialize dst tensor info if empty
+ calculate_and_init_dst_if_empty(dst, src, attributes);
+
+ // Translate into components and add to component graph
+ GpuKernelComponentGraph &comp_graph = sketch.implementation().component_graph();
+ const auto *sketch_ctx = sketch.implementation().context();
+
+ if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR(sketch_ctx->cl_compile_context());
+
+ // Add Resize Component
+ {
+ const auto properties =
+ IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+
+ ArgumentPack<ITensorInfo> arguments;
+ arguments.add_const_tensor(ACL_SRC_0, src);
+ arguments.add_const_tensor(ACL_DST_0, dst);
+ comp_graph.add_new_component<ClComponentResize>(properties, arguments, attributes);
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unimplemented Gpu language");
+ }
+
+ // Set up fusion test by adding to the Operator Group
+ // Note this has to be performed after all the components have been successfully added to the component graph
+
+ // Pack tensor infos
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC_0, src);
+ tensors.add_const_tensor(ACL_DST_0, dst);
+
+ const Operator op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+ sketch.implementation().operator_group().add_operator(op);
+
+ return dst;
+}
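+
+// Illustrative attribute set-up only (assuming the usual fluent setters matching the accessors
+// used above; not part of the compiled operator):
+//
+//   ResizeAttributes attributes;
+//   attributes.output_width(out_w).output_height(out_h).interpolation_policy(InterpolationPolicy::BILINEAR);
+//   ITensorInfo *dst_info = GpuResize::create_op(sketch, src_info, attributes);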
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.cpp
new file mode 100644
index 0000000000..a2260c8c36
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.cpp
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.h"
+
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+Status is_supported_op_helper(const GpuWorkloadContext &context, const ITensorInfo *src, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+
+ TensorInfo dst_info_to_validate;
+ const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate;
+
+ if (dst != nullptr)
+ {
+ dst_info_to_validate_ptr = dst;
+ }
+
+ auto_init_if_empty(dst_info_to_validate, *src->clone());
+
+ const ClComponentActivation::Attributes act_info{ActivationLayerInfo::ActivationFunction::LOGISTIC};
+
+ // Check components
+ if (context.gpu_language() == GpuLanguage::OpenCL)
+ {
+ // Validate Activation Component
+ const auto properties =
+ IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+
+ ArgumentPack<ITensorInfo> arguments;
+ arguments.add_const_tensor(ACL_SRC, src);
+ arguments.add_const_tensor(ACL_DST, dst_info_to_validate_ptr);
+ ARM_COMPUTE_RETURN_ON_ERROR(ClComponentActivation::validate(properties, arguments, act_info));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_MSG("Unimplemented Gpu language");
+ }
+ return Status{};
+}
+
+constexpr GpuOperatorType operator_type = GpuOperatorType::Simple;
+} // namespace
+
+Status GpuSigmoid::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src)
+{
+ return is_supported_op_helper(context, src, nullptr);
+}
+
+Status GpuSigmoid::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
+
+ // Check if tensors have valid id, i.e. they are created from a sketch
+ ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id());
+
+ // Refer to GpuConv2d::validate_op() for id-validness of this TensorInfo object
+ TensorInfo dst_info_to_validate;
+
+ // Auto initialize dst tensor info
+ auto_init_if_empty(dst_info_to_validate, *src->clone());
+
+ // Perform fusion test to check if the operator meets fusion constraints
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC, src);
+ tensors.add_const_tensor(ACL_DST, &dst_info_to_validate);
+ const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!sketch.implementation().operator_group().try_add_operator(op),
+ "Operator fusion test failed. This operator cannot be fused into the workload");
+
+ // Check if configuration is supported
+ return is_supported_op_helper(*sketch.gpu_context(), src, &dst_info_to_validate);
+}
+
+ITensorInfo *GpuSigmoid::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src);
+ ARM_COMPUTE_LOG_PARAMS(src);
+ ARM_COMPUTE_ERROR_THROW_ON(GpuSigmoid::validate_op(sketch, src));
+
+ ITensorInfo *dst = sketch.implementation().create_virtual_tensor();
+ ARM_COMPUTE_ERROR_ON_NULLPTR(dst);
+
+ // Auto initialize dst tensor
+ auto_init_if_empty(*dst, *src->clone());
+
+ // Translate into components and add to component graph
+ GpuKernelComponentGraph &comp_graph = sketch.implementation().component_graph();
+
+ const ClComponentActivation::Attributes act_info{ActivationLayerInfo::ActivationFunction::LOGISTIC};
+
+ const auto *const sketch_ctx = sketch.implementation().context();
+
+ if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+ {
+ // Add Activation Component
+ auto properties = IGpuKernelComponent::Properties();
+ properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+
+ ArgumentPack<ITensorInfo> arguments;
+ arguments.add_const_tensor(ACL_SRC, src);
+ arguments.add_const_tensor(ACL_DST, dst);
+ comp_graph.add_new_component<ClComponentActivation>(properties, arguments, act_info);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unimplemented Gpu language");
+ }
+
+ // Set up fusion test by adding to the Operator Group
+ // Note this has to be performed after all the components have been successfully added to the component graph
+
+ // Pack tensor infos
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC, src);
+ tensors.add_const_tensor(ACL_DST, dst);
+
+ const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+ sketch.implementation().operator_group().add_operator(op);
+
+ return dst;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.cpp
new file mode 100644
index 0000000000..d385752201
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.cpp
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.h"
+
+#include "arm_compute/core/Error.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuOperatorProperties.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+GpuOperatorType operator_type = GpuOperatorType::Unfusable;
+} // namespace
+
+Status GpuSoftmax::is_supported_op(const GpuWorkloadContext &context,
+ const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const Attributes &attributes)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ TensorInfo dst_info_to_validate;
+
+ // Auto initialize dst tensor info
+ if (dst != nullptr)
+ {
+ dst_info_to_validate = *dst;
+ }
+ else
+ {
+ auto_init_if_empty(dst_info_to_validate, *src->clone());
+ }
+ // Check components
+ if (context.gpu_language() == GpuLanguage::OpenCL)
+ {
+ const auto cl_compile_ctx = context.cl_compile_context();
+ ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr);
+ const KernelProperties properties =
+ IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+
+ TensorShape logits_sum_shape = src->tensor_shape();
+ TensorInfo logits(src->clone()->set_tensor_shape(logits_sum_shape));
+
+ // The sum tensor only needs one element along dim0
+ logits_sum_shape.set(0, 1);
+ TensorInfo sum(src->clone()->set_tensor_shape(logits_sum_shape));
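+
+ // The softmax is intended to be lowered into two components: one producing the exponentiated
+ // logits and their per-row sum, and one normalising the logits by that sum, hence the two
+ // argument packs below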
+
+ // Validate Component
+ ArgumentPack<ITensorInfo> arguments_exp_sum;
+ ArgumentPack<ITensorInfo> arguments_norm;
+
+ arguments_exp_sum.add_const_tensor(ACL_SRC_0, src);
+ arguments_exp_sum.add_const_tensor(ACL_DST_0, &sum);
+ arguments_exp_sum.add_const_tensor(ACL_DST_1, &logits);
+
+ arguments_norm.add_const_tensor(ACL_SRC_0, &logits);
+ arguments_norm.add_const_tensor(ACL_SRC_1, &sum);
+ arguments_norm.add_const_tensor(ACL_DST_0, &dst_info_to_validate);
+
+ ARM_COMPUTE_UNUSED(properties, attributes);
+ return Status(ErrorCode::RUNTIME_ERROR, "GpuSoftmax is not implemented");
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_MSG("Unimplemented Gpu language");
+ }
+
+ return Status{ErrorCode::RUNTIME_ERROR, "GpuSoftmax is not supported"};
+}
+
+Status GpuSoftmax::validate_op(const GpuWorkloadSketch &sketch,
+ const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const Attributes &attributes)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id() || !dst->has_valid_id());
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->num_dimensions() > 4, "Only up to 4 dimensions are supported");
+ ARM_COMPUTE_RETURN_ERROR_ON(attributes.axis() < static_cast<int32_t>(-src->num_dimensions()) ||
+ static_cast<int32_t>(src->num_dimensions()) <= attributes.axis());
+
+ // Auto initialize dst tensor info
+ TensorInfo dst_info_to_validate = *dst;
+ auto_init_if_empty(dst_info_to_validate, *src->clone());
+
+ const size_t actual_axis =
+ static_cast<size_t>(wrap_around(attributes.axis(), static_cast<int32_t>(src->num_dimensions())));
+ const bool needs_permute = actual_axis != 0;
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(needs_permute, "Dynamic fusion softmax on axis!=0 not supported yet.");
+
+ // Perform fusion test and check if the operator meets the fusion constraints
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC_0, src);
+ tensors.add_const_tensor(ACL_DST_0, &dst_info_to_validate);
+
+ const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!sketch.implementation().operator_group().try_add_operator(op),
+ "Operator fusion test failed. This operator cannot be fused into the workload");
+
+ // Check if configuration is supported
+ return is_supported_op(*sketch.gpu_context(), src, &dst_info_to_validate, attributes);
+}
+
+void GpuSoftmax::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, ITensorInfo *dst, const Attributes &attributes)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_LOG_PARAMS(src, dst, attributes);
+ TensorShape logits_sum_shape = src->tensor_shape();
+ ITensorInfo *logits = sketch.implementation().create_auxiliary_tensor(
+ src->clone()->set_id(ITensorInfo::invalid_tensor_id).set_tensor_shape(logits_sum_shape));
+ logits_sum_shape.set(0, 1);
+ ITensorInfo *sum = sketch.implementation().create_auxiliary_tensor(
+ src->clone()->set_id(ITensorInfo::invalid_tensor_id).set_tensor_shape(logits_sum_shape));
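+ // The cloned infos are given the invalid id so that the sketch assigns fresh ids to these
+ // auxiliary tensors (assumed behaviour of create_auxiliary_tensor())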
+
+ // Auto initialize dst tensor info and the auxiliary tensor infos as well
+ auto_init_if_empty(*dst, *src->clone());
+
+ // Assert validation
+ ARM_COMPUTE_ERROR_THROW_ON(GpuSoftmax::validate_op(sketch, src, dst, attributes));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(logits, sum);
+
+ // Translate into components and add to component graph
+ auto &comp_graph = sketch.implementation().component_graph();
+ const auto sketch_ctx = sketch.implementation().context();
+
+ if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+ {
+ const auto cl_compile_ctx = sketch_ctx->cl_compile_context();
+ ARM_COMPUTE_UNUSED(cl_compile_ctx);
+ ARM_COMPUTE_ERROR_ON(cl_compile_ctx == nullptr);
+
+ // Add Softmax Components
+ {
+ auto properties = IGpuKernelComponent::Properties();
+ properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+
+ ArgumentPack<ITensorInfo> arguments_exp_sum;
+ ArgumentPack<ITensorInfo> arguments_norm;
+
+ arguments_exp_sum.add_const_tensor(ACL_SRC_0, src);
+ arguments_exp_sum.add_const_tensor(ACL_DST_0, sum);
+ arguments_exp_sum.add_const_tensor(ACL_DST_1, logits);
+
+ arguments_norm.add_const_tensor(ACL_SRC_0, logits);
+ arguments_norm.add_const_tensor(ACL_SRC_1, sum);
+ arguments_norm.add_const_tensor(ACL_DST_0, dst);
+
+ // Add to component graph -- NOT IMPLEMENTED
+ ARM_COMPUTE_UNUSED(comp_graph, attributes);
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unimplemented Gpu language");
+ }
+
+ // Set up fusion test by adding to the Operator Group
+ // Note this has to be performed after all the components have been successfully added to the component graph
+
+ // Pack tensor infos
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC_0, src);
+ tensors.add_const_tensor(ACL_DST_0, dst);
+
+ const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+ sketch.implementation().operator_group().add_operator(op);
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuSub.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuSub.cpp
new file mode 100644
index 0000000000..c53453a15c
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuSub.cpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSub.h"
+
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+
+#include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+Status GpuSub::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *lhs, const ITensorInfo *rhs)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->data_type() != rhs->data_type(), "Input tensors must be the same data type");
+
+ // Set the elementwise operation to Sub then call the elementwise common validate_op
+ ElementwiseBinaryCommonAttributes common_attributes{};
+ common_attributes.operation(ElementwiseBinaryCommonAttributes::ElementwiseOp::Sub);
+ return GpuElementwiseBinaryCommon::validate_op(sketch, lhs, rhs, common_attributes);
+}
+
+Status GpuSub::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *lhs, const ITensorInfo *rhs)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->data_type() != rhs->data_type(), "Input tensors must be the same data type");
+
+ // Set the elementwise operation to Sub then call the elementwise common is_supported_op
+ ElementwiseBinaryCommonAttributes common_attributes{};
+ common_attributes.operation(ElementwiseBinaryCommonAttributes::ElementwiseOp::Sub);
+ return GpuElementwiseBinaryCommon::is_supported_op(context, lhs, rhs, common_attributes);
+}
+
+ITensorInfo *GpuSub::create_op(GpuWorkloadSketch &sketch, ITensorInfo *lhs, ITensorInfo *rhs)
+{
+ // No need to log or validate as they'll be handled inside GpuElementwiseBinaryCommon::create_op()
+ // Set the elementwise operation to Sub then call the elementwise common create_op
+ ElementwiseBinaryCommonAttributes common_attributes{};
+ common_attributes.operation(ElementwiseBinaryCommonAttributes::ElementwiseOp::Sub);
+ return GpuElementwiseBinaryCommon::create_op(sketch, lhs, rhs, common_attributes);
+}
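+
+// Design note: the arithmetic front-ends (e.g. GpuAdd, GpuSub) are thin wrappers that select the
+// corresponding ElementwiseBinaryCommonAttributes::ElementwiseOp and delegate validation, support
+// checks and operator creation to GpuElementwiseBinaryCommon.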
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuTanh.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuTanh.cpp
new file mode 100644
index 0000000000..b9d01966b3
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuTanh.cpp
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuTanh.h"
+
+#include "arm_compute/core/experimental/Types.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+Status is_supported_op_helper(const GpuWorkloadContext &context, const ITensorInfo *src, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+
+ TensorInfo dst_info_to_validate;
+ const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate;
+
+ if (dst != nullptr)
+ {
+ dst_info_to_validate_ptr = dst;
+ }
+
+ auto_init_if_empty(dst_info_to_validate, *src->clone());
+
+ const ClComponentActivation::Attributes act_info{ActivationLayerInfo::ActivationFunction::TANH};
+
+ // Check components
+ if (context.gpu_language() == GpuLanguage::OpenCL)
+ {
+ // Validate Activation Component
+ const auto properties =
+ IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+
+ ArgumentPack<ITensorInfo> arguments;
+ arguments.add_const_tensor(ACL_SRC, src);
+ arguments.add_const_tensor(ACL_DST, dst_info_to_validate_ptr);
+ ARM_COMPUTE_RETURN_ON_ERROR(ClComponentActivation::validate(properties, arguments, act_info));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_MSG("Unimplemented Gpu language");
+ }
+ return Status{};
+}
+
+constexpr GpuOperatorType operator_type = GpuOperatorType::Simple;
+} // namespace
+
+Status GpuTanh::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src)
+{
+ return is_supported_op_helper(context, src, nullptr);
+}
+
+Status GpuTanh::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
+
+ // Check if tensors have valid id, i.e. they are created from a sketch
+ ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id());
+
+ // Refer to GpuConv2d::validate_op() for id-validness of this TensorInfo object
+ TensorInfo dst_info_to_validate;
+
+ // Auto initialize dst tensor info
+ auto_init_if_empty(dst_info_to_validate, *src->clone());
+
+ // Perform fusion test to check if the operator meets fusion constraints
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC, src);
+ tensors.add_const_tensor(ACL_DST, &dst_info_to_validate);
+ const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!sketch.implementation().operator_group().try_add_operator(op),
+ "Operator fusion test failed. This operator cannot be fused into the workload");
+
+ // Check if configuration is supported
+ return is_supported_op_helper(*sketch.gpu_context(), src, &dst_info_to_validate);
+}
+
+ITensorInfo *GpuTanh::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src);
+ ARM_COMPUTE_LOG_PARAMS(src);
+ ARM_COMPUTE_ERROR_THROW_ON(GpuTanh::validate_op(sketch, src));
+
+ ITensorInfo *dst = sketch.implementation().create_virtual_tensor();
+ ARM_COMPUTE_ERROR_ON_NULLPTR(dst);
+
+ // Auto initialize dst tensor
+ auto_init_if_empty(*dst, *src->clone());
+
+ // Translate into components and add to component graph
+ GpuKernelComponentGraph &comp_graph = sketch.implementation().component_graph();
+
+ const ClComponentActivation::Attributes act_info{ActivationLayerInfo::ActivationFunction::TANH};
+
+ const auto *const sketch_ctx = sketch.implementation().context();
+
+ if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+ {
+ // Add Activation Component
+ auto properties = IGpuKernelComponent::Properties();
+ properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+
+ ArgumentPack<ITensorInfo> arguments;
+ arguments.add_const_tensor(ACL_SRC, src);
+ arguments.add_const_tensor(ACL_DST, dst);
+ comp_graph.add_new_component<ClComponentActivation>(properties, arguments, act_info);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unimplemented Gpu language");
+ }
+
+ // Set up fusion test by adding to the Operator Group
+ // Note this has to be performed after all the components have been successfully added to the component graph
+
+ // Pack tensor infos
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC, src);
+ tensors.add_const_tensor(ACL_DST, dst);
+
+ const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+ sketch.implementation().operator_group().add_operator(op);
+
+ return dst;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.cpp b/src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.cpp
new file mode 100644
index 0000000000..d79a4c42c9
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.cpp
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+void calculate_and_init_dst_if_empty(ITensorInfo *dst, const ITensorInfo *lhs, const ITensorInfo *rhs)
+{
+ if (dst->total_size() == 0U)
+ {
+ const std::pair<TensorShape, ValidRegion> broadcast_pair =
+ ITensorInfo::broadcast_shape_and_valid_region(*lhs, *rhs);
+ auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(broadcast_pair.first));
+ }
+}
+
+Status is_supported_op_helper(const GpuWorkloadContext &context,
+ const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *dst,
+ const ElementwiseBinaryCommonAttributes &attributes)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs);
+
+ TensorInfo dst_info_to_validate;
+ const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate;
+
+ if (dst != nullptr)
+ {
+ dst_info_to_validate_ptr = dst;
+ }
+
+ calculate_and_init_dst_if_empty(&dst_info_to_validate, lhs, rhs);
+
+ // Check components
+ if (context.gpu_language() == GpuLanguage::OpenCL)
+ {
+ const auto cl_compile_ctx = context.cl_compile_context();
+ ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr);
+
+ // Validate ElementwiseBinary Component
+ {
+ ArgumentPack<ITensorInfo> arguments;
+ arguments.add_const_tensor(ACL_SRC_0, lhs);
+ arguments.add_const_tensor(ACL_SRC_1, rhs);
+ arguments.add_const_tensor(ACL_DST_0, dst_info_to_validate_ptr);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(ClComponentElementwiseBinary::validate(arguments, attributes));
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_MSG("Unimplemented Gpu language");
+ }
+
+ return Status{};
+}
+
+GpuOperatorType operator_type = GpuOperatorType::Simple;
+} // namespace
+
+ElementwiseBinaryCommonAttributes &
+ElementwiseBinaryCommonAttributes::operation(const ElementwiseBinaryCommonAttributes::ElementwiseOp &operation)
+{
+ _operation = operation;
+ return *this;
+}
+
+ElementwiseBinaryCommonAttributes::ElementwiseOp ElementwiseBinaryCommonAttributes::operation() const
+{
+ return _operation;
+}
+
+Status GpuElementwiseBinaryCommon::is_supported_op(const GpuWorkloadContext &context,
+ const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ElementwiseBinaryCommonAttributes &attributes)
+{
+ return is_supported_op_helper(context, lhs, rhs, nullptr, attributes);
+}
+
+Status GpuElementwiseBinaryCommon::validate_op(const GpuWorkloadSketch &sketch,
+ const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ElementwiseBinaryCommonAttributes &attributes)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs);
+ ARM_COMPUTE_RETURN_ERROR_ON(!lhs->has_valid_id() || !rhs->has_valid_id());
+
+ // Refer to GpuConv2d::validate_op() for id-validness of this TensorInfo object
+ TensorInfo dst_info_to_validate;
+
+ // Auto initialize dst tensor info
+ calculate_and_init_dst_if_empty(&dst_info_to_validate, lhs, rhs);
+
+ // Perform fusion test
+ // Pack tensor infos
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC_0, lhs);
+ tensors.add_const_tensor(ACL_SRC_1, rhs);
+ tensors.add_const_tensor(ACL_DST_0, &dst_info_to_validate);
+ const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!sketch.implementation().operator_group().try_add_operator(op),
+ "Operator fusion test failed. This operator cannot be fused into the workload");
+
+ // Check if configuration is supported
+ return is_supported_op_helper(*sketch.gpu_context(), lhs, rhs, &dst_info_to_validate, attributes);
+}
+
+ITensorInfo *GpuElementwiseBinaryCommon::create_op(GpuWorkloadSketch &sketch,
+ ITensorInfo *lhs,
+ ITensorInfo *rhs,
+ const ElementwiseBinaryCommonAttributes &attributes)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs);
+ ARM_COMPUTE_LOG_PARAMS(lhs, rhs);
+ ARM_COMPUTE_ERROR_THROW_ON(GpuElementwiseBinaryCommon::validate_op(sketch, lhs, rhs, attributes));
+
+ ITensorInfo *dst = sketch.implementation().create_virtual_tensor();
+ ARM_COMPUTE_ERROR_ON_NULLPTR(dst);
+
+ // Auto initialize dst tensor
+ calculate_and_init_dst_if_empty(dst, lhs, rhs);
+
+ // Translate into components and add to component graph
+ auto &comp_graph = sketch.implementation().component_graph();
+
+ const auto sketch_ctx = sketch.implementation().context();
+
+ if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR(sketch_ctx->cl_compile_context());
+
+ // Add ElementwiseBinary Component
+ {
+ auto properties = IGpuKernelComponent::Properties();
+ properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+
+ ArgumentPack<ITensorInfo> arguments;
+ arguments.add_const_tensor(ACL_SRC_0, lhs);
+ arguments.add_const_tensor(ACL_SRC_1, rhs);
+ arguments.add_const_tensor(ACL_DST_0, dst);
+ comp_graph.add_new_component<ClComponentElementwiseBinary>(properties, arguments, attributes);
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unimplemented Gpu language");
+ }
+
+ // Set up fusion test by adding to the Operator Group
+ // Note this has to be performed after all the components have been successfully added to the component graph
+
+ // Pack tensor infos
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC_0, lhs);
+ tensors.add_const_tensor(ACL_SRC_1, rhs);
+ tensors.add_tensor(ACL_DST_0, dst);
+ const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+ sketch.implementation().operator_group().add_operator(op);
+
+ return dst;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h b/src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h
new file mode 100644
index 0000000000..0b58b6eb96
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_INTERNAL_GPUELEMENTWISEBINARYCOMMON
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_INTERNAL_GPUELEMENTWISEBINARYCOMMON
+
+#include "arm_compute/core/Error.h"
+
+namespace arm_compute
+{
+/** Forward declaration */
+class ITensorInfo;
+
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class ElementwiseBinaryCommonAttributes
+{
+public:
+ enum class ElementwiseOp
+ {
+ Add, /**< (x + y) */
+ Sub, /**< (x - y) */
+ Div, /**< (x / y) */
+ Mul, /**< (x * y) */
+ Min, /**< Min(x, y) */
+ Max, /**< Max(x, y) */
+ SquaredDiff, /**< (x - y)^2 */
+ Power, /**< x ^ y */
+ Prelu, /**< y*x if x < 0, x otherwise */
+ };
+ /** Set operation*/
+ ElementwiseBinaryCommonAttributes &operation(const ElementwiseBinaryCommonAttributes::ElementwiseOp &operation);
+ /** Get operation*/
+ ElementwiseOp operation() const;
+
+private:
+ ElementwiseOp _operation; /**< Elementwise operation */
+};
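+
+/* Example (as used by GpuSub::create_op() above):
+ *
+ *   ElementwiseBinaryCommonAttributes attributes{};
+ *   attributes.operation(ElementwiseBinaryCommonAttributes::ElementwiseOp::Sub);
+ */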
+
+/** Forward declaration */
+class GpuWorkloadContext;
+class GpuWorkloadSketch;
+
+/** Operator interface. */
+class GpuElementwiseBinaryCommon final
+{
+public:
+ /** Create an operator and fuse it into the workload sketch.
+ * @note If @ref validate_op() fails, the creation also fails and may throw an error.
+ * @note If @ref validate_op() fails, @p sketch remains unchanged and valid.
+ *
+ * Valid data type configurations are checked at the operator level, e.g. GpuAdd::validate_op(), GpuSub::validate_op(), etc.
+ *
+ * Valid data layouts:
+ * - Any
+ *
+ * @param[in,out] sketch Workload sketch into which the operator will be fused
+ * @param[in] lhs Left hand side tensor info. Data types supported: U8/S16/S32/F16/F32.
+ * @param[in] rhs Right hand side tensor info. Data types supported: U8/S16/S32/F16/F32.
+ * @param[in] attributes ElementwiseBinaryCommonAttributes containing the operator type: ADD, SUB, DIV, ... etc.
+ *
+ * @return Pointer for the destination tensor info
+ */
+ static ITensorInfo *create_op(GpuWorkloadSketch &sketch,
+ ITensorInfo *lhs,
+ ITensorInfo *rhs,
+ const ElementwiseBinaryCommonAttributes &attributes);
+ /** Check if the operator configuration is supported, irrespective of fusion
+ *
+ * @param[in] context Workload context within which the operator is running
+ * @param[in] lhs Left hand side tensor info. Data types supported: U8/S16/S32/F16/F32.
+ * @param[in] rhs Right hand side tensor info. Data types supported: U8/S16/S32/F16/F32.
+ * @param[in] attributes ElementwiseBinaryCommonAttributes containing the operator type: ADD, SUB, DIV, ... etc.
+ *
+ * @return Status
+ */
+ static Status is_supported_op(const GpuWorkloadContext &context,
+ const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ElementwiseBinaryCommonAttributes &attributes);
+ /** Validate the operator and check if it can be fused into the workload sketch.
+ *
+ * Parameters are similar to @ref GpuElementwiseBinaryCommon::create_op()
+ *
+ * @return Status
+ */
+ static Status validate_op(const GpuWorkloadSketch &sketch,
+ const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ElementwiseBinaryCommonAttributes &attributes);
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_INTERNAL_GPUELEMENTWISEBINARYCOMMON */
diff --git a/src/dynamic_fusion/sketch/utils/DependencyGraph.h b/src/dynamic_fusion/sketch/utils/DependencyGraph.h
new file mode 100644
index 0000000000..c157c2b21c
--- /dev/null
+++ b/src/dynamic_fusion/sketch/utils/DependencyGraph.h
@@ -0,0 +1,648 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_UTILS_DEPENDENCYGRAPH
+#define SRC_DYNAMIC_FUSION_SKETCH_UTILS_DEPENDENCYGRAPH
+
+#include "arm_compute/core/Error.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <map>
+#include <set>
+#include <tuple>
+#include <vector>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+template <typename T>
+bool is_in(const T &v, const std::vector<T> &vec)
+{
+ return std::find(std::begin(vec), std::end(vec), v) != std::end(vec);
+}
+} // namespace
+
+/** A multi-input (tensors), multi-output (tensors) acyclic directed graph
+ * Represented as a doubly-linked adjacency list with the differentiation between source and destination
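+ *
+ * For example, a single operator op0 reading tensor t0 and writing tensor t1 is stored as:
+ *   _adj_src_tensors[op0] = {t0}, _adj_dst_tensors[op0] = {t1},
+ *   _adj_dst_ops[t0] = {op0}, _adj_src_ops[t1] = {op0}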
+ */
+class DependencyGraph
+{
+public:
+ using Id = int32_t;
+ using TensorId = Id;
+ using OperatorId = Id;
+ /** Adjacency list
+ *
+ */
+ using AdjList = std::map<Id, std::vector<Id>>;
+
+ /** A pack consisting of an operator and its input and output tensors, used when traversing the graph in topological order
+ *
+ */
+ struct OpPack
+ {
+ OperatorId op{};
+ std::vector<TensorId> inputs{};
+ std::vector<TensorId> outputs{};
+ friend bool operator==(const OpPack &opp0, const OpPack &opp1)
+ {
+ return std::make_tuple(opp0.op, opp0.inputs, opp0.outputs) ==
+ std::make_tuple(opp1.op, opp1.inputs, opp1.outputs);
+ }
+ };
+
+public:
+ DependencyGraph() = default;
+ friend std::ostream &operator<<(std::ostream &os, const DependencyGraph &);
+
+ /** Try adding an operator (without actually adding it), while keeping the graph as a "linear sequence" / list
+ *
+ * Rule: If the new operator is not the first operator, at least one input tensor must be
+ * the output tensor of the last non-output operator. All other input tensors must be
+ * global inputs of the graph (i.e. not the output of any operator).
+ *
+ * Rule: The output tensor of the new operator must not be the input tensor of any previously
+ * added operator.
+ *
+ * PRECONDITION: The current graph is already linear
+ *
+ * @return true If the operator can be added while keeping the graph as a linear sequence
+ * @return false Otherwise
+ */
+ bool try_add_operator_as_linear(OperatorId op,
+ const std::vector<TensorId> &inputs,
+ const std::vector<TensorId> &outputs,
+ bool is_output = false) const
+ {
+ ARM_COMPUTE_UNUSED(op, is_output);
+ if (all_ops().empty())
+ {
+ return true;
+ }
+
+ // If the new operator is not the first operator, at least one input tensor must be
+ // the output tensor of the last non-output operator. All other input tensors must be
+ // the global input of the graph (i.e. not the output of any operator).
+ if (_last_op_available)
+ {
+ auto use_input_from_last_op = false;
+
+ for (auto src_tensor : inputs)
+ {
+ const auto src_ops = _adj_src_ops.find(src_tensor);
+
+ if (src_ops != _adj_src_ops.end())
+ {
+ ARM_COMPUTE_ERROR_ON(src_ops->second.size() > 1);
+
+ if (!src_ops->second.empty())
+ {
+ const auto src_op = src_ops->second[0];
+
+ if (src_op == _last_op)
+ {
+ if (use_input_from_last_op)
+ {
+ // To be safe, we also forbid using the output tensor
+ // of the last operator twice.
+ return false;
+ }
+
+ use_input_from_last_op = true;
+ }
+ else
+ {
+ // The input tensor of this operator must not be the output tensor
+ // of any other operator except the last non-output operator.
+ return false;
+ }
+ }
+ }
+ }
+
+ if (!use_input_from_last_op)
+ {
+ // At least one input tensor must be the output tensor of the last non-output operator.
+ return false;
+ }
+ }
+
+ // The output tensor of the new operator must not be the input tensor of any previously
+ // added operator.
+ for (auto dst_tensor : outputs)
+ {
+ if (_adj_dst_ops.find(dst_tensor) != _adj_dst_ops.end())
+ {
+ return false;
+ }
+ }
+
+ return true;
+ }
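+ /* Example: given the linear graph t0 -> op0 -> t1, adding op1 with inputs {t1, t2} (where t2 is a
+ * new global input) and a fresh output t3 keeps the graph linear, whereas an operator that reads
+ * only t0, or one that writes to an already existing tensor such as t0 or t1, is rejected. */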
+ /** Add an operator, while keeping the graph as a "linear sequence"
+ *
+ * PRECONDITION: The current graph is already linear
+ * INVARIANT: The list can only grow from head to tail
+ * POSTCONDITION: The graph remains linear
+ */
+ void add_operator_as_linear(OperatorId op,
+ const std::vector<TensorId> &inputs,
+ const std::vector<TensorId> &outputs,
+ bool is_output = false)
+ {
+ const auto success = add_operator(op, inputs, outputs, is_output);
+ ARM_COMPUTE_UNUSED(success);
+ ARM_COMPUTE_ERROR_ON(!success);
+ }
+ /** Add a new operator
+ * Returns false if adding the operator would violate the DAG invariant;
+ * an invalid operation does not change the graph
+ *
+ * @param[in] op Operator to add
+ * @param[in] inputs Input tensors to the operator
+ * @param[in] outputs Output tensors to the operator
+ * @param[in] is_output Whether this is an output operator
+ */
+ bool add_operator(OperatorId op,
+ const std::vector<TensorId> &inputs,
+ const std::vector<TensorId> &outputs,
+ bool is_output = false)
+ {
+ if (operator_exists(op))
+ {
+ return false;
+ }
+ _adj_src_tensors[op] = {};
+ _adj_dst_tensors[op] = {};
+ for (auto in_tensor : inputs)
+ {
+ // Linking input tensor to operator node will never create a cycle / loop because we guarantee
+ // each op is newly created, so every <input, op> pair / edge is new
+ link_input(op, in_tensor);
+ }
+ for (auto out_tensor : outputs)
+ {
+ // If there exists a back path from op's output tensor to op already, then linking the two will create a loop / cycle
+ if (path_exists_from_tensor_to_op(out_tensor, op))
+ {
+ remove_operator(op);
+ return false;
+ }
+ else
+ {
+ link_output(op, out_tensor);
+ }
+ }
+
+ if (!is_output)
+ {
+ _last_op_available = true;
+ _last_op = op;
+ }
+
+ return true;
+ }
+
+ /** Build a sequence of operators from the acyclic graph of operators.
+ *
+ * The graph is visited using a depth-first strategy. An operator can only be added to
+ * the sequence once all the operators that supply its input tensors have been added; otherwise
+ * it is skipped and visited again later. In other words, the dependencies between operators
+ * are preserved in the sequence.
+ */
+ std::vector<OpPack> build_operators_sequence() const
+ {
+ std::vector<OpPack> ops_seq;
+ std::set<Id> done_ops;
+ std::set<Id> done_tensors;
+
+ const auto input_tensors = global_src_tensors();
+
+ for (auto tensor : input_tensors)
+ {
+ done_tensors.insert(tensor);
+
+ for (auto op : _adj_dst_ops.at(tensor))
+ {
+ build_operators_sequence_from_op(op, ops_seq, done_ops, done_tensors);
+ }
+ }
+
+ return ops_seq;
+ }
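+
+ /* Example: for the linear graph t0 -> op0 -> t1 -> op1 -> t2, the resulting sequence is
+ * { {op0, {t0}, {t1}}, {op1, {t1}, {t2}} }. */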
+
+ /** Strict equality comparison (all internal ids and order of insertion matter).
+ * In the future this may be replaced with a topological comparison, allowing equivalent graphs with different internal ids to be equal
+ *
+ *
+ * @param[in] g0
+ * @param[in] g1
+ * @return true If the same
+ * @return false Otherwise
+ */
+ friend bool operator==(const DependencyGraph &g0, const DependencyGraph &g1)
+ {
+ // Do not compare id allocators
+ return std::make_tuple(g0._adj_src_tensors, g0._adj_dst_tensors, g0._adj_src_ops, g0._adj_dst_ops) ==
+ std::make_tuple(g1._adj_src_tensors, g1._adj_dst_tensors, g1._adj_src_ops, g1._adj_dst_ops);
+ }
+ std::vector<OperatorId> src_ops_from_tensor(TensorId tensor) const
+ {
+ return _adj_src_ops.at(tensor);
+ }
+ std::vector<OperatorId> dst_ops_from_tensor(TensorId tensor) const
+ {
+ return _adj_dst_ops.at(tensor);
+ }
+ /** Get all tensors
+ *
+ * @return std::vector<TensorId>
+ */
+ std::vector<TensorId> all_tensors() const
+ {
+ std::vector<TensorId> tensors{};
+ std::transform(std::begin(_adj_src_ops), std::end(_adj_src_ops), std::back_inserter(tensors),
+ [](const auto &it) { return it.first; });
+ return tensors;
+ }
+ /** Get source tensors of the whole graph
+ *
+ * @return std::vector<TensorId>
+ */
+ std::vector<TensorId> global_src_tensors() const
+ {
+ std::vector<TensorId> tensors;
+ for (auto tensor_src_ops : _adj_src_ops)
+ {
+ if (tensor_src_ops.second.empty())
+ {
+ tensors.push_back(tensor_src_ops.first);
+ }
+ }
+ return tensors;
+ }
+ /** Get destination tensors of the whole graph
+ *
+ * @return std::vector<TensorId>
+ */
+ std::vector<TensorId> global_dst_tensors() const
+ {
+ std::vector<TensorId> tensors;
+ for (auto tensor_dst_ops : _adj_dst_ops)
+ {
+ if (tensor_dst_ops.second.empty())
+ {
+ tensors.push_back(tensor_dst_ops.first);
+ }
+ }
+ return tensors;
+ }
+ /** Get intermediate tensors of the whole graph.
+ *
+ * @return std::vector<TensorId>
+ */
+ std::vector<TensorId> intermediate_tensors() const
+ {
+ std::vector<TensorId> tensors;
+
+ // A tensor that connects the output of one operator to the input of another is not allocated
+ // in memory; it exists only as a temporary variable
+ for (auto src_tensor : _adj_src_ops)
+ {
+ if (!src_tensor.second.empty())
+ {
+ const auto dst_tensor = _adj_dst_ops.find(src_tensor.first);
+ if (dst_tensor != _adj_dst_ops.end())
+ {
+ if (!dst_tensor->second.empty())
+ {
+ tensors.push_back(src_tensor.first);
+ }
+ }
+ }
+ }
+
+ return tensors;
+ }
+ /** Get all root ops. Root ops can also be referred to as "src ops" of the whole graph
+ *
+ * @return std::vector<OperatorId>
+ */
+ std::vector<OperatorId> get_root_ops() const
+ {
+ std::vector<OperatorId> ops{};
+ const auto op_list = all_ops();
+
+ for (auto op : op_list)
+ {
+ if (src_ops(op).empty())
+ {
+ ops.emplace_back(op);
+ }
+ }
+ return ops;
+ }
+
+private:
+ void link_input(OperatorId op, TensorId in_tensor)
+ {
+ ARM_COMPUTE_ERROR_ON(!operator_exists(op));
+ if (!tensor_exists(in_tensor))
+ {
+ insert_new_tensor(in_tensor);
+ }
+ ARM_COMPUTE_ERROR_ON(are_connected(op, in_tensor)); // Prevent repetitive linking
+ _adj_src_tensors[op].push_back(in_tensor);
+ _adj_dst_ops[in_tensor].push_back(op);
+ }
+ void link_output(OperatorId op, TensorId out_tensor)
+ {
+ ARM_COMPUTE_ERROR_ON(!operator_exists(op));
+ if (!tensor_exists(out_tensor))
+ {
+ insert_new_tensor(out_tensor);
+ }
+ ARM_COMPUTE_ERROR_ON(are_connected(op, out_tensor)); // Prevent repetitive linking
+ _adj_dst_tensors[op].push_back(out_tensor);
+ _adj_src_ops[out_tensor].push_back(op);
+ }
+
+ std::vector<OperatorId> src_ops(OperatorId op) const
+ {
+ ARM_COMPUTE_ERROR_ON(!operator_exists(op));
+ std::vector<OperatorId> ops{};
+ for (TensorId src_tensor : src_tensors(op))
+ {
+ ops.insert(ops.end(), std::begin(_adj_src_ops.at(src_tensor)), std::end(_adj_src_ops.at(src_tensor)));
+ }
+ return ops;
+ }
+ std::vector<OperatorId> dst_ops(OperatorId op) const
+ {
+ ARM_COMPUTE_ERROR_ON(!operator_exists(op));
+ std::vector<OperatorId> ops{};
+ for (TensorId dst_tensor : _adj_dst_tensors.at(op))
+ {
+ ops.insert(ops.end(), std::begin(_adj_dst_ops.at(dst_tensor)), std::end(_adj_dst_ops.at(dst_tensor)));
+ }
+ return ops;
+ }
+
+ /** Get source tensors to an operator
+ *
+ * @param[in] op
+ * @return std::vector<TensorId>
+ */
+ std::vector<TensorId> src_tensors(OperatorId op) const
+ {
+ ARM_COMPUTE_ERROR_ON(!operator_exists(op));
+ return _adj_src_tensors.at(op);
+ }
+ /** Get destination tensors to an operator
+ *
+ * @param[in] op
+ * @return std::vector<TensorId>
+ */
+ std::vector<TensorId> dst_tensors(OperatorId op) const
+ {
+ ARM_COMPUTE_ERROR_ON(!operator_exists(op));
+ return _adj_dst_tensors.at(op);
+ }
+ /** Get all operators
+ *
+ * @return std::vector<OperatorId>
+ */
+ std::vector<OperatorId> all_ops() const
+ {
+ std::vector<OperatorId> ops{};
+ std::transform(std::begin(_adj_src_tensors), std::end(_adj_src_tensors), std::back_inserter(ops),
+ [](const auto &it) { return it.first; });
+ return ops;
+ }
+ /** Remove an operator from graph.
+ *
+ * @param[in] op
+ */
+ void remove_operator(OperatorId op)
+ {
+ for (auto src_tensor : _adj_src_tensors.at(op))
+ {
+ auto &dst_ops = _adj_dst_ops.at(src_tensor);
+ dst_ops.erase(std::remove(std::begin(dst_ops), std::end(dst_ops), op), std::end(dst_ops));
+ }
+ for (auto dst_tensor : _adj_dst_tensors.at(op))
+ {
+ auto &src_ops = _adj_src_ops.at(dst_tensor);
+ src_ops.erase(std::remove(std::begin(src_ops), std::end(src_ops), op), std::end(src_ops));
+ }
+ // Remove any isolated tensors
+ // An isolated tensor is one where both its _adj_src_ops and _adj_dst_ops are empty
+ for (auto t : all_tensors())
+ {
+ if (_adj_src_ops.at(t).empty() && _adj_dst_ops.at(t).empty())
+ {
+ _adj_src_ops.erase(t);
+ _adj_dst_ops.erase(t);
+ }
+ }
+ _adj_src_tensors.erase(op);
+ _adj_dst_tensors.erase(op);
+ }
+ void insert_new_tensor(TensorId tensor)
+ {
+ _adj_src_ops[tensor] = {};
+ _adj_dst_ops[tensor] = {};
+ }
+ bool tensor_exists(TensorId tensor) const
+ {
+ return _adj_src_ops.find(tensor) != _adj_src_ops.end() && _adj_dst_ops.find(tensor) != _adj_dst_ops.end();
+ }
+ bool operator_exists(OperatorId op) const
+ {
+ return _adj_src_tensors.find(op) != _adj_src_tensors.end() &&
+ _adj_dst_tensors.find(op) != _adj_dst_tensors.end();
+ }
+ bool is_src_tensor_of(OperatorId op, TensorId tensor) const
+ {
+ if (!operator_exists(op) || !tensor_exists(tensor))
+ {
+ return false;
+ }
+ const auto op_inputs = src_tensors(op);
+ return std::find(op_inputs.begin(), op_inputs.end(), tensor) != op_inputs.end();
+ }
+ bool is_dst_tensor_of(OperatorId op, TensorId tensor) const
+ {
+ if (!operator_exists(op) || !tensor_exists(tensor))
+ {
+ return false;
+ }
+ const auto op_outputs = dst_tensors(op);
+ return std::find(op_outputs.begin(), op_outputs.end(), tensor) != op_outputs.end();
+ }
+ bool are_connected(OperatorId op, TensorId tensor) const
+ {
+ return is_src_tensor_of(op, tensor) || is_dst_tensor_of(op, tensor);
+ }
+    /** Check if an operator is a destination (leaf) operator of the whole graph,
+     * i.e. none of its destination tensors are consumed by another operator
+     *
+     * @param[in] op Operator to check
+     *
+     * @return true if @p op is a destination operator, false otherwise
+     */
+ bool is_dst_op(OperatorId op) const
+ {
+ return dst_ops(op).empty();
+ }
+ std::vector<OperatorId> get_dst_ops() const
+ {
+ std::vector<OperatorId> ops{};
+ const auto op_list = all_ops();
+
+ for (auto op : op_list)
+ {
+ if (is_dst_op(op))
+ {
+ ops.emplace_back(op);
+ }
+ }
+ return ops;
+ }
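+    /** Check if a path exists in the graph from @p src_tensor to @p dst_op */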
+ bool path_exists_from_tensor_to_op(TensorId src_tensor, OperatorId dst_op) const
+ {
+ if (!tensor_exists(src_tensor) || !operator_exists(dst_op))
+ {
+ return false;
+ }
+ for (auto child_op : dst_ops_from_tensor(src_tensor))
+ {
+ if (path_exists_from_op_to_op(child_op, dst_op))
+ {
+ return true;
+ }
+ }
+ return false;
+ }
+
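+    /** Check if a path exists in the graph from @p src_op to @p dst_op (an operator is considered reachable from itself) */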
+ bool path_exists_from_op_to_op(OperatorId src_op, OperatorId dst_op) const
+ {
+ if (!operator_exists(src_op) || !operator_exists(dst_op))
+ {
+ return false;
+ }
+ if (src_op == dst_op)
+ {
+ return true;
+ }
+ if (is_in(src_op, get_dst_ops()))
+ {
+ return false;
+ }
+ for (auto child_tensor : dst_tensors(src_op))
+ {
+ if (path_exists_from_tensor_to_op(child_tensor, dst_op))
+ {
+ return true;
+ }
+ }
+ return false;
+ }
+
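+    /** Append to @p ops_seq the operators reachable from @p op, in a valid execution order
+     *
+     * An operator is appended only once all of its source tensors are in @p done_tensors; otherwise
+     * the visit returns and the operator is revisited later from one of its remaining source operators.
+     * Single-consumer chains are walked iteratively; the traversal recurses at fan-out points.
+     *
+     * @param[in]     op           Operator to start the traversal from
+     * @param[in,out] ops_seq      Sequence of operator packs built so far
+     * @param[in,out] done_ops     Operators already added to the sequence
+     * @param[in,out] done_tensors Tensors produced by the operators already in the sequence
+     */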
+ void build_operators_sequence_from_op(Id op,
+ std::vector<OpPack> &ops_seq,
+ std::set<Id> &done_ops,
+ std::set<Id> &done_tensors) const
+ {
+ while (true)
+ {
+ // If the operator has been added to the sequence, ignore it.
+ if (done_ops.find(op) != done_ops.end())
+ {
+ return;
+ }
+
+            // If not all the input tensors of the operator are available, this operator cannot be
+            // added to the sequence yet. It will be visited again once its remaining source
+            // operators have been added to the sequence.
+ const auto src_tensors = _adj_src_tensors.at(op);
+
+ for (auto src : src_tensors)
+ {
+ if (done_tensors.find(src) == done_tensors.end())
+ {
+ return;
+ }
+ }
+
+ // This operator is ready to be added to the sequence.
+ const auto dst_tensors = _adj_dst_tensors.at(op);
+
+ done_ops.insert(op);
+
+ OpPack pack{op, src_tensors, dst_tensors};
+ ops_seq.push_back(pack);
+
+ done_tensors.insert(dst_tensors.begin(), dst_tensors.end());
+
+            // Visit all the sink operators.
+            // A single destination tensor with a single consumer is followed iteratively by
+            // continuing the loop; any fan-out is handled by recursing into each consumer.
+ if (dst_tensors.size() == 1 && _adj_dst_ops.at(dst_tensors[0]).size() == 1)
+ {
+ op = _adj_dst_ops.at(dst_tensors[0])[0];
+ }
+ else
+ {
+ for (auto dst_tensor : dst_tensors)
+ {
+ const auto dst_ops = _adj_dst_ops.at(dst_tensor);
+
+ for (auto dst_op : dst_ops)
+ {
+ build_operators_sequence_from_op(dst_op, ops_seq, done_ops, done_tensors);
+ }
+ }
+
+ return;
+ }
+ }
+ }
+
+private:
+    AdjList _adj_src_tensors{}; // operator -> its source (input) tensors
+    AdjList _adj_dst_tensors{}; // operator -> its destination (output) tensors
+    AdjList _adj_src_ops{};     // tensor -> operators that write it
+    AdjList _adj_dst_ops{};     // tensor -> operators that read it
+
+ bool _last_op_available{false};
+ OperatorId _last_op{0};
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_UTILS_DEPENDENCYGRAPH */
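The DependencyGraph above keeps connectivity in four adjacency lists (operator to source tensors, operator to destination tensors, tensor to producing operators, tensor to consuming operators) and sequences operators with an "all source tensors available" rule. The standalone sketch below mirrors that layout and rule with hypothetical names and simplified bookkeeping; it is not the DependencyGraph API, only an illustration of the technique under those assumptions.

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <map>
    #include <set>
    #include <vector>

    using Id = int32_t;

    struct MiniGraph
    {
        std::map<Id, std::vector<Id>> op_to_src_tensors{}; // operator -> tensors it reads
        std::map<Id, std::vector<Id>> op_to_dst_tensors{}; // operator -> tensors it writes
        std::map<Id, std::vector<Id>> tensor_to_src_ops{}; // tensor   -> operators writing it
        std::map<Id, std::vector<Id>> tensor_to_dst_ops{}; // tensor   -> operators reading it

        void add_operator(Id op, const std::vector<Id> &inputs, const std::vector<Id> &outputs)
        {
            op_to_src_tensors[op] = inputs;
            op_to_dst_tensors[op] = outputs;
            for (Id t : inputs)
            {
                tensor_to_dst_ops[t].push_back(op);
                tensor_to_src_ops[t]; // ensure the tensor node exists
            }
            for (Id t : outputs)
            {
                tensor_to_src_ops[t].push_back(op);
                tensor_to_dst_ops[t]; // ensure the tensor node exists
            }
        }

        // Emit operators so that each one appears only after the producers of all its inputs,
        // i.e. the same "all source tensors available" rule applied by build_operators_sequence_from_op.
        std::vector<Id> build_sequence() const
        {
            std::vector<Id> seq{};
            std::set<Id>    done_ops{};
            std::set<Id>    done_tensors{};

            // Graph inputs (tensors with no producer) are available from the start.
            for (const auto &t : tensor_to_src_ops)
            {
                if (t.second.empty())
                {
                    done_tensors.insert(t.first);
                }
            }

            // One pass per operator is enough for an acyclic graph; the bound also guarantees
            // termination if a cycle slips in (cyclic operators are simply never emitted).
            const std::size_t n_ops = op_to_src_tensors.size();
            for (std::size_t pass = 0; pass < n_ops && seq.size() < n_ops; ++pass)
            {
                for (const auto &node : op_to_src_tensors)
                {
                    const Id op = node.first;
                    if (done_ops.count(op) != 0)
                    {
                        continue;
                    }
                    const bool ready = std::all_of(node.second.begin(), node.second.end(),
                                                   [&done_tensors](Id t) { return done_tensors.count(t) != 0; });
                    if (!ready)
                    {
                        continue;
                    }
                    done_ops.insert(op);
                    seq.push_back(op);
                    const auto &outs = op_to_dst_tensors.at(op);
                    done_tensors.insert(outs.begin(), outs.end());
                }
            }
            return seq;
        }
    };

Usage, for a graph where operator 0 writes tensor 10 and operators 1 and 2 both read it:

    MiniGraph g{};
    g.add_operator(0, /* inputs */ {}, /* outputs */ {10});
    g.add_operator(1, {10}, {11});
    g.add_operator(2, {10}, {12});
    const auto seq = g.build_sequence(); // -> {0, 1, 2}
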
diff --git a/src/dynamic_fusion/utils/Utils.h b/src/dynamic_fusion/utils/Utils.h
new file mode 100644
index 0000000000..3f4a2edd03
--- /dev/null
+++ b/src/dynamic_fusion/utils/Utils.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_UTILS_UTILS
+#define SRC_DYNAMIC_FUSION_UTILS_UTILS
+
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Check if a tensor should have backing memory. @ref MemoryType
+ */
+inline bool is_alloc_tensor(const ITensorInfo *tensor_info)
+{
+ return tensor_info->id() > ITensorInfo::invalid_tensor_id;
+}
+
+/** Check if a tensor should not have backing memory. @ref MemoryType
+ */
+inline bool is_noalloc_tensor(const ITensorInfo *tensor_info)
+{
+ return tensor_info->id() < ITensorInfo::invalid_tensor_id;
+}
+
+/** Check if an @ref ITensorInfo has a valid id
+ */
+inline bool is_valid_tensor(const ITensorInfo *tensor_info)
+{
+ return tensor_info->has_valid_id();
+}
+
+/** Check if an @ref ITensorInfo has an invalid id
+ */
+inline bool is_invalid_tensor(const ITensorInfo *tensor_info)
+{
+ return !is_valid_tensor(tensor_info);
+}
+
+/** Inline function to convert @ref Pool2dAttributes to PoolingLayerInfo
+*/
+inline PoolingLayerInfo convert_pool_attr_to_pool_info(const Pool2dAttributes &pool_attr,
+ bool mixed_precision = false,
+ DataLayout data_layout = DataLayout::NHWC)
+{
+ // Create PadStrideInfo
+ const Size2D stride = pool_attr.stride();
+ const Padding2D padding = pool_attr.pad();
+ const PadStrideInfo pad_stride(stride.x(), stride.y(), padding.left, padding.top,
+ arm_compute::DimensionRoundingType::FLOOR);
+
+ return PoolingLayerInfo(pool_attr.pool_type(), pool_attr.pool_size(), data_layout, pad_stride,
+ pool_attr.exclude_padding(), mixed_precision);
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif /* SRC_DYNAMIC_FUSION_UTILS_UTILS */
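
For context, the sketch below uses convert_pool_attr_to_pool_info to build a 2x2 max-pool PoolingLayerInfo. It is a hedged usage example: the fluent setters on Pool2dAttributes (pool_type, pool_size, stride, pad, exclude_padding) are assumed to mirror the getters read by the helper above, and the Padding2D constructor is assumed to take left/right/top/bottom; only the include path of Pool2dAttributes and the helper's behaviour (FLOOR rounding, NHWC default layout, mixed precision off by default) come from the patch itself.

    #include "arm_compute/core/Types.h"
    #include "arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h"
    #include "src/dynamic_fusion/utils/Utils.h"

    using namespace arm_compute;
    using namespace arm_compute::experimental::dynamic_fusion;

    PoolingLayerInfo make_2x2_max_pool_info()
    {
        Pool2dAttributes attr{};
        attr.pool_type(PoolingType::MAX) // reduction applied within each window
            .pool_size(Size2D(2, 2))     // 2x2 pooling window
            .stride(Size2D(2, 2))        // non-overlapping windows
            .pad(Padding2D(0, 0, 0, 0))  // no padding: left, right, top, bottom (assumed order)
            .exclude_padding(true);      // ignore padded elements when averaging (irrelevant for MAX)

        // The helper hard-codes FLOOR rounding; layout defaults to NHWC and mixed precision to false.
        return convert_pool_attr_to_pool_info(attr, /* mixed_precision = */ false);
    }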