-rw-r--r--Android.bp18
-rw-r--r--arm_compute/core/CL/CLCompileContext.h6
-rw-r--r--arm_compute/core/ITensorInfo.h22
-rw-r--r--arm_compute/core/SubTensorInfo.h13
-rw-r--r--arm_compute/core/TensorInfo.h15
-rw-r--r--arm_compute/core/Types.h16
-rw-r--r--arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h80
-rw-r--r--arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h76
-rw-r--r--arm_compute/dynamic_fusion/sketch/OperatorAttributes.h65
-rw-r--r--arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h87
-rw-r--r--arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h100
-rw-r--r--arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h84
-rw-r--r--examples/SConscript9
-rw-r--r--filelist.json31
-rw-r--r--src/core/CL/CLCompileContext.cpp6
-rw-r--r--src/core/CL/ICLKernel.h1
-rw-r--r--src/core/TensorInfo.cpp24
-rw-r--r--src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp200
-rw-r--r--src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h76
-rw-r--r--src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp351
-rw-r--r--src/dynamic_fusion/sketch/ArgumentPack.h242
-rw-r--r--src/dynamic_fusion/sketch/OperatorAttributes.cpp63
-rw-r--r--src/dynamic_fusion/sketch/gpu/GpuComponentServices.h54
-rw-r--r--src/dynamic_fusion/sketch/gpu/GpuKernelArgument.cpp37
-rw-r--r--src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h128
-rw-r--r--src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp125
-rw-r--r--src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h104
-rw-r--r--src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp291
-rw-r--r--src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h143
-rw-r--r--src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp69
-rw-r--r--src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h88
-rw-r--r--src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h126
-rw-r--r--src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp87
-rw-r--r--src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.h77
-rw-r--r--src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp172
-rw-r--r--src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h111
-rw-r--r--src/dynamic_fusion/sketch/gpu/GpuOperatorProperties.h54
-rw-r--r--src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp55
-rw-r--r--src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp76
-rw-r--r--src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h111
-rw-r--r--src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h252
-rw-r--r--src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h66
-rw-r--r--src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h64
-rw-r--r--src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h119
-rw-r--r--src/dynamic_fusion/sketch/gpu/components/Types.h52
-rw-r--r--src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp152
-rw-r--r--src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h147
-rw-r--r--src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp57
-rw-r--r--src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h102
-rw-r--r--src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp255
-rw-r--r--src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp109
-rw-r--r--src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h135
-rw-r--r--src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h137
-rw-r--r--src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp400
-rw-r--r--src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h113
-rw-r--r--src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp113
-rw-r--r--src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h85
-rw-r--r--src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp297
-rw-r--r--src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.h92
-rw-r--r--src/dynamic_fusion/sketch/utils/DependencyGraph.h658
-rw-r--r--tests/SConscript15
-rw-r--r--tests/validation/dynamic_fusion/gpu/Integration.cpp201
62 files changed, 7068 insertions, 46 deletions
diff --git a/Android.bp b/Android.bp
index 8c6d700062..d02d436fa0 100644
--- a/Android.bp
+++ b/Android.bp
@@ -592,6 +592,24 @@ cc_library_static {
"src/cpu/operators/CpuTranspose.cpp",
"src/cpu/operators/CpuWinogradConv2d.cpp",
"src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp",
+ "src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp",
+ "src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp",
+ "src/dynamic_fusion/sketch/OperatorAttributes.cpp",
+ "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.cpp",
+ "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp",
+ "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp",
+ "src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp",
+ "src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp",
+ "src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp",
+ "src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp",
+ "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp",
+ "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp",
+ "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp",
+ "src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp",
+ "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp",
+ "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp",
+ "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp",
+ "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp",
"src/gpu/cl/ClContext.cpp",
"src/gpu/cl/ClKernelLibrary.cpp",
"src/gpu/cl/ClQueue.cpp",
diff --git a/arm_compute/core/CL/CLCompileContext.h b/arm_compute/core/CL/CLCompileContext.h
index e8f2ff35da..60e0f95f83 100644
--- a/arm_compute/core/CL/CLCompileContext.h
+++ b/arm_compute/core/CL/CLCompileContext.h
@@ -310,6 +310,12 @@ public:
*/
int32_t get_ddk_version() const;
+ /** Return the Gpu target of the associated device
+ *
+ * @return GPUTarget
+ */
+ GPUTarget get_gpu_target() const;
+
private:
/** Load program and its dependencies.
*
diff --git a/arm_compute/core/ITensorInfo.h b/arm_compute/core/ITensorInfo.h
index 6839d697e3..ca2837e450 100644
--- a/arm_compute/core/ITensorInfo.h
+++ b/arm_compute/core/ITensorInfo.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -41,6 +41,11 @@ class ITensorInfo : public misc::ICloneable<ITensorInfo>
{
public:
using TensorDimsState = std::vector<int>;
+ /** An id that uniquely identifies an ITensorInfo within some domain (e.g. a workload)
+ */
+ using Id = int32_t;
+ /** An invalid tensor id within a domain */
+ static constexpr Id invalid_tensor_id = -1;
/** Get the value representing dynamic dimension state
*
* @return Value representing dynamic dimension state
@@ -280,7 +285,20 @@ public:
* @return A DataLayout containing the layout data information.
*/
virtual DataLayout data_layout() const = 0;
-
+ /** Get the workload tensor id of the tensor.
+ *
+ * @return Workload tensor id of the tensor
+ */
+ virtual Id id() const = 0;
+ /** Set the tensor id
+ */
+ virtual ITensorInfo &set_id(ITensorInfo::Id id) = 0;
+ /** Check if the tensor id is valid
+ */
+ bool has_valid_id() const
+ {
+ return id() != invalid_tensor_id;
+ }
/** If infos are broadcast compatible tensor info's, return the broadcasted shape and the intersection of
* the broadcasted valid regions of the tensors.
*
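For a concrete feel of the new Id field, a minimal sketch using the TensorInfo overrides added further down in this patch (the id value 7 is made up; ids are normally handed out by a workload sketch):

#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"

using namespace arm_compute;

void id_example()
{
    TensorInfo info(TensorShape(8U, 8U), 1, DataType::F32);
    const bool before = info.has_valid_id(); // false: a fresh TensorInfo holds invalid_tensor_id
    info.set_id(7);                          // hypothetical id assigned by a workload
    const bool after = info.has_valid_id();  // true
    ARM_COMPUTE_UNUSED(before, after);
}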
diff --git a/arm_compute/core/SubTensorInfo.h b/arm_compute/core/SubTensorInfo.h
index 54836d0528..374ea5b8c6 100644
--- a/arm_compute/core/SubTensorInfo.h
+++ b/arm_compute/core/SubTensorInfo.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -237,6 +237,17 @@ public:
ARM_COMPUTE_ERROR_ON(_parent == nullptr);
return _parent->data_layout();
}
+ ITensorInfo::Id id() const override
+ {
+ ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+ return _parent->id();
+ }
+ ITensorInfo &set_id(ITensorInfo::Id id) override
+ {
+ ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+ _parent->set_id(id);
+ return *this;
+ }
private:
ITensorInfo *_parent;
diff --git a/arm_compute/core/TensorInfo.h b/arm_compute/core/TensorInfo.h
index 40f9ed9806..7eb8c52d07 100644
--- a/arm_compute/core/TensorInfo.h
+++ b/arm_compute/core/TensorInfo.h
@@ -50,7 +50,7 @@ public:
/** Allow instances of this class to be copy constructed */
TensorInfo(const ITensorInfo &info);
/** Allow instances of this class to be copy constructed */
- TensorInfo(const TensorInfo &) = default;
+ TensorInfo(const TensorInfo &);
/** Allow instances of this class to be copied */
TensorInfo &operator=(const TensorInfo &) = default;
/** Allow instances of this class to be move constructed */
@@ -297,6 +297,15 @@ public:
_are_values_constant = are_values_constant;
return *this;
}
+ ITensorInfo::Id id() const override
+ {
+ return _id;
+ }
+ ITensorInfo &set_id(ITensorInfo::Id id) override
+ {
+ _id = id;
+ return *this;
+ }
inline friend bool operator==(const TensorInfo &lhs, const TensorInfo &rhs);
private:
@@ -320,6 +329,7 @@ private:
QuantizationInfo _quantization_info;
DataLayout _data_layout;
bool _are_values_constant;
+ ITensorInfo::Id _id;
};
/** Check whether two tensor info are equal.
@@ -334,7 +344,8 @@ inline bool operator==(const TensorInfo &lhs, const TensorInfo &rhs)
return (lhs._total_size == rhs._total_size) && (lhs._offset_first_element_in_bytes == rhs._offset_first_element_in_bytes) && (lhs._strides_in_bytes == rhs._strides_in_bytes)
&& (lhs._num_channels == rhs._num_channels) && (lhs._tensor_shape == rhs._tensor_shape) && (lhs._dims_state == rhs._dims_state) && (lhs._data_type == rhs._data_type) && (lhs._format == rhs._format)
&& (lhs._is_resizable == rhs._is_resizable) && (lhs._valid_region == rhs._valid_region) && (lhs._padding == rhs._padding) && (lhs._quantization_info == rhs._quantization_info)
- && (lhs._data_layout == rhs._data_layout) && (lhs._are_values_constant == rhs._are_values_constant);
+ && (lhs._data_layout == rhs._data_layout) && (lhs._are_values_constant == rhs._are_values_constant)
+ && (lhs._id == rhs._id);
}
} // namespace arm_compute
#endif /*ARM_COMPUTE_TENSORINFO_H */
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
index fc6d46c53b..b0a6475527 100644
--- a/arm_compute/core/Types.h
+++ b/arm_compute/core/Types.h
@@ -782,6 +782,20 @@ private:
DimensionRoundingType _round_type;
};
+/** Padding information for 2D operations like Conv2d */
+struct Padding2D
+{
+ Padding2D() = default;
+ Padding2D(size_t left, size_t right, size_t top, size_t bottom)
+ : left(left), right(right), top(top), bottom(bottom)
+ {
+ }
+ size_t left = { 0 }; /**< Padding across the width dimension on the left, in elements. */
+ size_t right = { 0 }; /**< Padding across the width dimension on the right, in elements. */
+ size_t top = { 0 }; /**< Padding across the height dimension on the top, in elements. */
+ size_t bottom = { 0 }; /**< Padding across the height dimension on the bottom, in elements. */
+};
+
/** Padding information for 3D operations like Conv3d */
struct Padding3D
{
@@ -1642,7 +1656,7 @@ public:
LINEAR, /**< Linear ( \f$ f(x)= ax + b \f$ ) */
IDENTITY, /**< Identity ( \f$ f(x)= x \f$ ) */
HARD_SWISH, /**< Hard-swish ( \f$ f(x) = (x \text{ReLU6}(x+3))/6 = x \min(\max(0,x+3),6)/6 \f$ ) */
- SWISH, /**< Swish ( \f$ f(x) = \frac{x}{1 + e^{-ax}} = x \text{logistic}(ax) \f$ ) */
+ SWISH, /**< Swish ( \f$ f(x) = \frac{x}{1 + e^{-ax}} = x \text{logistic}(ax) \f$ ) */
GELU /**< GELU ( \f$ f(x) = x * 1/2 * 1 + erf(x / \sqrt{2}) \f$ ) */
};
diff --git a/arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h b/arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h
new file mode 100644
index 0000000000..326880f721
--- /dev/null
+++ b/arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_DYNAMIC_FUSION_RUNTIME_GPU_CL_CLWORKLOADRUNTIME
+#define ARM_COMPUTE_DYNAMIC_FUSION_RUNTIME_GPU_CL_CLWORKLOADRUNTIME
+
+#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h"
+
+#include <map>
+#include <memory>
+
+namespace arm_compute
+{
+/** Forward declaration */
+class CLTensor;
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+class GpuWorkloadSketch;
+
+/** OpenCL runtime to run a workload
+ */
+class ClWorkloadRuntime
+{
+public:
+ ClWorkloadRuntime();
+ ~ClWorkloadRuntime();
+ /** Configure @ref ClWorkloadRuntime
+ * @note A runtime cannot be re-configured
+ *
+ * @param[in] sketch @ref GpuWorkloadSketch with which to configure
+ */
+ Status configure(const GpuWorkloadSketch &sketch);
+ /** Run the workload
+ * @note If the runtime is not configured, this method does not perform any action
+ *
+ * @param[in,out] tensors Tensors required to run the workload
+ *
+ * @return Status Whether the run was successful
+ */
+ Status run(const std::vector<CLTensor *> &tensors);
+ /** Get the auxiliary tensors of the workload and their memory requirements
+ */
+ std::vector<std::pair<CLTensor *, AuxMemoryInfo>> get_auxiliary_tensors();
+
+private:
+ /** Enqueue the prepare-stage workloads
+ * @note If the runtime is not configured, this method will not perform any action
+ */
+ void prepare();
+ struct Implementation;
+ std::unique_ptr<Implementation> _impl;
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_DYNAMIC_FUSION_RUNTIME_GPU_CL_CLWORKLOADRUNTIME */
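As a rough usage sketch of the runtime declared above (hypothetical tensor names, and assuming the auxiliary CLTensors come back with their TensorInfo already populated so a plain allocate() is enough):

#include "arm_compute/core/Error.h"
#include "arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h"
#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;
using namespace arm_compute::experimental::dynamic_fusion;

void run_workload(const GpuWorkloadSketch &sketch, CLTensor &t_src, CLTensor &t_wei, CLTensor &t_dst)
{
    ClWorkloadRuntime runtime;
    ARM_COMPUTE_ERROR_THROW_ON(runtime.configure(sketch));

    // Allocate whatever scratch memory the workload requested
    for(auto &aux : runtime.get_auxiliary_tensors())
    {
        aux.first->allocator()->allocate(); // aux.second carries the size/alignment requirement
    }

    runtime.run({ &t_src, &t_wei, &t_dst });
}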
diff --git a/arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h b/arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h
new file mode 100644
index 0000000000..deedf62262
--- /dev/null
+++ b/arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_MEMORYDESCRIPTOR
+#define ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_MEMORYDESCRIPTOR
+
+#include "arm_compute/core/ITensorInfo.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Type of memory used by a workload tensor */
+enum class MemoryType
+{
+ User = 0, /**< Memory coming directly from users, e.g. for argument tensors */
+ Auxiliary = 1, /**< Additional memory required by the workload tensor, e.g. for temporary tensors */
+};
+
+/** Memory information for tensors with @ref MemoryType::Auxiliary.
+ * This informs how much additional memory is required for auxiliary tensors
+ */
+struct AuxMemoryInfo
+{
+ AuxMemoryInfo() = default;
+
+ AuxMemoryInfo(size_t size, size_t alignment = 0) noexcept
+ : size(size),
+ alignment(alignment)
+ {
+ }
+
+ friend bool operator==(const AuxMemoryInfo &info0, const AuxMemoryInfo &info1)
+ {
+ return info0.size == info1.size && info0.alignment == info1.alignment;
+ }
+ size_t size{ 0 }; /**< Total memory size in bytes */
+ size_t alignment{ 0 }; /**< Memory alignment in bytes */
+};
+
+/** Descriptor of a workload tensor memory */
+struct MemoryDescriptor
+{
+ MemoryType memory_type{}; /**< Memory type */
+ AuxMemoryInfo aux_memory_info{}; /**< Auxiliary tensor memory information */
+};
+
+/** A map from @ref ITensorInfo to their corresponding @ref MemoryDescriptor */
+using MemoryDescriptorMap = std::map<ITensorInfo::Id, MemoryDescriptor>;
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_MEMORYDESCRIPTOR */
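A small sketch (tensor ids 0 and 1 are made up; the header above is assumed included and the dynamic_fusion namespace in scope) of how a MemoryDescriptorMap pairs tensor ids with their memory descriptors:

MemoryDescriptorMap mem_map{};
mem_map[0].memory_type     = MemoryType::User;      // argument tensor: memory supplied by the caller
mem_map[1].memory_type     = MemoryType::Auxiliary; // scratch tensor: memory owned by the runtime
mem_map[1].aux_memory_info = AuxMemoryInfo(1024);   // 1024 bytes, default alignment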
diff --git a/arm_compute/dynamic_fusion/sketch/OperatorAttributes.h b/arm_compute/dynamic_fusion/sketch/OperatorAttributes.h
new file mode 100644
index 0000000000..22c6772926
--- /dev/null
+++ b/arm_compute/dynamic_fusion/sketch/OperatorAttributes.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_OPERATORATTRIBUTES
+#define ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_OPERATORATTRIBUTES
+
+#include "arm_compute/core/Size2D.h"
+#include "arm_compute/core/Types.h"
+#include <cstdint>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Attributes are backend-agnostic parameters (in addition to the input/output tensors) of an operator.
+ */
+
+/** Conv2d attributes */
+class Conv2dAttributes
+{
+public:
+ /** Set padding */
+ Conv2dAttributes &pad(const Padding2D &pad);
+ /** Get padding */
+ Padding2D pad() const;
+ /** Set stride */
+ Conv2dAttributes &stride(const Size2D &stride);
+ /** Get stride */
+ Size2D stride() const;
+ /** Set dilation */
+ Conv2dAttributes &dilation(const Size2D &dilation);
+ /** Get dilation */
+ Size2D dilation() const;
+
+private:
+ Padding2D _pad{}; /**< Padding */
+ Size2D _stride{ 1U, 1U }; /**< Stride */
+ Size2D _dilation{ 1U, 1U }; /**< Dilation */
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_OPERATORATTRIBUTES */
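Each setter returns the attributes object, so configuration can be chained; a short fragment with made-up values (header above included, dynamic_fusion namespace in scope):

Conv2dAttributes attr = Conv2dAttributes()
                        .pad(Padding2D(1, 1, 1, 1)) // left, right, top, bottom
                        .stride(Size2D(2, 2))
                        .dilation(Size2D(1, 1));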
diff --git a/arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h b/arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h
new file mode 100644
index 0000000000..1ee3c7e3ec
--- /dev/null
+++ b/arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADCONTEXT
+#define ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADCONTEXT
+
+#include "arm_compute/core/GPUTarget.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+/** Forward declaration */
+class CLCompileContext;
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Gpu Information such as the Gpu target (for example, G76) */
+using GpuTarget = ::arm_compute::GPUTarget;
+
+/** Gpu Language */
+enum class GpuLanguage
+{
+ OpenCL,
+ Unknown
+};
+/** Provide context necessary for the creation and configuration of a workload
+ * e.g. gpu targets and capabilities, cl::Device for querying OpenCL extensions. Both can affect how a kernel is generated
+ *
+ * This context is shared between different operators within a sketch, and has to stay valid for the entire workload creation session.
+ * This context may also be shared between different sketches.
+ *
+ * This class only contains information for workload creation, but not for runtime (e.g. cl::Queue for enqueueing the kernels)
+ */
+class GpuWorkloadContext
+{
+public:
+ /** Constructor */
+ GpuWorkloadContext(CLCompileContext *cl_compile_context);
+ /** Allow instances of this class to be copy constructed */
+ GpuWorkloadContext(const GpuWorkloadContext &config) = default;
+ /** Allow instances of this class to be copied */
+ GpuWorkloadContext &operator=(const GpuWorkloadContext &config) = default;
+ /** Allow instances of this class to be move constructed */
+ GpuWorkloadContext(GpuWorkloadContext &&config) = default;
+ /** Allow instances of this class to be moved */
+ GpuWorkloadContext &operator=(GpuWorkloadContext &&config) = default;
+ /** Get @ref GpuLanguage of the context */
+ GpuLanguage gpu_language() const;
+ /** Get @ref GpuTarget of the context */
+ GpuTarget gpu_target() const;
+ /** Get @ref CLCompileContext
+ * If the gpu language is not OpenCL, then return nullptr
+ */
+ const CLCompileContext *cl_compile_context() const;
+
+private:
+ GpuLanguage _gpu_language{ GpuLanguage::Unknown };
+ CLCompileContext *_cl_compile_ctx{ nullptr };
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif /* ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADCONTEXT */
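Creating the context is then a matter of pointing it at an OpenCL compile context; a sketch reusing the library's global CLCompileContext (the helper function name is made up):

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h"
#include "arm_compute/runtime/CL/CLScheduler.h"

using namespace arm_compute;
using namespace arm_compute::experimental::dynamic_fusion;

GpuWorkloadContext make_cl_workload_context()
{
    CLScheduler::get().default_init(); // set up the default OpenCL context and queue
    CLCompileContext &cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
    return GpuWorkloadContext{ &cl_compile_ctx }; // the compile context must outlive every sketch using this
}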
diff --git a/arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h b/arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h
new file mode 100644
index 0000000000..afbe2b8d0b
--- /dev/null
+++ b/arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSKETCH
+#define ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSKETCH
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** A descriptor of a workload of operators
+ *
+ * A "workload" is a basic unit of computation to schedule and perform. It contains one or more operators that can be "fused" together.
+ * Note that a workload may still contain multiple kernels.
+ */
+class GpuWorkloadSketch
+{
+public:
+ /** Global context used for the creation of a workload */
+ using Context = GpuWorkloadContext;
+ /** Internal opaque implementation */
+ class Implementation;
+
+public:
+ /** Constructor
+ *
+ * @param[in] context Gpu context for the creation of a workload
+ */
+ explicit GpuWorkloadSketch(GpuWorkloadContext *context);
+ /** Destructor */
+ ~GpuWorkloadSketch();
+ /** Get the implementation */
+ Implementation &implementation();
+ /** Get the implementation */
+ const Implementation &implementation() const;
+ /** Get the gpu workload context of this sketch */
+ const GpuWorkloadContext *gpu_context() const;
+ /** Create a @ref TensorInfo associated with the workload sketch.
+ *
+ * @return TensorInfo Newly created tensor info
+ */
+ template <typename... Args>
+ TensorInfo create_tensor_info(Args &&... args)
+ {
+ auto tensor_info = TensorInfo(std::forward<Args>(args)...);
+ tensor_info.set_id(allocate_new_tensor_id());
+ return tensor_info;
+ }
+ /** Create a @ref TensorInfo associated with the workload sketch by copying from an existing tensor info
+ * @note The newly copied tensor will have a different identity within the workload than the one copied from
+ * To copy the identity of @p tensor_info as well, use @ref TensorInfo 's copy constructors instead
+ *
+ * @param[in] tensor_info @ref ITensorInfo to copy from
+ *
+ * @return TensorInfo Newly created tensor info
+ */
+ TensorInfo create_tensor_info(const ITensorInfo &tensor_info);
+ /** Create a default @ref TensorInfo associated with the workload sketch
+ * It is usually used by a destination tensor whose @ref ITensorInfo is to be inferred automatically
+ *
+ * @return TensorInfo Newly created tensor info
+ */
+ TensorInfo create_tensor_info();
+
+private:
+ ITensorInfo::Id allocate_new_tensor_id();
+ std::unique_ptr<Implementation> _impl; /**< Internal opaque implementation*/
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSKETCH */
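Putting the context and sketch together; a fragment with illustrative NHWC shapes (gpu_ctx is the context from the previous sketch):

GpuWorkloadSketch sketch{ &gpu_ctx };

// Tensor infos created through the sketch are tagged with a valid workload id
TensorInfo src_info = sketch.create_tensor_info(TensorShape(32U, 16U, 16U, 1U), 1, DataType::F32, DataLayout::NHWC);
TensorInfo dst_info = sketch.create_tensor_info(); // left empty: inferred by the operator that writes to it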
diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h
new file mode 100644
index 0000000000..fe9108d356
--- /dev/null
+++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUCONV2D
+#define ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUCONV2D
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/dynamic_fusion/sketch/OperatorAttributes.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+class GpuWorkloadSketch;
+
+/** Operator interface. */
+class GpuConv2d final
+{
+public:
+ /** Attributes are a set of backend-agnostic parameters that define what an operator does */
+ using Attributes = Conv2dAttributes;
+ /** Create an operator and fuse it into the workload sketch.
+ * @note If @ref validate_op() fails, the creation also fails and may throw an error.
+ * @note If @ref validate_op() fails, @p sketch remains unchanged and valid.
+ *
+ * Valid data type configurations:
+ * |src |wei |bia |dst |
+ * |:--------------|:--------------|:--------------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ *
+ * Valid data layouts:
+ * - NHWC
+ *
+ * @param[in,out] sketch Workload sketch into which the operator will be fused
+ * @param[in] src Source tensor
+ * @param[in] wei Weight tensor
+ * @param[in] bia (Optional) Bias tensor
+ * @param[out] dst Destination tensor
+ * @param[in] attributes Operator attributes
+ */
+ static void create_op(GpuWorkloadSketch &sketch,
+ ITensorInfo *src,
+ ITensorInfo *wei,
+ ITensorInfo *bia,
+ ITensorInfo *dst,
+ const Attributes &attributes);
+ /** Validate the operator and check if it can be fused into the workload sketch.
+ * Similar to @ref GpuConv2d::create_op()
+ */
+ static Status validate_op(const GpuWorkloadSketch &sketch,
+ const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const ITensorInfo *bia,
+ const ITensorInfo *dst,
+ const Attributes &attributes);
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUCONV2D */
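And a fragment fusing a convolution into the sketch from the previous example (shapes are illustrative; the bias is omitted, which the interface allows):

auto wei_info = sketch.create_tensor_info(TensorShape(32U, 3U, 3U, 8U), 1, DataType::F32, DataLayout::NHWC);
GpuConv2d::Attributes conv_attr{}; // defaults: pad 0, stride 1, dilation 1

const Status status = GpuConv2d::validate_op(sketch, &src_info, &wei_info, nullptr, &dst_info, conv_attr);
if(status.error_code() == ErrorCode::OK)
{
    GpuConv2d::create_op(sketch, &src_info, &wei_info, nullptr, &dst_info, conv_attr); // dst_info's shape is now inferred
}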
diff --git a/examples/SConscript b/examples/SConscript
index 7d9324ca9a..fd6b591891 100644
--- a/examples/SConscript
+++ b/examples/SConscript
@@ -99,15 +99,6 @@ if env['opencl']:
prog = install_bin(prog)
alias = examples_env.Alias(example, prog)
Default(alias)
- if env['experimental_dynamic_fusion']:
- examples_env.Append(CPPDEFINES = ['ENABLE_EXPERIMENTAL_DYNAMIC_FUSION'])
- for file in Glob("./dynamic_fusion/*.cpp"):
- example = os.path.basename(os.path.splitext(str(file))[0])
- prog = examples_env.Program(example, ["./dynamic_fusion/{}.cpp".format(example), utils], LIBS = examples_libs + arm_compute_libs)
- Depends(prog, arm_compute_dependency)
- prog = install_bin(prog)
- alias = examples_env.Alias(example, prog)
- Default(alias)
if env['gemm_tuner'] and env['opencl']:
gemm_tuner_common_options = examples_env.Object("./gemm_tuner/CommonGemmExampleOptions.cpp")
diff --git a/filelist.json b/filelist.json
index 431979ec41..4603932638 100644
--- a/filelist.json
+++ b/filelist.json
@@ -2115,20 +2115,25 @@
},
"experimental": {
"dynamic_fusion": [
- "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp",
- "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp",
- "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseKernelComponent.cpp",
- "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClFloorKernelComponent.cpp",
- "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp",
- "src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.cpp",
+ "src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp",
+ "src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp",
- "src/core/experimental/dynamic_fusion/OperatorGraph.cpp",
- "src/core/experimental/dynamic_fusion/WorkloadImpl/ClWorkload.cpp",
- "src/gpu/cl/operators/experimental/dynamic_fusion/ClCompositeOperator.cpp",
- "src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.cpp",
- "src/core/experimental/dynamic_fusion/WorkloadImpl/DependencyGraph.cpp",
- "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.cpp",
- "src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.cpp"
+ "src/dynamic_fusion/sketch/OperatorAttributes.cpp",
+ "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.cpp",
+ "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp",
+ "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp",
+ "src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp",
+ "src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp",
+ "src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp",
+ "src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp",
+ "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp",
+ "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp",
+ "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp",
+ "src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp",
+ "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp",
+ "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp",
+ "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp",
+ "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp"
]
}
}
diff --git a/src/core/CL/CLCompileContext.cpp b/src/core/CL/CLCompileContext.cpp
index fce8798b48..ea03d59fc2 100644
--- a/src/core/CL/CLCompileContext.cpp
+++ b/src/core/CL/CLCompileContext.cpp
@@ -232,7 +232,7 @@ void CLCompileContext::set_context(cl::Context context)
std::string CLCompileContext::generate_build_options(const StringSet &build_options_set, const std::string &kernel_path) const
{
std::string concat_str;
- bool ext_supported = false;
+ bool ext_supported = false;
std::string ext_buildopts;
#if defined(ARM_COMPUTE_DEBUG_ENABLED)
@@ -399,4 +399,8 @@ int32_t CLCompileContext::get_ddk_version() const
return -1;
}
+GPUTarget CLCompileContext::get_gpu_target() const
+{
+ return _device.target();
+}
} // namespace arm_compute
diff --git a/src/core/CL/ICLKernel.h b/src/core/CL/ICLKernel.h
index d52b105507..224b68af70 100644
--- a/src/core/CL/ICLKernel.h
+++ b/src/core/CL/ICLKernel.h
@@ -44,7 +44,6 @@ namespace experimental
{
namespace dynamic_fusion
{
-struct TensorBinding;
struct ClExecutionDescriptor;
} // namespace dynamic_fusion
} // namespace experimental
diff --git a/src/core/TensorInfo.cpp b/src/core/TensorInfo.cpp
index e441ddb3a2..12f79444c6 100644
--- a/src/core/TensorInfo.cpp
+++ b/src/core/TensorInfo.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -35,7 +35,7 @@ namespace arm_compute
{
TensorInfo::TensorInfo()
: _total_size(0), _offset_first_element_in_bytes(0), _strides_in_bytes(), _num_channels(0), _tensor_shape(), _dims_state(), _data_type(DataType::UNKNOWN), _format(Format::UNKNOWN), _is_resizable{ true },
- _valid_region{ Coordinates(), _tensor_shape }, _padding{ 0 }, _quantization_info(), _data_layout(DataLayout::NCHW), _are_values_constant(true)
+ _valid_region{ Coordinates(), _tensor_shape }, _padding{ 0 }, _quantization_info(), _data_layout(DataLayout::NCHW), _are_values_constant(true), _id(invalid_tensor_id)
{
}
@@ -56,8 +56,28 @@ TensorInfo::TensorInfo(const ITensorInfo &info)
_quantization_info = info.quantization_info();
_data_layout = info.data_layout();
_are_values_constant = info.are_values_constant();
+ _id = invalid_tensor_id; // Tensor Id has to be explicitly set, instead of being copied
}
+TensorInfo::TensorInfo(const TensorInfo &info)
+ : TensorInfo()
+{
+ _total_size = info.total_size();
+ _offset_first_element_in_bytes = info.offset_first_element_in_bytes();
+ _strides_in_bytes = info.strides_in_bytes();
+ _num_channels = info.num_channels();
+ _tensor_shape = info.tensor_shape();
+ _dims_state = info.tensor_dims_state();
+ _data_type = info.data_type();
+ _format = info.format();
+ _is_resizable = info.is_resizable();
+ _valid_region = info.valid_region();
+ _padding = info.padding();
+ _quantization_info = info.quantization_info();
+ _data_layout = info.data_layout();
+ _are_values_constant = info.are_values_constant();
+ _id = invalid_tensor_id; // Tensor Id has to be explicitly set, instead of being copied
+}
TensorInfo::TensorInfo(Format format)
: TensorInfo(TensorShape(), format)
{
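The net effect of the new copy constructor, sketched with a made-up id:

TensorInfo a(TensorShape(8U, 8U), 1, DataType::F32);
a.set_id(42);    // hypothetical workload-assigned id
TensorInfo b(a); // b matches a in shape, type, layout and so on,
                 // but b.id() is ITensorInfo::invalid_tensor_id: the id is deliberately not copied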
diff --git a/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp
new file mode 100644
index 0000000000..93fbdfed63
--- /dev/null
+++ b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ClKernelRuntime.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "src/core/CL/CLUtils.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h"
+#include "src/gpu/cl/ClKernelLibrary.h"
+
+#include "support/Cast.h"
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+using namespace arm_compute::opencl;
+
+void ClKernelRuntime::configure(const ClCompileContext &compile_ctx, const GpuKernelSourceCode &code)
+{
+ // Create kernel from kernel source string
+ opencl::ClKernelLibrary &klib = opencl::ClKernelLibrary::get();
+ _kernel = static_cast<cl::Kernel>(compile_ctx.create_kernel(code.name(),
+ "" /* Program name: Used to as part of a unique string for built kernel cache. Not needed */,
+ code.code(),
+ klib.kernel_path() /* Kernel path: Used in cases of embedded kernels */,
+ code.build_options().options(),
+ false /* Is source binary */));
+
+ // Configure execution window
+ IClKernel::configure_internal(code.window());
+
+ // Set config id for lws tuning
+ _config_id = code.config_id();
+
+ // Set kernel arguments
+ _arguments = code.arguments();
+}
+
+inline void ClKernelRuntime::add_tensor_argument(unsigned int &idx, const GpuKernelArgumentInfo &arg, const ICLTensor *tensor, const Window &arg_slice, std::vector<cl::Image2D> &cl_images)
+{
+ switch(arg.type)
+ {
+ case GpuKernelArgumentInfo::Type::Scalar:
+ {
+ ARM_COMPUTE_ERROR("Unsupported yet");
+ break;
+ }
+
+ case GpuKernelArgumentInfo::Type::Vector:
+ {
+ add_1D_tensor_argument(idx, tensor, arg_slice);
+ break;
+ }
+
+ case GpuKernelArgumentInfo::Type::Image:
+ {
+ add_2D_tensor_argument(idx, tensor, arg_slice);
+ break;
+ }
+ case GpuKernelArgumentInfo::Type::Image_Reinterpret_As_3D:
+ {
+ add_2D_tensor_argument(idx, tensor, arg_slice);
+ const unsigned int total_cross_plane_pad = tensor->info()->padding().top + tensor->info()->padding().bottom;
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(total_cross_plane_pad));
+ break;
+ }
+ case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D:
+ {
+ const TensorShape shape2d(tensor->info()->dimension(0) / 4, tensor->info()->dimension(1) * tensor->info()->dimension(2) * tensor->info()->dimension(3));
+ const size_t image_row_pitch = tensor->info()->strides_in_bytes()[1];
+ cl::Image2D tensor_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), shape2d, tensor->info()->data_type(), image_row_pitch);
+ cl_images.push_back(tensor_image2d);
+ _kernel.setArg(idx++, tensor_image2d);
+ break;
+ }
+
+ case GpuKernelArgumentInfo::Type::Image_3D:
+ {
+ add_2D_tensor_argument(idx, tensor, arg_slice);
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(tensor->info()->strides_in_bytes()[2]));
+ break;
+ }
+ case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D:
+ {
+ const TensorShape shape2d(tensor->info()->dimension(0) / 4, tensor->info()->dimension(1) * tensor->info()->dimension(2) * tensor->info()->dimension(3));
+ const size_t image_row_pitch = tensor->info()->strides_in_bytes()[1];
+ cl::Image2D tensor_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), shape2d, tensor->info()->data_type(), image_row_pitch);
+ cl_images.push_back(tensor_image2d);
+ _kernel.setArg(idx++, tensor_image2d);
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(tensor->info()->strides_in_bytes()[2]));
+ break;
+ }
+
+ case GpuKernelArgumentInfo::Type::Tensor_3D:
+ {
+ add_3D_tensor_argument(idx, tensor, arg_slice);
+ break;
+ }
+
+ case GpuKernelArgumentInfo::Type::Tensor_4D:
+ {
+ add_4D_tensor_argument(idx, tensor, arg_slice);
+ break;
+ }
+ case GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer:
+ {
+ add_4d_tensor_nhwc_argument(idx, tensor);
+ break;
+ }
+ case GpuKernelArgumentInfo::Type::Tensor_4D_t_Image:
+ {
+ const size_t image_w = tensor->info()->dimension(0) / 4;
+ const size_t image_h = tensor->info()->tensor_shape().total_size_upper(1);
+ const size_t image_stride_y = tensor->info()->strides_in_bytes()[1];
+
+ cl::Image2D tensor_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(),
+ TensorShape(image_w, image_h), tensor->info()->data_type(), image_stride_y);
+ cl_images.push_back(tensor_image2d);
+
+ _kernel.setArg(idx++, tensor_image2d);
+ add_4d_tensor_nhwc_argument(idx, tensor);
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Unsupported");
+ }
+ }
+}
+
+void ClKernelRuntime::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_3D();
+ // Don't slice the matrix along the z dimension if it has just 2 dimensions and matrix A has more than 2
+ // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+ Window slice_fixed_z = slice;
+ slice_fixed_z.set(Window::DimX, Window::Dimension(0, 1, 1));
+ slice_fixed_z.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ /// NOTE: Parameters extracted from old kernels. So far they seem to be constant
+ /// but we may need to turn them into a separate configuration passed from GpuWorkloadSourceCode in the future
+ constexpr bool slide_along_dimz = true;
+ constexpr bool skip_sliding_window = false;
+ constexpr bool use_dummy_work_items = false;
+
+ unsigned int idx = 0;
+ do
+ {
+ // Set kernel arguments
+ Window arg_slice = slice;
+ // CLImages created from tensor arguments. Need to be retained until enqueue
+ std::vector<cl::Image2D> cl_images;
+ for(auto id_arg : _arguments)
+ {
+ const auto arg = id_arg.second;
+ auto tensor = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(id_arg.first));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(tensor->info());
+ if(!slide_along_dimz)
+ {
+ // The stride_z for matrix must be zero if we do not slice
+ ARM_COMPUTE_ERROR_ON(tensor->info()->strides_in_bytes()[3] != 0);
+ arg_slice = slice_fixed_z;
+ }
+ add_tensor_argument(idx, *arg.kernel_argument_info(), tensor, arg_slice, cl_images);
+ }
+
+ // Dispatch kernel
+ enqueue(queue, *this, slice, lws_hint(), use_dummy_work_items);
+ }
+ while(skip_sliding_window && window.slide_window_slice_3D(slice));
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h
new file mode 100644
index 0000000000..acc2380031
--- /dev/null
+++ b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_RUNTIME_GPU_CL_CLKERNELRUNTIME
+#define SRC_DYNAMIC_FUSION_RUNTIME_GPU_CL_CLKERNELRUNTIME
+
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+struct GpuKernelSourceCode;
+
+/** OpenCL runtime to run a single kernel */
+class ClKernelRuntime final : public opencl::IClKernel
+{
+public:
+ /** Configure the kernel runtime
+ *
+ * @param[in] compile_ctx OpenCL compile context
+ * @param[in] code Kernel source code
+ */
+ void configure(const opencl::ClCompileContext &compile_ctx, const GpuKernelSourceCode &code);
+ /** Run the kernel
+ *
+ * @param[in,out] tensors @ref ITensorPack object containing run-time tensor memories
+ * @param[in] window Execution window
+ * @param[in] queue OpenCL command queue
+ */
+ virtual void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ /** Set a kernel tensor argument
+ *
+ * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
+ * @param[in] arg Kernel argument descriptor accompanying @p tensor
+ * @param[in] tensor Tensor to set as an argument of the object's kernel
+ * @param[in] arg_slice Window the kernel will be run on
+ * @param[out] cl_images Extra cl images created from the tensor (will need to be retained until the kernel is enqueued)
+ */
+ inline void add_tensor_argument(unsigned int &idx, const GpuKernelArgumentInfo &arg, const ICLTensor *tensor, const Window &arg_slice, std::vector<cl::Image2D> &cl_images);
+
+private:
+ GpuKernelArgumentList _arguments{}; /** All kernel arguments required by the runtime */
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_RUNTIME_GPU_CL_CLKERNELRUNTIME */
diff --git a/src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp b/src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp
new file mode 100644
index 0000000000..549c6d4abb
--- /dev/null
+++ b/src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp
@@ -0,0 +1,351 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h"
+
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h"
+#include "support/Cast.h"
+
+#include <algorithm>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+/** Holder of any auxiliary @ref CLTensor required by a @ref GpuWorkloadSourceCode.
+ *
+ * @note The tensors are not allocated by default, and require the user to explicitly allocate them using the associated @ref TensorInfo and @ref AuxMemoryInfo
+ *
+ * @note This data holder must remain valid until the @ref ClWorkloadRuntime that uses it is out of scope
+ */
+class ClAuxTensors
+{
+public:
+ /** A view of a single auxiliary tensor and its associated @ref TensorInfo and @ref AuxMemoryInfo
+ */
+ struct DataView
+ {
+ DataView() = default;
+ DataView(CLTensor *tensor, const TensorInfo &tensor_info, const AuxMemoryInfo &memory_info)
+ : tensor{ tensor }, tensor_info{ tensor_info }, memory_info{ memory_info }
+ {
+ }
+ ~DataView() = default;
+ DataView(const DataView &other) = default;
+ DataView &operator=(const DataView &other) = default;
+ DataView(DataView &&other) = default;
+ DataView &operator=(DataView &&other) = default;
+ CLTensor *tensor{}; /**< Pointer to the auxiliary tensor */
+ TensorInfo tensor_info{}; /**< Associated tensor info */
+ AuxMemoryInfo memory_info{}; /**< Memory requirement */
+ };
+
+ /** Get views of all auxiliary tensors. This is mainly used for allocating the auxiliary tensors. */
+ std::vector<DataView> get_tensors()
+ {
+ return _tensors;
+ }
+ std::vector<DataView> get_tensors() const
+ {
+ return _tensors;
+ }
+
+ friend Status create_aux_tensors(ClAuxTensors *aux_tensors, const GpuWorkloadSourceCode &code);
+
+private:
+ /** Add auxiliary tensor.
+ *
+ * @param[in] tensor_info @ref ITensorInfo of the auxiliary tensor
+ * @param[in] memory_info Memory requirements of the auxiliary tensor
+ *
+ * @return CLTensor* Corresponding tensor memory if successfully added, otherwise nullptr
+ */
+ CLTensor *add_aux_tensor(const ITensorInfo &tensor_info, const AuxMemoryInfo &aux_memory_info)
+ {
+ const auto t_id = tensor_info.id();
+ auto find_tensor_pair = _owned_tensors.find(t_id);
+ if(find_tensor_pair != _owned_tensors.end())
+ {
+ return find_tensor_pair->second.get();
+ }
+ else
+ {
+ auto tensor = std::make_unique<CLTensor>();
+ auto inserted_pair = _owned_tensors.emplace(t_id, std::move(tensor)).first;
+ auto new_tensor = inserted_pair->second.get();
+ _tensors.emplace_back(new_tensor, tensor_info, aux_memory_info);
+ return new_tensor;
+ }
+ }
+
+ std::map<ITensorInfo::Id, std::unique_ptr<CLTensor>> _owned_tensors{};
+ std::vector<DataView> _tensors{};
+};
+/** Construct auxiliary tensors required by @ref GpuWorkloadSourceCode
+ *
+ * @note This is the only recommended way for users to create a @ref ClAuxTensors
+ *
+ * @param[out] aux_tensors Auxiliary tensors required by the workload code
+ * @param[in] code @ref GpuWorkloadSourceCode which all tensors bind to
+ *
+ * @return Status
+ */
+Status create_aux_tensors(ClAuxTensors *aux_tensors, const GpuWorkloadSourceCode &code)
+{
+ for(auto t_id : code.tensors())
+ {
+ // Get tensor object
+ const auto workload_arg = code.query_tensor(t_id);
+ ICLTensor *tensor_object = nullptr;
+ if(workload_arg->memory_descriptor()->memory_type == MemoryType::Auxiliary)
+ {
+ // Create aux tensor CLTensor object
+ const TensorInfo tensor_info = *workload_arg->tensor_info();
+ ARM_COMPUTE_ERROR_ON(tensor_info.id() != t_id);
+ const auto aux_memory_info = workload_arg->memory_descriptor()->aux_memory_info;
+ tensor_object = aux_tensors->add_aux_tensor(tensor_info, aux_memory_info);
+ }
+ if(tensor_object == nullptr)
+ {
+ return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Failed to construct an auxiliary tensor");
+ }
+ }
+ return Status{};
+}
+
+/** A fast tensor lookup table for runtime tensor objects retrieval
+ */
+class ClTensorLUT
+{
+public:
+ /** Find a tensor pack associated with the @ref UnitWorkloadId @p uwk_id
+ *
+ * @param[in] uwk_id @ref UnitWorkloadId associated with the tensor pack
+ *
+ * @return ITensorPack*
+ */
+ ITensorPack *find_tensor_pack(UnitWorkloadId uwk_id)
+ {
+ auto tensor_pack = _tensor_packs.find(uwk_id);
+ if(tensor_pack != _tensor_packs.end())
+ {
+ return &(tensor_pack->second);
+ }
+ return nullptr;
+ }
+ /** Get a tensor pack associated with @p uwk_id. Throws an exception if it cannot be found.
+ *
+ * @param[in] uwk_id @ref UnitWorkloadId associated with the tensor pack
+ *
+ * @return ITensorPack&
+ */
+ ITensorPack &get_tensor_pack(UnitWorkloadId uwk_id)
+ {
+ return _tensor_packs.at(uwk_id);
+ }
+
+ friend Status create_tensor_lut(ClTensorLUT *tensor_lut, const GpuWorkloadSourceCode &code, const std::vector<CLTensor *> &user_tensors, const ClAuxTensors &aux_tensors);
+
+private:
+ /** Add a tensor pack and associate it with @ref UnitWorkloadId @p uwk_id
+ *
+ * @param[in] uwk_id @ref UnitWorkloadId associated with the tensor pack
+ * @param[in] tensor_pack Tensor pack to be added
+ */
+ void add_tensor_pack(UnitWorkloadId uwk_id, const ITensorPack &tensor_pack)
+ {
+ _tensor_packs[uwk_id] = tensor_pack;
+ }
+ std::map<UnitWorkloadId, ITensorPack> _tensor_packs{};
+};
+
+/** Create a fast tensor lookup table for runtime tensor retrieval
+ *
+ * @param[out] tensor_lut @ref ClTensorLUT used by the runtime to feed tensor memories to underlying kernels
+ * @param[in] code @ref GpuWorkloadSourceCode which all tensors bind to
+ * @param[in] user_tensors User tensors
+ * @param[in] aux_tensors Auxiliary tensors required by the workload code
+ *
+ * @return Status
+ */
+Status create_tensor_lut(ClTensorLUT *tensor_lut, const GpuWorkloadSourceCode &code, const std::vector<CLTensor *> &user_tensors, const ClAuxTensors &aux_tensors)
+{
+ // Combine user tensors and aux tensors
+ std::map<ITensorInfo::Id, CLTensor *> tensor_map;
+ for(auto tensor : user_tensors)
+ {
+ const auto t_id = tensor->info()->id();
+ if(tensor_map.find(t_id) != tensor_map.end())
+ {
+ return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Clashing tensor ids");
+ }
+ tensor_map[t_id] = tensor;
+ }
+ for(const auto &data : aux_tensors.get_tensors())
+ {
+ const auto t_id = data.tensor_info.id();
+ const auto tensor = data.tensor;
+ if(tensor_map.find(t_id) != tensor_map.end())
+ {
+ return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Clashing tensor ids");
+ }
+ tensor_map[t_id] = tensor;
+ }
+
+ // Add tensor objects into corresponding tensor packs
+ for(auto id_tensor : tensor_map)
+ {
+ const auto t_id = id_tensor.first;
+ const auto tensor_object = id_tensor.second;
+ if(tensor_object == nullptr)
+ {
+ return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Trying to add a nullptr into the tensor packs");
+ }
+ if(tensor_object->allocator()->info().total_size() == 0U)
+ {
+ return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "No allocated memory found in tensor");
+ }
+
+ for(auto uwk_id : code.get_unit_workloads_from_tensor(t_id))
+ {
+ ITensorPack *tensor_pack = tensor_lut->find_tensor_pack(uwk_id);
+ if(tensor_pack == nullptr)
+ {
+ tensor_lut->add_tensor_pack(uwk_id, ITensorPack{ { t_id, tensor_object } });
+ }
+ else
+ {
+ tensor_pack->add_tensor(t_id, tensor_object);
+ }
+ }
+ }
+ return Status{};
+}
+
+} // namespace
+
+struct ClWorkloadRuntime::Implementation
+{
+ std::map<UnitWorkloadId, std::unique_ptr<ClKernelRuntime>> _kernels{};
+ std::map<UnitWorkloadId, std::unique_ptr<ClKernelRuntime>> _kernels_prep{};
+ bool _is_configured{ false };
+ bool _is_prepared{ false };
+ ClTensorLUT _tensor_lut{};
+ ClAuxTensors _aux_tensors{};
+ GpuWorkloadSourceCode _source_code{};
+};
+
+ClWorkloadRuntime::ClWorkloadRuntime()
+ : _impl{ std::make_unique<Implementation>() }
+{
+}
+
+ClWorkloadRuntime::~ClWorkloadRuntime() = default;
+
+Status ClWorkloadRuntime::configure(const GpuWorkloadSketch &sketch)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(_impl->_is_configured, "ClWorkloadRuntime cannot be re-configured");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(sketch.gpu_context()->gpu_language() != GpuLanguage::OpenCL, "ClWorkloadRuntime cannot be configured with non-OpenCL workload sketch");
+ // Generate source code
+ _impl->_source_code = sketch.implementation().generate_source_code();
+ // Configure unit workload from source code
+ for(auto uwk_id : _impl->_source_code.unit_workloads())
+ {
+ const auto work = _impl->_source_code.query_unit_workload(uwk_id);
+ const auto stage = work.stage().stage;
+ auto k = std::make_unique<ClKernelRuntime>();
+ k->configure(*sketch.gpu_context()->cl_compile_context(), work.code());
+
+ switch(stage)
+ {
+ case UnitWorkloadStage::Stage::Run:
+ _impl->_kernels.emplace(work.id(), std::move(k));
+ break;
+ case UnitWorkloadStage::Stage::Prepare:
+ _impl->_kernels_prep.emplace(work.id(), std::move(k));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Invalid unit workload stage");
+ }
+ }
+ // Create auxiliary tensor objects
+ const auto aux_st = create_aux_tensors(&_impl->_aux_tensors, _impl->_source_code);
+ ARM_COMPUTE_RETURN_ON_ERROR(aux_st);
+ _impl->_is_configured = true;
+ return Status{};
+}
+
+void ClWorkloadRuntime::prepare()
+{
+ if(!_impl->_is_prepared)
+ {
+ for(auto &id_kernel_pair : _impl->_kernels_prep)
+ {
+ const bool flush_queue = false;
+ const auto uwk_id = id_kernel_pair.first;
+ auto kernel = id_kernel_pair.second.get();
+ CLScheduler::get().enqueue_op(*kernel, _impl->_tensor_lut.get_tensor_pack(uwk_id), flush_queue);
+ }
+
+ _impl->_is_prepared = true;
+ }
+}
+
+Status ClWorkloadRuntime::run(const std::vector<CLTensor *> &tensors)
+{
+ // Need to create the tensor lut in every run, unless the user can guarantee the binding remains fixed,
+ // in which case the lut can be cached during prepare
+ const auto st = create_tensor_lut(&_impl->_tensor_lut, _impl->_source_code, tensors, _impl->_aux_tensors);
+ ARM_COMPUTE_RETURN_ON_ERROR(st);
+ prepare();
+ for(auto &id_kernel_pair : _impl->_kernels)
+ {
+ // Note: the command queue is not flushed here, even for the last kernel
+ const bool flush_queue = false;
+ const auto uwk_id = id_kernel_pair.first;
+ auto kernel = id_kernel_pair.second.get();
+ CLScheduler::get().enqueue_op(*kernel, _impl->_tensor_lut.get_tensor_pack(uwk_id), flush_queue);
+ }
+ return Status{};
+}
+
+std::vector<std::pair<CLTensor *, AuxMemoryInfo>> ClWorkloadRuntime::get_auxiliary_tensors()
+{
+ std::vector<std::pair<CLTensor *, AuxMemoryInfo>> aux_tensors;
+ for(const auto &data : _impl->_aux_tensors.get_tensors())
+ {
+ aux_tensors.emplace_back(data.tensor, data.memory_info);
+ }
+ return aux_tensors;
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/ArgumentPack.h b/src/dynamic_fusion/sketch/ArgumentPack.h
new file mode 100644
index 0000000000..f118d7d851
--- /dev/null
+++ b/src/dynamic_fusion/sketch/ArgumentPack.h
@@ -0,0 +1,242 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_ARGUMENTPACK
+#define SRC_DYNAMIC_FUSION_SKETCH_ARGUMENTPACK
+
+#include "arm_compute/core/experimental/Types.h"
+#include <unordered_map>
+#include <vector>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** This is a generic class that packs the arguments of an operator. For now, it is only used for tensor-related types
+ * Examples of "tensor-related types": @ref ITensorInfo, @ref ITensor, @ref ICLTensor
+ *
+ * The argument id is the position of the argument within the pack, and is represented by @ref TensorType
+ *
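+ * A minimal usage sketch (non-authoritative; src_info and dst_info are hypothetical tensor infos), assuming T = @ref ITensorInfo:
+ * @code
+ * ArgumentPack<ITensorInfo> pack{};
+ * pack.add_const_tensor(ACL_SRC_0, &src_info); // read-only input at position ACL_SRC_0
+ * pack.add_tensor(ACL_DST_0, &dst_info);       // mutable output at position ACL_DST_0
+ * const ITensorInfo *src = pack.get_const_tensor(ACL_SRC_0);
+ * const auto dst_tensors = pack.get_dst_tensors(); // every registered ACL_DST_* tensor
+ * @endcode
+ *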
+ * @tparam T Tensor-related type
+ */
+template <typename T>
+class ArgumentPack
+{
+public:
+ /** @ref TensorType encodes the position of a tensor argument within the pack */
+ using Id = TensorType;
+ /** A single argument element within the pack
+ * It contains either a const pointer or a non-const pointer to the Tensor-related type T, but never both at the same time
+ */
+ struct PackElement
+ {
+ PackElement() = default;
+ PackElement(const PackElement &elem) = default;
+ PackElement &operator=(const PackElement &elem) = default;
+ PackElement(PackElement &&elem) = default;
+ PackElement &operator=(PackElement &&elem) = default;
+ PackElement(Id id, T *tensor)
+ : id(id), tensor(tensor), ctensor(nullptr)
+ {
+ }
+ PackElement(Id id, const T *ctensor)
+ : id(id), tensor(nullptr), ctensor(ctensor)
+ {
+ }
+
+ Id id{ ACL_UNKNOWN }; /**< Argument id within the pack */
+ T *tensor{ nullptr }; /**< Non-const pointer to tensor-related object */
+ const T *ctensor{ nullptr }; /**< Const pointer to tensor-related object */
+ };
+
+public:
+ /** Default constructor */
+ ArgumentPack() = default;
+ /** Destructor */
+ ~ArgumentPack() = default;
+ /** Allow instances of this class to be copy constructed */
+ ArgumentPack<T>(const ArgumentPack<T> &other) = default;
+ /** Allow instances of this class to be copied */
+ ArgumentPack<T> &operator=(const ArgumentPack<T> &other) = default;
+ /** Allow instances of this class to be move constructed */
+ ArgumentPack<T>(ArgumentPack<T> &&other) = default;
+ /** Allow instances of this class to be moved */
+ ArgumentPack<T> &operator=(ArgumentPack<T> &&other) = default;
+ /** Initializer list constructor */
+ ArgumentPack(const std::initializer_list<PackElement> &l)
+ : _pack{}
+ {
+ for(const auto &e : l)
+ {
+ _pack[e.id] = e;
+ }
+ }
+ /** Add tensor to the pack
+ *
+ * @param[in] id ID of the tensor to add
+ * @param[in] tensor Tensor to add
+ */
+ void add_tensor(Id id, T *tensor)
+ {
+ _pack[id] = PackElement(id, tensor);
+ }
+ /** Add const tensor to the pack
+ *
+ * @param[in] id ID of the tensor to add
+ * @param[in] tensor Tensor to add
+ */
+ void add_const_tensor(Id id, const T *tensor)
+ {
+ _pack[id] = PackElement(id, tensor);
+ }
+ /** Get tensor of a given id from the pack
+ *
+ * @param[in] id ID of tensor to extract
+ *
+ * @return Pointer to the tensor if it exists and is non-const, otherwise nullptr
+ */
+ T *get_tensor(Id id)
+ {
+ auto it = _pack.find(id);
+ return it != _pack.end() ? it->second.tensor : nullptr;
+ }
+ /** Get constant tensor of a given id
+ *
+ * @param[in] id ID of tensor to extract
+ *
+ * @return Pointer to the tensor (const or not) if it exists, otherwise nullptr
+ */
+ const T *get_const_tensor(Id id) const
+ {
+ auto it = _pack.find(id);
+ if(it != _pack.end())
+ {
+ return it->second.ctensor != nullptr ? it->second.ctensor : it->second.tensor;
+ }
+ return nullptr;
+ }
+ /** Remove the tensor stored with the given id
+ *
+ * @param[in] id ID of tensor to remove
+ */
+ void remove_tensor(Id id)
+ {
+ _pack.erase(id);
+ }
+ /** Pack size accessor
+ *
+ * @return Number of tensors registered to the pack
+ */
+ size_t size() const
+ {
+ return _pack.size();
+ }
+ /** Checks if pack is empty
+ *
+ * @return True if empty else false
+ */
+ bool empty() const
+ {
+ return _pack.empty();
+ }
+ /** Get the ACL_SRC_* tensors
+ *
+ * @return std::vector<T *>
+ */
+ std::vector<T *> get_src_tensors()
+ {
+ std::vector<T *> src_tensors{};
+ for(int id = static_cast<int>(TensorType::ACL_SRC); id <= static_cast<int>(TensorType::ACL_SRC_END); ++id)
+ {
+ auto tensor = get_tensor(static_cast<TensorType>(id));
+ if(tensor != nullptr)
+ {
+ src_tensors.push_back(tensor);
+ }
+ }
+ return src_tensors;
+ }
+ /** Get the const ACL_SRC_* tensors
+ *
+ * @return std::vector<const T *>
+ */
+ std::vector<const T *> get_const_src_tensors() const
+ {
+ std::vector<const T *> src_tensors{};
+ for(int id = static_cast<int>(TensorType::ACL_SRC); id <= static_cast<int>(TensorType::ACL_SRC_END); ++id)
+ {
+ auto tensor = get_const_tensor(static_cast<TensorType>(id));
+ if(tensor != nullptr)
+ {
+ src_tensors.push_back(tensor);
+ }
+ }
+ return src_tensors;
+ }
+ /** Get the ACL_DST_* tensors
+ *
+ * @return std::vector<T *>
+ */
+ std::vector<T *> get_dst_tensors()
+ {
+ std::vector<T *> dst_tensors{};
+ for(int id = static_cast<int>(TensorType::ACL_DST); id <= static_cast<int>(TensorType::ACL_DST_END); ++id)
+ {
+ auto tensor = get_tensor(static_cast<TensorType>(id));
+ if(tensor != nullptr)
+ {
+ dst_tensors.push_back(tensor);
+ }
+ }
+ return dst_tensors;
+ }
+ /** Get the const ACL_DST_* tensors
+ *
+ * @return std::vector<const T *>
+ */
+ std::vector<const T *> get_const_dst_tensors() const
+ {
+ std::vector<const T *> dst_tensors{};
+ for(int id = static_cast<int>(TensorType::ACL_DST); id <= static_cast<int>(TensorType::ACL_DST_END); ++id)
+ {
+ auto tensor = get_const_tensor(static_cast<TensorType>(id));
+ if(tensor != nullptr)
+ {
+ dst_tensors.push_back(tensor);
+ }
+ }
+ return dst_tensors;
+ }
+
+private:
+ std::unordered_map<int, PackElement> _pack{}; /**< Container with the packed tensors */
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_ARGUMENTPACK */
diff --git a/src/dynamic_fusion/sketch/OperatorAttributes.cpp b/src/dynamic_fusion/sketch/OperatorAttributes.cpp
new file mode 100644
index 0000000000..51ec444587
--- /dev/null
+++ b/src/dynamic_fusion/sketch/OperatorAttributes.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/dynamic_fusion/sketch/OperatorAttributes.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+Conv2dAttributes &Conv2dAttributes::pad(const Padding2D &pad)
+{
+ _pad = pad;
+ return *this;
+}
+Padding2D Conv2dAttributes::pad() const
+{
+ return _pad;
+}
+Conv2dAttributes &Conv2dAttributes::stride(const Size2D &stride)
+{
+ _stride = stride;
+ return *this;
+}
+Size2D Conv2dAttributes::stride() const
+{
+ return _stride;
+}
+Conv2dAttributes &Conv2dAttributes::dilation(const Size2D &dilation)
+{
+ _dilation = dilation;
+ return *this;
+}
+Size2D Conv2dAttributes::dilation() const
+{
+ return _dilation;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/GpuComponentServices.h b/src/dynamic_fusion/sketch/gpu/GpuComponentServices.h
new file mode 100644
index 0000000000..93881508bb
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuComponentServices.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTSERVICES
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTSERVICES
+
+#include "src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Services that are used throughout the creation phase of workload code
+ */
+class GpuComponentServices
+{
+public:
+ /** Default constructor */
+ GpuComponentServices() = default;
+ /** Get reference to component factory */
+ GpuKernelComponentFactory &component_factory()
+ {
+ return _comp_factory;
+ }
+
+private:
+ GpuKernelComponentFactory _comp_factory{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTSERVICES */
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.cpp
new file mode 100644
index 0000000000..9cecfc2ffd
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+bool operator==(const GpuKernelArgumentInfo &info0, const GpuKernelArgumentInfo &info1)
+{
+ return info0.type == info1.type;
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h b/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h
new file mode 100644
index 0000000000..eb36e91d48
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELARGUMENT
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELARGUMENT
+
+#include "arm_compute/core/TensorInfo.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Contains the information required to set up a kernel argument at run time
+ */
+struct GpuKernelArgumentInfo
+{
+ /** Enumerates all the tensor argument variants used by the kernel implementations. */
+ enum class Type : int
+ {
+ Scalar,
+
+ Vector,
+
+ Image,
+ Image_Reinterpret_As_3D,
+ Image_Export_To_ClImage2D,
+
+ Image_3D, // 3D Tensor represented as a 2D Image + stride_z
+ Image_3D_Export_To_ClImage2D,
+
+ Tensor_3D,
+ Tensor_4D,
+ Tensor_4D_t_Buffer,
+ Tensor_4D_t_Image
+ };
+ /** Default constructor */
+ GpuKernelArgumentInfo() = default;
+ /** Constructor */
+ GpuKernelArgumentInfo(Type type)
+ : type{ type }
+ {
+ }
+ Type type{ Type::Tensor_4D_t_Buffer };
+};
+
+bool operator==(const GpuKernelArgumentInfo &info0, const GpuKernelArgumentInfo &info1);
+
+/** Kernel argument information linked with its corresponding @ref ITensorInfo
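+ *
+ * A rough construction sketch (non-authoritative; @c conv_dst_info is a hypothetical @ref TensorInfo):
+ * @code
+ * GpuKernelArgument arg{ conv_dst_info, GpuKernelArgumentInfo{ GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer } };
+ * const bool valid = arg.has_valid_id(); // true only if the associated workload tensor has a valid id
+ * @endcode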
+ */
+class GpuKernelArgument
+{
+public:
+ /** Constructor
+ *
+ * @param[in] tensor_info Associated @ref ITensorInfo
+ * @param[in] kernel_arg_info Associated @ref GpuKernelArgumentInfo
+ */
+ GpuKernelArgument(const ITensorInfo &tensor_info,
+ const GpuKernelArgumentInfo &kernel_arg_info)
+ : _tensor_info{ tensor_info },
+ _kernel_arg_info{ kernel_arg_info }
+ {
+ }
+ /** Get workload tensor id */
+ ITensorInfo::Id id() const
+ {
+ return _tensor_info.id();
+ }
+ /** Get associated @ref ITensorInfo */
+ ITensorInfo *tensor_info()
+ {
+ return &_tensor_info;
+ }
+ /** Get associated @ref ITensorInfo */
+ const ITensorInfo *tensor_info() const
+ {
+ return &_tensor_info;
+ }
+ /** Get associated @ref GpuKernelArgumentInfo */
+ GpuKernelArgumentInfo *kernel_argument_info()
+ {
+ return &_kernel_arg_info;
+ }
+ /** Get associated @ref GpuKernelArgumentInfo */
+ const GpuKernelArgumentInfo *kernel_argument_info() const
+ {
+ return &_kernel_arg_info;
+ }
+ /** Check if the associated workload tensor has a valid id
+ *
+ * @return true if it has a valid id
+ * @return false otherwise
+ */
+ bool has_valid_id() const
+ {
+ return _tensor_info.has_valid_id();
+ }
+
+private:
+ TensorInfo _tensor_info{};
+ GpuKernelArgumentInfo _kernel_arg_info{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELARGUMENT */
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp
new file mode 100644
index 0000000000..6e6422c957
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "GpuKernelComponentGraph.h"
+
+#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+/** Automatically create memory descriptors for all tensors in the graph
+ *
+ * @param[in] tensors @ref ITensorInfo map
+ * @param[in] graph @ref DependencyGraph of which the @p tensors are a part
+ *
+ * @return MemoryDescriptorMap An assignment map of @ref MemoryDescriptors for each ITensorInfo in the graph
+ */
+MemoryDescriptorMap assign_memory_descriptors(const std::map<ITensorInfo::Id, const ITensorInfo *> tensors, const DependencyGraph &graph)
+{
+ MemoryDescriptorMap mem_map{};
+ for(auto t_id : graph.all_tensors())
+ {
+ const auto &tensor = tensors.at(t_id);
+ // Only global src and dst tensors to the entire component graph are "User" tensors, which are user-specified memories
+ if(is_in(t_id, graph.global_src_tensors()) || is_in(t_id, graph.global_dst_tensors()))
+ {
+ mem_map[t_id] = MemoryDescriptor{ MemoryType::User };
+ }
+ else
+ {
+ AuxMemoryInfo aux_mem_info{ tensor->total_size() };
+ mem_map[t_id] = MemoryDescriptor{ MemoryType::Auxiliary, aux_mem_info };
+ }
+ }
+ return mem_map;
+}
+
+} // namespace
+
+std::vector<DependencyGraph::TensorId> GpuKernelComponentGraph::get_tensor_ids(const std::vector<const ITensorInfo *> tensors)
+{
+ std::vector<DependencyGraph::TensorId> tensor_ids{};
+ std::transform(
+ std::begin(tensors), std::end(tensors),
+ std::back_inserter(tensor_ids),
+ [](const auto & t)
+ {
+ return t->id();
+ });
+ return tensor_ids;
+}
+
+GpuKernelComponentGraph::GpuKernelComponentGraph(GpuComponentServices *services)
+ : _services{ services }, _components{}, _tensors{}, _dependency_graph{}
+{
+}
+
+GpuKernelComponentStream GpuKernelComponentGraph::fuse() const
+{
+ // Obtain memory descriptor map
+ const auto mem_map = assign_memory_descriptors(_tensors, _dependency_graph);
+ /// @note Fusion constraints (for kernel components) are exactly the same as the invariants of @ref GpuKernelComponentGroup
+ /// Fusion can be framed as a mathematical optimization problem:
+ /// Given fusion constraints, find the "best" fusion patterns possible
+ /// "Best" is ill-defined at the moment. For now we define the "best" fusion pattern as the one
+ /// which results in the fewest fused kernels ( @ref GpuKernelComponentGroup ) at the end
+
+ /// As the first iteration, we offer a sub-optimal algorithm here which ensures all
+ /// constraints are met, but provides no guarantee that the fusion pattern is optimal
+
+ GpuKernelComponentStream stream{ _services, mem_map };
+ // Break down into linear groups of components (constraint 1), preserving topological order
+ const auto linear_graphs = _dependency_graph.topological_partition();
+
+ // Further divide up the linear groups based on the rest of the fusion constraints (relying on the component group's invariants)
+ for(const auto &graph : linear_graphs)
+ {
+ for(unsigned int i = 0; i < graph.size(); ++i)
+ {
+ const auto comp = _components.at(graph[i].op).get();
+ // Each new linear graph signals a new component group in the stream
+ if(i == 0)
+ {
+ stream.new_component_group();
+ }
+ // If adding the component violates the component group's invariants / fusion constraints, break up the stream by inserting a new group
+ bool success = stream.add_component(comp);
+ if(!success)
+ {
+ stream.new_component_group();
+ success = stream.add_component(comp);
+ ARM_COMPUTE_ERROR_ON(!success);
+ }
+ }
+ }
+ return stream;
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h
new file mode 100644
index 0000000000..fbcb2c10ea
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGRAPH
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGRAPH
+
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuComponentServices.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h"
+#include "src/dynamic_fusion/sketch/utils/DependencyGraph.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class IGpuKernelComponent;
+
+/** A multi-input (tensors), multi-output (tensors) acyclic directed graph of gpu kernel components
+ * Its main purposes are:
+ * - Perform "graph-level" optimizations like fusion of kernel components (not the fusion of operators)
+ * - Automatically assign memory descriptors ( @ref MemoryDescriptor ) to all tensors based on graph topology
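+ *
+ * A rough usage sketch (non-authoritative; @c MyComponent and its arguments are hypothetical):
+ * @code
+ * GpuKernelComponentGraph graph{ &services };
+ * graph.add_new_component<MyComponent>(properties, tensors); // arguments are forwarded to the component factory
+ * GpuKernelComponentStream stream = graph.fuse();            // fuse and serialize into component groups
+ * @endcode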
+ */
+class GpuKernelComponentGraph
+{
+public:
+ /** Constructor
+ *
+ * @param[in] services @ref GpuComponentServices to be used by the graph
+ */
+ GpuKernelComponentGraph(GpuComponentServices *services);
+ /** Prevent instances of this class from being copy constructed */
+ GpuKernelComponentGraph(const GpuKernelComponentGraph &graph) = delete;
+ /** Prevent instances of this class from being copied */
+ GpuKernelComponentGraph &operator=(const GpuKernelComponentGraph &graph) = delete;
+ /** Allow instances of this class to be move constructed */
+ GpuKernelComponentGraph(GpuKernelComponentGraph &&graph) = default;
+ /** Allow instances of this class to be moved */
+ GpuKernelComponentGraph &operator=(GpuKernelComponentGraph &&graph) = default;
+ /** Create a new component and add it to the component graph
+ * Component id is automatically allocated
+ *
+ * @tparam T Component type
+ * @tparam Args Component argument types
+ *
+ * @param[in] args Component arguments except for component id, which is auto-allocated
+ */
+ template <typename T, typename... Args>
+ void add_new_component(Args &&... args)
+ {
+ auto comp = _services->component_factory().create<T>(std::forward<Args>(args)...);
+ ArgumentPack<ITensorInfo> tensors = comp->tensors();
+ const auto src_tensor_ids = get_tensor_ids(tensors.get_const_src_tensors());
+ const auto dst_tensor_ids = get_tensor_ids(tensors.get_const_dst_tensors());
+ bool success = _dependency_graph.add_operator(comp->id(), src_tensor_ids, dst_tensor_ids);
+ ARM_COMPUTE_ERROR_ON(!success);
+ _components[comp->id()] = std::move(comp);
+ for(auto t : tensors.get_const_src_tensors())
+ {
+ _tensors[t->id()] = t;
+ }
+ for(auto t : tensors.get_const_dst_tensors())
+ {
+ _tensors[t->id()] = t;
+ }
+ }
+ /** Perform component fusion and serialize the graph into a stream of component groups
+ */
+ GpuKernelComponentStream fuse() const;
+
+private:
+ static std::vector<DependencyGraph::TensorId> get_tensor_ids(const std::vector<const ITensorInfo *> tensors);
+ GpuComponentServices *_services;
+ std::map<ComponentId, std::unique_ptr<IGpuKernelComponent>> _components;
+ std::map<ITensorInfo::Id, const ITensorInfo *> _tensors;
+ DependencyGraph _dependency_graph{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGRAPH */
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp
new file mode 100644
index 0000000000..3af4c1429d
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp
@@ -0,0 +1,291 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "GpuKernelComponentGroup.h"
+
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+bool GpuKernelComponentGroup::add_component(ComponentPtr component)
+{
+ // note: Constraint 1 is guaranteed as a precondition
+ // Constraint 2
+ if(component->type() != GpuComponentType::Output && _components.size() >= max_fused_components)
+ {
+ return false;
+ }
+ // Constraint 3.1: Pattern: (Unfusable + Output)
+ if(!_components.empty() && get_root_component()->type() == GpuComponentType::Unfusable && component->type() != GpuComponentType::Output)
+ {
+ return false;
+ }
+ // Constraint 3.2
+ if(!_components.empty() && (component->type() != GpuComponentType::Simple && component->type() != GpuComponentType::Output))
+ {
+ return false;
+ }
+ // Constraint 3.3: Disallow multiple output components
+ if(!_components.empty() && get_last_component()->type() == GpuComponentType::Output && component->type() == GpuComponentType::Output)
+ {
+ return false;
+ }
+ // Constraint 4
+ if(component->type() != GpuComponentType::Unfusable && component->tensors().get_const_dst_tensors().size() != 1U)
+ {
+ return false;
+ }
+ // Constraint 5
+ if(!_components.empty() && !(get_root_component()->properties() == component->properties()))
+ {
+ return false;
+ }
+ // Constraint 7
+ if(!_components.empty())
+ {
+ const auto root_dst_tensors = get_root_component()->tensors().get_const_dst_tensors();
+ ARM_COMPUTE_ERROR_ON(root_dst_tensors.empty());
+ const auto first_dst_tensor = root_dst_tensors[0];
+ const auto dst_tensors = component->tensors().get_const_dst_tensors();
+ for(const auto &t : root_dst_tensors)
+ {
+ if(detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0))
+ {
+ return false;
+ }
+ }
+ for(const auto &t : dst_tensors)
+ {
+ if(detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0))
+ {
+ return false;
+ }
+ }
+ }
+ // Constraint 8
+ if(!_components.empty())
+ {
+ const auto root_dst_tensors = get_root_component()->tensors().get_const_dst_tensors();
+ ARM_COMPUTE_ERROR_ON(root_dst_tensors.empty());
+ const auto first_dst_tensor_layout = root_dst_tensors[0]->data_layout();
+ const auto dst_tensors = component->tensors().get_const_dst_tensors();
+ for(const auto &t : root_dst_tensors)
+ {
+ if(t->data_layout() != first_dst_tensor_layout)
+ {
+ return false;
+ }
+ }
+ for(const auto &t : dst_tensors)
+ {
+ if(t->data_layout() != first_dst_tensor_layout)
+ {
+ return false;
+ }
+ }
+ }
+ // Constraint 9
+ if(component->tensors().get_const_dst_tensors().size() >= max_dst_tensors)
+ {
+ return false;
+ }
+ // Constraint 9 corollary
+ if(component->type() == GpuComponentType::Output && _components.size() >= max_fused_components + max_dst_tensors)
+ {
+ return false;
+ }
+ _components.push_back(component);
+ return true;
+}
+
+std::vector<const ITensorInfo *> GpuKernelComponentGroup::get_src_tensors() const
+{
+ if(_components.empty())
+ {
+ return {};
+ }
+ auto src_tensors = _components[0]->tensors().get_const_src_tensors();
+ auto prev_dst_tensor = _components[0]->tensors().get_const_dst_tensors()[0]; // PRE: Only one dst tensor per component
+ for(unsigned int i = 1; i < _components.size(); ++i)
+ {
+ auto cur_src_tensors = _components[i]->tensors().get_const_src_tensors();
+ for(const auto src_tensor : cur_src_tensors)
+ {
+ if(src_tensor->id() == prev_dst_tensor->id())
+ {
+ continue; // Skip "intermediate" tensors. I.e. tensors that are used to link between two components
+ }
+ src_tensors.push_back(src_tensor);
+ }
+ prev_dst_tensor = _components[i]->tensors().get_const_dst_tensors()[0]; // PRE: Only one dst tensor per component
+ }
+
+ return src_tensors;
+}
+
+std::vector<const ITensorInfo *> GpuKernelComponentGroup::get_dst_tensors() const
+{
+ if(_components.empty())
+ {
+ return {};
+ }
+ const auto dst_tensor_ptrs = _components[_components.size() - 1]->tensors().get_const_dst_tensors();
+ std::vector<const ITensorInfo *> dst_tensors;
+ for(auto tensor_ptr : dst_tensor_ptrs)
+ {
+ dst_tensors.push_back(tensor_ptr);
+ }
+ return dst_tensors;
+}
+
+std::vector<const ITensorInfo *> GpuKernelComponentGroup::get_argument_tensors() const
+{
+ std::vector<const ITensorInfo *> arguments;
+ const auto src_tensors = get_src_tensors();
+ const auto dst_tensors = get_dst_tensors();
+ arguments.reserve(src_tensors.size() + dst_tensors.size());
+ arguments.insert(arguments.end(), src_tensors.begin(), src_tensors.end());
+ arguments.insert(arguments.end(), dst_tensors.begin(), dst_tensors.end());
+ return arguments;
+}
+
+GpuKernelComponentGroup::ComponentPtr GpuKernelComponentGroup::get_root_component() const
+{
+ if(empty())
+ {
+ return nullptr;
+ }
+ return _components[0];
+}
+
+GpuKernelComponentGroup::ComponentPtr GpuKernelComponentGroup::get_last_component() const
+{
+ if(empty())
+ {
+ return nullptr;
+ }
+ return _components[_components.size() - 1];
+}
+
+GpuKernelComponentGroup::ComponentPtr GpuKernelComponentGroup::get_previous_component(ComponentId id) const
+{
+ if(empty())
+ {
+ return nullptr;
+ }
+ // Get the index of the requested component
+ size_t ind = 0;
+ for(const auto c : _components)
+ {
+ if(c->id() == id)
+ {
+ break;
+ }
+ ind++;
+ }
+ if(ind == 0 || ind >= _components.size())
+ {
+ return nullptr;
+ }
+ return _components[ind - 1];
+}
+
+bool GpuKernelComponentGroup::is_intermediate_tensor(const ITensorInfo *tensor) const
+{
+ return is_tensor_in(tensor, get_interm_tensors());
+}
+
+size_t GpuKernelComponentGroup::size() const
+{
+ return _components.size();
+}
+bool GpuKernelComponentGroup::empty() const
+{
+ return _components.empty();
+}
+GpuKernelComponentGroup::ComponentPtr &GpuKernelComponentGroup::operator[](size_t index)
+{
+ return _components[index];
+}
+const GpuKernelComponentGroup::ComponentPtr &GpuKernelComponentGroup::operator[](size_t index) const
+{
+ return _components[index];
+}
+typename std::vector<GpuKernelComponentGroup::ComponentPtr>::iterator GpuKernelComponentGroup::begin()
+{
+ return _components.begin();
+}
+typename std::vector<GpuKernelComponentGroup::ComponentPtr>::iterator GpuKernelComponentGroup::end()
+{
+ return _components.end();
+}
+typename std::vector<GpuKernelComponentGroup::ComponentPtr>::const_iterator GpuKernelComponentGroup::begin() const
+{
+ return _components.cbegin();
+}
+typename std::vector<GpuKernelComponentGroup::ComponentPtr>::const_iterator GpuKernelComponentGroup::end() const
+{
+ return _components.cend();
+}
+typename std::vector<GpuKernelComponentGroup::ComponentPtr>::const_iterator GpuKernelComponentGroup::cbegin() const
+{
+ return _components.cbegin();
+}
+typename std::vector<GpuKernelComponentGroup::ComponentPtr>::const_iterator GpuKernelComponentGroup::cend() const
+{
+ return _components.cend();
+}
+
+std::vector<const ITensorInfo *> GpuKernelComponentGroup::get_interm_tensors() const
+{
+ std::vector<const ITensorInfo *> interm_tensors{};
+ for(unsigned int i = 0; i + 1 < _components.size(); ++i)
+ {
+ auto interm_tensor = _components[i]->tensors().get_const_dst_tensors()[0];
+ interm_tensors.push_back(interm_tensor); // PRE: Only one dst tensor per component
+ }
+
+ return interm_tensors;
+}
+
+bool GpuKernelComponentGroup::is_tensor_in(const ITensorInfo *tensor, const std::vector<const ITensorInfo *> tensors)
+{
+ for(auto t : tensors)
+ {
+ if(tensor->id() == t->id())
+ {
+ return true;
+ }
+ }
+ return false;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h
new file mode 100644
index 0000000000..4c9d940594
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGROUP
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGROUP
+
+#include "components/Types.h"
+
+#include <cstdint>
+#include <cstdlib>
+#include <vector>
+
+namespace arm_compute
+{
+/** Forward declaration */
+class ITensorInfo;
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class IGpuKernelComponent;
+/** A group of gpu kernel components to be fused together
+ * PRECONDITIONS:
+ * 1. Fusion is limited to a linear sequence of kernel components
+ * INVARIANTS:
+ * @note These preconditions and invariants are exactly the same as fusion constraints for kernel components
+ * 2. Max number of components that can be fused is @ref GpuKernelComponentGroup::max_fused_components (
+ * excluding any output or input (if any) components.
+ * The max number of output components is bound by the maximum number of dst tensors allowed for a component / component group
+ * )
+ * 3. The fusion is subject to the pattern: (Complex + Simple * | Simple + Simple * | Un-fusable) + Output?
+ * 4. All components except unfusable ones have exactly 1 dst tensor
+ * 5. All fused components share the same @ref IGpuKernelComponent::Properties ( @ref UnitWorkloadStage etc. )
+ * 6. All fused components share the same tunable parameters like tile size
+ * 7. All fused components share the same dst tensor shape
+ * 8. All fused components' tensors share the same @ref DataLayout
+ * 9. Maximum number of dst tensors allowed for a component (including unfusable) / component group is @ref GpuKernelComponentGroup::max_dst_tensors
+ * This has an impact on the total number of components supported, which = max_fused_components + max_dst_tensors
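+ *
+ * A rough usage sketch (non-authoritative; @c component is a hypothetical @ref IGpuKernelComponent pointer):
+ * @code
+ * GpuKernelComponentGroup group{};
+ * if(!group.add_component(component)) // returns false if any of the above constraints would be violated
+ * {
+ *     // The component cannot be fused into this group; start a new group for it instead
+ * }
+ * @endcode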
+ */
+class GpuKernelComponentGroup
+{
+public:
+ using ComponentPtr = IGpuKernelComponent *;
+ /** Maximum number of components that can be fused into the same component group
+ */
+ static constexpr size_t max_fused_components = 64;
+ /** Maximum number of dst tensors allowed for a component / component group
+ */
+ static constexpr size_t max_dst_tensors = 8;
+
+public:
+ /** Default constructor */
+ GpuKernelComponentGroup() = default;
+ /** Allow instances of this class to be copy constructed */
+ GpuKernelComponentGroup(const GpuKernelComponentGroup &) = default;
+ /** Allow instances of this class to be copied */
+ GpuKernelComponentGroup &operator=(const GpuKernelComponentGroup &) = default;
+ /** Allow instances of this class to be move constructed */
+ GpuKernelComponentGroup(GpuKernelComponentGroup &&) = default;
+ /** Allow instances of this class to be moved */
+ GpuKernelComponentGroup &operator=(GpuKernelComponentGroup &&) = default;
+ /** Add a component pointer into the group
+ * If the operation fails, then no change is made to the group
+ *
+ * @param[in] component Pointer to the component to be added
+ *
+ * @return true If the operation is successful
+ * @return false If the operation fails
+ */
+ bool add_component(ComponentPtr component);
+ /** Get source tensors of this group */
+ std::vector<const ITensorInfo *> get_src_tensors() const;
+ /** Get destination tensors of this group */
+ std::vector<const ITensorInfo *> get_dst_tensors() const;
+ /** Get the tensor arguments of this group
+ * A tensor is an argument if it is a source or destination tensor to the group
+ */
+ std::vector<const ITensorInfo *> get_argument_tensors() const;
+ /** Get the root (first) component of this group */
+ ComponentPtr get_root_component() const;
+ /** Get the last component of this group */
+ ComponentPtr get_last_component() const;
+ /** Get the previous component to the component with id @p id
+ *
+ * @param[in] id Component id of the component whose previous component is of concern
+ *
+ * @return ComponentPtr Pointer to the previous component of the one identified by @p id
+ */
+ ComponentPtr get_previous_component(ComponentId id) const;
+ /** Check if a @ref ITensorInfo is an "intermediate" tensor of the group
+ *
+ * An intermediate tensor is any tensor that is not an argument.
+ *
+ * @param[in] tensor @ref ITensorInfo to be looked up
+ *
+ * @return true If @p tensor is an intermediate tensor
+ * @return false Otherwise
+ */
+ bool is_intermediate_tensor(const ITensorInfo *tensor) const;
+ /** Get the number of components within the group */
+ size_t size() const;
+ /** Check if the component group is empty */
+ bool empty() const;
+ ComponentPtr &operator[](size_t index);
+ const ComponentPtr &operator[](size_t index) const;
+ typename std::vector<ComponentPtr>::iterator begin();
+ typename std::vector<ComponentPtr>::iterator end();
+ typename std::vector<ComponentPtr>::const_iterator begin() const;
+ typename std::vector<ComponentPtr>::const_iterator end() const;
+ typename std::vector<ComponentPtr>::const_iterator cbegin() const;
+ typename std::vector<ComponentPtr>::const_iterator cend() const;
+
+private:
+ std::vector<const ITensorInfo *> get_interm_tensors() const;
+
+ static bool is_tensor_in(const ITensorInfo *tensor, const std::vector<const ITensorInfo *> tensors);
+
+ std::vector<ComponentPtr> _components{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGROUP */
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp
new file mode 100644
index 0000000000..aac84b6c59
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "GpuKernelComponentStream.h"
+
+#include "src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h"
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+GpuKernelComponentStream::GpuKernelComponentStream(GpuComponentServices *services, const MemoryDescriptorMap &mem_map)
+ : _services{ services }, _component_groups{}, _mem_map{ mem_map }
+{
+}
+
+GpuWorkloadSourceCode GpuKernelComponentStream::write_workload_code()
+{
+ GpuWorkloadSourceCode source_code;
+ // Traverse through component groups and assemble workload together
+ for(auto && group : _component_groups)
+ {
+ // Write kernel code
+ GpuLogicalKernel logical_kernel(_services, group);
+ const GpuKernelSourceCode kernel_code = logical_kernel.write_kernel_code();
+ // The whole unit workload stage is determined by the root component
+ const auto unit_workload_stage = group.get_root_component()->properties().stage();
+ source_code.add_unit_workload(kernel_code, unit_workload_stage, _mem_map);
+ }
+ return source_code;
+}
+
+void GpuKernelComponentStream::new_component_group()
+{
+ _component_groups.emplace_back();
+}
+
+bool GpuKernelComponentStream::add_component(IGpuKernelComponent *component)
+{
+ ARM_COMPUTE_ERROR_ON(_component_groups.empty());
+ return _component_groups.back().add_component(component);
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h
new file mode 100644
index 0000000000..cbaa7c297b
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTSTREAM
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTSTREAM
+
+#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class GpuComponentServices;
+class IGpuKernelComponent;
+
+/** A linear sequence of component groups serialized from the @ref GpuKernelComponentGraph
+ * Each component group in the stream denotes a complete kernel that may consist of multiple components
+ *
+ * The main purposes of this class are:
+ * - Facilitate the component fusion algorithm by allowing insertion of new component groups into the stream
+ * - Invoke kernel writer and assemble the final @ref GpuWorkloadSourceCode
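+ *
+ * A rough usage sketch (non-authoritative; @c services, @c mem_map and @c component are hypothetical):
+ * @code
+ * GpuKernelComponentStream stream{ &services, mem_map };
+ * stream.new_component_group();
+ * if(!stream.add_component(component)) // false if the component cannot be fused into the current group
+ * {
+ *     stream.new_component_group();    // place it in a fresh group instead
+ *     stream.add_component(component);
+ * }
+ * GpuWorkloadSourceCode code = stream.write_workload_code();
+ * @endcode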
+ */
+class GpuKernelComponentStream
+{
+public:
+ /** Constructor
+ *
+ * @param[in] services @ref GpuComponentServices to be used throughout the stream
+ * @param[in] mem_map @ref MemoryDescriptor map used to assemble the @ref GpuWorkloadSourceCode
+ */
+ GpuKernelComponentStream(GpuComponentServices *services, const MemoryDescriptorMap &mem_map);
+ /** Allow instances of this class to be copy constructed */
+ GpuKernelComponentStream(const GpuKernelComponentStream &stream) = default;
+ /** Allow instances of this class to be copied */
+ GpuKernelComponentStream &operator=(const GpuKernelComponentStream &stream) = default;
+ /** Allow instances of this class to be move constructed */
+ GpuKernelComponentStream(GpuKernelComponentStream &&stream) = default;
+ /** Allow instances of this class to be moved */
+ GpuKernelComponentStream &operator=(GpuKernelComponentStream &&stream) = default;
+ /** Generate and assemble @ref GpuWorkloadSourceCode from the stream */
+ GpuWorkloadSourceCode write_workload_code();
+ /** Insert a new component group in the stream.
+ * Subsequent components are added to this group until the end of the stream or until new_component_group is called again
+ */
+ void new_component_group();
+ /** Add a component to the previously created component group
+ * Throw an error if no component group is present in the stream
+ *
+ * @param[in] component Component to be inserted
+ *
+ * @return true If the operation is successful
+ * @return false Otherwise
+ */
+ bool add_component(IGpuKernelComponent *component);
+
+private:
+ GpuComponentServices *_services;
+ std::vector<GpuKernelComponentGroup> _component_groups{};
+ MemoryDescriptorMap _mem_map{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTSTREAM */
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h b/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h
new file mode 100644
index 0000000000..7479328d7b
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELSOURCECODE
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELSOURCECODE
+
+#include "arm_compute/core/CL/CLCompileContext.h"
+#include "arm_compute/core/Window.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+
+#include <map>
+#include <string>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** The argument list of a @ref GpuKernelSourceCode */
+using GpuKernelArgumentList = std::map<ITensorInfo::Id, GpuKernelArgument>;
+
+/** Container of kernel code to be compiled and run in a @ref GpuUnitWorkload
+ */
+class GpuKernelSourceCode
+{
+public:
+ /** Set kernel name */
+ GpuKernelSourceCode &name(const std::string &n)
+ {
+ _name = n;
+ return *this;
+ }
+ /** Set kernel code */
+ GpuKernelSourceCode &code(const std::string &c)
+ {
+ _code = c;
+ return *this;
+ }
+ /** Set kernel config id string */
+ GpuKernelSourceCode &config_id(const std::string &c_id)
+ {
+ _config_id = c_id;
+ return *this;
+ }
+ /** Set kernel build options */
+ GpuKernelSourceCode &build_options(const CLBuildOptions &b_options)
+ {
+ _build_options = b_options;
+ return *this;
+ }
+ /** Set kernel execution window */
+ GpuKernelSourceCode &window(const Window &window)
+ {
+ _window = window;
+ return *this;
+ }
+ /** Set kernel argument list */
+ GpuKernelSourceCode &arguments(const GpuKernelArgumentList &arguments)
+ {
+ _arguments = arguments;
+ return *this;
+ }
+ /** Get kernel name */
+ std::string name() const
+ {
+ return _name;
+ }
+ /** Get kernel code */
+ std::string code() const
+ {
+ return _code;
+ }
+ /** Get kernel config id string */
+ std::string config_id() const
+ {
+ return _config_id;
+ }
+ /** Get kernel build options */
+ const CLBuildOptions &build_options() const
+ {
+ return _build_options;
+ }
+ /** Get kernel execution window */
+ const Window &window() const
+ {
+ return _window;
+ }
+ /** Get kernel argument list */
+ const GpuKernelArgumentList &arguments() const
+ {
+ return _arguments;
+ }
+
+private:
+ std::string _name{};
+ std::string _code{};
+ std::string _config_id{};
+ CLBuildOptions _build_options{};
+ Window _window{};
+ GpuKernelArgumentList _arguments{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELSOURCECODE */
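Since every setter returns *this, a kernel writer can assemble the container fluently; a minimal sketch, with all right-hand-side names hypothetical:

GpuKernelSourceCode kernel_code{};
kernel_code.name("fused_kernel")            // kernel name reported to the runtime
    .code(generated_source)                 // full generated kernel source
    .config_id(generated_config_id)         // config id used for tuning
    .build_options(build_opts)
    .window(execution_window)
    .arguments(kernel_args);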
diff --git a/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp b/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp
new file mode 100644
index 0000000000..7746f8bbf3
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "GpuLogicalKernel.h"
+
+#include "arm_compute/core/experimental/Types.h"
+
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuComponentServices.h"
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+GpuLogicalKernel::GpuLogicalKernel(GpuComponentServices *services, const GpuKernelComponentGroup &components)
+ : _services{ services }, _comp_group{ components }, _store_components{}
+{
+ add_load_store();
+}
+
+GpuKernelSourceCode GpuLogicalKernel::write_kernel_code()
+{
+ GpuKernelSourceCode code;
+ ClTemplateWriter writer{ _comp_group };
+
+ code.name(writer.get_name());
+ code.code(writer.get_code());
+ code.arguments(writer.get_tensors());
+ code.build_options(writer.get_build_options());
+ code.config_id(writer.get_config_id());
+ code.window(writer.get_window());
+
+ return code;
+}
+
+void GpuLogicalKernel::add_load_store()
+{
+ const auto dst_tensors = _comp_group.get_dst_tensors();
+ // Each dst tensor from the component group requires exactly one store component
+ for(const auto &dst_tensor : dst_tensors)
+ {
+ ArgumentPack<ITensorInfo> tensors;
+ // Pass the same destination tensor to both the source and the destination of the store component
+ // In other words, adding a store component does not create a new dst tensor
+ // This way the dst tensor of the component group cannot differ from that of the logical kernel
+ // This may seem to violate the acyclicity of the component graph, but it is fine because by the time
+ // the logical kernel is constructed we no longer need a graph representation of the components
+ // (the graph has already been serialized)
+ tensors.add_const_tensor(ACL_SRC_0, dst_tensor);
+ tensors.add_const_tensor(ACL_DST_0, dst_tensor);
+
+ auto store = _services->component_factory().create<ClComponentStore>(
+ _comp_group.get_root_component()->properties(), // Store components share the same properties as the root component
+ tensors);
+ _store_components.push_back(std::move(store));
+ auto success = _comp_group.add_component(_store_components.back().get());
+ ARM_COMPUTE_ERROR_ON(!success); // Any load/store insertion is guaranteed to succeed
+ }
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.h b/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.h
new file mode 100644
index 0000000000..4ce4443f60
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPULOGICALKERNEL
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPULOGICALKERNEL
+
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h"
+
+#include <memory>
+#include <vector>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+class GpuComponentServices;
+class IGpuKernelComponent;
+
+/** A wrapper-processor of a @ref GpuKernelComponentGroup
+ * It adds the load (if any) and store components to the component group
+ * A @ref GpuLogicalKernel represents a complete kernel and can invoke any kernel writer to generate the full kernel code
+ */
+class GpuLogicalKernel
+{
+public:
+ /** Constructor
+ *
+ * @param[in] services @ref GpuComponentServices to be used
+ * @param[in] components Component group from which this logical kernel is initialized
+ */
+ explicit GpuLogicalKernel(GpuComponentServices *services, const GpuKernelComponentGroup &components);
+ /** Allow instances of this class to be copy constructed */
+ GpuLogicalKernel(const GpuLogicalKernel &) = default;
+ /** Allow instances of this class to be copied */
+ GpuLogicalKernel &operator=(const GpuLogicalKernel &) = default;
+ /** Allow instances of this class to be move constructed */
+ GpuLogicalKernel(GpuLogicalKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ GpuLogicalKernel &operator=(GpuLogicalKernel &&) = default;
+ /** Generate a @ref GpuKernelSourceCode */
+ GpuKernelSourceCode write_kernel_code();
+
+private:
+ void add_load_store();
+
+ GpuComponentServices *_services;
+ GpuKernelComponentGroup _comp_group{};
+ std::vector<std::unique_ptr<IGpuKernelComponent>> _store_components{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPULOGICALKERNEL */
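A minimal sketch of the intended flow, assuming `services` and `group` are supplied by the component graph: constructing the logical kernel appends the store components, after which write_kernel_code() delegates to the CL template writer.

GpuLogicalKernel logical_kernel{ &services, group };
GpuKernelSourceCode kernel_code = logical_kernel.write_kernel_code();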
diff --git a/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp
new file mode 100644
index 0000000000..e8ef835405
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h"
+
+#include "arm_compute/core/Validate.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+std::vector<DependencyGraph::TensorId> get_tensor_ids(const std::vector<const ITensorInfo *> tensors)
+{
+ std::vector<DependencyGraph::TensorId> tensor_ids{};
+ std::transform(
+ std::begin(tensors), std::end(tensors),
+ std::back_inserter(tensor_ids),
+ [](const auto & t)
+ {
+ return t->id();
+ });
+ return tensor_ids;
+}
+
+} // namespace
+
+Operator::Operator(OperatorId id, GpuOperatorType operator_type, const ArgumentPack<ITensorInfo> &tensors)
+ : _id{ id }, _operator_type{ operator_type }, _tensors{ tensors }
+{
+}
+
+OperatorId Operator::id() const
+{
+ return _id;
+}
+
+GpuOperatorType Operator::operator_type() const
+{
+ return _operator_type;
+}
+
+ArgumentPack<ITensorInfo> Operator::tensors() const
+{
+ return _tensors;
+}
+
+bool GpuOperatorGroup::try_add_operator(const Operator &op) const
+{
+ const auto src_tensor_ids = get_tensor_ids(op.tensors().get_const_src_tensors());
+ const auto dst_tensor_ids = get_tensor_ids(op.tensors().get_const_dst_tensors());
+ // Constraint 1
+ if(!_graph.try_add_operator_as_linear(op.id(), src_tensor_ids, dst_tensor_ids))
+ {
+ return false;
+ }
+ // Constraint 2
+ if(_operators.size() >= max_fused_operators)
+ {
+ return false;
+ }
+ // Constraint 3.1: Pattern: (Unfusable)
+ if(_operators.size() > 0 && get_root_operator()->operator_type() == GpuOperatorType::Unfusable)
+ {
+ return false;
+ }
+ // Constraint 3.2
+ if(_operators.size() > 0 && (op.operator_type() != GpuOperatorType::Simple))
+ {
+ return false;
+ }
+ // Constraint 4
+ if(op.operator_type() != GpuOperatorType::Unfusable && op.tensors().get_const_dst_tensors().size() != 1U)
+ {
+ return false;
+ }
+ // Constraint 5
+ if(_operators.size() > 0)
+ {
+ const auto root_dst_tensors = get_root_operator()->tensors().get_const_dst_tensors();
+ ARM_COMPUTE_ERROR_ON(root_dst_tensors.empty());
+ const auto first_dst_tensor = root_dst_tensors[0];
+ const auto dst_tensors = op.tensors().get_const_dst_tensors();
+ for(const auto &t : root_dst_tensors)
+ {
+ if(detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0))
+ {
+ return false;
+ }
+ }
+ for(const auto &t : dst_tensors)
+ {
+ if(detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0))
+ {
+ return false;
+ }
+ }
+ }
+ // Constraint 6
+ if(_operators.size() > 0)
+ {
+ const auto root_dst_tensors = get_root_operator()->tensors().get_const_dst_tensors();
+ ARM_COMPUTE_ERROR_ON(root_dst_tensors.empty());
+ const auto first_dst_tensor_layout = root_dst_tensors[0]->data_layout();
+ const auto dst_tensors = op.tensors().get_const_dst_tensors();
+ for(const auto &t : root_dst_tensors)
+ {
+ if(t->data_layout() != first_dst_tensor_layout)
+ {
+ return false;
+ }
+ }
+ for(const auto &t : dst_tensors)
+ {
+ if(t->data_layout() != first_dst_tensor_layout)
+ {
+ return false;
+ }
+ }
+ }
+ return true;
+}
+void GpuOperatorGroup::add_operator(const Operator &op)
+{
+ ARM_COMPUTE_ERROR_ON(!try_add_operator(op));
+ const auto src_tensor_ids = get_tensor_ids(op.tensors().get_const_src_tensors());
+ const auto dst_tensor_ids = get_tensor_ids(op.tensors().get_const_dst_tensors());
+ _graph.add_operator_as_linear(op.id(), src_tensor_ids, dst_tensor_ids);
+ _operators[op.id()] = op;
+}
+Operator GpuOperatorGroup::new_operator(const GpuOperatorType &operator_type, const ArgumentPack<ITensorInfo> &tensors) const
+{
+ auto new_id = static_cast<OperatorId>(_operators.size());
+ return Operator{ new_id, operator_type, tensors };
+}
+const Operator *GpuOperatorGroup::get_root_operator() const
+{
+ const auto roots = _graph.get_root_ops();
+ ARM_COMPUTE_ERROR_ON(roots.size() > 1);
+ if(roots.empty())
+ {
+ return nullptr;
+ }
+ return &_operators.at(roots[0]);
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h
new file mode 100644
index 0000000000..35abe6c543
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUOPERATORGROUP
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUOPERATORGROUP
+
+#include "arm_compute/core/ITensorInfo.h"
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuOperatorProperties.h"
+#include "src/dynamic_fusion/sketch/utils/DependencyGraph.h"
+#include <map>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+using OperatorId = DependencyGraph::OperatorId;
+
+/** An operator for the sole purpose of validating fusion
+ */
+class Operator
+{
+public:
+ /** Default constructor */
+ Operator() = default;
+ /** Get Operator Id */
+ OperatorId id() const;
+ /** Get operator type */
+ GpuOperatorType operator_type() const;
+ /** Get tensor arguments */
+ ArgumentPack<ITensorInfo> tensors() const;
+ friend class GpuOperatorGroup;
+
+private:
+ Operator(OperatorId id, GpuOperatorType operator_type, const ArgumentPack<ITensorInfo> &tensors);
+ OperatorId _id{};
+ GpuOperatorType _operator_type{};
+ ArgumentPack<ITensorInfo> _tensors{};
+};
+
+/** A linear sequence of operators to be fused in a workload
+ * For the time being, this class is only used for validating operator fusion
+ * INVARIANTS:
+ * @note These invariants are exactly the same as operator fusion constraints
+ * 1. Fusion is limited to a linear sequence of operators
+ * 2. Max number of operators that can be fused is @ref GpuOperatorGroup::max_fused_operators
+ * 3. The fusion is subject to the pattern: Complex + Simple * | Simple + Simple * | Un-fusable
+ * 4. All operators, except unfusable ones, have exactly 1 dst tensor
+ * 5. All fused operators share the same dst tensor shape
+ * 6. All fused operators' tensors share the same @ref DataLayout
+ */
+class GpuOperatorGroup
+{
+public:
+ static constexpr size_t max_fused_operators = 32;
+ /** Try adding (without actually adding) an operator to the group
+ *
+ * @param[in] op Operator to be added
+ *
+ * @return true If @p op can be added while maintaining the invariants
+ * @return false Otherwise
+ */
+ bool try_add_operator(const Operator &op) const;
+ /** Add an operator to the group
+ *
+ * @param[in] op Operator to be added
+ */
+ void add_operator(const Operator &op);
+ /** Create a new operator
+ *
+ * @param[in] operator_type @ref GpuOperatorType of the new operator
+ * @param[in] tensors Tensor arguments to the new operator
+ *
+ * @return Operator
+ */
+ Operator new_operator(const GpuOperatorType &operator_type, const ArgumentPack<ITensorInfo> &tensors) const;
+ /** Get the "root operator" of the group, which is the first operator in a linear sequence
+ * @return const Operator* Pointer to the root operator
+ */
+ const Operator *get_root_operator() const;
+
+private:
+ DependencyGraph _graph{};
+ std::map<OperatorId, Operator> _operators{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUOPERATORGROUP */
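A minimal sketch of the intended validation flow, with `conv2d_tensors` a hypothetical ArgumentPack built by an operator's validate routine; a candidate operator is committed to the group only if every fusion invariant holds:

GpuOperatorGroup group{};
const Operator op = group.new_operator(GpuOperatorType::Complex, conv2d_tensors);
if(group.try_add_operator(op))
{
    group.add_operator(op); // recorded in the dependency graph and the operator map
}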
diff --git a/src/dynamic_fusion/sketch/gpu/GpuOperatorProperties.h b/src/dynamic_fusion/sketch/gpu/GpuOperatorProperties.h
new file mode 100644
index 0000000000..c77697c343
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuOperatorProperties.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUOPERATORPROPERTIES
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUOPERATORPROPERTIES
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Contain properties common to all operator types */
+
+/** Operator type in the context of fusion
+ */
+enum class GpuOperatorType
+{
+ /** Simple operators are operators that:
+ * 1. Have a 1-to-1 mapping between input elements and output elements, like elementwise operators
+ * 2. Have exactly 1 output
+ */
+ Simple,
+ /** Complex operators are operators that are not simple but are still fusable with simple ones
+ */
+ Complex,
+ /** Unfusable operators are operators that cannot be fused with any other types of operators
+ */
+ Unfusable
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUOPERATORPROPERTIES */
diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp b/src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp
new file mode 100644
index 0000000000..623bf351f8
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h"
+#include "arm_compute/core/CL/CLCompileContext.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+GpuWorkloadContext::GpuWorkloadContext(CLCompileContext *cl_compile_ctx)
+ : _gpu_language{ GpuLanguage::OpenCL }, _cl_compile_ctx{ cl_compile_ctx }
+{
+}
+
+GpuTarget GpuWorkloadContext::gpu_target() const
+{
+ return _cl_compile_ctx->get_gpu_target();
+}
+
+GpuLanguage GpuWorkloadContext::gpu_language() const
+{
+ return _gpu_language;
+}
+
+const CLCompileContext *GpuWorkloadContext::cl_compile_context() const
+{
+ return _cl_compile_ctx;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
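A minimal setup sketch, assuming the compile context is obtained from CLKernelLibrary as in the library's existing CL examples:

CLCompileContext cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
GpuWorkloadContext context{ &cl_compile_ctx };
const GpuTarget   target   = context.gpu_target();   // queried from the compile context
const GpuLanguage language = context.gpu_language();  // OpenCL for this context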
diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp
new file mode 100644
index 0000000000..ce7cf1e908
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+GpuWorkloadSketch::GpuWorkloadSketch(Context *context)
+ : _impl{ std::make_unique<Implementation>(context) }
+{
+}
+GpuWorkloadSketch::~GpuWorkloadSketch()
+{
+}
+
+const GpuWorkloadSketch::Context *GpuWorkloadSketch::gpu_context() const
+{
+ return _impl->context();
+}
+
+TensorInfo GpuWorkloadSketch::create_tensor_info(const ITensorInfo &tensor_info)
+{
+ TensorInfo tensor{ tensor_info };
+ tensor.set_id(allocate_new_tensor_id());
+ return tensor;
+}
+
+TensorInfo GpuWorkloadSketch::create_tensor_info()
+{
+ TensorInfo tensor{};
+ tensor.set_id(allocate_new_tensor_id());
+ return tensor;
+}
+
+ITensorInfo::Id GpuWorkloadSketch::allocate_new_tensor_id()
+{
+ return _impl->allocate_new_tensor_id();
+}
+
+GpuWorkloadSketch::Implementation &GpuWorkloadSketch::implementation()
+{
+ return *_impl;
+}
+const GpuWorkloadSketch::Implementation &GpuWorkloadSketch::implementation() const
+{
+ return *_impl;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
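A minimal sketch of tensor registration, assuming `context` is a valid workload context: each call allocates a fresh tensor id within the sketch, so infos created from the same source ITensorInfo are still distinct sketch tensors.

GpuWorkloadSketch sketch{ &context };
TensorInfo t_info{ TensorShape(32U, 16U, 16U, 1U), 1, DataType::F32 };
t_info.set_data_layout(DataLayout::NHWC);
TensorInfo src_info = sketch.create_tensor_info(t_info);
TensorInfo dst_info = sketch.create_tensor_info(); // shape and type are inferred later by the operator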
diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h
new file mode 100644
index 0000000000..3997395c98
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSKETCHIMPL
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSKETCHIMPL
+
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuComponentServices.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Internal implementation of @ref GpuWorkloadSketch */
+class GpuWorkloadSketch::Implementation
+{
+public:
+ /** Constructor
+ *
+ * @param[in] context global workload creation context
+ */
+ explicit Implementation(
+ Context *context)
+ : _context{ context },
+ _comp_services{},
+ _component_graph{ &_comp_services },
+ _operator_group{}
+ {
+ }
+ /** Prevent instances of this class from being copy constructed */
+ Implementation(const Implementation &impl) = delete;
+ /** Prevent instances of this class from being copied */
+ Implementation &operator=(const Implementation &impl) = delete;
+ /** Allow instances of this class to be move constructed */
+ Implementation(Implementation &&impl) = default;
+ /** Allow instances of this class to be moved */
+ Implementation &operator=(Implementation &&impl) = default;
+ /** Get workload context */
+ const Context *context() const
+ {
+ return _context;
+ }
+ /** Get component graph */
+ const GpuKernelComponentGraph &component_graph() const
+ {
+ return _component_graph;
+ }
+ /** Get component graph */
+ GpuKernelComponentGraph &component_graph()
+ {
+ return _component_graph;
+ }
+ /** Get operator group */
+ const GpuOperatorGroup &operator_group() const
+ {
+ return _operator_group;
+ }
+ /** Get operator group */
+ GpuOperatorGroup &operator_group()
+ {
+ return _operator_group;
+ }
+ ITensorInfo::Id allocate_new_tensor_id()
+ {
+ return ++_next_id;
+ }
+ /** Generate @ref GpuWorkloadSourceCode from the workload sketch
+ * @note The sketch must be valid. Any error encountered while building the code is thrown as an error.
+ *
+ * @return GpuWorkloadSourceCode The generated workload code
+ */
+ GpuWorkloadSourceCode generate_source_code() const
+ {
+ return component_graph().fuse().write_workload_code();
+ }
+
+private:
+ Context *_context;
+ GpuComponentServices _comp_services;
+ GpuKernelComponentGraph _component_graph;
+ GpuOperatorGroup _operator_group;
+ ITensorInfo::Id _next_id{ ITensorInfo::invalid_tensor_id };
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSKETCHIMPL */
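A minimal sketch of how the runtime side is expected to obtain the workload code from a finished sketch; fusion and code generation both happen inside generate_source_code():

const GpuWorkloadSourceCode source_code = sketch.implementation().generate_source_code();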
diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h
new file mode 100644
index 0000000000..2375f5c6c6
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h
@@ -0,0 +1,252 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSOURCECODE
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSOURCECODE
+
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Uniquely identifies a @ref GpuUnitWorkload within a @ref GpuWorkloadSourceCode */
+using UnitWorkloadId = int32_t;
+
+/** Describes all the info related to a workload argument in order to:
+ * - be used by the runtime to configure the gpu kernel arguments
+ * - be used by memory managers to allocate the required memory
+ */
+class GpuWorkloadArgument
+{
+public:
+ /** Default constructor */
+ GpuWorkloadArgument() = default;
+ /** Constructor
+ *
+ * @param[in] tensor_info @ref ITensorInfo of the workload argument
+ * @param[in] mem_desc @ref MemoryDescriptor of the workload argument
+ * @param[in] kernel_arg_info @ref GpuKernelArgumentInfo of the workload argument
+ */
+ GpuWorkloadArgument(const ITensorInfo &tensor_info,
+ const MemoryDescriptor &mem_desc,
+ const GpuKernelArgumentInfo &kernel_arg_info)
+ : _tensor_info{ tensor_info },
+ _mem_desc{ mem_desc },
+ _kernel_arg_info{ kernel_arg_info }
+ {
+ }
+ /** Get tensor id within workload */
+ ITensorInfo::Id id() const
+ {
+ return _tensor_info.id();
+ }
+ /** Get @ref ITensorInfo of the argument */
+ ITensorInfo *tensor_info()
+ {
+ return &_tensor_info;
+ }
+ /** Get @ref ITensorInfo of the argument */
+ const ITensorInfo *tensor_info() const
+ {
+ return &_tensor_info;
+ }
+ /** Get @ref MemoryDescriptor of the argument */
+ MemoryDescriptor *memory_descriptor()
+ {
+ return &_mem_desc;
+ }
+ /** Get @ref MemoryDescriptor of the argument */
+ const MemoryDescriptor *memory_descriptor() const
+ {
+ return &_mem_desc;
+ }
+ /** Get @ref GpuKernelArgumentInfo of the argument */
+ GpuKernelArgumentInfo *kernel_argument_info()
+ {
+ return &_kernel_arg_info;
+ }
+ /** Get @ref GpuKernelArgumentInfo of the argument */
+ const GpuKernelArgumentInfo *kernel_argument_info() const
+ {
+ return &_kernel_arg_info;
+ }
+ /** Check if the workload argument has valid id
+ *
+ * @return true If has valid id
+ * @return false Otherwise
+ */
+ bool has_valid_id() const
+ {
+ return _tensor_info.has_valid_id();
+ }
+
+private:
+ TensorInfo _tensor_info{};
+ MemoryDescriptor _mem_desc{};
+ GpuKernelArgumentInfo _kernel_arg_info{};
+};
+
+/** Describes when a unit workload is run.
+ */
+struct UnitWorkloadStage
+{
+ enum class Stage
+ {
+ Prepare, /**< Only run once at the beginning. */
+ Run, /**< Run every time after the first time. */
+ };
+ Stage stage{ Stage::Run };
+};
+
+inline bool operator==(const UnitWorkloadStage &stage0, const UnitWorkloadStage &stage1)
+{
+ return stage0.stage == stage1.stage;
+}
+
+/** The atomic unit in a Gpu workload. It contains exactly one kernel to run.
+ */
+class GpuUnitWorkload
+{
+public:
+ /** Default constructor */
+ GpuUnitWorkload() = default;
+ /** Constructor
+ *
+ * @param[in] id Id that uniquely identifies this unit workload in a workload
+ * @param[in] kernel_code @ref GpuKernelSourceCode contained within
+ * @param[in] stage Stage of the unit workload
+ */
+ GpuUnitWorkload(UnitWorkloadId id, const GpuKernelSourceCode &kernel_code, const UnitWorkloadStage &stage)
+ : _id{ id }, _kernel_code{ kernel_code }, _stage{ stage }
+ {
+ }
+ /** Get the id of the unit workload */
+ UnitWorkloadId id() const
+ {
+ return _id;
+ }
+ /** Get reference to the underlying @ref GpuKernelSourceCode */
+ const GpuKernelSourceCode &code() const
+ {
+ return _kernel_code;
+ }
+ /** Get the stage of the unit workload */
+ UnitWorkloadStage stage() const
+ {
+ return _stage;
+ }
+
+private:
+ UnitWorkloadId _id{};
+ GpuKernelSourceCode _kernel_code{};
+ UnitWorkloadStage _stage{};
+};
+
+/** Hold the generated kernel source code and other information required to compile and run the workload.
+ */
+class GpuWorkloadSourceCode
+{
+public:
+ /** Default constructor */
+ GpuWorkloadSourceCode() = default;
+ /** Add a unit workload to the workload code
+ *
+ * @param[in] kernel_code @ref GpuKernelSourceCode to be contained within the unit workload
+ * @param[in] stage Stage of the unit workload
+ * @param[in] mem_map @ref MemoryDescriptor map for all tensors within the unit workload
+ *
+ * @return UnitWorkloadId Allocated unit workload id
+ */
+ UnitWorkloadId add_unit_workload(const GpuKernelSourceCode &kernel_code, const UnitWorkloadStage &stage, const MemoryDescriptorMap &mem_map)
+ {
+ // Use the current number of unit workloads as the Id
+ const auto uwk_id = static_cast<UnitWorkloadId>(_unit_workloads.size());
+ const auto unit_work = GpuUnitWorkload(uwk_id, kernel_code, stage);
+ _unit_workloads.push_back(unit_work);
+ // Assemble kernel argument with memory descriptor to form workload argument
+ for(const auto &id_arg : kernel_code.arguments())
+ {
+ const auto arg_id = id_arg.first;
+ const auto arg = id_arg.second;
+ _workload_arguments[arg_id] = GpuWorkloadArgument{ *arg.tensor_info(), mem_map.at(arg_id), *arg.kernel_argument_info() };
+ if(_tensor_uwork_map.find(arg_id) == _tensor_uwork_map.end())
+ {
+ _tensor_uwork_map[arg_id] = std::set<UnitWorkloadId>();
+ }
+ _tensor_uwork_map[arg_id].insert(uwk_id);
+ }
+ return uwk_id;
+ }
+ /** Get a unit workload from its id */
+ const GpuUnitWorkload &query_unit_workload(UnitWorkloadId id) const
+ {
+ ARM_COMPUTE_ERROR_ON(id < 0);
+ return _unit_workloads.at(id);
+ }
+ /** Get all unit workloads sorted in topological order */
+ std::vector<UnitWorkloadId> unit_workloads() const
+ {
+ std::vector<UnitWorkloadId> ids{};
+
+ for(const auto &uwk : _unit_workloads)
+ {
+ ids.push_back(uwk.id());
+ }
+ return ids;
+ }
+ /** Get a @ref GpuWorkloadArgument from its associated tensor id */
+ const GpuWorkloadArgument *query_tensor(ITensorInfo::Id t_id) const
+ {
+ return &_workload_arguments.at(t_id);
+ }
+ /** Get all tensors in the entire workload */
+ std::vector<ITensorInfo::Id> tensors() const
+ {
+ std::vector<ITensorInfo::Id> ids{};
+ for(const auto &id_tensor : _workload_arguments)
+ {
+ ids.push_back(id_tensor.first);
+ }
+ return ids;
+ }
+ /** Get all unit workloads connected to the tensor with @p t_id */
+ std::vector<UnitWorkloadId> get_unit_workloads_from_tensor(ITensorInfo::Id t_id) const
+ {
+ const auto unit_work_set = _tensor_uwork_map.at(t_id);
+ return std::vector<UnitWorkloadId>(unit_work_set.begin(), unit_work_set.end());
+ }
+
+private:
+ std::vector<GpuUnitWorkload> _unit_workloads{};
+ std::map<ITensorInfo::Id, GpuWorkloadArgument> _workload_arguments{};
+ std::map<ITensorInfo::Id, std::set<UnitWorkloadId>> _tensor_uwork_map{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSOURCECODE */
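An illustrative assembly-and-traversal sketch, with `kernel_code` and `mem_map` assumed to come from the kernel writer and the sketch respectively; each logical kernel contributes one unit workload, and a runtime later walks unit_workloads() in insertion order:

GpuWorkloadSourceCode workload_code{};
const UnitWorkloadId id = workload_code.add_unit_workload(kernel_code, UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }, mem_map);
for(UnitWorkloadId uwk_id : workload_code.unit_workloads())
{
    const GpuUnitWorkload &uwk = workload_code.query_unit_workload(uwk_id);
    // compile uwk.code() and bind each tensor returned by workload_code.query_tensor(...)
}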
diff --git a/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h b/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h
new file mode 100644
index 0000000000..ae67790b4b
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_IGPUKERNELWRITER
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_IGPUKERNELWRITER
+
+#include "arm_compute/core/CL/CLCompileContext.h"
+#include "arm_compute/core/Window.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+
+#include <map>
+#include <string>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** An interface that can write a gpu kernel
+ */
+class IGpuKernelWriter
+{
+public:
+ /** Destructor */
+ virtual ~IGpuKernelWriter()
+ {
+ }
+ /** Generate kernel name */
+ virtual std::string get_name() = 0;
+ /** Generate kernel code */
+ virtual std::string get_code() = 0;
+ /** Generate build options */
+ virtual CLBuildOptions get_build_options() = 0;
+ /** Generate config id string of the entire kernel. This is used for tuning */
+ virtual std::string get_config_id() = 0;
+ /** Generate execution window */
+ virtual Window get_window() const = 0;
+ /** Get the kernel argument list of the kernel */
+ virtual std::map<ITensorInfo::Id, GpuKernelArgument> get_tensors() = 0;
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_IGPUKERNELWRITER */
diff --git a/src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h b/src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h
new file mode 100644
index 0000000000..f7f0029618
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_GPUKERNELCOMPONENTFACTORY
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_GPUKERNELCOMPONENTFACTORY
+
+#include "Types.h"
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+#include <memory>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Factory class that creates new instances of @ref IGpuKernelComponent by assigning new component ids
+ */
+class GpuKernelComponentFactory
+{
+public:
+ /** Create a new kernel component
+ *
+ * @tparam T Any polymorphic type descending from @ref IGpuKernelComponent
+ * @tparam Args Argument types to construct the kernel component
+ *
+ * @param[in] args Arguments to construct the kernel component
+ *
+ * @return std::unique_ptr<IGpuKernelComponent>
+ */
+ template <typename T, typename... Args>
+ std::unique_ptr<IGpuKernelComponent> create(Args &&... args)
+ {
+ return std::make_unique<T>(_count++, std::forward<Args>(args)...);
+ }
+
+private:
+ ComponentId _count{ 0 };
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_GPUKERNELCOMPONENTFACTORY */
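A minimal sketch of the factory in use, assuming the constructor arguments come from the validated operator; the factory prepends a fresh ComponentId to whatever is passed in:

GpuKernelComponentFactory factory{};
auto conv2d = factory.create<ClComponentDirectConv2d>(properties, tensors, attributes, settings);
auto store  = factory.create<ClComponentStore>(properties, store_tensors);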
diff --git a/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h b/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h
new file mode 100644
index 0000000000..8bb19155a2
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_IGPUKERNELCOMPONENT
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_IGPUKERNELCOMPONENT
+
+#include "Types.h"
+
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Properties common to all kernel component types */
+class KernelProperties
+{
+public:
+ KernelProperties &stage(const UnitWorkloadStage &stage)
+ {
+ _stage = stage;
+ return *this;
+ }
+ UnitWorkloadStage stage() const
+ {
+ return _stage;
+ }
+
+private:
+ UnitWorkloadStage _stage{};
+};
+
+inline bool operator==(const KernelProperties &config0, const KernelProperties &config1)
+{
+ return config0.stage() == config1.stage();
+}
+
+/** Forward declaration */
+class IGpuTemplateComponentWriter;
+
+/** An abstract interface of a component. It enables manipulation by the component graph for purposes like fusion
+ */
+class IGpuKernelComponent
+{
+public:
+ using Properties = KernelProperties;
+
+public:
+ /** Constructor
+ *
+ * @param[in] id Component id
+ * @param[in] properties Kernel component properties
+ * @param[in] tensors Tensor arguments to the components
+ */
+ IGpuKernelComponent(
+ ComponentId id,
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors)
+ : _id{ id },
+ _properties{ properties },
+ _tensors{ tensors }
+ {
+ }
+ /** Destructor */
+ virtual ~IGpuKernelComponent()
+ {
+ }
+ /** Get component id */
+ ComponentId id() const
+ {
+ return _id;
+ }
+ /** Get tensor arguments */
+ ArgumentPack<ITensorInfo> tensors() const
+ {
+ return _tensors;
+ }
+ /** Get properties */
+ Properties properties() const
+ {
+ return _properties;
+ }
+ /** Get template writer for the component */
+ virtual const IGpuTemplateComponentWriter *template_writer() const = 0;
+ /** Get component type */
+ virtual GpuComponentType type() const = 0;
+
+private:
+ ComponentId _id{ -1 };
+ Properties _properties{};
+ ArgumentPack<ITensorInfo> _tensors{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_IGPUKERNELCOMPONENT */
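A minimal sketch of configuring component properties before handing them to a concrete component; here only the unit workload stage is set:

const KernelProperties properties = KernelProperties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });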
diff --git a/src/dynamic_fusion/sketch/gpu/components/Types.h b/src/dynamic_fusion/sketch/gpu/components/Types.h
new file mode 100644
index 0000000000..54b3a69057
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/Types.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_TYPES
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_TYPES
+
+#include <cstdint>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Uniquely identifies a kernel component within a workload
+ */
+using ComponentId = int32_t;
+
+/** Component type in the context of fusion
+ * Its main purpose is to inform the optimizer how to perform fusion.
+ */
+enum class GpuComponentType
+{
+ Complex,
+ Simple,
+ Unfusable,
+ Output
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_TYPES */
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp
new file mode 100644
index 0000000000..e94cfd1581
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ClComponentDirectConv2d.h"
+
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/dynamic_fusion/sketch/OperatorAttributes.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+using Settings = ClComponentDirectConv2dSettings;
+
+Settings &Settings::export_to_cl_image(bool cl_image)
+{
+ _export_to_cl_image = cl_image;
+ return *this;
+}
+
+bool Settings::export_to_cl_image() const
+{
+ return _export_to_cl_image;
+}
+
+Settings &Settings::fast_relaxed_math(bool fast_relaxed_math)
+{
+ _fast_relaxed_math = fast_relaxed_math;
+ return *this;
+}
+
+bool Settings::fast_relaxed_math() const
+{
+ return _fast_relaxed_math;
+}
+
+Status ClComponentDirectConv2d::validate(
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings)
+{
+ ARM_COMPUTE_UNUSED(properties, settings);
+ const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ const auto wei = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ const auto bia = tensors.get_const_tensor(TensorType::ACL_SRC_2);
+ const auto dst = tensors.get_const_tensor(TensorType::ACL_DST_0);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, wei, dst);
+
+ // 1. Check validity
+ // Matching data type
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, wei);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ if(bia != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bia);
+ }
+
+ // Matching data layout
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, wei);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
+ if(bia != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, bia);
+ }
+
+ // All tensor infos are initialized
+ ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(wei->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
+ if(bia != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(bia->tensor_shape().total_size() == 0);
+ }
+ // Device requirements are met
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+ // wei shape is correct
+ const DataLayout data_layout = src->data_layout();
+ const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(wei->dimension(channel_idx) != src->dimension(channel_idx), "Weights feature map dimension should match the respective src's one");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(wei->num_dimensions() > 4, "Weights can be at most 4 dimensional");
+
+ // dst shape is correct
+ PadStrideInfo legacy_pad_stride(attributes.stride().x(), attributes.stride().y(), attributes.pad().left, attributes.pad().right, attributes.pad().top,
+ attributes.pad().bottom, DimensionRoundingType{});
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(),
+ misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, legacy_pad_stride));
+
+ // bia shape is correct
+ if(bia != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(bia->dimension(0) != wei->dimension(3),
+ "Biases size and number of dst feature maps should match");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(bia->num_dimensions() > 1,
+ "Biases should be one dimensional");
+ }
+
+ // 2. Check support level
+ // Data type
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+ // Data layout
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC);
+
+ return Status{};
+}
+
+ClComponentDirectConv2d::ClComponentDirectConv2d(
+ ComponentId id,
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings)
+ : IGpuKernelComponent{ id, properties, tensors },
+ _component_writer{ std::make_unique<ClTemplateDirectConv2d>(id, tensors, attributes, settings) }
+{
+}
+ClComponentDirectConv2d::~ClComponentDirectConv2d()
+{
+}
+const IGpuTemplateComponentWriter *ClComponentDirectConv2d::template_writer() const
+{
+ return _component_writer.get();
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
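A minimal validation sketch, assuming the tensor infos and Conv2dAttributes come from the calling operator code; the argument pack uses the tensor names documented in the component header:

ArgumentPack<ITensorInfo> tensors;
tensors.add_const_tensor(ACL_SRC_0, &src_info);
tensors.add_const_tensor(ACL_SRC_1, &wei_info);
tensors.add_const_tensor(ACL_SRC_2, &bia_info); // bias is optional and may be omitted
tensors.add_const_tensor(ACL_DST_0, &dst_info);
const Status status = ClComponentDirectConv2d::validate(properties, tensors, conv2d_attributes, ClComponentDirectConv2dSettings{});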
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h
new file mode 100644
index 0000000000..fec22b84a5
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDIRECTCONV2D
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDIRECTCONV2D
+
+#include "arm_compute/core/Error.h"
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+#include <memory>
+
+namespace arm_compute
+{
+/** Forward declaration */
+class ITensorInfo;
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+template <typename T>
+class ArgumentPack;
+class Conv2dAttributes;
+
+/** Component specific settings
+ */
+class ClComponentDirectConv2dSettings
+{
+public:
+ /** Set export_to_cl_image flag */
+ ClComponentDirectConv2dSettings &export_to_cl_image(bool cl_image);
+ /** Get export_to_cl_image flag */
+ bool export_to_cl_image() const;
+
+ /** Set fast_relaxed_math flag */
+ ClComponentDirectConv2dSettings &fast_relaxed_math(bool fast_relaxed_math);
+ /** Get fast_relaxed_math flag */
+ bool fast_relaxed_math() const;
+
+private:
+ bool _export_to_cl_image{ false };
+ bool _fast_relaxed_math{ true };
+};
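+
+// Usage sketch (illustrative only): both setters return a reference to the settings object, so they
+// can be chained, e.g.
+//
+//   ClComponentDirectConv2dSettings settings{};
+//   settings.export_to_cl_image(true).fast_relaxed_math(false);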
+
+/** Forward declaration */
+class ClTemplateDirectConv2d;
+
+class ClComponentDirectConv2d final : public IGpuKernelComponent
+{
+public:
+ /** Attributes are a set of backend-agnostic parameters that define what a component does */
+ using Attributes = Conv2dAttributes;
+ /** Settings are a set of backend-specific parameters that influence the implementation of a component */
+ using Settings = ClComponentDirectConv2dSettings;
+
+public:
+ /** Validate the component
+ *
+ * @param[in] properties Component properties
+ * @param[in,out] tensors Tensor arguments to the component
+ * @param[in] attributes Component attributes
+ * @param[in] settings Component settings
+ *
+ * @return Status Validation results
+ *
+ * Tensor argument names:
+ * - ACL_SRC_0: Input
+ * - ACL_SRC_1: Weight
+ * - ACL_SRC_2: Bias (Optional)
+ * - ACL_DST_0: Output
+ *
+ * Tensor argument constness:
+ * - ACL_SRC_0: Const
+ * - ACL_SRC_1: Const
+ * - ACL_SRC_2: Const
+ * - ACL_DST_0: Const
+ *
+ * Valid data layouts:
+ * - NHWC
+ *
+ * Valid data type configurations:
+ * |ACL_SRC_0 |ACL_SRC_1 |ACL_SRC_2 |ACL_DST_0 |
+ * |:--------------|:--------------|:--------------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ */
+ static Status validate(
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings);
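+ // Example (illustrative) of how the tensor arguments are packed for validate(), mirroring what
+ // GpuConv2d::validate_op() does:
+ //
+ //   ArgumentPack<ITensorInfo> tensors;
+ //   tensors.add_const_tensor(ACL_SRC_0, src);
+ //   tensors.add_const_tensor(ACL_SRC_1, wei);
+ //   tensors.add_const_tensor(ACL_SRC_2, bia); // bia may be nullptr
+ //   tensors.add_const_tensor(ACL_DST_0, dst);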
+
+ /** Constructor
+ *
+ * Similar to @ref ClComponentDirectConv2d::validate()
+ */
+ ClComponentDirectConv2d(
+ ComponentId id,
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings);
+
+ /** Destructor */
+ ~ClComponentDirectConv2d() override;
+ /** Prevent instances of this class from being copy constructed */
+ ClComponentDirectConv2d(const ClComponentDirectConv2d &component) = delete;
+ /** Prevent instances of this class from being copied */
+ ClComponentDirectConv2d &operator=(const ClComponentDirectConv2d &component) = delete;
+ /** Allow instances of this class to be move constructed */
+ ClComponentDirectConv2d(ClComponentDirectConv2d &&component) = default;
+ /** Allow instances of this class to be moved */
+ ClComponentDirectConv2d &operator=(ClComponentDirectConv2d &&component) = default;
+ /** Get template writer for the component */
+ const IGpuTemplateComponentWriter *template_writer() const override;
+ /** Get component type */
+ GpuComponentType type() const override
+ {
+ return GpuComponentType::Complex;
+ }
+
+private:
+ std::unique_ptr<ClTemplateDirectConv2d> _component_writer;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDIRECTCONV2D */
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp
new file mode 100644
index 0000000000..f49f397ec1
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ClComponentStore.h"
+
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+Status ClComponentStore::validate(
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors)
+{
+ ARM_COMPUTE_UNUSED(properties, tensors);
+ return Status{};
+}
+ClComponentStore::ClComponentStore(ComponentId id, const Properties &properties, const ArgumentPack<ITensorInfo> &tensors)
+ : IGpuKernelComponent{ id, properties, tensors }, _component_writer{ std::make_unique<ClTemplateStore>(id, tensors) }
+{
+}
+ClComponentStore::~ClComponentStore()
+{
+}
+const IGpuTemplateComponentWriter *ClComponentStore::template_writer() const
+{
+ return _component_writer.get();
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h
new file mode 100644
index 0000000000..bf8c9f031e
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTSTORE
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTSTORE
+
+#include "arm_compute/core/Error.h"
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h"
+#include <memory>
+
+namespace arm_compute
+{
+/** Forward declaration */
+class ITensorInfo;
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+template <typename T>
+class ArgumentPack;
+
+class ClComponentStore final : public IGpuKernelComponent
+{
+public:
+ /** Validate the component
+ *
+ * @param[in] properties Component properties
+ * @param[in] tensors Tensor arguments to the component
+ *
+ * @return Status Validation results
+ *
+ * Tensor argument names:
+ * - ACL_SRC_0: Input
+ * - ACL_DST_0: Output
+ *
+ * Tensor argument constness:
+ * - ACL_SRC_0: Const
+ * - ACL_DST_0: Const
+ *
+ * Valid data layouts:
+ * - NHWC
+ *
+ * Valid data type configurations:
+ * |ACL_SRC_0 |ACL_DST_0 |
+ * |:--------------|:--------------|
+ * |All |All |
+ */
+ static Status validate(
+ const Properties &properties,
+ const ArgumentPack<ITensorInfo> &tensors);
+ /** Constructor
+ *
+ * Similar to @ref ClComponentStore::validate()
+ */
+ ClComponentStore(ComponentId id, const Properties &properties, const ArgumentPack<ITensorInfo> &tensors);
+ /** Destructor */
+ ~ClComponentStore() override;
+ /** Prevent instances of this class from being copy constructed */
+ ClComponentStore(const ClComponentStore &component) = delete;
+ /** Prevent instances of this class from being copied */
+ ClComponentStore &operator=(const ClComponentStore &component) = delete;
+ /** Allow instances of this class to be move constructed */
+ ClComponentStore(ClComponentStore &&component) = default;
+ /** Allow instances of this class to be moved */
+ ClComponentStore &operator=(ClComponentStore &&component) = default;
+ /** Get template writer for the component */
+ const IGpuTemplateComponentWriter *template_writer() const override;
+ /** Get component type */
+ GpuComponentType type() const override
+ {
+ return GpuComponentType::Output;
+ }
+
+private:
+ std::unique_ptr<ClTemplateStore> _component_writer;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTSTORE */
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp
new file mode 100644
index 0000000000..98c1cc3939
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp
@@ -0,0 +1,255 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h"
+
+#include "arm_compute/core/CL/CLCompileContext.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"
+#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+bool export_to_cl_image_support(const ITensorInfo *tensor, GPUTarget gpu_target, const cl::Device &device, DataLayout data_layout)
+{
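+ // Only NHWC tensors whose channel count is a multiple of 4 can be exported to a cl_image,
+ // since a cl_image texel packs 4 elements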
+ if(tensor->tensor_shape()[0] % 4 || (data_layout != DataLayout::NHWC))
+ {
+ return false;
+ }
+
+ // If not floating point
+ if(!is_data_type_float(tensor->data_type()))
+ {
+ return false;
+ }
+
+ if(gpu_target == GPUTarget::G71 || get_arch_from_target(gpu_target) == GPUTarget::MIDGARD)
+ {
+ return false;
+ }
+
+ // Check if the cl_khr_image2d_from_buffer extension is supported on the target platform
+ if(!image2d_from_buffer_supported(device))
+ {
+ return false;
+ }
+
+ // Check cl image pitch alignment
+ if(get_cl_image_pitch_alignment(device) == 0)
+ {
+ return false;
+ }
+
+ const size_t image_w = tensor->tensor_shape()[0] / 4;
+ const size_t image_h = tensor->tensor_shape()[1] * tensor->tensor_shape()[2] * tensor->tensor_shape()[3];
+ const size_t max_image_w = device.getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>();
+ const size_t max_image_h = device.getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>();
+
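+ // Example (illustrative): an NHWC tensor of shape [C = 64, W = 3, H = 3, N = 32] maps to a
+ // cl_image of 64 / 4 = 16 texels per row and 3 * 3 * 32 = 288 rows, which must not exceed the
+ // device limits queried above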
+ if(image_w > max_image_w || image_h > max_image_h)
+ {
+ return false;
+ }
+
+ return true;
+}
+
+GpuOperatorType operator_type = GpuOperatorType::Complex;
+} // namespace
+
+Status GpuConv2d::validate_op(const GpuWorkloadSketch &sketch,
+ const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const ITensorInfo *bia,
+ const ITensorInfo *dst,
+ const Conv2dAttributes &attributes)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, wei, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ !src->has_valid_id() || !wei->has_valid_id() || !dst->has_valid_id());
+ if(bia != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(!bia->has_valid_id());
+ }
+
+ // Perform fusion test
+ // Pack tensor infos
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC_0, src);
+ tensors.add_const_tensor(ACL_SRC_1, wei);
+ tensors.add_const_tensor(ACL_SRC_2, bia);
+ tensors.add_const_tensor(ACL_DST_0, dst);
+ const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!sketch.implementation().operator_group().try_add_operator(op),
+ "Operator fusion test failed. This operator cannot be fused into the workload");
+
+ // Auto initialize dst tensor info
+ TensorInfo dst_info_to_validate = *dst;
+ const auto data_layout = src->data_layout();
+
+ {
+ auto shape = misc::shape_calculator::compute_deep_convolution_shape(src->tensor_shape(), data_layout, wei->tensor_shape(),
+ PadStrideInfo(attributes.stride().x(), attributes.stride().y(), attributes.pad().left,
+ attributes.pad().right,
+ attributes.pad().top, attributes.pad().bottom, DimensionRoundingType::FLOOR)); // use the default DimensionRoundingType
+
+ auto_init_if_empty(dst_info_to_validate, src->clone()->set_tensor_shape(shape));
+ }
+
+ // Check support level
+ // Data type
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+ // Data layout
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC);
+
+ const auto sketch_ctx = sketch.implementation().context();
+
+ const auto gpu_target = sketch_ctx->gpu_target();
+
+ if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+ {
+ const auto cl_compile_ctx = sketch_ctx->cl_compile_context();
+ ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr);
+ // Validate Direct Conv2d Component
+ {
+ const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
+ auto settings = ClComponentDirectConv2d::Settings();
+
+ settings.export_to_cl_image(
+ export_to_cl_image_support(src, gpu_target, cl_compile_ctx->get_device(), data_layout));
+
+ settings.fast_relaxed_math(
+ (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST)
+ && (dst_info_to_validate.data_type() == DataType::F32 || dst_info_to_validate.data_type() == DataType::F16));
+
+ ArgumentPack<ITensorInfo> arguments;
+ arguments.add_const_tensor(ACL_SRC_0, src);
+ arguments.add_const_tensor(ACL_SRC_1, wei);
+ arguments.add_const_tensor(ACL_SRC_2, bia);
+ arguments.add_const_tensor(ACL_DST_0, &dst_info_to_validate);
+ ARM_COMPUTE_RETURN_ON_ERROR(ClComponentDirectConv2d::validate(properties, arguments, attributes, settings));
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_MSG("Unimplemented Gpu language");
+ }
+ return Status{};
+}
+
+void GpuConv2d::create_op(GpuWorkloadSketch &sketch,
+ ITensorInfo *src,
+ ITensorInfo *wei,
+ ITensorInfo *bia,
+ ITensorInfo *dst,
+ const Conv2dAttributes &attributes)
+{
+ // Assert validation
+ ARM_COMPUTE_ERROR_THROW_ON(GpuConv2d::validate_op(sketch, src, wei, bia, dst, attributes));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, wei, dst);
+ const auto data_layout = src->data_layout();
+
+ // Auto initialize dst tensor
+ {
+ auto shape = misc::shape_calculator::compute_deep_convolution_shape(src->tensor_shape(), data_layout, wei->tensor_shape(),
+ PadStrideInfo(attributes.stride().x(), attributes.stride().y(), attributes.pad().left,
+ attributes.pad().right,
+ attributes.pad().top, attributes.pad().bottom, DimensionRoundingType::FLOOR)); // use the default DimensionRoundingType
+
+ auto_init_if_empty(*dst, src->clone()->set_tensor_shape(shape));
+ }
+
+ // Translate into components and add to component graph
+ auto &comp_graph = sketch.implementation().component_graph();
+
+ const auto sketch_ctx = sketch.implementation().context();
+
+ const auto gpu_target = sketch_ctx->gpu_target();
+
+ if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+ {
+ const auto cl_compile_ctx = sketch_ctx->cl_compile_context();
+ ARM_COMPUTE_ERROR_ON(cl_compile_ctx == nullptr);
+
+ // Add Direct Conv2d Component
+ {
+ auto properties = IGpuKernelComponent::Properties();
+ properties.stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
+
+ auto settings = ClComponentDirectConv2d::Settings();
+
+ settings.export_to_cl_image(
+ export_to_cl_image_support(src, gpu_target, cl_compile_ctx->get_device(), data_layout));
+
+ settings.fast_relaxed_math(
+ (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST)
+ && (dst->data_type() == DataType::F32 || dst->data_type() == DataType::F16));
+
+ if(settings.export_to_cl_image())
+ {
+ arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(wei);
+ }
+
+ ArgumentPack<ITensorInfo> arguments;
+ arguments.add_const_tensor(ACL_SRC_0, src);
+ arguments.add_const_tensor(ACL_SRC_1, wei);
+ arguments.add_const_tensor(ACL_SRC_2, bia);
+ arguments.add_const_tensor(ACL_DST_0, dst);
+ comp_graph.add_new_component<ClComponentDirectConv2d>(properties, arguments, attributes, settings);
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unimplemented Gpu language");
+ }
+
+ // Set up fusion test by adding to the Operator Group
+ // Note this has to be performed after all the components have been successfully added to the component graph
+
+ // Pack tensor infos
+ ArgumentPack<ITensorInfo> tensors;
+ tensors.add_const_tensor(ACL_SRC_0, src);
+ tensors.add_tensor(ACL_SRC_1, wei);
+ tensors.add_const_tensor(ACL_SRC_2, bia);
+ tensors.add_tensor(ACL_DST_0, dst);
+
+ const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+ sketch.implementation().operator_group().add_operator(op);
+}
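+
+// A usage sketch (illustrative; it assumes a GpuWorkloadContext and the tensor infos are created
+// elsewhere, and that Conv2dAttributes exposes fluent setters as declared in OperatorAttributes.h):
+//
+//   GpuWorkloadSketch sketch{ &context };
+//   const Conv2dAttributes attributes = Conv2dAttributes().stride({ 1, 1 }).pad({ 0, 0, 0, 0 });
+//   const Status status = GpuConv2d::validate_op(sketch, &src, &wei, &bia, &dst, attributes);
+//   if(status.error_code() == ErrorCode::OK)
+//   {
+//       GpuConv2d::create_op(sketch, &src, &wei, &bia, &dst, attributes);
+//   }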
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp
new file mode 100644
index 0000000000..13c0b141a5
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "GpuKernelVariableTable.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/ITensorInfo.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+void GpuKernelVariableTable::declare_variable(const ITensorInfo *tensor, GpuKernelArgumentInfo argument_info, bool is_interm, const std::string &alias)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(!tensor->has_valid_id(), "Tensor info with valid id expected");
+ // Do not re-declare if the variable associated with the tensor has already been declared
+ if(get_variable(tensor).has_valid_id())
+ {
+ ARM_COMPUTE_ERROR_ON(!(get_variable(tensor).kernel_argument_info == argument_info));
+ return;
+ }
+ // Declare variable associated with the tensor
+ std::stringstream ss;
+ ss << alias << "_t" << tensor->id();
+ const auto uniq_name = ss.str();
+ TensorVariable var{ tensor->id(), uniq_name, argument_info };
+
+ if(is_interm)
+ {
+ _interm_var = var;
+ _interm_tensors.insert(tensor->id());
+ }
+ else
+ {
+ _vars.emplace(tensor->id(), var);
+ }
+}
+
+GpuKernelVariableTable::TensorVariable GpuKernelVariableTable::get_variable(const ITensorInfo *tensor) const
+{
+ const TensorVariable empty_var{};
+ if(_vars.find(tensor->id()) != _vars.end())
+ {
+ return _vars.at(tensor->id());
+ }
+ if(_interm_tensors.find(tensor->id()) != _interm_tensors.end())
+ {
+ return _interm_var;
+ }
+ return empty_var;
+}
+
+GpuKernelVariableTable::VariableList GpuKernelVariableTable::get_variable_list(const std::vector<const ITensorInfo *> &tensors) const
+{
+ VariableList vars{};
+ for(const auto &tensor : tensors)
+ {
+ if(!tensor->has_valid_id())
+ {
+ continue;
+ }
+ vars.push_back(get_variable(tensor));
+ }
+ return vars;
+}
+
+TagVal::TagVal(const GpuKernelVariableTable::TensorVariable &var)
+ : value{ var.uniq_name }
+{
+}
+
+TagVal::TagVal(const std::string &val)
+ : value{ val }
+{
+}
+
+TagVal::TagVal(const char *val)
+ : value{ std::string(val) }
+{
+}
+
+TagVal::TagVal(const DataType &data_type)
+ : value{ get_cl_type_from_data_type(data_type) }
+{
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h b/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h
new file mode 100644
index 0000000000..4eee3963c2
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_GPUKERNELVARIABLETABLE
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_GPUKERNELVARIABLETABLE
+
+#include "arm_compute/core/ITensorInfo.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "support/Requires.h"
+#include "support/StringSupport.h"
+
+#include <set>
+#include <string>
+#include <type_traits>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** A table of all the variables used in the kernel
+ * Since fusion is restricted to a linear sequence of components in a kernel, only a single "intermediate variable" (the accumulator) is allowed.
+ * Each kernel has exactly one variable table
+ */
+class GpuKernelVariableTable
+{
+public:
+ /** A tensor variable whose main purposes are:
+ * - Hold the newly assigned @ref GpuKernelArgumentInfo for the associated tensor info
+ * - Hold the generated variable name for the associated tensor info
+ */
+ struct TensorVariable
+ {
+ public:
+ TensorVariable() = default;
+ TensorVariable(const TensorVariable &) = default;
+ TensorVariable &operator=(const TensorVariable &) = default;
+ ITensorInfo::Id id{ ITensorInfo::invalid_tensor_id };
+ std::string uniq_name{ "empty" }; // Unique name, also the final variable name used in the built code
+ GpuKernelArgumentInfo kernel_argument_info{};
+ bool has_valid_id() const
+ {
+ return id != ITensorInfo::invalid_tensor_id;
+ }
+ };
+ using VariableList = std::vector<TensorVariable>;
+
+public:
+ /** Declare a @ref TensorVariable for a corresponding tensor info.
+ *
+ * @note: Later re-declaration of the intermediate variable will overwrite the previous association to the @ref ITensorInfo
+     * Therefore, the order of declaration is important. It is assumed that the components declaring the variable are already in the correct order
+ *
+ * @param[in] tensor Tensor info with which the new variable is associated
+ * @param[in] argument_info Kernel argument information
+ * @param[in] is_interm If the new variable is an intermediate variable
+ * @param[in] alias Alias for the variable. Will be used as part of the variable name
+ */
+ void declare_variable(const ITensorInfo *tensor, GpuKernelArgumentInfo argument_info, bool is_interm = false, const std::string &alias = "unnamed");
+ /** Get the @ref TensorVariable associated with @p tensor
+ *
+ * @param[in] tensor Tensor info to be queried
+ *
+ * @return TensorVariable
+ */
+ TensorVariable get_variable(const ITensorInfo *tensor) const;
+ /** Get the @ref TensorVariable list associated with @p tensors
+ * @note Empty tensors are skipped
+ *
+ * @param[in] tensors List of tensor infos to be queried
+ *
+ * @return VariableList
+ */
+ VariableList get_variable_list(const std::vector<const ITensorInfo *> &tensors) const;
+
+private:
+ std::map<ITensorInfo::Id, TensorVariable> _vars{}; /**< Non-intermediate (function parameter) variables*/
+ TensorVariable _interm_var{}; /**< Intermediate variable */
+ std::set<ITensorInfo::Id> _interm_tensors{}; /**< Tensors associated with the single intermediate variable */
+};
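+
+// Usage sketch (illustrative; assumes a valid, non-intermediate ITensorInfo *src):
+//
+//   GpuKernelVariableTable vtable{};
+//   vtable.declare_variable(src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), false, "src");
+//   const auto var = vtable.get_variable(src); // var.uniq_name is "src_t<id of src>"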
+
+/** A tag value will substitute a tag in a string template during its instantiation */
+struct TagVal
+{
+ /** Default constructor */
+ TagVal() = default;
+ /** Construct a @ref TagVal from a @ref GpuKernelVariableTable::TensorVariable */
+ TagVal(const GpuKernelVariableTable::TensorVariable &var);
+ /** Construct a @ref TagVal from an integral type */
+ template <typename T, ARM_COMPUTE_REQUIRES_TA(std::is_integral<T>::value)>
+ TagVal(T val)
+ : value{ support::cpp11::to_string(val) }
+ {
+ }
+ /** Construct a @ref TagVal from a string */
+ TagVal(const std::string &val);
+ /** Construct a @ref TagVal from a c-style string */
+ TagVal(const char *val);
+ /** Construct a @ref TagVal from a @ref DataType */
+ TagVal(const DataType &data_type);
+    /** The value of the TagVal, converted to a string */
+ std::string value{};
+};
+
+/** A tag used in a string template is a placeholder string to be substituted by real values during template instantiation */
+using Tag = std::string;
+
+/** Tag lookup table. It is used to instantiate a string template */
+using TagLUT = std::unordered_map<Tag, TagVal>;
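+
+// Example (illustrative): for a template fragment such as "TILE({{DATA_TYPE}}, M0, N0, {{dst}});",
+// a lookup table populated as
+//
+//   TagLUT lut{};
+//   lut["DATA_TYPE"] = DataType::F32;            // stored as the string "float"
+//   lut["dst"]       = vtable.get_variable(dst); // stored as the variable's unique name
+//
+// supplies the values substituted for the {{...}} tags when the template is instantiated.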
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_GPUKERNELVARIABLETABLE */
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h b/src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h
new file mode 100644
index 0000000000..c85ddf5a2c
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_IGPUTEMPLATECOMPONENTWRITER
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_IGPUTEMPLATECOMPONENTWRITER
+
+#include "arm_compute/core/CL/CLCompileContext.h"
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/Window.h"
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/components/Types.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+class GpuKernelComponentGroup;
+class GpuKernelVariableTable;
+
+/** An interface used by @ref ClTemplateWriter to write source code for a kernel component
+ */
+class IGpuTemplateComponentWriter
+{
+public:
+ using ComponentGroup = GpuKernelComponentGroup;
+
+public:
+ /** Constructor
+ *
+ * @param[in] id Component id
+     * @param[in] tensors Tensor arguments to the component
+ */
+ IGpuTemplateComponentWriter(ComponentId id, const ArgumentPack<ITensorInfo> tensors)
+ : _id{ id }, _tensors{ tensors }
+ {
+ }
+ /** Destructor */
+ virtual ~IGpuTemplateComponentWriter()
+ {
+ }
+ /** Generate kernel component name */
+ virtual std::string get_name() const = 0;
+ /** Generate kernel component code template
+ *
+     * @param[in] comp_group Component group of which the component is a part
+ *
+ * @return std::string Component code
+ */
+ virtual std::string get_component_code(const ComponentGroup &comp_group) const = 0;
+ /** Declare all variables used by the component in the @p vtable
+ *
+ * @param[out] vtable Variable table
+     * @param[in] comp_group Component group of which the component is a part
+ */
+ virtual void declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const = 0;
+ /** Generate the tag look-up table used to instantiate the component code.
+ *
+ * @param[in] vtable Variable table
+     * @param[in] comp_group Component group of which the component is a part
+ *
+ * @return TagLUT Tag lookup table
+ */
+ virtual TagLUT get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const = 0;
+ /** Generate additional macros used in the component */
+ virtual std::string get_additional_macros() const
+ {
+ return "";
+ }
+ /** Generate the build options used in the component
+ *
+     * @param[in] comp_group Component group of which the component is a part
+ *
+ * @return CLBuildOptions Build options
+ */
+ virtual CLBuildOptions get_build_options(const ComponentGroup &comp_group) const
+ {
+ ARM_COMPUTE_UNUSED(comp_group);
+ return CLBuildOptions{};
+ }
+ /** Generate the component config id string used for tuning */
+ virtual std::string get_config_id() const
+ {
+ return "";
+ }
+ /** Generate the header list used in the component */
+ virtual std::set<std::string> get_headers_list() const
+ {
+ return std::set<std::string> {};
+ }
+ /** Generate the execution window for the component */
+ virtual Window get_window() const
+ {
+ return Window{};
+ }
+ /** Get tensor arguments */
+ ArgumentPack<ITensorInfo> tensors() const
+ {
+ return _tensors;
+ }
+ /** Get component id */
+ ComponentId id() const
+ {
+ return _id;
+ }
+
+private:
+ ComponentId _id{ -1 };
+ ArgumentPack<ITensorInfo> _tensors{};
+};
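+
+// A minimal writer sketch (illustrative only, not part of this patch), showing the four pure
+// virtual methods a concrete writer has to provide:
+//
+//   class ClTemplatePassthrough final : public IGpuTemplateComponentWriter
+//   {
+//   public:
+//       using IGpuTemplateComponentWriter::IGpuTemplateComponentWriter;
+//       std::string get_name() const override { return "passthrough"; }
+//       std::string get_component_code(const ComponentGroup &comp_group) const override
+//       {
+//           ARM_COMPUTE_UNUSED(comp_group);
+//           return "// {{meta_kernel_id}}: pass-through\n";
+//       }
+//       void declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override
+//       {
+//           const auto src = tensors().get_const_tensor(TensorType::ACL_SRC_0);
+//           vtable.declare_variable(src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+//                                   comp_group.is_intermediate_tensor(src), "src");
+//       }
+//       TagLUT get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override
+//       {
+//           ARM_COMPUTE_UNUSED(vtable, comp_group);
+//           TagLUT lut{};
+//           lut["meta_kernel_id"] = id();
+//           return lut;
+//       }
+//   };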
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_IGPUTEMPLATECOMPONENTWRITER */
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp
new file mode 100644
index 0000000000..870de64eb8
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp
@@ -0,0 +1,400 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ClTemplateDirectConv2d.h"
+
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"
+
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+ClTemplateDirectConv2d::ClTemplateDirectConv2d(ComponentId id,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings)
+ : IGpuTemplateComponentWriter{ id, tensors },
+ _src{},
+ _weight{},
+ _bias{},
+ _dst{},
+ _attributes{ attributes },
+ _settings{ settings }
+{
+ _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
+ _weight = this->tensors().get_const_tensor(TensorType::ACL_SRC_1);
+ if(this->tensors().get_const_tensor(TensorType::ACL_SRC_2))
+ {
+ _bias = this->tensors().get_const_tensor(TensorType::ACL_SRC_2);
+ }
+ _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _weight, _dst);
+}
+
+std::string ClTemplateDirectConv2d::get_name() const
+{
+ return "direct_conv2d";
+}
+
+std::string ClTemplateDirectConv2d::get_component_code(const ComponentGroup &comp_group) const
+{
+ ARM_COMPUTE_UNUSED(comp_group);
+
+ const auto channel_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::CHANNEL);
+ const auto k0 = adjust_vec_size(is_data_type_quantized(_src->data_type()) ? 16u : 8u, _src->dimension(channel_idx));
+ const bool leftover_loop = (_src->dimension(channel_idx) % k0) != 0;
+
+ std::string code = R"_(
+//------------------ START KERNEL {{meta_kernel_id}} ---------------------
+// IN_0(src) {{src}}
+// IN_1(wei) {{weight}}
+)_";
+ if(_bias && _bias->has_valid_id())
+ {
+ code += R"_(
+// IN_1(bia) {{bias}}
+)_";
+ }
+ code += R"_(
+// OUT(dst, accum) {{dst}}
+
+// Initialize the accumulators
+TILE({{ACC_DATA_TYPE}}, M0, N0, {{dst}});
+{
+ // All the tensor dimensions are passed at compile time.
+ // In case of dynamic tensor support, the following dimensions should be passed as function argument.
+#define _IWEI_WIDTH {{WEI_WIDTH}}
+#define _IWEI_HEIGHT {{WEI_HEIGHT}}
+#define _ISRC_WIDTH {{src}}_w
+#define _ISRC_HEIGHT {{src}}_h
+#define _ISRC_CHANNELS {{src}}_c
+#define _IDST_WIDTH {{arg_dst}}_w
+#define _IDST_HEIGHT {{arg_dst}}_h
+#define _IDST_CHANNELS {{arg_dst}}_c
+#define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT)
+
+ // .v = access the whole vector (OpenCL vector)
+ // .s[x] = access the vector element at position x (scalar access)
+ TILE(int, M0, 1, xi);
+ TILE(int, M0, 1, yi);
+
+ // Convert the linear index to coordinate
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ xi[i].v = ((g_ind_1 + i) % _IDST_WIDTH) * {{STRIDE_X}};
+ yi[i].v = ((g_ind_1 + i) / _IDST_WIDTH) * {{STRIDE_Y}};
+ xi[i].v -= {{PAD_LEFT}};
+ yi[i].v -= {{PAD_TOP}};
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ {{dst}}[i].v = 0;
+ })
+
+ for(int i = 0; i < (_IWEI_WIDTH * _IWEI_HEIGHT); ++i)
+ {
+ int ck = 0;
+ int xk = i % _IWEI_WIDTH;
+ int yk = i / _IWEI_WIDTH;
+
+ int k = 0;
+ for(; k <= (_ISRC_CHANNELS - K0); k += K0)
+ {
+ TILE({{SRC_DATA_TYPE}}, M0, K0, a);
+ TILE({{WEI_DATA_TYPE}}, N0, K0, b);
+
+ // Initialize tiles
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = {{ZERO_VALUE}};
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ b[i].v = {{ZERO_VALUE}};
+ })
+
+ // Load tile from the src tensor
+ T_LOAD_NHWC_INDIRECT({{SRC_DATA_TYPE}}, M0, K0, {{SRC_TENSOR_TYPE}}, {{src}}, g_ind_2, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, {{src}}_stride_y, xi, yi, a);
+
+ // Load tile from the weights tensor
+ T_LOAD({{WEI_DATA_TYPE}}, N0, K0, {{WEI_TENSOR_TYPE}}, {{weight}}, ck, g_ind_0 * _IY_MULTIPLIER + i, _IY_MULTIPLIER, {{weight}}_stride_y, b);
+
+ // Compute the matrix multiplication between two tiles
+ T_MMUL({{SRC_DATA_TYPE}}, {{WEI_DATA_TYPE}}, {{ACC_DATA_TYPE}}, M0, N0, K0, NT, T, a, b, {{dst}});
+
+ ck += K0;
+ }
+
+        // The left-over channel accumulations below are generated only when the src channel count is not a
+        // multiple of K0 (see leftover_loop); this should be revisited in case of dynamic tensor support
+)_";
+
+ if(leftover_loop)
+ {
+ code += R"_(
+ // Left-over accumulations
+ for(; k < _ISRC_CHANNELS; ++k)
+ {
+ TILE({{SRC_DATA_TYPE}}, M0, 1, a);
+ TILE({{WEI_DATA_TYPE}}, N0, 1, b);
+
+ // Initialize tiles
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ a[i].v = {{ZERO_VALUE}};
+ })
+
+ LOOP_UNROLLING(int, i, 0, 1, N0,
+ {
+ b[i].v = {{ZERO_VALUE}};
+ })
+
+ // Load tile from the src tensor
+ T_LOAD_NHWC_INDIRECT({{SRC_DATA_TYPE}}, M0, 1, {{SRC_TENSOR_TYPE}}, {{src}}, g_ind_2, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, {{src}}_stride_y, xi, yi, a);
+
+ // Load tile from the weights tensor
+ // The T_LOAD for the left-over elements can only use BUFFER because we load one element per iteration
+ T_LOAD({{WEI_DATA_TYPE}}, N0, 1, BUFFER, {{weight}}, ck, g_ind_0 * _IY_MULTIPLIER + i, _IY_MULTIPLIER, {{weight}}_stride_y, b);
+
+ // Compute the matrix multiplication between two tiles
+ T_MMUL({{SRC_DATA_TYPE}}, {{WEI_DATA_TYPE}}, {{ACC_DATA_TYPE}}, M0, N0, 1, NT, T, a, b, {{dst}});
+
+ ++ck;
+ }
+ )_";
+    }
+
+    code += R"_(
+#undef _IWEI_WIDTH
+#undef _IWEI_HEIGHT
+#undef _ISRC_WIDTH
+#undef _ISRC_HEIGHT
+#undef _ISRC_CHANNELS
+#undef _IDST_WIDTH
+#undef _IDST_HEIGHT
+#undef _IDST_CHANNELS
+#undef _IY_MULTIPLIER
+
+ }
+)_";
+
+ if(_bias && _bias->has_valid_id())
+ {
+ code += R"_(
+ TILE({{BIA_DATA_TYPE}}, 1, N0, bias0);
+
+ T_LOAD({{BIA_DATA_TYPE}}, 1, N0, BUFFER, {{bias}}, g_ind_0, 0, 1, 0, bias0);
+
+ // c = c + bias[broadcasted]
+ T_ELTWISE_BROADCAST_ADD_X({{ACC_DATA_TYPE}}, M0, N0, {{dst}}, bias0, {{dst}});
+ )_";
+    }
+
+    code += R"_(
+}
+//------------------ END KERNEL {{meta_kernel_id}} ---------------------
+)_";
+ return code;
+}
+
+void ClTemplateDirectConv2d::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
+{
+ vtable.declare_variable(
+ _src,
+ GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+ comp_group.is_intermediate_tensor(_src),
+ "src");
+
+ const GpuKernelArgumentInfo::Type weight_type = _settings.export_to_cl_image() ? GpuKernelArgumentInfo::Type::Tensor_4D_t_Image : GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer;
+ vtable.declare_variable(
+ _weight,
+ GpuKernelArgumentInfo(weight_type),
+ comp_group.is_intermediate_tensor(_weight),
+ "weight");
+
+ if(_bias && _bias->has_valid_id()) // optional bias
+ {
+ vtable.declare_variable(
+ _bias,
+ GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Vector),
+ comp_group.is_intermediate_tensor(_bias),
+ "bias");
+ }
+ vtable.declare_variable(
+ _dst,
+ GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+ comp_group.is_intermediate_tensor(_dst),
+ "dst");
+}
+
+TagLUT ClTemplateDirectConv2d::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
+{
+ TagLUT lut{};
+ // Arguments and global shared variables
+ lut["src"] = vtable.get_variable(_src);
+ lut["weight"] = vtable.get_variable(_weight);
+
+ if(_bias && _bias->has_valid_id()) // optional bias
+ {
+ lut["bias"] = vtable.get_variable(_bias);
+ lut["BIA_DATA_TYPE"] = get_cl_type_from_data_type(_bias->data_type());
+ }
+ lut["dst"] = vtable.get_variable(_dst);
+
+ const auto dst_argument = vtable.get_variable(comp_group.get_dst_tensors()[0]);
+ lut["arg_dst"] = dst_argument.uniq_name;
+
+ // Local build options
+ lut["meta_kernel_id"] = id();
+ lut["ACC_DATA_TYPE"] = _src->data_type();
+ lut["SRC_DATA_TYPE"] = _src->data_type();
+ lut["WEI_DATA_TYPE"] = _weight->data_type();
+
+ lut["SRC_TENSOR_TYPE"] = "BUFFER";
+ switch(vtable.get_variable(_weight).kernel_argument_info.type)
+ {
+ case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D:
+ case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D:
+ case GpuKernelArgumentInfo::Type::Tensor_4D_t_Image:
+ {
+ lut["WEI_TENSOR_TYPE"] = "IMAGE";
+ break;
+ }
+ default:
+ {
+ lut["WEI_TENSOR_TYPE"] = "BUFFER";
+ break;
+ }
+ }
+ const auto width_idx = 1;
+ const auto height_idx = 2;
+ lut["WEI_WIDTH"] = _weight->dimension(width_idx);
+ lut["WEI_HEIGHT"] = _weight->dimension(height_idx);
+
+ lut["STRIDE_X"] = _attributes.stride().x();
+ lut["STRIDE_Y"] = _attributes.stride().y();
+
+ lut["PAD_LEFT"] = _attributes.pad().left;
+ lut["PAD_TOP"] = _attributes.pad().top;
+
+ lut["ZERO_VALUE"] = 0;
+
+ return lut;
+}
+
+CLBuildOptions ClTemplateDirectConv2d::get_build_options(const ComponentGroup &comp_group) const
+{
+ const unsigned int channel_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::CHANNEL);
+ const DataType data_type = _src->data_type();
+
+    /// NOTE: For now the tile sizes (n0, m0) are set by the execution window. This may change in the future
+ const auto root_window = comp_group.get_root_component()->template_writer()->get_window();
+ const unsigned int n0 = root_window.x().step();
+ const unsigned int m0 = root_window.y().step();
+ const unsigned int k0 = adjust_vec_size(is_data_type_quantized(data_type) ? 16u : 8u, _src->dimension(channel_idx));
+ const unsigned int partial_store_n0 = _dst->dimension(0) % n0;
+
+ CLBuildOptions build_opts{};
+ if(_settings.fast_relaxed_math())
+ {
+ build_opts.add_option("-cl-fast-relaxed-math");
+ }
+ else
+ {
+ // -cl-fast-relaxed-math also sets -cl-finite-math-only and -cl-unsafe-math-optimizations
+ // to disable -cl-finite-math-only, we only include -cl-unsafe-math-optimizations
+ build_opts.add_option("-cl-unsafe-math-optimizations");
+ }
+ build_opts.add_option("-DIS_TILED");
+ build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
+ build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
+ build_opts.add_option("-DK0=" + support::cpp11::to_string(k0));
+ build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));
+
+ return build_opts;
+}
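+
+// For example (illustrative): with a root window stepping n0 = 4 and m0 = 2 and a dst channel count
+// of 30, the emitted options include -DN0=4 -DM0=2 -DPARTIAL_N0=2 (i.e. 30 % 4), alongside
+// -DIS_TILED, -DK0=<k0> and the math option selected above.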
+
+std::string ClTemplateDirectConv2d::get_config_id() const
+{
+ const DataType data_type = _src->data_type();
+ const DataLayout data_layout = _src->data_layout();
+
+ const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+ const unsigned int kernel_size = _weight->dimension(width_idx);
+
+ std::string config_id{};
+ config_id += lower_string(string_from_data_type(data_type));
+ config_id += "_";
+ config_id += support::cpp11::to_string(kernel_size);
+ config_id += "_";
+ config_id += support::cpp11::to_string(_attributes.stride().x());
+ config_id += "_";
+ config_id += support::cpp11::to_string(_attributes.stride().y());
+ config_id += "_";
+ config_id += support::cpp11::to_string(_dst->dimension(width_idx));
+ config_id += "_";
+ config_id += support::cpp11::to_string(_dst->dimension(height_idx));
+ config_id += "_";
+ config_id += lower_string(string_from_data_layout(data_layout));
+ return config_id;
+}
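+
+// For example (illustrative): an F32 NHWC convolution with a 3x3 kernel, stride 1x1 and a 112x112
+// output produces the config id "f32_3_1_1_112_112_nhwc".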
+
+std::set<std::string> ClTemplateDirectConv2d::get_headers_list() const
+{
+ return std::set<std::string>{ "helpers.h", "tile_helpers.h" };
+}
+
+Window ClTemplateDirectConv2d::get_window() const
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
+
+ const auto output_shape = _dst->tensor_shape();
+
+ const unsigned int vec_size = std::min(static_cast<unsigned int>(output_shape[0]), 4u);
+ const unsigned int num_rows = (_dst->tensor_shape()[0] > 16) ? ((_src->data_type() == DataType::F32) ? 2U : 4U) : 1U;
+
+ // Create and configure kernel window
+ Window win = calculate_max_window(output_shape, Steps(vec_size, num_rows));
+
+ const size_t dim_y_collapsed = ceil_to_multiple(output_shape[1] * output_shape[2], num_rows);
+ win.set(Window::DimY, Window::Dimension(0, dim_y_collapsed, num_rows));
+ win.set(Window::DimZ, Window::Dimension(0, output_shape.total_size_upper(3), 1));
+
+ return win;
+}
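+
+// Worked example (illustrative): for an NHWC dst of shape [C = 32, W = 7, H = 7, N = 1] with an F32
+// src, vec_size = 4 and num_rows = 2, so DimX steps over the channels in chunks of 4 and DimY covers
+// ceil_to_multiple(7 * 7, 2) = 50 collapsed width * height positions in steps of 2.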
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h
new file mode 100644
index 0000000000..48027a9b8d
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEDIRECTCONV2D
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEDIRECTCONV2D
+
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/dynamic_fusion/sketch/OperatorAttributes.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class ClTemplateDirectConv2d final : public IGpuTemplateComponentWriter
+{
+public:
+ using Attributes = ClComponentDirectConv2d::Attributes;
+ using Settings = ClComponentDirectConv2d::Settings;
+ /** Constructor
+ *
+ * Similar to @ref ClComponentDirectConv2d::validate()
+ *
+ * @param[in] id Component id
+     * @param[in] tensors Tensor arguments to the component
+ * @param[in] attributes Component attributes
+ * @param[in] settings Component settings
+ */
+ ClTemplateDirectConv2d(ComponentId id,
+ const ArgumentPack<ITensorInfo> &tensors,
+ const Attributes &attributes,
+ const Settings &settings);
+ /** Prevent instances of this class from being copy constructed */
+ ClTemplateDirectConv2d(const ClTemplateDirectConv2d &direct_conv2d) = delete;
+ /** Prevent instances of this class from being copied */
+ ClTemplateDirectConv2d &operator=(const ClTemplateDirectConv2d &direct_conv2d) = delete;
+ /** Allow instances of this class to be move constructed */
+ ClTemplateDirectConv2d(ClTemplateDirectConv2d &&direct_conv2d) = default;
+ /** Allow instances of this class to be moved */
+ ClTemplateDirectConv2d &operator=(ClTemplateDirectConv2d &&direct_conv2d) = default;
+ /** Generate kernel component name */
+ std::string get_name() const override;
+ /** Generate kernel component code template
+ *
+     * @param[in] comp_group Component group of which the component is a part
+ *
+ * @return std::string Component code
+ */
+ std::string get_component_code(const ComponentGroup &comp_group) const override;
+ /** Declare all variables used by the component in the @p vtable
+ *
+ * @param[out] vtable Variable table
+     * @param[in] comp_group Component group of which the component is a part
+ */
+ void declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
+ /** Generate the tag look-up table used to instantiate the component code.
+ *
+ * @param[in] vtable Variable table
+     * @param[in] comp_group Component group of which the component is a part
+ *
+ * @return TagLUT Tag lookup table
+ */
+ TagLUT get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
+ /** Generate the build options used in the component
+ *
+     * @param[in] comp_group Component group of which the component is a part
+ *
+ * @return CLBuildOptions Build options
+ */
+ CLBuildOptions get_build_options(const ComponentGroup &comp_group) const override;
+ /** Generate the component config id string used for tuning */
+ std::string get_config_id() const override;
+ /** Generate the header list used in the component */
+ std::set<std::string> get_headers_list() const override;
+ /** Generate the execution window for the component */
+ Window get_window() const override;
+
+private:
+ const ITensorInfo *_src;
+ const ITensorInfo *_weight;
+ const ITensorInfo *_bias;
+ const ITensorInfo *_dst;
+ Attributes _attributes;
+ Settings _settings;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEDIRECTCONV2D */
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp
new file mode 100644
index 0000000000..6c4b8f52f2
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ClTemplateStore.h"
+
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+ClTemplateStore::ClTemplateStore(ComponentId id, const ArgumentPack<ITensorInfo> &tensors)
+ : IGpuTemplateComponentWriter{ id, tensors }, _src{}, _dst{}
+{
+ _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
+ _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
+}
+
+std::string ClTemplateStore::get_name() const
+{
+ return "store";
+}
+
+std::string ClTemplateStore::get_component_code(const ComponentGroup &comp_group) const
+{
+ ARM_COMPUTE_UNUSED(comp_group);
+ return R"_(
+//------------------ START KERNEL {{meta_kernel_id}} STORE ---------------------
+{
+// This also follows NHWC layout
+// g_ind_0 maps to global_id(0) maps to Channel
+// g_ind_1 maps to global_id(1) maps to Height and Width (Collapsed Window)
+// g_ind_2 maps to global_id(2) maps to N / Batch
+#define _IDST_WIDTH {{dst}}_w
+#define _IDST_HEIGHT {{dst}}_h
+ TILE(uint, M0, 1, dst_indirect_y);
+
+ // Calculate the destination indirect Y
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ dst_indirect_y[i].v = (uint)min(g_ind_1 + i, (int)(_IDST_WIDTH * _IDST_HEIGHT) - 1);
+ dst_indirect_y[i].v += g_ind_2 * (int)(_IDST_WIDTH * _IDST_HEIGHT);
+ })
+
+ bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
+
+ T_STORE_INDIRECT_WIDTH_SELECT({{DST_DATA_TYPE}}, M0, N0, PARTIAL_N0, {{DST_TENSOR_TYPE}}, {{dst}}, g_ind_0, {{dst}}_stride_y, x_cond, {{src}}, dst_indirect_y);
+
+#undef _IDST_WIDTH
+#undef _IDST_HEIGHT
+ //------------------ END KERNEL {{meta_kernel_id}} STORE ---------------------
+}
+
+)_";
+}
+
+void ClTemplateStore::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
+{
+ vtable.declare_variable(
+ _src,
+ GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+ comp_group.is_intermediate_tensor(_src),
+ "src");
+ vtable.declare_variable(
+ _dst,
+ GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+ comp_group.is_intermediate_tensor(_dst),
+ "dst");
+}
+
+TagLUT ClTemplateStore::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
+{
+ TagLUT lut{};
+
+ // Arguments and global shared variables
+ lut["src"] = vtable.get_variable(_src);
+ lut["dst"] = vtable.get_variable(_dst);
+
+ // Local build options
+ lut["meta_kernel_id"] = id();
+ lut["DST_TENSOR_TYPE"] = "BUFFER";
+ const auto dst_info = comp_group.get_dst_tensors()[0];
+ lut["DST_DATA_TYPE"] = dst_info->data_type();
+
+ return lut;
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h
new file mode 100644
index 0000000000..3f97a82204
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATESTORE
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATESTORE
+
+#include "arm_compute/core/experimental/Types.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+class ClTemplateStore final : public IGpuTemplateComponentWriter
+{
+public:
+ /** Constructor
+ *
+ * @param[in] id Component id
+ * @param[in] tensors Tensor arguments to the components
+ */
+ ClTemplateStore(ComponentId id, const ArgumentPack<ITensorInfo> &tensors);
+ /** Prevent instances of this class from being copy constructed */
+ ClTemplateStore(const ClTemplateStore &store) = delete;
+ /** Prevent instances of this class from being copied */
+ ClTemplateStore &operator=(const ClTemplateStore &store) = delete;
+ /** Allow instances of this class to be move constructed */
+ ClTemplateStore(ClTemplateStore &&store) = default;
+ /** Allow instances of this class to be moved */
+ ClTemplateStore &operator=(ClTemplateStore &&store) = default;
+ /** Generate kernel component name */
+ std::string get_name() const override;
+ /** Generate kernel component code template
+ *
+     * @param[in] comp_group Component group to which the component belongs
+ *
+ * @return std::string Component code
+ */
+ std::string get_component_code(const ComponentGroup &comp_group) const override;
+ /** Declare all variables used by the component in the @p vtable
+ *
+ * @param[out] vtable Variable table
+     * @param[in]  comp_group Component group to which the component belongs
+ */
+ void declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
+ /** Generate the tag look-up table used to instantiate the component code.
+ *
+ * @param[in] vtable Variable table
+     * @param[in] comp_group Component group to which the component belongs
+ *
+ * @return TagLUT Tag lookup table
+ */
+ TagLUT get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
+
+private:
+ const ITensorInfo *_src;
+ const ITensorInfo *_dst;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATESTORE */
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp
new file mode 100644
index 0000000000..cb643a741d
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp
@@ -0,0 +1,297 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "ClTemplateWriter.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/// @note Some tags may be left unused, since a tag may be required only by the additional macros or only by the component code
+std::string ClTemplateWriter::replace_tags(const std::string &code_template, const TagLUT &tags)
+{
+ std::string replaced_code = "";
+ bool scanning_pattern = false;
+ std::string pattern_found = "";
+    // Stop at size() - 1 because each iteration may look ahead at code_template[i + 1]
+    for(size_t i = 0; i < code_template.size() - 1; ++i)
+ {
+ if(!scanning_pattern)
+ {
+ if(code_template[i] == '{' && code_template[i + 1] == '{')
+ {
+ i += 1;
+ scanning_pattern = true;
+ pattern_found = "";
+ }
+ else
+ {
+ replaced_code += code_template[i];
+ }
+ }
+ else
+ {
+ if(code_template[i] == '}' && code_template[i + 1] == '}')
+ {
+ i += 1;
+ scanning_pattern = false;
+ std::string err = "Pattern " + pattern_found + " not found in tags";
+ ARM_COMPUTE_ERROR_ON_MSG(tags.find(pattern_found) == tags.end(), err.c_str());
+ replaced_code += tags.find(pattern_found)->second.value;
+ }
+ else
+ {
+ pattern_found += code_template[i];
+ }
+ }
+ }
+
+ return replaced_code;
+}
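A self-contained sketch of the same double-brace substitution, using a plain std::map in place of the TagLUT (the tag names and values below are assumptions chosen for illustration):

// Simplified stand-in for ClTemplateWriter::replace_tags: every "{{tag}}" is looked up
// in the map and replaced by its value.
#include <cassert>
#include <cstddef>
#include <map>
#include <string>

std::string replace_tags_sketch(const std::string &tmpl, const std::map<std::string, std::string> &tags)
{
    std::string out;
    for(size_t i = 0; i < tmpl.size(); ++i)
    {
        if(i + 1 < tmpl.size() && tmpl[i] == '{' && tmpl[i + 1] == '{')
        {
            const size_t end = tmpl.find("}}", i + 2);
            out += tags.at(tmpl.substr(i + 2, end - i - 2));
            i = end + 1; // skip past "}}"
        }
        else
        {
            out += tmpl[i];
        }
    }
    return out;
}

int main()
{
    const std::map<std::string, std::string> tags{ { "dst", "G0_T_dst" }, { "meta_kernel_id", "1" } };
    assert(replace_tags_sketch("// KERNEL {{meta_kernel_id}}: {{dst}}_stride_y", tags)
           == "// KERNEL 1: G0_T_dst_stride_y");
    return 0;
}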
+ClTemplateWriter::~ClTemplateWriter()
+{
+}
+ClTemplateWriter::ClTemplateWriter(const GpuKernelComponentGroup &components)
+ : _components{ components }
+{
+}
+std::string ClTemplateWriter::get_name()
+{
+ return write_kernel_name();
+}
+std::string ClTemplateWriter::get_code()
+{
+ return write_code();
+}
+std::string ClTemplateWriter::get_config_id()
+{
+ std::string config_id = get_name();
+ for(const auto &comp : _components)
+ {
+ config_id += "--" + comp->template_writer()->get_config_id() + "--";
+ }
+
+ return config_id;
+}
+
+CLBuildOptions ClTemplateWriter::get_build_options()
+{
+ CLBuildOptions build_opts{};
+
+ for(const auto &comp : _components)
+ {
+ build_opts.add_options(comp->template_writer()->get_build_options(_components).options());
+ }
+
+ return build_opts;
+}
+
+Window ClTemplateWriter::get_window() const
+{
+ const auto root_comp = _components.get_root_component();
+ ARM_COMPUTE_ERROR_ON_MSG(root_comp == nullptr, "No root component found");
+ return root_comp->template_writer()->get_window();
+}
+
+std::map<ITensorInfo::Id, GpuKernelArgument> ClTemplateWriter::get_tensors()
+{
+ // Assemble GpuKernelArguments
+ std::map<ITensorInfo::Id, GpuKernelArgument> tensors;
+ for(const auto t : _components.get_argument_tensors())
+ {
+ tensors.emplace(
+ t->id(),
+ GpuKernelArgument{ *t, _vtable.get_variable(t).kernel_argument_info });
+ }
+ return tensors;
+}
+
+std::string ClTemplateWriter::write_code()
+{
+ ARM_COMPUTE_ERROR_ON_MSG(_components.empty(), "No components found");
+
+ // These data structures will hold the data from all the components in the blueprint
+ std::set<std::string> headers_list{};
+ std::set<std::string> additional_macros{};
+ std::vector<std::string> component_codes{}; // vector because order matters
+
+ // Pass 1: Declare all kernel variables
+ for(auto &component : _components)
+ {
+ component->template_writer()->declare_variables(_vtable, _components);
+ }
+ // Pass 2: Generate component codes
+ for(auto &component : _components)
+ {
+ const auto component_writer = component->template_writer();
+ auto curr_headers_list = component_writer->get_headers_list();
+ auto curr_additional_macros = component_writer->get_additional_macros();
+ auto curr_component_code = component_writer->get_component_code(_components);
+ const auto var_lut = component_writer->get_tag_lut(_vtable, _components); // Ideally can be merged with get_component_code once we have finer-grained code generation technique
+ component_codes.push_back(replace_tags(curr_component_code, var_lut));
+
+ headers_list.insert(curr_headers_list.begin(), curr_headers_list.end());
+        if(!curr_additional_macros.empty()) // Some components might not have any additional macros
+ {
+ additional_macros.insert(replace_tags(curr_additional_macros, var_lut));
+ }
+ }
+
+    // Pass 3: Assemble the data gathered by traversing the graph into the string "code"
+ std::string code = "";
+
+ for(auto &header : headers_list)
+ {
+#if defined(EMBEDDED_KERNELS)
+ code += CLKernelLibrary::get().get_program(header).first;
+#else // defined(EMBEDDED_KERNELS)
+ code += "#include \"" + header + "\"\n";
+#endif // defined(EMBEDDED_KERNELS)
+ }
+
+ for(auto &macros : additional_macros)
+ {
+ code += macros;
+ }
+
+ code += write_kernel_signature(_vtable.get_variable_list(_components.get_argument_tensors()));
+
+ code += "\n{\n\n";
+
+ code += " //------------------ START KERNEL_BUILDER_COORDINATE ---------------------\n\n";
+ code += write_global_section();
+ code += " //------------------ END KERNEL_BUILDER_COORDINATE ---------------------\n";
+
+ for(const auto &component_code : component_codes)
+ {
+ code += component_code;
+ }
+
+ code += "}\n";
+
+ return code;
+}
+std::string ClTemplateWriter::write_global_section() const
+{
+ const auto dst_tensors = _components.get_dst_tensors();
+ ARM_COMPUTE_ERROR_ON_MSG(dst_tensors.size() != 1, "Only one destination tensor per kernel is allowed");
+ const auto dst_info = dst_tensors[0];
+ const auto dst_w = dst_info->dimension(0);
+ const auto tile_w = std::max(1, get_window().x().step());
+ const auto tile_h = std::max(1, get_window().y().step());
+ auto leftover_w = dst_w % tile_w;
+
+ std::string code = "";
+ code += std::string(" int g_ind_0 = GET_SPATIAL_IDX(0, ") + std::to_string(tile_w) + ", " + std::to_string(leftover_w) + ");\n";
+ code += std::string(" int g_ind_1 = GET_SPATIAL_IDX(1, ") + std::to_string(tile_h) + ", " + "0);\n";
+ code += std::string(" int g_ind_2 = GET_SPATIAL_IDX(2, 1, 0);\n\n");
+
+ code += " const bool g_cond_x = (g_ind_0 == 0);\n";
+ code += " const bool g_cond_y = (g_ind_1 == 0);\n";
+
+ return code;
+}
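The global section therefore encodes the tiling directly into the generated source: dimension 0 is stepped by the execution window's x step with a leftover when the extent is not a multiple of the tile, and the collapsed dimension 1 is stepped by the y step. A small sketch of the values that would be emitted, for assumed extents and window steps:

// Illustrative sketch only: assumed destination extent and window steps.
#include <algorithm>
#include <cstdio>

int main()
{
    const int dst_dim0   = 17;
    const int tile_w     = std::max(1, 4); // window x step (N0)
    const int tile_h     = std::max(1, 5); // window y step (M0)
    const int leftover_w = dst_dim0 % tile_w;

    // Mirrors the GET_SPATIAL_IDX(...) lines produced by write_global_section()
    std::printf("    int g_ind_0 = GET_SPATIAL_IDX(0, %d, %d);\n", tile_w, leftover_w);
    std::printf("    int g_ind_1 = GET_SPATIAL_IDX(1, %d, 0);\n", tile_h);
    std::printf("    int g_ind_2 = GET_SPATIAL_IDX(2, 1, 0);\n");
    return 0;
}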
+std::string ClTemplateWriter::write_argument_declaration(const GpuKernelVariableTable::TensorVariable &var) const
+{
+ std::string code;
+ switch(var.kernel_argument_info.type)
+ {
+ case GpuKernelArgumentInfo::Type::Vector:
+ {
+ code += "\n VECTOR_DECLARATION(" + var.uniq_name + ")";
+ break;
+ }
+ case GpuKernelArgumentInfo::Type::Image:
+ {
+ code += "\n IMAGE_DECLARATION(" + var.uniq_name + ")";
+ break;
+ }
+ case GpuKernelArgumentInfo::Type::Image_3D:
+ {
+ code += "\n IMAGE_DECLARATION(" + var.uniq_name + "),";
+ code += "\n unsigned int " + var.uniq_name + "_stride_z";
+ break;
+ }
+ case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D:
+ {
+ code += "\n __read_only image2d_t " + var.uniq_name + "_img,";
+ code += "\n unsigned int " + var.uniq_name + "_stride_z";
+ break;
+ }
+ case GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer:
+ {
+ code += "\n TENSOR4D_T(" + var.uniq_name + ", BUFFER)";
+ break;
+ }
+ case GpuKernelArgumentInfo::Type::Tensor_4D_t_Image:
+ {
+ code += "\n TENSOR4D_T(" + var.uniq_name + ", IMAGE)";
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Unsupported declaration generation for GpuKernelArgumentInfo::Type");
+ }
+ }
+ return code;
+}
+std::string ClTemplateWriter::write_kernel_signature(const GpuKernelVariableTable::VariableList &argument_list) const
+{
+ std::string code = "\n__kernel void " + write_kernel_name() + "(";
+
+ for(int i = 0; i < static_cast<int>(argument_list.size()) - 1; ++i)
+ {
+ code += write_argument_declaration(argument_list[i]) + ",";
+ }
+ if(static_cast<int>(argument_list.size()) - 1 >= 0)
+ {
+ code += write_argument_declaration(argument_list[argument_list.size() - 1]);
+ }
+
+ code += ')';
+
+ return code;
+}
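For illustration, two Tensor_4D_t_Buffer arguments with assumed unique names "src" and "dst" would produce a signature of roughly the following shape; the kernel name shown is also an assumption, based on write_kernel_name() joining the component names with "___":

// Illustrative only: approximate text assembled by write_kernel_signature() for two
// Tensor_4D_t_Buffer arguments; the names "src", "dst" and the kernel name are assumptions.
const char *example_signature =
    "\n__kernel void direct_conv2d___store("
    "\n    TENSOR4D_T(src, BUFFER),"
    "\n    TENSOR4D_T(dst, BUFFER))";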
+std::string ClTemplateWriter::write_kernel_name() const
+{
+ if(_components.empty())
+ {
+ return "empty_kernel";
+ }
+    std::string name = _components[0]->template_writer()->get_name();
+ for(size_t i = 1; i < _components.size(); ++i)
+ {
+ name += "___";
+ name += _components[i]->template_writer()->get_name();
+ }
+
+ return name;
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.h
new file mode 100644
index 0000000000..83f617b6c6
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEWRITER
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEWRITER
+
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
+
+#include <map>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Kernel writer that uses a string-template-based approach to generate kernel code.
+ * It stitches the component code templates together according to the valid fusion configuration,
+ * then instantiates the actual kernel code from the stitched template and the generated tag lookup table.
+ */
+class ClTemplateWriter : public IGpuKernelWriter
+{
+public:
+ /** Instantiates a kernel code string from the kernel code template
+     * @note Some tags may be left unused, since a tag may be required only by the additional macros or only by the component code
+ *
+ * @param[in] code_template Kernel code template
+ * @param[in] tags Tag lookup table
+ *
+ * @return std::string Instantiated kernel string
+ */
+ static std::string replace_tags(const std::string &code_template, const TagLUT &tags);
+ /** Default constructor */
+ ClTemplateWriter() = default;
+ /** Constructor
+ *
+ * @param[in] components Kernel component group from which the kernel will be generated
+ */
+ ClTemplateWriter(const GpuKernelComponentGroup &components);
+ /** Destructor */
+ ~ClTemplateWriter() override;
+ /** Generate kernel name */
+ std::string get_name() override;
+ /** Generate kernel code */
+ std::string get_code() override;
+ /** Generate build options */
+ CLBuildOptions get_build_options() override;
+ /** Generate config id string of the entire kernel. This is used for tuning */
+ std::string get_config_id() override;
+ /** Generate execution window */
+ Window get_window() const override;
+    /** Get the kernel argument list of the kernel */
+ std::map<ITensorInfo::Id, GpuKernelArgument> get_tensors() override;
+
+private:
+ std::string write_kernel_name() const;
+ std::string write_code();
+ std::string write_global_section() const;
+ std::string write_argument_declaration(const GpuKernelVariableTable::TensorVariable &var) const;
+ std::string write_kernel_signature(const GpuKernelVariableTable::VariableList &argument_list) const;
+
+private:
+ GpuKernelComponentGroup _components{};
+ GpuKernelVariableTable _vtable{};
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEWRITER */
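A minimal usage sketch of the writer, assuming a GpuKernelComponentGroup `group` has already been assembled elsewhere (e.g. by the kernel component graph); it only exercises the interface declared above:

// Minimal sketch, assuming `group` is a populated GpuKernelComponentGroup.
#include <string>

#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.h"

using namespace arm_compute::experimental::dynamic_fusion;

std::string build_kernel_source(const GpuKernelComponentGroup &group)
{
    ClTemplateWriter writer{ group };

    const std::string                 name       = writer.get_name();          // fused kernel name
    const arm_compute::CLBuildOptions build_opts = writer.get_build_options(); // merged per-component options
    static_cast<void>(name);
    static_cast<void>(build_opts);

    return writer.get_code(); // fully instantiated OpenCL source for the fused kernel
}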
diff --git a/src/dynamic_fusion/sketch/utils/DependencyGraph.h b/src/dynamic_fusion/sketch/utils/DependencyGraph.h
new file mode 100644
index 0000000000..55eb4c5c77
--- /dev/null
+++ b/src/dynamic_fusion/sketch/utils/DependencyGraph.h
@@ -0,0 +1,658 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_UTILS_DEPENDENCYGRAPH
+#define SRC_DYNAMIC_FUSION_SKETCH_UTILS_DEPENDENCYGRAPH
+
+#include "arm_compute/core/Error.h"
+#include <algorithm>
+#include <cstdint>
+#include <deque>
+#include <map>
+#include <set>
+#include <tuple>
+#include <vector>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+template <typename T>
+bool is_in(const T &v, const std::vector<T> &vec)
+{
+ return std::find(std::begin(vec), std::end(vec), v) != std::end(vec);
+}
+} // namespace
+
+/** A multi-input (tensors), multi-output (tensors) directed acyclic graph
+ * Represented as a doubly-linked adjacency list that distinguishes between source and destination
+ */
+class DependencyGraph
+{
+public:
+ using Id = int32_t;
+ using TensorId = Id;
+ using OperatorId = Id;
+ /** Adjacency list
+ *
+ */
+ using AdjList = std::map<Id, std::vector<Id>>;
+
+    /** A pack of an operator and its input and output tensors, used when traversing the graph in topological order
+ *
+ */
+ struct OpPack
+ {
+ OperatorId op{};
+ std::vector<TensorId> inputs{};
+ std::vector<TensorId> outputs{};
+ friend bool operator==(const OpPack &opp0, const OpPack &opp1)
+ {
+ return std::make_tuple(
+ opp0.op, opp0.inputs, opp0.outputs)
+ == std::make_tuple(
+ opp1.op, opp1.inputs, opp1.outputs);
+ }
+ };
+
+public:
+ DependencyGraph() = default;
+ friend std::ostream &operator<<(std::ostream &os, const DependencyGraph &);
+
+ /** Try adding an operator (without actually adding it), while keeping the graph as a "linear sequence" / list
+ * @note The list is expected to only grow from head to tail
+ *
+ * PRECONDITION: The current graph is already linear
+ *
+ * @return true If the operator can be added while keeping the graph as a linear sequence
+ * @return false Otherwise
+ */
+ bool try_add_operator_as_linear(OperatorId op, const std::vector<TensorId> &inputs, const std::vector<TensorId> &outputs) const
+ {
+ ARM_COMPUTE_UNUSED(op, outputs);
+ if(all_ops().empty())
+ {
+ return true;
+ }
+ std::vector<TensorId> common_tensors{};
+ auto existing_tensors = all_tensors();
+ std::sort(existing_tensors.begin(), existing_tensors.end()); // To use std::set_intersection, both input sets must be sorted
+ std::vector<TensorId> sorted_inputs{ inputs };
+ std::sort(sorted_inputs.begin(), sorted_inputs.end());
+ std::set_intersection(existing_tensors.begin(), existing_tensors.end(),
+ sorted_inputs.begin(), sorted_inputs.end(), std::back_inserter(common_tensors));
+ if(common_tensors.size() != 1U)
+ {
+ return false;
+ }
+ const auto linked_tensor = common_tensors[0];
+ const auto tail_ops = get_dst_ops();
+ ARM_COMPUTE_ERROR_ON(tail_ops.size() != 1U); // PRECONDITION
+ const auto tail = tail_ops[0];
+
+        return is_in(linked_tensor, dst_tensors(tail));
+ }
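In other words, an operator can only be appended if exactly one of its input tensors already exists in the graph and that tensor is an output of the current tail operator. A small sketch with assumed integer ids (operator and tensor ids are kept in separate adjacency maps, so overlapping numeric values are fine):

// Sketch with assumed ids; not part of the library's test suite.
#include <cassert>

#include "src/dynamic_fusion/sketch/utils/DependencyGraph.h"

using arm_compute::experimental::dynamic_fusion::DependencyGraph;

void linear_growth_sketch()
{
    DependencyGraph g{};
    g.add_operator_as_linear(/* op */ 0, /* inputs */ { 0, 1 }, /* outputs */ { 2 });

    // Accepted: tensor 2 is the single tensor shared with the existing graph and it is
    // an output of the current tail operator 0
    assert(g.try_add_operator_as_linear(1, { 2, 3 }, { 4 }));

    // Rejected: tensor 0 is shared, but it is an input of operator 0, not the tail's output
    assert(!g.try_add_operator_as_linear(2, { 0, 5 }, { 6 }));
}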
+ /** Add an operator, while keeping the graph as a "linear sequence"
+ *
+ * PRECONDITION: The current graph is already linear
+ * INVARIANT: The list can only grow from head to tail
+     * POSTCONDITION: The graph remains linear
+ */
+ void add_operator_as_linear(OperatorId op, const std::vector<TensorId> &inputs, const std::vector<TensorId> &outputs)
+ {
+ ARM_COMPUTE_ERROR_ON(!try_add_operator_as_linear(op, inputs, outputs));
+ auto success = add_operator(op, inputs, outputs);
+ ARM_COMPUTE_ERROR_ON(!success);
+ }
+ /** Add a new operator
+     * Returns false if adding the operator would violate the DAG (acyclicity) invariant;
+     * a failed insertion leaves the graph unchanged
+ *
+ * @param[in] op Operator to add
+ * @param[in] inputs Input tensors to the operator
+ * @param[in] outputs Output tensors to the operator
+ */
+ bool add_operator(OperatorId op, const std::vector<TensorId> &inputs, const std::vector<TensorId> &outputs)
+ {
+ if(operator_exists(op))
+ {
+ return false;
+ }
+ _adj_src_tensors[op] = {};
+ _adj_dst_tensors[op] = {};
+ for(auto in_tensor : inputs)
+ {
+ // Linking input tensor to operator node will never create a cycle / loop because we guarantee
+ // each op is newly created, so every <input, op> pair / edge is new
+ link_input(op, in_tensor);
+ }
+ for(auto out_tensor : outputs)
+ {
+ // If there exists a back path from op's output tensor to op already, then linking the two will create a loop / cycle
+ if(path_exists_from_tensor_to_op(out_tensor, op))
+ {
+ remove_operator(op);
+ return false;
+ }
+ else
+ {
+ link_output(op, out_tensor);
+ }
+ }
+
+ return true;
+ }
+
+ /** Sort the graph in a topological order
+ *
+ * @return std::vector<OpPack>
+ */
+ std::vector<OpPack> topological_sort() const
+ {
+        // In-degree (number of source operators feeding an op)
+ std::map<OperatorId, unsigned int> in_degree{};
+ std::set<OperatorId> visited_ops{};
+ std::deque<OperatorId> zero_in_degree_ops{};
+ std::vector<OpPack> sorted_op_packs{};
+ for(auto op : all_ops())
+ {
+ const auto degree = src_ops(op).size();
+ in_degree[op] = degree;
+ if(degree == 0)
+ {
+ zero_in_degree_ops.push_back(op);
+ visited_ops.insert(op);
+ }
+ }
+
+ while(!zero_in_degree_ops.empty())
+ {
+ const OperatorId op = zero_in_degree_ops.front();
+ zero_in_degree_ops.pop_front();
+ sorted_op_packs.push_back(OpPack{ op, src_tensors(op), dst_tensors(op) });
+
+ for(const auto next_op : dst_ops(op))
+ {
+ if(in_degree[next_op] > 0)
+ {
+ in_degree[next_op]--;
+ }
+ if(in_degree[next_op] == 0 && visited_ops.find(next_op) == visited_ops.end())
+ {
+ zero_in_degree_ops.push_back(next_op);
+                    visited_ops.insert(next_op);
+ }
+ }
+ }
+
+ return sorted_op_packs;
+ }
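This is Kahn's algorithm: an operator is emitted once all of its producers have been emitted, so for a simple chain the packs come out in graph order. A small sketch with assumed ids:

// Sketch with assumed ids; not part of the library's test suite.
#include <cassert>

#include "src/dynamic_fusion/sketch/utils/DependencyGraph.h"

using arm_compute::experimental::dynamic_fusion::DependencyGraph;

void topological_sort_sketch()
{
    // Assumed ids: op 0 -> tensor 1 -> op 1 -> tensor 2 -> op 2 -> tensor 3
    DependencyGraph g{};
    g.add_operator(0, { 0 }, { 1 });
    g.add_operator(1, { 1 }, { 2 });
    g.add_operator(2, { 2 }, { 3 });

    const auto packs = g.topological_sort();
    assert(packs.size() == 3);
    assert(packs[0].op == 0 && packs[1].op == 1 && packs[2].op == 2);
}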
+
+ void find_independent_paths_util(OperatorId op, std::vector<std::vector<OperatorId>> &paths, std::vector<OperatorId> cur_path,
+ const std::map<OperatorId, unsigned int> &in_degree) const
+ {
+ // We have found an unresolved dependency
+ if(in_degree.at(op) > 1)
+ {
+ paths.push_back(cur_path);
+ return;
+ }
+ const auto child_ops = dst_ops(op);
+
+ cur_path.push_back(op);
+ // Hit the leaf op
+ if(child_ops.empty())
+ {
+ paths.push_back(cur_path);
+ return;
+ }
+ for(const auto child_op : child_ops)
+ {
+ find_independent_paths_util(child_op, paths, cur_path, in_degree);
+ }
+ }
+    /** Find all independent linear paths starting from op, i.e. paths that do not depend on any other unresolved op
+     *
+     * @return std::vector<std::vector<OperatorId>> The independent linear paths
+ */
+ std::vector<std::vector<OperatorId>> find_independent_paths(OperatorId op,
+ const std::map<OperatorId, unsigned int> &in_degree) const
+ {
+ std::vector<std::vector<OperatorId>> paths;
+ std::vector<OperatorId> cur_path;
+ find_independent_paths_util(op, paths, cur_path, in_degree);
+ return paths;
+ }
+    /** Find a longest independent linear path starting from op
+     *
+     * @return std::vector<OperatorId> The longest independent linear path
+ */
+ std::vector<OperatorId> find_longest_independent_path(OperatorId op,
+ const std::map<OperatorId, unsigned int> &in_degree) const
+ {
+ const auto &paths = find_independent_paths(op, in_degree);
+ ARM_COMPUTE_ERROR_ON(paths.empty());
+ size_t max_len = 0;
+ const std::vector<OperatorId> *max_path = nullptr;
+
+ for(const auto &path : paths)
+ {
+ if(path.size() >= max_len)
+ {
+ max_path = &path;
+ max_len = path.size();
+ }
+ }
+ return *max_path;
+ }
+ std::vector<OperatorId> propose_next_path(std::set<OperatorId> &candidate_ops,
+ const std::map<OperatorId, unsigned int> &in_degree) const
+ {
+ if(candidate_ops.empty())
+ {
+ return {};
+ }
+ size_t max_len = 0;
+ std::vector<OperatorId> max_path;
+ OperatorId chosen_op{};
+ for(auto op : candidate_ops)
+ {
+ const auto path = find_longest_independent_path(op, in_degree);
+ if(path.size() >= max_len)
+ {
+ chosen_op = op;
+ max_path = path;
+ max_len = path.size();
+ }
+ }
+ candidate_ops.erase(chosen_op);
+ return max_path;
+ }
+ /** Partition the graph into a list of linear sub-"graphs", while preserving the topological order, and trying to minimize
+ * the number of partitions
+ */
+ std::vector<std::vector<OpPack>> topological_partition() const
+ {
+        // Initialize in-degrees and collect the initial zero in-degree ops as candidates
+ std::map<OperatorId, unsigned int> in_degree{};
+ std::set<OperatorId> candidate_ops{};
+ for(auto op : all_ops())
+ {
+ const auto degree = src_ops(op).size();
+ in_degree[op] = degree;
+ if(degree == 0)
+ {
+ candidate_ops.insert(op);
+ }
+ }
+
+ std::vector<std::vector<OpPack>> sorted_partitions{};
+ while(!candidate_ops.empty())
+ {
+            // Propose the longest still-independent linear path from the remaining candidate ops
+ const auto path = propose_next_path(candidate_ops, in_degree);
+
+ // Append to sorted_partitions
+ std::vector<OpPack> path_op_pack{};
+ for(auto op : path)
+ {
+ path_op_pack.push_back(OpPack{ op, src_tensors(op), dst_tensors(op) });
+ }
+ sorted_partitions.push_back(path_op_pack);
+            // Consume the whole path (update in_degree and candidate_ops)
+ for(auto op : path)
+ {
+ for(const auto next_op_child : dst_ops(op))
+ {
+ if(in_degree[next_op_child] > 0)
+ {
+ in_degree[next_op_child]--;
+ }
+ if(in_degree[next_op_child] == 0 && !is_in(next_op_child, path)) // We do not want to put the proposed path back into candidates
+ {
+ candidate_ops.insert(next_op_child);
+ }
+ }
+ }
+ }
+ return sorted_partitions;
+ }
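The partitioning repeatedly peels off the longest linear path that is still independent, so a diamond-shaped graph (one producer feeding two branches that rejoin) cannot be covered by a single partition and is split into two. A sketch with assumed ids:

// Sketch with assumed ids; not part of the library's test suite.
#include <cassert>

#include "src/dynamic_fusion/sketch/utils/DependencyGraph.h"

using arm_compute::experimental::dynamic_fusion::DependencyGraph;

void topological_partition_sketch()
{
    // Assumed ids: op 0 feeds ops 1 and 2 through tensor 1; ops 1 and 2 both feed op 3
    DependencyGraph g{};
    g.add_operator(0, { 0 }, { 1 });
    g.add_operator(1, { 1 }, { 2 });
    g.add_operator(2, { 1 }, { 3 });
    g.add_operator(3, { 2, 3 }, { 4 });

    // The diamond is split into two linear partitions of two operators each; which branch
    // ends up in which partition depends on the tie-break between equally long paths
    const auto partitions = g.topological_partition();
    assert(partitions.size() == 2);
    assert(partitions[0].size() == 2 && partitions[1].size() == 2);
    assert(partitions[0].front().op == 0);
}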
+
+ /** Strict equality comparison (all internal ids and order of insertion matter).
+ * In the future this may be replaced with a topological comparison, allowing equivalent graphs with different internal ids to be equal
+ *
+ *
+ * @param[in] g0
+ * @param[in] g1
+ * @return true If the same
+ * @return false Otherwise
+ */
+ friend bool operator==(const DependencyGraph &g0, const DependencyGraph &g1)
+ {
+ // Do not compare id allocators
+ return std::make_tuple(
+ g0._adj_src_tensors, g0._adj_dst_tensors, g0._adj_src_ops, g0._adj_dst_ops)
+ == std::make_tuple(
+ g1._adj_src_tensors, g1._adj_dst_tensors, g1._adj_src_ops, g1._adj_dst_ops);
+ }
+ std::vector<OperatorId> src_ops_from_tensor(TensorId tensor) const
+ {
+ return _adj_src_ops.at(tensor);
+ }
+ std::vector<OperatorId> dst_ops_from_tensor(TensorId tensor) const
+ {
+ return _adj_dst_ops.at(tensor);
+ }
+ /** Get all tensors
+ *
+ * @return std::vector<TensorId>
+ */
+ std::vector<TensorId> all_tensors() const
+ {
+ std::vector<TensorId> tensors{};
+ std::transform(std::begin(_adj_src_ops), std::end(_adj_src_ops), std::back_inserter(tensors), [](const auto & it)
+ {
+ return it.first;
+ });
+ return tensors;
+ }
+ /** Get source tensors of the whole graph
+ *
+ * @return std::vector<TensorId>
+ */
+ std::vector<TensorId> global_src_tensors() const
+ {
+ std::vector<TensorId> tensors;
+ for(auto tensor_src_ops : _adj_src_ops)
+ {
+ if(tensor_src_ops.second.empty())
+ {
+ tensors.push_back(tensor_src_ops.first);
+ }
+ }
+ return tensors;
+ }
+ /** Get destination tensors of the whole graph
+ *
+ * @return std::vector<TensorId>
+ */
+ std::vector<TensorId> global_dst_tensors() const
+ {
+ std::vector<TensorId> tensors;
+ for(auto tensor_dst_ops : _adj_dst_ops)
+ {
+ if(tensor_dst_ops.second.empty())
+ {
+ tensors.push_back(tensor_dst_ops.first);
+ }
+ }
+ return tensors;
+ }
+ /** Get all root ops. Root ops can also be referred to as "src ops" of the whole graph
+ *
+ * @return std::vector<OperatorId>
+ */
+ std::vector<OperatorId> get_root_ops() const
+ {
+ std::vector<OperatorId> ops{};
+ const auto op_list = all_ops();
+
+ for(auto op : op_list)
+ {
+ if(src_ops(op).empty())
+ {
+ ops.emplace_back(op);
+ }
+ }
+ return ops;
+ }
+
+private:
+ void link_input(OperatorId op, TensorId in_tensor)
+ {
+ ARM_COMPUTE_ERROR_ON(!operator_exists(op));
+ if(!tensor_exists(in_tensor))
+ {
+ insert_new_tensor(in_tensor);
+ }
+ ARM_COMPUTE_ERROR_ON(are_connected(op, in_tensor)); // Prevent repetitive linking
+ _adj_src_tensors[op].push_back(in_tensor);
+ _adj_dst_ops[in_tensor].push_back(op);
+ }
+ void link_output(OperatorId op, TensorId out_tensor)
+ {
+ ARM_COMPUTE_ERROR_ON(!operator_exists(op));
+ if(!tensor_exists(out_tensor))
+ {
+ insert_new_tensor(out_tensor);
+ }
+ ARM_COMPUTE_ERROR_ON(are_connected(op, out_tensor)); // Prevent repetitive linking
+ _adj_dst_tensors[op].push_back(out_tensor);
+ _adj_src_ops[out_tensor].push_back(op);
+ }
+
+ std::vector<OperatorId> src_ops(OperatorId op) const
+ {
+ ARM_COMPUTE_ERROR_ON(!operator_exists(op));
+ std::vector<OperatorId> ops{};
+ for(TensorId src_tensor : src_tensors(op))
+ {
+ ops.insert(ops.end(), std::begin(_adj_src_ops.at(src_tensor)), std::end(_adj_src_ops.at(src_tensor)));
+ }
+ return ops;
+ }
+ std::vector<OperatorId> dst_ops(OperatorId op) const
+ {
+ ARM_COMPUTE_ERROR_ON(!operator_exists(op));
+ std::vector<OperatorId> ops{};
+ for(TensorId dst_tensor : _adj_dst_tensors.at(op))
+ {
+ ops.insert(ops.end(), std::begin(_adj_dst_ops.at(dst_tensor)), std::end(_adj_dst_ops.at(dst_tensor)));
+ }
+ return ops;
+ }
+
+    /** Get source tensors of an operator
+ *
+ * @param[in] op
+ * @return std::vector<TensorId>
+ */
+ std::vector<TensorId> src_tensors(OperatorId op) const
+ {
+ ARM_COMPUTE_ERROR_ON(!operator_exists(op));
+ return _adj_src_tensors.at(op);
+ }
+    /** Get destination tensors of an operator
+ *
+ * @param[in] op
+ * @return std::vector<TensorId>
+ */
+ std::vector<TensorId> dst_tensors(OperatorId op) const
+ {
+ ARM_COMPUTE_ERROR_ON(!operator_exists(op));
+ return _adj_dst_tensors.at(op);
+ }
+ /** Get all operators
+ *
+ * @return std::vector<OperatorId>
+ */
+ std::vector<OperatorId> all_ops() const
+ {
+ std::vector<OperatorId> ops{};
+ std::transform(std::begin(_adj_src_tensors), std::end(_adj_src_tensors), std::back_inserter(ops), [](const auto & it)
+ {
+ return it.first;
+ });
+ return ops;
+ }
+ /** Remove an operator from graph.
+ *
+ * @param[in] op
+ */
+ void remove_operator(OperatorId op)
+ {
+ for(auto src_tensor : _adj_src_tensors.at(op))
+ {
+ auto &dst_ops = _adj_dst_ops.at(src_tensor);
+ dst_ops.erase(
+ std::remove(std::begin(dst_ops), std::end(dst_ops), op),
+ std::end(dst_ops));
+ }
+ for(auto dst_tensor : _adj_dst_tensors.at(op))
+ {
+ auto &src_ops = _adj_src_ops.at(dst_tensor);
+ src_ops.erase(
+ std::remove(std::begin(src_ops), std::end(src_ops), op),
+ std::end(src_ops));
+ }
+ // Remove any isolated tensors
+ // An isolated tensor is one where both its _adj_src_ops and _adj_dst_ops are empty
+ for(auto t : all_tensors())
+ {
+ if(_adj_src_ops.at(t).empty() && _adj_dst_ops.at(t).empty())
+ {
+ _adj_src_ops.erase(t);
+ _adj_dst_ops.erase(t);
+ }
+ }
+ _adj_src_tensors.erase(op);
+ _adj_dst_tensors.erase(op);
+ }
+ void insert_new_tensor(TensorId tensor)
+ {
+ _adj_src_ops[tensor] = {};
+ _adj_dst_ops[tensor] = {};
+ }
+ bool tensor_exists(TensorId tensor) const
+ {
+ return _adj_src_ops.find(tensor) != _adj_src_ops.end() && _adj_dst_ops.find(tensor) != _adj_dst_ops.end();
+ }
+ bool operator_exists(OperatorId op) const
+ {
+ return _adj_src_tensors.find(op) != _adj_src_tensors.end() && _adj_dst_tensors.find(op) != _adj_dst_tensors.end();
+ }
+ bool is_src_tensor_of(OperatorId op, TensorId tensor) const
+ {
+ if(!operator_exists(op) || !tensor_exists(tensor))
+ {
+ return false;
+ }
+ const auto op_inputs = src_tensors(op);
+ return std::find(op_inputs.begin(), op_inputs.end(), tensor) != op_inputs.end();
+ }
+ bool is_dst_tensor_of(OperatorId op, TensorId tensor) const
+ {
+ if(!operator_exists(op) || !tensor_exists(tensor))
+ {
+ return false;
+ }
+ const auto op_outputs = dst_tensors(op);
+ return std::find(op_outputs.begin(), op_outputs.end(), tensor) != op_outputs.end();
+ }
+ bool are_connected(OperatorId op, TensorId tensor) const
+ {
+ return is_src_tensor_of(op, tensor) || is_dst_tensor_of(op, tensor);
+ }
+    /** Check if op is a destination / leaf operator of the whole graph
+ *
+ * @param[in] op
+ * @return true
+ * @return false
+ */
+ bool is_dst_op(OperatorId op) const
+ {
+ return dst_ops(op).empty();
+ }
+ std::vector<OperatorId> get_dst_ops() const
+ {
+ std::vector<OperatorId> ops{};
+ const auto op_list = all_ops();
+
+ for(auto op : op_list)
+ {
+ if(is_dst_op(op))
+ {
+ ops.emplace_back(op);
+ }
+ }
+ return ops;
+ }
+ bool path_exists_from_tensor_to_op(TensorId src_tensor, OperatorId dst_op) const
+ {
+ if(!tensor_exists(src_tensor) || !operator_exists(dst_op))
+ {
+ return false;
+ }
+ for(auto child_op : dst_ops_from_tensor(src_tensor))
+ {
+ if(path_exists_from_op_to_op(child_op, dst_op))
+ {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ bool path_exists_from_op_to_op(OperatorId src_op, OperatorId dst_op) const
+ {
+ if(!operator_exists(src_op) || !operator_exists(dst_op))
+ {
+ return false;
+ }
+ if(src_op == dst_op)
+ {
+ return true;
+ }
+ if(is_in(src_op, get_dst_ops()))
+ {
+ return false;
+ }
+ for(auto child_tensor : dst_tensors(src_op))
+ {
+ if(path_exists_from_tensor_to_op(child_tensor, dst_op))
+ {
+ return true;
+ }
+ }
+ return false;
+ }
+
+private:
+ AdjList _adj_src_tensors{};
+ AdjList _adj_dst_tensors{};
+ AdjList _adj_src_ops{};
+ AdjList _adj_dst_ops{};
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_UTILS_DEPENDENCYGRAPH */
diff --git a/tests/SConscript b/tests/SConscript
index b848f27043..95ecd27afa 100644
--- a/tests/SConscript
+++ b/tests/SConscript
@@ -121,7 +121,7 @@ files_validation += Glob('validation/CPP/' + filter_pattern)
if env['opencl']:
if env['experimental_dynamic_fusion']:
test_env.Append(CPPDEFINES = ['ENABLE_EXPERIMENTAL_DYNAMIC_FUSION'])
- files_validation += Glob('validation/CL/UNIT/dynamic_fusion/' + filter_pattern)
+ files_validation += Glob('validation/dynamic_fusion/gpu/' + filter_pattern)
filter_pattern = test_env['test_filter']
@@ -291,19 +291,6 @@ if test_env['benchmark_examples']:
prog = test_env.Program(example, [ test_env.Object(source=file, target=example), graph_utils, graph_params]+ files_benchmark_examples, LIBS = test_env["LIBS"] + ["arm_compute_graph"], LINKFLAGS=test_env["LINKFLAGS"]+['-Wl,--allow-shlib-undefined'])
arm_compute_benchmark_examples += [ prog ]
- # Dynamic fusion examples
- if env['opencl']:
- if env['experimental_dynamic_fusion']:
- for file in Glob("%s/dynamic_fusion/*.cpp" % examples_folder):
- example = "benchmark_" + os.path.basename(os.path.splitext(str(file))[0])
- if env['os'] in ['android', 'macos', 'bare_metal'] or env['standalone']:
- prog = test_env.Program(example, [ test_env.Object(source=file, target=example), graph_utils, graph_params]+ files_benchmark_examples, LIBS = test_env["LIBS"], LINKFLAGS=test_env["LINKFLAGS"]+[load_whole_archive, arm_compute_lib, noload_whole_archive] + bm_link_flags + extra_link_flags)
- arm_compute_benchmark_examples += [ prog ]
- else:
- #-Wl,--allow-shlib-undefined: Ignore dependencies of dependencies
- prog = test_env.Program(example, [ test_env.Object(source=file, target=example), graph_utils, graph_params]+ files_benchmark_examples, LIBS = test_env["LIBS"] + ["arm_compute_graph"], LINKFLAGS=test_env["LINKFLAGS"]+['-Wl,--allow-shlib-undefined'])
- arm_compute_benchmark_examples += [ prog ]
-
arm_compute_benchmark_examples = install_bin(arm_compute_benchmark_examples)
Depends(arm_compute_benchmark_examples, arm_compute_test_framework)
Depends(arm_compute_benchmark_examples, arm_compute_lib)
diff --git a/tests/validation/dynamic_fusion/gpu/Integration.cpp b/tests/validation/dynamic_fusion/gpu/Integration.cpp
new file mode 100644
index 0000000000..87720a629d
--- /dev/null
+++ b/tests/validation/dynamic_fusion/gpu/Integration.cpp
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h"
+#include "arm_compute/dynamic_fusion/sketch/OperatorAttributes.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/gpu/cl/operators/ClAdd.h"
+#include "src/gpu/cl/operators/ClConv2d.h"
+
+#include "tests/CL/CLAccessor.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/validation/CL/UNIT/dynamic_fusion/Utils.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/reference/ConvolutionLayer.h"
+#include "tests/validation/reference/ElementwiseOperations.h"
+#include "tests/validation/reference/Permute.h"
+
+#ifdef ARM_COMPUTE_ASSERTS_ENABLED
+#include "tests/SimpleTensorPrinter.h"
+#endif /* ARM_COMPUTE_ASSERTS_ENABLED */
+
+using namespace arm_compute::experimental::dynamic_fusion;
+using namespace arm_compute::test::validation::utils;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+TEST_SUITE(CL)
+TEST_SUITE(INTEGRATION)
+TEST_SUITE(DYNAMIC_FUSION)
+TEST_CASE(Conv2d, framework::DatasetMode::ALL)
+{
+ /* Computation:
+ * out = conv2d1x1(direct_conv)(input, weights, bias)
+ */
+ CLScheduler::get().default_reinit();
+
+ const auto data_type = DataType::F32;
+ const auto data_layout = DataLayout::NHWC;
+ const auto t_input_shape = TensorShape(384, 12, 12);
+ const auto t_weight_shape = TensorShape(384, 1, 1, 16);
+ const auto t_dst_shape = TensorShape(16, 12, 12);
+
+ // Create a new workload sketch
+ auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+ auto gpu_ctx = GpuWorkloadContext{ &cl_compile_ctx };
+ GpuWorkloadSketch sketch{ &gpu_ctx };
+
+ // Fuse conv2d
+ Conv2dAttributes conv2d_attr{};
+ auto input_info = sketch.create_tensor_info(t_input_shape, 1, data_type, data_layout);
+ auto weight_info = sketch.create_tensor_info(TensorInfo(t_weight_shape, 1, data_type, data_layout));
+ auto dst_info = sketch.create_tensor_info();
+ GpuConv2d::create_op(sketch, &input_info, &weight_info, nullptr, &dst_info, conv2d_attr);
+
+ // Configure runtime
+ ClWorkloadRuntime runtime;
+ runtime.configure(sketch);
+
+    // (Important) Allocate memory for any auxiliary tensors required by the runtime
+    // Instead of using ACL-allocated memory, the user can choose to import memory into the tensors
+ for(auto &data : runtime.get_auxiliary_tensors())
+ {
+ CLTensor *tensor = data.first;
+ AuxMemoryInfo aux_mem_req = data.second;
+ tensor->allocator()->init(*data.first->info(), aux_mem_req.alignment);
+ tensor->allocator()->allocate(); // Use ACL allocated memory
+ // auto buf = cl::Buffer();
+ // tensor->allocator()->import_memory(buf); // Or, import external memory
+ }
+
+ // Construct user tensors
+ CLTensor t_input{};
+ CLTensor t_weight{};
+ CLTensor t_dst{};
+
+ // Initialize user tensors
+ t_input.allocator()->init(input_info);
+ t_weight.allocator()->init(weight_info);
+ t_dst.allocator()->init(dst_info);
+
+ // Allocate and fill user tensors
+ // Instead of using ACL allocator, the user can choose to import memory into the tensors
+ t_input.allocator()->allocate();
+ t_weight.allocator()->allocate();
+ t_dst.allocator()->allocate();
+ fill<float>(CLAccessor(t_input), 0, library.get());
+ fill<float>(CLAccessor(t_weight), 1, library.get());
+
+ // Run runtime
+ runtime.run({ &t_input, &t_weight, &t_dst });
+
+ // Create reference
+ SimpleTensor<float> ref_t_input{ t_input_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC };
+ SimpleTensor<float> ref_t_weight{ t_weight_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC };
+ SimpleTensor<float> ref_t_bias_placeholder{ t_dst_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC };
+
+ // Fill reference
+ fill<float>(ref_t_input, 0, library.get());
+ fill<float>(ref_t_weight, 1, library.get());
+
+ auto ref_t_input_nchw = reference::permute(ref_t_input, PermutationVector(1U, 2U, 0U));
+ auto ref_t_weight_nchw = reference::permute(ref_t_weight, PermutationVector(1U, 2U, 0U));
+ auto ref_t_bias_placeholder_nchw = reference::permute(ref_t_bias_placeholder, PermutationVector(1U, 2U, 0U));
+ auto t_dst_shape_nchw = t_dst_shape;
+ permute(t_dst_shape_nchw, PermutationVector(1U, 2U, 0U));
+
+ PadStrideInfo legacy_pad_stride(conv2d_attr.stride().x(), conv2d_attr.stride().y(), conv2d_attr.pad().left, conv2d_attr.pad().right, conv2d_attr.pad().top, conv2d_attr.pad().bottom,
+ DimensionRoundingType{});
+ auto ref_t_dst_nchw = reference::convolution_layer(ref_t_input_nchw, ref_t_weight_nchw, ref_t_bias_placeholder_nchw, t_dst_shape_nchw, legacy_pad_stride, conv2d_attr.dilation());
+ const auto ref_t_dst = reference::permute(ref_t_dst_nchw, PermutationVector(2U, 0U, 1U));
+
+ RelativeTolerance<float> tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */
+ validate(CLAccessor(t_dst), ref_t_dst_nchw, tolerance_f32);
+}
+TEST_SUITE(Invalid_Fusion_Should_Fail)
+TEST_CASE(Multiple_Complex_Ops_0, framework::DatasetMode::ALL)
+{
+ /* Computation:
+ * out = conv2d(conv2d(l0_input, l0_weight), l1_weight)
+ */
+ CLScheduler::get().default_reinit();
+
+ const auto data_type = DataType::F32;
+ const auto data_layout = DataLayout::NHWC;
+ const auto t_input_shape = TensorShape(384, 12, 12);
+ const auto t_weight_shape = TensorShape(384, 1, 1, 16);
+ const auto t_dst_shape = TensorShape(16, 12, 12);
+ auto t_input_info = TensorInfo(t_input_shape, 1, data_type, data_layout);
+ auto t_weight_info = TensorInfo(t_weight_shape, 1, data_type, data_layout);
+ auto t_dst_info = TensorInfo();
+
+ Conv2dAttributes conv2d_attr{};
+
+ // Create a new workload sketch
+ auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+ auto gpu_ctx = GpuWorkloadContext{ &cl_compile_ctx };
+ GpuWorkloadSketch sketch{ &gpu_ctx };
+
+ // Create tensor infos
+ auto input_info = sketch.create_tensor_info(t_input_shape, 1, data_type, data_layout);
+ auto weight_info = sketch.create_tensor_info(TensorInfo(t_weight_shape, 1, data_type, data_layout));
+ auto dst_info = sketch.create_tensor_info();
+
+ // Fuse conv2d into the workload
+ {
+ // Validate operator
+ const auto success = GpuConv2d::validate_op(sketch, &input_info, &weight_info, nullptr, &dst_info, conv2d_attr);
+ ARM_COMPUTE_EXPECT(bool(success), framework::LogLevel::ERRORS);
+
+ GpuConv2d::create_op(sketch, &input_info, &weight_info, nullptr, &dst_info, conv2d_attr);
+ }
+
+ // Create tensor infos
+ auto weight_info_2 = sketch.create_tensor_info(t_weight_info);
+ auto dst_info_2 = sketch.create_tensor_info();
+
+ // Fuse conv2d into the workload
+ {
+ // Validate operator, should fail
+ const auto success = GpuConv2d::validate_op(sketch, &dst_info, &weight_info_2, nullptr, &dst_info_2, conv2d_attr);
+ ARM_COMPUTE_EXPECT(!bool(success), framework::LogLevel::ERRORS);
+ }
+}
+TEST_SUITE_END() // Invalid_Fusion_Should_Fail
+TEST_SUITE_END() // DYNAMIC_FUSION
+TEST_SUITE_END() // INTEGRATION
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute