authorSiCong Li <sicong.li@arm.com>2022-01-28 18:24:39 +0000
committerSiCong Li <sicong.li@arm.com>2022-05-06 15:01:45 +0000
commitb63b1196adea8b07dd8db77c2492a212650deba0 (patch)
treeb264035197873f56c69784bec68cad7041b5d423
parent3bb72b69566f18ad5c9446d318d2fc2b5f6dba42 (diff)
downloadComputeLibrary-b63b1196adea8b07dd8db77c2492a212650deba0.tar.gz
Integrate Dynamic Fusion patches
* Add public interfaces:
    * OperatorGraph: Describe a workload that could contain fused kernels
    * IWorkload: Generic interface for workloads built from OperatorGraph
    * ClWorkload: OpenCL workloads built from OperatorGraph
    * ClCompositeOperator: Runtime async operator to execute a ClWorkload
    * DependencyGraph (will likely be deprecated in later iterations)
* Add example
    * cl_fused_conv2d_elementwise_add.cpp to explain how to use the new interfaces
* Add internal translation layer
    * Refactor ClKernelBuildingAPI
    * Remove non-tile based gemm native kernel component
    * Minor interface changes
* Add integration tests

Resolves COMPMID-5161

Signed-off-by: SiCong Li <sicong.li@arm.com>
Change-Id: Ib987ed79289ab0bcbd3130d54f5793408d9f1240
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7510
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r--Android.bp8
-rw-r--r--arm_compute/core/TensorInfo.h18
-rw-r--r--arm_compute/core/Types.h17
-rw-r--r--arm_compute/core/Window.h21
-rw-r--r--arm_compute/core/Window.inl7
-rw-r--r--arm_compute/core/experimental/ClWorkload.h220
-rw-r--r--arm_compute/core/experimental/DependencyGraph.h278
-rw-r--r--arm_compute/core/experimental/IWorkload.h133
-rw-r--r--arm_compute/core/experimental/OperatorGraph.h211
-rw-r--r--arm_compute/core/experimental/Types.h28
-rw-r--r--arm_compute/runtime/CL/CLScheduler.h7
-rw-r--r--arm_compute/runtime/CL/CLTuner.h2
-rw-r--r--arm_compute/runtime/CL/ICLTuner.h3
-rw-r--r--arm_compute/runtime/experimental/ClCompositeOperator.h191
-rw-r--r--docs/DoxygenLayout.xml2
-rw-r--r--examples/SConscript11
-rw-r--r--examples/dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp386
-rw-r--r--examples/dynamic_fusion/cl_ref_conv2d_elementwise_add.cpp223
-rw-r--r--filelist.json11
-rw-r--r--src/core/CL/ICLKernel.h2
-rw-r--r--src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp79
-rw-r--r--src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h201
-rw-r--r--src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h366
-rw-r--r--src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Utils.h8
-rw-r--r--src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp202
-rw-r--r--src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h23
-rw-r--r--src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp153
-rw-r--r--src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h13
-rw-r--r--src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp555
-rw-r--r--src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h83
-rw-r--r--src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClKernelComponents.h9
-rw-r--r--src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp81
-rw-r--r--src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h20
-rw-r--r--src/core/experimental/dynamic_fusion/OperatorGraph.cpp236
-rw-r--r--src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.cpp233
-rw-r--r--src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.h453
-rw-r--r--src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelDescriptors.h112
-rw-r--r--src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.cpp219
-rw-r--r--src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h240
-rw-r--r--src/core/experimental/dynamic_fusion/WorkloadImpl/ClWorkload.cpp73
-rw-r--r--src/core/experimental/dynamic_fusion/WorkloadImpl/DependencyGraph.cpp431
-rw-r--r--src/core/experimental/dynamic_fusion/WorkloadImpl/ITensorDescPack.h242
-rw-r--r--src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.cpp387
-rw-r--r--src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.h229
-rw-r--r--src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.cpp64
-rw-r--r--src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.h48
-rw-r--r--src/gpu/cl/operators/experimental/dynamic_fusion/ClCompositeOperator.cpp242
-rw-r--r--src/runtime/CL/CLScheduler.cpp4
-rw-r--r--src/runtime/CL/CLTuner.cpp6
-rw-r--r--support/DeepCopy.h203
-rw-r--r--tests/SConscript14
-rw-r--r--tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp515
-rw-r--r--tests/validation/CL/UNIT/dynamic_fusion/DependencyGraph.cpp267
-rw-r--r--tests/validation/CL/UNIT/dynamic_fusion/Integration_OperatorFuseMovenetSubGraph1.cpp403
-rw-r--r--tests/validation/CL/UNIT/dynamic_fusion/Utils.h71
55 files changed, 6509 insertions, 1755 deletions
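Taken together, the new public interfaces form a four-step flow: describe the fused workload with an OperatorGraph, lower it to a ClWorkload, configure a ClCompositeOperator against that workload, then bind real tensors and run. The outline below is a minimal sketch pieced together from the headers and the example added in this patch; it assumes using-directives for arm_compute and arm_compute::experimental::dynamic_fusion as in the example, op_graph and op_tensors (an OpTensorBinding) are placeholders set up by the caller, and error handling is omitted. More detailed sketches follow the individual headers below.

    // 1. Describe: an OperatorGraph of OpTensors and Operators (see OperatorGraph.h below)
    OperatorGraph op_graph;
    // ... add_tensor(op_graph, ...), add_op_conv2d(...), add_op_elementwise_add(...) ...

    // 2. Lower: translate the description into an OpenCL workload (see ClWorkload.h below)
    ClWorkloadContext ctx{};
    ctx.gpu_info.target = CLScheduler::get().target(); // requires an initialised CLScheduler
    ClWorkload workload;
    Status st = build(workload, op_graph, ctx);

    // 3. Configure and bind: compile the kernels, then map OpTensors to real CL tensors
    ClCompositeOperator op;
    op.configure(CLKernelLibrary::get().get_compile_context(), workload);
    ClAuxTensorData aux;
    TensorPackMap prepare_packs, run_packs;
    st = bind_tensors(aux, prepare_packs, run_packs, workload, op_tensors);

    // 4. Run: prepare-stage kernels once, run-stage kernels per inference
    op.prepare(prepare_packs);
    op.run(run_packs);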
diff --git a/Android.bp b/Android.bp
index c072c0e371..d1efc0a632 100644
--- a/Android.bp
+++ b/Android.bp
@@ -371,8 +371,13 @@ cc_library_static {
"src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp",
"src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp",
"src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp",
- "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp",
"src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp",
+ "src/core/experimental/dynamic_fusion/OperatorGraph.cpp",
+ "src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.cpp",
+ "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.cpp",
+ "src/core/experimental/dynamic_fusion/WorkloadImpl/ClWorkload.cpp",
+ "src/core/experimental/dynamic_fusion/WorkloadImpl/DependencyGraph.cpp",
+ "src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.cpp",
"src/core/helpers/SoftmaxHelpers.cpp",
"src/core/helpers/WindowHelpers.cpp",
"src/core/utils/AssemblyUtils.cpp",
@@ -674,6 +679,7 @@ cc_library_static {
"src/gpu/cl/operators/ClSub.cpp",
"src/gpu/cl/operators/ClTranspose.cpp",
"src/gpu/cl/operators/ClWinogradConv2d.cpp",
+ "src/gpu/cl/operators/experimental/dynamic_fusion/ClCompositeOperator.cpp",
"src/runtime/Allocator.cpp",
"src/runtime/BlobLifetimeManager.cpp",
"src/runtime/BlobMemoryPool.cpp",
diff --git a/arm_compute/core/TensorInfo.h b/arm_compute/core/TensorInfo.h
index 9bc86806fb..40f9ed9806 100644
--- a/arm_compute/core/TensorInfo.h
+++ b/arm_compute/core/TensorInfo.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -297,6 +297,7 @@ public:
_are_values_constant = are_values_constant;
return *this;
}
+ inline friend bool operator==(const TensorInfo &lhs, const TensorInfo &rhs);
private:
/** Calculates strides, offset and total size resulting from the specified padding around the XY plane.
@@ -320,5 +321,20 @@ private:
DataLayout _data_layout;
bool _are_values_constant;
};
+
+/** Check whether two tensor info are equal.
+ *
+ * @param[in] lhs LHS tensor info.
+ * @param[in] rhs RHS tensor info.
+ *
+ * @return True if the given tensor infos are the same.
+ */
+inline bool operator==(const TensorInfo &lhs, const TensorInfo &rhs)
+{
+ return (lhs._total_size == rhs._total_size) && (lhs._offset_first_element_in_bytes == rhs._offset_first_element_in_bytes) && (lhs._strides_in_bytes == rhs._strides_in_bytes)
+ && (lhs._num_channels == rhs._num_channels) && (lhs._tensor_shape == rhs._tensor_shape) && (lhs._dims_state == rhs._dims_state) && (lhs._data_type == rhs._data_type) && (lhs._format == rhs._format)
+ && (lhs._is_resizable == rhs._is_resizable) && (lhs._valid_region == rhs._valid_region) && (lhs._padding == rhs._padding) && (lhs._quantization_info == rhs._quantization_info)
+ && (lhs._data_layout == rhs._data_layout) && (lhs._are_values_constant == rhs._are_values_constant);
+}
} // namespace arm_compute
#endif /*ARM_COMPUTE_TENSORINFO_H */
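The new equality operator compares every field of the two infos, so infos describing the same logical tensor but differing in, say, padding or format will not compare equal. A small sketch of the intended use (shapes and types are arbitrary illustration values):

    TensorInfo a(TensorShape(8U, 4U), 1, DataType::F32);
    TensorInfo b(TensorShape(8U, 4U), 1, DataType::F32);
    bool same = (a == b);          // true: all fields match
    b.set_data_type(DataType::F16);
    same = (a == b);               // false: data types now differ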
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
index 1548816e91..7ae6a7e67e 100644
--- a/arm_compute/core/Types.h
+++ b/arm_compute/core/Types.h
@@ -253,9 +253,22 @@ struct ValidRegion
return *this;
}
+ /** Check whether two valid regions are equal.
+ *
+ * @param[in] lhs LHS valid region
+ * @param[in] rhs RHS valid region
+ *
+ * @return True if the valid regions are the same.
+ */
+ inline friend bool operator==(const ValidRegion &lhs, const ValidRegion &rhs);
+
Coordinates anchor; /**< Anchor for the start of the valid region. */
TensorShape shape; /**< Shape of the valid region. */
};
+inline bool operator==(const ValidRegion &lhs, const ValidRegion &rhs)
+{
+ return (lhs.anchor == rhs.anchor) && (lhs.shape == rhs.shape);
+}
/** Methods available to handle borders */
enum class BorderMode
@@ -346,7 +359,7 @@ struct BorderSize
*
* @return true if they are equal
*/
- bool operator==(const BorderSize &rhs)
+ bool operator==(const BorderSize &rhs) const
{
return (top == rhs.top) && (right == rhs.right) && (bottom == rhs.bottom) && (left == rhs.left);
}
@@ -357,7 +370,7 @@ struct BorderSize
*
* @return true if they are different
*/
- bool operator!=(const BorderSize &rhs)
+ bool operator!=(const BorderSize &rhs) const
{
return !(*this == rhs);
}
diff --git a/arm_compute/core/Window.h b/arm_compute/core/Window.h
index f603e6c148..c566cffa88 100644
--- a/arm_compute/core/Window.h
+++ b/arm_compute/core/Window.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2020, 2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -123,6 +123,17 @@ public:
{
_end = end;
}
+ /** Check whether two Dimensions are equal.
+ *
+ * @param[in] lhs LHS Dimensions
+ * @param[in] rhs RHS Dimensions
+ *
+ * @return True if the Dimensions are the same.
+ */
+ friend bool operator==(const Dimension &lhs, const Dimension &rhs)
+ {
+ return (lhs._start == rhs._start) && (lhs._end == rhs._end) && (lhs._step == rhs._step);
+ }
private:
int _start; /**< Start of the dimension */
@@ -414,6 +425,14 @@ public:
* @param[in] rhs Second window to swap.
*/
friend void swap(Window &lhs, Window &rhs);
+ /** Check whether two Windows are equal.
+ *
+ * @param[in] lhs LHS window
+ * @param[in] rhs RHS window
+ *
+ * @return True if the given windows are the same.
+ */
+ friend bool operator==(const Window &lhs, const Window &rhs);
private:
/** First slice of the window
diff --git a/arm_compute/core/Window.inl b/arm_compute/core/Window.inl
index 6100d09a1c..5ee4b57145 100644
--- a/arm_compute/core/Window.inl
+++ b/arm_compute/core/Window.inl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2020, 2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -305,4 +305,9 @@ inline void swap(Window &lhs, Window &rhs)
{
lhs._dims.swap(rhs._dims);
}
+
+inline bool operator==(const Window &lhs, const Window &rhs)
+{
+ return (lhs._dims == rhs._dims) && (lhs._is_broadcasted == rhs._is_broadcasted);
+}
} // namespace arm_compute
diff --git a/arm_compute/core/experimental/ClWorkload.h b/arm_compute/core/experimental/ClWorkload.h
new file mode 100644
index 0000000000..bcac08b9f7
--- /dev/null
+++ b/arm_compute/core/experimental/ClWorkload.h
@@ -0,0 +1,220 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
+#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLWORKLOAD_H
+#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLWORKLOAD_H
+
+#include "arm_compute/core/CL/CLCompileContext.h"
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/Window.h"
+
+#include "arm_compute/core/experimental/IWorkload.h"
+#include "arm_compute/core/experimental/OperatorGraph.h"
+
+#include <map>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Verbose and explicit way to enumerate all the tensor argument variants used by
+ * all kernel implementations. This avoids any ambiguity about which kernel arguments are passed
+ */
+enum class ClKernelTensorArgType : int
+{
+ Scalar,
+
+ Vector,
+
+ Image,
+ Image_Reinterpret_As_3D,
+ Image_Export_To_ClImage2D,
+
+ Image_3D, // 3D Tensor represented as a 2D Image + stride_z
+ Image_3D_Export_To_ClImage2D,
+
+ Tensor_3D,
+ Tensor_4D,
+ Tensor_4D_t_Buffer,
+ Tensor_4D_t_Image
+};
+
+/** Describes all the info required to add a kernel argument at run time
+ *
+ * @note This struct can later be expanded into a more concise and formal way to specify how to set up
+ * arguments for a kernel inside a @ref ClUnitWorkload
+ */
+struct ClKernelArgDescriptor
+{
+ ClKernelArgDescriptor() = default;
+ ClKernelArgDescriptor(int arg_id, ClKernelTensorArgType type, bool slide_along_dimz = true)
+ : arg_id{ arg_id }, tensor_arg_type{ type }, slide_along_dimz{ slide_along_dimz }
+ {
+ }
+ ~ClKernelArgDescriptor() = default;
+ friend bool operator==(const ClKernelArgDescriptor &arg0, const ClKernelArgDescriptor &arg1)
+ {
+ return (arg0.tensor_arg_type == arg1.tensor_arg_type) && (arg0.slide_along_dimz == arg1.slide_along_dimz);
+ }
+ int arg_id{ -1 }; /**< Arg ID in the blueprint, -1 means empty / uninitialized */
+ ClKernelTensorArgType tensor_arg_type{ ClKernelTensorArgType::Image }; /**< tensor argument type */
+ bool slide_along_dimz{ true }; /**< @note slide_along_dimz will be moved out of this descriptor in later iterations */
+};
+
+using ClKernelArgList = std::map<int, ClKernelArgDescriptor>;
+
+/** Descriptor containing information required to run a single ClWorkload
+ */
+struct ClExecutionDescriptor
+{
+ cl::NDRange suggested_lws{}; /**< Suggested local work-group size for optimal performance if not zero */
+ cl::NDRange gws{}; /**< Global work-group to be used */
+ bool skip_sliding_window{ false }; /**< Skip sliding window slices during execution loop */
+};
+
+/** Contains kernel code to be compiled and run in a ClUnitWorkload
+ */
+struct ClKernelCode
+{
+ friend bool operator==(const ClKernelCode &code0, const ClKernelCode &code1)
+ {
+ return (code0.name == code1.name) && (code0.code == code1.code) && (code0.config_id == code1.config_id) && (code0.build_options == code1.build_options) && (code0.window == code1.window)
+ && (code0.arguments == code1.arguments);
+ }
+ std::string name{}; /**< Kernel name */
+ std::string code{}; /**< Kernel source code */
+ std::string config_id{}; /**< Generated from blueprint based on complex component */
+ CLBuildOptions build_options{}; /**< Kernel build options */
+ Window window{}; /**< Execution window */
+ ClKernelArgList arguments{}; /**< Kernel argument descriptors. map key is kernel ArgumentID */
+};
+
+/** A descriptor of ClWorkload Tensors.
+ */
+struct ClWorkloadTensor : public WorkloadTensor
+{
+ ClWorkloadTensor() = default;
+ ClWorkloadTensor(Id id, ITensorInfo *info, MemoryType memory_type, const AuxMemoryInfo &memory_info, const ClKernelArgDescriptor &kernel_arg)
+ : WorkloadTensor{ id, info, memory_type, memory_info }, kernel_arg{ kernel_arg }
+ {
+ }
+ ClKernelArgDescriptor kernel_arg{};
+ friend bool operator==(const ClWorkloadTensor &t0, const ClWorkloadTensor &t1)
+ {
+ return t0.info == t1.info && t0.memory_info == t1.memory_info && t0.memory_type == t1.memory_type && t0.kernel_arg == t1.kernel_arg;
+ }
+};
+
+/** The basic atomic unit in a @ref ClWorkload. It contains exactly one kernel to run.
+ */
+struct ClUnitWorkload : public UnitWorkload
+{
+ ClUnitWorkload() = default;
+ ClUnitWorkload(Id id, UnitWorkloadStage stage, const ClKernelCode &code)
+ : UnitWorkload{ id, stage }, code{ code }
+ {
+ }
+ friend bool operator==(const ClUnitWorkload &uworkload0, const ClUnitWorkload &uworkload1)
+ {
+ return uworkload0.stage == uworkload1.stage && uworkload0.code == uworkload1.code;
+ }
+ ClKernelCode code{};
+};
+
+/** GPU information for @ref ClWorkloadContext
+ */
+struct GpuInfo
+{
+ friend bool operator==(const GpuInfo &info0, const GpuInfo &info1)
+ {
+ return info0.target == info1.target;
+ }
+ GPUTarget target{ GPUTarget::UNKNOWN };
+};
+
+/** Context (device capabilities, platform details) associated with a ClWorkload
+ *
+ * It is required for building the @ref ClKernelCode and could also be used by the runtime (e.g. schedulers)
+ */
+struct ClWorkloadContext
+{
+ friend bool operator==(const ClWorkloadContext &ctx0, const ClWorkloadContext &ctx1)
+ {
+ return ctx0.gpu_info == ctx1.gpu_info;
+ }
+ GpuInfo gpu_info{};
+};
+
+/** Workload for Cl backend
+ */
+struct ClWorkload : public IWorkload
+{
+ Tid add_workload_tensor(ITensorInfo *info, MemoryType memory_type, const AuxMemoryInfo &memory_info, const ClKernelArgDescriptor &kernel_arg, Tid merge_point)
+ {
+ Tid id = graph.add_tensor(merge_point);
+ if(tensors.find(id) == tensors.end())
+ {
+ tensors[id] = ClWorkloadTensor(id, info, memory_type, memory_info, kernel_arg);
+ }
+ return id;
+ }
+ UnitWorkId add_unit_workload(UnitWorkloadStage stage, const ClKernelCode &code, const std::vector<Tid> &inputs, const std::vector<Tid> &outputs)
+ {
+ auto op = graph.add_operator(inputs, outputs);
+ auto id = op.second;
+ unit_workloads[id] = ClUnitWorkload(id, stage, code);
+ return id;
+ }
+ friend bool operator==(const ClWorkload &workload0, const ClWorkload &workload1)
+ {
+ return std::make_tuple(
+ workload0.graph, workload0.context, workload0.unit_workloads, workload0.tensors, workload0.op_tensor_id_lut)
+ == std::make_tuple(
+ workload1.graph, workload1.context, workload1.unit_workloads, workload1.tensors, workload1.op_tensor_id_lut);
+ }
+ ClWorkloadContext context{}; /**< Workload context*/
+ std::map<UnitWorkId, ClUnitWorkload> unit_workloads{}; /**< Unit workloads to run*/
+ std::map<Tid, ClWorkloadTensor> tensors{}; /**< Workload tensors*/
+ std::map<Tid, OpTensor::Id> op_tensor_id_lut{}; /**< Map from ClWorkloadTensor to SRC and DST Operator Tensors (no need to store "intermediate" Operator Tensors)*/
+ Status status{}; /**< For compatibility with the IOperator validate method. Store if the workload is valid or not. */
+};
+
+/** Build a @ref ClWorkload from an @ref OperatorGraph.
+ *
+ * @param[out] workload
+ * @param[in] op_graph
+ * @param[in] ctx
+ * @return Status
+ */
+Status build(ClWorkload &workload, const OperatorGraph &op_graph, const ClWorkloadContext &ctx);
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLWORKLOAD_H \ No newline at end of file
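As a rough sketch of how the structures above might be consumed (assuming op_graph is an already-described OperatorGraph and the CLScheduler has been initialised); only the public fields declared in this header are read:

    ClWorkload workload;
    ClWorkloadContext ctx{};
    ctx.gpu_info.target = CLScheduler::get().target();
    const Status st = build(workload, op_graph, ctx);

    // Each ClUnitWorkload wraps exactly one generated kernel
    for(const auto &uwk : workload.unit_workloads)
    {
        const ClKernelCode &code = uwk.second.code;
        // code.name, code.code, code.build_options, code.window and code.arguments
        // carry everything needed to compile and enqueue that kernel.
        ARM_COMPUTE_UNUSED(code);
    }
    // Auxiliary tensors report their extra memory requirements via memory_info
    for(const auto &t : workload.tensors)
    {
        if(t.second.memory_type == MemoryType::Auxiliary)
        {
            const size_t aux_bytes = t.second.memory_info.size;
            ARM_COMPUTE_UNUSED(aux_bytes);
        }
    }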
diff --git a/arm_compute/core/experimental/DependencyGraph.h b/arm_compute/core/experimental/DependencyGraph.h
new file mode 100644
index 0000000000..794bf0e344
--- /dev/null
+++ b/arm_compute/core/experimental/DependencyGraph.h
@@ -0,0 +1,278 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
+#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_DEPENDENCYGRAPH_H
+#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_DEPENDENCYGRAPH_H
+
+#include "arm_compute/core/Error.h"
+
+#include <algorithm>
+#include <map>
+#include <vector>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+template <typename T>
+bool is_in(const T &v, const std::vector<T> &vec)
+{
+ return std::find(std::begin(vec), std::end(vec), v) != std::end(vec);
+}
+
+/** The dependency graph of a workload, where the nodes are of 2 types: Tensor or Operator
+ * Represented as a doubly-linked adjacency list with the differentiation between source and destination
+ *
+ * A "Merge Tensor" is an external tensor associated with a tensor within the graph, and serves as a merge point
+ */
+class DependencyGraph
+{
+public:
+ /** A serial Id allocator
+ *
+ */
+ class SerialIdAllocator
+ {
+ public:
+ using Id = int;
+ Id alloc()
+ {
+ return _counter++;
+ }
+ constexpr static Id empty()
+ {
+ return -1;
+ }
+
+ private:
+ Id _counter{ 0 };
+ };
+ using Id = SerialIdAllocator::Id;
+ /** Adjacency list
+ *
+ */
+ using AdjList = std::map<Id, std::vector<Id>>;
+
+ /** An operator bundled with its input and output tensors, used when traversing the graph in topological order
+ *
+ */
+ struct OpPack
+ {
+ Id op{};
+ std::vector<Id> inputs{};
+ std::vector<Id> outputs{};
+ friend bool operator==(const OpPack &opp0, const OpPack &opp1)
+ {
+ return std::make_tuple(
+ opp0.op, opp0.inputs, opp0.outputs)
+ == std::make_tuple(
+ opp1.op, opp1.inputs, opp1.outputs);
+ }
+ };
+
+public:
+ constexpr static Id empty_id()
+ {
+ return SerialIdAllocator::empty();
+ }
+
+ DependencyGraph() = default;
+ // Used in cases where two DependencyGraphs may want to share the same configuration of tensors
+ explicit DependencyGraph(const std::vector<Id> &imported_tensors);
+ // Testing only
+ DependencyGraph(const AdjList &adj_src_tensors, const AdjList &adj_dst_tensors, const AdjList &adj_src_ops, const AdjList &adj_dst_ops, std::map<Id, Id> merge_points = {});
+
+ /** Add a new tensor
+ *
+ * @param merge_tensor The external merge point associated with the tensor. Leave empty if not needed.
+ * @return Id The newly allocated tensor, or a previously added tensor associated with @p merge_tensor
+ */
+ Id add_tensor(Id merge_tensor = empty_id());
+
+ void remove_tensor(Id tensor);
+
+ /** Add a new operator
+ *
+ * @param inputs Input tensors to the operator
+ * @param outputs Output tensors to the operator
+ * @return std::pair<Status, DependencyGraph::Id> where id is the newly allocated operator
+ */
+ std::pair<Status, DependencyGraph::Id> add_operator(const std::vector<Id> &inputs, const std::vector<Id> &outputs);
+
+ void remove_operator(Id op);
+ /** Sort the graph in a topological order
+ *
+ * @return std::pair<Status, std::vector<OpPack>>
+ */
+ std::pair<Status, std::vector<OpPack>> topological_sort() const;
+
+ std::vector<Id> src_ops(Id op) const;
+ std::vector<Id> dst_ops(Id op) const;
+
+ std::vector<Id> src_ops_from_tensor(Id tensor) const;
+ std::vector<Id> dst_ops_from_tensor(Id tensor) const;
+ /** Get the merge points object
+ *
+ * @return std::map<Id, Id>
+ */
+ std::map<Id, Id> get_merge_points() const;
+ /** Get all root ops. Root ops can also be referred to as "src ops" of the whole graph
+ *
+ * @return std::vector<Id>
+ */
+ std::vector<Id> get_root_ops() const;
+ /** Get all dst ops of the whole graph
+ *
+ * @return std::vector<Id>
+ */
+ std::vector<Id> get_dst_ops() const;
+
+ /** Get source tensors to an operator
+ *
+ * @param op
+ * @return std::vector<Id>
+ */
+ std::vector<Id> src_tensors(Id op) const;
+ /** Get destination tensors to an operator
+ *
+ * @param op
+ * @return std::vector<Id>
+ */
+ std::vector<Id> dst_tensors(Id op) const;
+ /** Get source tensors of the whole graph
+ *
+ * @return std::vector<Id>
+ */
+ std::vector<Id> src_tensors() const;
+ /** Get destination tensors of the whole graph
+ *
+ * @return std::vector<Id>
+ */
+ std::vector<Id> dst_tensors() const;
+ /** Get all operators
+ *
+ * @return std::vector<Id>
+ */
+ std::vector<Id> all_ops() const;
+ /** Get all tensors
+ *
+ * @return std::vector<Id>
+ */
+ std::vector<Id> all_tensors() const;
+ /** Number of operators
+ *
+ * @return unsigned int
+ */
+ unsigned int number_of_ops() const;
+ /** Number of tensors
+ *
+ * @return unsigned int
+ */
+ unsigned int number_of_tensors() const;
+
+ /** Update @p merge_point to point to @p t_id
+ *
+ * @param t_id
+ * @param merge_point
+ */
+ Status update_merge_point(Id t_id, Id merge_point);
+
+ /** Strict equality comparison (all internal ids and order of insertion matter).
+ * In the future this may be replaced with a topological comparison, allowing equivalent graphs with different internal ids to be equal
+ *
+ *
+ * @param g0
+ * @param g1
+ * @return true
+ * @return false
+ */
+ friend bool operator==(const DependencyGraph &g0, const DependencyGraph &g1)
+ {
+ // Do not compare id allocators
+ return std::make_tuple(
+ g0._adj_src_tensors, g0._adj_dst_tensors, g0._adj_src_ops, g0._adj_dst_ops, g0._merge_to_internal)
+ == std::make_tuple(
+ g1._adj_src_tensors, g1._adj_dst_tensors, g1._adj_src_ops, g1._adj_dst_ops, g1._merge_to_internal);
+ }
+ void link_input(Id op, Id in_tensor);
+ void link_output(Id op, Id out_tensor);
+ /** Check if there's a path from @p src_tensor to @p dst_op
+ *
+ * @param src_tensor
+ * @param dst_op
+ * @return true
+ * @return false
+ */
+ bool path_exists_from_tensor_to_op(Id src_tensor, Id dst_op) const;
+ /** Check if there's a path from @p src_op to @p dst_op
+ *
+ * @param src_op
+ * @param dst_op
+ * @return true
+ * @return false
+ */
+ bool path_exists_from_op_to_op(Id src_op, Id dst_op) const;
+ /** Check if tensor is the src tensor of the entire graph
+ *
+ * @param tensor
+ * @return true
+ * @return false
+ */
+ bool is_src_tensor(Id tensor) const;
+ /** Check if tensor is the dst tensor of the entire graph
+ *
+ * @param tensor
+ * @return true
+ * @return false
+ */
+ bool is_dst_tensor(Id tensor) const;
+
+private:
+ Id insert_new_tensor();
+ Id insert_new_op();
+ bool tensor_exists(Id tensor) const;
+ bool operator_exists(Id op) const;
+ bool is_src_tensor_of(Id op, Id tensor) const;
+ bool is_dst_tensor_of(Id op, Id tensor) const;
+ bool are_connected(Id op, Id tensor) const;
+
+private:
+ AdjList _adj_src_tensors{};
+ AdjList _adj_dst_tensors{};
+ AdjList _adj_src_ops{};
+ AdjList _adj_dst_ops{};
+ std::map<Id, Id> _merge_to_internal{}; // From merge tensor to internal tensor
+ SerialIdAllocator _operator_id{};
+ SerialIdAllocator _tensor_id{};
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_DEPENDENCYGRAPH_H \ No newline at end of file
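DependencyGraph only tracks ids and connectivity; no tensor data is involved. A minimal sketch of the bookkeeping it provides (two chained operators sharing tensor t1), using only the methods declared above:

    DependencyGraph g;
    const auto t0  = g.add_tensor();
    const auto t1  = g.add_tensor();
    const auto t2  = g.add_tensor();
    const auto op0 = g.add_operator({ t0 }, { t1 }); // returns std::pair<Status, Id>
    const auto op1 = g.add_operator({ t1 }, { t2 }); // consumes op0's output
    ARM_COMPUTE_UNUSED(op0, op1);

    // Traverse in topological order: each OpPack bundles an operator id
    // with the ids of its input and output tensors
    const auto sorted = g.topological_sort();
    for(const auto &pack : sorted.second)
    {
        ARM_COMPUTE_UNUSED(pack.op, pack.inputs, pack.outputs);
    }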
diff --git a/arm_compute/core/experimental/IWorkload.h b/arm_compute/core/experimental/IWorkload.h
new file mode 100644
index 0000000000..942dbb70bb
--- /dev/null
+++ b/arm_compute/core/experimental/IWorkload.h
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
+#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IWORKLOAD_H
+#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IWORKLOAD_H
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/experimental/Types.h"
+
+#include "arm_compute/core/experimental/DependencyGraph.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Describes when a Unit Workload is run.
+ *
+ */
+struct UnitWorkloadStage
+{
+ enum class Stage
+ {
+ Prepare, /**< Only run once at the beginning. */
+ Run, /**< Run every time after the first time. */
+ };
+ Stage stage;
+ friend bool operator==(const UnitWorkloadStage &stage0, const UnitWorkloadStage &stage1)
+ {
+ return stage0.stage == stage1.stage;
+ }
+};
+/** Type of memory used by a Workload Tensor
+ *
+ */
+enum class MemoryType
+{
+ Core = 0, /**< Core memory used by the Workload Tensor, e.g. for argument tensors */
+ Auxiliary = 1, /**< Auxiliary memory required by the Workload Tensor, e.g. for temporary tensors */
+};
+
+using AuxMemoryLifetime = MemoryLifetime;
+
+/** Memory Info for a @ref WorkloadTensor of Auxiliary memory type. This communicates to the user how much additional
+ * memory is required for auxiliary tensors
+ */
+struct AuxMemoryInfo
+{
+ AuxMemoryInfo() = default;
+
+ AuxMemoryInfo(size_t size, size_t alignment = 0) noexcept
+ : size(size),
+ alignment(alignment)
+ {
+ }
+
+ AuxMemoryInfo(AuxMemoryLifetime lifetime, size_t size, size_t alignment = 0) noexcept
+ : lifetime(lifetime),
+ size(size),
+ alignment(alignment)
+ {
+ }
+ friend bool operator==(const AuxMemoryInfo &info0, const AuxMemoryInfo &info1)
+ {
+ return info0.lifetime == info1.lifetime && info0.size == info1.size && info0.alignment == info1.alignment;
+ }
+
+ AuxMemoryLifetime lifetime{ AuxMemoryLifetime::Temporary }; /**< Memory lifetime*/
+ size_t size{ 0 }; /**< Total memory size in bytes */
+ size_t alignment{ 64 }; /**< Memory alignment in bytes */
+};
+
+/** A descriptor for IWorkload Tensors.
+ */
+struct WorkloadTensor
+{
+ using Id = DependencyGraph::Id;
+ Id id{}; /**< Id of the workload tensor */
+ ITensorInfo *info{}; /**< TensorInfo associated with the workload tensor */
+ MemoryType memory_type{}; /**< Memory type */
+ AuxMemoryInfo memory_info{}; /**< Auxiliary memory information. This can be ignored if the memory type is Core */
+};
+/** The basic atomic unit in an @ref IWorkload. It contains exactly one kernel to run.
+ *
+ */
+struct UnitWorkload
+{
+ using Id = DependencyGraph::Id;
+ Id id{}; /**< Id of the unit workload */
+ UnitWorkloadStage stage{}; /**< Stage */
+};
+
+/** Run-time-agnostic, platform-specific graph that describes everything required to run a workload
+ * It can be configured into an Arm Compute Library runtime, integrated into the runtime of another framework, or integrated into the compilation flow
+ */
+struct IWorkload
+{
+ using UnitWorkId = UnitWorkload::Id;
+ using Tid = WorkloadTensor::Id;
+ IWorkload() = default;
+ virtual ~IWorkload() = default;
+ DependencyGraph graph{}; /**< Dependency graph of the workload tensors and the unit workloads */
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IWORKLOAD_H \ No newline at end of file
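A short sketch of the descriptor types above (sizes are made-up illustration values): a Core tensor is a user-visible argument, an Auxiliary tensor asks the integrator for extra scratch memory described by an AuxMemoryInfo, and a unit workload is either a Prepare-stage (run once) or Run-stage (run every execution) kernel.

    // 2 KiB of temporary scratch memory, 64-byte aligned (illustration values)
    AuxMemoryInfo scratch_info{ AuxMemoryLifetime::Temporary, 2048, 64 };

    UnitWorkloadStage prepare_stage{ UnitWorkloadStage::Stage::Prepare };
    UnitWorkloadStage run_stage{ UnitWorkloadStage::Stage::Run };
    const bool same_stage = (prepare_stage == run_stage); // false
    ARM_COMPUTE_UNUSED(scratch_info, same_stage);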
diff --git a/arm_compute/core/experimental/OperatorGraph.h b/arm_compute/core/experimental/OperatorGraph.h
new file mode 100644
index 0000000000..621a719fe6
--- /dev/null
+++ b/arm_compute/core/experimental/OperatorGraph.h
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
+
+#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_OPERATORGRAPH
+#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_OPERATORGRAPH
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/ITensorInfo.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Graph of operators to execute within a Workload. This is a pure descriptive construct.
+ */
+class OperatorGraph final
+{
+public:
+ struct Implementation;
+ OperatorGraph();
+ ~OperatorGraph();
+
+public:
+ Implementation *impl();
+ const Implementation *impl() const;
+
+private:
+ std::unique_ptr<Implementation> _impl;
+};
+
+/** Return the validity of @p op_graph, usually after performing an operation (e.g. add_tensor) on it
+ *
+ * @param[in,out] op_graph OperatorGraph to be validated
+ *
+ * @return Status
+ */
+Status validate(const OperatorGraph &op_graph);
+
+/** Operator Tensor Handle
+ * This can be either an argument tensor, or an intermediate tensor linking 2 @ref Operator s
+ */
+class OpTensor final
+{
+public:
+ using Id = int;
+ OpTensor(Id id = {});
+ /** Id of the OpTensor
+ * @return Id
+ */
+ Id id() const;
+
+private:
+ Id _id{};
+};
+
+/** Provide order of @ref OpTensor by checking if @p t0 is "lower than" @p t1
+ *
+ * @param[in] t0 OpTensor
+ * @param[in] t1 OpTensor
+ *
+ * @return true if @p t0 is lower than @p t1
+ * @return false otherwise
+ */
+bool operator<(const OpTensor &t0, const OpTensor &t1);
+
+/** Associate a TensorInfo with a newly created @ref OpTensor in the @p graph.
+ *
+ * @note @p info needs to remain in scope and valid until the workload has finished building
+ * @note Can pass in an empty TensorInfo for a destination Tensor, in which case @p info will be inferred from the source tensors
+ *
+ * @param[in,out] graph OperatorGraph where the tensor is added
+ * @param[in] info TensorInfo to be associated
+ *
+ * @return OpTensor
+ */
+OpTensor add_tensor(OperatorGraph &graph, ITensorInfo &info);
+
+/** Operator Handle
+ * This can be used to further modify an existing operator
+ */
+class Operator final
+{
+public:
+ using Id = int;
+ Operator(Id id = {});
+ /** Id of the Operator
+ * @return Id
+ */
+ Id id() const;
+
+private:
+ Id _id{};
+};
+
+/** Provide order of @ref Operator by checking if @p op0 is "lower than" @p op1
+ *
+ * @param[in] op0 Operator
+ * @param[in] op1 Operator
+ *
+ * @return true if @p op0 is lower than @p op1
+ * @return false otherwise
+ */
+bool operator<(const Operator &op0, const Operator &op1);
+
+/** Padding information for 2D operations such as Conv2d
+ */
+struct Padding2D
+{
+ Padding2D() = default;
+ Padding2D(size_t left, size_t right, size_t top, size_t bottom)
+ : left(left), right(right), top(top), bottom(bottom)
+ {
+ }
+ size_t left = { 0 }; /**< Padding across the width dimension on the left, in elements. */
+ size_t right = { 0 }; /**< Padding across the width dimension on the right, in elements. */
+ size_t top = { 0 }; /**< Padding across the height dimension on the top, in elements. */
+ size_t bottom = { 0 }; /**< Padding across the height dimension on the bottom, in elements. */
+};
+
+/** Descriptor for the Conv2d operation
+ */
+struct Conv2dDescriptor
+{
+ /* TOSA compliant attribute parameters start */
+ Padding2D pad{};
+ Size2D stride{ 1U, 1U };
+ Size2D dilation{ 1U, 1U };
+ /* TOSA compliant attribute parameters end */
+ /* Non-TOSA compliant attribute parameters start */
+ /* Non-TOSA compliant attribute parameters end */
+};
+/** Add op Conv2d to @p graph
+ *
+ * @param[in,out] graph OperatorGraph where the operator is added to
+ * @param[in] desc Operator descriptor
+ * @param[in] input Input OpTensor
+ * @param[in] weights Weights OpTensor
+ * @param[in] bias (Optional) bias OpTensor
+ * @param[in] dst Destination OpTensor
+ *
+ * @return Operator
+ */
+Operator add_op_conv2d(OperatorGraph &graph, const Conv2dDescriptor &desc, OpTensor input, OpTensor weights, OpTensor bias, OpTensor dst);
+Operator add_op_conv2d(OperatorGraph &graph, const Conv2dDescriptor &desc, OpTensor input, OpTensor weights, OpTensor dst);
+/** (Only for Debugging and Testing) Force a conv2d method
+ *
+ * @param[in,out] graph OperatorGraph where conv2d op is located
+ * @param[in] conv2d Conv2d Op
+ * @param[in] method Forced ConvolutionMethod
+ */
+void force_conv2d_method(OperatorGraph &graph, Operator conv2d, ConvolutionMethod method);
+
+/** Descriptor for Addition operation
+ *
+ */
+struct AddDescriptor
+{
+ /* TOSA compliant attribute parameters start */
+ /* TOSA compliant attribute parameters end */
+ /* Non-TOSA compliant attribute parameters start */
+ /* Non-TOSA compliant attribute parameters end */
+};
+/** Add op Add to @p graph, and optionally describes fusion through passing of intermediate @ref OpTensor s
+ *
+ * @param[in,out] graph OperatorGraph where the operator is added to
+ * @param[in] desc Operator descriptor
+ * @param[in] lhs Lhs OpTensor
+ * @param[in] rhs Rhs OpTensor
+ * @param[in] dst Destination OpTensor
+ *
+ * @return Operator
+ */
+Operator add_op_elementwise_add(OperatorGraph &graph, const AddDescriptor &desc, OpTensor lhs, OpTensor rhs, OpTensor dst);
+
+bool operator==(const OpTensor &t0, const OpTensor &t1);
+bool operator==(const Padding2D &pad0, const Padding2D &pad1);
+bool operator==(const Conv2dDescriptor &conv2d0, const Conv2dDescriptor &conv2d1);
+bool operator==(const AddDescriptor &, const AddDescriptor &);
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_OPERATORGRAPH \ No newline at end of file
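Fusion in this API is expressed purely through tensor connectivity: when the destination OpTensor of one operator is consumed by another, the two can be fused as the graph is lowered. Below is a sketch of describing the conv2d + elementwise add workload used throughout this patch; input_info, weight_info, bias_info, acc_info, addend_info and dst_info are assumed TensorInfo objects owned by the caller (they must outlive the build step, as noted above).

    OperatorGraph graph;
    OpTensor in  = add_tensor(graph, input_info);
    OpTensor wei = add_tensor(graph, weight_info);
    OpTensor bia = add_tensor(graph, bias_info);
    OpTensor acc = add_tensor(graph, acc_info);    // intermediate: links conv2d to the add
    OpTensor add = add_tensor(graph, addend_info);
    OpTensor dst = add_tensor(graph, dst_info);

    Conv2dDescriptor conv_desc{}; // default pad {0,0,0,0}, stride {1,1}, dilation {1,1}
    Operator conv = add_op_conv2d(graph, conv_desc, in, wei, bia, acc);
    force_conv2d_method(graph, conv, ConvolutionMethod::DIRECT); // debug/testing hook only

    add_op_elementwise_add(graph, AddDescriptor{}, acc, add, dst);
    const Status st = validate(graph); // validity of the graph after the calls above
    ARM_COMPUTE_UNUSED(st);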
diff --git a/arm_compute/core/experimental/Types.h b/arm_compute/core/experimental/Types.h
index c8755dc26c..1995ab045e 100644
--- a/arm_compute/core/experimental/Types.h
+++ b/arm_compute/core/experimental/Types.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2021 Arm Limited.
+ * Copyright (c) 2020-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -41,20 +41,22 @@ enum TensorType : int32_t
ACL_SRC_DST = 0,
// Src
- ACL_SRC = 0,
- ACL_SRC_0 = 0,
- ACL_SRC_1 = 1,
- ACL_SRC_2 = 2,
- ACL_SRC_3 = 3,
- ACL_SRC_4 = 4,
- ACL_SRC_5 = 5,
- ACL_SRC_6 = 6,
+ ACL_SRC = 0,
+ ACL_SRC_0 = 0,
+ ACL_SRC_1 = 1,
+ ACL_SRC_2 = 2,
+ ACL_SRC_3 = 3,
+ ACL_SRC_4 = 4,
+ ACL_SRC_5 = 5,
+ ACL_SRC_6 = 6,
+ ACL_SRC_END = 6,
// Dst
- ACL_DST = 30,
- ACL_DST_0 = 30,
- ACL_DST_1 = 31,
- ACL_DST_2 = 32,
+ ACL_DST = 30,
+ ACL_DST_0 = 30,
+ ACL_DST_1 = 31,
+ ACL_DST_2 = 32,
+ ACL_DST_END = 32,
// Aux
ACL_INT = 50,
diff --git a/arm_compute/runtime/CL/CLScheduler.h b/arm_compute/runtime/CL/CLScheduler.h
index 5bfaaf4b5d..3919635d1b 100644
--- a/arm_compute/runtime/CL/CLScheduler.h
+++ b/arm_compute/runtime/CL/CLScheduler.h
@@ -42,7 +42,6 @@ namespace experimental
{
namespace dynamic_fusion
{
-struct TensorBinding;
struct ClExecutionDescriptor;
} // namespace dynamic_fusion
} // namespace experimental
@@ -113,15 +112,13 @@ public:
#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
/** Schedule the execution of the passed kernel if possible.
- * Use TensorBinding instead of ITensorPack for working with dynamic fusion
- * @note Does not support dynamic tuning yet
*
* @param[in] kernel Kernel to execute.
* @param[in] tensors Map containing the tensors to operate on.
* @param[in] exec_desc Execution descriptor
* @param[in] flush (Optional) Specifies if the command queue will be flushed after running the kernel. This will be ignored if job chaining is enabled.
*/
- void enqueue_op(ICLKernel &kernel, experimental::dynamic_fusion::TensorBinding &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc, bool flush = true);
+ void enqueue_op(ICLKernel &kernel, ITensorPack &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc, bool flush = true);
#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
@@ -218,7 +215,7 @@ private:
void flush_queue(bool flush);
#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
- void enqueue_common(ICLKernel &kernel, experimental::dynamic_fusion::TensorBinding &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc, bool flush);
+ void enqueue_common(ICLKernel &kernel, ITensorPack &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc, bool flush);
#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
/** Flag to ensure symbols initialisation is happening before Scheduler creation */
diff --git a/arm_compute/runtime/CL/CLTuner.h b/arm_compute/runtime/CL/CLTuner.h
index e595f8f34b..88933fc2d8 100644
--- a/arm_compute/runtime/CL/CLTuner.h
+++ b/arm_compute/runtime/CL/CLTuner.h
@@ -125,7 +125,7 @@ public:
void tune_kernel_dynamic(ICLKernel &kernel) override;
void tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors) override;
#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
- void tune_kernel_dynamic(ICLKernel &kernel, experimental::dynamic_fusion::TensorBinding &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc) override;
+ void tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc) override;
#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
/** Is the kernel_event set ?
diff --git a/arm_compute/runtime/CL/ICLTuner.h b/arm_compute/runtime/CL/ICLTuner.h
index a327497255..e0ee3ffe71 100644
--- a/arm_compute/runtime/CL/ICLTuner.h
+++ b/arm_compute/runtime/CL/ICLTuner.h
@@ -35,7 +35,6 @@ namespace experimental
{
namespace dynamic_fusion
{
-struct TensorBinding;
struct ClExecutionDescriptor;
} // namespace dynamic_fusion
} // namespace experimental
@@ -74,7 +73,7 @@ public:
* @param[in, out] tensors Tensors for the kernel to use
* @param[in] exec_desc Execution descriptor
*/
- virtual void tune_kernel_dynamic(ICLKernel &kernel, experimental::dynamic_fusion::TensorBinding &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc) = 0;
+ virtual void tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc) = 0;
#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
};
} // namespace arm_compute
diff --git a/arm_compute/runtime/experimental/ClCompositeOperator.h b/arm_compute/runtime/experimental/ClCompositeOperator.h
new file mode 100644
index 0000000000..b903bc0ee6
--- /dev/null
+++ b/arm_compute/runtime/experimental/ClCompositeOperator.h
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
+#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMIC_FUSION_CLCOMPOSITEOPERATOR_H
+#define ARM_COMPUTE_EXPERIMENTAL_DYNAMIC_FUSION_CLCOMPOSITEOPERATOR_H
+
+#include "arm_compute/core/CL/CLCompileContext.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/IOperator.h"
+
+#include "arm_compute/core/experimental/ClWorkload.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Map OpTensor handles to their corresponding ITensor memory
+ */
+using OpTensorBinding = std::map<OpTensor, ITensor *>;
+
+/** Map a kernel (as identified by its unit workload id) to its corresponding tensor pack
+ *
+ * @note External users should not use the add_tensor_pack method to alter this tensor pack map; they should only use the map returned by @ref bind_tensors
+ */
+class TensorPackMap
+{
+public:
+ /** Find a tensor pack associated with the unit workload Id @p uwk_id
+ *
+ * @param[in] uwk_id unit workload Id associated with the tensor pack
+ *
+ * @return ITensorPack*
+ */
+ ITensorPack *find_tensor_pack(UnitWorkload::Id uwk_id);
+ /** Get the tensor pack associated with @p uwk_id. Throws an exception if it cannot be found.
+ *
+ * @param[in] uwk_id unit workload Id associated with the tensor pack
+ *
+ * @return ITensorPack*
+ */
+ ITensorPack &get_tensor_pack(UnitWorkload::Id uwk_id);
+ /** Add a tensor pack and associate it with unit workload Id @p uwk_id
+ * @note Should not be used by external user
+ *
+ * @param[in] uwk_id unit workload Id associated with the tensor pack
+ * @param[in] tensor_pack Tensor Pack to be added
+ */
+ void add_tensor_pack(UnitWorkload::Id uwk_id, const ITensorPack &tensor_pack);
+
+private:
+ std::map<UnitWorkload::Id, ITensorPack> _tensor_packs{};
+};
+
+/** Holder of any auxiliary CLTensors required by a ClWorkload.
+ *
+ * @note The tensors are not allocated by default, and require the user to explicitly allocate them using the TensorInfo and AuxMemoryInfo
+ *
+ * @note This data holder must remain valid until the ClCompositeOperator that it's passed to is out of scope
+ *
+ * @note External users should not use the add_aux_tensor method; they should only use the data returned by @ref bind_tensors
+ */
+class ClAuxTensorData
+{
+public:
+ /** A view of a single auxiliary tensor and its associated TensorInfo and AuxMemoryInfo
+ */
+ struct DataView
+ {
+ DataView() = default;
+ DataView(CLTensor *tensor, const TensorInfo &tensor_info, const AuxMemoryInfo &memory_info)
+ : tensor{ tensor }, tensor_info{ tensor_info }, memory_info{ memory_info }
+ {
+ }
+ ~DataView() = default;
+ DataView(const DataView &other) = default;
+ DataView &operator=(const DataView &other) = default;
+ DataView(DataView &&other) = default;
+ DataView &operator=(DataView &&other) = default;
+ CLTensor *tensor{}; /**< Pointer to the auxiliary tensor */
+ TensorInfo tensor_info{}; /**< Associated TensorInfo */
+ AuxMemoryInfo memory_info{}; /**< Memory requirement */
+ };
+
+ /** Add auxiliary tensor.
+ *
+ * @note Should not be used by external user
+ *
+ * @param[in] tensor_id Any Id that can uniquely identify an auxiliary tensor. Usually ClWorkloadTensor Id
+ * @param[in] tensor_info TensorInfo associated with the tensor
+ * @param[in] memory_info Memory requirements
+ *
+ * @return CLTensor* if successfully added, otherwise nullptr
+ */
+ CLTensor *add_aux_tensor(int tensor_id, const ITensorInfo &tensor_info, const AuxMemoryInfo &memory_info);
+
+ /** Get views of all auxiliary tensors. This is mainly used for allocating the auxiliary tensors.
+ *
+ * @return std::vector<DataView>&
+ */
+ std::vector<DataView> &get_tensors();
+
+private:
+ std::map<int, std::unique_ptr<CLTensor>> _owned_tensors{};
+ std::vector<DataView> _tensors{};
+};
+
+/** Bind tensor memory to packs used by prepare and run methods. Create auxiliary tensor objects and their memory requirements if needed
+ *
+ * @note This is the only method for external user to create ClAuxTensorData, and the prepare and run TensorPackMaps
+ *
+ * @param[out] aux_tensor_data Auxiliary Tensors required by the workload
+ * @param[out] prepare_pack_map TensorPackMap used by the prepare method
+ * @param[out] run_pack_map TensorPackMap used by the run method
+ * @param[in] workload ClWorkload to bind the tensors to
+ * @param[in] op_tensors CLTensor memory objects mapped from Core OpTensors
+ *
+ * @return Status
+ */
+Status bind_tensors(ClAuxTensorData &aux_tensor_data, TensorPackMap &prepare_pack_map, TensorPackMap &run_pack_map, const ClWorkload &workload, const OpTensorBinding &op_tensors);
+
+/** Operator runtime to run a @ref ClWorkload
+ *
+ * @note The user must explicitly call prepare before run, otherwise run will fail.
+ *
+ */
+class ClCompositeOperator
+{
+public:
+ ClCompositeOperator();
+ ~ClCompositeOperator();
+ /** Configures a @ref ClCompositeOperator with a @ref ClWorkload
+ * This includes the compilation of Cl kernels inside the @ref ClWorkload
+ *
+ * @param[in] ctx CLCompileContext
+ * @param[in] workload ClWorkload to configure with
+ */
+ void configure(const CLCompileContext &ctx, const ClWorkload &workload);
+ /** Validate ClWorkload @p workload
+ *
+ * @param[in] workload ClWorkload to be validated
+ *
+ * @return Status
+ */
+ static Status validate(const ClWorkload &workload);
+ /** Enqueue prepare workloads
+ *
+ * @param tensor_pack_map Tensors required by the prepare workloads
+ */
+ void prepare(TensorPackMap &tensor_pack_map);
+ /** Enqueue run workloads
+ *
+ * @param tensor_pack_map Tensors required by the run workloads
+ */
+ void run(TensorPackMap &tensor_pack_map);
+
+private:
+ struct Implementation;
+ std::unique_ptr<Implementation> _impl;
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMIC_FUSION_CLCOMPOSITEOPERATOR_H \ No newline at end of file
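A sketch of the runtime side, assuming `workload` was built from the graph described earlier, t_input, t_weight, t_bias, t_addend and t_dst are already-allocated CLTensors, and in/wei/bia/add/dst are the OpTensor handles returned by add_tensor:

    CLScheduler::get().default_init();

    const Status st = ClCompositeOperator::validate(workload);
    ClCompositeOperator op;
    op.configure(CLKernelLibrary::get().get_compile_context(), workload);

    // Map OpTensor handles to real tensor memory (intermediate tensors are not bound;
    // they become auxiliary tensors), then derive the prepare/run tensor packs
    OpTensorBinding bindings{ { in, &t_input }, { wei, &t_weight }, { bia, &t_bias },
                              { add, &t_addend }, { dst, &t_dst } };
    ClAuxTensorData aux;
    TensorPackMap prepare_packs, run_packs;
    bind_tensors(aux, prepare_packs, run_packs, workload, bindings);

    // Allocate whatever auxiliary memory the workload requested
    for(auto &data : aux.get_tensors())
    {
        data.tensor->allocator()->init(data.tensor_info);
        data.tensor->allocator()->allocate();
    }

    op.prepare(prepare_packs); // run-once kernels
    op.run(run_packs);         // per-inference kernels
    ARM_COMPUTE_UNUSED(st);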
diff --git a/docs/DoxygenLayout.xml b/docs/DoxygenLayout.xml
index 69bdaf5c71..2d59dbe56c 100644
--- a/docs/DoxygenLayout.xml
+++ b/docs/DoxygenLayout.xml
@@ -19,7 +19,7 @@
<tab type="user" url="@ref adding_operator" title="How to Add a New Operator"/>
<tab type="user" url="@ref implementation_topic" title="Implementation Topics"/>
</tab>
- <tab type="pages" visible="no" title="" intro=""/>
+ <tab type="pages" visible="yes" title="Pages" intro=""/>
<tab type="modules" visible="yes" title="" intro=""/>
<tab type="namespaces" visible="yes" title="">
<tab type="namespacelist" visible="yes" title="" intro=""/>
diff --git a/examples/SConscript b/examples/SConscript
index 8ee688e76d..d456b7246c 100644
--- a/examples/SConscript
+++ b/examples/SConscript
@@ -1,4 +1,4 @@
-# Copyright (c) 2017 Arm Limited.
+# Copyright (c) 2017-2022 Arm Limited.
#
# SPDX-License-Identifier: MIT
#
@@ -95,6 +95,15 @@ if env['opencl']:
prog = install_bin(prog)
alias = examples_env.Alias(example, prog)
Default(alias)
+ if env['experimental_dynamic_fusion']:
+ examples_env.Append(CPPDEFINES = ['ARM_COMPUTE_CL', 'ENABLE_EXPERIMENTAL_DYNAMIC_FUSION'])
+ for file in Glob("./dynamic_fusion/*.cpp"):
+ example = os.path.basename(os.path.splitext(str(file))[0])
+ prog = examples_env.Program(example, ["./dynamic_fusion/{}.cpp".format(example), utils], LIBS = examples_libs + arm_compute_libs)
+ Depends(prog, arm_compute_dependency)
+ prog = install_bin(prog)
+ alias = examples_env.Alias(example, prog)
+ Default(alias)
if env['gemm_tuner'] and env['opencl']:
gemm_tuner_common_options = examples_env.Object("./gemm_tuner/CommonGemmExampleOptions.cpp")
diff --git a/examples/dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp b/examples/dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp
new file mode 100644
index 0000000000..6048024d30
--- /dev/null
+++ b/examples/dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp
@@ -0,0 +1,386 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/// @example dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp
+/// @copybrief example_dynamic_fusion_cl_conv2d_elementwise_add
+///
+/// @page example_dynamic_fusion_cl_conv2d_elementwise_add Dynamic Fusion Example: Conv2d + Elementwise Addition (OpenCL target)
+/// This example demonstrates how to fuse a Conv2d with an Addition using the new OperatorGraph API, and to run it with the Async Composite Operator
+
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
+#ifndef ARM_COMPUTE_CL /* Needed by Utils.cpp to handle OpenCL exceptions properly */
+#error "This example needs to be built with -DARM_COMPUTE_CL"
+#endif /* ARM_COMPUTE_CL */
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/experimental/ClWorkload.h"
+#include "arm_compute/core/experimental/OperatorGraph.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTuner.h"
+#include "arm_compute/runtime/experimental/ClCompositeOperator.h"
+
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "utils/TypePrinter.h"
+
+#include "utils/Utils.h"
+
+#include <cstdlib>
+
+using namespace arm_compute;
+using namespace utils;
+using namespace arm_compute::experimental::dynamic_fusion;
+
+#define TICK(clock_name) \
+ auto clock_name##_tick = std::chrono::high_resolution_clock::now();
+#define TOCK(clock_name, measurement_map) \
+ auto clock_name##_tock = std::chrono::high_resolution_clock::now(); \
+ measurement_map["\"" #clock_name "\""] = duration_cast<microseconds>(clock_name##_tock - clock_name##_tick);
+#define TOCK_AVG(clock_name, measurement_map, num_iterations) \
+ auto clock_name##_tock = std::chrono::high_resolution_clock::now(); \
+ measurement_map["\"" #clock_name "\""] = duration_cast<microseconds>((clock_name##_tock - clock_name##_tick) / (num_iterations));
+
+using std::chrono::duration_cast;
+using std::chrono::microseconds;
+
+class ClFusedConv2dEltwiseAddExample : public Example
+{
+public:
+ bool do_setup(int argc, char **argv) override
+ {
+ size_t ih;
+ size_t iw;
+ size_t ifm;
+ size_t wh;
+ size_t ww;
+ size_t ofm;
+ size_t tuner_choice;
+ unsigned int pad_x;
+ unsigned int pad_y;
+ if(argc < 10)
+ {
+ // Print help
+ std::cout << "Usage: ./cl_fused_conv2d_elementwise_add ih iw ifm wh ww ofm tuner_choice(0=Disable, 1=Rapid, 2=Normal, 3=Exhaustive) pad_x pad_y\n";
+            std::cout << "Too few or no arguments provided. Using shape config = SRGAN_0, tuner_choice=2\n\n";
+ ih = 512;
+ iw = 512;
+ ifm = 64;
+ wh = 1;
+ ww = 1;
+ ofm = 3;
+ tuner_choice = 2;
+ pad_x = 0;
+ pad_y = 0;
+ }
+ else
+ {
+ ih = strtol(argv[1], nullptr, 10);
+ iw = strtol(argv[2], nullptr, 10);
+ ifm = strtol(argv[3], nullptr, 10);
+ wh = strtol(argv[4], nullptr, 10);
+ ww = strtol(argv[5], nullptr, 10);
+ ofm = strtol(argv[6], nullptr, 10);
+ tuner_choice = strtol(argv[7], nullptr, 10);
+ pad_x = strtol(argv[8], nullptr, 10);
+ pad_y = strtol(argv[9], nullptr, 10);
+ }
+
+ CLTuner *tuner_to_use;
+ switch(tuner_choice)
+ {
+ case 0:
+ {
+ tuner_to_use = nullptr;
+ break;
+ }
+ case 1:
+ {
+ tuner.set_tuner_mode(CLTunerMode::RAPID);
+ tuner_to_use = &tuner;
+ break;
+ }
+ case 3:
+ {
+ tuner.set_tuner_mode(CLTunerMode::EXHAUSTIVE);
+ tuner_to_use = &tuner;
+ break;
+ }
+ case 2:
+ default:
+ {
+ tuner.set_tuner_mode(CLTunerMode::NORMAL);
+ tuner_to_use = &tuner;
+ break;
+ }
+ }
+ CLScheduler::get().default_init(tuner_to_use);
+
+ TICK(startup_time);
+ /* Computation:
+ * out = add_desc(addend, conv2d1x1(direct_conv)(input, weights, bias))
+ */
+ const auto data_type = DataType::F32;
+ const auto data_layout = DataLayout::NHWC;
+
+ const auto t_input_shape = TensorShape(ifm, iw, ih);
+ const auto t_weight_shape = TensorShape(ifm, ww, wh, ofm);
+ const auto t_bias_shape = TensorShape(ofm);
+ const auto t_l1_addend_shape = TensorShape(ofm, iw);
+
+ std::cout << "input_shape: " << t_input_shape << std::endl;
+ std::cout << "weight_shape: " << t_weight_shape << std::endl;
+ std::cout << "bias_shape: " << t_bias_shape << std::endl;
+ std::cout << "addend_shape: " << t_l1_addend_shape << std::endl;
+
+ /// @page example_dynamic_fusion_cl_conv2d_elementwise_add
+ /// @section describe_workload_using_operator_graph Describe the workload to run using OperatorGraph
+ /// OperatorGraph is a graph of Tensors and Operators. Let's first default-construct it
+ /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Construct OperatorGraph
+ // [Construct OperatorGraph]
+ OperatorGraph op_graph;
+ // [Construct OperatorGraph]
+
+ /// @page example_dynamic_fusion_cl_conv2d_elementwise_add
+ /// @subsection add_conv2d Add the first operator (root operator) Conv2d
+ /// The first operator to be added to the graph is called the "root operator" of the entire graph.
+ /// @note As of now, operators need to be inserted according to their dependency order. This is because output tensor auto-initialization occurs during construction time.
+ /// Later this might be changed to allow out-of-order insertion.
+
+ /// Before we insert the operator, we need to initialize the required TensorInfo objects.
+        /// We can choose not to initialize an output TensorInfo; if so, it will be auto-initialized during the construction of the OperatorGraph.
+        /// The "t_acc_info" is the TensorInfo of the accumulator tensor, which is the output tensor of our first operator conv2d.
+ /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Initialize Conv2d TensorInfo
+ // [Initialize Conv2d TensorInfo]
+ auto t_input_info = TensorInfo(t_input_shape, 1, data_type, data_layout);
+ auto t_weight_info = TensorInfo(t_weight_shape, 1, data_type, data_layout);
+ auto t_bias_info = TensorInfo(t_bias_shape, 1, data_type, data_layout);
+ auto t_acc_info = TensorInfo();
+ // [Initialize Conv2d TensorInfo]
+
+ /// @page example_dynamic_fusion_cl_conv2d_elementwise_add
+ /// Next we associate the TensorInfo with the OpTensor s created in the op_graph.
+ /// @note The associated TensorInfo objects must be in scope and remain valid until the ClWorkload building is completed
+
+        /// @note The associated TensorInfo objects must be declared as non-const, since they may be updated during the OperatorGraph construction
+
+ /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Add OpTensors
+ // [Add OpTensors]
+ const auto op_t_input = add_tensor(op_graph, t_input_info);
+ const auto op_t_weight = add_tensor(op_graph, t_weight_info);
+ const auto op_t_bias = add_tensor(op_graph, t_bias_info);
+ const auto op_t_acc = add_tensor(op_graph, t_acc_info);
+ // [Add OpTensors]
+
+ /// @page example_dynamic_fusion_cl_conv2d_elementwise_add
+ /// Finally we add the Conv2d operator to op_graph. The Conv2dDescriptor contains all the TOSA-compliant attribute parameters
+        /// The add_op... family of functions accepts the OpTensors created by add_tensor and returns an Operator handle.
+        /// This handle can be used to further query and modify the operator inside the OperatorGraph after its creation.
+        /// For example, here we use the handle to force the ConvolutionMethod to be Direct Convolution.
+        /// @note force_conv2d_method is intended for debugging purposes only for now, as the end user is not expected to decide on the ConvolutionMethod
+
+ /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Add Conv2d Operator
+ // [Add Conv2d Operator]
+ Conv2dDescriptor conv2d_desc{ Padding2D{ pad_x, pad_x, pad_y, pad_y } };
+ auto conv2d = add_op_conv2d(op_graph, conv2d_desc, op_t_input, op_t_weight, op_t_bias, op_t_acc);
+ force_conv2d_method(op_graph, conv2d, ConvolutionMethod::DIRECT); // Only for debug purposes
+ // [Add Conv2d Operator]
+
+ /// @page example_dynamic_fusion_cl_conv2d_elementwise_add
+ /// @subsection add_elementwise_add Add the second operator Elementwise Add
+ /// This is similar to adding the first operator to op_graph, except that we link the two operators together by their common tensor,
+ /// namely the accumulator tensor op_t_acc, which is the output of conv2d and the input (lhs) of the addition
+ /// @note At the moment, it is recommended to always declare a separate TensorInfo (even if empty) for each OpTensor.
+        /// For example, op_t_dst here could have been associated with the same TensorInfo as op_t_acc, since both are empty and auto-initialized,
+ /// but we still recommend creating a separate object.
+
+ /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Add Elementwise Add Operator
+ // [Add Elementwise Add Operator]
+ auto t_l1_addend_info = TensorInfo(t_l1_addend_shape, 1, data_type, data_layout);
+ auto t_dst_info = TensorInfo();
+ const auto op_t_l1_addend = add_tensor(op_graph, t_l1_addend_info);
+ const auto op_t_dst = add_tensor(op_graph, t_dst_info);
+ AddDescriptor add_desc{};
+ add_op_elementwise_add(op_graph, add_desc, op_t_acc, op_t_l1_addend, op_t_dst);
+ // [Add Elementwise Add Operator]
+
+ /// @page example_dynamic_fusion_cl_conv2d_elementwise_add
+ /// @section build_clworkload Build ClWorkload
+        /// ClWorkload is an intermediate object which contains all the built kernel code along with the descriptors describing how to schedule it
+ /// We build ClWorkload from the op_graph object that we just described
+ /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Build ClWorkload
+ // [Build ClWorkload]
+ const ClWorkloadContext workload_ctx
+ {
+ GpuInfo{ CLScheduler::get().target() }
+ };
+ ClWorkload workload;
+ build(workload, op_graph, workload_ctx);
+ // [Build ClWorkload]
+
+ /// @page example_dynamic_fusion_cl_conv2d_elementwise_add
+ /// @section run_fused_op_with_clcompositeoperator Run the fused operator workload with ClCompositeOperator
+ /// @subsection configure_and_validate_clcompositeoperator Validate ClWorkload and Configure ClCompositeOperator
+        /// After the ClWorkload is built, we configure the Compute Library runtime ClCompositeOperator with it in order to run it.
+        /// Optionally, we can explicitly validate the workload to check whether it has been built successfully.
+        /// Validation is run automatically inside configure and will throw if it fails.
+ /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Construct ClCompositeOperator
+ /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Validate and configure ClCompositeOperator
+ // [Validate and configure ClCompositeOperator]
+ const auto success = ClCompositeOperator::validate(workload); // Optional
+ op.configure(CLKernelLibrary::get().get_compile_context(), workload);
+ // [Validate and configure ClCompositeOperator]
+
+ /// @page example_dynamic_fusion_cl_conv2d_elementwise_add
+ /// @subsection run_clcompositeoperator Run ClCompositeOperator
+ /// Construct the runtime CLTensor s with backing memory
+ /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Construct CLTensor objects
+
+ /// Initialize, allocate and fill the CLTensor objects
+ /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Initialize, Allocate and Fill CLTensor objects
+ // [Initialize, Allocate and Fill CLTensor objects]
+ t_input.allocator()->init(t_input_info);
+ t_weight.allocator()->init(t_weight_info);
+ t_bias.allocator()->init(t_bias_info);
+        t_l1_addend.allocator()->init(t_l1_addend_info);
+ t_dst.allocator()->init(t_dst_info);
+
+ t_input.allocator()->allocate();
+ t_weight.allocator()->allocate();
+ t_bias.allocator()->allocate();
+ t_l1_addend.allocator()->allocate();
+ t_dst.allocator()->allocate();
+
+ fill_random_tensor(t_input, -1.f, 1.f);
+ fill_random_tensor(t_weight, -1.f, 1.f);
+ fill_random_tensor(t_l1_addend, -1.f, 1.f);
+ // [Initialize, Allocate and Fill CLTensor objects]
+
+ /// @page example_dynamic_fusion_cl_conv2d_elementwise_add
+        /// The OpTensorBinding creates a mapping from the OpTensor handles that we created earlier to the real CLTensors
+ /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Create OpTensorBinding
+ // [Create OpTensorBinding]
+ OpTensorBinding op_tensors({ { op_t_input, &t_input },
+ { op_t_weight, &t_weight },
+ { op_t_bias, &t_bias },
+ { op_t_l1_addend, &t_l1_addend },
+ { op_t_dst, &t_dst }
+ });
+ // [Create OpTensorBinding]
+
+ /// @page example_dynamic_fusion_cl_conv2d_elementwise_add
+        /// Bind the CLTensor objects to the prepare_pack_map and run_pack_map, which are used to prepare and run the op.
+        /// This step additionally creates any required auxiliary CLTensor objects and contains them inside a ClAuxTensorData aux_tensor_data.
+        /// @note This step associates all the CLTensors contained in op_tensors and aux_tensor_data with prepare_pack_map and run_pack_map.
+        /// Make sure these CLTensors remain valid for as long as the two pack_maps are in use
+
+ /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Construct ClAuxTensorData
+ /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Construct TensorPackMaps
+ /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Bind Tensors
+ // [Bind Tensors]
+ bind_tensors(aux_tensor_data, prepare_pack_map, run_pack_map, workload, op_tensors);
+ // [Bind Tensors]
+
+ /// @page example_dynamic_fusion_cl_conv2d_elementwise_add
+ /// Initialize and Allocate Auxiliary CLTensor objects.
+ /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Initialize and Allocate Auxiliary CLTensor objects
+ // [Initialize and Allocate Auxiliary CLTensor objects]
+ for(auto tensor_data : aux_tensor_data.get_tensors())
+ {
+ tensor_data.tensor->allocator()->init(tensor_data.tensor_info);
+ tensor_data.tensor->allocator()->allocate();
+ }
+ // [Initialize and Allocate Auxiliary CLTensor objects]
+
+ /// @page example_dynamic_fusion_cl_conv2d_elementwise_add
+        /// Run the ClCompositeOperator prepare job. This performs any jobs that are required for the first run, such as
+        /// reshaping tensors into a more performant format.
+ /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Prepare ClCompositeOperator
+ // [Prepare ClCompositeOperator]
+ op.prepare(prepare_pack_map);
+ // [Prepare ClCompositeOperator]
+
+ /// @page example_dynamic_fusion_cl_conv2d_elementwise_add
+        /// Finally, we run the operator
+ /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Run ClCompositeOperator
+ // [Run ClCompositeOperator]
+ op.run(run_pack_map);
+ // [Run ClCompositeOperator]
+ TOCK(startup_time, measurements);
+ return true;
+ }
+ void do_run() override
+ {
+ // Run the fused op
+ op.run(run_pack_map);
+
+ // Make sure all the OpenCL jobs are done executing:
+ CLScheduler::get().sync();
+ }
+
+ void do_teardown() override
+ {
+ for(auto m : measurements)
+ {
+ std::cout << m.first << ": " << m.second.count() << "us" << std::endl;
+ }
+ }
+
+private:
+ // [Construct CLTensor objects]
+ CLTensor t_input{};
+ CLTensor t_weight{};
+ CLTensor t_bias{};
+ CLTensor t_l1_addend{};
+ CLTensor t_dst{};
+ // [Construct CLTensor objects]
+ // [Construct ClAuxTensorData]
+ ClAuxTensorData aux_tensor_data{};
+ // [Construct ClAuxTensorData]
+ // [Construct TensorPackMaps]
+ TensorPackMap prepare_pack_map{};
+ TensorPackMap run_pack_map{};
+ // [Construct TensorPackMaps]
+ // [Construct ClCompositeOperator]
+ ClCompositeOperator op{};
+ // [Construct ClCompositeOperator]
+ CLTuner tuner{};
+ std::map<std::string, std::chrono::microseconds> measurements{};
+};
+
+/** Main program for the fused conv2d + elementwise add example
+ *
+ * @param[in] argc Number of arguments
+ * @param[in] argv Arguments ( [optional] ih, iw, ifm, wh, ww, ofm, tuner_choice, pad_x, pad_y )
+ */
+int main(int argc, char **argv)
+{
+ return utils::run_example<ClFusedConv2dEltwiseAddExample>(argc, argv);
+}
+
+#undef TICK
+#undef TOCK
+#undef TOCK_AVG \ No newline at end of file
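For readers skimming the patch, the walkthrough above can be condensed into a rough sketch of the intended call sequence. The sketch is not part of the patch itself; it assumes the same headers, namespaces, descriptors and TensorInfo/CLTensor objects as cl_fused_conv2d_elementwise_add.cpp above, and it omits tuning, tensor allocation (including the auxiliary tensors created by bind_tensors) and error handling.

    // Condensed sketch of the fused Conv2d + elementwise add flow (assumptions as stated above)
    OperatorGraph op_graph;
    const auto op_t_input     = add_tensor(op_graph, t_input_info);
    const auto op_t_weight    = add_tensor(op_graph, t_weight_info);
    const auto op_t_bias      = add_tensor(op_graph, t_bias_info);
    const auto op_t_acc       = add_tensor(op_graph, t_acc_info); // conv2d output / add lhs
    const auto op_t_l1_addend = add_tensor(op_graph, t_l1_addend_info);
    const auto op_t_dst       = add_tensor(op_graph, t_dst_info);

    // Describe the two operators; the Operator handle returned by add_op_conv2d can be
    // passed to force_conv2d_method, as the full example does for debugging
    add_op_conv2d(op_graph, Conv2dDescriptor{ Padding2D{ pad_x, pad_x, pad_y, pad_y } }, op_t_input, op_t_weight, op_t_bias, op_t_acc);
    add_op_elementwise_add(op_graph, AddDescriptor{}, op_t_acc, op_t_l1_addend, op_t_dst);

    // Build the ClWorkload and configure the runtime ClCompositeOperator with it
    const ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } };
    ClWorkload workload;
    build(workload, op_graph, workload_ctx);

    ClCompositeOperator op;
    op.configure(CLKernelLibrary::get().get_compile_context(), workload);

    // Bind the real CLTensors, then prepare and run
    ClAuxTensorData aux_tensor_data{};
    TensorPackMap   prepare_pack_map{};
    TensorPackMap   run_pack_map{};
    OpTensorBinding op_tensors({ { op_t_input, &t_input },
                                 { op_t_weight, &t_weight },
                                 { op_t_bias, &t_bias },
                                 { op_t_l1_addend, &t_l1_addend },
                                 { op_t_dst, &t_dst } });
    bind_tensors(aux_tensor_data, prepare_pack_map, run_pack_map, workload, op_tensors);
    op.prepare(prepare_pack_map);
    op.run(run_pack_map);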
diff --git a/examples/dynamic_fusion/cl_ref_conv2d_elementwise_add.cpp b/examples/dynamic_fusion/cl_ref_conv2d_elementwise_add.cpp
new file mode 100644
index 0000000000..4f68372b49
--- /dev/null
+++ b/examples/dynamic_fusion/cl_ref_conv2d_elementwise_add.cpp
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL /* Needed by Utils.cpp to handle OpenCL exceptions properly */
+#error "This example needs to be built with -DARM_COMPUTE_CL"
+#endif /* ARM_COMPUTE_CL */
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTuner.h"
+#include "arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h"
+#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"
+
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "utils/TypePrinter.h"
+#include "utils/Utils.h"
+
+#include <cstdlib>
+
+using namespace arm_compute;
+using namespace utils;
+
+#define TICK(clock_name) \
+ auto clock_name##_tick = std::chrono::high_resolution_clock::now();
+#define TOCK(clock_name, measurement_map) \
+ auto clock_name##_tock = std::chrono::high_resolution_clock::now(); \
+ measurement_map["\"" #clock_name "\""] = duration_cast<microseconds>(clock_name##_tock - clock_name##_tick);
+#define TOCK_AVG(clock_name, measurement_map, num_iterations) \
+ auto clock_name##_tock = std::chrono::high_resolution_clock::now(); \
+ measurement_map["\"" #clock_name "\""] = duration_cast<microseconds>((clock_name##_tock - clock_name##_tick) / (num_iterations));
+
+using std::chrono::duration_cast;
+using std::chrono::microseconds;
+class ClRefConv2dEltwiseAddExample : public Example
+{
+public:
+ bool do_setup(int argc, char **argv) override
+ {
+ size_t ih;
+ size_t iw;
+ size_t ifm;
+ size_t wh;
+ size_t ww;
+ size_t ofm;
+ size_t tuner_choice;
+ unsigned int pad_x;
+ unsigned int pad_y;
+ if(argc < 10)
+ {
+ // Print help
+            std::cout << "Usage: ./cl_ref_conv2d_elementwise_add ih iw ifm wh ww ofm tuner_choice(0=Disable, 1=Rapid, 2=Normal, 3=Exhaustive) pad_x pad_y\n";
+            std::cout << "Too few or no arguments provided. Using shape config = SRGAN_0, tuner_choice=2\n\n";
+ ih = 512;
+ iw = 512;
+ ifm = 64;
+ wh = 1;
+ ww = 1;
+ ofm = 3;
+ tuner_choice = 2;
+ pad_x = 0;
+ pad_y = 0;
+ }
+ else
+ {
+ ih = strtol(argv[1], nullptr, 10);
+ iw = strtol(argv[2], nullptr, 10);
+ ifm = strtol(argv[3], nullptr, 10);
+ wh = strtol(argv[4], nullptr, 10);
+ ww = strtol(argv[5], nullptr, 10);
+ ofm = strtol(argv[6], nullptr, 10);
+ tuner_choice = strtol(argv[7], nullptr, 10);
+ pad_x = strtol(argv[8], nullptr, 10);
+ pad_y = strtol(argv[9], nullptr, 10);
+ }
+
+ CLTuner *tuner_to_use;
+ switch(tuner_choice)
+ {
+ case 0:
+ {
+ tuner_to_use = nullptr;
+ break;
+ }
+ case 1:
+ {
+ tuner.set_tuner_mode(CLTunerMode::RAPID);
+ tuner_to_use = &tuner;
+ break;
+ }
+ case 3:
+ {
+ tuner.set_tuner_mode(CLTunerMode::EXHAUSTIVE);
+ tuner_to_use = &tuner;
+ break;
+ }
+ case 2:
+ default:
+ {
+ tuner.set_tuner_mode(CLTunerMode::NORMAL);
+ tuner_to_use = &tuner;
+ break;
+ }
+ }
+
+ CLScheduler::get().default_init(tuner_to_use);
+
+ TICK(startup_time);
+
+ /* Computation:
+ * out = add_desc(addend, conv2d1x1(direct_conv)(input, weights, bias))
+ */
+ const auto data_type = DataType::F32;
+ const auto data_layout = DataLayout::NHWC;
+ const PadStrideInfo conv_info{ 1, 1, pad_x, pad_y };
+ // const auto t_input_shape = TensorShape(384, 12, 12);
+ // const auto t_weight_shape = TensorShape(384, 1, 1, 64);
+ // const auto t_dst_shape = TensorShape(64, 12, 12);
+ const auto t_input_shape = TensorShape(ifm, iw, ih);
+ const auto t_weight_shape = TensorShape(ifm, ww, wh, ofm);
+ const auto t_dst_shape = misc::shape_calculator::compute_deep_convolution_shape(t_input_shape, data_layout, t_weight_shape, conv_info);
+ std::cout << "input_shape: " << t_input_shape << std::endl;
+ std::cout << "weight_shape: " << t_weight_shape << std::endl;
+ std::cout << "dst_shape: " << t_dst_shape << std::endl;
+ auto t_input_info = TensorInfo(t_input_shape, 1, data_type, data_layout);
+ auto t_weight_info = TensorInfo(t_weight_shape, 1, data_type, data_layout);
+ auto t_l0_dst_info = TensorInfo(t_dst_shape, 1, data_type, data_layout); // Intermediate tensor for cond3
+ auto t_l1_addend_info = TensorInfo(t_dst_shape, 1, data_type, data_layout);
+ auto t_dst_info = TensorInfo(t_dst_shape, 1, data_type, data_layout);
+
+ // Init tensors
+ {
+ t_input.allocator()->init(t_input_info);
+ t_weight.allocator()->init(t_weight_info);
+ t_l1_addend.allocator()->init(t_dst_info);
+ t_l0_dst.allocator()->init(t_l0_dst_info);
+ t_dst.allocator()->init(t_dst_info);
+ }
+
+ op0.configure(&t_input, &t_weight, nullptr, &t_l0_dst, conv_info);
+ op1.configure(&t_l0_dst, &t_l1_addend, &t_dst, ConvertPolicy{});
+
+ // Construct tensors
+ // Allocate and fill tensors
+ {
+ t_input.allocator()->allocate();
+ t_weight.allocator()->allocate();
+ t_l1_addend.allocator()->allocate();
+ t_l0_dst.allocator()->allocate();
+ t_dst.allocator()->allocate();
+ fill_random_tensor(t_input, -1.f, 1.f);
+ fill_random_tensor(t_weight, -1.f, 1.f);
+ fill_random_tensor(t_l1_addend, -1.f, 1.f);
+ }
+ // Dummy run for CLTuner
+ op0.run();
+ op1.run();
+ TOCK(startup_time, measurements);
+ return true;
+ }
+ void do_run() override
+ {
+        // Run the reference (unfused) operators
+ op0.run();
+ op1.run();
+
+ // Make sure all the OpenCL jobs are done executing:
+ CLScheduler::get().sync();
+ }
+
+ void do_teardown() override
+ {
+ for(auto m : measurements)
+ {
+ std::cout << m.first << ": " << m.second.count() << "us" << std::endl;
+ }
+ }
+
+private:
+ CLTensor t_input{};
+ CLTensor t_weight{};
+ CLTensor t_l1_addend{};
+ CLTensor t_l0_dst{};
+ CLTensor t_dst{};
+ CLDirectConvolutionLayer op0{};
+ CLArithmeticAddition op1{};
+ CLTuner tuner{};
+ std::map<std::string, std::chrono::microseconds> measurements{};
+};
+
+/** Main program for the reference (unfused) conv2d + elementwise add example
+ *
+ * @param[in] argc Number of arguments
+ * @param[in] argv Arguments ( [optional] ih, iw, ifm, wh, ww, ofm, tuner_choice, pad_x, pad_y )
+ */
+int main(int argc, char **argv)
+{
+ return utils::run_example<ClRefConv2dEltwiseAddExample>(argc, argv);
+}
+
+#undef TICK
+#undef TOCK
+#undef TOCK_AVG \ No newline at end of file
diff --git a/filelist.json b/filelist.json
index 93dfdfff6e..dc4be05f58 100644
--- a/filelist.json
+++ b/filelist.json
@@ -2074,10 +2074,17 @@
"dynamic_fusion": [
"src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp",
"src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp",
- "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp",
"src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp",
"src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp",
- "src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.cpp"
+ "src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.cpp",
+
+ "src/core/experimental/dynamic_fusion/OperatorGraph.cpp",
+ "src/core/experimental/dynamic_fusion/WorkloadImpl/ClWorkload.cpp",
+ "src/gpu/cl/operators/experimental/dynamic_fusion/ClCompositeOperator.cpp",
+ "src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.cpp",
+ "src/core/experimental/dynamic_fusion/WorkloadImpl/DependencyGraph.cpp",
+ "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.cpp",
+ "src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.cpp"
]
}
}
diff --git a/src/core/CL/ICLKernel.h b/src/core/CL/ICLKernel.h
index 046679e34e..d52b105507 100644
--- a/src/core/CL/ICLKernel.h
+++ b/src/core/CL/ICLKernel.h
@@ -349,7 +349,7 @@ public:
#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
/// The execution is carried out through run_op method. But the run_op method needs to be extended to include ClExecutionDescriptor as now LWS GWS tuning will be separated from the IKernel
- virtual void run_composite_op(experimental::dynamic_fusion::TensorBinding &tensors, const Window &window, cl::CommandQueue &queue, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc)
+ virtual void run_composite_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc)
{
ARM_COMPUTE_UNUSED(tensors, window, queue, exec_desc);
}
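The new hook keeps the same role as before but swaps the dynamic-fusion-specific TensorBinding for the library-wide ITensorPack. A hypothetical caller-side sketch (not taken from this patch) is shown below; `composite_kernel`, `input_arg_id` and `output_arg_id` are placeholders, and it assumes the relocated ClExecutionDescriptor keeps the defaulted suggested_lws/gws/skip_sliding_window members of its previous definition.

    // Hypothetical usage sketch only; the kernel instance and argument ids are placeholders
    ITensorPack tensors{};
    tensors.add_tensor(input_arg_id, &t_input);  // ids are expected to match the built kernel's argument list
    tensors.add_tensor(output_arg_id, &t_dst);

    experimental::dynamic_fusion::ClExecutionDescriptor exec_desc{}; // launch configuration left at its defaults
    composite_kernel.run_composite_op(tensors, composite_kernel.window(), CLScheduler::get().queue(), exec_desc);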
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp
index 3e9ed060be..3d49dde5c8 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp
@@ -21,7 +21,9 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
#include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h"
#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h"
@@ -49,69 +51,46 @@ const ClKernelBlueprint::Implementation &ClKernelBlueprint::impl() const
return *_impl;
}
-Status add_tensor_argument(ClKernelBlueprint &kernel_blueprint, const ClTensorDescriptor &tensor_desc, ArgumentID &id)
+Status add_tensor(ClKernelBlueprint &kernel_blueprint, ITensorInfo *tensor_info, ArgumentID &id, ArgumentID merge_point)
{
- id = kernel_blueprint.impl().add_kernel_argument(tensor_desc);
+ id = kernel_blueprint.impl().add_kernel_tensor(tensor_info, merge_point);
return Status{};
}
-Status add_tensor_intermed(ClKernelBlueprint &kernel_blueprint, ArgumentID &id)
-{
- id = kernel_blueprint.impl().add_intermediate_tensor();
- return Status{};
-}
-
-Status add_kcomp_gemm_native(ClKernelBlueprint &kernel_blueprint, const ClKernelComponentDescriptor &,
- const GemmNativeDescriptor &gemm_native_desc,
- ArgumentID lhs_id, ArgumentID rhs_id, ArgumentID bias_id, ArgumentID &dst_id)
-{
- kernel_blueprint.impl().validate_arg_ids({ lhs_id, rhs_id, bias_id, dst_id });
- kernel_blueprint.impl().add_component(
- std::make_unique<ClGemmNativeKernelComponent>(
- &kernel_blueprint,
- gemm_native_desc,
- SharedVarLink{ lhs_id, SharedVarIO::Input, kernel_blueprint.impl().group(lhs_id) },
- SharedVarLink{ rhs_id, SharedVarIO::Input, kernel_blueprint.impl().group(rhs_id) },
- SharedVarLink{ dst_id, SharedVarIO::Output, kernel_blueprint.impl().group(dst_id) },
- SharedVarLink{ bias_id, SharedVarIO::Input, kernel_blueprint.impl().group(bias_id) }));
-
- return Status{};
-}
-
-Status add_kcomp_eltwise_add(ClKernelBlueprint &kernel_blueprint, const ClKernelComponentDescriptor &, const EltwiseAddDescriptor &,
+Status add_kcomp_eltwise_add(ClKernelBlueprint &kernel_blueprint, const ClEltwiseAddKernelDescriptor &,
ArgumentID src0_id, ArgumentID src1_id, ArgumentID &dst_id)
{
kernel_blueprint.impl().add_component(
std::make_unique<ClElementwiseAddKernelComponent>(
&kernel_blueprint,
- SharedVarLink{ src0_id, SharedVarIO::Input, kernel_blueprint.impl().group(src0_id) },
- SharedVarLink{ src1_id, SharedVarIO::Input, kernel_blueprint.impl().group(src1_id) },
- SharedVarLink{ dst_id, SharedVarIO::Output, kernel_blueprint.impl().group(dst_id) }));
+ SharedVarLink{ src0_id, SharedVarIO::Input },
+ SharedVarLink{ src1_id, SharedVarIO::Input },
+ SharedVarLink{ dst_id, SharedVarIO::Output }));
return Status{};
}
-Status add_kcomp_activation(ClKernelBlueprint &, const ClKernelComponentDescriptor &, const ActivationDescriptor &, ArgumentID, ArgumentID &)
+Status add_kcomp_activation(ClKernelBlueprint &, const ClActivationKernelDescriptor &, ArgumentID, ArgumentID &)
{
return Status{};
}
-Status add_kcomp_direct_conv(ClKernelBlueprint &kernel_blueprint, const ClKernelComponentDescriptor &,
- const DirectConvolutionDescriptor &direct_conv2d_desc,
- ArgumentID src_id, ArgumentID weight_id, ArgumentID bias_id, ArgumentID &dst_id)
+Status add_kcomp_direct_conv2d(ClKernelBlueprint &kernel_blueprint,
+ const ClDirectConv2dKernelDescriptor &direct_conv2d_desc,
+ ArgumentID src_id, ArgumentID weight_id, ArgumentID bias_id, ArgumentID &dst_id)
{
kernel_blueprint.impl().add_component(
std::make_unique<ClDirectConvolutionKernelComponent>(
&kernel_blueprint,
direct_conv2d_desc,
- SharedVarLink{ src_id, SharedVarIO::Input, kernel_blueprint.impl().group(src_id) },
- SharedVarLink{ weight_id, SharedVarIO::Input, kernel_blueprint.impl().group(weight_id) },
- SharedVarLink{ dst_id, SharedVarIO::Output, kernel_blueprint.impl().group(dst_id) },
- SharedVarLink{ bias_id, SharedVarIO::Input, kernel_blueprint.impl().group(bias_id) }));
+ SharedVarLink{ src_id, SharedVarIO::Input },
+ SharedVarLink{ weight_id, SharedVarIO::Input },
+ SharedVarLink{ dst_id, SharedVarIO::Output },
+ SharedVarLink{ bias_id, SharedVarIO::Input }));
return Status{};
}
-Status add_kcomp_store(ClKernelBlueprint &kernel_blueprint, const ClKernelComponentDescriptor &, ArgumentID src_tile, ArgumentID dst_tile, const StoreType &store_type)
+Status add_kcomp_store(ClKernelBlueprint &kernel_blueprint, const StoreType &store_type, ArgumentID src_tile, ArgumentID dst_tile)
{
switch(store_type)
{
@@ -119,15 +98,15 @@ Status add_kcomp_store(ClKernelBlueprint &kernel_blueprint, const ClKernelCompon
kernel_blueprint.impl().add_component(
std::make_unique<ClStoreBlockBoundaryAwareKernelComponent>(
&kernel_blueprint,
- SharedVarLink{ src_tile, SharedVarIO::Input, kernel_blueprint.impl().group(src_tile) },
- SharedVarLink{ dst_tile, SharedVarIO::Output, kernel_blueprint.impl().group(dst_tile) }));
+ SharedVarLink{ src_tile, SharedVarIO::Input },
+ SharedVarLink{ dst_tile, SharedVarIO::Output }));
break;
case StoreType::TStoreIndirectWidthSelect:
kernel_blueprint.impl().add_component(
std::make_unique<ClStoreIndirectWidthSelectKernelComponent>(
&kernel_blueprint,
- SharedVarLink{ src_tile, SharedVarIO::Input, kernel_blueprint.impl().group(src_tile) },
- SharedVarLink{ dst_tile, SharedVarIO::Output, kernel_blueprint.impl().group(dst_tile) }));
+ SharedVarLink{ src_tile, SharedVarIO::Input },
+ SharedVarLink{ dst_tile, SharedVarIO::Output }));
break;
default:
ARM_COMPUTE_ERROR("Store mode not yet supported.");
@@ -136,6 +115,11 @@ Status add_kcomp_store(ClKernelBlueprint &kernel_blueprint, const ClKernelCompon
return Status{};
}
+Status update_merge_point(ClKernelBlueprint &bp, ArgumentID t_id, ArgumentID merge_point)
+{
+ return bp.impl().update_merge_point(t_id, merge_point);
+}
+
Status set_tile_info(ClKernelBlueprint &bp, const TileDescriptor &tile_info)
{
bp.impl().set_tile_info(tile_info);
@@ -143,6 +127,7 @@ Status set_tile_info(ClKernelBlueprint &bp, const TileDescriptor &tile_info)
}
Status build(ClKernelCode &code, const ClCodeBuilderContext &, ClKernelBlueprint &kernel_blueprint)
{
+ kernel_blueprint.impl().finalize();
code.name = kernel_blueprint.impl().build_kernel_name();
code.code = kernel_blueprint.impl().build_code();
@@ -153,12 +138,14 @@ Status build(ClKernelCode &code, const ClCodeBuilderContext &, ClKernelBlueprint
return Status{};
}
+DependencyGraph get_dependency_graph(const ClKernelBlueprint &blueprint)
+{
+ return blueprint.impl().get_graph();
+}
Status tune_static(ClExecutionDescriptor &, const ClKernelCode &)
{
return Status{};
}
} // namespace dynamic_fusion
} // namespace experimental
-} // namespace arm_compute
-
-#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) \ No newline at end of file
+} // namespace arm_compute \ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h
index 23629f47bc..3dccdd7351 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h
@@ -21,13 +21,18 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
#ifndef ARM_COMPUTE_EXPERIMENTAL_CLKERNELBUILDINGAPI_H
#define ARM_COMPUTE_EXPERIMENTAL_CLKERNELBUILDINGAPI_H
#include "arm_compute/core/CL/CLCompileContext.h"
#include "arm_compute/core/Window.h"
+#include "arm_compute/core/experimental/ClWorkload.h"
+#include "arm_compute/core/experimental/DependencyGraph.h"
+#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelDescriptors.h"
namespace arm_compute
{
@@ -35,46 +40,9 @@ namespace experimental
{
namespace dynamic_fusion
{
-using ArgumentID = int32_t;
+using ArgumentID = DependencyGraph::Id;
-static constexpr ArgumentID g_arg_placeholder = -1;
-
-/** Verbose and explicit way to enumerate all the tensor arguments variants used by
- * all kernel implementations. This avoids any ambiguity in what kernel arguments are passed
- */
-enum class TensorArgType : int
-{
- Scalar,
-
- Vector,
-
- Image,
- Image_Reinterpret_As_3D,
- Image_Export_To_ClImage2D,
-
- Image_3D, // 3D Tensor represented as a 2D Image + stride_z
- Image_3D_Export_To_ClImage2D,
-
- Tensor_3D,
- Tensor_4D,
-
- Tensor_4D_t_Buffer,
- Tensor_4D_t_Image
-};
-/** Describes all the info required to add a kernel argument at run time */
-struct ClKernelArgRuntimeDescriptor
-{
- ClKernelArgRuntimeDescriptor(int arg_id, TensorArgType type, bool slide_along_dimz = true)
- : arg_id{ arg_id }, tensor_arg_type{ type }, slide_along_dimz{ slide_along_dimz }
- {
- }
- ~ClKernelArgRuntimeDescriptor() = default;
- int arg_id{ g_arg_placeholder }; // Arg ID in the blueprint
- TensorArgType tensor_arg_type{ TensorArgType::Image };
- bool slide_along_dimz{ true };
-};
-
-using ClKernelArgList = std::vector<ClKernelArgRuntimeDescriptor>;
+static constexpr ArgumentID g_arg_placeholder = DependencyGraph::empty_id();
/** Intermediate representation of the final, complete kernel source. */
class ClKernelBlueprint
@@ -93,145 +61,38 @@ public:
};
///// Kernel Components /////
-
-/** Meta information about all Cl Kernel Components */
-struct ClKernelComponentDescriptor
-{
- int32_t version{ 1 }; /**< Operator version */
-};
-
-/** Component: Tensor Argument */
-struct ClTensorDescriptor
-{
- ClTensorDescriptor(ITensorInfo *info)
- : tensor_info(info)
- {
- }
-
- ITensorInfo *tensor_info;
-};
-
-Status add_tensor_argument(ClKernelBlueprint &, const ClTensorDescriptor &, ArgumentID &);
-Status add_tensor_intermed(ClKernelBlueprint &, ArgumentID &);
-
-/** Component: Gemm Native */
-struct GemmNativeDescriptor
-{
- float alpha{};
- float beta{};
- unsigned int m{};
- unsigned int n{};
- unsigned int k{};
- unsigned int depth_output_gemm3d{};
- bool reinterpret_input_as_3d{};
- bool broadcast_bias{};
- bool fp_mixed_precision{};
- bool has_pad_y{};
- int nmult_transpose1xW_width{};
- int mult_interleave4x4_height{};
- GEMMLHSMatrixInfo lhs_info{};
- GEMMRHSMatrixInfo rhs_info{};
- int32_t a_offset{};
- int32_t b_offset{};
-};
-
-Status add_kcomp_gemm_native(ClKernelBlueprint &, const ClKernelComponentDescriptor &, const GemmNativeDescriptor &,
- ArgumentID lhs_id, ArgumentID rhs_id, ArgumentID bias_id, ArgumentID &dst_id);
-
/** Component: Eltwise Add */
-struct EltwiseAddDescriptor
-{
- ConvertPolicy convert_policy{ ConvertPolicy::SATURATE };
-};
-Status add_kcomp_eltwise_add(ClKernelBlueprint &, const ClKernelComponentDescriptor &, const EltwiseAddDescriptor &, ArgumentID src0_id,
+Status add_kcomp_eltwise_add(ClKernelBlueprint &, const ClEltwiseAddKernelDescriptor &, ArgumentID src0_id,
ArgumentID src1_id, ArgumentID &dst_id);
/** Component: Activation */
-struct ActivationDescriptor
-{
-};
-Status add_kcomp_activation(ClKernelBlueprint &, const ClKernelComponentDescriptor &, const ActivationDescriptor &, ArgumentID src_id, ArgumentID &dst_id);
+Status add_kcomp_activation(ClKernelBlueprint &, const ClActivationKernelDescriptor &, ArgumentID src_id, ArgumentID &dst_id);
/** Component: Direct Convolution **/
-struct DirectConvolutionDescriptor
-{
- PadStrideInfo pad_stride_info{};
-};
-Status add_kcomp_direct_conv(ClKernelBlueprint &, const ClKernelComponentDescriptor &, const DirectConvolutionDescriptor &,
- ArgumentID src_id, ArgumentID weight_id, ArgumentID bias_id, ArgumentID &dst_id);
-
-enum class ClippingStrategy
-{
- TOP_LEFT,
- TOP_RIGHT,
- BOTTOM_LEFT,
- BOTTOM_RIGHT,
-};
+Status add_kcomp_direct_conv2d(ClKernelBlueprint &, const ClDirectConv2dKernelDescriptor &,
+ ArgumentID src_id, ArgumentID weight_id, ArgumentID bias_id, ArgumentID &dst_id);
-/** Component: Store */
-struct TileDescriptor
-{
- Size2D tile_dims{};
- Size2D boundaries{};
- ClippingStrategy clipping{ ClippingStrategy::TOP_LEFT };
-
- TileDescriptor()
- {
- }
+Status add_kcomp_store(ClKernelBlueprint &, const StoreType &store_type, ArgumentID src_id, ArgumentID dst_id);
- TileDescriptor(Size2D dims, const Size2D &bound, const ClippingStrategy &clip)
- : tile_dims(dims), boundaries(bound), clipping(clip)
- {
- }
-
- bool empty() const
- {
- return (tile_dims.area() == 0) || (boundaries.area() == 0);
- }
-};
-
-enum class StoreType
-{
- VStore,
- VStorePartial,
- StoreRow,
- ConvertStoreRow,
- StoreBlock,
- ConvertStoreBlock,
- StoreRowPartial,
- StoreBlockPartial,
- StoreBlockBoundaryAware,
- StoreVectorSelect,
- TStoreIndirectWidthSelect
-};
-
-Status add_kcomp_store(ClKernelBlueprint &, const ClKernelComponentDescriptor &, ArgumentID src_id, ArgumentID dst_id, const StoreType &store_type);
+Status add_tensor(ClKernelBlueprint &, ITensorInfo *, ArgumentID &, ArgumentID merge_point = DependencyGraph::empty_id());
///// Kernel Components /////
///// Building /////
-/** Information required for kernel compilation. The build results of KernelBlueprint */
-struct ClKernelCode
-{
- std::string name{}; /**< Kernel name */
- std::string code{}; /**< Kernel source code */
- std::string config_id{}; /**< Generated from blueprint based on complex component */
- CLBuildOptions build_options{}; /**< Kernel build options */
- Window window{}; /**< Execution window */
- ClKernelArgList arguments{}; /**< Kernel argument specficiations */
-
- bool operator==(const ClKernelCode &other) const
- {
- return name == other.name && code == other.code && build_options == other.build_options;
- }
-};
+/** Update existing merge tensor @p merge_point to point to @p t_id
+ *
+ * @param t_id        Tensor that the merge point should now point to
+ * @param merge_point Existing merge point to be updated
+ * @return Status
+ */
+Status update_merge_point(ClKernelBlueprint &, ArgumentID t_id, ArgumentID merge_point);
-/** GPU information for building the @ref ClKernelCode */
-struct GpuInfo
-{
- GPUTarget target{ GPUTarget::UNKNOWN };
-};
+/** Get dependency graph
+ *
+ * @return DependencyGraph
+ */
+DependencyGraph get_dependency_graph(const ClKernelBlueprint &blueprint);
/** All information required for building the @ref ClKernelCode */
struct ClCodeBuilderContext
@@ -247,12 +108,6 @@ Status build(ClKernelCode &code, const ClCodeBuilderContext &, ClKernelBlueprint
///// Building /////
///// Tuning /////
-struct ClExecutionDescriptor
-{
- cl::NDRange suggested_lws{}; /**< Suggested local work-group size for optimal performance if not zero */
- cl::NDRange gws{}; /**< Global work-group to be used */
- bool skip_sliding_window{ false }; /**< Skip sliding window slices during execution loop */
-};
Status tune_static(ClExecutionDescriptor &, const ClKernelCode &);
@@ -261,6 +116,4 @@ Status tune_static(ClExecutionDescriptor &, const ClKernelCode &);
} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
-#endif //ARM_COMPUTE_EXPERIMENTAL_CLKERNELBUILDINGAPI_H
-
-#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) \ No newline at end of file
+#endif //ARM_COMPUTE_EXPERIMENTAL_CLKERNELBUILDINGAPI_H \ No newline at end of file
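To make the refactored builder interface above easier to follow, here is a rough usage sketch. It is not taken from the patch's integration tests: the descriptor objects (conv2d_kernel_desc, tile_desc), the TensorInfo objects (src_info, wei_info, ...) and the exact store wiring are placeholders, and a default-constructed ClCodeBuilderContext is assumed to be acceptable; in the library these are produced by the translation layer from the OperatorGraph.

    // Rough sketch only; descriptors, TensorInfo objects and the store wiring are illustrative
    ClKernelBlueprint bp;

    ArgumentID src_id{}, wei_id{}, bia_id{}, acc_id{}, dst_id{};
    add_tensor(bp, &src_info, src_id); // graph src -> kernel argument
    add_tensor(bp, &wei_info, wei_id);
    add_tensor(bp, &bia_info, bia_id);
    add_tensor(bp, &acc_info, acc_id); // intermediate accumulator -> automatic variable
    add_tensor(bp, &dst_info, dst_id); // graph dst -> kernel argument

    add_kcomp_direct_conv2d(bp, conv2d_kernel_desc, src_id, wei_id, bia_id, acc_id);
    add_kcomp_store(bp, StoreType::TStoreIndirectWidthSelect, acc_id, dst_id);
    set_tile_info(bp, tile_desc);

    ClKernelCode cl_code;
    build(cl_code, ClCodeBuilderContext{}, bp);              // finalizes the blueprint graph internally
    const DependencyGraph graph = get_dependency_graph(bp);  // inspect src/dst/intermediate tensors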
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h
index aa27572746..17437c285d 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h
@@ -21,7 +21,9 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMMON_H
#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMMON_H
@@ -36,6 +38,7 @@
#include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h"
+#include <iostream>
#include <queue>
#include <stack>
#include <string>
@@ -63,8 +66,8 @@ enum class SharedVarIO
enum class SharedVarGroup
{
- Argument, // Parameters to a kernel function
- Automatic // Automatic variables declared within the kernel body
+ Argument, // Parameters to a kernel function == dst or src tensors of the whole blueprint graph
+ Automatic // Automatic variables declared within the kernel body == intermediate tensors of the whole blueprint graph
};
/** Specifies a shared variable link for a component.
@@ -74,85 +77,151 @@ enum class SharedVarGroup
*/
struct SharedVarLink
{
- ArgumentID arg_id{ g_arg_placeholder };
- SharedVarIO io{ SharedVarIO::Input };
- SharedVarGroup group{ SharedVarGroup::Argument };
- bool is_empty() const
+ ArgumentID arg_id{ g_arg_placeholder };
+ SharedVarIO io{ SharedVarIO::Input };
+ bool is_empty() const
{
return arg_id == g_arg_placeholder;
}
};
/** A table of all the variables used in the kernel / blueprint
+ * Because we limit the DependencyGraph in the blueprint to a Linear Sequence for now, we only allow ** a single global variable (the accumulator) **
+ *
* NOTE: the order they appear in the table is the order of their "declaration" in the component code, and is also their ID
* NOTE: the variables all have the scope of the full kernel function
*/
class SharedVarTable
{
public:
+ /** A fully realized SharedVarLink
+ */
struct SharedVar
{
- SharedVarGroup group;
- std::string uniq_name; // Unique name, also the final variable name used in the built code
- ClKernelArgRuntimeDescriptor desc; // Automatic variables can and should still be described using this struct
+ ArgumentID arg_id{ g_arg_placeholder };
+ SharedVarIO io{ SharedVarIO::Input };
+ SharedVarGroup group{ SharedVarGroup::Argument };
+ std::string uniq_name{}; // Unique name, also the final variable name used in the built code
+ ClKernelArgDescriptor desc{}; // Automatic variables can and should still be described using this struct
+ bool is_empty() const
+ {
+ return arg_id == g_arg_placeholder;
+ }
};
- using Arguments = std::vector<SharedVar>;
+ class Arguments
+ {
+ public:
+ Arguments() = default;
+ void add_var(const SharedVar &var)
+ {
+ ARM_COMPUTE_ERROR_ON(var.group != SharedVarGroup::Argument);
+ _vars.push_back(var);
+ }
+ std::vector<SharedVar> get_all_vars() const
+ {
+ return _vars;
+ }
+ std::vector<SharedVar> get_src_vars() const
+ {
+ std::vector<SharedVar> src_vars;
+ std::copy_if(_vars.begin(), _vars.end(), std::back_inserter(src_vars), [](const SharedVar & var)
+ {
+ return var.io == SharedVarIO::Input;
+ });
+ return src_vars;
+ }
+ SharedVar get_dst_var() const
+ {
+ std::vector<SharedVar> dst_vars;
+ std::copy_if(_vars.begin(), _vars.end(), std::back_inserter(dst_vars), [](const SharedVar & var)
+ {
+ return var.io == SharedVarIO::Output;
+ });
+ ARM_COMPUTE_ERROR_ON(dst_vars.size() != 1);
+ return dst_vars.at(0);
+ }
+
+ private:
+ std::vector<SharedVar> _vars{};
+ };
- /** @note: The order of insertion is important. There is one precondition:
+    /** Create a SharedVar for a corresponding SharedVarLink (contains ArgumentID). If one has already been created for the SharedVarLink, this is a no-op and the existing variable is kept
+ *
+ * @note: The order of insertion is important. There is one precondition:
* PRECOND: The components have been sorted topologically / is being traversed in topological order
* This ensures that all the consumer var links (Output, Automatic Links) can consume (return) the producer var links when they're referred
*/
- SharedVar add(SharedVarLink var_link, ClKernelArgRuntimeDescriptor runtime_desc, const std::string &name = "unnamed")
+ void add(SharedVarLink var_link, SharedVarGroup group, ClKernelArgDescriptor runtime_desc, const std::string &name = "unnamed")
{
ARM_COMPUTE_ERROR_ON_MSG(var_link.is_empty(), "Non-empty SharedVarLink expected");
+ if(!get(var_link).is_empty())
+ {
+ return;
+ }
+
auto var_id = _num_var;
std::stringstream ss;
ss << name << "_" << var_id;
const auto uniq_name = ss.str();
- SharedVar var{ var_link.group, uniq_name, runtime_desc };
+ SharedVar var{ var_link.arg_id, var_link.io, group, uniq_name, runtime_desc };
- if(var_link.group == SharedVarGroup::Argument)
+ if(group == SharedVarGroup::Argument)
{
_arguments.emplace(var_id, var);
+ _arg_id_map.emplace(var_link.arg_id, var_id);
_num_var++;
- _var_id_lut[var_link.arg_id] = var_id;
}
- else if(var_link.group == SharedVarGroup::Automatic)
+ else if(group == SharedVarGroup::Automatic)
{
- if(var_link.io == SharedVarIO::Output)
+ if(_global_vars.empty())
{
- _global_vars.emplace(var_id, var);
- _num_var++;
- _var_id_lut[var_link.arg_id] = var_id;
+ if(var_link.io == SharedVarIO::Output)
+ {
+ _global_vars.emplace(var_id, var);
+ _arg_id_map.emplace(var_link.arg_id, var_id);
+ _num_var++;
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Component likely not traversed in topological order");
+ }
}
else
{
- // For the input link, the var (and thus its arg_id) will always have been added by the time we get here if we traverse components in topological order
- var = get_var(var_link.arg_id);
+ // Associate additional SharedVarLinks with the single global shared variable
+ const auto global_var_id = _global_vars.begin()->first;
+ _arg_id_map[var_link.arg_id] = global_var_id;
}
}
else
{
ARM_COMPUTE_ERROR("Unrecognised SharedVarGroup");
}
- return var;
}
- SharedVar get_var(ArgumentID arg_id) const
+ /** Get the SharedVar associated with @p var_link
+ *
+     * @param var_link Link to look up
+     * @return SharedVar The associated shared variable, or an empty SharedVar if none has been added for the link
+ */
+ SharedVar get(const SharedVarLink &var_link) const
{
- const auto var_id = _var_id_lut.at(arg_id); // arg_id has to exist in lut to begin with
- auto it = _global_vars.find(var_id);
- if(it != _global_vars.end())
- {
- return it->second;
- }
- it = _arguments.find(var_id);
- if(it != _arguments.end())
+ const SharedVar empty_var{};
+ if(_arg_id_map.find(var_link.arg_id) != _arg_id_map.end())
{
- return it->second;
+ const auto var_id = _arg_id_map.at(var_link.arg_id);
+ const auto arg_var = _arguments.find(var_id);
+ if(arg_var != _arguments.end())
+ {
+ return arg_var->second;
+ }
+ else
+ {
+ return _global_vars.at(var_id);
+ }
}
- ARM_COMPUTE_ERROR("Cannot find component variable");
+ return empty_var;
}
/** @note The arguments are returned in the order they are added
@@ -162,7 +231,7 @@ public:
Arguments args{};
for(const auto &a : _arguments)
{
- args.push_back(a.second);
+ args.add_var(a.second);
}
return args;
}
@@ -171,9 +240,9 @@ private:
using VarID = int32_t;
private:
- std::map<VarID, SharedVar> _global_vars{};
- std::map<VarID, SharedVar> _arguments{};
- std::unordered_map<ArgumentID, VarID> _var_id_lut{};
+ std::map<VarID, SharedVar> _global_vars{}; // Shared, global variable
+ std::map<VarID, SharedVar> _arguments{};
+ std::map<ArgumentID, VarID> _arg_id_map{}; // Track ArgumentIDs that have already been added
VarID _num_var{ 0 };
};
@@ -184,7 +253,7 @@ enum class ComponentType
Store
};
-using ComponentID = int32_t;
+using ComponentID = DependencyGraph::Id;
using ComponentList = std::vector<ComponentID>;
class IClKernelComponent
{
@@ -224,7 +293,7 @@ public:
};
using TagLUT = std::unordered_map<Tag, TagVal>; // Used to instantiating a code template / replacing tags
public:
- IClKernelComponent(const ClKernelBlueprint *blueprint)
+ IClKernelComponent(ClKernelBlueprint *blueprint)
: _blueprint(blueprint)
{
}
@@ -304,12 +373,18 @@ public:
{
return Window{};
}
- /** "Allocate" all shared variables used in a component to the @p vtable, and generate a TagLUT used to instantiate the component code
+ /** Get the tag look-up table used to instantiate the component code.
*
* @param vtable
* @return TagLUT
*/
- virtual TagLUT allocate_vars(SharedVarTable &vtable) const = 0;
+ virtual TagLUT get_tag_lut(const SharedVarTable &vtable) const = 0;
+
+ /** Allocate all shared variables used by the component in the @p vtable
+ *
+ * @param vtable
+ */
+ virtual void allocate_shared_vars(SharedVarTable &vtable) const = 0;
virtual std::string get_dst_addr_calculation() const
{
@@ -331,7 +406,7 @@ public:
}
protected:
- const ClKernelBlueprint *_blueprint;
+ ClKernelBlueprint *_blueprint;
private:
ComponentID _id{};
@@ -348,18 +423,19 @@ public:
~Implementation() = default;
public:
- ArgumentID add_kernel_argument(const ClTensorDescriptor &tensor_desc)
+ Status update_merge_point(ArgumentID t_id, ArgumentID merge_point)
{
- _kernel_arguments.insert(std::make_pair(_num_args, tensor_desc));
- _shared_var_group_lut[_num_args] = SharedVarGroup::Argument;
- return _num_args++;
+ return _graph.update_merge_point(t_id, merge_point);
}
- ArgumentID add_intermediate_tensor()
+ ArgumentID add_kernel_tensor(ITensorInfo *tensor_info, ArgumentID merge_point = DependencyGraph::empty_id())
{
- _intermediate_tensors.insert(_num_args);
- _shared_var_group_lut[_num_args] = SharedVarGroup::Automatic;
- return _num_args++;
+ const auto id = _graph.add_tensor(merge_point);
+ if(_kernel_tensors.find(id) == _kernel_tensors.end())
+ {
+ _kernel_tensors.insert(std::make_pair(id, tensor_info));
+ }
+ return id;
}
void set_tile_info(const TileDescriptor &tile_info)
@@ -382,7 +458,7 @@ public:
for(const auto arg_id : args)
{
ARM_COMPUTE_UNUSED(arg_id);
- ARM_COMPUTE_ERROR_ON_MSG(_kernel_arguments.find(arg_id) == _kernel_arguments.end() && _intermediate_tensors.find(arg_id) == _intermediate_tensors.end() && arg_id != g_arg_placeholder,
+ ARM_COMPUTE_ERROR_ON_MSG(_kernel_tensors.find(arg_id) == _kernel_tensors.end() && arg_id != g_arg_placeholder,
"Trying to use an argument that hasn't been added to the blueprint");
}
}
@@ -395,29 +471,36 @@ public:
ARM_COMPUTE_ERROR_ON_MSG(_num_complex_components > 1, "Only one complex component per blueprint is supported.");
}
- // This flag specifies if the current component is the root of the component graph
- // If the root is set to -1, it means that a root hasn't been added yet
- bool is_graph_root = true;
-
// Get an unique ID for the component that's being added
- const ComponentID component_id = _num_components++;
+ std::vector<ArgumentID> src_tensors;
+ std::vector<ArgumentID> dst_tensors;
+ for(const auto &link : component->get_links())
+ {
+ if(link.is_empty())
+ {
+ continue;
+ }
+ if(link.io == SharedVarIO::Input)
+ {
+ src_tensors.push_back(link.arg_id);
+ }
+ else
+ {
+ dst_tensors.push_back(link.arg_id);
+ }
+ }
+ const ComponentID component_id = _graph.add_operator(src_tensors, dst_tensors).second;
component->set_id(component_id);
// Add this component to the component graph. Don't connect it to anything yet
_component_graph.emplace(component_id, ComponentList{});
- int32_t positional_arg = 0;
-
// For every { arg_id, arg_io } passed along with this component...
for(const auto &link : component->get_links())
{
const ArgumentID &arg_id = link.arg_id;
const SharedVarIO &arg_io = link.io;
- // A component is considered root only if all its input arguments are kernel arguments (or placeholders, which means nullptr)
- // This performs a check on every argument, and if one of them doesn't respect the condition, the component is not considered root
- is_graph_root &= (_kernel_arguments.find(arg_id) != _kernel_arguments.end()) || (arg_io == SharedVarIO::Output) || (arg_id == g_arg_placeholder);
-
// Add the arg_id to the map describing the input/output relationship between an argument and the components that use it, if it doesn't yet exist there
if(_outgoing_components.find(arg_id) == _outgoing_components.end())
{
@@ -454,15 +537,9 @@ public:
_incoming_components[arg_id].push_back(component_id);
}
-
- ++positional_arg;
}
- if(is_graph_root)
- {
- ARM_COMPUTE_ERROR_ON_MSG(_graph_root >= 0, "Trying to add more than one root to the graph");
- _graph_root = component_id;
- }
+ ARM_COMPUTE_ERROR_ON_MSG(_graph.get_root_ops().size() != 1, "Trying to add more than one root to the graph");
// Finally, add this component to the dictionary of components
_components.insert(std::make_pair(component_id, std::move(component)));
@@ -489,17 +566,28 @@ public:
std::set<std::string> additional_macros{};
std::vector<std::string> component_codes{}; // vector because order matters
- // Go through the components graph (topological sort) and fill the data structures above
+ // Step 1: Allocate all kernel argument shared variables before generating the component code
auto stack = topological_sort();
while(!stack.empty())
{
auto curr_component_id = stack.top();
auto &curr_component = _components.find(curr_component_id)->second;
+ curr_component->allocate_shared_vars(_vtable);
+
+ stack.pop();
+ }
+ // Step 2: Generate component codes
+ stack = topological_sort();
+ while(!stack.empty())
+ {
+ auto curr_component_id = stack.top();
+ auto &curr_component = _components.find(curr_component_id)->second;
+
auto curr_headers_list = curr_component->get_headers_list();
auto curr_additional_macros = curr_component->get_additional_macros();
auto curr_component_code = curr_component->get_component_code();
- const auto var_lut = curr_component->allocate_vars(_vtable); // Ideally can be merged with get_component_code once we have finer-grained code generation technique
+ const auto var_lut = curr_component->get_tag_lut(_vtable); // Ideally can be merged with get_component_code once we have finer-grained code generation technique
component_codes.push_back(IClKernelComponent::replace_tags(curr_component_code, var_lut));
headers_list.insert(curr_headers_list.begin(), curr_headers_list.end());
@@ -511,7 +599,7 @@ public:
stack.pop();
}
- // This section assembles the data gathered by traversing the graph into the string "code"
+ // Step 3: Assemble the data gathered by traversing the graph into the string "code"
std::string code = "";
for(auto &header : headers_list)
@@ -596,34 +684,79 @@ public:
ClKernelArgList get_arguments() const
{
ClKernelArgList arg_list{};
- for(const auto &arg_var : _vtable.get_kernel_arguments())
+ for(const auto &arg_var : _vtable.get_kernel_arguments().get_all_vars())
{
- arg_list.push_back(arg_var.desc);
+ arg_list[arg_var.desc.arg_id] = arg_var.desc;
}
return arg_list;
}
- const ClTensorDescriptor *get_kernel_argument(const ArgumentID id) const
+ /** Get the arguments as shared vars from the vtable
+ *
+ * @return SharedVarTable::Arguments
+ */
+ SharedVarTable::Arguments get_argument_shared_vars() const
+ {
+ return _vtable.get_kernel_arguments();
+ }
+
+ const ITensorInfo *get_kernel_argument_info(const ArgumentID id) const
{
- auto it = _kernel_arguments.find(id);
- if(it != _kernel_arguments.end())
+ auto it = _kernel_tensors.find(id);
+ if(it != _kernel_tensors.end())
{
- return &_kernel_arguments.find(id)->second;
+ return it->second;
}
return nullptr;
}
- ITensorInfo *get_kernel_argument_info(const ArgumentID id) const
+ ITensorInfo *get_kernel_argument_info(const ArgumentID id)
{
- const ClTensorDescriptor *arg_desc = get_kernel_argument(id);
- if(arg_desc != nullptr)
+ auto it = _kernel_tensors.find(id);
+ if(it != _kernel_tensors.end())
{
- return arg_desc->tensor_info;
+ return it->second;
}
return nullptr;
}
+ /** Finalize graph construction. Graph is expected to not mutate after being finalized
+ */
+ void finalize()
+ {
+ cache_root_component();
+ assign_shared_var_group();
+ }
+
+ DependencyGraph get_graph() const
+ {
+ return _graph;
+ }
private:
+ void cache_root_component()
+ {
+ const auto roots = _graph.get_root_ops();
+ ARM_COMPUTE_ERROR_ON_MSG(roots.size() != 1, "Trying to add more than one root to the graph");
+ _graph_root = roots.at(0);
+ }
+ /** Assign the group for each shared var. Can only be performed at the end of the graph construction, before building
+ */
+ void assign_shared_var_group()
+ {
+ for(const auto &tensor : _kernel_tensors)
+ {
+ const auto tensor_id = tensor.first;
+ if(_graph.is_src_tensor(tensor_id) || _graph.is_dst_tensor(tensor_id))
+ {
+ _shared_var_group_lut[tensor_id] = SharedVarGroup::Argument;
+ }
+ else
+ {
+ _shared_var_group_lut[tensor_id] = SharedVarGroup::Automatic;
+ }
+ }
+ }
+
void topological_sort_utility(ComponentID component_id, std::unordered_set<ComponentID> &visited, std::stack<ComponentID> &stack) const
{
visited.insert(component_id);
@@ -666,41 +799,41 @@ private:
std::string code;
switch(var.desc.tensor_arg_type)
{
- case TensorArgType::Vector:
+ case ClKernelTensorArgType::Vector:
{
code += "\n VECTOR_DECLARATION(" + var.uniq_name + ")";
break;
}
- case TensorArgType::Image:
+ case ClKernelTensorArgType::Image:
{
code += "\n IMAGE_DECLARATION(" + var.uniq_name + ")";
break;
}
- case TensorArgType::Image_3D:
+ case ClKernelTensorArgType::Image_3D:
{
code += "\n IMAGE_DECLARATION(" + var.uniq_name + "),";
code += "\n uint " + var.uniq_name + "_stride_z";
break;
}
- case TensorArgType::Image_3D_Export_To_ClImage2D:
+ case ClKernelTensorArgType::Image_3D_Export_To_ClImage2D:
{
code += "\n __read_only image2d_t " + var.uniq_name + "_img,";
code += "\n uint " + var.uniq_name + "_stride_z";
break;
}
- case TensorArgType::Tensor_4D_t_Buffer:
+ case ClKernelTensorArgType::Tensor_4D_t_Buffer:
{
code += "\n TENSOR4D_T(" + var.uniq_name + ", BUFFER)";
break;
}
- case TensorArgType::Tensor_4D_t_Image:
+ case ClKernelTensorArgType::Tensor_4D_t_Image:
{
code += "\n TENSOR4D_T(" + var.uniq_name + ", IMAGE)";
break;
}
default:
{
- ARM_COMPUTE_ERROR("Unsupported declaration generation for TensorArgType");
+ ARM_COMPUTE_ERROR("Unsupported declaration generation for ClKernelTensorArgType");
}
}
return code;
@@ -710,7 +843,7 @@ private:
{
std::string code = "\n__kernel void " + build_kernel_name() + "(";
- for(const auto &arg : argument_list)
+ for(const auto &arg : argument_list.get_all_vars())
{
code += generate_argument_declaration(arg) + ",";
}
@@ -722,54 +855,55 @@ private:
std::string generate_global_section() const
{
- std::string code = "";
- code += " uint g_x = get_global_id(0);\n";
- code += " uint g_y = get_global_id(1);\n";
- code += " uint g_z = get_global_id(2);\n\n";
+ auto dst_info = get_kernel_argument_info(_dst_id);
+ auto dst_w = dst_info->dimension(0);
+ auto dst_h = dst_info->dimension(1);
+ const auto tile_w = std::max(1, get_execution_window().x().step());
+ const auto tile_h = std::max(1, get_execution_window().y().step());
+ auto leftover_w = dst_w % tile_w;
+ auto leftover_h = dst_h % tile_h;
- size_t tile_dim_x = _tile_info.empty() ? 1 : _tile_info.tile_dims.x();
- size_t tile_dim_y = _tile_info.empty() ? 1 : _tile_info.tile_dims.y();
+ std::string code = "";
+ code += std::string(" int cout = GET_SPATIAL_IDX(0, ") + std::to_string(tile_w) + ", " + std::to_string(leftover_w) + ");\n";
+ code += std::string(" int mout = GET_SPATIAL_IDX(1, ") + std::to_string(tile_h) + ", " + std::to_string(leftover_h) + ");\n";
+ code += std::string(" int bout = GET_SPATIAL_IDX(2, 1, 0);\n\n");
switch(_tile_info.clipping)
{
case ClippingStrategy::TOP_LEFT:
- code += " const bool g_cond_x = (g_x == 0);\n";
- code += " const bool g_cond_y = (g_y == 0);\n";
+ code += " const bool g_cond_x = (cout == 0);\n";
+ code += " const bool g_cond_y = (mout == 0);\n";
break;
case ClippingStrategy::TOP_RIGHT:
- code += " const bool g_cond_x = ((g_x + 1) * " + std::to_string(tile_dim_x) + " >= " + std::to_string(_tile_info.boundaries.x()) + ");\n";
- code += " const bool g_cond_y = (g_y == 0);\n";
+ code += " const bool g_cond_x = ((cout + 1) * " + std::to_string(tile_w) + " >= " + std::to_string(_tile_info.boundaries.x()) + ");\n";
+ code += " const bool g_cond_y = (mout == 0);\n";
break;
case ClippingStrategy::BOTTOM_LEFT:
- code += " const bool g_cond_x = (g_x == 0);\n";
- code += " const bool g_cond_y = ((g_y + 1) * " + std::to_string(tile_dim_y) + " >= " + std::to_string(_tile_info.boundaries.y()) + ");\n";
+ code += " const bool g_cond_x = (cout == 0);\n";
+ code += " const bool g_cond_y = ((mout + 1) * " + std::to_string(tile_h) + " >= " + std::to_string(_tile_info.boundaries.y()) + ");\n";
break;
case ClippingStrategy::BOTTOM_RIGHT:
- code += " const bool g_cond_x = ((g_x + 1) * " + std::to_string(tile_dim_x) + " >= " + std::to_string(_tile_info.boundaries.x()) + ");\n";
- code += " const bool g_cond_y = ((g_y + 1) * " + std::to_string(tile_dim_y) + " >= " + std::to_string(_tile_info.boundaries.y()) + ");\n";
+ code += " const bool g_cond_x = ((cout + 1) * " + std::to_string(tile_w) + " >= " + std::to_string(_tile_info.boundaries.x()) + ");\n";
+ code += " const bool g_cond_y = ((mout + 1) * " + std::to_string(tile_h) + " >= " + std::to_string(_tile_info.boundaries.y()) + ");\n";
break;
default:
ARM_COMPUTE_ERROR("Unsupported clipping strategy");
}
- code += "\n REPEAT_VAR_INIT_TO_CONST(" + std::to_string(tile_dim_y) + ", uint, g_zout, 0);\n";
- code += " REPEAT_VAR_INIT_TO_CONST(16, uint, g_zero, 0);\n\n";
-
return code;
}
TileDescriptor _tile_info{};
- int32_t _num_args{};
- int32_t _num_components{};
int32_t _num_complex_components{};
ArgumentID _dst_id{ -1 }; // Initially set to -1, which means the graph has no dst yet, since node IDs are positive numbers
- // Argument, components and intermediate tensors IDs with corresponding ptrs (except intermediate)
+ DependencyGraph _graph{};
+
+ // Tensors and components, keyed by ID, with their corresponding pointers (except intermediate tensors)
std::unordered_map<ComponentID, ComponentUniquePtr> _components{};
- std::unordered_map<ArgumentID, ClTensorDescriptor> _kernel_arguments{};
- std::unordered_set<ArgumentID> _intermediate_tensors{};
+ std::unordered_map<ArgumentID, ITensorInfo *> _kernel_tensors{};
// Argument group lookup. Can be replaced by extending the ArgumentID type to include group info
std::unordered_map<ArgumentID, SharedVarGroup> _shared_var_group_lut{};
@@ -794,6 +928,4 @@ private:
} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
-#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMMON_H
-
-#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
\ No newline at end of file
+#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMMON_H
\ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Utils.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Utils.h
index 41ab4e320b..d4feac7da9 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Utils.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Utils.h
@@ -21,7 +21,9 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_UTILS
#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_UTILS
@@ -72,6 +74,4 @@ inline std::string to_string(const ClKernelCode &code)
} // namespace experimental
} // namespace arm_compute
-#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_UTILS
-
-#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) \ No newline at end of file
+#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_UTILS \ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp
index f951ce3d46..11fb1d53d0 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp
@@ -21,7 +21,9 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h"
@@ -31,6 +33,7 @@
#include "src/core/helpers/WindowHelpers.h"
#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
namespace arm_compute
{
namespace experimental
@@ -44,7 +47,7 @@ ComponentType ClDirectConvolutionKernelComponent::get_component_type() const
std::set<std::string> ClDirectConvolutionKernelComponent::get_headers_list() const
{
- return std::set<std::string> { "helpers.h", "tile_helpers.h", "repeat.h" };
+ return std::set<std::string> { "helpers.h", "tile_helpers.h" };
}
Window ClDirectConvolutionKernelComponent::get_window() const
@@ -54,7 +57,17 @@ Window ClDirectConvolutionKernelComponent::get_window() const
auto dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
// Get dst shape
- TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src_info, *weight_info, _desc.pad_stride_info);
+ PadStrideInfo pad_stride_info
+ {
+ static_cast<unsigned int>(_desc.conv2d.stride.x()),
+ static_cast<unsigned int>(_desc.conv2d.stride.y()),
+ static_cast<unsigned int>(_desc.conv2d.pad.left),
+ static_cast<unsigned int>(_desc.conv2d.pad.right),
+ static_cast<unsigned int>(_desc.conv2d.pad.top),
+ static_cast<unsigned int>(_desc.conv2d.pad.bottom),
+ DimensionRoundingType::FLOOR /*default rounding type*/
+ };
+ TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src_info, *weight_info, pad_stride_info);
// Output auto initialization if not yet initialized
auto_init_if_empty(*dst_info, output_shape,
@@ -64,6 +77,9 @@ Window ClDirectConvolutionKernelComponent::get_window() const
const unsigned int vec_size = std::min(static_cast<unsigned int>(dst_info->tensor_shape()[0]), 4u);
const unsigned int num_rows = (dst_info->tensor_shape()[0] > 16) ? ((src_info->data_type() == DataType::F32) ? 2U : 4U) : 1U;
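+ // Heuristic used above: vectorize up to 4 elements along the innermost output dimension;
+ // when that dimension exceeds 16, process 2 rows per work item for F32 and 4 for other
+ // data types, otherwise fall back to a single row.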
+ // const unsigned int num_rows = 1;
+ // const unsigned int vec_size = tile_info.tile_dims.x();
+ // const unsigned int num_rows = tile_info.tile_dims.y();
// Create and configure kernel window
Window win = calculate_max_window(output_shape, Steps(vec_size, num_rows));
@@ -95,27 +111,30 @@ std::string ClDirectConvolutionKernelComponent::get_component_code() const
//------------------ START KERNEL {{meta_kernel_id}} ---------------------
// IN_0(src) {{src}}
// IN_1(wei) {{weight}}
+ )_";
+ if(bias_info != nullptr)
+ {
+ code += R"_(
// IN_2(bia) {{bias}}
+ )_";
+ }
+ code += R"_(
// OUT(dst, accum) {{dst}}
- const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM
- const int mout = GET_SPATIAL_IDX(1, M0, 0); // WIDTH x HEIGHT
- const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
-
// Initialize the accumulators
TILE({{ACC_DATA_TYPE}}, M0, N0, {{dst}});
{
// All the tensor dimensions are passed at compile time.
// In case of dynamic tensor support, the following dimensions should be passed as function argument.
- #define _I{{WEI_WIDTH}} {{WEI_WIDTH}}
- #define _I{{WEI_HEIGHT}} {{WEI_HEIGHT}}
+ #define _IWEI_WIDTH {{WEI_WIDTH}}
+ #define _IWEI_HEIGHT {{WEI_HEIGHT}}
#define _ISRC_WIDTH {{src}}_w
#define _ISRC_HEIGHT {{src}}_h
#define _ISRC_CHANNELS {{src}}_c
- #define _IDST_WIDTH {{dst_w}}
- #define _IDST_HEIGHT {{dst_h}}
- #define _IDST_CHANNELS {{dst_c}}
- #define _IY_MULTIPLIER (_I{{WEI_WIDTH}} * _I{{WEI_HEIGHT}})
+ #define _IDST_WIDTH {{arg_dst}}_w
+ #define _IDST_HEIGHT {{arg_dst}}_h
+ #define _IDST_CHANNELS {{arg_dst}}_c
+ #define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT)
// .v = access the whole vector (OpenCL vector)
// .s[x] = access the vector element at position x (scalar access)
@@ -136,13 +155,11 @@ std::string ClDirectConvolutionKernelComponent::get_component_code() const
{{dst}}[i].v = 0;
})
- uint cond = (get_global_id(0) == 0) && (get_global_id(1) == 0) && (get_global_id(2) == 0);
-
- for(int i = 0; i < (_I{{WEI_WIDTH}} * _I{{WEI_HEIGHT}}); ++i)
+ for(int i = 0; i < (_IWEI_WIDTH * _IWEI_HEIGHT); ++i)
{
int ck = 0;
- int xk = i % _I{{WEI_WIDTH}};
- int yk = i / _I{{WEI_WIDTH}};
+ int xk = i % _IWEI_WIDTH;
+ int yk = i / _IWEI_WIDTH;
int k = 0;
for(; k <= (_ISRC_CHANNELS - K0); k += K0)
@@ -201,6 +218,16 @@ std::string ClDirectConvolutionKernelComponent::get_component_code() const
}
code += R"_(
+ #undef _IWEI_WIDTH
+ #undef _IWEI_HEIGHT
+ #undef _ISRC_WIDTH
+ #undef _ISRC_HEIGHT
+ #undef _ISRC_CHANNELS
+ #undef _IDST_WIDTH
+ #undef _IDST_HEIGHT
+ #undef _IDST_CHANNELS
+ #undef _IY_MULTIPLIER
+
}
)_";
@@ -217,44 +244,7 @@ std::string ClDirectConvolutionKernelComponent::get_component_code() const
}
code += R"_(
- #undef _I{{WEI_WIDTH}}
- #undef _I{{WEI_HEIGHT}}
- #undef _ISRC_WIDTH
- #undef _ISRC_HEIGHT
- #undef _ISRC_CHANNELS
- #undef _IDST_WIDTH
- #undef _IDST_HEIGHT
- #undef _IDST_CHANNELS
- #undef _IY_MULTIPLIER
}
-
- // Workaround for the discrepancy between tiles and repeats
- VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}0 = {{dst}}[0].v;
-#if M0 >= 2
- VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}1 = {{dst}}[1].v;
-#endif // M0 >= 2
-#if M0 >= 3
- VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}2 = {{dst}}[2].v;
-#endif // M0 >= 3
-#if M0 >= 4
- VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}3 = {{dst}}[3].v;
-#endif // M0 >= 4
-#if M0 >= 8
- VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}4 = {{dst}}[4].v;
- VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}5 = {{dst}}[5].v;
- VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}6 = {{dst}}[6].v;
- VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}7 = {{dst}}[7].v;
-#endif // M0 >= 8
-#if M0 == 16
- VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}8 = {{dst}}[8].v;
- VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}9 = {{dst}}[9].v;
- VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}A = {{dst}}[10].v;
- VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}B = {{dst}}[11].v;
- VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}C = {{dst}}[12].v;
- VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}D = {{dst}}[13].v;
- VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}E = {{dst}}[14].v;
- VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}F = {{dst}}[15].v;
-#endif // M0 == 16
//------------------ END KERNEL {{meta_kernel_id}} ---------------------
)_";
return code.c_str();
@@ -306,19 +296,18 @@ bool export_to_cl_image_support(const ITensorInfo *tensor, GPUTarget gpu_target,
CLBuildOptions ClDirectConvolutionKernelComponent::generate_build_options() const
{
const auto src_info = _blueprint->impl().get_kernel_argument_info(_src.arg_id);
- const auto weight_info = _blueprint->impl().get_kernel_argument_info(_weight.arg_id);
+ auto weight_info = _blueprint->impl().get_kernel_argument_info(_weight.arg_id);
const auto dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
+ // const auto tile_info = _blueprint->impl().get_tile_info();
const unsigned int channel_idx = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::CHANNEL);
const DataType data_type = src_info->data_type();
- const GPUTarget gpu_target = ICLKernel().get_target();
-
- Window win = get_window();
+ const GPUTarget gpu_target = CLScheduler::get().target();
- const unsigned int n0 = win.x().step();
- const unsigned int m0 = win.y().step();
+ const unsigned int n0 = _blueprint->impl().get_execution_window().x().step();
+ const unsigned int m0 = _blueprint->impl().get_execution_window().y().step();
const unsigned int k0 = adjust_vec_size(is_data_type_quantized(data_type) ? 16u : 8u, src_info->dimension(channel_idx));
- const unsigned int partial_store_n0 = dst_info->dimension(channel_idx) % n0;
+ const unsigned int partial_store_n0 = dst_info->dimension(0) % n0;
const bool export_to_cl_image = export_to_cl_image_support(weight_info, gpu_target, src_info->data_layout());
// Update the padding for the weights tensor if we can export to cl_image
@@ -338,54 +327,79 @@ CLBuildOptions ClDirectConvolutionKernelComponent::generate_build_options() cons
return build_opts;
}
-ClDirectConvolutionKernelComponent::TagLUT ClDirectConvolutionKernelComponent::allocate_vars(SharedVarTable &vtable) const
+void ClDirectConvolutionKernelComponent::allocate_shared_vars(SharedVarTable &vtable) const
+{
+ const auto src_info = _blueprint->impl().get_kernel_argument_info(_src.arg_id);
+ const auto weight_info = _blueprint->impl().get_kernel_argument_info(_weight.arg_id);
+
+ vtable.add(_src, _blueprint->impl().group(_src.arg_id), ClKernelArgDescriptor(_src.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "src");
+
+ const GPUTarget gpu_target = CLScheduler::get().target();
+ const bool export_to_cl_image = export_to_cl_image_support(weight_info, gpu_target, src_info->data_layout());
+ const ClKernelTensorArgType weight_type = export_to_cl_image ? ClKernelTensorArgType::Tensor_4D_t_Image : ClKernelTensorArgType::Tensor_4D_t_Buffer;
+ vtable.add(_weight, _blueprint->impl().group(_weight.arg_id), ClKernelArgDescriptor(_weight.arg_id, weight_type), "weight");
+
+ if(!_bias.is_empty()) // optional bias
+ {
+ vtable.add(_bias, _blueprint->impl().group(_bias.arg_id), ClKernelArgDescriptor(_bias.arg_id, ClKernelTensorArgType::Vector), "bias");
+ }
+ vtable.add(_dst, _blueprint->impl().group(_dst.arg_id), ClKernelArgDescriptor(_dst.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "dst");
+}
+
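+// Note: allocate_shared_vars() above registers this component's tensors in the shared
+// variable table during the blueprint's allocation pass, while get_tag_lut() below only
+// reads that table to resolve the {{...}} tags in the kernel template.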
+ClDirectConvolutionKernelComponent::TagLUT ClDirectConvolutionKernelComponent::get_tag_lut(const SharedVarTable &vtable) const
{
TagLUT lut{};
const auto src_info = _blueprint->impl().get_kernel_argument_info(_src.arg_id);
const auto weight_info = _blueprint->impl().get_kernel_argument_info(_weight.arg_id);
const auto bias_info = _blueprint->impl().get_kernel_argument_info(_bias.arg_id);
- const auto dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
-
- const GPUTarget gpu_target = ICLKernel().get_target();
- const bool export_to_cl_image = export_to_cl_image_support(weight_info, gpu_target, src_info->data_layout());
- const TensorArgType weight_type = export_to_cl_image ? TensorArgType::Tensor_4D_t_Image : TensorArgType::Tensor_4D_t_Buffer;
- lut["meta_kernel_id"] = id();
- lut["src"] = vtable.add(_src, ClKernelArgRuntimeDescriptor(_src.arg_id, TensorArgType::Tensor_4D_t_Buffer), "src");
- lut["weight"] = vtable.add(_weight, ClKernelArgRuntimeDescriptor(_weight.arg_id, weight_type), "weight");
+ // Arguments and global shared variables
+ lut["src"] = vtable.get(_src);
+ lut["weight"] = vtable.get(_weight);
if(!_bias.is_empty()) // optional bias
{
- lut["bias"] = vtable.add(_bias, ClKernelArgRuntimeDescriptor(_bias.arg_id, TensorArgType::Vector), "bias");
+ lut["bias"] = vtable.get(_bias);
lut["BIA_DATA_TYPE"] = get_cl_type_from_data_type(bias_info->data_type());
}
- lut["dst"] = vtable.add(_dst, ClKernelArgRuntimeDescriptor(_dst.arg_id, TensorArgType::Tensor_4D_t_Buffer), "dst");
-
- // Local build options
- const auto width_idx = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::WIDTH);
- const auto height_idx = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::HEIGHT);
- const auto channel_idx = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::CHANNEL);
+ lut["dst"] = vtable.get(_dst);
- lut["dst_w"] = dst_info->dimension(width_idx);
- lut["dst_h"] = dst_info->dimension(height_idx);
- lut["dst_c"] = dst_info->dimension(channel_idx);
+ const auto dst_argument = _blueprint->impl().get_argument_shared_vars().get_dst_var();
+ lut["arg_dst"] = dst_argument.uniq_name;
- lut["ACC_DATA_TYPE"] = src_info->data_type();
- lut["SRC_DATA_TYPE"] = src_info->data_type();
- lut["WEI_DATA_TYPE"] = weight_info->data_type();
+ // Local build options
+ lut["meta_kernel_id"] = id();
+ lut["ACC_DATA_TYPE"] = src_info->data_type();
+ lut["SRC_DATA_TYPE"] = src_info->data_type();
+ lut["WEI_DATA_TYPE"] = weight_info->data_type();
lut["SRC_TENSOR_TYPE"] = "BUFFER";
- lut["WEI_TENSOR_TYPE"] = export_to_cl_image ? "IMAGE" : "BUFFER";
-
- lut["WEI_WIDTH"] = weight_info->dimension(width_idx);
- lut["WEI_HEIGHT"] = weight_info->dimension(height_idx);
+ switch(vtable.get(_weight).desc.tensor_arg_type)
+ {
+ case ClKernelTensorArgType::Image_Export_To_ClImage2D:
+ case ClKernelTensorArgType::Image_3D_Export_To_ClImage2D:
+ case ClKernelTensorArgType::Tensor_4D_t_Image:
+ {
+ lut["WEI_TENSOR_TYPE"] = "IMAGE";
+ break;
+ }
+ default:
+ {
+ lut["WEI_TENSOR_TYPE"] = "BUFFER";
+ break;
+ }
+ }
+ const auto width_idx = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::WIDTH);
+ const auto height_idx = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::HEIGHT);
+ lut["WEI_WIDTH"] = weight_info->dimension(width_idx);
+ lut["WEI_HEIGHT"] = weight_info->dimension(height_idx);
- lut["STRIDE_X"] = std::get<0>(_desc.pad_stride_info.stride());
- lut["STRIDE_Y"] = std::get<1>(_desc.pad_stride_info.stride());
+ lut["STRIDE_X"] = _desc.conv2d.stride.x();
+ lut["STRIDE_Y"] = _desc.conv2d.stride.y();
- lut["PAD_LEFT"] = _desc.pad_stride_info.pad_left();
- lut["PAD_TOP"] = _desc.pad_stride_info.pad_top();
+ lut["PAD_LEFT"] = _desc.conv2d.pad.left;
+ lut["PAD_TOP"] = _desc.conv2d.pad.top;
lut["ZERO_VALUE"] = 0;
@@ -393,6 +407,4 @@ ClDirectConvolutionKernelComponent::TagLUT ClDirectConvolutionKernelComponent::a
}
} // namespace dynamic_fusion
} // namespace experimental
-} // namespace arm_compute
-
-#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
\ No newline at end of file
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h
index 10c0e00a58..af9a65debc 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h
@@ -21,7 +21,9 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLDIRECTCONVOLUTIONKERNELCOMPONENT_H
#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLDIRECTCONVOLUTIONKERNELCOMPONENT_H
@@ -39,7 +41,7 @@ namespace dynamic_fusion
class ClDirectConvolutionKernelComponent : public IClKernelComponent
{
public:
- ClDirectConvolutionKernelComponent(const ClKernelBlueprint *blueprint, const DirectConvolutionDescriptor &desc,
+ ClDirectConvolutionKernelComponent(ClKernelBlueprint *blueprint, const ClDirectConv2dKernelDescriptor &desc,
const Link &src, const Link &weight, const Link &dst, const Link &bias = Link{})
: IClKernelComponent(blueprint), _desc{ desc }, _src{ src }, _weight{ weight }, _bias{ bias }, _dst{ dst }
{
@@ -58,7 +60,8 @@ public:
return { _src, _weight, _bias, _dst };
}
- virtual TagLUT allocate_vars(SharedVarTable &vtable) const override;
+ virtual TagLUT get_tag_lut(const SharedVarTable &vtable) const override;
+ virtual void allocate_shared_vars(SharedVarTable &vtable) const override;
virtual std::string name() const override
{
@@ -66,16 +69,14 @@ public:
}
private:
- DirectConvolutionDescriptor _desc{};
- Link _src{};
- Link _weight{};
- Link _bias{};
- Link _dst{};
+ ClDirectConv2dKernelDescriptor _desc{};
+ Link _src{};
+ Link _weight{};
+ Link _bias{};
+ Link _dst{};
};
} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
-#endif // ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLDIRECTCONVOLUTIONKERNELCOMPONENT_H
-
-#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
\ No newline at end of file
+#endif // ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLDIRECTCONVOLUTIONKERNELCOMPONENT_H
\ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp
index 84e4003d5d..2bbea8725d 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp
@@ -21,7 +21,9 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h"
#include "arm_compute/core/Validate.h"
@@ -41,7 +43,7 @@ ComponentType ClElementwiseAddKernelComponent::get_component_type() const
std::set<std::string> ClElementwiseAddKernelComponent::get_headers_list() const
{
- return std::set<std::string> { "common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h", "gemm_helpers.h", "repeat.h", "tile_helpers.h" };
+ return std::set<std::string> { "common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h", "tile_helpers.h" };
}
Window ClElementwiseAddKernelComponent::get_window() const
@@ -67,63 +69,62 @@ Window ClElementwiseAddKernelComponent::get_window() const
std::string ClElementwiseAddKernelComponent::get_component_code() const
{
std::string code;
- return R"_(
+ const bool is_root = _blueprint->impl().group(_lhs.arg_id) == SharedVarGroup::Argument && _blueprint->impl().group(_rhs.arg_id) == SharedVarGroup::Argument;
+
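+ // Root variant: both inputs are kernel Arguments, so the lhs and rhs tiles are loaded and
+ // their sum is written into a fresh dst tile. Non-root variant: the accumulator tile
+ // produced by a previous component is updated in place with the loaded addend tile.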
+ if(is_root)
+ {
+ return R"_(
//------------------ START KERNEL {{meta_kernel_id}} ELTWISE_ADD ---------------------
- // IN_0(Accumulator) {{acc}}
- // IN_1(Addend) {{addend}}
+ // IN_0(LHS) {{lhs}}
+ // IN_1(RHS) {{rhs}}
+ // OUT(dst, accum) {{dst}}
- // c = addend + c (mix-precision, broadcast, boundary aware)
+ // dst = lhs + rhs (mixed-precision, broadcast, boundary aware)
+ TILE({{DATA_TYPE}}, M0, N0, {{dst}});
{
- __global uchar *addend_addr = {{addend}}_ptr + {{addend}}_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(g_y, M0, PARTIAL_STORE_M0) * {{addend}}_stride_y) + get_global_id(2) * {{addend}}_stride_z; \
- LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, addend, addend_addr, 0, {{addend}}_stride_y, g_zero, PARTIAL_LOAD_M0, PARTIAL_LOAD_N0, PARTIAL_COND_Y, PARTIAL_COND_X); \
- MIXED_PRECISION_ELTWISE_OP_BLOCK(ADD_X_POS_0, M0, N0, {{acc}}, addend, DATA_TYPE_ACCUMULATOR, addend_hp);
- }
+ TILE({{DATA_TYPE}}, M0, N0, lhs_tile);
+ TILE({{DATA_TYPE}}, M0, N0, rhs_tile);
- // Workaround for the discrepancy between tiles and repeats
-#if defined(IS_TILED)
- {{acc}}[0].v = {{acc}}0;
-#if M0 >= 2
- {{acc}}[1].v = {{acc}}1;
-#endif // M0 >= 2
-#if M0 >= 3
- {{acc}}[2].v = {{acc}}2;
-#endif // M0 >= 3
-#if M0 >= 4
- {{acc}}[3].v = {{acc}}3;
-#endif // M0 >= 4
-#if M0 >= 8
- {{acc}}[4].v = {{acc}}4;
- {{acc}}[5].v = {{acc}}5;
- {{acc}}[6].v = {{acc}}6;
- {{acc}}[7].v = {{acc}}7;
-#endif // M0 >= 8
-#if M0 == 16
- {{acc}}[8].v = {{acc}}8;
- {{acc}}[9].v = {{acc}}9;
- {{acc}}[10].v = {{acc}}A;
- {{acc}}[11].v = {{acc}}B;
- {{acc}}[12].v = {{acc}}C;
- {{acc}}[13].v = {{acc}}D;
- {{acc}}[14].v = {{acc}}E;
- {{acc}}[15].v = {{acc}}F;
-#endif // M0 == 16
-#endif // defined(IS_TILED)
+ T_LOAD({{DATA_TYPE}}, M0, N0, BUFFER, {{lhs}}, cout, mout, 1, {{lhs}}_stride_y, lhs_tile);
+ T_LOAD({{DATA_TYPE}}, M0, N0, BUFFER, {{rhs}}, cout, mout, 1, {{rhs}}_stride_y, rhs_tile);
+
+ T_ADD_BROADCAST_X({{DATA_TYPE}}, M0, N0, lhs_tile, rhs_tile, {{dst}});
+ }
//------------------ END KERNEL {{meta_kernel_id}} ELTWISE_ADD ---------------------
+)_";
+ }
+ else
+ {
+ return R"_(
+ //------------------ START KERNEL {{meta_kernel_id}} ELTWISE_ADD ---------------------
+ // IN_0/Out(Accumulator) {{acc}}
+ // IN_1(Addend) {{addend}}
+ // acc = addend + acc (mixed-precision, broadcast, boundary aware)
+ {
+ TILE({{DATA_TYPE}}, M0, N0, addend_tile);
+
+ T_LOAD({{DATA_TYPE}}, M0, N0, BUFFER, {{addend}}, cout, mout, 1, {{addend}}_stride_y, addend_tile);
+
+ T_ADD_BROADCAST_X({{DATA_TYPE}}, M0, N0, {{acc}}, addend_tile, {{acc}});
+ }
+ //------------------ END KERNEL {{meta_kernel_id}} ELTWISE_ADD ---------------------
)_";
+ }
}
CLBuildOptions ClElementwiseAddKernelComponent::generate_build_options() const
{
- auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
- auto tile_info = _blueprint->impl().get_tile_info();
+ const auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
CLBuildOptions build_opts{};
+ const auto n0 = _blueprint->impl().get_execution_window().x().step();
+ const auto m0 = _blueprint->impl().get_execution_window().y().step();
+ const auto partial_m0 = t_dst_info->dimension(1) % m0;
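+ // M0/N0 are now taken from the blueprint's shared execution window rather than from
+ // per-component tile info, presumably so that the tile shape stays consistent across the
+ // fused components.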
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(t_dst_info->data_type()));
- build_opts.add_option("-DM0=" + support::cpp11::to_string(tile_info.tile_dims.y()));
- build_opts.add_option("-DN0=" + support::cpp11::to_string(tile_info.tile_dims.x()));
- build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(tile_info.boundaries.y() % tile_info.tile_dims.y()));
+ build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
+ build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
+ build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_m0));
return build_opts;
}
@@ -142,34 +143,56 @@ std::string ClElementwiseAddKernelComponent::generate_config_id() const
return config_id;
}
-ClElementwiseAddKernelComponent::TagLUT ClElementwiseAddKernelComponent::allocate_vars(SharedVarTable &vtable) const
+void ClElementwiseAddKernelComponent::allocate_shared_vars(SharedVarTable &vtable) const
{
- // Determine which argument is the accumulator
- Link accumulator;
- Link addend;
- if(_lhs.group == SharedVarGroup::Automatic)
+ const bool is_root = _blueprint->impl().group(_lhs.arg_id) == SharedVarGroup::Argument && _blueprint->impl().group(_rhs.arg_id) == SharedVarGroup::Argument;
+ vtable.add(_lhs, _blueprint->impl().group(_lhs.arg_id), ClKernelArgDescriptor(_lhs.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "lhs");
+ vtable.add(_rhs, _blueprint->impl().group(_rhs.arg_id), ClKernelArgDescriptor(_rhs.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "rhs");
+ if(is_root)
{
- accumulator = _lhs;
- addend = _rhs;
+ vtable.add(_dst, _blueprint->impl().group(_dst.arg_id), ClKernelArgDescriptor(_dst.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "dst");
}
- else if(_rhs.group == SharedVarGroup::Automatic)
+}
+
+ClElementwiseAddKernelComponent::TagLUT ClElementwiseAddKernelComponent::get_tag_lut(const SharedVarTable &vtable) const
+{
+ TagLUT lut{};
+ const auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
+ // Arguments and global shared variables
+ const bool is_root = _blueprint->impl().group(_lhs.arg_id) == SharedVarGroup::Argument && _blueprint->impl().group(_rhs.arg_id) == SharedVarGroup::Argument;
+ if(is_root)
{
- accumulator = _rhs;
- addend = _lhs;
+ lut["lhs"] = vtable.get(_lhs);
+ lut["rhs"] = vtable.get(_rhs);
+ lut["dst"] = vtable.get(_dst);
}
else
{
- ARM_COMPUTE_ERROR("Invalid elementwise component linking");
+ // Determine which link is the accumulator
+ Link accumulator;
+ Link addend;
+ if(_blueprint->impl().group(_lhs.arg_id) == SharedVarGroup::Automatic)
+ {
+ accumulator = _lhs;
+ addend = _rhs;
+ }
+ else if(_blueprint->impl().group(_rhs.arg_id) == SharedVarGroup::Automatic)
+ {
+ accumulator = _rhs;
+ addend = _lhs;
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Invalid elementwise component linking");
+ }
+ lut["acc"] = vtable.get(accumulator);
+ lut["addend"] = vtable.get(addend);
}
- return {
- { "meta_kernel_id", id() },
- { "acc", vtable.add(accumulator, ClKernelArgRuntimeDescriptor(accumulator.arg_id, TensorArgType::Image_3D), "add_acc") },
- { "addend", vtable.add(addend, ClKernelArgRuntimeDescriptor(addend.arg_id, TensorArgType::Image_3D), "add_addend") },
- // {"dst", vtable.add(_dst, ClKernelArgRuntimeDescriptor(_dst.arg_id, TensorArgType::Image_3D), "dst")}, // dst is needed for the root version and/or non-inplace version should we need one
- };
+ // Local build options
+ lut["meta_kernel_id"] = id();
+ lut["DATA_TYPE"] = get_cl_type_from_data_type(t_dst_info->data_type());
+ return lut;
}
} // namespace dynamic_fusion
} // namespace experimental
-} // namespace arm_compute
-
-#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
\ No newline at end of file
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h
index 35c9538b8d..4f7b69724d 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h
@@ -21,7 +21,9 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLELEMENTWISEADDKERNELCOMPONENT_H
#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLELEMENTWISEADDKERNELCOMPONENT_H
@@ -37,7 +39,7 @@ namespace dynamic_fusion
class ClElementwiseAddKernelComponent : public IClKernelComponent
{
public:
- ClElementwiseAddKernelComponent(const ClKernelBlueprint *blueprint, const Link &lhs, const Link &rhs, const Link &dst)
+ ClElementwiseAddKernelComponent(ClKernelBlueprint *blueprint, const Link &lhs, const Link &rhs, const Link &dst)
: IClKernelComponent(blueprint), _lhs{ lhs }, _rhs{ rhs }, _dst{ dst }
{
}
@@ -54,7 +56,8 @@ public:
return { _lhs, _rhs, _dst };
}
- virtual TagLUT allocate_vars(SharedVarTable &vtable) const override;
+ virtual TagLUT get_tag_lut(const SharedVarTable &vtable) const override;
+ virtual void allocate_shared_vars(SharedVarTable &vtable) const override;
virtual std::string name() const override
{
@@ -70,6 +73,4 @@ private:
} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
-#endif // ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLELEMENTWISEADDKERNELCOMPONENT_H
-
-#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) \ No newline at end of file
+#endif // ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLELEMENTWISEADDKERNELCOMPONENT_H \ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp
deleted file mode 100644
index 45b81b424d..0000000000
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp
+++ /dev/null
@@ -1,555 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
-
-#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include "src/core/utils/helpers/float_ops.h"
-#include "support/StringSupport.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-ComponentType ClGemmNativeKernelComponent::get_component_type() const
-{
- return ComponentType::Complex;
-}
-
-std::set<std::string> ClGemmNativeKernelComponent::get_headers_list() const
-{
- return std::set<std::string> { "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h", "gemm_helpers.h", "repeat.h" };
-}
-
-Window ClGemmNativeKernelComponent::get_window() const
-{
- ITensorInfo *lhs_info = _blueprint->impl().get_kernel_argument_info(_lhs.arg_id);
- ITensorInfo *rhs_info = _blueprint->impl().get_kernel_argument_info(_rhs.arg_id);
- ITensorInfo *bias_info = _blueprint->impl().get_kernel_argument_info(_bias.arg_id);
- ITensorInfo *dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
-
- ARM_COMPUTE_ERROR_ON_NULLPTR(lhs_info, rhs_info, dst_info);
-
- bool reinterpret_input_as_3d = _desc.reinterpret_input_as_3d;
- bool reinterpret_output_as_3d = _desc.depth_output_gemm3d != 0;
-
- Window win{};
- Window win_out{};
- bool window_changed = false;
-
- // In case both input and dst have to be reinterpreted as 3D tensors,
- // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
- if(reinterpret_input_as_3d == reinterpret_output_as_3d)
- {
- reinterpret_output_as_3d = false;
- }
-
- // activation_layer is set to dummy because it's required by GEMMKernelInfo, but it's not used in shape calculation
- GEMMKernelInfo gemm_info(_desc.m, _desc.n, _desc.k, _desc.depth_output_gemm3d, _desc.reinterpret_input_as_3d,
- _desc.broadcast_bias, _desc.fp_mixed_precision, _desc.has_pad_y, ActivationLayerInfo(), _desc.nmult_transpose1xW_width,
- _desc.mult_interleave4x4_height, _desc.lhs_info, _desc.rhs_info, _desc.a_offset, _desc.b_offset);
-
- // dst tensor auto initialization if not yet initialized
- auto_init_if_empty(*dst_info, lhs_info->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*lhs_info, *rhs_info, gemm_info)));
-
- TensorInfo tmp_info(*dst_info);
-
- if(reinterpret_output_as_3d)
- {
- // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
- // the window needs to be constructed on the 2D collapsed version of the tensor
- TensorShape tmp_shape(dst_info->tensor_shape());
- tmp_shape.collapse(2U, 1U);
- tmp_info.set_tensor_shape(tmp_shape);
- }
-
- win = calculate_max_window(tmp_info, Steps(_desc.rhs_info.n0, _desc.lhs_info.m0));
- win_out = calculate_max_window(*dst_info, Steps(_desc.rhs_info.n0, _desc.lhs_info.m0));
-
- AccessWindowStatic src0_access(lhs_info, 0, 0,
- lhs_info->dimension(0),
- lhs_info->dimension(1));
- AccessWindowStatic src1_access(rhs_info, 0, 0,
- ceil_to_multiple(rhs_info->dimension(0), _desc.rhs_info.n0),
- rhs_info->dimension(1));
- AccessWindowStatic dst_access(dst_info, 0, 0,
- dst_info->dimension(0),
- dst_info->dimension(1));
-
- if(bias_info != nullptr)
- {
- const int bias_processed_per_iteration_x = _desc.rhs_info.n0;
-
- AccessWindowStatic src2_access(bias_info, 0, 0,
- ceil_to_multiple(bias_info->dimension(0), bias_processed_per_iteration_x),
- bias_info->dimension(1));
-
- window_changed = update_window_and_padding(win, src0_access, src1_access, src2_access) || // window used by the execute_window_loop
- update_window_and_padding(win_out, dst_access); // window used to update the padding requirements of dst tensor
- }
- else
- {
- window_changed = update_window_and_padding(win, src0_access, src1_access) || // window used by the execute_window_loop
- update_window_and_padding(win_out, dst_access); // window used to update the padding requirements of dst tensor
- }
-
- // Collapse along the Z direction
- // This collapse needs to be here in order to tune the Z dimension of LWS
- Window collapsed = win;
- const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst_info->num_dimensions()), 2u);
- collapsed = win.collapse(win, dimension_to_collapse);
-
- if(window_changed == true)
- {
- ARM_COMPUTE_ERROR("Insufficient Padding!");
- }
-
- return collapsed;
-}
-
-std::string ClGemmNativeKernelComponent::get_additional_macros() const
-{
- return R"_(
-#define VFMA(a, b, c) \
-({ \
- c = fma(a, b, c); \
-})
-
-#if M0 == 1
-#define RHS_VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- })
-#elif M0 == 2 // M0 == 2
-#define RHS_VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
- })
-#elif M0 == 3 // M0 == 3
-#define RHS_VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
- })
-#elif M0 == 4 // M0 == 4
-#define RHS_VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
- })
-#elif M0 == 5 // M0 == 5
-#define RHS_VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
- })
-#elif M0 == 6 // M0 == 6
-#define RHS_VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
- })
-#elif M0 == 7 // M0 == 7
-#define RHS_VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
- })
-#elif M0 == 8 // M0 == 8
-#define RHS_VFMA_M0xN0(i, a, b, c) \
- ({ \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
- VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
- })
-#else // M0 not supported
-#error "M0 not supported"
-#endif // M0 not supported
-)_";
-}
-
-std::string ClGemmNativeKernelComponent::get_component_code() const
-{
- auto t_lhs_info = _blueprint->impl().get_kernel_argument_info(_lhs.arg_id);
- auto t_rhs_info = _blueprint->impl().get_kernel_argument_info(_rhs.arg_id);
-
- auto has_alpha = !(helpers::float_ops::is_one(_desc.alpha));
- auto reinterpret_input_as_3d = _desc.reinterpret_input_as_3d && _desc.depth_output_gemm3d == 0;
- auto dont_slide_b = t_rhs_info->num_dimensions() < t_lhs_info->num_dimensions();
-
- std::string code = R"_(
- //------------------ START KERNEL {{meta_kernel_id}} ---------------------
- // IN_0(lhs) {{lhs}}
- // IN_1(rhs) {{rhs}}
- )_";
-
- if(!_bias.is_empty())
- {
- code += R"_(
- // IN_2(bias) {{bias}}
- )_";
- }
-
- code += R"_(
- // OUT(dst, accum) {{dst}}
-
- // Initialize the accumulators
- REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), {{dst}}, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
- {
-#if defined(DUMMY_WORK_ITEMS)
- if((g_x * N0 >= N) || (g_y * M0 >= M))
- {
- return;
- }
-#endif // defined(DUMMY_WORK_ITEMS)
-
- // Compute LHS matrix address
- uint lhs_offset = {{lhs}}_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(g_y, M0, PARTIAL_STORE_M0) * (uint){{lhs}}_stride_y;
-
- // Compute RHS matrix address
- uint rhs_offset = {{rhs}}_offset_first_element_in_bytes + g_x * N0 * sizeof(DATA_TYPE);
- )_";
-
- if(dont_slide_b)
- {
- code += R"_(
- // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
- rhs_offset += (g_z % {{MATRIX_B_DEPTH}}) * {{rhs}}_stride_z;
- )_";
- }
- else
- {
- code += R"_(
- rhs_offset += g_z * {{rhs}}_stride_z;
- )_";
- }
-
- code += R"_(
- REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);
- )_";
-
- if(reinterpret_input_as_3d)
- {
- code += R"_(
- // The plane (zlhs) is calculated dividing M (g_y * M0) by HEIGHT_GEMM3D
- CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(g_y, M0, PARTIAL_STORE_M0), {{HEIGHT_GEMM3D}}, {{DEPTH_GEMM3D}}, {{lhs}}_cross_plane_pad, {{lhs}}_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply lhs_stride_z by DEPTH_GEMM3D
- lhs_offset += g_z * {{lhs}}_stride_z * {{DEPTH_GEMM3D}};
- )_";
- }
- else
- {
- code += R"_(
- // Add offset for batched GEMM
- lhs_offset += g_z * {{lhs}}_stride_z;
- )_";
- }
-
- code += R"_(
- int i = 0;
-#if {{K0}} > 1
- for(; i <= (K - {{K0}}); i += {{K0}})
- {
- // Supported cases (M0, K0):
- // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
- // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
- // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
- // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
- // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
- // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
- // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
- // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
- // Load values from LHS matrix
- LOAD_BLOCK(M0, {{K0}}, DATA_TYPE, a, {{lhs}}_ptr, lhs_offset, {{lhs}}_stride_y, zlhs);
-
- // Load values from RHS matrix
- LOAD_BLOCK({{K0}}, N0, DATA_TYPE, b, {{rhs}}_ptr, rhs_offset, {{rhs}}_stride_y, g_zero);
-
- RHS_VFMA_M0xN0(0, a, b0, {{dst}});
- RHS_VFMA_M0xN0(1, a, b1, {{dst}});
-#if {{K0}} > 2
- RHS_VFMA_M0xN0(2, a, b2, {{dst}});
-#endif // K0 > 2
-#if {{K0}} > 3
- RHS_VFMA_M0xN0(3, a, b3, {{dst}});
-#endif // K0 > 3
-#if {{K0}} > 4
- RHS_VFMA_M0xN0(4, a, b4, {{dst}});
- RHS_VFMA_M0xN0(5, a, b5, {{dst}});
- RHS_VFMA_M0xN0(6, a, b6, {{dst}});
- RHS_VFMA_M0xN0(7, a, b7, {{dst}});
-#endif // K0 > 4
-#if {{K0}} > 8
- RHS_VFMA_M0xN0(8, a, b8, {{dst}});
- RHS_VFMA_M0xN0(9, a, b9, {{dst}});
- RHS_VFMA_M0xN0(A, a, bA, {{dst}});
- RHS_VFMA_M0xN0(B, a, bB, {{dst}});
- RHS_VFMA_M0xN0(C, a, bC, {{dst}});
- RHS_VFMA_M0xN0(D, a, bD, {{dst}});
- RHS_VFMA_M0xN0(E, a, bE, {{dst}});
- RHS_VFMA_M0xN0(F, a, bF, {{dst}});
-#endif // K0 > 8
-
- lhs_offset += {{K0}} * sizeof(DATA_TYPE);
- rhs_offset += {{K0}} * {{rhs}}_stride_y;
- }
-#endif // K0 > 1
- // Left-over accumulations
- for(; i < K; ++i)
- {
- // Load values from LHS matrix
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a0 = *((__global DATA_TYPE *)({{lhs}}_ptr + lhs_offset + 0 * {{lhs}}_stride_y + zlhs0));
-#if M0 > 1
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a1 = *((__global DATA_TYPE *)({{lhs}}_ptr + lhs_offset + 1 * {{lhs}}_stride_y + zlhs1));
-#endif // M0 > 1
-#if M0 > 2
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a2 = *((__global DATA_TYPE *)({{lhs}}_ptr + lhs_offset + 2 * {{lhs}}_stride_y + zlhs2));
-#endif // M0 > 2
-#if M0 > 3
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a3 = *((__global DATA_TYPE *)({{lhs}}_ptr + lhs_offset + 3 * {{lhs}}_stride_y + zlhs3));
-#endif // M0 > 3
-#if M0 > 4
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a4 = *((__global DATA_TYPE *)({{lhs}}_ptr + lhs_offset + 4 * {{lhs}}_stride_y + zlhs4));
-#endif // M0 > 4
-#if M0 > 5
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a5 = *((__global DATA_TYPE *)({{lhs}}_ptr + lhs_offset + 5 * {{lhs}}_stride_y + zlhs5));
-#endif // M0 > 5
-#if M0 > 6
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a6 = *((__global DATA_TYPE *)({{lhs}}_ptr + lhs_offset + 6 * {{lhs}}_stride_y + zlhs6));
-#endif // M0 > 6
-#if M0 > 7
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a7 = *((__global DATA_TYPE *)({{lhs}}_ptr + lhs_offset + 7 * {{lhs}}_stride_y + zlhs7));
-#endif // M0 > 7
-
- VEC_DATA_TYPE(DATA_TYPE, N0)
- b = VLOAD(N0)(0, (__global DATA_TYPE *)({{rhs}}_ptr + rhs_offset + 0 * {{rhs}}_stride_y));
- RHS_VFMA_M0xN0(0, a, b, {{dst}});
-
- lhs_offset += sizeof(DATA_TYPE);
- rhs_offset += {{rhs}}_stride_y;
- }
-
- // Multiply by the weight of matrix-matrix product and store the result
- )_";
- if(has_alpha)
- {
- code += R"_(
- SCALE_BLOCK(M0, DATA_TYPE, {{dst}}, {{ALPHA}});
- )_";
- }
-
- if(!_bias.is_empty())
- {
- if(_desc.broadcast_bias)
- {
- code += R"_(
- // Add beta*bias
- __global uchar *bias_addr = {{bias}}_ptr + {{bias}}_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
-
- LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, {{bias}}_stride_y, g_zero);
- )_";
-
- if(helpers::float_ops::is_one(_desc.beta))
- {
- code += R"_(
- SCALE_BLOCK(1, DATA_TYPE, bias, {{BETA}});
- )_";
- }
-
- code += R"_(
- // c = c + bias[broadcasted]
- ADD_BLOCK_BROADCAST(M0, {{dst}}, bias0);
- )_";
- }
- else
- {
- code += R"_(
- // Add beta*bias
- __global uchar *bias_addr = {{bias}}_ptr + {{bias}}_offset_first_element_in_bytes + (g_x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(g_y, M0,
- PARTIAL_STORE_M0)
- * {{bias}}_stride_y)
- + g_z * {{bias}}_stride_z;
-
- LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, {{bias}}_stride_y, g_zero);
- )_";
-
- if(helpers::float_ops::is_one(_desc.beta))
- {
- code += R"_(
- SCALE_BLOCK(M0, DATA_TYPE, bias, {{BETA}});
- )_";
- }
-
- code += R"_(
- // c = c + bias
- ADD_BLOCK(M0, {{dst}}, bias);
- )_";
- }
- }
-
- code += R"_(
- }
- //------------------ END KERNEL {{meta_kernel_id}} ---------------------
- )_";
- return code.c_str();
-}
-
-CLBuildOptions ClGemmNativeKernelComponent::generate_build_options() const
-{
- auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
- auto tile_info = _blueprint->impl().get_tile_info();
-
- CLBuildOptions build_opts{};
-
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(t_dst_info->data_type()));
- build_opts.add_option("-DM=" + support::cpp11::to_string(tile_info.boundaries.y()));
- build_opts.add_option("-DN=" + support::cpp11::to_string(tile_info.boundaries.x()));
- build_opts.add_option("-DK=" + support::cpp11::to_string(_desc.k));
- build_opts.add_option("-DM0=" + support::cpp11::to_string(tile_info.tile_dims.y()));
- build_opts.add_option("-DN0=" + support::cpp11::to_string(tile_info.tile_dims.x()));
- build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(tile_info.boundaries.y() % tile_info.tile_dims.y()));
- build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(tile_info.boundaries.x() % tile_info.tile_dims.x()));
-
- return build_opts;
-}
-
-std::string ClGemmNativeKernelComponent::generate_config_id() const
-{
- auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
- std::string config_id{};
- config_id += (_bias.is_empty() ? "add_bias_" : "");
- config_id += (_desc.broadcast_bias ? "broadcast_bias_" : "");
- config_id += (_desc.reinterpret_input_as_3d ? "3di_" : "");
- config_id += (_desc.depth_output_gemm3d > 0 ? "3do_" : "");
- config_id += lower_string(string_from_data_type(t_dst_info->data_type()));
- config_id += "_";
- config_id += support::cpp11::to_string(t_dst_info->dimension(1));
- config_id += "_";
- config_id += support::cpp11::to_string(t_dst_info->dimension(0));
- config_id += "_";
- config_id += support::cpp11::to_string(_desc.k);
- config_id += "_";
- config_id += support::cpp11::to_string(t_dst_info->dimension(2));
- config_id += "_";
- config_id += support::cpp11::to_string(_desc.lhs_info.m0);
- config_id += "_";
- config_id += support::cpp11::to_string(_desc.rhs_info.n0);
- config_id += "_";
- config_id += support::cpp11::to_string(_desc.rhs_info.k0);
- return config_id;
-}
-
-ClGemmNativeKernelComponent::TagLUT ClGemmNativeKernelComponent::allocate_vars(SharedVarTable &vtable) const
-{
- TagLUT lut{};
-
- lut["meta_kernel_id"] = id();
- lut["lhs"] = vtable.add(_lhs, ClKernelArgRuntimeDescriptor(_lhs.arg_id, TensorArgType::Image_3D), "lhs");
- lut["rhs"] = vtable.add(_rhs, ClKernelArgRuntimeDescriptor(_rhs.arg_id, TensorArgType::Image_3D), "rhs");
- if(!_bias.is_empty()) // optional bias
- {
- lut["bias"] = vtable.add(_bias, ClKernelArgRuntimeDescriptor(_bias.arg_id, TensorArgType::Image_3D), "bias");
- }
- lut["dst"] = vtable.add(_dst, ClKernelArgRuntimeDescriptor(_dst.arg_id, TensorArgType::Image_3D), "dst");
-
- // Local build options
- auto t_lhs_info = _blueprint->impl().get_kernel_argument_info(_lhs.arg_id);
- auto t_rhs_info = _blueprint->impl().get_kernel_argument_info(_rhs.arg_id);
- auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
-
- auto has_alpha = !(helpers::float_ops::is_one(_desc.alpha));
- auto has_beta = _blueprint->impl().get_kernel_argument_info(_bias.arg_id) != nullptr;
- auto reinterpret_input_as_3d = _desc.reinterpret_input_as_3d && _desc.depth_output_gemm3d == 0;
- auto reinterpret_output_as_3d = !_desc.reinterpret_input_as_3d && _desc.depth_output_gemm3d != 0;
- auto dont_slide_b = t_rhs_info->num_dimensions() < t_lhs_info->num_dimensions();
-
- lut["K0"] = support::cpp11::to_string(_desc.rhs_info.k0);
-
- if(has_alpha)
- {
- lut["ALPHA"] = float_to_string_with_full_precision(_desc.alpha);
- }
- if(has_beta)
- {
- lut["BETA"] = float_to_string_with_full_precision(_desc.beta);
- }
- if(dont_slide_b)
- {
- lut["MATRIX_B_DEPTH"] = support::cpp11::to_string(t_rhs_info->dimension(2));
- }
-
- if(reinterpret_output_as_3d)
- {
- lut["HEIGHT_GEMM3D"] = support::cpp11::to_string(t_dst_info->dimension(1));
- lut["DEPTH_GEMM3D"] = support::cpp11::to_string(t_dst_info->dimension(2));
- }
- else if(reinterpret_input_as_3d)
- {
- lut["HEIGHT_GEMM3D"] = support::cpp11::to_string(t_lhs_info->dimension(1));
- lut["DEPTH_GEMM3D"] = support::cpp11::to_string(t_lhs_info->dimension(2));
- }
-
- return lut;
-}
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-
-#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
\ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h
deleted file mode 100644
index b282856b56..0000000000
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
-
-#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLGEMMNATIVEKERNELCOMPONENT_H
-#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLGEMMNATIVEKERNELCOMPONENT_H
-
-#include "arm_compute/core/Steps.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h"
-#include "src/core/helpers/AutoConfiguration.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-class ClGemmNativeKernelComponent : public IClKernelComponent
-{
-public:
- ClGemmNativeKernelComponent(const ClKernelBlueprint *blueprint, const GemmNativeDescriptor &desc,
- const Link &lhs, const Link &rhs, const Link &dst, const Link &bias = Link{})
- : IClKernelComponent(blueprint), _desc{ desc }, _lhs{ lhs }, _rhs{ rhs }, _bias{ bias }, _dst{ dst }
- {
- }
-
- ComponentType get_component_type() const override;
- std::set<std::string> get_headers_list() const override;
- std::string get_additional_macros() const override;
- std::string get_component_code() const override;
- Window get_window() const override;
- ClKernelArgList get_args();
- CLBuildOptions generate_build_options() const override;
- std::string generate_config_id() const override;
-
- virtual std::vector<Link> get_links() const override
- {
- return { _lhs, _rhs, _bias, _dst };
- }
-
- virtual TagLUT allocate_vars(SharedVarTable &vtable) const override;
-
- virtual std::string name() const override
- {
- return "gemm_mm_native_" + std::to_string(id());
- }
-
-private:
- GemmNativeDescriptor _desc{};
- Link _lhs{};
- Link _rhs{};
- Link _bias{};
- Link _dst{};
-};
-
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-#endif // ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLGEMMNATIVEKERNELCOMPONENT_H
-
-#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) \ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClKernelComponents.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClKernelComponents.h
index de02f948e9..c6716a0a23 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClKernelComponents.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClKernelComponents.h
@@ -21,16 +21,15 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_CLKERNELCOMPONENTS_H
#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_CLKERNELCOMPONENTS_H
#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h"
#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h"
-#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h"
#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h"
-#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_CLKERNELCOMPONENTS_H
-
-#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) \ No newline at end of file
+#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_CLKERNELCOMPONENTS_H \ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp
index 5f023ba528..e0b210f4ed 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp
@@ -21,7 +21,9 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h"
@@ -65,25 +67,36 @@ std::string ClStoreBlockBoundaryAwareKernelComponent::get_component_code() const
CLBuildOptions ClStoreBlockBoundaryAwareKernelComponent::generate_build_options() const
{
auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
- auto tile_info = _blueprint->impl().get_tile_info();
+ // auto tile_info = _blueprint->impl().get_tile_info();
CLBuildOptions build_opts{};
+ const auto n0 = _blueprint->impl().get_execution_window().x().step();
+ const auto m0 = _blueprint->impl().get_execution_window().y().step();
+ const auto partial_m0 = t_dst_info->dimension(0) % m0;
+ const auto partial_n0 = t_dst_info->dimension(1) % n0;
+
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(t_dst_info->data_type()));
- build_opts.add_option("-DM0=" + support::cpp11::to_string(tile_info.tile_dims.y()));
- build_opts.add_option("-DN0=" + support::cpp11::to_string(tile_info.tile_dims.x()));
- build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(tile_info.boundaries.y() % tile_info.tile_dims.y()));
- build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(tile_info.boundaries.x() % tile_info.tile_dims.x()));
+ build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
+ build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
+ build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_m0));
+ build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_n0));
return build_opts;
}
-ClStoreBlockBoundaryAwareKernelComponent::TagLUT ClStoreBlockBoundaryAwareKernelComponent::allocate_vars(SharedVarTable &vtable) const
+void ClStoreBlockBoundaryAwareKernelComponent::allocate_shared_vars(SharedVarTable &vtable) const
+{
+ vtable.add(_src, _blueprint->impl().group(_src.arg_id), ClKernelArgDescriptor(_src.arg_id, ClKernelTensorArgType::Image_3D), "src");
+ vtable.add(_dst, _blueprint->impl().group(_dst.arg_id), ClKernelArgDescriptor(_dst.arg_id, ClKernelTensorArgType::Image_3D), "dst");
+}
+
+ClStoreBlockBoundaryAwareKernelComponent::TagLUT ClStoreBlockBoundaryAwareKernelComponent::get_tag_lut(const SharedVarTable &vtable) const
{
return {
{ "meta_kernel_id", id() },
- { "src", vtable.add(_src, ClKernelArgRuntimeDescriptor(_src.arg_id, TensorArgType::Image_3D), "src") },
- { "dst", vtable.add(_dst, ClKernelArgRuntimeDescriptor(_dst.arg_id, TensorArgType::Image_3D), "dst") },
+ { "src", vtable.get(_src) },
+ { "dst", vtable.get(_dst) },
};
}
@@ -96,19 +109,26 @@ std::string ClStoreIndirectWidthSelectKernelComponent::get_component_code() cons
{
return R"_(
//------------------ START KERNEL {{meta_kernel_id}} STORE ---------------------
+ {
+ #define _IDST_WIDTH {{dst}}_w
+ #define _IDST_HEIGHT {{dst}}_h
+ TILE(uint, M0, 1, dst_indirect_y);
- TILE(uint, M0, 1, dst_indirect_y);
+ // Calculate the destination indirect Y
+ LOOP_UNROLLING(int, i, 0, 1, M0,
+ {
+ dst_indirect_y[i].v = (uint)min(mout + i, (int)(_IDST_WIDTH * _IDST_HEIGHT) - 1);
+ dst_indirect_y[i].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT);
+ })
- // Calculate the destination indirect Y
- LOOP_UNROLLING(int, i, 0, 1, M0,
- {
- dst_indirect_y[i].v = (uint)min(mout + i, (int)({{dst_w}} * {{dst_h}}) - 1);
- dst_indirect_y[i].v += bout * (int)({{dst_w}} * {{dst_h}});
- })
+ bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
- T_STORE_INDIRECT_WIDTH_SELECT({{DST_DATA_TYPE}}, M0, N0, PARTIAL_N0, {{DST_TENSOR_TYPE}}, {{dst}}, cout, {{dst}}_stride_y, PARTIAL_N0 != 0 && g_cond_x, {{src}}, dst_indirect_y);
+ T_STORE_INDIRECT_WIDTH_SELECT({{DST_DATA_TYPE}}, M0, N0, PARTIAL_N0, {{DST_TENSOR_TYPE}}, {{dst}}, cout, {{dst}}_stride_y, x_cond, {{src}}, dst_indirect_y);
- //------------------ END KERNEL {{meta_kernel_id}} STORE ---------------------
+ #undef _IDST_WIDTH
+ #undef _IDST_HEIGHT
+ //------------------ END KERNEL {{meta_kernel_id}} STORE ---------------------
+ }
)_";
}
@@ -120,21 +140,24 @@ CLBuildOptions ClStoreIndirectWidthSelectKernelComponent::generate_build_options
return build_opts;
}
-ClStoreIndirectWidthSelectKernelComponent::TagLUT ClStoreIndirectWidthSelectKernelComponent::allocate_vars(SharedVarTable &vtable) const
+void ClStoreIndirectWidthSelectKernelComponent::allocate_shared_vars(SharedVarTable &vtable) const
+{
+ vtable.add(_src, _blueprint->impl().group(_src.arg_id), ClKernelArgDescriptor(_src.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "src");
+ vtable.add(_dst, _blueprint->impl().group(_dst.arg_id), ClKernelArgDescriptor(_dst.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "dst");
+}
+
+ClStoreIndirectWidthSelectKernelComponent::TagLUT ClStoreIndirectWidthSelectKernelComponent::get_tag_lut(const SharedVarTable &vtable) const
{
TagLUT lut{};
- lut["meta_kernel_id"] = id();
- lut["src"] = vtable.add(_src, ClKernelArgRuntimeDescriptor(_src.arg_id, TensorArgType::Image_3D), "src");
- lut["dst"] = vtable.add(_dst, ClKernelArgRuntimeDescriptor(_dst.arg_id, TensorArgType::Tensor_4D_t_Buffer), "dst");
+ // Arguments and global shared variables
+ lut["src"] = vtable.get(_src);
+ lut["dst"] = vtable.get(_dst);
// Local build options
- auto dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
-
- lut["dst_w"] = dst_info->dimension(1);
- lut["dst_h"] = dst_info->dimension(2);
-
+ lut["meta_kernel_id"] = id();
lut["DST_TENSOR_TYPE"] = "BUFFER";
+ const auto dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
lut["DST_DATA_TYPE"] = dst_info->data_type();
return lut;
@@ -142,6 +165,4 @@ ClStoreIndirectWidthSelectKernelComponent::TagLUT ClStoreIndirectWidthSelectKern
} // namespace dynamic_fusion
} // namespace experimental
-} // namespace arm_compute
-
-#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) \ No newline at end of file
+} // namespace arm_compute \ No newline at end of file
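
For reference, a minimal standalone sketch (not part of the patch) of the partial-store arithmetic used above: the leftover block in each dimension is simply the remainder of the tensor extent divided by the execution-window step, and a value of 0 means every block is full-sized. The steps and extents below are made-up example values.

#include <cstdio>

// Leftover (partial) block when a dimension of size `dim` is tiled with step `step`
static unsigned int partial_block(unsigned int dim, unsigned int step)
{
    return dim % step;
}

int main()
{
    const unsigned int m0 = 4, n0 = 16;         // assumed execution-window steps
    const unsigned int height = 30, width = 70; // assumed dst extents
    std::printf("PARTIAL_STORE_M0=%u PARTIAL_STORE_N0=%u\n",
                partial_block(height, m0), partial_block(width, n0));
    return 0;
}
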
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h
index c7da8bd3e8..26883d7fa0 100644
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h
+++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h
@@ -21,7 +21,9 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLSTOREKERNELCOMPONENTS_H
#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLSTOREKERNELCOMPONENTS_H
@@ -37,21 +39,21 @@ namespace dynamic_fusion
class ClStoreBlockBoundaryAwareKernelComponent : public IClKernelComponent
{
public:
- ClStoreBlockBoundaryAwareKernelComponent(const ClKernelBlueprint *blueprint, const Link &src, const Link &dst)
+ ClStoreBlockBoundaryAwareKernelComponent(ClKernelBlueprint *blueprint, const Link &src, const Link &dst)
: IClKernelComponent(blueprint), _src{ src }, _dst{ dst }
{
}
ComponentType get_component_type() const override;
std::string get_component_code() const override;
CLBuildOptions generate_build_options() const override;
+ TagLUT get_tag_lut(const SharedVarTable &vtable) const override;
+ void allocate_shared_vars(SharedVarTable &vtable) const override;
virtual std::vector<Link> get_links() const override
{
return { _src, _dst };
}
- virtual TagLUT allocate_vars(SharedVarTable &vtable) const override;
-
virtual std::string name() const override
{
return "";
@@ -65,21 +67,21 @@ private:
class ClStoreIndirectWidthSelectKernelComponent : public IClKernelComponent
{
public:
- ClStoreIndirectWidthSelectKernelComponent(const ClKernelBlueprint *blueprint, const Link &src, const Link &dst)
+ ClStoreIndirectWidthSelectKernelComponent(ClKernelBlueprint *blueprint, const Link &src, const Link &dst)
: IClKernelComponent(blueprint), _src{ src }, _dst{ dst }
{
}
ComponentType get_component_type() const override;
std::string get_component_code() const override;
CLBuildOptions generate_build_options() const override;
+ virtual TagLUT get_tag_lut(const SharedVarTable &vtable) const override;
+ void allocate_shared_vars(SharedVarTable &vtable) const override;
virtual std::vector<Link> get_links() const override
{
return { _src, _dst };
}
- virtual TagLUT allocate_vars(SharedVarTable &vtable) const override;
-
virtual std::string name() const override
{
return "";
@@ -93,6 +95,4 @@ private:
} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
-#endif // ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLSTOREKERNELCOMPONENTS_H
-
-#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) \ No newline at end of file
+#endif // ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLSTOREKERNELCOMPONENTS_H \ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/OperatorGraph.cpp b/src/core/experimental/dynamic_fusion/OperatorGraph.cpp
new file mode 100644
index 0000000000..5dbf2f660d
--- /dev/null
+++ b/src/core/experimental/dynamic_fusion/OperatorGraph.cpp
@@ -0,0 +1,236 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
+#include "arm_compute/core/experimental/OperatorGraph.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.h"
+#include "src/core/helpers/AutoConfiguration.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+void check_dependency_graph_op_success(OperatorGraph &graph, const Status &status)
+{
+ if(!bool(status))
+ {
+ graph.impl()->status = Status{ status.error_code(), "Cycles or loops are not allowed" };
+ }
+}
+
+// Check if there is more than one root in the graph
+void check_multiple_roots(OperatorGraph &graph)
+{
+ if(graph.impl()->graph.get_root_ops().size() > 1)
+ {
+ graph.impl()->status = Status{ ErrorCode::RUNTIME_ERROR, "Multiple roots are not allowed" };
+ }
+}
+
+void check_execution_shape(OperatorGraph &graph, const ITensorInfo &dst_info)
+{
+ const auto roots = graph.impl()->graph.get_root_ops();
+ for(auto root : roots)
+ {
+ // We assume exactly 1 dst tensor for all operators
+ const auto root_info = graph.impl()->tensors[graph.impl()->graph.dst_tensors(root)[0]]->get_tensor_info();
+ for(unsigned int dim = 0; dim < root_info->num_dimensions(); ++dim)
+ {
+ if(root_info->dimension(dim) != dst_info.dimension(dim))
+ {
+ graph.impl()->status = Status{ ErrorCode::RUNTIME_ERROR, "Cannot change execution space" };
+ return;
+ }
+ }
+ }
+}
+} // namespace
+
+OpTensor::OpTensor(Id id)
+ : _id{ id }
+{
+}
+
+OpTensor::Id OpTensor::id() const
+{
+ return _id;
+}
+
+bool operator<(const OpTensor &t0, const OpTensor &t1)
+{
+ return t0.id() < t1.id();
+}
+
+Operator::Operator(Id id)
+ : _id{ id }
+{
+}
+
+Operator::Id Operator::id() const
+{
+ return _id;
+}
+
+bool operator<(const Operator &op0, const Operator &op1)
+{
+ return op0.id() < op1.id();
+}
+
+OperatorGraph::OperatorGraph()
+ : _impl{ std::make_unique<Implementation>() }
+{
+}
+
+OperatorGraph::~OperatorGraph() = default;
+
+OperatorGraph::Implementation *OperatorGraph::impl()
+{
+ return _impl.get();
+}
+
+const OperatorGraph::Implementation *OperatorGraph::impl() const
+{
+ return _impl.get();
+}
+
+Status validate(const OperatorGraph &graph)
+{
+ return graph.impl()->status;
+}
+
+OpTensor add_tensor(OperatorGraph &graph, ITensorInfo &info)
+{
+ auto id = graph.impl()->graph.add_tensor();
+ OpTensor op_tensor(id);
+ graph.impl()->add_tensor(id, &info);
+ return op_tensor;
+}
+
+Operator add_op_conv2d(OperatorGraph &graph, const Conv2dDescriptor &desc, OpTensor input, OpTensor weights, OpTensor bias, OpTensor dst)
+{
+ // A complex operator can only be the root, so reject if the graph already has any roots
+ if(!graph.impl()->graph.get_root_ops().empty())
+ {
+ graph.impl()->status = Status{ ErrorCode::RUNTIME_ERROR, "Cannot add multiple complex operators" };
+ return Operator{};
+ }
+
+ std::pair<Status, DependencyGraph::Id> status_id;
+
+ if(bias.id() == -1)
+ {
+ status_id = graph.impl()->graph.add_operator({ input.id(), weights.id() }, { dst.id() });
+ }
+ else
+ {
+ status_id = graph.impl()->graph.add_operator({ input.id(), weights.id(), bias.id() }, { dst.id() });
+ }
+
+ check_dependency_graph_op_success(graph, status_id.first);
+
+ Operator op_node(status_id.second);
+
+ // Infer TensorInfo
+ OpTensorContent *dst_tensor = graph.impl()->tensors[dst.id()].get();
+ if(dst_tensor->get_tensor_info()->total_size() == 0)
+ {
+ auto src = graph.impl()->tensors[input.id()]->get_tensor_info();
+ auto wts = graph.impl()->tensors[weights.id()]->get_tensor_info();
+ auto shape = misc::shape_calculator::compute_deep_convolution_shape(src->tensor_shape(), src->data_layout(), wts->tensor_shape(), PadStrideInfo(desc.stride.x(), desc.stride.y(), desc.pad.left,
+ desc.pad.right,
+ desc.pad.top, desc.pad.bottom, DimensionRoundingType::FLOOR)); // use the default DimensionRoundingType
+
+ auto_init_if_empty(*(dst_tensor->get_tensor_info()), src->clone()->set_tensor_shape(shape));
+ }
+
+ // Check execution space
+ auto dst_info = dst_tensor->get_tensor_info();
+ check_execution_shape(graph, *dst_info);
+
+ ITensorDescPack<OpTensorContent> tensors;
+ tensors.add_const_tensor(ACL_SRC_0, graph.impl()->tensors[input.id()].get());
+ tensors.add_const_tensor(ACL_SRC_1, graph.impl()->tensors[weights.id()].get());
+ if(bias.id() != -1)
+ {
+ tensors.add_const_tensor(ACL_SRC_2, graph.impl()->tensors[bias.id()].get());
+ }
+ tensors.add_const_tensor(ACL_DST_0, graph.impl()->tensors[dst.id()].get());
+
+ graph.impl()->add_node<Conv2dContent>(status_id.second, desc, tensors);
+ check_multiple_roots(graph);
+
+ return op_node;
+}
+
+Operator add_op_conv2d(OperatorGraph &graph, const Conv2dDescriptor &desc, OpTensor input, OpTensor weights, OpTensor dst)
+{
+ return add_op_conv2d(graph, desc, input, weights, OpTensor(-1), dst);
+}
+
+void force_conv2d_method(OperatorGraph &graph, Operator conv2d, ConvolutionMethod method)
+{
+ auto node = utils::cast::polymorphic_downcast<Conv2dContent *>(graph.impl()->operators[conv2d.id()].get());
+ node->set_method(method);
+}
+
+Operator add_op_elementwise_add(OperatorGraph &graph, const AddDescriptor &desc, OpTensor lhs, OpTensor rhs, OpTensor dst)
+{
+ auto id = graph.impl()->graph.add_operator({ rhs.id(), lhs.id() }, { dst.id() });
+ check_dependency_graph_op_success(graph, id.first);
+
+ Operator op_node(id.second);
+
+ // Infer TensorInfo
+ auto node_lhs = graph.impl()->tensors[lhs.id()]->get_tensor_info();
+ auto node_rhs = graph.impl()->tensors[rhs.id()]->get_tensor_info();
+ OpTensorContent *node_dst = graph.impl()->tensors[dst.id()].get();
+
+ if(node_dst->get_tensor_info()->total_size() == 0)
+ {
+ const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*node_rhs, *node_lhs);
+ auto_init_if_empty(*(node_dst->get_tensor_info()), node_lhs->clone()->set_tensor_shape(broadcast_pair.first));
+ }
+
+ // Check execution space
+ auto dst_info = node_dst->get_tensor_info();
+ check_execution_shape(graph, *dst_info);
+
+ ITensorDescPack<OpTensorContent> tensors;
+ tensors.add_const_tensor(ACL_SRC_0, graph.impl()->tensors[lhs.id()].get());
+ tensors.add_const_tensor(ACL_SRC_1, graph.impl()->tensors[rhs.id()].get());
+ tensors.add_const_tensor(ACL_DST_0, graph.impl()->tensors[dst.id()].get());
+ graph.impl()->add_node<AddContent>(id.second, desc, tensors);
+ check_multiple_roots(graph);
+
+ return op_node;
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute \ No newline at end of file
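
As a usage sketch of the interface implemented above (not part of the patch), the snippet below builds a conv2d whose result feeds an elementwise add. The function name and the caller-provided tensor infos are hypothetical; acc_info is assumed to be an empty TensorInfo so that add_op_conv2d infers its shape.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/experimental/OperatorGraph.h"

using namespace arm_compute;
using namespace arm_compute::experimental::dynamic_fusion;

Status build_conv2d_add(OperatorGraph &graph, ITensorInfo &src_info, ITensorInfo &wei_info,
                        ITensorInfo &acc_info, ITensorInfo &addend_info, ITensorInfo &dst_info)
{
    const OpTensor t_src    = add_tensor(graph, src_info);
    const OpTensor t_wei    = add_tensor(graph, wei_info);
    const OpTensor t_acc    = add_tensor(graph, acc_info);    // empty info: shape is auto-inferred
    const OpTensor t_addend = add_tensor(graph, addend_info);
    const OpTensor t_dst    = add_tensor(graph, dst_info);

    add_op_conv2d(graph, Conv2dDescriptor{}, t_src, t_wei, t_acc);          // complex root operator
    add_op_elementwise_add(graph, AddDescriptor{}, t_acc, t_addend, t_dst); // simple operator to be fused

    return validate(graph); // reports cycles, multiple roots or execution-shape mismatches
}
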
diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.cpp b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.cpp
new file mode 100644
index 0000000000..7e9f6b870a
--- /dev/null
+++ b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.cpp
@@ -0,0 +1,233 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
+#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+std::vector<std::pair<ClKernelFusionGroup *, ClKernelFusionGroup *>> get_combinations(const std::vector<ClKernelFusionGroup *> &sorted_fgs)
+{
+ ARM_COMPUTE_ERROR_ON(sorted_fgs.size() <= 1);
+ std::vector<std::pair<ClKernelFusionGroup *, ClKernelFusionGroup *>> combo;
+ for(size_t i = 0; i < sorted_fgs.size() - 1; ++i)
+ {
+ for(size_t j = i + 1; j < sorted_fgs.size(); ++j)
+ {
+ combo.push_back(std::make_pair(sorted_fgs.at(i), sorted_fgs.at(j)));
+ }
+ }
+ return combo;
+}
+} // namespace
+std::vector<const ClKernel *> traverse(const ClKernelFusionGroup &group)
+{
+ std::vector<const ClKernel *> kernels;
+ const auto sorted = group.graph.topological_sort();
+ for(const auto &pack : sorted.second)
+ {
+ kernels.push_back(group.fused_kernels.at(pack.op));
+ }
+ return kernels;
+}
+
+std::vector<const ClKernelFusionGroup *> traverse(const ClFusedKernelGraph &graph)
+{
+ std::vector<const ClKernelFusionGroup *> kernels;
+ const auto sorted = graph.fg_dependency.topological_sort();
+ for(const auto &pack : sorted.second)
+ {
+ kernels.push_back(graph.fusion_groups.at(pack.op).get());
+ }
+ return kernels;
+}
+
+std::vector<ClKernelFusionGroup *> traverse(ClFusedKernelGraph &graph)
+{
+ std::vector<ClKernelFusionGroup *> kernels;
+ const auto sorted = graph.fg_dependency.topological_sort();
+ for(const auto &pack : sorted.second)
+ {
+ kernels.push_back(graph.fusion_groups.at(pack.op).get());
+ }
+ return kernels;
+}
+
+std::pair<Status, ClFusedKernelGraph> init_fusion_graph(const ClKernelGraph &kernel_graph)
+{
+ ClFusedKernelGraph fused_kernel_graph{};
+ fused_kernel_graph.original_graph = &kernel_graph; // Keep a pointer to the original kernel graph
+ fused_kernel_graph.fg_dependency = DependencyGraph();
+ // Initialize all fusion groups
+ for(const auto &kernel : traverse(kernel_graph))
+ {
+ fused_kernel_graph.add_fusion_group({ kernel });
+ }
+ return { Status{}, fused_kernel_graph };
+}
+
+Status fuse(ClFusedKernelGraph &fused_kernel_graph)
+{
+ // A naive fusion algorithm that is guaranteed to find the optimal pattern if there are no branches
+ // If there are branches, the algorithm cannot guarantee optimality as it does not perform any searches
+
+ bool fusion_found = false;
+ do
+ {
+ fusion_found = false;
+ const auto sorted_fgs = traverse(fused_kernel_graph);
+ if(sorted_fgs.size() <= 1)
+ {
+ // Only zero or one fusion group, so there is nothing to fuse
+ return Status{};
+ }
+ auto fgs_combo = get_combinations(sorted_fgs);
+ for(auto fgs : fgs_combo)
+ {
+ auto fg0 = fgs.first;
+ auto fg1 = fgs.second;
+ const auto st = fused_kernel_graph.can_fuse(*fg0, *fg1);
+ if(bool(st))
+ {
+ const auto st = fused_kernel_graph.fuse(*fg0, *fg1);
+ if(!bool(st))
+ {
+ return st;
+ }
+ fusion_found = true;
+ break;
+ }
+ }
+ }
+ while(fusion_found);
+ return Status{};
+}
+Status generate_store(ClKernelBlueprint &bp, const ClFusedKernelGraph &fused_kernel_graph, const ClKernelFusionGroup &fg)
+{
+ Status st{};
+ for(const auto &dst_t_id : fused_kernel_graph.fg_dependency.dst_tensors(fg.id))
+ {
+ const auto dst_t = fused_kernel_graph.original_graph->get_tensor(dst_t_id);
+
+ /// NOTE: dst tensor must have already been added to the blueprint at this point
+ ArgumentID dst_id;
+ st = add_tensor(bp, dst_t->desc, dst_id, dst_t->id);
+ if(!bool(st))
+ {
+ return st;
+ }
+ /// NOTE: the extra dst tensor is needed as the store kcomp requires 2 tensors. But this is irrelevant to the fused kernel graph
+ /// since both tensors share the exact same info and kernel arg descriptor
+ ArgumentID dst_dst_id;
+ st = add_tensor(bp, dst_t->desc, dst_dst_id);
+ if(!bool(st))
+ {
+ return st;
+ }
+ /// NOTE: Update the merge point map to link dst_dst_id with dst_t->id instead.
+ /// This is required because the blueprint's get_arguments() returns the dst tensor added by the store component
+ st = update_merge_point(bp, dst_dst_id, dst_t->id);
+ if(!bool(st))
+ {
+ return st;
+ }
+ st = add_kcomp_store(bp, fg.get_root_kernel()->config().store_type, dst_id, dst_dst_id);
+ if(!bool(st))
+ {
+ return st;
+ }
+ }
+ return st;
+}
+
+Status generate(ClWorkload &workload, const ClWorkloadContext &ctx, const ClFusedKernelGraph &fused_kernel_graph)
+{
+ workload.context = ctx;
+ for(const auto &fg : traverse(fused_kernel_graph))
+ {
+ ClKernelBlueprint bp{};
+ for(const auto &kernel : traverse(*fg))
+ {
+ const auto st = kernel->generate(bp);
+ if(!bool(st))
+ {
+ return st;
+ }
+ }
+ auto st = set_tile_info(bp, fg->get_root_kernel()->config().tile_desc);
+ if(!bool(st))
+ {
+ return st;
+ }
+ st = generate_store(bp, fused_kernel_graph, *fg);
+ if(!bool(st))
+ {
+ return st;
+ }
+
+ ClKernelCode code{};
+ st = build(code, ClCodeBuilderContext{ ctx.gpu_info }, bp);
+ if(!bool(st))
+ {
+ return st;
+ }
+ const auto bp_graph = get_dependency_graph(bp);
+
+ // Get tensor info
+ std::vector<Id> workload_src_tensors{};
+ for(const auto &src_t_id : fused_kernel_graph.fg_dependency.src_tensors(fg->id))
+ {
+ const auto src_t = fused_kernel_graph.original_graph->get_tensor(src_t_id);
+ // Get corresponding kernel arg descriptor
+ const auto arg_desc = code.arguments.at(bp_graph.get_merge_points().at(src_t->id));
+ const auto kernel_t_id = workload.add_workload_tensor(src_t->desc, src_t->memory_type, src_t->memory_info, arg_desc, src_t->id);
+ workload_src_tensors.push_back(kernel_t_id);
+ }
+ std::vector<Id> workload_dst_tensors{};
+ for(const auto &dst_t_id : fused_kernel_graph.fg_dependency.dst_tensors(fg->id))
+ {
+ const auto dst_t = fused_kernel_graph.original_graph->get_tensor(dst_t_id);
+ // Get corresponding kernel arg descriptor
+ const auto arg_desc = code.arguments.at(bp_graph.get_merge_points().at(dst_t->id));
+ const auto kernel_t_id = workload.add_workload_tensor(dst_t->desc, dst_t->memory_type, dst_t->memory_info, arg_desc, dst_t->id);
+ workload_dst_tensors.push_back(kernel_t_id);
+ }
+
+ workload.add_unit_workload(fg->get_root_kernel()->config().stage, code, workload_src_tensors, workload_dst_tensors);
+ }
+
+ return Status{};
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute \ No newline at end of file
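
Taken together, the three entry points above compose into a small driver. A sketch of that flow (the function name is hypothetical, and the ClKernelGraph, ClWorkload and ClWorkloadContext are assumed to be created by the caller):

#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.h"

using namespace arm_compute;
using namespace arm_compute::experimental::dynamic_fusion;

Status build_workload(ClWorkload &workload, const ClWorkloadContext &ctx, const ClKernelGraph &kernel_graph)
{
    auto fused = init_fusion_graph(kernel_graph); // start with one fusion group per kernel
    if(!bool(fused.first))
    {
        return fused.first;
    }
    const auto fuse_st = fuse(fused.second);      // greedily merge directly connected groups
    if(!bool(fuse_st))
    {
        return fuse_st;
    }
    return generate(workload, ctx, fused.second); // one unit workload per remaining fusion group
}
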
diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.h b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.h
new file mode 100644
index 0000000000..4bd3cd9d8b
--- /dev/null
+++ b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.h
@@ -0,0 +1,453 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
+#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLFUSEDKERNELGRAPH_H
+#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLFUSEDKERNELGRAPH_H
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/experimental/DependencyGraph.h"
+#include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h"
+#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h"
+#include "support/DeepCopy.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+struct ClKernelFusionGroup;
+
+/** A const view of a subgraph of the @ref ClKernelGraph to be fused together
+ *
+ */
+struct ClKernelFusionGroup
+{
+public:
+ using Id = DependencyGraph::Id;
+
+ ClKernelFusionGroup() = default;
+ ClKernelFusionGroup(Id id)
+ : id{ id }, graph{}, fused_kernels{}, tensors{}
+ {
+ }
+ ~ClKernelFusionGroup() = default;
+
+ void set_id(Id i)
+ {
+ id = i;
+ }
+
+ Id add_fused_kernel(const ClKernel *kernel)
+ {
+ /// PRE: Acyclicity ensured by DependencyGraph
+ /// PRE: Connectedness ensured by DependencyGraph
+ /// PRE: Single-rootedness ensured by User
+ std::vector<Id> src_tensors;
+ for(const auto t : kernel->tensors().get_const_src_tensors())
+ {
+ auto id = graph.add_tensor(t->id);
+ if(tensors.find(id) == tensors.end())
+ {
+ tensors[id] = t;
+ }
+ src_tensors.push_back(id);
+ }
+ std::vector<Id> dst_tensors;
+ for(const auto t : kernel->tensors().get_const_dst_tensors())
+ {
+ auto id = graph.add_tensor(t->id);
+ if(tensors.find(id) == tensors.end())
+ {
+ tensors[id] = t;
+ }
+ dst_tensors.push_back(id);
+ }
+ auto id = graph.add_operator(src_tensors, dst_tensors);
+ fused_kernels[id.second] = kernel;
+ return id.second;
+ }
+
+ const ClKernel *get_root_kernel() const
+ {
+ auto root_kernels = graph.get_root_ops();
+ ARM_COMPUTE_ERROR_ON(root_kernels.size() != 1);
+ return fused_kernels.at(root_kernels.at(0));
+ }
+
+ std::vector<const ClKernelTensor *> get_src_tensors() const
+ {
+ std::vector<const ClKernelTensor *> src_tensors;
+ for(auto tensor_id : graph.src_tensors())
+ {
+ src_tensors.push_back(tensors.at(tensor_id));
+ }
+ return src_tensors;
+ }
+
+ std::vector<const ClKernelTensor *> get_dst_tensors() const
+ {
+ std::vector<const ClKernelTensor *> dst_tensors;
+ for(auto tensor_id : graph.dst_tensors())
+ {
+ dst_tensors.push_back(tensors.at(tensor_id));
+ }
+ return dst_tensors;
+ }
+
+ friend bool operator==(const ClKernelFusionGroup &fg0, const ClKernelFusionGroup &fg1)
+ {
+ return fg0.id == fg1.id && fg0.graph == fg1.graph && fg0.fused_kernels == fg1.fused_kernels && fg0.tensors == fg1.tensors;
+ }
+
+ Id id{};
+ DependencyGraph graph{}; // A subgraph of the original ClKernelGraph
+ std::map<Id, const ClKernel *> fused_kernels{};
+ std::map<Id, const ClKernelTensor *> tensors{};
+};
+
+std::vector<const ClKernel *> traverse(const ClKernelFusionGroup &group);
+
+struct ClFusedKernelGraph
+{
+public:
+ using Id = DependencyGraph::Id;
+
+ using KernelFusionGroupMap = std::map<Id, utils::memory::deep_unique_ptr<ClKernelFusionGroup>>;
+
+ ClFusedKernelGraph() = default;
+ ~ClFusedKernelGraph() = default;
+ ClFusedKernelGraph(const ClFusedKernelGraph &graph) = default;
+ ClFusedKernelGraph &operator=(const ClFusedKernelGraph &graph) = default;
+ ClFusedKernelGraph(ClFusedKernelGraph &&graph) = default;
+ ClFusedKernelGraph &operator=(ClFusedKernelGraph &&graph) = default;
+
+ friend bool operator==(const ClFusedKernelGraph &graph0, const ClFusedKernelGraph &graph1)
+ {
+ /// NOTE: fg_dependency may change based on the order of fusion, and thus is omitted in the comparison.
+ /// The fusion groups can already guarantee the equivalence of fusion
+ /// In the future we may want to enforce a stronger equivalence by implementing topological comparison between @ref DependencyGraph s
+ return graph0.original_graph == graph1.original_graph && graph0.fusion_groups == graph1.fusion_groups;
+ }
+
+ Id add_fusion_group(const std::vector<const ClKernel *> &fused_kernels)
+ {
+ auto fg = utils::memory::make_deep_unique<ClKernelFusionGroup, ClKernelFusionGroup>();
+ for(const auto k : fused_kernels)
+ {
+ fg->add_fused_kernel(k);
+ }
+ const auto src_tensors = fg->get_src_tensors();
+ const auto dst_tensors = fg->get_dst_tensors();
+ std::vector<Id> inputs{};
+ std::transform(std::begin(src_tensors), std::end(src_tensors), std::back_inserter(inputs), [this](auto kernel)
+ {
+ return fg_dependency.add_tensor(kernel->id);
+ });
+ std::vector<Id> outputs{};
+ std::transform(std::begin(dst_tensors), std::end(dst_tensors), std::back_inserter(outputs), [this](auto kernel)
+ {
+ return fg_dependency.add_tensor(kernel->id);
+ });
+ const auto id = fg_dependency.add_operator(inputs, outputs);
+ fg->set_id(id.second);
+ fusion_groups[id.second] = std::move(fg);
+ return id.second;
+ }
+
+ Status fuse(ClKernelFusionGroup &fg0, ClKernelFusionGroup &fg1)
+ {
+ /// PRE: Already checked by can_fuse, and thus all the INVs and ASSUMPTIONS still hold
+ ClKernelFusionGroup *fg_src{};
+ ClKernelFusionGroup *fg_dst{};
+ // Find fg_src (parent / root) and fg_dst (child / non-root)
+ if(is_in(fg1.id, fg_dependency.dst_ops(fg0.id)))
+ {
+ fg_src = &fg0;
+ fg_dst = &fg1;
+ }
+ else if(is_in(fg0.id, fg_dependency.dst_ops(fg1.id)))
+ {
+ fg_src = &fg1;
+ fg_dst = &fg0;
+ }
+ else
+ {
+ return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: Not directly connected fusion groups cannot be fused together" };
+ }
+
+ for(const auto &t : fg_dependency.src_tensors(fg_dst->id))
+ {
+ if(!is_in(t, fg_dependency.dst_tensors(fg_src->id)))
+ {
+ // Link any incoming tensors of fg_dst, that ARE NOT in between fg_src and fg_dst, to fg_src
+
+ // Before:
+ // fg_src
+ // |
+ // .. t1
+ // | |
+ // -> fg_dst <-
+ //
+ // After:
+ // fg_src <---t1
+ //
+ const auto st = link_src_tensors(fg_src->id, { t });
+ if(!bool(st))
+ {
+ return st;
+ }
+ }
+ else
+ {
+ const auto dst_fgs = fg_dependency.dst_ops_from_tensor(t);
+ if(dst_fgs.size() == 1U && dst_fgs.at(0) == fg_dst->id)
+ {
+ // Remove any incoming tensors of fg_dst, that ARE in between fg_src and fg_dst
+ // AND that are not connected to any other outgoing fgs (Note that they cannot connect to any other incoming fgs as all tensors can have at most 1 incoming fg (ASSUMPTION 3))
+
+ // Before:
+ // fg_src
+ // |
+ // t0
+ // |
+ // -> fg_dst
+ //
+ // After:
+ // fg_src
+ //
+ const auto st = remove_fg_tensor(t);
+ if(!bool(st))
+ {
+ return st;
+ }
+ }
+ else
+ {
+ // If the tensors ARE in between fg_src and fg_dst
+ // BUT have any other outgoing fgs than fg_dst, then we leave it as a dst tensor to the fused fg_src
+
+ // Before:
+ // fg_src
+ // |
+ // t0
+ // |
+ // |-----------
+ // | |
+ // -> fg_dst -> fg_other
+ //
+ // After:
+ // fg_src
+ // |
+ // t0
+ // |
+ // -> fg_other
+ //
+
+ // Note that this may seem like a case we shouldn't fuse. But all it actually means is that t0 is an
+ // intermediate tensor between the fused fg_src and fg_dst, except that we also STORE it to memory
+ // so that any unfused fg (fg_other in this case) can read it.
+ // In other words, we can STORE not only the tensors at the "end" of a fusion group,
+ // but also any other tensors that are not source tensors. All tensors that are STORED (exported)
+ // can be termed "dst tensors" of a fusion group
+ void();
+ }
+ }
+ }
+
+ for(const auto &t : fg_dependency.dst_tensors(fg_dst->id))
+ {
+ // Link any outgoing tensors of fg_dst to fg_src
+
+ // Before:
+ // fg_src
+ // |
+ // ..
+ // |
+ // -> fg_dst
+ // |
+ // |--------
+ // | |
+ // |-> t0 |-> t1
+ //
+ // After:
+ // fg_src
+ // |
+ // |--------
+ // | |
+ // |-> t0 |-> t1
+ //
+ const auto st = link_dst_tensors(fg_src->id, { t });
+ if(!bool(st))
+ {
+ return st;
+ }
+ }
+
+ // Merge fg_dst's graph into fg_src's graph
+ for(const auto kernel : traverse(*fg_dst))
+ {
+ fg_src->add_fused_kernel(kernel);
+ }
+
+ const auto st = remove_fg(fg_dst->id);
+ return st;
+ }
+ Status can_fuse(const ClKernelFusionGroup &fg0, const ClKernelFusionGroup &fg1) const
+ {
+ /// ASSUMPTION0: All tensors have 0 or 1 incoming kernel
+ /// ASSUMPTION1: All kernels have exactly 1 dst tensor (Temporary, can be lifted once we start supporting multi-dst kernels)
+ /// Note that this does not apply to fusion groups
+ /// ASSUMPTION2: Simple kernels' tile infos can be overridden by (i.e. shared with) that of the root kernel
+ /// ASSUMPTION3: Extension of ASSUMPTION0: All tensors have 0 or 1 incoming fusion group
+ /// INV0: All Fusion groups have a single root
+ /// INV1: All Fusion groups have no cycles or loops within themselves <- guaranteed by the underlying ClKernelGraph having no cycles or loops; enforced by DependencyGraph
+ /// INV2: The ClKernelFusionGroup itself has no cycles or loops <- enforced by DependencyGraph
+ /// INV3: All non-roots are Simple kernels
+ /// INV4: All non-roots' dst tensors have the same shape as that of the root kernel
+ /// INV5: All kernels within a fusion group have the same UnitWorkloadStage
+ const ClKernelFusionGroup *fg_src {};
+ const ClKernelFusionGroup *fg_dst{};
+
+ // Check 0: Ensure fg0 and fg1 are "directly connected": one of them is a direct parent of the other
+ // This guarantees INV0
+ // This also finds fg_src (parent / root) and fg_dst (child / non-root)
+ if(is_in(fg1.id, fg_dependency.dst_ops(fg0.id)))
+ {
+ fg_src = &fg0;
+ fg_dst = &fg1;
+ }
+ else if(is_in(fg0.id, fg_dependency.dst_ops(fg1.id)))
+ {
+ fg_src = &fg1;
+ fg_dst = &fg0;
+ }
+ else
+ {
+ return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: Not directly connected fusion groups cannot be fused together" };
+ }
+
+ // Find unconnected tensors between fg_src and fg_dst
+ std::vector<Id> unconnected_tensors{};
+ for(const auto &t : fg_dependency.dst_tensors(fg_src->id))
+ {
+ if(!is_in(t, fg_dependency.src_tensors(fg_dst->id)))
+ {
+ unconnected_tensors.push_back(t);
+ }
+ }
+
+ // Check 1: Any unconnected tensor cannot be an ancestor of fg_dst
+ // This guarantees INV2: That is, the fused graph does not have any cycles or loops between different fusion groups
+ for(const auto &t : unconnected_tensors)
+ {
+ if(fg_dependency.path_exists_from_tensor_to_op(t, fg_dst->id))
+ {
+ return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: the fusion would result in cycles or loops" };
+ }
+ }
+
+ // Check 2: All non-root fgs are simple. Ensure INV3
+ if(fg_dst->get_root_kernel()->complexity() != Complexity::Simple)
+ {
+ return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: only root kernel can be a complex kernel" };
+ }
+
+ // Check 3: All non-roots' dst tensors have the same shape as that of the root kernel. Ensure INV4
+ const auto root_kernel_dst_tensors = fg_dependency.dst_tensors(fg_src->id);
+ ARM_COMPUTE_ERROR_ON(root_kernel_dst_tensors.size() != 1); // (ASSUMPTION 1: All kernels have exactly 1 dst tensor)
+ const auto root_kernel_dst_tensor_info = original_graph->get_tensor(root_kernel_dst_tensors[0])->desc;
+
+ for(const auto &t : fg_dependency.dst_tensors(fg_dst->id))
+ {
+ const auto t_info = original_graph->get_tensor(t)->desc;
+ if(detail::have_different_dimensions(root_kernel_dst_tensor_info->tensor_shape(), t_info->tensor_shape(), 0))
+ {
+ return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: all non roots' dst tensors should have the same shape as that of the root kernel" };
+ }
+ }
+
+ // Check 4: All kernels within a fg have the same UnitWorkloadStage. Ensure INV5
+ if(!(fg_src->get_root_kernel()->config().stage == fg_dst->get_root_kernel()->config().stage))
+ {
+ return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: all kernels within a fusion group should have the same UnitWorkloadStage" };
+ }
+
+ return Status{};
+ }
+
+ const ClKernelGraph *original_graph{};
+ DependencyGraph fg_dependency{};
+ KernelFusionGroupMap fusion_groups{};
+ // Note: no need to store tensor pointers in the ClFusedKernelGraph, as they are stored inside the individual fusion groups.
+
+private:
+ Status link_src_tensors(Id fg, const std::vector<Id> &src_tensors)
+ {
+ for(auto t : src_tensors)
+ {
+ fg_dependency.link_input(fg, t);
+ }
+ return Status{};
+ }
+ Status link_dst_tensors(Id fg, const std::vector<Id> &dst_tensors)
+ {
+ for(auto t : dst_tensors)
+ {
+ fg_dependency.link_output(fg, t);
+ }
+ return Status{};
+ }
+ Status remove_fg(Id fg)
+ {
+ fg_dependency.remove_operator(fg);
+ fusion_groups.erase(fg);
+ return Status{};
+ }
+ Status remove_fg_tensor(Id tensor)
+ {
+ fg_dependency.remove_tensor(tensor);
+ return Status{};
+ }
+};
+
+std::vector<const ClKernelFusionGroup *> traverse(const ClFusedKernelGraph &graph);
+std::vector<ClKernelFusionGroup *> traverse(ClFusedKernelGraph &graph);
+
+std::pair<Status, ClFusedKernelGraph> init_fusion_graph(const ClKernelGraph &kernel_graph);
+
+Status fuse(ClFusedKernelGraph &fused_kernel_graph);
+
+Status generate_store(ClKernelBlueprint &bp, const ClFusedKernelGraph &fused_kernel_graph, const ClKernelFusionGroup &fg);
+
+Status generate(ClWorkload &workload, const ClWorkloadContext &ctx, const ClFusedKernelGraph &fused_kernel_graph);
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLFUSEDKERNELGRAPH_H \ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelDescriptors.h b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelDescriptors.h
new file mode 100644
index 0000000000..cdd2b2e552
--- /dev/null
+++ b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelDescriptors.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
+#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLKERNELDESCRIPTORS_H
+#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLKERNELDESCRIPTORS_H
+
+#include "arm_compute/core/experimental/OperatorGraph.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+struct ClDirectConv2dKernelDescriptor
+{
+ friend bool operator==(const ClDirectConv2dKernelDescriptor &desc0, const ClDirectConv2dKernelDescriptor &desc1)
+ {
+ return desc0.conv2d == desc1.conv2d;
+ }
+ Conv2dDescriptor conv2d{};
+};
+
+struct ClEltwiseAddKernelDescriptor
+{
+ friend bool operator==(const ClEltwiseAddKernelDescriptor &desc0, const ClEltwiseAddKernelDescriptor &desc1)
+ {
+ return desc0.add == desc1.add;
+ }
+ AddDescriptor add{};
+};
+struct ClActivationKernelDescriptor
+{
+ friend bool operator==(const ClActivationKernelDescriptor &, const ClActivationKernelDescriptor &)
+ {
+ return true;
+ }
+};
+
+enum class ClippingStrategy
+{
+ TOP_LEFT,
+ TOP_RIGHT,
+ BOTTOM_LEFT,
+ BOTTOM_RIGHT,
+};
+/** Component: Store */
+struct TileDescriptor
+{
+ Size2D tile_dims{};
+ Size2D boundaries{};
+ ClippingStrategy clipping{ ClippingStrategy::TOP_LEFT };
+
+ TileDescriptor()
+ {
+ }
+
+ TileDescriptor(Size2D dims, const Size2D &bound, const ClippingStrategy &clip)
+ : tile_dims(dims), boundaries(bound), clipping(clip)
+ {
+ }
+
+ bool empty() const
+ {
+ return (tile_dims.area() == 0) || (boundaries.area() == 0);
+ }
+ friend bool operator==(const TileDescriptor &tile0, const TileDescriptor &tile1)
+ {
+ return tile0.tile_dims == tile1.tile_dims && tile0.boundaries == tile1.boundaries && tile0.clipping == tile1.clipping;
+ }
+};
+enum class StoreType
+{
+ VStore,
+ VStorePartial,
+ StoreRow,
+ ConvertStoreRow,
+ StoreBlock,
+ ConvertStoreBlock,
+ StoreRowPartial,
+ StoreBlockPartial,
+ StoreBlockBoundaryAware,
+ StoreVectorSelect,
+ TStoreIndirectWidthSelect
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLKERNELDESCRIPTORS_H \ No newline at end of file
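
A small illustration of the store descriptors above (not part of the patch; all values are made up): a 16x4 tile covering a 70x30 output block with the default clipping strategy, plus the empty() check that guards against unset descriptors.

#include "arm_compute/core/Size2D.h"
#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelDescriptors.h"

using namespace arm_compute;
using namespace arm_compute::experimental::dynamic_fusion;

TileDescriptor make_example_tile()
{
    // Size2D is (width, height): a 16x4 tile over a 70x30 boundary (example values)
    const TileDescriptor tile{ Size2D{ 16U, 4U }, Size2D{ 70U, 30U }, ClippingStrategy::TOP_LEFT };
    // A default-constructed TileDescriptor reports empty(); this one does not
    return tile.empty() ? TileDescriptor{} : tile;
}
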
diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.cpp b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.cpp
new file mode 100644
index 0000000000..8aaf0946bb
--- /dev/null
+++ b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.cpp
@@ -0,0 +1,219 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h"
+#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h"
+
+#include "support/Cast.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+Status ClDirectConv2dKernel::generate(ClKernelBlueprint &bp) const
+{
+ const auto input = _tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ const auto weight = _tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ const auto bias = _tensors.get_const_tensor(TensorType::ACL_SRC_2);
+ const auto dst = _tensors.get_const_tensor(TensorType::ACL_DST_0);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weight, dst);
+ ArgumentID input_id;
+ add_tensor(bp, input->desc, input_id, input->id);
+ ArgumentID weight_id;
+ add_tensor(bp, weight->desc, weight_id, weight->id);
+ ArgumentID bias_id = g_arg_placeholder;
+ if(bias != nullptr)
+ {
+ add_tensor(bp, bias->desc, bias_id, bias->id);
+ }
+ ArgumentID dst_id;
+ add_tensor(bp, dst->desc, dst_id, dst->id);
+
+ add_kcomp_direct_conv2d(bp, desc, input_id, weight_id, bias_id, dst_id);
+ return Status{};
+}
+Status ClDirectConv2dKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ClDirectConv2dKernelDescriptor &conv2d_desc)
+{
+ // 1. Check validity
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
+ // Matching data type
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ if(biases != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases);
+ }
+
+ // Matching data layout
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
+ if(biases != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, biases);
+ }
+
+ // All tensor infos are initialized
+ ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
+ if(biases != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->tensor_shape().total_size() == 0);
+ }
+ // Device requirements are met
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+ // weights shape is correct
+ const DataLayout data_layout = src->data_layout();
+ const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != src->dimension(channel_idx), "Weights feature map dimension should match the respective src's one");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4, "Weights can be at most 4 dimensional");
+
+ // dst shape is correct
+ PadStrideInfo legacy_pad_stride(conv2d_desc.conv2d.stride.x(), conv2d_desc.conv2d.stride.y(), conv2d_desc.conv2d.pad.left, conv2d_desc.conv2d.pad.right, conv2d_desc.conv2d.pad.top,
+ conv2d_desc.conv2d.pad.bottom, DimensionRoundingType{});
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(),
+ misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, legacy_pad_stride));
+
+ // biases shape is correct
+ if(biases != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->dimension(0) != weights->dimension(3),
+ "Biases size and number of dst feature maps should match");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->num_dimensions() > 1,
+ "Biases should be one dimensional");
+ }
+
+ // 2. Check support level
+ // Data type
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+ // Data layout
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC);
+
+ return Status{};
+}
+
+bool ClDirectConv2dKernel::operator==(const ClKernel &other) const
+{
+ const auto converted = *utils::cast::polymorphic_downcast<const ClDirectConv2dKernel *>(&other);
+ return config() == other.config() && tensors() == other.tensors() && desc == converted.desc;
+}
+
+Status ClAddKernel::generate(ClKernelBlueprint &bp) const
+{
+ const auto lhs = _tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ const auto rhs = _tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ const auto dst = _tensors.get_const_tensor(TensorType::ACL_DST_0);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
+ ArgumentID lhs_id;
+ add_tensor(bp, lhs->desc, lhs_id, lhs->id);
+ ArgumentID rhs_id;
+ add_tensor(bp, rhs->desc, rhs_id, rhs->id);
+ ArgumentID dst_id;
+ add_tensor(bp, dst->desc, dst_id, dst->id);
+
+ add_kcomp_eltwise_add(bp, desc, lhs_id, rhs_id, dst_id);
+ return Status{};
+}
+
+Status ClAddKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst)
+{
+ // 1. Check validity
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst);
+
+ // Matching data type
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst);
+
+ // Matching data layout
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(lhs, rhs);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(lhs, dst);
+
+ // All tensor infos are initialized
+ ARM_COMPUTE_RETURN_ERROR_ON(lhs->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(rhs->tensor_shape().total_size() == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
+
+ // Device requirements are met
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(lhs);
+
+ const bool in_place = (lhs == dst) || (rhs == dst);
+ const bool src0_in_place = in_place && (lhs == dst);
+
+ // dst shape is correct
+ const TensorShape out_shape = TensorShape::broadcast_shape(lhs->tensor_shape(), rhs->tensor_shape());
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst");
+ if(in_place)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, src0_in_place ? lhs->tensor_shape() : rhs->tensor_shape(), 0),
+ "Wrong shape for dst, cannot do in_place calculation");
+ }
+
+ // 2. Check support level
+
+ // Data type
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16);
+
+ // Data layout
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(lhs, DataLayout::NHWC);
+
+ return Status{};
+}
+
+bool ClAddKernel::operator==(const ClKernel &other) const
+{
+ const auto converted = *utils::cast::polymorphic_downcast<const ClAddKernel *>(&other);
+ return config() == other.config() && tensors() == other.tensors() && desc == converted.desc;
+}
+
+std::vector<const ClKernel *> traverse(const ClKernelGraph &graph)
+{
+ std::vector<const ClKernel *> kernels;
+ const auto sorted = graph.graph.topological_sort();
+ for(const auto &pack : sorted.second)
+ {
+ kernels.push_back(graph.kernels.at(pack.op).get());
+ }
+ return kernels;
+}
+std::vector<ClKernel *> traverse(ClKernelGraph &graph)
+{
+ std::vector<ClKernel *> kernels;
+ const auto sorted = graph.graph.topological_sort();
+ for(const auto &pack : sorted.second)
+ {
+ kernels.push_back(graph.kernels.at(pack.op).get());
+ }
+ return kernels;
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute \ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h
new file mode 100644
index 0000000000..1e14afb266
--- /dev/null
+++ b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h
@@ -0,0 +1,240 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
+#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLKERNELGRAPH_H
+#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLKERNELGRAPH_H
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/experimental/ClWorkload.h"
+#include "arm_compute/core/experimental/DependencyGraph.h"
+#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelDescriptors.h"
+#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ITensorDescPack.h"
+#include "support/DeepCopy.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+struct ClKernelGraph;
+class ClKernelBlueprint;
+
+enum class Complexity
+{
+ Simple,
+ Complex
+};
+
+/** Configurations for ClKernel
+ *
+ */
+struct ClKernelConfig
+{
+ UnitWorkloadStage stage{};
+ TileDescriptor tile_desc{};
+ StoreType store_type{};
+ friend bool operator==(const ClKernelConfig &config0, const ClKernelConfig &config1)
+ {
+ return config0.stage == config1.stage && config0.tile_desc == config1.tile_desc && config0.store_type == config1.store_type;
+ }
+};
+
+struct ClKernelTensor
+{
+public:
+ using Id = DependencyGraph::Id;
+ ClKernelTensor() = default;
+ ClKernelTensor(Id id, ITensorInfo *desc, MemoryType memory_type, const AuxMemoryInfo &memory_info)
+ : id{ id }, desc{ desc }, memory_type{ memory_type }, memory_info{ memory_info }
+ {
+ }
+ bool operator==(const ClKernelTensor &other) const
+ {
+ return desc == other.desc;
+ }
+
+ Id id{};
+ ITensorInfo *desc{};
+ MemoryType memory_type{};
+ AuxMemoryInfo memory_info{};
+};
+
+struct ClKernel
+{
+public:
+ using Id = DependencyGraph::Id;
+ ClKernel() = default;
+ virtual ~ClKernel() = default;
+ ClKernel(const ClKernel &kernel) = default;
+ ClKernel &operator=(const ClKernel &kernel) = default;
+ ClKernel(ClKernel &&kernel) = default;
+ ClKernel &operator=(ClKernel &&kernel) = default;
+ ClKernel(const ClKernelGraph *graph, Id id, const ClKernelConfig &config, const ITensorDescPack<ClKernelTensor> &tensors)
+ : _graph{ graph }, _id{ id }, _config{ config }, _tensors{ tensors }
+ {
+ }
+ virtual bool operator==(const ClKernel &other) const = 0;
+ virtual Complexity complexity() const = 0;
+ virtual Status generate(ClKernelBlueprint &bp) const = 0;
+ Id id() const
+ {
+ return _id;
+ }
+ ITensorDescPack<ClKernelTensor> tensors() const
+ {
+ return _tensors;
+ }
+ ClKernelConfig config() const
+ {
+ return _config;
+ }
+
+protected:
+ const ClKernelGraph *_graph {};
+ Id _id{};
+ ClKernelConfig _config{};
+ ITensorDescPack<ClKernelTensor> _tensors{};
+};
+
+struct ClDirectConv2dKernel : public ClKernel
+{
+public:
+ Complexity complexity() const override
+ {
+ return Complexity::Complex;
+ }
+ ClDirectConv2dKernel() = default;
+ ~ClDirectConv2dKernel() override = default;
+ ClDirectConv2dKernel(const ClKernelGraph *graph, Id id, const ClKernelConfig config, const ClDirectConv2dKernelDescriptor &desc, const ITensorDescPack<ClKernelTensor> tensors)
+ : ClKernel{ graph, id, config, tensors }, desc{ desc }
+ {
+ }
+ static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ClDirectConv2dKernelDescriptor &conv2d_desc);
+ bool operator==(const ClKernel &other) const override;
+ Status generate(ClKernelBlueprint &bp) const override;
+
+ ClDirectConv2dKernelDescriptor desc{};
+};
+
+struct ClAddKernel : public ClKernel
+{
+public:
+ Complexity complexity() const override
+ {
+ return Complexity::Simple;
+ }
+ ClAddKernel() = default;
+ ~ClAddKernel() override = default;
+ ClAddKernel(const ClKernelGraph *graph, Id id, const ClKernelConfig &config, const ClEltwiseAddKernelDescriptor &desc, const ITensorDescPack<ClKernelTensor> tensors)
+ : ClKernel{ graph, id, config, tensors }, desc{ desc }
+ {
+ }
+ static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst);
+ bool operator==(const ClKernel &other) const override;
+ Status generate(ClKernelBlueprint &bp) const override;
+
+ ClEltwiseAddKernelDescriptor desc{};
+};
+
+struct ClKernelGraph
+{
+public:
+ using Id = DependencyGraph::Id;
+ using KernelMap = std::map<Id, utils::memory::deep_unique_ptr<ClKernel>>;
+ using KernelTensorMap = std::map<Id, utils::memory::deep_unique_ptr<ClKernelTensor>>;
+
+ ClKernelGraph() = default;
+ ~ClKernelGraph() = default;
+
+ friend bool operator==(const ClKernelGraph &graph0, const ClKernelGraph &graph1)
+ {
+ return graph0.graph == graph1.graph && graph0.kernels == graph1.kernels && graph0.tensors == graph1.tensors;
+ }
+
+ Status add_kernel_tensor(ITensorInfo *desc, MemoryType memory_type, const AuxMemoryInfo &memory_info, Id &tensor_id, Id merge_point = DependencyGraph::empty_id())
+ {
+ tensor_id = graph.add_tensor(merge_point);
+ if(tensors.find(tensor_id) == tensors.end())
+ {
+ tensors[tensor_id] = utils::memory::make_deep_unique<ClKernelTensor, ClKernelTensor>(tensor_id, desc, memory_type, memory_info);
+ }
+ return Status{};
+ }
+
+ template <typename ContentT, typename KernelDescT>
+ Status add_kernel(const ClKernelConfig &config, const KernelDescT &desc, const ITensorDescPack<ClKernelTensor> &tensors, Id &kernel_id)
+ {
+ const auto src_tensors = tensors.get_const_src_tensors();
+ const auto dst_tensors = tensors.get_const_dst_tensors();
+ std::vector<Id> src_tensor_ids{};
+ std::vector<Id> dst_tensor_ids{};
+ for(const auto &t : src_tensors)
+ {
+ src_tensor_ids.push_back(t->id);
+ }
+ for(const auto &t : dst_tensors)
+ {
+ dst_tensor_ids.push_back(t->id);
+ }
+ kernel_id = graph.add_operator(src_tensor_ids, dst_tensor_ids).second;
+ auto k = utils::memory::make_deep_unique<ClKernel, ContentT>(this, kernel_id, config, desc, tensors);
+ kernels[kernel_id] = std::move(k);
+ return Status{};
+ }
+
+ ClKernel *get_kernel(Id id)
+ {
+ return kernels.at(id).get();
+ }
+ const ClKernel *get_kernel(Id id) const
+ {
+ return kernels.at(id).get();
+ }
+
+ ClKernelTensor *get_tensor(Id id)
+ {
+ return tensors.at(id).get();
+ }
+ const ClKernelTensor *get_tensor(Id id) const
+ {
+ return tensors.at(id).get();
+ }
+
+ DependencyGraph graph{};
+ KernelMap kernels{};
+ KernelTensorMap tensors{};
+};
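+// Illustrative usage sketch (not part of the API; the tensor infos, descriptors and `config` below are
+// assumptions; see AddContent::translate() in OperatorGraphImpl.cpp for the real flow):
+//
+//   ClKernelGraph graph{};
+//   DependencyGraph::Id lhs_id{}, rhs_id{}, dst_id{}, add_id{};
+//   graph.add_kernel_tensor(&lhs_info, MemoryType::Core, AuxMemoryInfo{}, lhs_id);
+//   graph.add_kernel_tensor(&rhs_info, MemoryType::Core, AuxMemoryInfo{}, rhs_id);
+//   graph.add_kernel_tensor(&dst_info, MemoryType::Core, AuxMemoryInfo{}, dst_id);
+//
+//   ITensorDescPack<ClKernelTensor> tensors{};
+//   tensors.add_const_tensor(ACL_SRC_0, graph.get_tensor(lhs_id));
+//   tensors.add_const_tensor(ACL_SRC_1, graph.get_tensor(rhs_id));
+//   tensors.add_const_tensor(ACL_DST_0, graph.get_tensor(dst_id));
+//
+//   graph.add_kernel<ClAddKernel>(config, ClEltwiseAddKernelDescriptor{ add_desc }, tensors, add_id);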
+using Id = DependencyGraph::Id;
+
+std::vector<const ClKernel *> traverse(const ClKernelGraph &graph);
+std::vector<ClKernel *> traverse(ClKernelGraph &graph);
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLKERNELGRAPH_H \ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClWorkload.cpp b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClWorkload.cpp
new file mode 100644
index 0000000000..e97cf88b79
--- /dev/null
+++ b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClWorkload.cpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
+#include "arm_compute/core/experimental/ClWorkload.h"
+#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.h"
+#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h"
+#include "src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
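+// build() lowers a user-facing OperatorGraph into a runnable ClWorkload. The steps below run in order:
+//   OperatorGraph --(validate, translate)--> ClKernelGraph --(init_fusion_graph, fuse)--> ClFusedKernelGraph --(generate)--> ClWorkload
+// The merge-point maps of the intermediate graphs are then used to fill op_tensor_id_lut, which maps workload
+// tensor ids back to the original OperatorGraph tensor ids.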
+Status build(ClWorkload &workload, const OperatorGraph &op_graph, const ClWorkloadContext &ctx)
+{
+ workload.context = ctx;
+ ClKernelGraph kernel_graph;
+ workload.status = validate(op_graph);
+ ARM_COMPUTE_RETURN_ON_ERROR(workload.status);
+ workload.status = translate(kernel_graph, *op_graph.impl());
+ ARM_COMPUTE_RETURN_ON_ERROR(workload.status);
+ ClFusedKernelGraph fused_k_graph;
+ std::tie(workload.status, fused_k_graph) = init_fusion_graph(kernel_graph);
+ ARM_COMPUTE_RETURN_ON_ERROR(workload.status);
+ workload.status = fuse(fused_k_graph);
+ ARM_COMPUTE_RETURN_ON_ERROR(workload.status);
+ workload.status = generate(workload, ctx, fused_k_graph);
+ ARM_COMPUTE_RETURN_ON_ERROR(workload.status);
+
+ // Get operator tensor id to workload tensor id map
+ const auto op_tensor_to_kernel_tensor = fused_k_graph.original_graph->graph.get_merge_points();
+ const auto kernel_tensor_to_workload_tensor = workload.graph.get_merge_points();
+ for(const auto op_t : op_graph.impl()->graph.src_tensors())
+ {
+ const auto kernel_t = op_tensor_to_kernel_tensor.at(op_t);
+ const auto workload_t = kernel_tensor_to_workload_tensor.at(kernel_t);
+ workload.op_tensor_id_lut[workload_t] = op_t;
+ }
+ for(const auto op_t : op_graph.impl()->graph.dst_tensors())
+ {
+ const auto kernel_t = op_tensor_to_kernel_tensor.at(op_t);
+ const auto workload_t = kernel_tensor_to_workload_tensor.at(kernel_t);
+ workload.op_tensor_id_lut[workload_t] = op_t;
+ }
+ return workload.status;
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute \ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/DependencyGraph.cpp b/src/core/experimental/dynamic_fusion/WorkloadImpl/DependencyGraph.cpp
new file mode 100644
index 0000000000..2e8292bbfb
--- /dev/null
+++ b/src/core/experimental/dynamic_fusion/WorkloadImpl/DependencyGraph.cpp
@@ -0,0 +1,431 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
+#include "arm_compute/core/experimental/DependencyGraph.h"
+
+#include <algorithm>
+#include <deque>
+#include <set>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+DependencyGraph::DependencyGraph(const AdjList &adj_src_tensors, const AdjList &adj_dst_tensors, const AdjList &adj_src_ops, const AdjList &adj_dst_ops, std::map<Id, Id> merge_points)
+ : _adj_src_tensors{ adj_src_tensors }, _adj_dst_tensors{ adj_dst_tensors }, _adj_src_ops{ adj_src_ops }, _adj_dst_ops{ adj_dst_ops }, _merge_to_internal{ merge_points }, _operator_id{}, _tensor_id{}
+{
+}
+DependencyGraph::DependencyGraph(const std::vector<Id> &imported_tensors)
+ : _adj_src_tensors{}, _adj_dst_tensors{}, _adj_src_ops{}, _adj_dst_ops{}, _merge_to_internal{}, _operator_id{}, _tensor_id{}
+{
+ for(auto t : imported_tensors)
+ {
+ _adj_src_ops[t] = {};
+ _adj_dst_ops[t] = {};
+ }
+}
+
+Status DependencyGraph::update_merge_point(Id t_id, Id merge_point)
+{
+ if(_merge_to_internal.find(merge_point) == _merge_to_internal.end())
+ {
+ return Status{ ErrorCode::RUNTIME_ERROR, "Merge point does not exist" };
+ }
+ _merge_to_internal[merge_point] = t_id;
+ return Status{};
+}
+
+DependencyGraph::Id DependencyGraph::add_tensor(Id merge_tensor)
+{
+ Id new_tensor{ empty_id() };
+ if(merge_tensor != empty_id())
+ {
+ if(_merge_to_internal.find(merge_tensor) != _merge_to_internal.end())
+ {
+ new_tensor = _merge_to_internal[merge_tensor];
+ }
+ else
+ {
+ new_tensor = insert_new_tensor();
+ _merge_to_internal[merge_tensor] = new_tensor;
+ }
+ }
+ else
+ {
+ new_tensor = insert_new_tensor();
+ }
+ return new_tensor;
+}
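+// Note on merge points: `merge_tensor` is an id from another graph (e.g. an OperatorGraph tensor id) used as a
+// key into _merge_to_internal. Calling add_tensor() again with the same merge point returns the same internal
+// tensor id, which is how tensors are correlated across graphs (see get_merge_points() and its use when
+// building the op_tensor_id_lut in ClWorkload.cpp).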
+
+void DependencyGraph::remove_tensor(Id tensor)
+{
+ for(auto src_op : _adj_src_ops.at(tensor))
+ {
+ auto &dst_tensors = _adj_dst_tensors.at(src_op);
+ dst_tensors.erase(
+ std::remove(std::begin(dst_tensors), std::end(dst_tensors), tensor),
+ std::end(dst_tensors));
+ }
+ for(auto dst_op : _adj_dst_ops.at(tensor))
+ {
+ auto &src_tensors = _adj_src_tensors.at(dst_op);
+ src_tensors.erase(
+ std::remove(std::begin(src_tensors), std::end(src_tensors), tensor),
+ std::end(src_tensors));
+ }
+ _adj_src_ops.erase(tensor);
+ _adj_dst_ops.erase(tensor);
+}
+
+std::pair<Status, DependencyGraph::Id> DependencyGraph::add_operator(const std::vector<Id> &inputs, const std::vector<Id> &outputs)
+{
+ Id new_op = insert_new_op();
+ for(Id tensor : inputs)
+ {
+ link_input(new_op, tensor);
+ }
+ for(Id tensor : outputs)
+ {
+ link_output(new_op, tensor);
+ }
+
+ // Use topological sort in order to detect possible loops / cycles.
+ // NOTE: This does not scale. We will need a better way of detecting loops, or to relax this invariant during operation and add a validate method instead
+ return std::pair<Status, DependencyGraph::Id>(topological_sort().first, new_op);
+}
+
+void DependencyGraph::remove_operator(Id op)
+{
+ for(auto src_tensor : _adj_src_tensors.at(op))
+ {
+ auto &dst_ops = _adj_dst_ops.at(src_tensor);
+ dst_ops.erase(
+ std::remove(std::begin(dst_ops), std::end(dst_ops), op),
+ std::end(dst_ops));
+ }
+ for(auto dst_tensor : _adj_dst_tensors.at(op))
+ {
+ auto &src_ops = _adj_src_ops.at(dst_tensor);
+ src_ops.erase(
+ std::remove(std::begin(src_ops), std::end(src_ops), op),
+ std::end(src_ops));
+ }
+ _adj_src_tensors.erase(op);
+ _adj_dst_tensors.erase(op);
+}
+
+std::map<DependencyGraph::Id, DependencyGraph::Id> DependencyGraph::get_merge_points() const
+{
+ return _merge_to_internal;
+}
+
+std::vector<DependencyGraph::Id> DependencyGraph::get_root_ops() const
+{
+ std::vector<Id> ops{};
+ const auto op_list = all_ops();
+
+ for(auto op : op_list)
+ {
+ if(src_ops(op).empty())
+ {
+ ops.emplace_back(op);
+ }
+ }
+ return ops;
+}
+
+std::vector<DependencyGraph::Id> DependencyGraph::get_dst_ops() const
+{
+ std::vector<Id> ops{};
+ const auto op_list = all_ops();
+
+ for(auto op : op_list)
+ {
+ if(dst_ops(op).empty())
+ {
+ ops.emplace_back(op);
+ }
+ }
+ return ops;
+}
+
+std::vector<DependencyGraph::Id> DependencyGraph::src_tensors(Id op) const
+{
+ ARM_COMPUTE_ERROR_ON(!operator_exists(op));
+ return _adj_src_tensors.at(op);
+}
+
+std::vector<DependencyGraph::Id> DependencyGraph::dst_tensors(Id op) const
+{
+ ARM_COMPUTE_ERROR_ON(!operator_exists(op));
+ return _adj_dst_tensors.at(op);
+}
+
+std::vector<DependencyGraph::Id> DependencyGraph::src_tensors() const
+{
+ std::vector<Id> tensors;
+ for(auto tensor_src_ops : _adj_src_ops)
+ {
+ if(tensor_src_ops.second.empty())
+ tensors.push_back(tensor_src_ops.first);
+ }
+ return tensors;
+}
+
+std::vector<DependencyGraph::Id> DependencyGraph::dst_tensors() const
+{
+ std::vector<Id> tensors;
+ for(auto tensor_dst_ops : _adj_dst_ops)
+ {
+ if(tensor_dst_ops.second.empty())
+ tensors.push_back(tensor_dst_ops.first);
+ }
+ return tensors;
+}
+
+std::vector<DependencyGraph::Id> DependencyGraph::src_ops_from_tensor(Id tensor) const
+{
+ return _adj_src_ops.at(tensor);
+}
+std::vector<DependencyGraph::Id> DependencyGraph::dst_ops_from_tensor(Id tensor) const
+{
+ return _adj_dst_ops.at(tensor);
+}
+
+std::vector<DependencyGraph::Id> DependencyGraph::all_ops() const
+{
+ std::vector<Id> ops{};
+ std::transform(std::begin(_adj_src_tensors), std::end(_adj_src_tensors), std::back_inserter(ops), [](const auto & it)
+ {
+ return it.first;
+ });
+ return ops;
+}
+
+bool DependencyGraph::path_exists_from_tensor_to_op(Id src_tensor, Id dst_op) const
+{
+ for(auto child_op : dst_ops_from_tensor(src_tensor))
+ {
+ if(path_exists_from_op_to_op(child_op, dst_op))
+ {
+ return true;
+ }
+ }
+ return false;
+}
+
+bool DependencyGraph::path_exists_from_op_to_op(Id src_op, Id dst_op) const
+{
+ if(src_op == dst_op)
+ {
+ return true;
+ }
+ if(is_in(src_op, get_dst_ops()))
+ {
+ return false;
+ }
+ for(auto child_tensor : dst_tensors(src_op))
+ {
+ if(path_exists_from_tensor_to_op(child_tensor, dst_op))
+ {
+ return true;
+ }
+ }
+ return false;
+}
+
+std::vector<DependencyGraph::Id> DependencyGraph::all_tensors() const
+{
+ std::vector<Id> tensors{};
+ std::transform(std::begin(_adj_src_ops), std::end(_adj_src_ops), std::back_inserter(tensors), [](const auto & it)
+ {
+ return it.first;
+ });
+ return tensors;
+}
+
+unsigned int DependencyGraph::number_of_ops() const
+{
+ return _adj_src_tensors.size();
+}
+
+unsigned int DependencyGraph::number_of_tensors() const
+{
+ return _adj_src_ops.size();
+}
+
+DependencyGraph::Id DependencyGraph::insert_new_tensor()
+{
+ Id new_tensor = _tensor_id.alloc();
+ _adj_src_ops[new_tensor] = {};
+ _adj_dst_ops[new_tensor] = {};
+ return new_tensor;
+}
+DependencyGraph::Id DependencyGraph::insert_new_op()
+{
+ Id new_op = _operator_id.alloc();
+ _adj_src_tensors[new_op] = {};
+ _adj_dst_tensors[new_op] = {};
+ return new_op;
+}
+void DependencyGraph::link_input(Id op, Id in_tensor)
+{
+ ARM_COMPUTE_ERROR_ON(!operator_exists(op));
+ ARM_COMPUTE_ERROR_ON(!tensor_exists(in_tensor));
+ ARM_COMPUTE_ERROR_ON(are_connected(op, in_tensor));
+ _adj_src_tensors[op].push_back(in_tensor);
+ _adj_dst_ops[in_tensor].push_back(op);
+}
+void DependencyGraph::link_output(Id op, Id out_tensor)
+{
+ ARM_COMPUTE_ERROR_ON(!operator_exists(op));
+ ARM_COMPUTE_ERROR_ON(!tensor_exists(out_tensor));
+ ARM_COMPUTE_ERROR_ON(are_connected(op, out_tensor));
+ _adj_dst_tensors[op].push_back(out_tensor);
+ _adj_src_ops[out_tensor].push_back(op);
+}
+bool DependencyGraph::tensor_exists(Id tensor) const
+{
+ return _adj_src_ops.find(tensor) != _adj_src_ops.end() && _adj_dst_ops.find(tensor) != _adj_dst_ops.end();
+}
+bool DependencyGraph::operator_exists(Id op) const
+{
+ return _adj_src_tensors.find(op) != _adj_src_tensors.end() && _adj_dst_tensors.find(op) != _adj_dst_tensors.end();
+}
+
+bool DependencyGraph::is_src_tensor(Id tensor) const
+{
+ if(!tensor_exists(tensor))
+ {
+ return false;
+ }
+ return _adj_src_ops.at(tensor).empty();
+}
+
+bool DependencyGraph::is_dst_tensor(Id tensor) const
+{
+ if(!tensor_exists(tensor))
+ {
+ return false;
+ }
+ return _adj_dst_ops.at(tensor).empty();
+}
+bool DependencyGraph::is_src_tensor_of(Id op, Id tensor) const
+{
+ if(!operator_exists(op) || !tensor_exists(tensor))
+ {
+ return false;
+ }
+ const auto op_inputs = src_tensors(op);
+ return std::find(op_inputs.begin(), op_inputs.end(), tensor) != op_inputs.end();
+}
+bool DependencyGraph::is_dst_tensor_of(Id op, Id tensor) const
+{
+ if(!operator_exists(op) || !tensor_exists(tensor))
+ {
+ return false;
+ }
+ const auto op_outputs = dst_tensors(op);
+ return std::find(op_outputs.begin(), op_outputs.end(), tensor) != op_outputs.end();
+}
+bool DependencyGraph::are_connected(Id op, Id tensor) const
+{
+ return is_src_tensor_of(op, tensor) || is_dst_tensor_of(op, tensor);
+}
+std::vector<DependencyGraph::Id> DependencyGraph::src_ops(Id op) const
+{
+ ARM_COMPUTE_ERROR_ON(!operator_exists(op));
+ std::vector<Id> ops{};
+ for(Id src_tensor : src_tensors(op))
+ {
+ ops.insert(ops.end(), std::begin(_adj_src_ops.at(src_tensor)), std::end(_adj_src_ops.at(src_tensor)));
+ }
+ return ops;
+}
+
+std::vector<DependencyGraph::Id> DependencyGraph::dst_ops(Id op) const
+{
+ ARM_COMPUTE_ERROR_ON(!operator_exists(op));
+ std::vector<Id> ops{};
+ for(Id dst_tensor : _adj_dst_tensors.at(op))
+ {
+ ops.insert(ops.end(), std::begin(_adj_dst_ops.at(dst_tensor)), std::end(_adj_dst_ops.at(dst_tensor)));
+ }
+ return ops;
+}
+
+std::pair<Status, std::vector<DependencyGraph::OpPack>> DependencyGraph::topological_sort() const
+{
+ // In-degree (number of source operators feeding an op)
+ std::map<Id, unsigned int> in_degree{};
+ std::set<Id> visited_ops{};
+ std::deque<Id> zero_in_degree_ops{};
+ std::vector<OpPack> sorted_op_packs{};
+ for(auto op : all_ops())
+ {
+ const auto degree = src_ops(op).size();
+ in_degree[op] = degree;
+ if(degree == 0)
+ {
+ zero_in_degree_ops.push_back(op);
+ visited_ops.insert(op);
+ }
+ }
+
+ while(!zero_in_degree_ops.empty())
+ {
+ const Id op = zero_in_degree_ops.front();
+ zero_in_degree_ops.pop_front();
+ sorted_op_packs.push_back(OpPack{ op, src_tensors(op), dst_tensors(op) });
+
+ for(const auto next_op : dst_ops(op))
+ {
+ if(in_degree[next_op] > 0)
+ {
+ in_degree[next_op]--;
+ }
+ if(in_degree[next_op] == 0 && visited_ops.find(next_op) == visited_ops.end())
+ {
+ zero_in_degree_ops.push_back(next_op);
+ visited_ops.insert(next_op);
+ }
+ }
+ }
+
+ // If there are ops remaining with in_degree > 0, this indicates that the graph contains cycles
+ Status st{};
+ if(sorted_op_packs.size() != number_of_ops())
+ {
+ st = Status{ ErrorCode::RUNTIME_ERROR, "Cycles or loops are not allowed in a DependencyGraph" };
+ }
+ return std::make_pair(st, sorted_op_packs);
+}
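+// Worked example of the sort above, with hypothetical ids: for a graph op0 -> t0 -> op1 and op0 -> t1 -> op2,
+// only op0 starts with in-degree 0, so it is emitted first; op1 and op2 then drop to in-degree 0 and are
+// emitted in queue order, yielding the op packs for op0, op1, op2. With a cycle, some ops never reach
+// in-degree 0, so fewer packs than ops are produced and an error Status is returned.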
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute \ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/ITensorDescPack.h b/src/core/experimental/dynamic_fusion/WorkloadImpl/ITensorDescPack.h
new file mode 100644
index 0000000000..bfa2eacfed
--- /dev/null
+++ b/src/core/experimental/dynamic_fusion/WorkloadImpl/ITensorDescPack.h
@@ -0,0 +1,242 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
+#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_ITENSORDESCPACK_H
+#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_ITENSORDESCPACK_H
+
+#include <cstddef>
+#include <unordered_map>
+#include <vector>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+template <typename TDesc>
+class ITensorDescPack
+{
+public:
+ struct PackElement
+ {
+ PackElement() = default;
+ ~PackElement() = default;
+ PackElement(const PackElement &) = default;
+ PackElement &operator=(const PackElement &) = default;
+ PackElement(PackElement &&) = default;
+ PackElement &operator=(PackElement &&) = default;
+ PackElement(int id, TDesc *tensor)
+ : id(id), tensor(tensor), ctensor(nullptr)
+ {
+ }
+ PackElement(int id, const TDesc *ctensor)
+ : id(id), tensor(nullptr), ctensor(ctensor)
+ {
+ }
+
+ int id{ -1 };
+ TDesc *tensor{ nullptr };
+ const TDesc *ctensor{ nullptr };
+
+ friend bool operator==(const PackElement &elem0, const PackElement &elem1)
+ {
+ const bool same_ctensor = (elem0.tensor == nullptr && elem1.tensor == nullptr && elem0.ctensor != nullptr && elem1.ctensor != nullptr && *elem0.ctensor == *elem1.ctensor);
+ const bool same_tensor = (elem0.ctensor == nullptr && elem1.ctensor == nullptr && elem0.tensor != nullptr && elem1.tensor != nullptr && *elem0.tensor == *elem1.tensor);
+
+ return elem0.id == elem1.id && (same_ctensor || same_tensor);
+ }
+ };
+
+public:
+ /** Default Constructor */
+ ITensorDescPack() = default;
+ ~ITensorDescPack() = default;
+ ITensorDescPack<TDesc>(const ITensorDescPack<TDesc> &other) = default;
+ ITensorDescPack<TDesc> &operator=(const ITensorDescPack<TDesc> &other) = default;
+ ITensorDescPack<TDesc>(ITensorDescPack<TDesc> &&other) = default;
+ ITensorDescPack<TDesc> &operator=(ITensorDescPack<TDesc> &&other) = default;
+ /** Initializer list Constructor */
+ ITensorDescPack(std::initializer_list<PackElement> l)
+ : _pack{}
+ {
+ for(auto &e : l)
+ {
+ _pack[e.id] = e;
+ }
+ }
+ /** Add tensor to the pack
+ *
+ * @param[in] id ID/type of the tensor to add
+ * @param[in] tensor Tensor to add
+ */
+ void add_tensor(int id, TDesc *tensor)
+ {
+ _pack[id] = PackElement(id, tensor);
+ }
+
+ /** Add const tensor to the pack
+ *
+ * @param[in] id ID/type of the tensor to add
+ * @param[in] tensor Tensor to add
+ */
+ void add_const_tensor(int id, const TDesc *tensor)
+ {
+ _pack[id] = PackElement(id, tensor);
+ }
+ /** Get tensor of a given id from the pack
+ *
+ * @param[in] id ID of tensor to extract
+ *
+ * @return The pointer to the tensor if it exists and is non-const, else nullptr
+ */
+ TDesc *get_tensor(int id)
+ {
+ auto it = _pack.find(id);
+ return it != _pack.end() ? it->second.tensor : nullptr;
+ }
+ /** Get constant tensor of a given id
+ *
+ * @param[in] id ID of tensor to extract
+ *
+ * @return The const pointer to the tensor if it exists, else nullptr
+ */
+ const TDesc *get_const_tensor(int id) const
+ {
+ auto it = _pack.find(id);
+ if(it != _pack.end())
+ {
+ return it->second.ctensor != nullptr ? it->second.ctensor : it->second.tensor;
+ }
+ return nullptr;
+ }
+ /** Remove the tensor stored with the given id
+ *
+ * @param[in] id ID of tensor to remove
+ */
+ void remove_tensor(int id)
+ {
+ _pack.erase(id);
+ }
+ /** Pack size accessor
+ *
+ * @return Number of tensors registered to the pack
+ */
+ size_t size() const
+ {
+ return _pack.size();
+ }
+ /** Checks if pack is empty
+ *
+ * @return True if empty else false
+ */
+ bool empty() const
+ {
+ return _pack.empty();
+ }
+
+ /** Get the ACL_SRC_* tensors
+ *
+ * @return std::vector<TDesc *>
+ */
+ std::vector<TDesc *> get_src_tensors()
+ {
+ std::vector<TDesc *> src_tensors{};
+ for(int id = static_cast<int>(TensorType::ACL_SRC); id <= static_cast<int>(TensorType::ACL_SRC_END); ++id)
+ {
+ auto tensor = get_tensor(id);
+ if(tensor != nullptr)
+ {
+ src_tensors.push_back(tensor);
+ }
+ }
+ return src_tensors;
+ }
+ /** Get the const ACL_SRC_* tensors
+ *
+ * @return std::vector<const TDesc *>
+ */
+ std::vector<const TDesc *> get_const_src_tensors() const
+ {
+ std::vector<const TDesc *> src_tensors{};
+ for(int id = static_cast<int>(TensorType::ACL_SRC); id <= static_cast<int>(TensorType::ACL_SRC_END); ++id)
+ {
+ auto tensor = get_const_tensor(id);
+ if(tensor != nullptr)
+ {
+ src_tensors.push_back(tensor);
+ }
+ }
+ return src_tensors;
+ }
+ /** Get the ACL_DST_* tensors
+ *
+ * @return std::vector<TDesc *>
+ */
+ std::vector<TDesc *> get_dst_tensors()
+ {
+ std::vector<TDesc *> dst_tensors{};
+ for(int id = static_cast<int>(TensorType::ACL_DST); id <= static_cast<int>(TensorType::ACL_DST_END); ++id)
+ {
+ auto tensor = get_tensor(id);
+ if(tensor != nullptr)
+ {
+ dst_tensors.push_back(tensor);
+ }
+ }
+ return dst_tensors;
+ }
+ /** Get the const ACL_DST_* tensors
+ *
+ * @return std::vector<const TDesc *>
+ */
+ std::vector<const TDesc *> get_const_dst_tensors() const
+ {
+ std::vector<const TDesc *> dst_tensors{};
+ for(int id = static_cast<int>(TensorType::ACL_DST); id <= static_cast<int>(TensorType::ACL_DST_END); ++id)
+ {
+ auto tensor = get_const_tensor(id);
+ if(tensor != nullptr)
+ {
+ dst_tensors.push_back(tensor);
+ }
+ }
+ return dst_tensors;
+ }
+
+ friend bool operator==(const ITensorDescPack<TDesc> &pack0, const ITensorDescPack<TDesc> &pack1)
+ {
+ return pack0._pack == pack1._pack;
+ }
+
+private:
+ std::unordered_map<int, PackElement> _pack{}; /**< Container with the packed tensors */
+};
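+// Minimal usage sketch (illustrative only; desc0 and desc1 are assumed ClKernelTensor instances):
+//
+//   ITensorDescPack<ClKernelTensor> pack{};
+//   pack.add_const_tensor(ACL_SRC_0, &desc0);
+//   pack.add_const_tensor(ACL_DST_0, &desc1);
+//   auto srcs = pack.get_const_src_tensors(); // { &desc0 }
+//   auto dsts = pack.get_const_dst_tensors(); // { &desc1 }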
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_ITENSORDESCPACK_H \ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.cpp b/src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.cpp
new file mode 100644
index 0000000000..4b91c0f156
--- /dev/null
+++ b/src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.cpp
@@ -0,0 +1,387 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h"
+#include "src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+Status add_kernel_tensor(ClKernelGraph &k_graph, const OperatorGraph::Implementation &op_graph, const OpTensorContent &op_tensor, MemoryType memory_type, AuxMemoryInfo memory_info,
+ DependencyGraph::Id &id)
+{
+ ARM_COMPUTE_UNUSED(op_graph);
+ return k_graph.add_kernel_tensor(op_tensor.desc, memory_type, memory_info, id, op_tensor.id);
+}
+
+Status add_kernel_tensor(ClKernelGraph &k_graph, const OperatorGraph::Implementation &op_graph, const OpTensorContent &op_tensor, DependencyGraph::Id &id)
+{
+ // For a tensor t
+ // 1. If t is a src tensor of the entire op graph, then it's Core.
+ // (Optimisation opportunity: if we guarantee that all translate methods are called in topological order, we can always assign t to Core,
+ // because even if the op is non-root (which would mean t should be an Aux tensor), the src tensors would already have been determined by the ancestor ops (topological order), and thus would not be overridden by it)
+ // 2. If t is a dst tensor of the entire op graph, then it's Core.
+ // 3. Aux tensors with Persistent and Prepare lifetimes are manually specified
+ // 4. All other tensors not captured by the above are assigned Aux, with a lifetime of Temporary.
+ bool is_src_tensor_of_graph = is_in(op_tensor.id, op_graph.graph.src_tensors());
+ bool is_dst_tensor_of_graph = is_in(op_tensor.id, op_graph.graph.dst_tensors());
+ MemoryType memory_type;
+ AuxMemoryInfo memory_info;
+ if(is_src_tensor_of_graph || is_dst_tensor_of_graph)
+ {
+ memory_type = MemoryType::Core;
+ }
+ else
+ {
+ memory_type = MemoryType::Auxiliary;
+ memory_info.lifetime = AuxMemoryLifetime::Temporary;
+ memory_info.size = op_tensor.desc->total_size();
+ }
+ return add_kernel_tensor(k_graph, op_graph, op_tensor, memory_type, memory_info, id);
+}
+
+/** Get the suitable kernel size for using direct convolution method with NHWC data layout.
+ *
+ * @note Duplicate of the function with the same name in src/gpu/cl/operators/ClConv2d.cpp
+ *
+ * @note Direct convolution should be executed when the kernel has spatial dimensions greater than or equal to the value returned by this function
+ *
+ * @param[in] gpu_target GPU target
+ *
+ * @return the suitable kernel size for using direct convolution method with NHWC data layout
+ */
+size_t get_direct_conv_kernel_threshold_nhwc(arm_compute::GPUTarget gpu_target)
+{
+ switch(gpu_target)
+ {
+ case arm_compute::GPUTarget::G76:
+ case arm_compute::GPUTarget::G77:
+ case arm_compute::GPUTarget::G78:
+ return 5;
+ case arm_compute::GPUTarget::G71:
+ case arm_compute::GPUTarget::G72:
+ case arm_compute::GPUTarget::MIDGARD:
+ case arm_compute::GPUTarget::BIFROST:
+ return 7;
+ default:
+ return 5;
+ }
+}
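+// For example, on G77 the threshold is 5, so only kernels of spatial size 5x5 or larger count as "large" in the
+// direct convolution heuristic below, while on G72 the threshold is 7 (7x7 or larger).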
+} // namespace
+
+bool operator==(const OpTensor &t0, const OpTensor &t1)
+{
+ return std::make_tuple(t0.id()) == std::make_tuple(t1.id());
+}
+bool operator==(const Padding2D &pad0, const Padding2D &pad1)
+{
+ return std::make_tuple(pad0.top, pad0.right, pad0.bottom, pad0.left) == std::make_tuple(pad1.top, pad1.right, pad1.bottom, pad1.left);
+}
+bool operator==(const Conv2dDescriptor &conv2d0, const Conv2dDescriptor &conv2d1)
+{
+ return std::make_tuple(conv2d0.pad, conv2d0.stride, conv2d0.dilation) == std::make_tuple(conv2d1.pad, conv2d1.stride, conv2d1.dilation);
+}
+
+bool operator==(const AddDescriptor &, const AddDescriptor &)
+{
+ return std::make_tuple() == std::make_tuple(); // Currently two Add ops are always the same
+}
+
+bool Conv2dContent::operator==(const OperatorContent &other) const
+{
+ const auto converted = *utils::cast::polymorphic_downcast<const Conv2dContent *>(&other);
+ return desc == converted.desc;
+}
+
+bool AddContent::operator==(const OperatorContent &other) const
+{
+ const auto converted = *utils::cast::polymorphic_downcast<const AddContent *>(&other);
+ return desc == converted.desc;
+}
+
+ConvolutionMethod Conv2dContent::select_conv_method(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const Conv2dDescriptor &conv2d_desc, const GPUTarget gpu_target)
+{
+ // Modified from ClConv2d::get_convolution_method
+
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(dst);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(weights);
+
+ const PadStrideInfo legacy_pad_stride(conv2d_desc.stride.x(), conv2d_desc.stride.y(), conv2d_desc.pad.left, conv2d_desc.pad.right, conv2d_desc.pad.top, conv2d_desc.pad.bottom, DimensionRoundingType{});
+ const Size2D dilation = conv2d_desc.dilation;
+
+ const size_t idx_w = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT);
+ const size_t idx_c = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL);
+
+ /* Input spatial dims, kernel size, IFM/OFM, conv info */
+ using ConvolutionConfiguration = std::tuple<Size2D, Size2D, Size2D, PadStrideInfo, DataLayout>;
+ using ConfigurationMethod = std::pair<ConvolutionConfiguration, ConvolutionMethod>;
+
+ const std::vector<ConfigurationMethod> known_configs =
+ {
+ // Alexnet
+ ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U), PadStrideInfo(1U, 1U, 2U, 2U), DataLayout::NCHW), ConvolutionMethod::DIRECT),
+ // VGG16 / VGG19
+ ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U), PadStrideInfo(1U, 1U, 1U, 1U), DataLayout::NCHW), ConvolutionMethod::DIRECT),
+ // Mobilenet 224
+ ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW), ConvolutionMethod::GEMM),
+ // Mobilenet 160
+ ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW), ConvolutionMethod::GEMM),
+ // Mobilenet 224
+ ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC), ConvolutionMethod::GEMM),
+ // Mobilenet 160
+ ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC), ConvolutionMethod::GEMM),
+ };
+
+ const auto find_config = [&](ConfigurationMethod c)
+ {
+ const ConvolutionConfiguration config = c.first;
+ const PadStrideInfo info = std::get<3>(config);
+ const DataLayout data_layout = std::get<4>(config);
+
+ return std::get<0>(config) == Size2D(src->dimension(idx_w), src->dimension(idx_h)) && std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h))
+ && std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && info.pad_top() == legacy_pad_stride.pad_top() && info.pad_right() == legacy_pad_stride.pad_right()
+ && info.pad_bottom() == legacy_pad_stride.pad_bottom() && info.pad_left() == legacy_pad_stride.pad_left() && info.stride() == legacy_pad_stride.stride() && (data_layout == src->data_layout());
+ };
+
+ std::vector<ConfigurationMethod>::const_iterator found;
+ if((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end())
+ {
+ return (*found).second;
+ }
+
+ if(dilation != Size2D(1U, 1U))
+ {
+ return ConvolutionMethod::GEMM;
+ }
+ else
+ {
+ if(src->data_layout() == DataLayout::NCHW)
+ {
+ ARM_COMPUTE_ERROR("NCHW not supported");
+ }
+ else
+ {
+ const bool is_direct_valid = bool(ClDirectConv2dKernel::validate(src, weights, nullptr, dst, ClDirectConv2dKernelDescriptor{ conv2d_desc }));
+ const size_t kernel_sz_direct_conv_thr = get_direct_conv_kernel_threshold_nhwc(gpu_target);
+
+ // SRGAN case
+ if((src->dimension(idx_h) > 720U) && (dst->dimension(idx_h) > 720U) && (weights->dimension(idx_h) == 9) && (conv2d_desc.pad.top < 3)
+ && is_direct_valid)
+ {
+ return ConvolutionMethod::DIRECT;
+ }
+
+ // Floating-point case: GeMM/Direct
+ if(is_data_type_float(src->data_type()))
+ {
+ // Get dst shape
+ TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, legacy_pad_stride);
+ const bool is_large_kernel_sz = (weights->dimension(idx_w) >= kernel_sz_direct_conv_thr) && (weights->dimension(idx_h) >= kernel_sz_direct_conv_thr);
+ const bool is_ifm_ge_16 = src->dimension(idx_c) >= 16;
+ const bool is_ofm_lte_8 = weights->dimension(3U) <= 8;
+ const bool workload_gte_8192 = (output_shape[0] * output_shape[1] * output_shape[2]) / 16 >= 8192;
+ const bool is_ifm_gt_ofm = src->dimension(idx_c) > weights->dimension(3U);
+
+ // Direct convolution case
+ if(is_direct_valid)
+ {
+ if((gpu_target == arm_compute::GPUTarget::G71 || gpu_target == arm_compute::GPUTarget::G72 || gpu_target == arm_compute::GPUTarget::MIDGARD))
+ {
+ if(is_large_kernel_sz && is_ifm_ge_16 && is_ifm_gt_ofm)
+ {
+ return ConvolutionMethod::DIRECT;
+ }
+ }
+ else
+ {
+ if((is_large_kernel_sz && workload_gte_8192 && is_ifm_ge_16) || (is_ofm_lte_8 && is_ifm_ge_16))
+ {
+ return ConvolutionMethod::DIRECT;
+ }
+ }
+ }
+
+ // Default case
+ return ConvolutionMethod::GEMM;
+ }
+
+ // Generic case for quantized. Only GeMM
+ return ConvolutionMethod::GEMM;
+ }
+ }
+ return ConvolutionMethod::DIRECT;
+}
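+// In summary (NHWC, float only): a known network configuration forces its listed method; dilation != 1 falls
+// back to GEMM; the SRGAN-like large-input case and large kernels with enough input channels (the threshold is
+// target-dependent, see get_direct_conv_kernel_threshold_nhwc) pick DIRECT; everything else defaults to GEMM.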
+
+Status Conv2dContent::translate(ClKernelGraph &kernel_graph) const
+{
+ const auto input = _tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ const auto weight = _tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ const auto dst = _tensors.get_const_tensor(TensorType::ACL_DST_0);
+ const auto method = forced_method_enabled ? forced_method : Conv2dContent::select_conv_method(input->desc, weight->desc, dst->desc, desc, CLScheduler::get().target());
+ switch(method)
+ {
+ case ConvolutionMethod::DIRECT:
+ {
+ return translate_direct_conv2d(kernel_graph);
+ }
+ default:
+ {
+ ARM_COMPUTE_RETURN_ERROR_MSG("Not implemented");
+ }
+ }
+ return Status{};
+}
+Status Conv2dContent::translate_direct_conv2d(ClKernelGraph &kernel_graph) const
+{
+ const auto input = _tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ const auto weight = _tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ const auto bias = _tensors.get_const_tensor(TensorType::ACL_SRC_2);
+ const auto dst = _tensors.get_const_tensor(TensorType::ACL_DST_0);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weight, dst);
+
+ ITensorDescPack<ClKernelTensor> tensors;
+
+ DependencyGraph::Id input_id;
+ auto st = add_kernel_tensor(kernel_graph, *_graph, *input, input_id);
+ ARM_COMPUTE_RETURN_ON_ERROR(st);
+ tensors.add_const_tensor(ACL_SRC_0, kernel_graph.get_tensor(input_id));
+
+ DependencyGraph::Id weight_id;
+ st = add_kernel_tensor(kernel_graph, *_graph, *weight, weight_id);
+ ARM_COMPUTE_RETURN_ON_ERROR(st);
+ tensors.add_const_tensor(ACL_SRC_1, kernel_graph.get_tensor(weight_id));
+
+ if(bias != nullptr)
+ {
+ DependencyGraph::Id bias_id;
+ st = add_kernel_tensor(kernel_graph, *_graph, *bias, bias_id);
+ ARM_COMPUTE_RETURN_ON_ERROR(st);
+ tensors.add_const_tensor(ACL_SRC_2, kernel_graph.get_tensor(bias_id));
+ }
+
+ DependencyGraph::Id dst_id;
+ st = add_kernel_tensor(kernel_graph, *_graph, *dst, dst_id);
+ ARM_COMPUTE_RETURN_ON_ERROR(st);
+ tensors.add_const_tensor(ACL_DST_0, kernel_graph.get_tensor(dst_id));
+
+ DependencyGraph::Id direct_conv2d_id;
+ const auto kernel_desc = ClDirectConv2dKernelDescriptor{ desc };
+
+ st = ClDirectConv2dKernel::validate(input->desc, weight->desc, bias == nullptr ? nullptr : bias->desc, dst->desc, kernel_desc);
+ ARM_COMPUTE_RETURN_ON_ERROR(st);
+
+ ClKernelConfig config{ UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }, TileDescriptor{}, StoreType::TStoreIndirectWidthSelect };
+ st = kernel_graph.add_kernel<ClDirectConv2dKernel>(config, kernel_desc, tensors, direct_conv2d_id);
+ ARM_COMPUTE_RETURN_ON_ERROR(st);
+ ARM_COMPUTE_UNUSED(direct_conv2d_id);
+
+ return Status{};
+}
+
+Status AddContent::translate(ClKernelGraph &kernel_graph) const
+{
+ const auto lhs = _tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ const auto rhs = _tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ const auto dst = _tensors.get_const_tensor(TensorType::ACL_DST_0);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
+
+ ITensorDescPack<ClKernelTensor> tensors;
+
+ DependencyGraph::Id lhs_id;
+ auto st = add_kernel_tensor(kernel_graph, *_graph, *lhs, lhs_id);
+ ARM_COMPUTE_RETURN_ON_ERROR(st);
+ tensors.add_const_tensor(ACL_SRC_0, kernel_graph.get_tensor(lhs_id));
+
+ DependencyGraph::Id rhs_id;
+ st = add_kernel_tensor(kernel_graph, *_graph, *rhs, rhs_id);
+ ARM_COMPUTE_RETURN_ON_ERROR(st);
+ tensors.add_const_tensor(ACL_SRC_1, kernel_graph.get_tensor(rhs_id));
+
+ DependencyGraph::Id dst_id;
+ st = add_kernel_tensor(kernel_graph, *_graph, *dst, dst_id);
+ ARM_COMPUTE_RETURN_ON_ERROR(st);
+ tensors.add_const_tensor(ACL_DST_0, kernel_graph.get_tensor(dst_id));
+
+ DependencyGraph::Id add_id;
+ ClKernelConfig config{ UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }, TileDescriptor{}, StoreType::TStoreIndirectWidthSelect };
+
+ st = ClAddKernel::validate(lhs->desc, rhs->desc, dst->desc);
+ ARM_COMPUTE_RETURN_ON_ERROR(st);
+
+ st = kernel_graph.add_kernel<ClAddKernel>(config, ClEltwiseAddKernelDescriptor{ desc }, tensors, add_id);
+ ARM_COMPUTE_RETURN_ON_ERROR(st);
+ ARM_COMPUTE_UNUSED(add_id);
+
+ return Status{};
+}
+
+std::vector<const OperatorContent *> traverse(const OperatorGraph::Implementation &graph)
+{
+ std::vector<const OperatorContent *> ops;
+ const auto sorted = graph.graph.topological_sort();
+ for(const auto &pack : sorted.second)
+ {
+ ops.push_back(graph.operators.at(pack.op).get());
+ }
+ return ops;
+}
+
+std::vector<OperatorContent *> traverse(OperatorGraph::Implementation &graph)
+{
+ std::vector<OperatorContent *> ops;
+ const auto sorted = graph.graph.topological_sort();
+ for(const auto &pack : sorted.second)
+ {
+ ops.push_back(graph.operators.at(pack.op).get());
+ }
+ return ops;
+}
+
+Status translate(ClKernelGraph &kernel_graph, const OperatorGraph::Implementation &op_graph)
+{
+ for(const auto &op : traverse(op_graph))
+ {
+ const auto st = op->translate(kernel_graph);
+ ARM_COMPUTE_RETURN_ON_ERROR(st);
+ }
+ return Status{};
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute \ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.h b/src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.h
new file mode 100644
index 0000000000..c33e189797
--- /dev/null
+++ b/src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.h
@@ -0,0 +1,229 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
+#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_OPERATORGRAPHIMPL
+#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_OPERATORGRAPHIMPL
+
+#include "arm_compute/core/experimental/ClWorkload.h"
+#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ITensorDescPack.h"
+
+#include "support/Cast.h"
+#include "support/DeepCopy.h"
+
+#include <map>
+#include <tuple>
+#include <type_traits>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+enum class OperatorComplexity
+{
+ Complex = 0,
+ Simple
+};
+
+struct ClKernelGraph;
+struct OpTensorContent
+{
+public:
+ using Id = DependencyGraph::Id;
+ OpTensorContent() = default;
+ OpTensorContent(Id id)
+ : id{ id }, desc{}
+ {
+ }
+ OpTensorContent(Id id, ITensorInfo *desc)
+ : id{ id }, desc{ desc }
+ {
+ }
+ ~OpTensorContent() = default;
+ OpTensorContent(const OpTensorContent &) = default;
+ OpTensorContent &operator=(const OpTensorContent &) = default;
+ OpTensorContent(OpTensorContent &&) = default;
+ OpTensorContent &operator=(OpTensorContent &&) = default;
+ bool operator==(const OpTensorContent &other) const
+ {
+ return desc == other.desc;
+ }
+
+ const ITensorInfo *get_tensor_info() const
+ {
+ return desc;
+ }
+ ITensorInfo *get_tensor_info()
+ {
+ return desc;
+ }
+
+ Id id{};
+ ITensorInfo *desc{};
+};
+
+struct OperatorContent
+{
+public:
+ using Id = DependencyGraph::Id;
+ OperatorContent() = default;
+ OperatorContent(const OperatorGraph::Implementation *graph, Id id, const ITensorDescPack<OpTensorContent> &tensors)
+ : _graph{ graph }, _id{ id }, _tensors{ tensors }
+ {
+ }
+ OperatorContent(const OperatorContent &op) = default;
+ OperatorContent &operator=(const OperatorContent &op) = default;
+ OperatorContent(OperatorContent &&op) = default;
+ OperatorContent &operator=(OperatorContent &&op) = default;
+ virtual ~OperatorContent() = default;
+ virtual OperatorComplexity complexity() const = 0;
+ virtual bool operator==(const OperatorContent &other) const = 0;
+ virtual Status translate(ClKernelGraph &kernel_graph) const = 0;
+
+protected:
+ const OperatorGraph::Implementation *_graph {};
+ Id _id{};
+ ITensorDescPack<OpTensorContent> _tensors{};
+};
+
+struct Conv2dContent : public OperatorContent
+{
+public:
+ Conv2dContent() = default;
+ Conv2dContent(const OperatorGraph::Implementation *graph, Id id, const Conv2dDescriptor &desc, const ITensorDescPack<OpTensorContent> &tensors)
+ : OperatorContent(graph, id, tensors), desc(desc), forced_method(), forced_method_enabled(false)
+ {
+ }
+ // Temporary. The ConvolutionMethod should not need to be passed explicitly
+ Conv2dContent(const OperatorGraph::Implementation *graph, Id id, const Conv2dDescriptor &desc, const ITensorDescPack<OpTensorContent> &tensors, ConvolutionMethod method)
+ : OperatorContent(graph, id, tensors), desc(desc), forced_method(method), forced_method_enabled(true)
+ {
+ }
+ ~Conv2dContent() = default;
+ Conv2dContent(const Conv2dContent &) = default;
+ Conv2dContent &operator=(const Conv2dContent &) = default;
+ Conv2dContent(Conv2dContent &&) = default;
+ Conv2dContent &operator=(Conv2dContent &&) = default;
+ bool operator==(const OperatorContent &other) const override;
+ OperatorComplexity complexity() const override
+ {
+ return OperatorComplexity::Complex;
+ }
+ void set_method(ConvolutionMethod method)
+ {
+ forced_method_enabled = true;
+ forced_method = method;
+ }
+
+ Status translate(ClKernelGraph &kernel_graph) const override;
+ /** Replicate the heuristics of @ref ClConv2d::get_convolution_method(), except that unsupported data types and data layouts are removed from the heuristics
+ *
+ * @param[in] src         Source (input) tensor info
+ * @param[in] weights     Weights tensor info
+ * @param[in] dst         Destination (output) tensor info
+ * @param[in] conv2d_desc Convolution descriptor
+ * @param[in] gpu_target  Target GPU
+ *
+ * @return ConvolutionMethod
+ */
+ static ConvolutionMethod select_conv_method(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const Conv2dDescriptor &conv2d_desc, const GPUTarget gpu_target);
+
+ Conv2dDescriptor desc{};
+ ConvolutionMethod forced_method{ ConvolutionMethod::GEMM_CONV2D };
+ bool forced_method_enabled{ false };
+
+private:
+ Status translate_direct_conv2d(ClKernelGraph &kernel_graph) const;
+};
+
+class AddContent : public OperatorContent
+{
+public:
+ AddContent() = default;
+ AddContent(const OperatorGraph::Implementation *graph, Id id, const AddDescriptor &desc, const ITensorDescPack<OpTensorContent> &tensors)
+ : OperatorContent(graph, id, tensors), desc(desc)
+ {
+ }
+ ~AddContent() = default;
+ AddContent(const AddContent &) = default;
+ AddContent &operator=(const AddContent &) = default;
+ AddContent(AddContent &&) = default;
+ AddContent &operator=(AddContent &&) = default;
+ bool operator==(const OperatorContent &other) const override;
+ OperatorComplexity complexity() const override
+ {
+ return OperatorComplexity::Simple;
+ }
+ Status translate(ClKernelGraph &kernel_graph) const override;
+
+private:
+ AddDescriptor desc{};
+};
+
+struct OperatorGraph::Implementation
+{
+public:
+ template <typename ContentT, typename... Args>
+ void add_node(Operator::Id id, Args &&... args)
+ {
+ operators[id] = utils::memory::make_deep_unique<OperatorContent, ContentT>(this, id, std::forward<Args>(args)...);
+ }
+
+ template <typename... Args>
+ void add_tensor(OpTensor::Id id, Args &&... args)
+ {
+ tensors[id] = utils::memory::make_deep_unique<OpTensorContent, OpTensorContent>(id, std::forward<Args>(args)...);
+ }
+
+ using Dependency = DependencyGraph;
+ using OperatorMap = std::map<Operator::Id, utils::memory::deep_unique_ptr<OperatorContent>>;
+ using OpTensorMap = std::map<OpTensor::Id, utils::memory::deep_unique_ptr<OpTensorContent>>;
+
+ Implementation() = default;
+ ~Implementation() = default;
+
+ friend bool operator==(const OperatorGraph::Implementation &graph0, const OperatorGraph::Implementation &graph1)
+ {
+ return graph0.graph == graph1.graph && graph0.operators == graph1.operators && graph0.tensors == graph1.tensors;
+ }
+
+ Dependency graph{};
+ OperatorMap operators{};
+ OpTensorMap tensors{};
+ Status status{};
+};
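+// Illustrative sketch of how a frontend operator-adding function is expected to populate this implementation
+// (the ids, descriptor and tensor pack below are assumptions):
+//
+//   Implementation &impl = *op_graph.impl();
+//   impl.add_tensor(t_id, &tensor_info);                           // keep the ITensorInfo for tensor t_id
+//   const auto op_id = impl.graph.add_operator({ lhs_id, rhs_id }, { dst_id }).second;
+//   impl.add_node<AddContent>(op_id, AddDescriptor{}, op_tensors);  // op_tensors: ITensorDescPack<OpTensorContent>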
+
+std::vector<const OperatorContent *> traverse(const OperatorGraph::Implementation &graph);
+
+std::vector<OperatorContent *> traverse(OperatorGraph::Implementation &graph);
+
+Status translate(ClKernelGraph &kernel_graph, const OperatorGraph::Implementation &op_graph);
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_OPERATORGRAPHIMPL \ No newline at end of file
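For orientation, a minimal sketch of how the free functions declared in this header could be driven. `translate_all` is a hypothetical helper (not part of the patch) and it assumes traverse() yields the operators in a valid topological order:

Status translate_all(ClKernelGraph &kernel_graph, const OperatorGraph::Implementation &op_graph)
{
    for(const OperatorContent *op : traverse(op_graph))
    {
        // Each concrete content type (Conv2dContent, AddContent) lowers itself into the kernel graph
        ARM_COMPUTE_RETURN_ON_ERROR(op->translate(kernel_graph));
    }
    return Status{};
}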
diff --git a/src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.cpp b/src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.cpp
index 472cfb9df0..6c8e4abde7 100644
--- a/src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.cpp
+++ b/src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.cpp
@@ -21,13 +21,18 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
#include "src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.h"
+
#include "arm_compute/core/CL/ICLTensor.h"
#include "src/core/CL/CLUtils.h"
+#include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h"
#include "src/gpu/cl/ClKernelLibrary.h"
+#include "support/Cast.h"
namespace arm_compute
{
namespace experimental
@@ -57,81 +62,88 @@ void ClCompositeKernel::configure(const ClCompileContext &compile_ctx, const ClK
_arguments = cl_code.arguments;
}
-inline void ClCompositeKernel::add_tensor_argument(unsigned int &idx, const ClKernelArgRuntimeDescriptor &arg, ICLTensor *tensor, const Window &arg_slice)
+inline void ClCompositeKernel::add_tensor_argument(unsigned int &idx, const ClKernelArgDescriptor &arg, const ICLTensor *tensor, const Window &arg_slice, std::vector<cl::Image2D> &cl_images)
{
switch(arg.tensor_arg_type)
{
- case TensorArgType::Scalar:
+ case ClKernelTensorArgType::Scalar:
{
ARM_COMPUTE_ERROR("Unsupported yet");
break;
}
- case TensorArgType::Vector:
+
+ case ClKernelTensorArgType::Vector:
{
add_1D_tensor_argument(idx, tensor, arg_slice);
break;
}
- case TensorArgType::Image:
+ case ClKernelTensorArgType::Image:
{
add_2D_tensor_argument(idx, tensor, arg_slice);
break;
}
- case TensorArgType::Image_Reinterpret_As_3D:
+ case ClKernelTensorArgType::Image_Reinterpret_As_3D:
{
add_2D_tensor_argument(idx, tensor, arg_slice);
const unsigned int total_cross_plane_pad = tensor->info()->padding().top + tensor->info()->padding().bottom;
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(total_cross_plane_pad));
break;
}
- case TensorArgType::Image_Export_To_ClImage2D:
+ case ClKernelTensorArgType::Image_Export_To_ClImage2D:
{
const TensorShape shape2d(tensor->info()->dimension(0) / 4, tensor->info()->dimension(1) * tensor->info()->dimension(2) * tensor->info()->dimension(3));
const size_t image_row_pitch = tensor->info()->strides_in_bytes()[1];
cl::Image2D tensor_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), shape2d, tensor->info()->data_type(), image_row_pitch);
+ cl_images.push_back(tensor_image2d);
_kernel.setArg(idx++, tensor_image2d);
break;
}
- case TensorArgType::Image_3D:
+
+ case ClKernelTensorArgType::Image_3D:
{
add_2D_tensor_argument(idx, tensor, arg_slice);
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(tensor->info()->strides_in_bytes()[2]));
break;
}
- case TensorArgType::Image_3D_Export_To_ClImage2D:
+ case ClKernelTensorArgType::Image_3D_Export_To_ClImage2D:
{
const TensorShape shape2d(tensor->info()->dimension(0) / 4, tensor->info()->dimension(1) * tensor->info()->dimension(2) * tensor->info()->dimension(3));
const size_t image_row_pitch = tensor->info()->strides_in_bytes()[1];
cl::Image2D tensor_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), shape2d, tensor->info()->data_type(), image_row_pitch);
+ cl_images.push_back(tensor_image2d);
_kernel.setArg(idx++, tensor_image2d);
_kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(tensor->info()->strides_in_bytes()[2]));
break;
}
- case TensorArgType::Tensor_3D:
+
+ case ClKernelTensorArgType::Tensor_3D:
{
add_3D_tensor_argument(idx, tensor, arg_slice);
break;
}
- case TensorArgType::Tensor_4D:
+
+ case ClKernelTensorArgType::Tensor_4D:
{
add_4D_tensor_argument(idx, tensor, arg_slice);
break;
}
- case TensorArgType::Tensor_4D_t_Buffer:
+ case ClKernelTensorArgType::Tensor_4D_t_Buffer:
{
add_4d_tensor_nhwc_argument(idx, tensor);
break;
}
- case TensorArgType::Tensor_4D_t_Image:
+ case ClKernelTensorArgType::Tensor_4D_t_Image:
{
const size_t image_w = tensor->info()->dimension(0) / 4;
const size_t image_h = tensor->info()->tensor_shape().total_size_upper(1);
const size_t image_stride_y = tensor->info()->strides_in_bytes()[1];
- cl::Image2D tensor_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(),
- TensorShape(image_w, image_h), tensor->info()->data_type(), image_stride_y);
+ cl::Image2D tensor_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(),
+ TensorShape(image_w, image_h), tensor->info()->data_type(), image_stride_y);
+ cl_images.push_back(tensor_image2d);
- _kernel.setArg(idx++, tensor_cl_image);
+ _kernel.setArg(idx++, tensor_image2d);
add_4d_tensor_nhwc_argument(idx, tensor);
break;
}
@@ -142,7 +154,7 @@ inline void ClCompositeKernel::add_tensor_argument(unsigned int &idx, const ClKe
}
}
-void ClCompositeKernel::run_composite_op(TensorBinding &tensors, const Window &window, cl::CommandQueue &queue, const ClExecutionDescriptor &exec_desc)
+void ClCompositeKernel::run_composite_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue, const ClExecutionDescriptor &exec_desc)
{
ARM_COMPUTE_UNUSED(exec_desc);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
@@ -160,17 +172,21 @@ void ClCompositeKernel::run_composite_op(TensorBinding &tensors, const Window &w
{
// Set kernel arguments
Window arg_slice = slice;
- for(auto arg : _arguments)
+ // CLImages created from tensor arguments. Need to be retained until enqueue
+ std::vector<cl::Image2D> cl_images;
+ for(auto id_arg : _arguments)
{
- auto tensor = tensors._binding.at(arg.arg_id);
+ const auto arg = id_arg.second;
+ auto tensor = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(arg.arg_id));
ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(tensor->info());
if(!arg.slide_along_dimz)
{
// The stride_z for matrix must be zero if we do not slice
ARM_COMPUTE_ERROR_ON(tensor->info()->strides_in_bytes()[3] != 0);
arg_slice = slice_fixed_z;
}
- add_tensor_argument(idx, arg, tensor, arg_slice);
+ add_tensor_argument(idx, arg, tensor, arg_slice, cl_images);
}
// Dispatch kernel
@@ -180,12 +196,6 @@ void ClCompositeKernel::run_composite_op(TensorBinding &tensors, const Window &w
while(!exec_desc.skip_sliding_window && window.slide_window_slice_3D(slice));
}
-Status bind_arguments(ITensorPack &, const ClKernelCode &, const TensorBinding &)
-{
- return Status{};
-}
} // namespace dynamic_fusion
} // namespace experimental
-} // namespace arm_compute
-
-#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) \ No newline at end of file
+} // namespace arm_compute \ No newline at end of file
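The per-slice std::vector<cl::Image2D> introduced above exists only to keep the OpenCL image handles alive between setArg() and the enqueue. A minimal sketch of that retention pattern, where `kernel`, `idx`, `buffer`, `shape2d` and `row_pitch` are placeholders for the objects already in scope in add_tensor_argument():

std::vector<cl::Image2D> cl_images; // must outlive the enqueue of the kernel
cl::Image2D img = create_image2d_from_buffer(CLKernelLibrary::get().context(), buffer, shape2d, DataType::F32, row_pitch);
cl_images.push_back(img);           // retain the handle past setArg()
kernel.setArg(idx++, img);
// ... enqueue the kernel before cl_images goes out of scope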
diff --git a/src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.h b/src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.h
index 19efb505eb..bf70d6a226 100644
--- a/src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.h
+++ b/src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.h
@@ -21,13 +21,14 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLCOMPOSITEKERNEL_H
#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLCOMPOSITEKERNEL_H
-#include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h"
-
+#include "arm_compute/core/experimental/ClWorkload.h"
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/IClKernel.h"
@@ -37,47 +38,40 @@ namespace experimental
{
namespace dynamic_fusion
{
-struct TensorBinding
-{
- TensorBinding(const std::map<ArgumentID, ICLTensor *> binding)
- : _binding{ binding }
- {
- }
- bool empty() const
- {
- return _binding.empty();
- }
- std::map<ArgumentID, ICLTensor *> _binding;
-};
-class ClCompositeKernel : public opencl::IClKernel
+struct ClExecutionDescriptor;
+struct ClKernelCode;
+
+class ClCompositeKernel final : public opencl::IClKernel
{
public:
void configure(const opencl::ClCompileContext &, const ClKernelCode &);
/** Run the composite kernel
+ * @note The slots / keys in ITensorPack are the argument Ids of the tensors in blueprint
*
- * @param tensors TensorBinding object containing run-time tensors information
+ * @param tensors ITensorPack object containing run-time tensor memories
* @param window Execution window
* @param queue OpenCL Command queue
* @param exec_desc Descriptor containing execution information
*/
- virtual void run_composite_op(TensorBinding &tensors, const Window &window, cl::CommandQueue &queue, const ClExecutionDescriptor &exec_desc) override;
+ virtual void run_composite_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue, const ClExecutionDescriptor &exec_desc) override;
private:
- inline void add_tensor_argument(unsigned int &idx, const ClKernelArgRuntimeDescriptor &arg, ICLTensor *tensor, const Window &arg_slice);
+ /** Set a kernel tensor argument
+ *
+ * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
+ * @param[in] arg Kernel argument descriptor accompanying @p tensor
+ * @param[in] tensor Tensor to set as an argument of the object's kernel.
+ * @param[in] arg_slice Window the kernel will be run on.
+ * @param[out] cl_images Extra cl images created from the tensor (will need to be retained until the kernel is enqueued)
+ */
+ inline void add_tensor_argument(unsigned int &idx, const ClKernelArgDescriptor &arg, const ICLTensor *tensor, const Window &arg_slice, std::vector<cl::Image2D> &cl_images);
private:
ClKernelArgList _arguments{}; /** All kernel arguments required by runtime */
};
-/** Argument Binding.
- * Tensor Arguments to ICLKernel run_op method need to be passed via an ITensorPack. So the bind_arguments is essentially a converter from TensorBinding to ITensorPack
- */
-Status bind_arguments(ITensorPack &tensor_pack, const ClKernelCode &, const TensorBinding &);
-
} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute
-#endif // ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLCOMPOSITEKERNEL_H
-
-#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) \ No newline at end of file
+#endif // ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLCOMPOSITEKERNEL_H \ No newline at end of file
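As the @note above states, the keys of the ITensorPack passed to run_composite_op() are the blueprint argument ids. A hedged usage sketch follows; src_arg_id, wei_arg_id, dst_arg_id and the CLTensor objects are placeholders obtained when the blueprint was built:

ITensorPack tensors{ { src_arg_id, &src }, { wei_arg_id, &wei }, { dst_arg_id, &dst } };
composite_kernel.run_composite_op(tensors, composite_kernel.window(), CLScheduler::get().queue(), ClExecutionDescriptor{});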
diff --git a/src/gpu/cl/operators/experimental/dynamic_fusion/ClCompositeOperator.cpp b/src/gpu/cl/operators/experimental/dynamic_fusion/ClCompositeOperator.cpp
new file mode 100644
index 0000000000..984de74249
--- /dev/null
+++ b/src/gpu/cl/operators/experimental/dynamic_fusion/ClCompositeOperator.cpp
@@ -0,0 +1,242 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
+#include "arm_compute/runtime/experimental/ClCompositeOperator.h"
+
+#include "arm_compute/core/experimental/ClWorkload.h"
+#include "arm_compute/core/experimental/Types.h"
+#include "src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.h"
+#include "support/Cast.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+namespace
+{
+Status add_tensor_to_tensor_pack(int wk_tensor_id, ICLTensor *tensor, const ClWorkload &workload, TensorPackMap &prepare_pack_map, TensorPackMap &run_pack_map)
+{
+ if(tensor == nullptr)
+ {
+ return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Trying to add a nullptr into the tensor packs");
+ }
+ const auto bp_tensor_id = workload.tensors.at(wk_tensor_id).kernel_arg.arg_id; // blueprint tensor id
+ std::vector<ClWorkload::UnitWorkId> uwk_ids{};
+ const auto src_uwk_ids = workload.graph.src_ops_from_tensor(wk_tensor_id);
+ const auto dst_uwk_ids = workload.graph.dst_ops_from_tensor(wk_tensor_id);
+ uwk_ids.insert(uwk_ids.end(), src_uwk_ids.begin(), src_uwk_ids.end());
+ uwk_ids.insert(uwk_ids.end(), dst_uwk_ids.begin(), dst_uwk_ids.end());
+
+ for(auto uwk_id : uwk_ids)
+ {
+ TensorPackMap *pack_map = nullptr;
+ const auto uwk_stage = workload.unit_workloads.at(uwk_id).stage.stage;
+ switch(uwk_stage)
+ {
+ case UnitWorkloadStage::Stage::Run:
+ pack_map = &run_pack_map;
+ break;
+ case UnitWorkloadStage::Stage::Prepare:
+ pack_map = &prepare_pack_map;
+ break;
+ default:
+ return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported workload stage");
+ }
+
+ ITensorPack *tensor_pack = pack_map->find_tensor_pack(uwk_id);
+ if(tensor_pack == nullptr)
+ {
+ pack_map->add_tensor_pack(uwk_id, ITensorPack{ { bp_tensor_id, tensor } });
+ }
+ else
+ {
+ tensor_pack->add_tensor(bp_tensor_id, tensor);
+ }
+ }
+ return Status{};
+}
+
+} // namespace
+
+ITensorPack *TensorPackMap::find_tensor_pack(UnitWorkload::Id uwk_id)
+{
+ auto tensor_pack = _tensor_packs.find(uwk_id);
+ if(tensor_pack != _tensor_packs.end())
+ {
+ return &(tensor_pack->second);
+ }
+ return nullptr;
+}
+
+ITensorPack &TensorPackMap::get_tensor_pack(UnitWorkload::Id uwk_id)
+{
+ return _tensor_packs.at(uwk_id);
+}
+
+void TensorPackMap::add_tensor_pack(UnitWorkload::Id uwk_id, const ITensorPack &tensor_pack)
+{
+ _tensor_packs[uwk_id] = tensor_pack;
+}
+
+Status bind_tensors(ClAuxTensorData &aux_tensor_data, TensorPackMap &prepare_pack_map, TensorPackMap &run_pack_map, const ClWorkload &workload, const OpTensorBinding &op_tensors)
+{
+ for(auto tensor : workload.tensors)
+ {
+ const auto wk_tensor_id = tensor.first; // workload tensor id
+ ICLTensor *tensor_object = nullptr;
+ if(tensor.second.memory_type == MemoryType::Core)
+ {
+ const auto op_tensor_id = workload.op_tensor_id_lut.at(wk_tensor_id);
+ auto op_tensor_find = op_tensors.find(op_tensor_id);
+ if(op_tensor_find == op_tensors.end())
+ {
+ return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Cannot find binding for some operator tensor");
+ }
+ tensor_object = utils::cast::polymorphic_downcast<ICLTensor *>(op_tensor_find->second);
+ }
+ else if(tensor.second.memory_type == MemoryType::Auxiliary)
+ {
+ // Create aux tensor CLTensor object
+ const TensorInfo tensor_info = *tensor.second.info;
+ const auto memory_info = tensor.second.memory_info;
+ tensor_object = aux_tensor_data.add_aux_tensor(wk_tensor_id, tensor_info, memory_info);
+ }
+ else
+ {
+ return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported tensor memory type");
+ }
+
+ const auto st = add_tensor_to_tensor_pack(wk_tensor_id, tensor_object, workload, prepare_pack_map, run_pack_map);
+ ARM_COMPUTE_RETURN_ON_ERROR(st);
+ }
+ return Status{};
+}
+
+CLTensor *ClAuxTensorData::add_aux_tensor(int tensor_id, const ITensorInfo &tensor_info, const AuxMemoryInfo &memory_info)
+{
+ auto find_tensor_pair = _owned_tensors.find(tensor_id);
+    if(find_tensor_pair != _owned_tensors.end())
+    {
+        // A tensor has already been registered for this id: return the existing object
+        return find_tensor_pair->second.get();
+ }
+ else
+ {
+ auto tensor = std::make_unique<CLTensor>();
+ auto inserted_pair = _owned_tensors.emplace(tensor_id, std::move(tensor)).first;
+ auto new_tensor = inserted_pair->second.get();
+ _tensors.emplace_back(new_tensor, tensor_info, memory_info);
+ return new_tensor;
+ }
+}
+
+std::vector<ClAuxTensorData::DataView> &ClAuxTensorData::get_tensors()
+{
+ return _tensors;
+}
+struct ClCompositeOperator::Implementation
+{
+ std::map<UnitWorkload::Id, std::unique_ptr<ClCompositeKernel>> _kernels{};
+ std::map<UnitWorkload::Id, std::unique_ptr<ClCompositeKernel>> _kernels_prep{};
+ ClWorkload _workload{};
+ bool _is_prepared{ false };
+};
+
+ClCompositeOperator::ClCompositeOperator()
+ : _impl{ std::make_unique<Implementation>() }
+{
+}
+
+ClCompositeOperator::~ClCompositeOperator() = default;
+
+void ClCompositeOperator::configure(const CLCompileContext &ctx, const ClWorkload &workload)
+{
+ ARM_COMPUTE_ERROR_THROW_ON(ClCompositeOperator::validate(workload));
+ _impl->_workload = workload;
+
+ // Traverse workloads in topological order
+ const auto sorted = workload.graph.topological_sort().second;
+ for(const auto &node : sorted)
+ {
+ auto work = workload.unit_workloads.at(node.op);
+ auto stage = work.stage.stage;
+ auto k = std::make_unique<ClCompositeKernel>();
+ k->configure(ctx, work.code);
+
+ switch(stage)
+ {
+ case UnitWorkloadStage::Stage::Run:
+ _impl->_kernels.emplace(work.id, std::move(k));
+ break;
+ case UnitWorkloadStage::Stage::Prepare:
+ _impl->_kernels_prep.emplace(work.id, std::move(k));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Invalid stage");
+ }
+ }
+}
+
+Status ClCompositeOperator::validate(const ClWorkload &workload)
+{
+ return workload.status;
+}
+
+void ClCompositeOperator::prepare(TensorPackMap &tensor_pack_map)
+{
+ if(!_impl->_is_prepared)
+ {
+ for(auto &id_kernel_pair : _impl->_kernels_prep)
+ {
+ const bool flush_queue = false;
+ const auto uwk_id = id_kernel_pair.first;
+ auto kernel = id_kernel_pair.second.get();
+ CLScheduler::get().enqueue_op(*kernel, tensor_pack_map.get_tensor_pack(uwk_id), ClExecutionDescriptor{}, flush_queue);
+ }
+
+ _impl->_is_prepared = true;
+ }
+}
+
+void ClCompositeOperator::run(TensorPackMap &tensor_pack_map)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(!_impl->_is_prepared, "Operator is not prepared");
+
+ for(auto &id_kernel_pair : _impl->_kernels)
+ {
+        // The queue is not flushed here; the caller is expected to flush or sync (e.g. via CLScheduler::get().sync()) after the last kernel
+        const bool flush_queue = false;
+ const auto uwk_id = id_kernel_pair.first;
+ auto kernel = id_kernel_pair.second.get();
+ CLScheduler::get().enqueue_op(*kernel, tensor_pack_map.get_tensor_pack(uwk_id), ClExecutionDescriptor{}, flush_queue);
+ }
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute \ No newline at end of file
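Taken together, the pieces above imply the following end-to-end runtime flow. This is only a hedged sketch: `workload` and `op_tensors` are assumed to have been produced beforehand (e.g. from an OperatorGraph, as in the shipped dynamic fusion example):

ClCompositeOperator op;
ARM_COMPUTE_ERROR_THROW_ON(ClCompositeOperator::validate(workload));
op.configure(CLKernelLibrary::get().get_compile_context(), workload);

ClAuxTensorData aux_data{};
TensorPackMap   prepare_packs{};
TensorPackMap   run_packs{};
ARM_COMPUTE_ERROR_THROW_ON(bind_tensors(aux_data, prepare_packs, run_packs, workload, op_tensors));

// Allocate any auxiliary tensors requested by the workload (exposed via aux_data.get_tensors()),
// then execute the prepare-stage and run-stage unit workloads
op.prepare(prepare_packs);
op.run(run_packs);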
diff --git a/src/runtime/CL/CLScheduler.cpp b/src/runtime/CL/CLScheduler.cpp
index 4cff707f1a..26124ed7e9 100644
--- a/src/runtime/CL/CLScheduler.cpp
+++ b/src/runtime/CL/CLScheduler.cpp
@@ -191,7 +191,7 @@ void CLScheduler::enqueue_common(ICLKernel &kernel, ITensorPack &tensors, bool f
#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
-void CLScheduler::enqueue_common(ICLKernel &kernel, experimental::dynamic_fusion::TensorBinding &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc, bool flush)
+void CLScheduler::enqueue_common(ICLKernel &kernel, ITensorPack &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc, bool flush)
{
ARM_COMPUTE_ERROR_ON_MSG(!_is_initialised,
"The CLScheduler is not initialised yet! Please call the CLScheduler::get().default_init(), \
@@ -246,7 +246,7 @@ void CLScheduler::enqueue_op(ICLKernel &kernel, ITensorPack &tensors, bool flush
#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
-void CLScheduler::enqueue_op(ICLKernel &kernel, experimental::dynamic_fusion::TensorBinding &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc, bool flush)
+void CLScheduler::enqueue_op(ICLKernel &kernel, ITensorPack &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc, bool flush)
{
enqueue_common(kernel, tensors, exec_desc, flush);
}
diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp
index 81fe7dbde6..8ce5177847 100644
--- a/src/runtime/CL/CLTuner.cpp
+++ b/src/runtime/CL/CLTuner.cpp
@@ -68,7 +68,7 @@ private:
#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
struct CompositeKernelData : public CLTuner::IKernelData
{
- CompositeKernelData(experimental::dynamic_fusion::TensorBinding &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc)
+ CompositeKernelData(ITensorPack &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc)
: _tensors{ tensors }, _exec_desc{ exec_desc }
{
}
@@ -80,7 +80,7 @@ struct CompositeKernelData : public CLTuner::IKernelData
}
private:
- experimental::dynamic_fusion::TensorBinding &_tensors;
+ ITensorPack &_tensors;
const experimental::dynamic_fusion::ClExecutionDescriptor &_exec_desc;
};
#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
@@ -166,7 +166,7 @@ void CLTuner::tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors)
}
#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
-void CLTuner::tune_kernel_dynamic(ICLKernel &kernel, experimental::dynamic_fusion::TensorBinding &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc)
+void CLTuner::tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc)
{
CompositeKernelData data{ tensors, exec_desc };
diff --git a/support/DeepCopy.h b/support/DeepCopy.h
new file mode 100644
index 0000000000..0117897901
--- /dev/null
+++ b/support/DeepCopy.h
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ARM_COMPUTE_SUPPORT_DEEPCOPY_H
+#define ARM_COMPUTE_SUPPORT_DEEPCOPY_H
+
+#include <functional>
+#include <memory>
+#include <type_traits>
+#include <utility>
+
+namespace arm_compute
+{
+namespace utils
+{
+namespace memory
+{
+namespace
+{
+/** Default polymorphic deep copy function, used by deep_unique_ptr
+ *
+ * @param ptr Potentially polymorphic object to be deep copied
+ *
+ * @return Base* Pointer to the newly allocated deep copy (nullptr if @p ptr is nullptr)
+ */
+template <typename Base, typename Derived>
+Base *default_polymorphic_copy(const Base *ptr)
+{
+    static_assert(std::is_base_of<Base, Derived>::value,
+                  "Derived is not derived from Base");
+ if(ptr == nullptr)
+ {
+ return nullptr;
+ }
+ return new Derived(*static_cast<const Derived *>(ptr));
+}
+} // namespace
+
+/** A deep-copying unique pointer that also supports polymorphic cloning behavior
+ *
+ * @note The == operator compares the dereferenced value instead of the pointer itself.
+ *
+ * @tparam Base Base type
+ */
+template <typename Base>
+class deep_unique_ptr
+{
+public:
+ using CopyFunc = std::function<Base *(const Base *)>;
+
+ deep_unique_ptr(std::nullptr_t val = nullptr) noexcept
+ : _val{ val },
+ _copy{}
+ {
+ }
+ template <typename Derived, typename CopyFuncDerived>
+ deep_unique_ptr(Derived *value, const CopyFuncDerived &copy) noexcept
+ : _val{ value },
+ _copy{ std::move(copy) }
+ {
+        static_assert(std::is_base_of<Base, Derived>::value,
+                      "Derived is not derived from Base");
+ static_assert(
+ std::is_constructible<CopyFunc, CopyFuncDerived>::value,
+ "CopyFuncDerived is not valid for a copy functor");
+ }
+
+ deep_unique_ptr(const deep_unique_ptr<Base> &ptr)
+ : deep_unique_ptr(ptr.clone())
+ {
+ }
+ deep_unique_ptr &operator=(const deep_unique_ptr<Base> &ptr)
+ {
+ deep_unique_ptr<Base> tmp(ptr);
+ swap(*this, tmp);
+ return *this;
+ }
+
+ deep_unique_ptr(deep_unique_ptr<Base> &&ptr) = default;
+ deep_unique_ptr &operator=(deep_unique_ptr<Base> &&ptr) = default;
+ ~deep_unique_ptr() = default;
+ friend void swap(deep_unique_ptr &ptr0, deep_unique_ptr<Base> &ptr1) noexcept
+ {
+ using std::swap;
+ swap(ptr0._val, ptr1._val);
+ swap(ptr0._copy, ptr1._copy);
+ }
+ Base &operator*() noexcept
+ {
+ return *_val;
+ }
+
+ const Base &operator*() const noexcept
+ {
+ return *_val;
+ }
+
+ Base *operator->() noexcept
+ {
+ return _val.operator->();
+ }
+
+ const Base *operator->() const noexcept
+ {
+ return _val.operator->();
+ }
+
+ Base *get() noexcept
+ {
+ return _val.get();
+ }
+ const Base *get() const noexcept
+ {
+ return _val.get();
+ }
+
+ explicit operator bool() const noexcept
+ {
+ return static_cast<bool>(_val);
+ }
+
+ bool operator==(const deep_unique_ptr<Base> &rhs) const
+ {
+ if(rhs.get() == nullptr && _val == nullptr)
+ {
+ return true;
+ }
+ else if(rhs.get() == nullptr || _val == nullptr)
+ {
+ return false;
+ }
+ else
+ {
+ return (*_val == *rhs);
+ }
+ }
+
+private:
+ deep_unique_ptr clone() const
+ {
+ return { _copy(_val.get()), CopyFunc(_copy) };
+ }
+ std::unique_ptr<Base> _val{ nullptr };
+ CopyFunc _copy{};
+};
+
+/** Utility function to create a polymorphic deep-copying unique pointer
+ *
+ * @tparam Base     Base type exposed by the returned pointer
+ * @tparam Derived  Concrete type of the owned object
+ * @tparam CopyFunc Type of the copy functor
+ *
+ * @param temp Object to be moved into the owned storage
+ * @param copy Copy functor used to perform deep copies
+ *
+ * @return deep_unique_ptr<Base>
+ */
+template <typename Base, typename Derived, typename CopyFunc>
+deep_unique_ptr<Base> make_deep_unique(Derived &&temp, CopyFunc copy)
+{
+ return
+ {
+ new Derived(std::move(temp)),
+ CopyFunc{ std::move(copy) }
+ };
+}
+
+template <typename Base, typename Derived>
+deep_unique_ptr<Base> make_deep_unique(Derived &&temp)
+{
+    static_assert(std::is_base_of<Base, Derived>::value,
+                  "Derived is not derived from Base");
+
+ return make_deep_unique<Base, Derived>(
+ std::move(temp), default_polymorphic_copy<Base, Derived>);
+}
+
+template <typename Base, typename Derived, typename... Args>
+deep_unique_ptr<Base> make_deep_unique(Args &&... args)
+{
+ static_assert(std::is_constructible<Derived, Args...>::value,
+ "Cannot instantiate Derived from arguments");
+
+ return make_deep_unique<Base, Derived>(
+ std::move(Derived{ std::forward<Args>(args)... }));
+}
+
+} // namespace memory
+} // namespace utils
+} // namespace arm_compute
+#endif // ARM_COMPUTE_SUPPORT_DEEPCOPY_H \ No newline at end of file
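A minimal illustration of the value semantics documented above. Shape and Circle are hypothetical types used only for this sketch, not part of the library:

#include "support/DeepCopy.h"

struct Shape
{
    virtual ~Shape() = default;
    virtual bool operator==(const Shape &other) const = 0;
};
struct Circle final : public Shape
{
    explicit Circle(int r) : radius(r) {}
    bool operator==(const Shape &other) const override
    {
        const auto *c = dynamic_cast<const Circle *>(&other);
        return c != nullptr && c->radius == radius;
    }
    int radius;
};

void deep_unique_ptr_example()
{
    using namespace arm_compute::utils::memory;
    deep_unique_ptr<Shape> a = make_deep_unique<Shape, Circle>(3); // owns a Circle
    deep_unique_ptr<Shape> b = a;                                  // deep copy: b owns its own Circle
    const bool same_value = (a == b);                              // true: compares the pointees, not the pointers
    (void)same_value;
}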
diff --git a/tests/SConscript b/tests/SConscript
index 62fa4fce11..87907f40fc 100644
--- a/tests/SConscript
+++ b/tests/SConscript
@@ -281,6 +281,20 @@ if test_env['benchmark_examples']:
#-Wl,--allow-shlib-undefined: Ignore dependencies of dependencies
prog = test_env.Program(example, [ test_env.Object(source=file, target=example), graph_utils, graph_params]+ files_benchmark_examples, LIBS = test_env["LIBS"] + ["arm_compute_graph"], LINKFLAGS=test_env["LINKFLAGS"]+['-Wl,--allow-shlib-undefined'])
arm_compute_benchmark_examples += [ prog ]
+
+ # Dynamic fusion examples
+ if env['opencl']:
+ if env['experimental_dynamic_fusion']:
+ for file in Glob("%s/dynamic_fusion/*.cpp" % examples_folder):
+ example = "benchmark_" + os.path.basename(os.path.splitext(str(file))[0])
+ if env['os'] in ['android', 'macos', 'bare_metal'] or env['standalone']:
+ prog = test_env.Program(example, [ test_env.Object(source=file, target=example), graph_utils, graph_params]+ files_benchmark_examples, LIBS = test_env["LIBS"], LINKFLAGS=test_env["LINKFLAGS"]+[load_whole_archive, arm_compute_lib, noload_whole_archive] + bm_link_flags + extra_link_flags)
+ arm_compute_benchmark_examples += [ prog ]
+ else:
+ #-Wl,--allow-shlib-undefined: Ignore dependencies of dependencies
+ prog = test_env.Program(example, [ test_env.Object(source=file, target=example), graph_utils, graph_params]+ files_benchmark_examples, LIBS = test_env["LIBS"] + ["arm_compute_graph"], LINKFLAGS=test_env["LINKFLAGS"]+['-Wl,--allow-shlib-undefined'])
+ arm_compute_benchmark_examples += [ prog ]
+
arm_compute_benchmark_examples = install_bin(arm_compute_benchmark_examples)
Depends(arm_compute_benchmark_examples, arm_compute_test_framework)
Depends(arm_compute_benchmark_examples, arm_compute_lib)
diff --git a/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp b/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp
index 9e1b4d897b..a6b09ccdea 100644
--- a/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp
+++ b/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp
@@ -21,9 +21,12 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
#include "src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.h"
+#include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h"
#include "src/core/utils/helpers/float_ops.h"
#include "src/gpu/cl/kernels/ClElementwiseKernel.h"
@@ -42,9 +45,12 @@
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "tests/validation/CL/UNIT/dynamic_fusion/Utils.h"
+
#include <chrono>
using namespace arm_compute::experimental::dynamic_fusion;
+using namespace arm_compute::test::validation::utils;
namespace arm_compute
{
@@ -52,149 +58,12 @@ namespace test
{
namespace validation
{
-namespace
-{
-/** Macros which measures the wall clock time, and records it into a map measurement_map with name clock_name */
-#define TICK(clock_name) \
- auto clock_name##_tick = std::chrono::high_resolution_clock::now();
-#define TOCK(clock_name, measurement_map) \
- auto clock_name##_tock = std::chrono::high_resolution_clock::now(); \
- measurement_map["\"" #clock_name "\""] = duration_cast<microseconds>(clock_name##_tock - clock_name##_tick);
-#define TOCK_AVG(clock_name, measurement_map, num_iterations) \
- auto clock_name##_tock = std::chrono::high_resolution_clock::now(); \
- measurement_map["\"" #clock_name "\""] = duration_cast<microseconds>((clock_name##_tock - clock_name##_tick) / (num_iterations));
-
-template <typename T, typename U>
-void fill(U &&tensor, int seed)
-{
- static_assert(std::is_floating_point<T>::value || std::is_same<T, half>::value, "Only floating point data types supported.");
- using DistributionType = typename std::conditional<std::is_same<T, half>::value, arm_compute::utils::uniform_real_distribution_16bit<T>, std::uniform_real_distribution<T>>::type;
-
- DistributionType distribution{ T(-1.0f), T(1.0f) };
- library->fill(tensor, distribution, seed);
-
- // Fill border with infinity in order to check the presence of NaN values (i.e. inf * 0)
- DistributionType distribution_inf{ T(std::numeric_limits<float>::infinity()), T(std::numeric_limits<float>::infinity()) };
- library->fill_borders_with_garbage(tensor, distribution_inf, seed);
-}
-} // namespace
-
TEST_SUITE(CL)
TEST_SUITE(UNIT)
TEST_SUITE(DYNAMIC_FUSION)
TEST_SUITE(ClCompositeKernel)
TEST_SUITE(Validate)
-TEST_CASE(MoveNet_SubGraph_1_Gemm, framework::DatasetMode::ALL)
-{
- /* Computation:
- * out = add(addend, gemm_native(lhs, rhs, bias)) (non-broadcast)
- */
- const auto data_type = DataType::F32;
- const auto m = 5U;
- const auto n = 4U;
- const auto k = 3U;
- const auto t_lhs_shape = TensorShape(k, m);
- const auto t_rhs_shape = TensorShape(n, k);
- const auto t_dst_shape = TensorShape(n, m);
- auto t_lhs_info = TensorInfo(t_lhs_shape, 1, data_type);
- auto t_rhs_info = TensorInfo(t_rhs_shape, 1, data_type);
- auto t_bias_info = TensorInfo(TensorShape(), 1, DataType::F32);
- auto t_dst_info = TensorInfo(t_dst_shape, 1, data_type);
-
- const ClTensorDescriptor t_lhs_desc{ &t_lhs_info };
- const ClTensorDescriptor t_rhs_desc{ &t_rhs_info };
- const ClTensorDescriptor t_bias_desc{ &t_bias_info };
- const ClTensorDescriptor t_addend_desc{ &t_dst_info };
- const ClTensorDescriptor t_dst_desc{ &t_dst_info };
-
- ClKernelBlueprint bp;
- ArgumentID tid_lhs;
- ArgumentID tid_rhs;
- ArgumentID tid_l0_bias = g_arg_placeholder;
- ArgumentID tid_l1_addend;
- ArgumentID tid_dst;
- auto st = add_tensor_argument(bp, t_lhs_desc, tid_lhs);
- st = add_tensor_argument(bp, t_rhs_desc, tid_rhs);
- st = add_tensor_argument(bp, t_addend_desc, tid_l1_addend);
- st = add_tensor_argument(bp, t_dst_desc, tid_dst);
-
- const auto common_kernel_desc = ClKernelComponentDescriptor{};
- const GemmNativeDescriptor gemm_native_desc{ 1.0, 1.0, m, n, k };
- const GEMMKernelInfo gemm_info{ m, n, k, 0, false, false, false, false, ActivationLayerInfo{}, 1, 1, gemm_native_desc.lhs_info, gemm_native_desc.rhs_info, 0, 0 };
- const EltwiseAddDescriptor eltwise_add_desc{ ConvertPolicy::WRAP };
- const TileDescriptor store_tile_info{ Size2D(gemm_info.rhs_info.n0, gemm_info.lhs_info.m0), Size2D(gemm_info.n, gemm_info.m), ClippingStrategy::TOP_LEFT };
-
- ArgumentID tid_acc;
- st = add_tensor_intermed(bp, tid_acc);
- st = add_kcomp_gemm_native(bp, common_kernel_desc, gemm_native_desc, tid_lhs, tid_rhs, tid_l0_bias, tid_acc);
- st = add_kcomp_eltwise_add(bp, common_kernel_desc, EltwiseAddDescriptor{}, tid_l1_addend, tid_acc, tid_acc);
- st = add_kcomp_store(bp, common_kernel_desc, tid_acc, tid_dst, StoreType::StoreBlockBoundaryAware);
-
- ClKernelCode cl_code;
-
- st = set_tile_info(bp, store_tile_info);
- st = build(cl_code, ClCodeBuilderContext{ GpuInfo{ GPUTarget::G71 } }, bp);
-
- ClExecutionDescriptor exec_desc{};
- st = tune_static(exec_desc, cl_code);
-
- CLScheduler::get().default_reinit();
- ClCompositeKernel kernel;
- kernel.configure(CLKernelLibrary::get().get_compile_context(), cl_code);
-
- // Construct tensors
- CLTensor t_lhs{};
- CLTensor t_rhs{};
- CLTensor t_l1_addend{};
- CLTensor t_dst{};
- // Init tensors
- {
- t_lhs.allocator()->init(t_lhs_info);
- t_rhs.allocator()->init(t_rhs_info);
- t_l1_addend.allocator()->init(t_dst_info);
- t_dst.allocator()->init(t_dst_info);
- }
- // "Pack" tensors
- TensorBinding tensors({ { tid_lhs, &t_lhs },
- { tid_rhs, &t_rhs },
- { tid_l1_addend, &t_l1_addend },
- { tid_dst, &t_dst }
- });
- // Allocate and fill tensors
- {
- t_lhs.allocator()->allocate();
- t_rhs.allocator()->allocate();
- t_l1_addend.allocator()->allocate();
- t_dst.allocator()->allocate();
- fill<float>(CLAccessor(t_lhs), 0);
- fill<float>(CLAccessor(t_rhs), 1);
- fill<float>(CLAccessor(t_l1_addend), 2);
- }
-
- CLScheduler::get().enqueue_op(kernel, tensors, exec_desc, true);
-
- // Create reference
- SimpleTensor<float> ref_t_lhs{ t_lhs_shape, data_type, 1 };
- SimpleTensor<float> ref_t_rhs{ t_rhs_shape, data_type, 1 };
- SimpleTensor<float> ref_t_bias_placeholder{ t_dst_shape, data_type, 1 };
- SimpleTensor<float> ref_t_l1_addend{ t_dst_shape, data_type, 1 };
-
- // Fill reference
- fill<float>(ref_t_lhs, 0);
- fill<float>(ref_t_rhs, 1);
- fill<float>(ref_t_l1_addend, 2);
- const auto ref_t_dst = reference::arithmetic_operation(
- ArithmeticOperation::ADD,
- ref_t_l1_addend,
- reference::gemm(ref_t_lhs, ref_t_rhs, ref_t_bias_placeholder, gemm_native_desc.alpha, 0.f /* To disable bias */),
- data_type,
- eltwise_add_desc.convert_policy);
-
- RelativeTolerance<float> tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */
- validate(CLAccessor(t_dst), ref_t_dst, tolerance_f32);
-}
-
TEST_CASE(MoveNet_SubGraph_1_DirectConv2d, framework::DatasetMode::ALL)
{
/* Computation:
@@ -208,7 +77,7 @@ TEST_CASE(MoveNet_SubGraph_1_DirectConv2d, framework::DatasetMode::ALL)
Status st{};
const auto data_type = DataType::F32;
- const auto conv_info = PadStrideInfo(1U, 1U, 1U, 1U);
+ const auto conv_info = Conv2dDescriptor{ Padding2D{ 1U, 1U, 1U, 1U }, { 1U, 1U } /* stride */ };
const auto width = 7U;
const auto height = 6U;
@@ -216,47 +85,44 @@ TEST_CASE(MoveNet_SubGraph_1_DirectConv2d, framework::DatasetMode::ALL)
const auto OFM = 4U;
const auto kernel_sz = 3U;
- const auto src_shape = TensorShape(IFM, width, height);
- const auto wei_shape = TensorShape(IFM, kernel_sz, kernel_sz, OFM);
- const auto bia_shape = TensorShape(OFM);
- const auto dst_shape = TensorShape(OFM, width, height);
+ const auto src_shape = TensorShape(IFM, width, height);
+ const auto wei_shape = TensorShape(IFM, kernel_sz, kernel_sz, OFM);
+ const auto bia_shape = TensorShape(OFM);
+ const auto addend_shape = TensorShape(1, 1);
+ const auto dst_shape = TensorShape(OFM, width, height);
- auto src_info = TensorInfo(src_shape, 1, data_type, DataLayout::NHWC);
- auto wei_info = TensorInfo(wei_shape, 1, data_type, DataLayout::NHWC);
- auto bia_info = TensorInfo(bia_shape, 1, data_type, DataLayout::NHWC);
- auto dst_info = TensorInfo(dst_shape, 1, data_type, DataLayout::NHWC);
-
- const auto src_desc = ClTensorDescriptor(&src_info);
- const auto wei_desc = ClTensorDescriptor(&wei_info);
- const auto bia_desc = ClTensorDescriptor(&bia_info);
- const auto addend_desc = ClTensorDescriptor(&dst_info);
- const auto dst_desc = ClTensorDescriptor(&dst_info);
+ auto src_info = TensorInfo(src_shape, 1, data_type, DataLayout::NHWC);
+ auto wei_info = TensorInfo(wei_shape, 1, data_type, DataLayout::NHWC);
+ auto bia_info = TensorInfo(bia_shape, 1, data_type, DataLayout::NHWC);
+ auto addend_info = TensorInfo(addend_shape, 1, data_type, DataLayout::NHWC);
+ auto dst_info = TensorInfo(dst_shape, 1, data_type, DataLayout::NHWC);
const auto n0 = std::min(OFM, 4u);
const auto m0 = (OFM > 16) ? ((data_type == DataType::F32) ? 2U : 4U) : 1U;
- const ClKernelComponentDescriptor common_kernel_desc{};
- const DirectConvolutionDescriptor direct_conv2d_desc{ conv_info };
- const EltwiseAddDescriptor eltwise_add_desc{ ConvertPolicy::WRAP };
- const TileDescriptor store_tile_info{ Size2D(n0, m0), Size2D(width, height), ClippingStrategy::TOP_LEFT };
+ const ClDirectConv2dKernelDescriptor direct_conv2d_desc{ conv_info };
+ const ClEltwiseAddKernelDescriptor eltwise_add_desc{};
+ const TileDescriptor store_tile_info{ Size2D(n0, m0), Size2D(width, height), ClippingStrategy::TOP_LEFT };
ArgumentID src_id{ g_arg_placeholder };
ArgumentID wei_id{ g_arg_placeholder };
ArgumentID bia_id{ g_arg_placeholder };
ArgumentID acc_id{ g_arg_placeholder };
+ ArgumentID acc_1_id{ g_arg_placeholder };
ArgumentID addend_id{ g_arg_placeholder };
ArgumentID dst_id{ g_arg_placeholder };
- st = add_tensor_argument(bp, src_desc, src_id);
- st = add_tensor_argument(bp, wei_desc, wei_id);
- st = add_tensor_argument(bp, bia_desc, bia_id);
- st = add_tensor_intermed(bp, acc_id);
- st = add_tensor_argument(bp, addend_desc, addend_id);
- st = add_tensor_argument(bp, dst_desc, dst_id);
+ st = add_tensor(bp, &src_info, src_id);
+ st = add_tensor(bp, &wei_info, wei_id);
+ st = add_tensor(bp, &bia_info, bia_id);
+ st = add_tensor(bp, &dst_info, acc_id);
+ st = add_tensor(bp, &dst_info, acc_1_id);
+ st = add_tensor(bp, &addend_info, addend_id);
+ st = add_tensor(bp, &dst_info, dst_id);
- st = add_kcomp_direct_conv(bp, common_kernel_desc, direct_conv2d_desc, src_id, wei_id, bia_id, acc_id);
- st = add_kcomp_eltwise_add(bp, common_kernel_desc, eltwise_add_desc, addend_id, acc_id, acc_id);
- st = add_kcomp_store(bp, common_kernel_desc, acc_id, dst_id, StoreType::TStoreIndirectWidthSelect);
+ st = add_kcomp_direct_conv2d(bp, direct_conv2d_desc, src_id, wei_id, bia_id, acc_id);
+ st = add_kcomp_eltwise_add(bp, eltwise_add_desc, addend_id, acc_id, acc_1_id);
+ st = add_kcomp_store(bp, StoreType::TStoreIndirectWidthSelect, acc_1_id, dst_id);
exec_desc.skip_sliding_window = true;
@@ -282,12 +148,11 @@ TEST_CASE(MoveNet_SubGraph_1_DirectConv2d, framework::DatasetMode::ALL)
dst.allocator()->init(dst_info);
// "Pack" tensors
- TensorBinding tensors({ { src_id, &src },
+ ITensorPack tensors{ { src_id, &src },
{ wei_id, &wei },
{ bia_id, &bia },
{ addend_id, &addend },
- { dst_id, &dst }
- });
+ { dst_id, &dst } };
// Allocate and fill tensors
src.allocator()->allocate();
@@ -296,10 +161,10 @@ TEST_CASE(MoveNet_SubGraph_1_DirectConv2d, framework::DatasetMode::ALL)
addend.allocator()->allocate();
dst.allocator()->allocate();
- fill<float>(CLAccessor(src), 0);
- fill<float>(CLAccessor(wei), 1);
- fill<float>(CLAccessor(bia), 2);
- fill<float>(CLAccessor(addend), 3);
+ fill<float>(CLAccessor(src), 0, library.get());
+ fill<float>(CLAccessor(wei), 1, library.get());
+ fill<float>(CLAccessor(bia), 2, library.get());
+ fill<float>(CLAccessor(addend), 3, library.get());
CLScheduler::get().enqueue_op(kernel, tensors, exec_desc, true);
@@ -310,10 +175,10 @@ TEST_CASE(MoveNet_SubGraph_1_DirectConv2d, framework::DatasetMode::ALL)
SimpleTensor<float> ref_addend_nhwc{ dst_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC };
// Fill reference
- fill<float>(ref_src_nhwc, 0);
- fill<float>(ref_wei_nhwc, 1);
- fill<float>(ref_bia_nhwc, 2);
- fill<float>(ref_addend_nhwc, 3);
+ fill<float>(ref_src_nhwc, 0, library.get());
+ fill<float>(ref_wei_nhwc, 1, library.get());
+ fill<float>(ref_bia_nhwc, 2, library.get());
+ fill<float>(ref_addend_nhwc, 3, library.get());
auto ref_src = reference::permute(ref_src_nhwc, PermutationVector(1U, 2U, 0U));
auto ref_wei = reference::permute(ref_wei_nhwc, PermutationVector(1U, 2U, 0U));
@@ -326,301 +191,25 @@ TEST_CASE(MoveNet_SubGraph_1_DirectConv2d, framework::DatasetMode::ALL)
const auto ref_dst = reference::arithmetic_operation(
ArithmeticOperation::ADD,
ref_addend,
- reference::convolution_layer<float>(ref_src, ref_wei, ref_bia, dst_shape_nchw, conv_info),
- data_type,
- eltwise_add_desc.convert_policy);
+ reference::convolution_layer<float>(ref_src, ref_wei, ref_bia, dst_shape_nchw,
+ PadStrideInfo
+ {
+ static_cast<unsigned int>(conv_info.stride.x()),
+ static_cast<unsigned int>(conv_info.stride.y()),
+ static_cast<unsigned int>(conv_info.pad.left),
+ static_cast<unsigned int>(conv_info.pad.top) }),
+ data_type,
+ ConvertPolicy::SATURATE);
RelativeTolerance<float> tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */
validate(CLAccessor(dst), ref_dst, tolerance_f32);
}
TEST_SUITE_END() // Validate
-
-TEST_SUITE(Benchmark)
-TEST_CASE(MoveNet_SubGraph_1_Gemm, framework::DatasetMode::ALL)
-{
- using std::chrono::duration_cast;
- using std::chrono::microseconds;
- const int num_iterations = 200;
- std::map<std::string, std::chrono::microseconds> measurements;
- /* Computation:
- * out = add(addend, gemm_native(lhs, rhs, bias))
- */
- const auto data_type = DataType::F32;
- const auto m = 12U * 12U;
- const auto n = 64U;
- const auto k = 384U;
- const auto t_lhs_shape = TensorShape(k, m);
- const auto t_rhs_shape = TensorShape(n, k);
- const auto t_dst_shape = TensorShape(n, m);
- auto t_lhs_info = TensorInfo(t_lhs_shape, 1, data_type);
- auto t_rhs_info = TensorInfo(t_rhs_shape, 1, data_type);
- auto t_bias_info = TensorInfo(TensorShape(), 1, data_type);
- auto t_l0_dst_info = TensorInfo(t_dst_shape, 1, data_type); // Intermediate tensor for cond3
- auto t_l1_rhs_info = TensorInfo(t_dst_shape, 1, data_type);
- auto t_dst_info = TensorInfo(t_dst_shape, 1, data_type);
-
- const auto common_kernel_desc = ClKernelComponentDescriptor{};
- const GemmNativeDescriptor gemm_native_desc{ 1.0, 0.0, m, n, k };
- const GEMMKernelInfo gemm_info{ m, n, k, 0, false, false, false, false, ActivationLayerInfo{}, 1, 1, gemm_native_desc.lhs_info, gemm_native_desc.rhs_info, 0, 0 };
- const EltwiseAddDescriptor eltwise_add_desc{ ConvertPolicy::WRAP };
- const TileDescriptor store_tile_info{ Size2D(gemm_info.rhs_info.n0, gemm_info.lhs_info.m0), Size2D(gemm_info.n, gemm_info.m), ClippingStrategy::TOP_LEFT };
-
- // Create reference
- SimpleTensor<float> ref_t_lhs{ t_lhs_shape, data_type, 1 };
- SimpleTensor<float> ref_t_rhs{ t_rhs_shape, data_type, 1 };
- SimpleTensor<float> ref_t_bias_placeholder{ t_dst_shape, data_type, 1 };
- SimpleTensor<float> ref_t_l1_addend{ t_dst_shape, data_type, 1 };
-
- // Fill reference
- fill<float>(ref_t_lhs, 0);
- fill<float>(ref_t_rhs, 1);
- fill<float>(ref_t_l1_addend, 2);
- const auto ref_t_dst = reference::arithmetic_operation(
- ArithmeticOperation::ADD,
- ref_t_l1_addend,
- reference::gemm(ref_t_lhs, ref_t_rhs, ref_t_bias_placeholder, gemm_native_desc.alpha, 0.f /* To disable bias */),
- data_type,
- eltwise_add_desc.convert_policy);
-
- CLScheduler::get().default_reinit();
-
- /* Condition 0: Dynamic Fused Kernel */
- CLTensor cond0_t_dst{};
- {
- TICK(cond0_0_startup_time);
-
- ClKernelBlueprint bp;
- ArgumentID tid_lhs;
- ArgumentID tid_rhs;
- ArgumentID tid_l0_bias = g_arg_placeholder;
- ArgumentID tid_l1_addend;
- ArgumentID tid_dst;
-
- const ClTensorDescriptor t_lhs_desc{ &t_lhs_info };
- const ClTensorDescriptor t_rhs_desc{ &t_rhs_info };
- const ClTensorDescriptor t_bias_desc{ &t_bias_info };
- const ClTensorDescriptor t_addend_desc{ &t_dst_info };
- const ClTensorDescriptor t_dst_desc{ &t_dst_info };
-
- ClKernelCode cl_code;
- TICK(cond0_build_time)
- auto st = add_tensor_argument(bp, t_lhs_desc, tid_lhs);
- st = add_tensor_argument(bp, t_rhs_desc, tid_rhs);
- st = add_tensor_argument(bp, t_addend_desc, tid_l1_addend);
- st = add_tensor_argument(bp, t_dst_desc, tid_dst);
-
- ArgumentID tid_acc;
- st = add_tensor_intermed(bp, tid_acc);
- st = add_kcomp_gemm_native(bp, common_kernel_desc, gemm_native_desc, tid_lhs, tid_rhs, tid_l0_bias, tid_acc);
-
- st = add_kcomp_eltwise_add(bp, common_kernel_desc, EltwiseAddDescriptor{}, tid_l1_addend, tid_acc, tid_acc);
-
- st = add_kcomp_store(bp, common_kernel_desc, tid_acc, tid_dst, StoreType::StoreBlockBoundaryAware);
-
- st = set_tile_info(bp, store_tile_info);
- st = build(cl_code, ClCodeBuilderContext{ GpuInfo{ GPUTarget::G71 } }, bp);
- TOCK(cond0_build_time, measurements)
-
- TICK(cond0_tune_time)
- ClExecutionDescriptor exec_desc{};
- st = tune_static(exec_desc, cl_code);
- TOCK(cond0_tune_time, measurements)
-
- TICK(cond0_configure_time)
- ClCompositeKernel kernel;
- kernel.configure(CLKernelLibrary::get().get_compile_context(), cl_code);
- TOCK(cond0_configure_time, measurements)
-
- // Construct tensors
- CLTensor t_lhs{};
- CLTensor t_rhs{};
- CLTensor t_l1_addend{};
-
- // Init tensors
- {
- t_lhs.allocator()->init(t_lhs_info);
- t_rhs.allocator()->init(t_rhs_info);
- t_l1_addend.allocator()->init(t_dst_info);
- cond0_t_dst.allocator()->init(t_dst_info);
- }
- // Allocate tensors
- {
- t_lhs.allocator()->allocate();
- t_rhs.allocator()->allocate();
- t_l1_addend.allocator()->allocate();
- cond0_t_dst.allocator()->allocate();
- fill<float>(CLAccessor(t_lhs), 0);
- fill<float>(CLAccessor(t_rhs), 1);
- fill<float>(CLAccessor(t_l1_addend), 2);
- }
-
- // "Pack" tensors
- TensorBinding tensors({ { tid_lhs, &t_lhs }, { tid_rhs, &t_rhs }, { tid_l1_addend, &t_l1_addend }, { tid_dst, &cond0_t_dst } });
-
- CLScheduler::get().enqueue_op(kernel, tensors, exec_desc, true);
- CLScheduler::get().sync();
- TOCK(cond0_0_startup_time, measurements)
-
- TICK(cond0_1_latency)
- for(int i = 0; i < num_iterations; ++i)
- {
- CLScheduler::get().enqueue_op(kernel, tensors, exec_desc, true);
- }
- CLScheduler::get().sync();
- TOCK_AVG(cond0_1_latency, measurements, num_iterations)
- }
- /* Condition 1: Dynamic Unfused Kernel */
- /* Condition 2: Static Fused Kernel (current) */
- CLTensor cond2_t_dst{};
- {
- TICK(cond2_0_startup_time);
- arm_compute::opencl::kernels::ClGemmMatrixMultiplyNativeKernel l0_gemm_mm;
-
- TICK(cond2_configure_time);
- experimental::PostOpList<ITensorInfo *> post_ops;
- post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo *>>(&t_dst_info, 1, eltwise_add_desc.convert_policy);
- GEMMKernelInfo gemm_info{ m, n, k, 0, false, false, false, false, ActivationLayerInfo{}, 1, 1, gemm_native_desc.lhs_info, gemm_native_desc.rhs_info, 0, 0, post_ops };
- l0_gemm_mm.configure(CLKernelLibrary::get().get_compile_context(), &t_lhs_info, &t_rhs_info, nullptr, &t_dst_info, gemm_native_desc.alpha, gemm_native_desc.beta, gemm_native_desc.lhs_info,
- gemm_native_desc.rhs_info, gemm_info);
- TOCK(cond2_configure_time, measurements);
-
- // Construct tensors
- CLTensor t_lhs{};
- CLTensor t_rhs{};
- CLTensor t_l1_addend{};
-
- // Init tensors
- {
- t_lhs.allocator()->init(t_lhs_info);
- t_rhs.allocator()->init(t_rhs_info);
- t_l1_addend.allocator()->init(t_dst_info);
- cond2_t_dst.allocator()->init(t_dst_info);
- }
- // Allocate tensors
- {
- t_lhs.allocator()->allocate();
- t_rhs.allocator()->allocate();
- t_l1_addend.allocator()->allocate();
- cond2_t_dst.allocator()->allocate();
- fill<float>(CLAccessor(t_lhs), 0);
- fill<float>(CLAccessor(t_rhs), 1);
- fill<float>(CLAccessor(t_l1_addend), 2);
- }
-
- // "Pack" tensors
- ITensorPack tensors
- {
- { ACL_SRC_0, &t_lhs },
- { ACL_SRC_1, &t_rhs },
- { EXPERIMENTAL_ACL_POST_OP_ARG_FIRST, &t_l1_addend },
- { ACL_DST, &cond2_t_dst },
- };
- CLScheduler::get().enqueue_op(l0_gemm_mm, tensors, true);
- CLScheduler::get().sync();
- TOCK(cond2_0_startup_time, measurements);
-
- TICK(cond2_1_latency);
- for(int i = 0; i < num_iterations; ++i)
- {
- CLScheduler::get().enqueue_op(l0_gemm_mm, tensors, true);
- }
- CLScheduler::get().sync();
- TOCK_AVG(cond2_1_latency, measurements, num_iterations);
- }
- /* Condition 3: Static Unfused Kernel (current) */
- CLTensor cond3_t_dst{};
- {
- TICK(cond3_0_startup_time);
- arm_compute::opencl::kernels::ClGemmMatrixMultiplyNativeKernel l0_gemm_mm;
- arm_compute::opencl::kernels::ClSaturatedArithmeticKernel l1_add;
-
- TICK(cond3_configure_time);
- GEMMKernelInfo gemm_info{ m, n, k, 0, false, false, false, false, ActivationLayerInfo{}, 1, 1, gemm_native_desc.lhs_info, gemm_native_desc.rhs_info, 0, 0 };
- l0_gemm_mm.configure(CLKernelLibrary::get().get_compile_context(), &t_lhs_info, &t_rhs_info, nullptr, &t_l0_dst_info, gemm_native_desc.alpha, gemm_native_desc.beta, gemm_native_desc.lhs_info,
- gemm_native_desc.rhs_info, gemm_info);
- l1_add.configure(CLKernelLibrary::get().get_compile_context(), ArithmeticOperation::ADD, &t_l0_dst_info, &t_l1_rhs_info, &t_dst_info, eltwise_add_desc.convert_policy);
- TOCK(cond3_configure_time, measurements);
-
- // Construct tensors
- CLTensor t_lhs{};
- CLTensor t_rhs{};
- CLTensor t_l0_dst{};
- CLTensor t_l1_addend{};
-
- // Init tensors
- {
- t_lhs.allocator()->init(t_lhs_info);
- t_rhs.allocator()->init(t_rhs_info);
- t_l0_dst.allocator()->init(t_l0_dst_info);
- t_l1_addend.allocator()->init(t_dst_info);
- cond3_t_dst.allocator()->init(t_dst_info);
- }
- // Allocate tensors
- {
- t_lhs.allocator()->allocate();
- t_rhs.allocator()->allocate();
- t_l0_dst.allocator()->allocate();
- t_l1_addend.allocator()->allocate();
- cond3_t_dst.allocator()->allocate();
- fill<float>(CLAccessor(t_lhs), 0);
- fill<float>(CLAccessor(t_rhs), 1);
- fill<float>(CLAccessor(t_l1_addend), 2);
- }
-
- // "Pack" tensors
- ITensorPack tensors_l0
- {
- { ACL_SRC_0, &t_lhs },
- { ACL_SRC_1, &t_rhs },
- { ACL_DST, &t_l0_dst },
- };
- ITensorPack tensors_l1
- {
- { ACL_SRC_0, &t_l0_dst },
- { ACL_SRC_1, &t_l1_addend },
- { ACL_DST, &cond3_t_dst },
- };
- CLScheduler::get().enqueue_op(l0_gemm_mm, tensors_l0, true);
- CLScheduler::get().enqueue_op(l1_add, tensors_l1, true);
- CLScheduler::get().sync();
- TOCK(cond3_0_startup_time, measurements);
-
- TICK(cond3_1_latency);
- for(int i = 0; i < num_iterations; ++i)
- {
- CLScheduler::get().enqueue_op(l0_gemm_mm, tensors_l0, true);
- CLScheduler::get().enqueue_op(l1_add, tensors_l1, true);
- }
- CLScheduler::get().sync();
- TOCK_AVG(cond3_1_latency, measurements, num_iterations);
- }
-
- RelativeTolerance<float> tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */
- std::cout << "cond0 validation: " << std::endl;
- validate(CLAccessor(cond0_t_dst), ref_t_dst, tolerance_f32);
- std::cout << "cond2 validation: " << std::endl;
- validate(CLAccessor(cond2_t_dst), ref_t_dst, tolerance_f32);
- std::cout << "cond3 validation: " << std::endl;
- validate(CLAccessor(cond3_t_dst), ref_t_dst, tolerance_f32);
-
- /* Report */
- std::cout << "Performance comparison (gemm native + add)" << std::endl;
- std::cout << "cond0: dynamic fusion module" << std::endl;
- std::cout << "cond2: static fused with post ops" << std::endl;
- std::cout << "cond3: static unfused" << std::endl;
- for(auto m : measurements)
- {
- std::cout << m.first << ": " << m.second.count() << "us" << std::endl;
- }
-}
-TEST_SUITE_END() // Benchmark
TEST_SUITE_END() // ClCompositeKernel
TEST_SUITE_END() // DYNAMIC_FUSION
TEST_SUITE_END() // UNIT
TEST_SUITE_END() // CL
} // namespace validation
} // namespace test
-} // namespace arm_compute
-
-#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) \ No newline at end of file
+} // namespace arm_compute \ No newline at end of file
diff --git a/tests/validation/CL/UNIT/dynamic_fusion/DependencyGraph.cpp b/tests/validation/CL/UNIT/dynamic_fusion/DependencyGraph.cpp
new file mode 100644
index 0000000000..6962f0e6d1
--- /dev/null
+++ b/tests/validation/CL/UNIT/dynamic_fusion/DependencyGraph.cpp
@@ -0,0 +1,267 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
+#include "arm_compute/core/experimental/DependencyGraph.h"
+
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+
+using namespace arm_compute::experimental::dynamic_fusion;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+TEST_SUITE(CL)
+
+TEST_SUITE(UNIT)
+TEST_SUITE(DYNAMIC_FUSION)
+TEST_SUITE(DependencyGraph)
+
+TEST_CASE(Correct_Graph_Creation_Should_Pass, framework::DatasetMode::ALL)
+{
+ DependencyGraph graph{};
+ const auto t0 = graph.add_tensor();
+ const auto t1 = graph.add_tensor();
+ const auto t2 = graph.add_tensor();
+ const auto t3 = graph.add_tensor();
+ const auto t4 = graph.add_tensor();
+
+ const auto o0 = graph.add_operator({ t0, t1 }, { t2 }).second;
+ const auto o1 = graph.add_operator({ t3, t2 }, { t4 }).second;
+
+ ARM_COMPUTE_EXPECT_EQUAL(graph.number_of_ops(), 2U, framework::LogLevel::ERRORS);
+ ARM_COMPUTE_EXPECT_EQUAL(graph.number_of_tensors(), 5U, framework::LogLevel::ERRORS);
+
+ const DependencyGraph ref_graph
+ {
+ {
+ // src_tensors
+ { o0, { t0, t1 } },
+ { o1, { t3, t2 } },
+ },
+ {
+ // dst_tensors
+ { o0, { t2 } },
+ { o1, { t4 } },
+ },
+ {
+ // src_ops
+ { t0, {} },
+ { t1, {} },
+ { t2, { o0 } },
+ { t3, {} },
+ { t4, { o1 } },
+ },
+ {
+ // dst_ops
+ { t0, { o0 } },
+ { t1, { o0 } },
+ { t2, { o1 } },
+ { t3, { o1 } },
+ { t4, {} },
+ }
+
+ };
+ ARM_COMPUTE_EXPECT(graph == ref_graph, framework::LogLevel::ERRORS);
+}
+
+TEST_CASE(Correct_Merge_Points_Should_Enable_Graph_Expansion, framework::DatasetMode::ALL)
+{
+ // Merge points are a simple way to collapse "graph of graphs" into a single graph
+ // Suppose we have a top-level graph g0
+ DependencyGraph g0{};
+ const auto g0_t0 = g0.add_tensor();
+ const auto g0_t1 = g0.add_tensor();
+ const auto g0_t2 = g0.add_tensor();
+ const auto g0_t3 = g0.add_tensor();
+ const auto g0_t4 = g0.add_tensor();
+ g0.add_operator({ g0_t0, g0_t1 }, { g0_t2 }); // g0_o0
+ g0.add_operator({ g0_t3, g0_t2 }, { g0_t4 }); // g0_o1
+
+ // Then g0 expands into g1, with additional nodes added in-between "merge point tensors"
+ // Note that the expansion logic may be local to each operator node
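+ // A merge point is declared by passing the parent graph's tensor id to add_tensor();
+ // tensors added against the same merge point (e.g. g0_t2 below) resolve to the same tensor in g1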
+ DependencyGraph g1{};
+ // g0_o0 expands into g1_o0, g1_o1, g1_o2
+ const auto g1_t0 = g1.add_tensor(g0_t0);
+ const auto g1_t1 = g1.add_tensor(g0_t1);
+ const auto g1_t2 = g1.add_tensor();
+ const auto g1_t3 = g1.add_tensor();
+ const auto g1_t4 = g1.add_tensor(g0_t2);
+ const auto g1_o0 = g1.add_operator({ g1_t0 }, { g1_t2 }).second;
+ const auto g1_o1 = g1.add_operator({ g1_t1 }, { g1_t3 }).second;
+ const auto g1_o2 = g1.add_operator({ g1_t2, g1_t3 }, { g1_t4 }).second;
+
+ // g0_o1 expands into g1_o3
+ const auto g1_t5 = g1.add_tensor(g0_t3);
+ const auto g1_t6 = g1.add_tensor(g0_t2);
+ const auto g1_t7 = g1.add_tensor(g0_t4);
+ ARM_COMPUTE_EXPECT_EQUAL(g1_t4, g1_t6, framework::LogLevel::ERRORS); // both are associated with the same merge point g0_t2, so they should refer to the same tensor in g1
+ const auto g1_o3 = g1.add_operator({ g1_t5, g1_t6 }, { g1_t7 }).second;
+
+ const DependencyGraph ref_graph
+ {
+ {
+ // src_tensors
+ { g1_o0, { g1_t0 } },
+ { g1_o1, { g1_t1 } },
+ { g1_o2, { g1_t2, g1_t3 } },
+ { g1_o3, { g1_t5, g1_t4 } },
+ },
+ {
+ // dst_tensors
+ { g1_o0, { g1_t2 } },
+ { g1_o1, { g1_t3 } },
+ { g1_o2, { g1_t4 } },
+ { g1_o3, { g1_t7 } },
+ },
+ {
+ // src_ops
+ { g1_t0, {} },
+ { g1_t1, {} },
+ { g1_t2, { g1_o0 } },
+ { g1_t3, { g1_o1 } },
+ { g1_t4, { g1_o2 } },
+ { g1_t5, {} },
+ { g1_t7, { g1_o3 } },
+ },
+ {
+ // dst_ops
+ { g1_t0, { g1_o0 } },
+ { g1_t1, { g1_o1 } },
+ { g1_t2, { g1_o2 } },
+ { g1_t3, { g1_o2 } },
+ { g1_t4, { g1_o3 } },
+ { g1_t5, { g1_o3 } },
+ { g1_t7, {} },
+ },
+ {
+ // merge points
+ { g0_t0, g1_t0 },
+ { g0_t1, g1_t1 },
+ { g0_t2, g1_t4 },
+ { g0_t3, g1_t5 },
+ { g0_t4, g1_t7 },
+ }
+ };
+ ARM_COMPUTE_EXPECT(g1 == ref_graph, framework::LogLevel::ERRORS);
+}
+
+TEST_CASE(Path_Existence_Check_0, framework::DatasetMode::ALL)
+{
+ DependencyGraph graph{};
+ const auto t0 = graph.add_tensor();
+ const auto t1 = graph.add_tensor();
+ const auto t2 = graph.add_tensor();
+ const auto t3 = graph.add_tensor();
+ const auto t4 = graph.add_tensor();
+ const auto t5 = graph.add_tensor();
+ const auto t6 = graph.add_tensor();
+ const auto t7 = graph.add_tensor();
+ const auto o0 = graph.add_operator({ t1 }, { t3, t4 }).second;
+ const auto o1 = graph.add_operator({ t3 }, { t5 }).second;
+ const auto o2 = graph.add_operator({ t5, t6 }, { t7 }).second;
+ const auto o3 = graph.add_operator({ t4 }, { t6 }).second;
+ const auto o4 = graph.add_operator({ t0, t5 }, { t2 }).second;
+
+ ARM_COMPUTE_UNUSED(o1, o3);
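+ // Graph topology: t1 -> o0 -> {t3, t4}; t3 -> o1 -> t5; t4 -> o3 -> t6; {t5, t6} -> o2 -> t7; {t0, t5} -> o4 -> t2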
+
+ ARM_COMPUTE_EXPECT((graph.path_exists_from_tensor_to_op(t3, o2)), framework::LogLevel::ERRORS);
+ ARM_COMPUTE_EXPECT((graph.path_exists_from_tensor_to_op(t1, o4)), framework::LogLevel::ERRORS);
+ ARM_COMPUTE_EXPECT(!(graph.path_exists_from_tensor_to_op(t2, o4)), framework::LogLevel::ERRORS);
+ ARM_COMPUTE_EXPECT(!(graph.path_exists_from_tensor_to_op(t0, o2)), framework::LogLevel::ERRORS);
+
+ ARM_COMPUTE_EXPECT((graph.path_exists_from_op_to_op(o0, o2)), framework::LogLevel::ERRORS);
+ ARM_COMPUTE_EXPECT(!(graph.path_exists_from_op_to_op(o2, o0)), framework::LogLevel::ERRORS);
+
+ ARM_COMPUTE_EXPECT(!(graph.path_exists_from_op_to_op(o2, o4)), framework::LogLevel::ERRORS);
+}
+
+TEST_CASE(Correct_Topological_Sort_Should_Pass, framework::DatasetMode::ALL)
+{
+ DependencyGraph graph{};
+ const auto t0 = graph.add_tensor();
+ const auto t1 = graph.add_tensor();
+ const auto t2 = graph.add_tensor();
+ const auto t3 = graph.add_tensor();
+ const auto t4 = graph.add_tensor();
+ const auto t5 = graph.add_tensor();
+ const auto t6 = graph.add_tensor();
+ const auto t7 = graph.add_tensor();
+ const auto o0 = graph.add_operator({ t1 }, { t3, t4 }).second;
+ const auto o1 = graph.add_operator({ t3 }, { t5 }).second;
+ const auto o2 = graph.add_operator({ t5, t6 }, { t7 }).second;
+ const auto o3 = graph.add_operator({ t4 }, { t6 }).second;
+ const auto o4 = graph.add_operator({ t0, t5 }, { t2 }).second;
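+ // Any valid ordering must place o0 before o1 and o3, o1 before o2 and o4, and o3 before o2;
+ // each expected OpPack below records the op together with its input and output tensors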
+
+ const auto res = graph.topological_sort();
+ ARM_COMPUTE_EXPECT(bool(res.first), framework::LogLevel::ERRORS);
+ std::vector<DependencyGraph::OpPack> ref_sorted_op_packs
+ {
+ { o0, { t1 }, { t3, t4 } },
+ { o1, { t3 }, { t5 } },
+ { o3, { t4 }, { t6 } },
+ { o4, { t0, t5 }, { t2 } },
+ { o2, { t5, t6 }, { t7 } },
+
+ };
+ ARM_COMPUTE_EXPECT((res.second == ref_sorted_op_packs), framework::LogLevel::ERRORS);
+}
+
+TEST_CASE(Cycles_Should_Fail, framework::DatasetMode::ALL)
+{
+ DependencyGraph graph{};
+ const auto t0 = graph.add_tensor();
+ const auto t1 = graph.add_tensor();
+ const auto t2 = graph.add_tensor();
+ const auto t3 = graph.add_tensor();
+
+ graph.add_operator({ t0, t1 }, { t2 });
+ graph.add_operator({ t2 }, { t1, t3 }); // Creates a cycle; ideally the error would be reported here rather than at sort time
+
+ const auto res = graph.topological_sort();
+ ARM_COMPUTE_EXPECT(!bool(res.first), framework::LogLevel::ERRORS);
+}
+TEST_CASE(Loops_Should_Fail, framework::DatasetMode::ALL)
+{
+ DependencyGraph graph{};
+ const auto t0 = graph.add_tensor();
+ const auto t1 = graph.add_tensor();
+ const auto t2 = graph.add_tensor();
+
+ ARM_COMPUTE_EXPECT_THROW(graph.add_operator({ t0, t2 }, { t1, t2 }).first, framework::LogLevel::ERRORS);
+ ARM_COMPUTE_UNUSED(t0, t1, t2);
+}
+TEST_SUITE_END() // DependencyGraph
+TEST_SUITE_END() // DYNAMIC_FUSION
+TEST_SUITE_END() // UNIT
+
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute \ No newline at end of file
diff --git a/tests/validation/CL/UNIT/dynamic_fusion/Integration_OperatorFuseMovenetSubGraph1.cpp b/tests/validation/CL/UNIT/dynamic_fusion/Integration_OperatorFuseMovenetSubGraph1.cpp
new file mode 100644
index 0000000000..1b04b0cee0
--- /dev/null
+++ b/tests/validation/CL/UNIT/dynamic_fusion/Integration_OperatorFuseMovenetSubGraph1.cpp
@@ -0,0 +1,403 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
+#include "arm_compute/core/TensorInfo.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/experimental/ClWorkload.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/experimental/ClCompositeOperator.h"
+#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelDescriptors.h"
+#include "src/gpu/cl/operators/ClAdd.h"
+#include "src/gpu/cl/operators/ClConv2d.h"
+#include "tests/CL/CLAccessor.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/validation/CL/UNIT/dynamic_fusion/Utils.h"
+#include "tests/validation/Validation.h"
+
+#include "tests/validation/reference/ConvolutionLayer.h"
+#include "tests/validation/reference/ElementwiseOperations.h"
+#include "tests/validation/reference/Permute.h"
+
+#ifdef ARM_COMPUTE_ASSERTS_ENABLED
+#include "tests/SimpleTensorPrinter.h"
+#endif /* ARM_COMPUTE_ASSERTS_ENABLED */
+
+using namespace arm_compute::experimental::dynamic_fusion;
+using namespace arm_compute::test::validation::utils;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+TEST_SUITE(CL)
+TEST_SUITE(INTEGRATION)
+TEST_SUITE(DYNAMIC_FUSION)
+TEST_CASE(Operator_Fuse_Movenet_SubGraph_1_F32, framework::DatasetMode::ALL)
+{
+ // Please refer to: https://confluence.arm.com/pages/viewpage.action?pageId=886243697
+ /* Computation:
+ * out = add_desc(addend, conv2d1x1(direct_conv)(input, weights, bias))
+ */
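+ /* The test exercises the full dynamic fusion flow:
+ * 1. Compute a reference result with the validation reference implementations
+ * 2. Describe the fused workload as an OperatorGraph (direct conv2d + elementwise add)
+ * 3. Build a ClWorkload from the OperatorGraph and configure a ClCompositeOperator with it
+ * 4. Bind the CLTensors, populate the prepare/run pack-maps, then prepare(), run() and validate
+ */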
+ const auto data_type = DataType::F32;
+ const auto data_layout = DataLayout::NHWC;
+ const auto t_input_shape = TensorShape(384, 12, 12);
+ // const auto t_weight_shape = TensorShape(384, 1, 1, 64);
+ // const auto t_dst_shape = TensorShape(64, 12, 12);
+ const auto t_weight_shape = TensorShape(384, 1, 1, 16);
+ const auto t_dst_shape = TensorShape(16, 12, 12);
+ auto t_input_info = TensorInfo(t_input_shape, 1, data_type, data_layout);
+ auto t_weight_info = TensorInfo(t_weight_shape, 1, data_type, data_layout);
+ auto t_l1_addend_info = TensorInfo(t_dst_shape, 1, data_type, data_layout);
+ auto t_acc_info = TensorInfo(); // Intermediate accumulator tensor; TensorInfo to be inferred
+ auto t_dst_info = TensorInfo();
+
+ Conv2dDescriptor conv2d_desc{};
+ AddDescriptor add_desc{};
+
+ // Create reference
+ SimpleTensor<float> ref_t_input{ t_input_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC };
+ SimpleTensor<float> ref_t_weight{ t_weight_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC };
+ SimpleTensor<float> ref_t_bias_placeholder{ t_dst_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC };
+ SimpleTensor<float> ref_t_l1_addend{ t_dst_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC };
+
+ // Fill reference
+ fill<float>(ref_t_input, 0, library.get());
+ fill<float>(ref_t_weight, 1, library.get());
+ fill<float>(ref_t_l1_addend, 2, library.get());
+
+ auto ref_t_input_nchw = reference::permute(ref_t_input, PermutationVector(1U, 2U, 0U));
+ auto ref_t_weight_nchw = reference::permute(ref_t_weight, PermutationVector(1U, 2U, 0U));
+ auto ref_t_bias_placeholder_nchw = reference::permute(ref_t_bias_placeholder, PermutationVector(1U, 2U, 0U));
+ auto ref_t_l1_addend_nchw = reference::permute(ref_t_l1_addend, PermutationVector(1U, 2U, 0U));
+ auto t_dst_shape_nchw = t_dst_shape;
+ permute(t_dst_shape_nchw, PermutationVector(1U, 2U, 0U));
+
+ PadStrideInfo legacy_pad_stride(conv2d_desc.stride.x(), conv2d_desc.stride.y(), conv2d_desc.pad.left, conv2d_desc.pad.right, conv2d_desc.pad.top, conv2d_desc.pad.bottom, DimensionRoundingType{});
+ auto ref_t_dst_nchw = reference::arithmetic_operation(
+ ArithmeticOperation::ADD,
+ ref_t_l1_addend_nchw,
+ reference::convolution_layer(ref_t_input_nchw, ref_t_weight_nchw, ref_t_bias_placeholder_nchw, t_dst_shape_nchw, legacy_pad_stride, conv2d_desc.dilation),
+ data_type,
+ ConvertPolicy{});
+ const auto ref_t_dst = reference::permute(ref_t_dst_nchw, PermutationVector(2U, 0U, 1U));
+
+ CLScheduler::get().default_reinit();
+ const auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+ OperatorGraph op_graph;
+
+ const auto op_t_input = add_tensor(op_graph, t_input_info);
+ const auto op_t_weight = add_tensor(op_graph, t_weight_info);
+ const auto op_t_l1_addend = add_tensor(op_graph, t_l1_addend_info);
+ const auto op_t_acc = add_tensor(op_graph, t_acc_info); // temp accumulator; TensorInfo to be inferred
+ const auto op_t_dst = add_tensor(op_graph, t_dst_info);
+
+ auto conv2d = add_op_conv2d(op_graph, conv2d_desc, op_t_input, op_t_weight, op_t_acc);
+ force_conv2d_method(op_graph, conv2d, ConvolutionMethod::DIRECT);
+ add_op_elementwise_add(op_graph, add_desc, op_t_acc, op_t_l1_addend, op_t_dst);
+
+ const ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } };
+ ClWorkload workload;
+ build(workload, op_graph, workload_ctx);
+
+ ClCompositeOperator op;
+ op.configure(cl_compile_ctx, workload);
+
+ // Construct tensors
+ CLTensor t_input{};
+ CLTensor t_weight{};
+ CLTensor t_l1_addend{};
+ CLTensor t_dst{};
+
+ // Init tensors
+ t_input.allocator()->init(t_input_info);
+ t_weight.allocator()->init(t_weight_info);
+ t_l1_addend.allocator()->init(t_dst_info);
+ t_dst.allocator()->init(t_dst_info);
+
+ // Allocate and fill tensors
+ t_input.allocator()->allocate();
+ t_weight.allocator()->allocate();
+ t_l1_addend.allocator()->allocate();
+ t_dst.allocator()->allocate();
+ fill<float>(CLAccessor(t_input), 0, library.get());
+ fill<float>(CLAccessor(t_weight), 1, library.get());
+ fill<float>(CLAccessor(t_l1_addend), 2, library.get());
+ // "Pack" tensors
+ OpTensorBinding bp_tensors({ { op_t_input, &t_input },
+ { op_t_weight, &t_weight },
+ { op_t_l1_addend, &t_l1_addend },
+ { op_t_dst, &t_dst }
+ });
+
+ // Populate prepare and run pack-maps (including allocating aux tensors)
+ ClAuxTensorData aux_tensor_data{};
+ TensorPackMap prepare_pack_map{};
+ TensorPackMap run_pack_map{};
+ bind_tensors(aux_tensor_data, prepare_pack_map, run_pack_map, workload, bp_tensors);
+
+ op.prepare(prepare_pack_map);
+ op.run(run_pack_map);
+ RelativeTolerance<float> tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */
+ validate(CLAccessor(t_dst), ref_t_dst_nchw, tolerance_f32);
+}
+TEST_SUITE(Unsupported)
+TEST_CASE(DataType_QASYMM8, framework::DatasetMode::ALL)
+{
+ const auto data_type = DataType::QASYMM8;
+ const auto data_layout = DataLayout::NHWC;
+ const auto t_input_shape = TensorShape(384, 12, 12);
+ const auto t_weight_shape = TensorShape(384, 1, 1, 64);
+ const auto t_dst_shape = TensorShape(64, 12, 12);
+ auto t_input_info = TensorInfo(t_input_shape, 1, data_type, data_layout);
+ auto t_weight_info = TensorInfo(t_weight_shape, 1, data_type, data_layout);
+ auto t_l1_addend_info = TensorInfo(t_dst_shape, 1, data_type, data_layout);
+ auto t_acc_info = TensorInfo(t_dst_shape, 1, data_type, data_layout);
+ auto t_dst_info = TensorInfo(t_dst_shape, 1, data_type, data_layout);
+
+ Conv2dDescriptor conv2d_desc{};
+ AddDescriptor add_desc{};
+
+ OperatorGraph op_graph;
+
+ const auto op_t_input = add_tensor(op_graph, t_input_info);
+ const auto op_t_weight = add_tensor(op_graph, t_weight_info);
+ const auto op_t_l1_addend = add_tensor(op_graph, t_l1_addend_info);
+ const auto op_t_acc = add_tensor(op_graph, t_acc_info); // temp accumulator; TensorInfo to be inferred
+ const auto op_t_dst = add_tensor(op_graph, t_dst_info);
+
+ auto conv2d = add_op_conv2d(op_graph, conv2d_desc, op_t_input, op_t_weight, op_t_acc);
+ add_op_elementwise_add(op_graph, add_desc, op_t_acc, op_t_l1_addend, op_t_dst);
+ force_conv2d_method(op_graph, conv2d, ConvolutionMethod::DIRECT);
+
+ const ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } };
+ ClWorkload workload;
+ const auto success = build(workload, op_graph, workload_ctx);
+
+ ARM_COMPUTE_EXPECT(!bool(success), framework::LogLevel::ERRORS);
+ ARM_COMPUTE_EXPECT(!bool(ClCompositeOperator::validate(workload)), framework::LogLevel::ERRORS);
+}
+TEST_CASE(DataLayout_NCHW, framework::DatasetMode::ALL)
+{
+ const auto data_type = DataType::F32;
+ const auto data_layout = DataLayout::NCHW;
+ const auto t_input_shape = TensorShape(384, 12, 12);
+ const auto t_weight_shape = TensorShape(384, 1, 1, 64);
+ const auto t_dst_shape = TensorShape(64, 12, 12);
+ auto t_input_info = TensorInfo(t_input_shape, 1, data_type, data_layout);
+ auto t_weight_info = TensorInfo(t_weight_shape, 1, data_type, data_layout);
+ auto t_dst_info = TensorInfo(t_dst_shape, 1, data_type, data_layout);
+
+ Conv2dDescriptor conv2d_desc{};
+
+ OperatorGraph op_graph;
+
+ const auto op_t_input = add_tensor(op_graph, t_input_info);
+ const auto op_t_weight = add_tensor(op_graph, t_weight_info);
+ const auto op_t_dst = add_tensor(op_graph, t_dst_info);
+
+ auto conv2d = add_op_conv2d(op_graph, conv2d_desc, op_t_input, op_t_weight, op_t_dst);
+ force_conv2d_method(op_graph, conv2d, ConvolutionMethod::DIRECT);
+ const ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } };
+ ClWorkload workload;
+ const auto success = build(workload, op_graph, workload_ctx);
+
+ ARM_COMPUTE_EXPECT(!bool(success), framework::LogLevel::ERRORS);
+ ARM_COMPUTE_EXPECT(!bool(ClCompositeOperator::validate(workload)), framework::LogLevel::ERRORS);
+}
+TEST_SUITE_END() // Unsupported
+
+TEST_SUITE(Invalid)
+TEST_CASE(Multiple_Complex_Ops_0, framework::DatasetMode::ALL)
+{
+ /* Computation:
+ * out = conv2d(conv2d(l0_input, l0_weight), l1_weight)
+ */
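+ // Fusing two "complex" operators (a conv2d feeding another conv2d) is not supported, so the build is expected to fail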
+ const auto data_type = DataType::F32;
+ const auto data_layout = DataLayout::NHWC;
+ const auto t_l0_input_shape = TensorShape(1024, 56, 56);
+ const auto t_l0_weight_shape = TensorShape(512, 1024, 1, 1);
+ const auto t_l1_weight_shape = TensorShape(512, 256, 1, 1);
+
+ auto t_l0_input_info = TensorInfo(t_l0_input_shape, 1, data_type, data_layout);
+ auto t_l0_weight_info = TensorInfo(t_l0_weight_shape, 1, data_type, data_layout);
+ auto t_l1_weight_info = TensorInfo(t_l1_weight_shape, 1, data_type, data_layout);
+ auto t_l0_dst_info = TensorInfo();
+ auto t_dst_info = TensorInfo();
+
+ OperatorGraph op_graph;
+ const auto conv2d_desc = Conv2dDescriptor{};
+
+ const auto op_t_l0_input = add_tensor(op_graph, t_l0_input_info);
+ const auto op_t_l0_weight = add_tensor(op_graph, t_l0_weight_info);
+ const auto op_t_l1_weight = add_tensor(op_graph, t_l1_weight_info);
+ const auto op_t_l0_dst = add_tensor(op_graph, t_l0_dst_info); // temp accumulator; TensorInfo to be inferred
+ const auto op_t_dst = add_tensor(op_graph, t_dst_info);
+
+ add_op_conv2d(op_graph, conv2d_desc, op_t_l0_input, op_t_l0_weight, op_t_l0_dst);
+ add_op_conv2d(op_graph, conv2d_desc, op_t_l0_dst, op_t_l1_weight, op_t_dst);
+
+ const ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } };
+ ClWorkload workload;
+ const auto success = build(workload, op_graph, workload_ctx);
+
+ ARM_COMPUTE_EXPECT(!bool(success), framework::LogLevel::ERRORS);
+ ARM_COMPUTE_EXPECT(!bool(ClCompositeOperator::validate(workload)), framework::LogLevel::ERRORS);
+}
+TEST_CASE(Enlarging_Execution_Space, framework::DatasetMode::ALL)
+{
+ /* Computation:
+ * out = add(l2_lhs, add(add(l0_lhs, l0_rhs), l1_rhs))
+ */
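+ // The final add broadcasts the accumulated result against the larger l2_lhs tensor, enlarging the execution
+ // space of the fused kernel, so the build is expected to fail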
+ const auto data_type = DataType::F32;
+ const auto data_layout = DataLayout::NHWC;
+ const auto t_l0_lhs_shape = TensorShape(1, 256, 3);
+ const auto t_l0_rhs_shape = TensorShape(1, 256, 3);
+ const auto t_l1_rhs_shape = TensorShape(1, 1, 3);
+ const auto t_l2_lhs_shape = TensorShape(1024, 1, 3);
+
+ auto t_l0_lhs_info = TensorInfo(t_l0_lhs_shape, 1, data_type, data_layout);
+ auto t_l0_rhs_info = TensorInfo(t_l0_rhs_shape, 1, data_type, data_layout);
+ auto t_l1_rhs_info = TensorInfo(t_l1_rhs_shape, 1, data_type, data_layout);
+ auto t_l2_lhs_info = TensorInfo(t_l2_lhs_shape, 1, data_type, data_layout);
+ auto t_l0_dst_info = TensorInfo();
+ auto t_l1_dst_info = TensorInfo();
+ auto t_dst_info = TensorInfo();
+
+ OperatorGraph op_graph;
+ const auto add_desc = AddDescriptor{};
+
+ const auto op_t_l0_lhs = add_tensor(op_graph, t_l0_lhs_info);
+ const auto op_t_l0_rhs = add_tensor(op_graph, t_l0_rhs_info);
+ const auto op_t_l1_rhs = add_tensor(op_graph, t_l1_rhs_info);
+ const auto op_t_l2_lhs = add_tensor(op_graph, t_l2_lhs_info);
+ const auto op_t_l0_dst = add_tensor(op_graph, t_l0_dst_info); // temp accumulator; TensorInfo to be inferred
+ const auto op_t_l1_dst = add_tensor(op_graph, t_l1_dst_info); // temp accumulator; TensorInfo to be inferred
+ const auto op_t_dst = add_tensor(op_graph, t_dst_info);
+
+ add_op_elementwise_add(op_graph, add_desc, op_t_l0_lhs, op_t_l0_rhs, op_t_l0_dst);
+ add_op_elementwise_add(op_graph, add_desc, op_t_l0_dst, op_t_l1_rhs, op_t_l1_dst);
+ add_op_elementwise_add(op_graph, add_desc, op_t_l1_dst, op_t_l2_lhs, op_t_dst);
+
+ const ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } };
+ ClWorkload workload;
+ const auto success = build(workload, op_graph, workload_ctx);
+
+ ARM_COMPUTE_EXPECT(!bool(success), framework::LogLevel::ERRORS);
+ ARM_COMPUTE_EXPECT(!bool(ClCompositeOperator::validate(workload)), framework::LogLevel::ERRORS);
+}
+TEST_CASE(Root_Simple_And_Complex, framework::DatasetMode::ALL)
+{
+ /* Computation:
+ * out = add(conv(l0_0_input, l0_0_weight), add(l0_1_lhs, l0_1_rhs))
+ */
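+ // Fusing two independent root operators (a "complex" conv2d and a "simple" add) into a common add is not
+ // supported, so the build is expected to fail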
+ const auto data_type = DataType::F32;
+ const auto data_layout = DataLayout::NHWC;
+
+ const auto t_l0_0_input_shape = TensorShape(128, 21, 21);
+ const auto t_l0_0_weight_shape = TensorShape(144, 128, 1, 1);
+ const auto t_l0_1_lhs_shape = TensorShape(144, 21, 21);
+ const auto t_l0_1_rhs_shape = TensorShape(1, 1, 21);
+
+ auto t_l0_0_input_info = TensorInfo(t_l0_0_input_shape, 1, data_type, data_layout);
+ auto t_l0_0_weight_info = TensorInfo(t_l0_0_weight_shape, 1, data_type, data_layout);
+ auto t_l0_1_lhs_info = TensorInfo(t_l0_1_lhs_shape, 1, data_type, data_layout);
+ auto t_l0_1_rhs_info = TensorInfo(t_l0_1_rhs_shape, 1, data_type, data_layout);
+ auto t_l0_0_dst_info = TensorInfo();
+ auto t_l0_1_dst_info = TensorInfo();
+ auto t_dst_info = TensorInfo();
+
+ OperatorGraph op_graph;
+ const auto conv2d_desc = Conv2dDescriptor{};
+ const auto add_desc = AddDescriptor{};
+
+ const auto op_t_l0_0_input = add_tensor(op_graph, t_l0_0_input_info);
+ const auto op_t_l0_0_weight = add_tensor(op_graph, t_l0_0_weight_info);
+ const auto op_t_l0_1_lhs = add_tensor(op_graph, t_l0_1_lhs_info);
+ const auto op_t_l0_1_rhs = add_tensor(op_graph, t_l0_1_rhs_info);
+ const auto op_t_l0_0_dst = add_tensor(op_graph, t_l0_0_dst_info); // temp accumulator; TensorInfo to be inferred
+ const auto op_t_l0_1_dst = add_tensor(op_graph, t_l0_1_dst_info); // temp accumulator; TensorInfo to be inferred
+ const auto op_t_dst = add_tensor(op_graph, t_dst_info);
+
+ add_op_conv2d(op_graph, conv2d_desc, op_t_l0_0_input, op_t_l0_0_weight, op_t_l0_0_dst);
+ add_op_elementwise_add(op_graph, add_desc, op_t_l0_1_lhs, op_t_l0_1_rhs, op_t_l0_1_dst);
+ add_op_elementwise_add(op_graph, add_desc, op_t_l0_0_dst, op_t_l0_1_dst, op_t_dst);
+
+ const ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } };
+ ClWorkload workload;
+ const auto success = build(workload, op_graph, workload_ctx);
+
+ ARM_COMPUTE_EXPECT(!bool(success), framework::LogLevel::ERRORS);
+ ARM_COMPUTE_EXPECT(!bool(ClCompositeOperator::validate(workload)), framework::LogLevel::ERRORS);
+}
+TEST_CASE(Loop, framework::DatasetMode::ALL)
+{
+ /* Computation:
+ * tensor state0;
+ * state1 = conv2d(l0_lhs, state0)
+ * state0 = add(l1_lhs, state1)
+ */
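+ // The two operators form a cycle through state0 and state1, so the build is expected to fail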
+ const auto data_type = DataType::F32;
+ const auto data_layout = DataLayout::NHWC;
+
+ const auto t_shape = TensorShape(13, 21);
+
+ auto t_l0_lhs_info = TensorInfo(t_shape, 1, data_type, data_layout);
+ auto t_l1_lhs_info = TensorInfo(t_shape, 1, data_type, data_layout);
+ auto state0_info = TensorInfo(t_shape, 1, data_type, data_layout);
+ auto state1_info = TensorInfo();
+
+ OperatorGraph op_graph;
+ const auto conv2d_desc = Conv2dDescriptor{};
+ const auto add_desc = AddDescriptor{};
+
+ const auto op_t_l0_lhs = add_tensor(op_graph, t_l0_lhs_info);
+ const auto op_t_l1_lhs = add_tensor(op_graph, t_l1_lhs_info);
+ const auto op_t_state0 = add_tensor(op_graph, state0_info);
+ const auto op_t_state1 = add_tensor(op_graph, state1_info);
+
+ add_op_conv2d(op_graph, conv2d_desc, op_t_l0_lhs, op_t_state0, op_t_state1);
+ add_op_elementwise_add(op_graph, add_desc, op_t_l1_lhs, op_t_state1, op_t_state0);
+
+ const ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } };
+ ClWorkload workload;
+ const auto success = build(workload, op_graph, workload_ctx);
+
+ ARM_COMPUTE_EXPECT(!bool(success), framework::LogLevel::ERRORS);
+ ARM_COMPUTE_EXPECT(!bool(ClCompositeOperator::validate(workload)), framework::LogLevel::ERRORS);
+}
+TEST_SUITE_END() // Invalid
+
+TEST_SUITE_END() // DYNAMIC_FUSION
+TEST_SUITE_END() // INTEGRATION
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute \ No newline at end of file
diff --git a/tests/validation/CL/UNIT/dynamic_fusion/Utils.h b/tests/validation/CL/UNIT/dynamic_fusion/Utils.h
new file mode 100644
index 0000000000..4512305c1e
--- /dev/null
+++ b/tests/validation/CL/UNIT/dynamic_fusion/Utils.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef TESTS_VALIDATION_CL_DYNAMICFUSION_UTILS
+#define TESTS_VALIDATION_CL_DYNAMICFUSION_UTILS
+
+#include "tests/AssetsLibrary.h"
+#include "utils/Utils.h"
+
+#include <chrono>
+#include <limits>
+#include <type_traits>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace utils
+{
+/** Macros for measuring wall clock time between a TICK(clock_name) and the matching TOCK(clock_name, measurement_map),
+ *  recording the elapsed duration in measurement_map under the key clock_name.
+ *  TOCK_AVG additionally divides the elapsed time by num_iterations.
+ */
+#define TICK(clock_name) \
+ auto clock_name##_tick = std::chrono::high_resolution_clock::now();
+#define TOCK(clock_name, measurement_map) \
+ auto clock_name##_tock = std::chrono::high_resolution_clock::now(); \
+ measurement_map["\"" #clock_name "\""] = duration_cast<microseconds>(clock_name##_tock - clock_name##_tick);
+#define TOCK_AVG(clock_name, measurement_map, num_iterations) \
+ auto clock_name##_tock = std::chrono::high_resolution_clock::now(); \
+ measurement_map["\"" #clock_name "\""] = duration_cast<microseconds>((clock_name##_tock - clock_name##_tick) / (num_iterations));
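+// Illustrative usage (a sketch only): assumes a std::map<std::string, std::chrono::microseconds> at the call site
+// and that std::chrono::duration_cast / std::chrono::microseconds are visible there, since the macro bodies use them
+// unqualified:
+//
+//   std::map<std::string, std::chrono::microseconds> measurements;
+//   TICK(conv_time)
+//   /* ... work to be timed ... */
+//   TOCK(conv_time, measurements)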
+
+template <typename T, typename U>
+void fill(U &&tensor, int seed, AssetsLibrary *library)
+{
+ static_assert(std::is_floating_point<T>::value || std::is_same<T, half>::value, "Only floating point data types supported.");
+ using DistributionType = typename std::conditional<std::is_same<T, half>::value, arm_compute::utils::uniform_real_distribution_16bit<T>, std::uniform_real_distribution<T>>::type;
+
+ DistributionType distribution{ T(-1.0f), T(1.0f) };
+ library->fill(tensor, distribution, seed);
+
+ // Fill the border/padding with infinity so that any accidental use of border values surfaces as NaN in the output (e.g. inf * 0)
+ DistributionType distribution_inf{ T(std::numeric_limits<float>::infinity()), T(std::numeric_limits<float>::infinity()) };
+ library->fill_borders_with_garbage(tensor, distribution_inf, seed);
+}
+} // namespace utils
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif //TESTS_VALIDATION_CL_DYNAMICFUSION_UTILS \ No newline at end of file