From b63b1196adea8b07dd8db77c2492a212650deba0 Mon Sep 17 00:00:00 2001
From: SiCong Li
Date: Fri, 28 Jan 2022 18:24:39 +0000
Subject: Integrate Dynamic Fusion patches

* Add public interfaces:
  * OperatorGraph: Describe a workload that could contain fused kernels
  * IWorkload: Generic interface for workloads built from OperatorGraph
  * ClWorkload: OpenCL workloads built from OperatorGraph
  * ClCompositeOperator: Runtime async operator to execute a ClWorkload
  * DependencyGraph (will likely be deprecated in later iterations)
* Add example
  * cl_fused_conv2d_elementwise_add.cpp to explain how to use the new interfaces
* Add internal translation layer
  * Refactor ClKernelBuildingAPI
  * Remove non-tile based gemm native kernel component
  * Minor interface changes
* Add integration tests

Resolves COMPMID-5161

Signed-off-by: SiCong Li
Change-Id: Ib987ed79289ab0bcbd3130d54f5793408d9f1240
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7510
Reviewed-by: Gian Marco Iodice
Reviewed-by: Gunes Bayir
Comments-Addressed: Arm Jenkins
Tested-by: Arm Jenkins
---
 arm_compute/core/TensorInfo.h                   |  18 +-
 arm_compute/core/Types.h                        |  17 +-
 arm_compute/core/Window.h                       |  21 +-
 arm_compute/core/Window.inl                     |   7 +-
 arm_compute/core/experimental/ClWorkload.h      | 220 +++++++++++++++++++
 arm_compute/core/experimental/DependencyGraph.h | 278 ++++++++++++++++++++++++
 arm_compute/core/experimental/IWorkload.h       | 133 ++++++++++++
 arm_compute/core/experimental/OperatorGraph.h   | 211 ++++++++++++++++++
 arm_compute/core/experimental/Types.h           |  28 +--
 9 files changed, 915 insertions(+), 18 deletions(-)
 create mode 100644 arm_compute/core/experimental/ClWorkload.h
 create mode 100644 arm_compute/core/experimental/DependencyGraph.h
 create mode 100644 arm_compute/core/experimental/IWorkload.h
 create mode 100644 arm_compute/core/experimental/OperatorGraph.h

diff --git a/arm_compute/core/TensorInfo.h b/arm_compute/core/TensorInfo.h
index 9bc86806fb..40f9ed9806 100644
--- a/arm_compute/core/TensorInfo.h
+++ b/arm_compute/core/TensorInfo.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -297,6 +297,7 @@ public:
     {
         _are_values_constant = are_values_constant;
         return *this;
     }
+    inline friend bool operator==(const TensorInfo &lhs, const TensorInfo &rhs);
 
 private:
     /** Calculates strides, offset and total size resulting from the specified padding around the XY plane.
@@ -320,5 +321,20 @@ private:
     DataLayout _data_layout;
     bool       _are_values_constant;
 };
+
+/** Check whether two tensor info are equal.
+ *
+ * @param[in] lhs LHS tensor info.
+ * @param[in] rhs RHS tensor info.
+ *
+ * @return True if the given tensor infos are the same.
+ */
+inline bool operator==(const TensorInfo &lhs, const TensorInfo &rhs)
+{
+    return (lhs._total_size == rhs._total_size) && (lhs._offset_first_element_in_bytes == rhs._offset_first_element_in_bytes) && (lhs._strides_in_bytes == rhs._strides_in_bytes)
+           && (lhs._num_channels == rhs._num_channels) && (lhs._tensor_shape == rhs._tensor_shape) && (lhs._dims_state == rhs._dims_state) && (lhs._data_type == rhs._data_type) && (lhs._format == rhs._format)
+           && (lhs._is_resizable == rhs._is_resizable) && (lhs._valid_region == rhs._valid_region) && (lhs._padding == rhs._padding) && (lhs._quantization_info == rhs._quantization_info)
+           && (lhs._data_layout == rhs._data_layout) && (lhs._are_values_constant == rhs._are_values_constant);
+}
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_TENSORINFO_H */
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
index 1548816e91..7ae6a7e67e 100644
--- a/arm_compute/core/Types.h
+++ b/arm_compute/core/Types.h
@@ -253,9 +253,22 @@ struct ValidRegion
         return *this;
     }
 
+    /** Check whether two valid regions are equal.
+     *
+     * @param[in] lhs LHS valid region
+     * @param[in] rhs RHS valid region
+     *
+     * @return True if the valid regions are the same.
+     */
+    inline friend bool operator==(const ValidRegion &lhs, const ValidRegion &rhs);
+
     Coordinates anchor; /**< Anchor for the start of the valid region. */
     TensorShape shape;  /**< Shape of the valid region. */
 };
+inline bool operator==(const ValidRegion &lhs, const ValidRegion &rhs)
+{
+    return (lhs.anchor == rhs.anchor) && (lhs.shape == rhs.shape);
+}
 
 /** Methods available to handle borders */
 enum class BorderMode
@@ -346,7 +359,7 @@ struct BorderSize
      *
      * @return true if they are equal
      */
-    bool operator==(const BorderSize &rhs)
+    bool operator==(const BorderSize &rhs) const
     {
         return (top == rhs.top) && (right == rhs.right) && (bottom == rhs.bottom) && (left == rhs.left);
     }
@@ -357,7 +370,7 @@ struct BorderSize
      *
      * @return true if they are different
      */
-    bool operator!=(const BorderSize &rhs)
+    bool operator!=(const BorderSize &rhs) const
     {
         return !(*this == rhs);
     }
diff --git a/arm_compute/core/Window.h b/arm_compute/core/Window.h
index f603e6c148..c566cffa88 100644
--- a/arm_compute/core/Window.h
+++ b/arm_compute/core/Window.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2020, 2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -123,6 +123,17 @@ public:
         {
             _end = end;
         }
+        /** Check whether two Dimensions are equal.
+         *
+         * @param[in] lhs LHS Dimensions
+         * @param[in] rhs RHS Dimensions
+         *
+         * @return True if the Dimensions are the same.
+         */
+        friend bool operator==(const Dimension &lhs, const Dimension &rhs)
+        {
+            return (lhs._start == rhs._start) && (lhs._end == rhs._end) && (lhs._step == rhs._step);
+        }
 
     private:
         int _start; /**< Start of the dimension */
@@ -414,6 +425,14 @@ public:
      * @param[in] rhs Second window to swap.
      */
     friend void swap(Window &lhs, Window &rhs);
+    /** Check whether two Windows are equal.
+     *
+     * @param[in] lhs LHS window
+     * @param[in] rhs RHS window
+     *
+     * @return True if the given windows are the same.
+     */
+    friend bool operator==(const Window &lhs, const Window &rhs);
 
 private:
     /** First slice of the window
diff --git a/arm_compute/core/Window.inl b/arm_compute/core/Window.inl
index 6100d09a1c..5ee4b57145 100644
--- a/arm_compute/core/Window.inl
+++ b/arm_compute/core/Window.inl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2020, 2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -305,4 +305,9 @@ inline void swap(Window &lhs, Window &rhs)
 {
     lhs._dims.swap(rhs._dims);
 }
+
+inline bool operator==(const Window &lhs, const Window &rhs)
+{
+    return (lhs._dims == rhs._dims) && (lhs._is_broadcasted == rhs._is_broadcasted);
+}
 } // namespace arm_compute
diff --git a/arm_compute/core/experimental/ClWorkload.h b/arm_compute/core/experimental/ClWorkload.h
new file mode 100644
index 0000000000..bcac08b9f7
--- /dev/null
+++ b/arm_compute/core/experimental/ClWorkload.h
@@ -0,0 +1,220 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
+#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLWORKLOAD_H
+#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLWORKLOAD_H
+
+#include "arm_compute/core/CL/CLCompileContext.h"
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/Window.h"
+
+#include "arm_compute/core/experimental/IWorkload.h"
+#include "arm_compute/core/experimental/OperatorGraph.h"
+
+#include <map>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Verbose and explicit way to enumerate all the tensor arguments variants used by
+ *  all kernel implementations.
+ *  This avoids any ambiguity in what kernel arguments are passed
+ */
+enum class ClKernelTensorArgType : int
+{
+    Scalar,
+
+    Vector,
+
+    Image,
+    Image_Reinterpret_As_3D,
+    Image_Export_To_ClImage2D,
+
+    Image_3D, // 3D Tensor represented as a 2D Image + stride_z
+    Image_3D_Export_To_ClImage2D,
+
+    Tensor_3D,
+    Tensor_4D,
+    Tensor_4D_t_Buffer,
+    Tensor_4D_t_Image
+};
+
+/** Describes all the info required to add a kernel argument at run time
+ *
+ * @note This struct can later be expanded into a more concise and formal way to specify how to set up
+ * arguments for a kernel inside a @ref ClUnitWorkload
+ */
+struct ClKernelArgDescriptor
+{
+    ClKernelArgDescriptor() = default;
+    ClKernelArgDescriptor(int arg_id, ClKernelTensorArgType type, bool slide_along_dimz = true)
+        : arg_id{ arg_id }, tensor_arg_type{ type }, slide_along_dimz{ slide_along_dimz }
+    {
+    }
+    ~ClKernelArgDescriptor() = default;
+    friend bool operator==(const ClKernelArgDescriptor &arg0, const ClKernelArgDescriptor &arg1)
+    {
+        return (arg0.tensor_arg_type == arg1.tensor_arg_type) && (arg0.slide_along_dimz == arg1.slide_along_dimz);
+    }
+    int                   arg_id{ -1 };                                    /**< Arg ID in the blueprint, -1 means empty / uninitialized */
+    ClKernelTensorArgType tensor_arg_type{ ClKernelTensorArgType::Image }; /**< tensor argument type */
+    bool                  slide_along_dimz{ true };                        /**< @note slide_along_dimz will be moved out of this descriptor in later iterations */
+};
+
+using ClKernelArgList = std::map<int, ClKernelArgDescriptor>;
+
+/** Descriptor containing information required to run a single ClWorkload
+ */
+struct ClExecutionDescriptor
+{
+    cl::NDRange suggested_lws{};              /**< Suggested local work-group size for optimal performance if not zero */
+    cl::NDRange gws{};                        /**< Global work-group to be used */
+    bool        skip_sliding_window{ false }; /**< Skip sliding window slices during execution loop */
+};
+
+/** Contains kernel code to be compiled and run in a ClUnitWorkload
+ */
+struct ClKernelCode
+{
+    friend bool operator==(const ClKernelCode &code0, const ClKernelCode &code1)
+    {
+        return (code0.name == code1.name) && (code0.code == code1.code) && (code0.config_id == code1.config_id) && (code0.build_options == code1.build_options) && (code0.window == code1.window)
+               && (code0.arguments == code1.arguments);
+    }
+    std::string     name{};          /**< Kernel name */
+    std::string     code{};          /**< Kernel source code */
+    std::string     config_id{};     /**< Generated from blueprint based on complex component */
+    CLBuildOptions  build_options{}; /**< Kernel build options */
+    Window          window{};        /**< Execution window */
+    ClKernelArgList arguments{};     /**< Kernel argument descriptors. map key is kernel ArgumentID */
+};
+
+/** A descriptor of ClWorkload Tensors.
+ */
+struct ClWorkloadTensor : public WorkloadTensor
+{
+    ClWorkloadTensor() = default;
+    ClWorkloadTensor(Id id, ITensorInfo *info, MemoryType memory_type, const AuxMemoryInfo &memory_info, const ClKernelArgDescriptor &kernel_arg)
+        : WorkloadTensor{ id, info, memory_type, memory_info }, kernel_arg{ kernel_arg }
+    {
+    }
+    ClKernelArgDescriptor kernel_arg{};
+    friend bool operator==(const ClWorkloadTensor &t0, const ClWorkloadTensor &t1)
+    {
+        return t0.info == t1.info && t0.memory_info == t1.memory_info && t0.memory_type == t1.memory_type && t0.kernel_arg == t1.kernel_arg;
+    }
+};
+
+/** The basic atomic unit in a @ref ClWorkload. It contains exactly one kernel to run.
+ */
+struct ClUnitWorkload : public UnitWorkload
+{
+    ClUnitWorkload() = default;
+    ClUnitWorkload(Id id, UnitWorkloadStage stage, const ClKernelCode &code)
+        : UnitWorkload{ id, stage }, code{ code }
+    {
+    }
+    friend bool operator==(const ClUnitWorkload &uworkload0, const ClUnitWorkload &uworkload1)
+    {
+        return uworkload0.stage == uworkload1.stage && uworkload0.code == uworkload1.code;
+    }
+    ClKernelCode code{};
+};
+
+/** GPU information for @ref ClWorkloadContext
+ */
+struct GpuInfo
+{
+    friend bool operator==(const GpuInfo &info0, const GpuInfo &info1)
+    {
+        return info0.target == info1.target;
+    }
+    GPUTarget target{ GPUTarget::UNKNOWN };
+};
+
+/** Context (device capabilities, platform details) associated with a ClWorkload
+ *
+ * It is required for building the @ref ClKernelCode and could also be used by the runtime (e.g. schedulers)
+ */
+struct ClWorkloadContext
+{
+    friend bool operator==(const ClWorkloadContext &ctx0, const ClWorkloadContext &ctx1)
+    {
+        return ctx0.gpu_info == ctx1.gpu_info;
+    }
+    GpuInfo gpu_info{};
+};
+
+/** Workload for Cl backend
+ */
+struct ClWorkload : public IWorkload
+{
+    Tid add_workload_tensor(ITensorInfo *info, MemoryType memory_type, const AuxMemoryInfo &memory_info, const ClKernelArgDescriptor &kernel_arg, Tid merge_point)
+    {
+        Tid id = graph.add_tensor(merge_point);
+        if(tensors.find(id) == tensors.end())
+        {
+            tensors[id] = ClWorkloadTensor(id, info, memory_type, memory_info, kernel_arg);
+        }
+        return id;
+    }
+    UnitWorkId add_unit_workload(UnitWorkloadStage stage, const ClKernelCode &code, const std::vector<Tid> &inputs, const std::vector<Tid> &outputs)
+    {
+        auto op = graph.add_operator(inputs, outputs);
+        auto id = op.second;
+        unit_workloads[id] = ClUnitWorkload(id, stage, code);
+        return id;
+    }
+    friend bool operator==(const ClWorkload &workload0, const ClWorkload &workload1)
+    {
+        return std::make_tuple(
+                   workload0.graph, workload0.context, workload0.unit_workloads, workload0.tensors, workload0.op_tensor_id_lut)
+               == std::make_tuple(
+                   workload1.graph, workload1.context, workload1.unit_workloads, workload1.tensors, workload1.op_tensor_id_lut);
+    }
+    ClWorkloadContext                    context{};          /**< Workload context */
+    std::map<UnitWorkId, ClUnitWorkload> unit_workloads{};   /**< Unit workloads to run */
+    std::map<Tid, ClWorkloadTensor>      tensors{};          /**< Workload tensors */
+    std::map<Tid, OpTensor::Id>          op_tensor_id_lut{}; /**< Map from ClWorkloadTensor to SRC and DST Operator Tensors (no need to store "intermediate" Operator Tensors) */
+    Status status{};                                         /**< For compatibility with the IOperator validate method. Store if the workload is valid or not. */
+};
+
+/** Build a @ref ClWorkload from an @ref OperatorGraph.
+ *
+ * @param[out] workload
+ * @param[in]  op_graph
+ * @param[in]  ctx
+ * @return Status
+ */
+Status build(ClWorkload &workload, const OperatorGraph &op_graph, const ClWorkloadContext &ctx);
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLWORKLOAD_H
\ No newline at end of file
diff --git a/arm_compute/core/experimental/DependencyGraph.h b/arm_compute/core/experimental/DependencyGraph.h
new file mode 100644
index 0000000000..794bf0e344
--- /dev/null
+++ b/arm_compute/core/experimental/DependencyGraph.h
@@ -0,0 +1,278 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
+#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_DEPENDENCYGRAPH_H
+#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_DEPENDENCYGRAPH_H
+
+#include "arm_compute/core/Error.h"
+
+#include <algorithm>
+#include <map>
+#include <vector>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+template <typename T>
+bool is_in(const T &v, const std::vector<T> &vec)
+{
+    return std::find(std::begin(vec), std::end(vec), v) != std::end(vec);
+}
+
+/** The dependency graph of a workload, where the nodes are of 2 types: Tensor or Operator
+ *  Represented as a doubly-linked adjacency list with the differentiation between source and destination
+ *
+ *  A "Merge Tensor" is an external tensor associated with a tensor within the graph, and serves as a merge point
+ */
+class DependencyGraph
+{
+public:
+    /** A serial Id allocator
+     *
+     */
+    class SerialIdAllocator
+    {
+    public:
+        using Id = int;
+        Id alloc()
+        {
+            return _counter++;
+        }
+        constexpr static Id empty()
+        {
+            return -1;
+        }
+
+    private:
+        Id _counter{ 0 };
+    };
+    using Id = SerialIdAllocator::Id;
+    /** Adjacency list
+     *
+     */
+    using AdjList = std::map<Id, std::vector<Id>>;
+
+    /** A pack of operator including its input and output tensors, used when traversing through the graph in topological order
+     *
+     */
+    struct OpPack
+    {
+        Id              op{};
+        std::vector<Id> inputs{};
+        std::vector<Id> outputs{};
+        friend bool operator==(const OpPack &opp0, const OpPack &opp1)
+        {
+            return std::make_tuple(
+                       opp0.op, opp0.inputs, opp0.outputs)
+                   == std::make_tuple(
+                       opp1.op, opp1.inputs, opp1.outputs);
+        }
+    };
+
+public:
+    constexpr static Id empty_id()
+    {
+        return SerialIdAllocator::empty();
+    }
+
+    DependencyGraph() = default;
+    // Used in cases where two DependencyGraphs may want to share the same configuration of tensors
+    explicit DependencyGraph(const std::vector<Id> &imported_tensors);
+    // Testing only
+    DependencyGraph(const AdjList &adj_src_tensors, const AdjList &adj_dst_tensors, const AdjList &adj_src_ops, const AdjList &adj_dst_ops, std::map<Id, Id> merge_points = {});
+
+    /** Add a new tensor
+     *
+     * @param merge_tensor The external merge point associated with the tensor. Leave empty if not needed.
+     * @return Id The newly allocated tensor, or a previously added tensor associated with @p merge_tensor
+     */
+    Id add_tensor(Id merge_tensor = empty_id());
+
+    void remove_tensor(Id tensor);
+
+    /** Add a new operator
+     *
+     * @param inputs  Input tensors to the operator
+     * @param outputs Output tensors to the operator
+     * @return std::pair<Status, Id> where the Id is the newly allocated operator
+     */
+    std::pair<Status, Id> add_operator(const std::vector<Id> &inputs, const std::vector<Id> &outputs);
+
+    void remove_operator(Id op);
+    /** Sort the graph in a topological order
+     *
+     * @return std::pair<Status, std::vector<OpPack>>
+     */
+    std::pair<Status, std::vector<OpPack>> topological_sort() const;
+
+    std::vector<Id> src_ops(Id op) const;
+    std::vector<Id> dst_ops(Id op) const;
+
+    std::vector<Id> src_ops_from_tensor(Id tensor) const;
+    std::vector<Id> dst_ops_from_tensor(Id tensor) const;
+    /** Get the merge points object
+     *
+     * @return std::map<Id, Id>
+     */
+    std::map<Id, Id> get_merge_points() const;
+    /** Get all root ops. Root ops can also be referred to as "src ops" of the whole graph
+     *
+     * @return std::vector<Id>
+     */
+    std::vector<Id> get_root_ops() const;
+    /** Get all dst ops of the whole graph
+     *
+     * @return std::vector<Id>
+     */
+    std::vector<Id> get_dst_ops() const;
+
+    /** Get source tensors to an operator
+     *
+     * @param op
+     * @return std::vector<Id>
+     */
+    std::vector<Id> src_tensors(Id op) const;
+    /** Get destination tensors to an operator
+     *
+     * @param op
+     * @return std::vector<Id>
+     */
+    std::vector<Id> dst_tensors(Id op) const;
+    /** Get source tensors of the whole graph
+     *
+     * @return std::vector<Id>
+     */
+    std::vector<Id> src_tensors() const;
+    /** Get destination tensors of the whole graph
+     *
+     * @return std::vector<Id>
+     */
+    std::vector<Id> dst_tensors() const;
+    /** Get all operators
+     *
+     * @return std::vector<Id>
+     */
+    std::vector<Id> all_ops() const;
+    /** Get all tensors
+     *
+     * @return std::vector<Id>
+     */
+    std::vector<Id> all_tensors() const;
+    /** Number of operators
+     *
+     * @return unsigned int
+     */
+    unsigned int number_of_ops() const;
+    /** Number of tensors
+     *
+     * @return unsigned int
+     */
+    unsigned int number_of_tensors() const;
+
+    /** Update @p merge_point to point to @p t_id
+     *
+     * @param t_id
+     * @param merge_point
+     */
+    Status update_merge_point(Id t_id, Id merge_point);
+
+    /** Strict equality comparison (all internal ids and order of insertion matter).
+     *  In the future this may be replaced with a topological comparison, allowing equivalent graphs with different internal ids to be equal
+     *
+     * @param g0
+     * @param g1
+     * @return true
+     * @return false
+     */
+    friend bool operator==(const DependencyGraph &g0, const DependencyGraph &g1)
+    {
+        // Do not compare id allocators
+        return std::make_tuple(
+                   g0._adj_src_tensors, g0._adj_dst_tensors, g0._adj_src_ops, g0._adj_dst_ops, g0._merge_to_internal)
+               == std::make_tuple(
+                   g1._adj_src_tensors, g1._adj_dst_tensors, g1._adj_src_ops, g1._adj_dst_ops, g1._merge_to_internal);
+    }
+    void link_input(Id op, Id in_tensor);
+    void link_output(Id op, Id out_tensor);
+    /** Check if there's a path from @p src_tensor to @p dst_op
+     *
+     * @param src_tensor
+     * @param dst_op
+     * @return true
+     * @return false
+     */
+    bool path_exists_from_tensor_to_op(Id src_tensor, Id dst_op) const;
+    /** Check if there's a path from @p src_op to @p dst_op
+     *
+     * @param src_op
+     * @param dst_op
+     * @return true
+     * @return false
+     */
+    bool path_exists_from_op_to_op(Id src_op, Id dst_op) const;
+    /** Check if tensor is the src tensor of the entire graph
+     *
+     * @param tensor
+     * @return true
+     * @return false
+     */
+    bool is_src_tensor(Id tensor) const;
+    /** Check if tensor is the dst tensor of the entire graph
+     *
+     * @param tensor
+     * @return true
+     * @return false
+     */
+    bool is_dst_tensor(Id tensor) const;
+
+private:
+    Id   insert_new_tensor();
+    Id   insert_new_op();
+    bool tensor_exists(Id tensor) const;
+    bool operator_exists(Id op) const;
+    bool is_src_tensor_of(Id op, Id tensor) const;
+    bool is_dst_tensor_of(Id op, Id tensor) const;
+    bool are_connected(Id op, Id tensor) const;
+
+private:
+    AdjList           _adj_src_tensors{};
+    AdjList           _adj_dst_tensors{};
+    AdjList           _adj_src_ops{};
+    AdjList           _adj_dst_ops{};
+    std::map<Id, Id>  _merge_to_internal{}; // From merge tensor to internal tensor
+    SerialIdAllocator _operator_id{};
+    SerialIdAllocator _tensor_id{};
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_DEPENDENCYGRAPH_H
\ No newline at end of file
diff --git a/arm_compute/core/experimental/IWorkload.h b/arm_compute/core/experimental/IWorkload.h
new file mode 100644
index 0000000000..942dbb70bb
--- /dev/null
+++ b/arm_compute/core/experimental/IWorkload.h
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
+#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IWORKLOAD_H
+#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IWORKLOAD_H
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/experimental/Types.h"
+
+#include "arm_compute/core/experimental/DependencyGraph.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Describes when a Unit Workload is run.
+ *
+ */
+struct UnitWorkloadStage
+{
+    enum class Stage
+    {
+        Prepare, /**< Only run once at the beginning. */
+        Run,     /**< Run every time after the first time. */
+    };
+    Stage stage;
+    friend bool operator==(const UnitWorkloadStage &stage0, const UnitWorkloadStage &stage1)
+    {
+        return stage0.stage == stage1.stage;
+    }
+};
+/** Type of memory used by a Workload Tensor
+ *
+ */
+enum class MemoryType
+{
+    Core      = 0, /**< Core memory used by the Workload Tensor, e.g. for argument tensors */
+    Auxiliary = 1, /**< Auxiliary memory required by the Workload Tensor, e.g. for temporary tensors */
+};
+
+using AuxMemoryLifetime = MemoryLifetime;
+
+/** Memory Info for a @ref WorkloadTensor of Auxiliary memory type. This communicates to the user how much additional
+ *  memory is required for auxiliary tensors
+ */
+struct AuxMemoryInfo
+{
+    AuxMemoryInfo() = default;
+
+    AuxMemoryInfo(size_t size, size_t alignment = 0) noexcept
+        : size(size),
+          alignment(alignment)
+    {
+    }
+
+    AuxMemoryInfo(AuxMemoryLifetime lifetime, size_t size, size_t alignment = 0) noexcept
+        : lifetime(lifetime),
+          size(size),
+          alignment(alignment)
+    {
+    }
+    friend bool operator==(const AuxMemoryInfo &info0, const AuxMemoryInfo &info1)
+    {
+        return info0.lifetime == info1.lifetime && info0.size == info1.size && info0.alignment == info1.alignment;
+    }
+
+    AuxMemoryLifetime lifetime{ AuxMemoryLifetime::Temporary }; /**< Memory lifetime */
+    size_t            size{ 0 };                                /**< Total memory size in bytes */
+    size_t            alignment{ 64 };                          /**< Memory alignment in bytes */
+};
+
+/** A descriptor for IWorkload Tensors.
+ */
+struct WorkloadTensor
+{
+    using Id = DependencyGraph::Id;
+    Id            id{};          /**< Id of the workload tensor */
+    ITensorInfo  *info{};        /**< TensorInfo associated with the workload tensor */
+    MemoryType    memory_type{}; /**< Memory type */
+    AuxMemoryInfo memory_info{}; /**< Auxiliary memory information. This can be ignored if the memory type is Core */
+};
+/** The basic atomic unit in an @ref IWorkload. It contains exactly one kernel to run.
+ *
+ */
+struct UnitWorkload
+{
+    using Id = DependencyGraph::Id;
+    Id                id{};    /**< Id of the unit workload */
+    UnitWorkloadStage stage{}; /**< Stage */
+};
+
+/** Run-time-agnostic, platform-specific graph that describes everything required to run a workload
+ *  It can be configured into an Arm Compute Library runtime, integrated into the runtime of another framework, or integrated into the compilation flow
+ */
+struct IWorkload
+{
+    using UnitWorkId = UnitWorkload::Id;
+    using Tid        = WorkloadTensor::Id;
+    IWorkload()          = default;
+    virtual ~IWorkload() = default;
+    DependencyGraph graph{}; /**< Dependency graph of the workload tensors and the unit workloads */
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IWORKLOAD_H
\ No newline at end of file
diff --git a/arm_compute/core/experimental/OperatorGraph.h b/arm_compute/core/experimental/OperatorGraph.h
new file mode 100644
index 0000000000..621a719fe6
--- /dev/null
+++ b/arm_compute/core/experimental/OperatorGraph.h
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
+
+#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_OPERATORGRAPH
+#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_OPERATORGRAPH
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/ITensorInfo.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Graph of operators to execute within a Workload. This is a pure descriptive construct.
+ */
+class OperatorGraph final
+{
+public:
+    struct Implementation;
+    OperatorGraph();
+    ~OperatorGraph();
+
+public:
+    Implementation       *impl();
+    const Implementation *impl() const;
+
+private:
+    std::unique_ptr<Implementation> _impl;
+};
+
+/** Return the validity of @p op_graph, usually after performing an operation (e.g. add_tensor) on it
+ *
+ * @param[in,out] op_graph OperatorGraph to be validated
+ *
+ * @return Status
+ */
+Status validate(const OperatorGraph &op_graph);
+
+/** Operator Tensor Handle
+ *  This can be either an argument tensor, or an intermediate tensor linking 2 @ref Operator s
+ */
+class OpTensor final
+{
+public:
+    using Id = int;
+    OpTensor(Id id = {});
+    /** Id of the OpTensor
+     * @return Id
+     */
+    Id id() const;
+
+private:
+    Id _id{};
+};
+
+/** Provide order of @ref OpTensor by checking if @p t0 is "lower than" @p t1
+ *
+ * @param[in] t0 OpTensor
+ * @param[in] t1 OpTensor
+ *
+ * @return true  if @p t0 is lower than @p t1
+ * @return false otherwise
+ */
+bool operator<(const OpTensor &t0, const OpTensor &t1);
+
+/** Associate a TensorInfo with a newly created @ref OpTensor in the @p graph.
+ *
+ * @note @p info needs to remain in scope and valid until the workload has finished building
+ * @note Can pass in an empty TensorInfo for a destination Tensor, in which case @p info will be inferred from the source tensors
+ *
+ * @param[in,out] graph OperatorGraph where the tensor is added
+ * @param[in]     info  TensorInfo to be associated
+ *
+ * @return OpTensor
+ */
+OpTensor add_tensor(OperatorGraph &graph, ITensorInfo &info);
+
+/** Operator Handle
+ *  This can be used to further modify an existing operator
+ */
+class Operator final
+{
+public:
+    using Id = int;
+    Operator(Id id = {});
+    /** Id of the Operator
+     * @return Id
+     */
+    Id id() const;
+
+private:
+    Id _id{};
+};
+
+/** Provide order of @ref Operator by checking if @p op0 is "lower than" @p op1
+ *
+ * @param[in] op0 Operator
+ * @param[in] op1 Operator
+ *
+ * @return true  if @p op0 is lower than @p op1
+ * @return false otherwise
+ */
+bool operator<(const Operator &op0, const Operator &op1);
+
+/** Padding information for 2D operations such as Conv2d
+ */
+struct Padding2D
+{
+    Padding2D() = default;
+    Padding2D(size_t left, size_t right, size_t top, size_t bottom)
+        : left(left), right(right), top(top), bottom(bottom)
+    {
+    }
+    size_t left   = { 0 }; /**< Padding across the width dimension on the left, in elements. */
+    size_t right  = { 0 }; /**< Padding across the width dimension on the right, in elements. */
+    size_t top    = { 0 }; /**< Padding across the height dimension on the top, in elements. */
+    size_t bottom = { 0 }; /**< Padding across the height dimension on the bottom, in elements. */
*/ +}; + +/** Descriptor for Conv2dDescriptor operation + */ +struct Conv2dDescriptor +{ + /* TOSA compliant attribute parameters start */ + Padding2D pad{}; + Size2D stride{ 1U, 1U }; + Size2D dilation{ 1U, 1U }; + /* TOSA compliant attribute parameters end */ + /* Non-TOSA compliant attribute parameters start */ + /* Non-TOSA compliant attribute parameters end */ +}; +/** Add op Conv2d to @p graph + * + * @param[in,out] graph OperatorGraph where the operator is added to + * @param[in] desc Operator descriptor + * @param[in] input Input OpTensor + * @param[in] weights Weights OpTensor + * @param[in] bias (Optional) bias OpTensor + * @param[in] dst Destination OpTensor + * + * @return Operator + */ +Operator add_op_conv2d(OperatorGraph &graph, const Conv2dDescriptor &desc, OpTensor input, OpTensor weights, OpTensor bias, OpTensor dst); +Operator add_op_conv2d(OperatorGraph &graph, const Conv2dDescriptor &desc, OpTensor input, OpTensor weights, OpTensor dst); +/** (Only for Debuging and Testing) Force a conv2d method + * + * @param[in,out] graph OperatorGraph where conv2d op is located + * @param[in] conv2d Conv2d Op + * @param[in] method Forced ConvolutionMethod + */ +void force_conv2d_method(OperatorGraph &graph, Operator conv2d, ConvolutionMethod method); + +/** Descriptor for Addition operation + * + */ +struct AddDescriptor +{ + /* TOSA compliant attribute parameters start */ + /* TOSA compliant attribute parameters end */ + /* Non-TOSA compliant attribute parameters start */ + /* Non-TOSA compliant attribute parameters end */ +}; +/** Add op Add to @p graph, and optionally describes fusion through passing of intermediate @ref OpTensor s + * + * @param[in,out] graph OperatorGraph where the operator is added to + * @param[in] desc Operator descriptor + * @param[in] lhs Lhs OpTensor + * @param[in] rhs Rhs OpTensor + * @param[in] dst Destination OpTensor + * + * @return Operator + */ +Operator add_op_elementwise_add(OperatorGraph &graph, const AddDescriptor &desc, OpTensor lhs, OpTensor rhs, OpTensor dst); + +bool operator==(const OpTensor &t0, const OpTensor &t1); +bool operator==(const Padding2D &pad0, const Padding2D &pad1); +bool operator==(const Conv2dDescriptor &conv2d0, const Conv2dDescriptor &conv2d1); +bool operator==(const AddDescriptor &, const AddDescriptor &); + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_OPERATORGRAPH \ No newline at end of file diff --git a/arm_compute/core/experimental/Types.h b/arm_compute/core/experimental/Types.h index c8755dc26c..1995ab045e 100644 --- a/arm_compute/core/experimental/Types.h +++ b/arm_compute/core/experimental/Types.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021 Arm Limited. + * Copyright (c) 2020-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -41,20 +41,22 @@ enum TensorType : int32_t ACL_SRC_DST = 0, // Src - ACL_SRC = 0, - ACL_SRC_0 = 0, - ACL_SRC_1 = 1, - ACL_SRC_2 = 2, - ACL_SRC_3 = 3, - ACL_SRC_4 = 4, - ACL_SRC_5 = 5, - ACL_SRC_6 = 6, + ACL_SRC = 0, + ACL_SRC_0 = 0, + ACL_SRC_1 = 1, + ACL_SRC_2 = 2, + ACL_SRC_3 = 3, + ACL_SRC_4 = 4, + ACL_SRC_5 = 5, + ACL_SRC_6 = 6, + ACL_SRC_END = 6, // Dst - ACL_DST = 30, - ACL_DST_0 = 30, - ACL_DST_1 = 31, - ACL_DST_2 = 32, + ACL_DST = 30, + ACL_DST_0 = 30, + ACL_DST_1 = 31, + ACL_DST_2 = 32, + ACL_DST_END = 32, // Aux ACL_INT = 50, -- cgit v1.2.1