Diffstat (limited to 'arm_compute/core/experimental/ClWorkload.h')
-rw-r--r--  arm_compute/core/experimental/ClWorkload.h | 220
1 file changed, 220 insertions(+), 0 deletions(-)
diff --git a/arm_compute/core/experimental/ClWorkload.h b/arm_compute/core/experimental/ClWorkload.h
new file mode 100644
index 0000000000..bcac08b9f7
--- /dev/null
+++ b/arm_compute/core/experimental/ClWorkload.h
@@ -0,0 +1,220 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
+#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLWORKLOAD_H
+#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLWORKLOAD_H
+
+#include "arm_compute/core/CL/CLCompileContext.h"
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/Window.h"
+
+#include "arm_compute/core/experimental/IWorkload.h"
+#include "arm_compute/core/experimental/OperatorGraph.h"
+
+#include <map>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Verbose and explicit way to enumerate all the tensor argument variants used by
+ * all kernel implementations. This avoids any ambiguity in what kernel arguments are passed.
+ */
+enum class ClKernelTensorArgType : int
+{
+    Scalar,
+
+    Vector,
+
+    Image,
+    Image_Reinterpret_As_3D,
+    Image_Export_To_ClImage2D,
+
+    Image_3D, // 3D Tensor represented as a 2D Image + stride_z
+    Image_3D_Export_To_ClImage2D,
+
+    Tensor_3D,
+    Tensor_4D,
+    Tensor_4D_t_Buffer,
+    Tensor_4D_t_Image
+};
+
+/** Describes all the info required to add a kernel argument at run time
+ *
+ * @note This struct may later evolve into a more concise and formal way to specify how to set up
+ * arguments for a kernel inside a @ref ClUnitWorkload
+ */
+struct ClKernelArgDescriptor
+{
+    ClKernelArgDescriptor() = default;
+    ClKernelArgDescriptor(int arg_id, ClKernelTensorArgType type, bool slide_along_dimz = true)
+        : arg_id{ arg_id }, tensor_arg_type{ type }, slide_along_dimz{ slide_along_dimz }
+    {
+    }
+    ~ClKernelArgDescriptor() = default;
+    friend bool operator==(const ClKernelArgDescriptor &arg0, const ClKernelArgDescriptor &arg1)
+    {
+        return (arg0.tensor_arg_type == arg1.tensor_arg_type) && (arg0.slide_along_dimz == arg1.slide_along_dimz);
+    }
+    int                   arg_id{ -1 };                                    /**< Arg ID in the blueprint, -1 means empty / uninitialized */
+    ClKernelTensorArgType tensor_arg_type{ ClKernelTensorArgType::Image }; /**< Tensor argument type */
+    bool                  slide_along_dimz{ true };                        /**< @note slide_along_dimz will be moved out of this descriptor in later iterations */
+};
+
+using ClKernelArgList = std::map<int, ClKernelArgDescriptor>;
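+
+// Illustrative sketch (not part of this header's API): how a kernel argument
+// might be described and registered. The argument ID 0 and the Tensor_3D type
+// are hypothetical choices, not values mandated by the API.
+//
+//   ClKernelArgList args{};
+//   args[0] = ClKernelArgDescriptor{ 0, ClKernelTensorArgType::Tensor_3D, false };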
+
+/** Descriptor containing information required to run a single ClWorkload
+ */
+struct ClExecutionDescriptor
+{
+    cl::NDRange suggested_lws{};              /**< Suggested local work-group size for optimal performance if not zero */
+    cl::NDRange gws{};                        /**< Global work-group to be used */
+    bool        skip_sliding_window{ false }; /**< Skip sliding window slices during execution loop */
+};
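+
+// Illustrative sketch (sizes are hypothetical): filling in an execution
+// descriptor. A default-constructed cl::NDRange has zero dimensions, which is
+// the "if not zero" case above, i.e. no local work-group size hint.
+//
+//   ClExecutionDescriptor exec_desc{};
+//   exec_desc.gws           = cl::NDRange(128, 128); // global work size
+//   exec_desc.suggested_lws = cl::NDRange();         // empty => no LWS hint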
+
+/** Contains kernel code to be compiled and run in a ClUnitWorkload
+ */
+struct ClKernelCode
+{
+    friend bool operator==(const ClKernelCode &code0, const ClKernelCode &code1)
+    {
+        return (code0.name == code1.name) && (code0.code == code1.code) && (code0.config_id == code1.config_id) && (code0.build_options == code1.build_options) && (code0.window == code1.window)
+               && (code0.arguments == code1.arguments);
+    }
+    std::string     name{};          /**< Kernel name */
+    std::string     code{};          /**< Kernel source code */
+    std::string     config_id{};     /**< Generated from blueprint based on complex component */
+    CLBuildOptions  build_options{}; /**< Kernel build options */
+    Window          window{};        /**< Execution window */
+    ClKernelArgList arguments{};     /**< Kernel argument descriptors; the map key is the kernel ArgumentID */
+};
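+
+// Illustrative sketch: populating a ClKernelCode by hand. The kernel name,
+// source string and build option below are placeholders rather than real
+// Compute Library kernels.
+//
+//   ClKernelCode kernel{};
+//   kernel.name = "example_kernel";
+//   kernel.code = "__kernel void example_kernel(__global float *buf) {}";
+//   kernel.build_options.add_option("-DDATA_TYPE=float");
+//   kernel.arguments[0] = ClKernelArgDescriptor{ 0, ClKernelTensorArgType::Image };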
+
+/** A descriptor of a @ref ClWorkload tensor.
+ */
+struct ClWorkloadTensor : public WorkloadTensor
+{
+    ClWorkloadTensor() = default;
+    ClWorkloadTensor(Id id, ITensorInfo *info, MemoryType memory_type, const AuxMemoryInfo &memory_info, const ClKernelArgDescriptor &kernel_arg)
+        : WorkloadTensor{ id, info, memory_type, memory_info }, kernel_arg{ kernel_arg }
+    {
+    }
+    ClKernelArgDescriptor kernel_arg{};
+    friend bool operator==(const ClWorkloadTensor &t0, const ClWorkloadTensor &t1)
+    {
+        return t0.info == t1.info && t0.memory_info == t1.memory_info && t0.memory_type == t1.memory_type && t0.kernel_arg == t1.kernel_arg;
+    }
+};
+
+/** The basic atomic unit in a @ref ClWorkload. It contains exactly one kernel to run.
+ */
+struct ClUnitWorkload : public UnitWorkload
+{
+    ClUnitWorkload() = default;
+    ClUnitWorkload(Id id, UnitWorkloadStage stage, const ClKernelCode &code)
+        : UnitWorkload{ id, stage }, code{ code }
+    {
+    }
+    friend bool operator==(const ClUnitWorkload &uworkload0, const ClUnitWorkload &uworkload1)
+    {
+        return uworkload0.stage == uworkload1.stage && uworkload0.code == uworkload1.code;
+    }
+    ClKernelCode code{};
+};
+
+/** GPU information for @ref ClWorkloadContext
+ */
+struct GpuInfo
+{
+    friend bool operator==(const GpuInfo &info0, const GpuInfo &info1)
+    {
+        return info0.target == info1.target;
+    }
+    GPUTarget target{ GPUTarget::UNKNOWN };
+};
+
+/** Context (device capabilities, platform details) associated with a ClWorkload
+ *
+ * It is required for building the @ref ClKernelCode and could also be used by the runtime (e.g. schedulers)
+ */
+struct ClWorkloadContext
+{
+    friend bool operator==(const ClWorkloadContext &ctx0, const ClWorkloadContext &ctx1)
+    {
+        return ctx0.gpu_info == ctx1.gpu_info;
+    }
+    GpuInfo gpu_info{};
+};
+
+/** Workload for Cl backend
+ */
+struct ClWorkload : public IWorkload
+{
+    Tid add_workload_tensor(ITensorInfo *info, MemoryType memory_type, const AuxMemoryInfo &memory_info, const ClKernelArgDescriptor &kernel_arg, Tid merge_point)
+    {
+        Tid id = graph.add_tensor(merge_point);
+        if(tensors.find(id) == tensors.end())
+        {
+            tensors[id] = ClWorkloadTensor(id, info, memory_type, memory_info, kernel_arg);
+        }
+        return id;
+    }
+    UnitWorkId add_unit_workload(UnitWorkloadStage stage, const ClKernelCode &code, const std::vector<Tid> &inputs, const std::vector<Tid> &outputs)
+    {
+        auto op = graph.add_operator(inputs, outputs);
+        auto id = op.second;
+        unit_workloads[id] = ClUnitWorkload(id, stage, code);
+        return id;
+    }
+    friend bool operator==(const ClWorkload &workload0, const ClWorkload &workload1)
+    {
+        return std::make_tuple(
+                   workload0.graph, workload0.context, workload0.unit_workloads, workload0.tensors, workload0.op_tensor_id_lut)
+               == std::make_tuple(
+                   workload1.graph, workload1.context, workload1.unit_workloads, workload1.tensors, workload1.op_tensor_id_lut);
+    }
+    ClWorkloadContext context{};                           /**< Workload context */
+    std::map<UnitWorkId, ClUnitWorkload> unit_workloads{}; /**< Unit workloads to run */
+    std::map<Tid, ClWorkloadTensor> tensors{};             /**< Workload tensors */
+    std::map<Tid, OpTensor::Id> op_tensor_id_lut{};        /**< Map from ClWorkloadTensor to SRC and DST Operator Tensors (no need to store "intermediate" Operator Tensors) */
+    Status status{};                                       /**< For compatibility with the IOperator validate method. Stores whether the workload is valid. */
+};
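+
+// Illustrative sketch (info, mem_type, aux_info, arg_desc, merge_pt, stage and
+// kernel_code are assumed to come from earlier blueprint/build steps): adding a
+// tensor to the workload graph, then a unit workload that consumes it.
+//
+//   ClWorkload workload{};
+//   Tid        t0 = workload.add_workload_tensor(&info, mem_type, aux_info, arg_desc, merge_pt);
+//   UnitWorkId w0 = workload.add_unit_workload(stage, kernel_code, { t0 }, { /* outputs */ });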
+
+/** Build a @ref ClWorkload from an @ref OperatorGraph
+ *
+ * @param[out] workload Built ClWorkload
+ * @param[in]  op_graph OperatorGraph to build the workload from
+ * @param[in]  ctx      ClWorkloadContext to build the workload with
+ * @return Status
+ */
+Status build(ClWorkload &workload, const OperatorGraph &op_graph, const ClWorkloadContext &ctx);
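+
+// Illustrative usage (assumes an OperatorGraph op_graph populated beforehand;
+// the BIFROST target is an arbitrary example):
+//
+//   ClWorkloadContext ctx{ GpuInfo{ GPUTarget::BIFROST } };
+//   ClWorkload        workload{};
+//   const Status st = build(workload, op_graph, ctx);
+//   if(st.error_code() != ErrorCode::OK)
+//   {
+//       // report st.error_description()
+//   }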
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLWORKLOAD_H
\ No newline at end of file