/*
 * Copyright (c) 2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLWORKLOAD_H
#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLWORKLOAD_H

#include "arm_compute/core/CL/CLCompileContext.h"
#include "arm_compute/core/GPUTarget.h"
#include "arm_compute/core/Window.h"

#include "arm_compute/core/experimental/IWorkload.h"
#include "arm_compute/core/experimental/OperatorGraph.h"

#include <map>

namespace arm_compute
{
namespace experimental
{
namespace dynamic_fusion
{
/** Verbose and explicit way to enumerate all the tensor argument variants used by
 *  all kernel implementations. This avoids any ambiguity in which kernel arguments are passed.
 */
enum class ClKernelTensorArgType : int
{
    Scalar,

    Vector,

    Image,
    Image_Reinterpret_As_3D,
    Image_Export_To_ClImage2D,

    Image_3D, // 3D Tensor represented as a 2D Image + stride_z
    Image_3D_Export_To_ClImage2D,

    Tensor_3D,
    Tensor_4D,
    Tensor_4D_t_Buffer,
    Tensor_4D_t_Image
};

/** Describes all the info required to add a kernel argument at run time
 *
 *  @note This struct can later be expanded into a more concise and formal way to specify how to set up
 *  arguments for a kernel inside a @ref ClUnitWorkload
 */
struct ClKernelArgDescriptor
{
    ClKernelArgDescriptor() = default;
    ClKernelArgDescriptor(int arg_id, ClKernelTensorArgType type, bool slide_along_dimz = true)
        : arg_id{ arg_id }, tensor_arg_type{ type }, slide_along_dimz{ slide_along_dimz }
    {
    }
    ~ClKernelArgDescriptor() = default;
    friend bool operator==(const ClKernelArgDescriptor &arg0, const ClKernelArgDescriptor &arg1)
    {
        // arg_id is not compared here: it serves as the key of the enclosing ClKernelArgList map
        return (arg0.tensor_arg_type == arg1.tensor_arg_type) && (arg0.slide_along_dimz == arg1.slide_along_dimz);
    }
    int                   arg_id{ -1 };                                    /**< Arg ID in the blueprint, -1 means empty / uninitialized */
    ClKernelTensorArgType tensor_arg_type{ ClKernelTensorArgType::Image }; /**< tensor argument type */
    bool                  slide_along_dimz{ true };                        /**< @note slide_along_dimz will be moved out of this descriptor in later iterations */
};

using ClKernelArgList = std::map<int, ClKernelArgDescriptor>;
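
/* Example (illustrative sketch only): populating a ClKernelArgList. The map key
 * mirrors the descriptor's arg_id; the IDs and argument types used here are
 * hypothetical:
 *
 *   ClKernelArgList args{};
 *   args[0] = ClKernelArgDescriptor(0, ClKernelTensorArgType::Image);             // e.g. a src tensor
 *   args[1] = ClKernelArgDescriptor(1, ClKernelTensorArgType::Tensor_4D, false);  // e.g. a dst tensor, not slid along dim z
 */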

/** Descriptor containing the information required to run a single @ref ClWorkload
 */
struct ClExecutionDescriptor
{
    cl::NDRange suggested_lws{};              /**< Suggested local work-group size for optimal performance, if not zero */
    cl::NDRange gws{};                        /**< Global work size to be used */
    bool        skip_sliding_window{ false }; /**< Skip sliding window slices during the execution loop */
};
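
/* Example (illustrative sketch only): filling a ClExecutionDescriptor with a
 * hypothetical global work size. Leaving suggested_lws zero lets the runtime
 * pick the local work-group size:
 *
 *   ClExecutionDescriptor exec_desc{};
 *   exec_desc.gws                 = cl::NDRange(128, 128, 1);
 *   exec_desc.suggested_lws       = cl::NDRange(); // zero => runtime decides
 *   exec_desc.skip_sliding_window = false;
 */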

/** Contains the kernel code to be compiled and run in a @ref ClUnitWorkload
 */
struct ClKernelCode
{
    friend bool operator==(const ClKernelCode &code0, const ClKernelCode &code1)
    {
        return (code0.name == code1.name) && (code0.code == code1.code) && (code0.config_id == code1.config_id) && (code0.build_options == code1.build_options) && (code0.window == code1.window)
               && (code0.arguments == code1.arguments);
    }
    std::string     name{};          /**< Kernel name */
    std::string     code{};          /**< Kernel source code */
    std::string     config_id{};     /**< Kernel config ID, generated from the blueprint based on the complex component */
    CLBuildOptions  build_options{}; /**< Kernel build options */
    Window          window{};        /**< Execution window */
    ClKernelArgList arguments{};     /**< Kernel argument descriptors. map key is kernel ArgumentID */
};
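
/* Example (illustrative sketch only, all values hypothetical): assembling a
 * ClKernelCode by hand. In practice these fields are generated from a kernel
 * blueprint rather than written manually:
 *
 *   ClKernelCode code{};
 *   code.name      = "fused_kernel";                             // hypothetical kernel name
 *   code.code      = "__kernel void fused_kernel(...) { ... }";  // generated OpenCL C source
 *   code.config_id = "fused_kernel_config";                      // hypothetical config id
 *   code.build_options.add_option("-DDATA_TYPE=float");
 *   code.window    = Window{};                                   // execution window from the blueprint
 *   code.arguments = args;                                       // ClKernelArgList as in the example above
 */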

/** A descriptor of @ref ClWorkload tensors.
 */
struct ClWorkloadTensor : public WorkloadTensor
{
    ClWorkloadTensor() = default;
    ClWorkloadTensor(Id id, ITensorInfo *info, MemoryType memory_type, const AuxMemoryInfo &memory_info, const ClKernelArgDescriptor &kernel_arg)
        : WorkloadTensor{ id, info, memory_type, memory_info }, kernel_arg{ kernel_arg }
    {
    }
    ClKernelArgDescriptor kernel_arg{};
    friend bool operator==(const ClWorkloadTensor &t0, const ClWorkloadTensor &t1)
    {
        return t0.info == t1.info && t0.memory_info == t1.memory_info && t0.memory_type == t1.memory_type && t0.kernel_arg == t1.kernel_arg;
    }
};

/** The basic atomic unit in a @ref ClWorkload. It contains exactly one kernel to run.
 */
struct ClUnitWorkload : public UnitWorkload
{
    ClUnitWorkload() = default;
    ClUnitWorkload(Id id, UnitWorkloadStage stage, const ClKernelCode &code)
        : UnitWorkload{ id, stage }, code{ code }
    {
    }
    friend bool operator==(const ClUnitWorkload &uworkload0, const ClUnitWorkload &uworkload1)
    {
        return uworkload0.stage == uworkload1.stage && uworkload0.code == uworkload1.code;
    }
    ClKernelCode code{};
};

/** GPU information for @ref ClWorkloadContext
 */
struct GpuInfo
{
    friend bool operator==(const GpuInfo &info0, const GpuInfo &info1)
    {
        return info0.target == info1.target;
    }
    GPUTarget target{ GPUTarget::UNKNOWN };
};

/** Context (device capabilities, platform details) associated with a @ref ClWorkload
 *
 * It is required for building the @ref ClKernelCode and could also be used by the runtime (e.g. schedulers).
 */
struct ClWorkloadContext
{
    friend bool operator==(const ClWorkloadContext &ctx0, const ClWorkloadContext &ctx1)
    {
        return ctx0.gpu_info == ctx1.gpu_info;
    }
    GpuInfo gpu_info{};
};

/** Workload for the Cl backend
 */
struct ClWorkload : public IWorkload
{
    /** Add a tensor to the workload's dependency graph (merging at @p merge_point if given) and record its
     *  @ref ClWorkloadTensor description. Returns the id of the tensor within this workload.
     */
    Tid add_workload_tensor(ITensorInfo *info, MemoryType memory_type, const AuxMemoryInfo &memory_info, const ClKernelArgDescriptor &kernel_arg, Tid merge_point)
    {
        Tid id = graph.add_tensor(merge_point);
        if(tensors.find(id) == tensors.end())
        {
            tensors[id] = ClWorkloadTensor(id, info, memory_type, memory_info, kernel_arg);
        }
        return id;
    }
    /** Add a unit workload that runs @p code at @p stage, consuming @p inputs and producing @p outputs.
     *  Returns the id of the unit workload within this workload.
     */
    UnitWorkId add_unit_workload(UnitWorkloadStage stage, const ClKernelCode &code, const std::vector<Tid> &inputs, const std::vector<Tid> &outputs)
    {
        auto op            = graph.add_operator(inputs, outputs);
        auto id            = op.second;
        unit_workloads[id] = ClUnitWorkload(id, stage, code);
        return id;
    }
    friend bool operator==(const ClWorkload &workload0, const ClWorkload &workload1)
    {
        return std::make_tuple(
                   workload0.graph, workload0.context, workload0.unit_workloads, workload0.tensors, workload0.op_tensor_id_lut)
               == std::make_tuple(
                   workload1.graph, workload1.context, workload1.unit_workloads, workload1.tensors, workload1.op_tensor_id_lut);
    }
    ClWorkloadContext context{};                             /**< Workload context */
    std::map<UnitWorkId, ClUnitWorkload> unit_workloads{};   /**< Unit workloads to run */
    std::map<Tid, ClWorkloadTensor>      tensors{};          /**< Workload tensors */
    std::map<Tid, OpTensor::Id>          op_tensor_id_lut{}; /**< Map from ClWorkloadTensor to SRC and DST Operator Tensors (no need to store "intermediate" Operator Tensors) */
    Status status{};                                         /**< For compatibility with the IOperator validate method. Stores whether the workload is valid. */
};
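
/* Example (illustrative sketch only): assembling a ClWorkload by hand. Normally
 * this is done by build() below, from an OperatorGraph. The placeholders src_info,
 * dst_info (ITensorInfo*), mem_type (MemoryType), src_arg, dst_arg
 * (ClKernelArgDescriptor), no_merge (Tid sentinel) and stage (UnitWorkloadStage,
 * see IWorkload.h) are assumptions for illustration:
 *
 *   ClWorkload workload{};
 *   const Tid src = workload.add_workload_tensor(src_info, mem_type, AuxMemoryInfo{}, src_arg, no_merge);
 *   const Tid dst = workload.add_workload_tensor(dst_info, mem_type, AuxMemoryInfo{}, dst_arg, no_merge);
 *   workload.add_unit_workload(stage, code, { src }, { dst });
 */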

/** Build a @ref ClWorkload from an @ref OperatorGraph.
 *
 * @param[out] workload Built workload
 * @param[in]  op_graph Operator graph from which to build the workload
 * @param[in]  ctx      Workload context
 *
 * @return Status
 */
Status build(ClWorkload &workload, const OperatorGraph &op_graph, const ClWorkloadContext &ctx);
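
/* Example (illustrative sketch only): typical use of build(). Construction of the
 * OperatorGraph is elided (see OperatorGraph.h), and the GPU target value is
 * hypothetical:
 *
 *   OperatorGraph op_graph{};
 *   // ... describe and fuse operators on op_graph ...
 *   ClWorkloadContext ctx{};
 *   ctx.gpu_info.target = GPUTarget::VALHALL;
 *   ClWorkload   workload{};
 *   const Status st = build(workload, op_graph, ctx);
 *   // workload.status also records the outcome, for use with IOperator-style validation
 */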

} // namespace dynamic_fusion
} // namespace experimental
} // namespace arm_compute

#endif /* ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLWORKLOAD_H */