From b63b1196adea8b07dd8db77c2492a212650deba0 Mon Sep 17 00:00:00 2001 From: SiCong Li Date: Fri, 28 Jan 2022 18:24:39 +0000 Subject: Integrate Dynamic Fusion patches * Add public interfaces: * OperatorGraph: Describe a workload that could contain fused kernels * IWorkload: Generic interface for workloads built from OperatorGraph * ClWorkload: OpenCL workloads built from OperatorGraph * ClCompositeOperator: Runtime async operator to execute a ClWorkload * DependencyGraph (will likely be deprecated in later iterations) * Add example * cl_fused_conv2d_elementwise_add.cpp to explain how to use the new interfaces * Add internal translation layer * Refactor ClKernelBuildingAPI * Remove non-tile based gemm native kernel component * Minor interface changes * Add integration tests Resolves COMPMID-5161 Signed-off-by: SiCong Li Change-Id: Ib987ed79289ab0bcbd3130d54f5793408d9f1240 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7510 Reviewed-by: Gian Marco Iodice Reviewed-by: Gunes Bayir Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins --- Android.bp | 8 +- arm_compute/core/TensorInfo.h | 18 +- arm_compute/core/Types.h | 17 +- arm_compute/core/Window.h | 21 +- arm_compute/core/Window.inl | 7 +- arm_compute/core/experimental/ClWorkload.h | 220 ++++++++ arm_compute/core/experimental/DependencyGraph.h | 278 +++++++++++ arm_compute/core/experimental/IWorkload.h | 133 +++++ arm_compute/core/experimental/OperatorGraph.h | 211 ++++++++ arm_compute/core/experimental/Types.h | 28 +- arm_compute/runtime/CL/CLScheduler.h | 7 +- arm_compute/runtime/CL/CLTuner.h | 2 +- arm_compute/runtime/CL/ICLTuner.h | 3 +- .../runtime/experimental/ClCompositeOperator.h | 191 +++++++ docs/DoxygenLayout.xml | 2 +- examples/SConscript | 11 +- .../cl_fused_conv2d_elementwise_add.cpp | 386 ++++++++++++++ .../cl_ref_conv2d_elementwise_add.cpp | 223 +++++++++ filelist.json | 11 +- src/core/CL/ICLKernel.h | 2 +- .../dynamic_fusion/ClKernelBuildingAPI.cpp | 79 ++- .../dynamic_fusion/ClKernelBuildingAPI.h | 201 +------- .../dynamic_fusion/ClKernelBuildingImpl/Common.h | 366 +++++++++----- .../dynamic_fusion/ClKernelBuildingImpl/Utils.h | 8 +- .../ClDirectConvolutionKernelComponent.cpp | 202 ++++---- .../ClDirectConvolutionKernelComponent.h | 23 +- .../components/ClElementwiseAddKernelComponent.cpp | 153 +++--- .../components/ClElementwiseAddKernelComponent.h | 13 +- .../components/ClGemmNativeKernelComponent.cpp | 555 --------------------- .../components/ClGemmNativeKernelComponent.h | 83 --- .../components/ClKernelComponents.h | 9 +- .../components/ClStoreKernelComponents.cpp | 81 +-- .../components/ClStoreKernelComponents.h | 20 +- .../experimental/dynamic_fusion/OperatorGraph.cpp | 236 +++++++++ .../WorkloadImpl/ClFusedKernelGraph.cpp | 233 +++++++++ .../WorkloadImpl/ClFusedKernelGraph.h | 453 +++++++++++++++++ .../WorkloadImpl/ClKernelDescriptors.h | 112 +++++ .../dynamic_fusion/WorkloadImpl/ClKernelGraph.cpp | 219 ++++++++ .../dynamic_fusion/WorkloadImpl/ClKernelGraph.h | 240 +++++++++ .../dynamic_fusion/WorkloadImpl/ClWorkload.cpp | 73 +++ .../WorkloadImpl/DependencyGraph.cpp | 431 ++++++++++++++++ .../dynamic_fusion/WorkloadImpl/ITensorDescPack.h | 242 +++++++++ .../WorkloadImpl/OperatorGraphImpl.cpp | 387 ++++++++++++++ .../WorkloadImpl/OperatorGraphImpl.h | 229 +++++++++ .../dynamic_fusion/ClCompositeKernel.cpp | 64 ++- .../dynamic_fusion/ClCompositeKernel.h | 48 +- .../dynamic_fusion/ClCompositeOperator.cpp | 242 +++++++++ src/runtime/CL/CLScheduler.cpp | 4 +- src/runtime/CL/CLTuner.cpp | 6 +- 
support/DeepCopy.h | 203 ++++++++ tests/SConscript | 14 + .../CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp | 515 ++----------------- .../CL/UNIT/dynamic_fusion/DependencyGraph.cpp | 267 ++++++++++ .../Integration_OperatorFuseMovenetSubGraph1.cpp | 403 +++++++++++++++ tests/validation/CL/UNIT/dynamic_fusion/Utils.h | 71 +++ 55 files changed, 6509 insertions(+), 1755 deletions(-) create mode 100644 arm_compute/core/experimental/ClWorkload.h create mode 100644 arm_compute/core/experimental/DependencyGraph.h create mode 100644 arm_compute/core/experimental/IWorkload.h create mode 100644 arm_compute/core/experimental/OperatorGraph.h create mode 100644 arm_compute/runtime/experimental/ClCompositeOperator.h create mode 100644 examples/dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp create mode 100644 examples/dynamic_fusion/cl_ref_conv2d_elementwise_add.cpp delete mode 100644 src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp delete mode 100644 src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h create mode 100644 src/core/experimental/dynamic_fusion/OperatorGraph.cpp create mode 100644 src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.cpp create mode 100644 src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.h create mode 100644 src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelDescriptors.h create mode 100644 src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.cpp create mode 100644 src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h create mode 100644 src/core/experimental/dynamic_fusion/WorkloadImpl/ClWorkload.cpp create mode 100644 src/core/experimental/dynamic_fusion/WorkloadImpl/DependencyGraph.cpp create mode 100644 src/core/experimental/dynamic_fusion/WorkloadImpl/ITensorDescPack.h create mode 100644 src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.cpp create mode 100644 src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.h create mode 100644 src/gpu/cl/operators/experimental/dynamic_fusion/ClCompositeOperator.cpp create mode 100644 support/DeepCopy.h create mode 100644 tests/validation/CL/UNIT/dynamic_fusion/DependencyGraph.cpp create mode 100644 tests/validation/CL/UNIT/dynamic_fusion/Integration_OperatorFuseMovenetSubGraph1.cpp create mode 100644 tests/validation/CL/UNIT/dynamic_fusion/Utils.h diff --git a/Android.bp b/Android.bp index c072c0e371..d1efc0a632 100644 --- a/Android.bp +++ b/Android.bp @@ -371,8 +371,13 @@ cc_library_static { "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp", "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp", "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp", - "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp", "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp", + "src/core/experimental/dynamic_fusion/OperatorGraph.cpp", + "src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.cpp", + "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.cpp", + "src/core/experimental/dynamic_fusion/WorkloadImpl/ClWorkload.cpp", + "src/core/experimental/dynamic_fusion/WorkloadImpl/DependencyGraph.cpp", + "src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.cpp", "src/core/helpers/SoftmaxHelpers.cpp", 
"src/core/helpers/WindowHelpers.cpp", "src/core/utils/AssemblyUtils.cpp", @@ -674,6 +679,7 @@ cc_library_static { "src/gpu/cl/operators/ClSub.cpp", "src/gpu/cl/operators/ClTranspose.cpp", "src/gpu/cl/operators/ClWinogradConv2d.cpp", + "src/gpu/cl/operators/experimental/dynamic_fusion/ClCompositeOperator.cpp", "src/runtime/Allocator.cpp", "src/runtime/BlobLifetimeManager.cpp", "src/runtime/BlobMemoryPool.cpp", diff --git a/arm_compute/core/TensorInfo.h b/arm_compute/core/TensorInfo.h index 9bc86806fb..40f9ed9806 100644 --- a/arm_compute/core/TensorInfo.h +++ b/arm_compute/core/TensorInfo.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2021 Arm Limited. + * Copyright (c) 2016-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -297,6 +297,7 @@ public: _are_values_constant = are_values_constant; return *this; } + inline friend bool operator==(const TensorInfo &lhs, const TensorInfo &rhs); private: /** Calculates strides, offset and total size resulting from the specified padding around the XY plane. @@ -320,5 +321,20 @@ private: DataLayout _data_layout; bool _are_values_constant; }; + +/** Check whether two tensor info are equal. + * + * @param[in] lhs LHS tensor info. + * @param[in] rhs RHS tensor info. + * + * @return True if the given tensor infos are the same. + */ +inline bool operator==(const TensorInfo &lhs, const TensorInfo &rhs) +{ + return (lhs._total_size == rhs._total_size) && (lhs._offset_first_element_in_bytes == rhs._offset_first_element_in_bytes) && (lhs._strides_in_bytes == rhs._strides_in_bytes) + && (lhs._num_channels == rhs._num_channels) && (lhs._tensor_shape == rhs._tensor_shape) && (lhs._dims_state == rhs._dims_state) && (lhs._data_type == rhs._data_type) && (lhs._format == rhs._format) + && (lhs._is_resizable == rhs._is_resizable) && (lhs._valid_region == rhs._valid_region) && (lhs._padding == rhs._padding) && (lhs._quantization_info == rhs._quantization_info) + && (lhs._data_layout == rhs._data_layout) && (lhs._are_values_constant == rhs._are_values_constant); +} } // namespace arm_compute #endif /*ARM_COMPUTE_TENSORINFO_H */ diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h index 1548816e91..7ae6a7e67e 100644 --- a/arm_compute/core/Types.h +++ b/arm_compute/core/Types.h @@ -253,9 +253,22 @@ struct ValidRegion return *this; } + /** Check whether two valid regions are equal. + * + * @param[in] lhs LHS valid region + * @param[in] rhs RHS valid region + * + * @return True if the valid regions are the same. + */ + inline friend bool operator==(const ValidRegion &lhs, const ValidRegion &rhs); + Coordinates anchor; /**< Anchor for the start of the valid region. */ TensorShape shape; /**< Shape of the valid region. 
*/ }; +inline bool operator==(const ValidRegion &lhs, const ValidRegion &rhs) +{ + return (lhs.anchor == rhs.anchor) && (lhs.shape == rhs.shape); +} /** Methods available to handle borders */ enum class BorderMode @@ -346,7 +359,7 @@ struct BorderSize * * @return true if they are equal */ - bool operator==(const BorderSize &rhs) + bool operator==(const BorderSize &rhs) const { return (top == rhs.top) && (right == rhs.right) && (bottom == rhs.bottom) && (left == rhs.left); } @@ -357,7 +370,7 @@ struct BorderSize * * @return true if they are different */ - bool operator!=(const BorderSize &rhs) + bool operator!=(const BorderSize &rhs) const { return !(*this == rhs); } diff --git a/arm_compute/core/Window.h b/arm_compute/core/Window.h index f603e6c148..c566cffa88 100644 --- a/arm_compute/core/Window.h +++ b/arm_compute/core/Window.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 Arm Limited. + * Copyright (c) 2016-2020, 2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -123,6 +123,17 @@ public: { _end = end; } + /** Check whether two Dimensions are equal. + * + * @param[in] lhs LHS Dimensions + * @param[in] rhs RHS Dimensions + * + * @return True if the Dimensions are the same. + */ + friend bool operator==(const Dimension &lhs, const Dimension &rhs) + { + return (lhs._start == rhs._start) && (lhs._end == rhs._end) && (lhs._step == rhs._step); + } private: int _start; /**< Start of the dimension */ @@ -414,6 +425,14 @@ public: * @param[in] rhs Second window to swap. */ friend void swap(Window &lhs, Window &rhs); + /** Check whether two Windows are equal. + * + * @param[in] lhs LHS window + * @param[in] rhs RHS window + * + * @return True if the given windows are the same. + */ + friend bool operator==(const Window &lhs, const Window &rhs); private: /** First slice of the window diff --git a/arm_compute/core/Window.inl b/arm_compute/core/Window.inl index 6100d09a1c..5ee4b57145 100644 --- a/arm_compute/core/Window.inl +++ b/arm_compute/core/Window.inl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 Arm Limited. + * Copyright (c) 2016-2020, 2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -305,4 +305,9 @@ inline void swap(Window &lhs, Window &rhs) { lhs._dims.swap(rhs._dims); } + +inline bool operator==(const Window &lhs, const Window &rhs) +{ + return (lhs._dims == rhs._dims) && (lhs._is_broadcasted == rhs._is_broadcasted); +} } // namespace arm_compute diff --git a/arm_compute/core/experimental/ClWorkload.h b/arm_compute/core/experimental/ClWorkload.h new file mode 100644 index 0000000000..bcac08b9f7 --- /dev/null +++ b/arm_compute/core/experimental/ClWorkload.h @@ -0,0 +1,220 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ +#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLWORKLOAD_H +#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLWORKLOAD_H + +#include "arm_compute/core/CL/CLCompileContext.h" +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/Window.h" + +#include "arm_compute/core/experimental/IWorkload.h" +#include "arm_compute/core/experimental/OperatorGraph.h" + +#include + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +/** Verbose and explicit way to enumerate all the tensor arguments variants used by + * all kernel implementations. This avoids any ambiguity in what kernel arguments are passed + */ +enum class ClKernelTensorArgType : int +{ + Scalar, + + Vector, + + Image, + Image_Reinterpret_As_3D, + Image_Export_To_ClImage2D, + + Image_3D, // 3D Tensor represented as a 2D Image + stride_z + Image_3D_Export_To_ClImage2D, + + Tensor_3D, + Tensor_4D, + Tensor_4D_t_Buffer, + Tensor_4D_t_Image +}; + +/** Describes all the info required to add a kernel argument at run time + * + * @note This struct can later be expanded into a more concise and formal way to specify how to set up + * arguments for a kernel inside a @ref ClUnitWorkload + */ +struct ClKernelArgDescriptor +{ + ClKernelArgDescriptor() = default; + ClKernelArgDescriptor(int arg_id, ClKernelTensorArgType type, bool slide_along_dimz = true) + : arg_id{ arg_id }, tensor_arg_type{ type }, slide_along_dimz{ slide_along_dimz } + { + } + ~ClKernelArgDescriptor() = default; + friend bool operator==(const ClKernelArgDescriptor &arg0, const ClKernelArgDescriptor &arg1) + { + return (arg0.tensor_arg_type == arg1.tensor_arg_type) && (arg0.slide_along_dimz == arg1.slide_along_dimz); + } + int arg_id{ -1 }; /**< Arg ID in the blueprint, -1 means empty / uninitialized */ + ClKernelTensorArgType tensor_arg_type{ ClKernelTensorArgType::Image }; /**< tensor argument type */ + bool slide_along_dimz{ true }; /**< @note slide_along_dimz will be moved out of this descriptor in later iterations */ +}; + +using ClKernelArgList = std::map; + +/** Descriptor containing information required to run a single ClWorkload + */ +struct ClExecutionDescriptor +{ + cl::NDRange suggested_lws{}; /**< Suggested local work-group size for optimal performance if not zero */ + cl::NDRange gws{}; /**< Global work-group to be used */ + bool skip_sliding_window{ false }; /**< Skip sliding window slices during execution loop */ +}; + +/** Contains kernel code to be compiled and run in a ClUnitWorkload + */ +struct ClKernelCode +{ + friend bool operator==(const ClKernelCode &code0, const ClKernelCode &code1) + { + return (code0.name == code1.name) && (code0.code == code1.code) && (code0.config_id == code1.config_id) && (code0.build_options == code1.build_options) && (code0.window == code1.window) + && (code0.arguments == code1.arguments); + } + std::string name{}; /**< Kernel name */ + std::string code{}; /**< Kernel source code */ + std::string config_id{}; /**< Generated from blueprint based on complex component */ + CLBuildOptions build_options{}; /**< Kernel build options */ + 
Window window{}; /**< Execution window */ + ClKernelArgList arguments{}; /**< Kernel argument descriptors. map key is kernel ArgumentID */ +}; + +/** A descriptor of ClWorkload Tensors. + */ +struct ClWorkloadTensor : public WorkloadTensor +{ + ClWorkloadTensor() = default; + ClWorkloadTensor(Id id, ITensorInfo *info, MemoryType memory_type, const AuxMemoryInfo &memory_info, const ClKernelArgDescriptor &kernel_arg) + : WorkloadTensor{ id, info, memory_type, memory_info }, kernel_arg{ kernel_arg } + { + } + ClKernelArgDescriptor kernel_arg{}; + friend bool operator==(const ClWorkloadTensor &t0, const ClWorkloadTensor &t1) + { + return t0.info == t1.info && t0.memory_info == t1.memory_info && t0.memory_type == t1.memory_type && t0.kernel_arg == t1.kernel_arg; + } +}; + +/** The basic atomic unit in a @ref ClWorkload. It contains exactly one kernel to run. + */ +struct ClUnitWorkload : public UnitWorkload +{ + ClUnitWorkload() = default; + ClUnitWorkload(Id id, UnitWorkloadStage stage, const ClKernelCode &code) + : UnitWorkload{ id, stage }, code{ code } + { + } + friend bool operator==(const ClUnitWorkload &uworkload0, const ClUnitWorkload &uworkload1) + { + return uworkload0.stage == uworkload1.stage && uworkload0.code == uworkload1.code; + } + ClKernelCode code{}; +}; + +/** GPU information for @ref ClWorkloadContext + */ +struct GpuInfo +{ + friend bool operator==(const GpuInfo &info0, const GpuInfo &info1) + { + return info0.target == info1.target; + } + GPUTarget target{ GPUTarget::UNKNOWN }; +}; + +/** Context (device capabilities, platform details) associated with a ClWorkload + * + * It is required for building the @ref ClKernelCode and could also be used by the runtime (e.g. schedulers) + */ +struct ClWorkloadContext +{ + friend bool operator==(const ClWorkloadContext &ctx0, const ClWorkloadContext &ctx1) + { + return ctx0.gpu_info == ctx1.gpu_info; + } + GpuInfo gpu_info{}; +}; + +/** Workload for Cl backend + */ +struct ClWorkload : public IWorkload +{ + Tid add_workload_tensor(ITensorInfo *info, MemoryType memory_type, const AuxMemoryInfo &memory_info, const ClKernelArgDescriptor &kernel_arg, Tid merge_point) + { + Tid id = graph.add_tensor(merge_point); + if(tensors.find(id) == tensors.end()) + { + tensors[id] = ClWorkloadTensor(id, info, memory_type, memory_info, kernel_arg); + } + return id; + } + UnitWorkId add_unit_workload(UnitWorkloadStage stage, const ClKernelCode &code, const std::vector &inputs, const std::vector &outputs) + { + auto op = graph.add_operator(inputs, outputs); + auto id = op.second; + unit_workloads[id] = ClUnitWorkload(id, stage, code); + return id; + } + friend bool operator==(const ClWorkload &workload0, const ClWorkload &workload1) + { + return std::make_tuple( + workload0.graph, workload0.context, workload0.unit_workloads, workload0.tensors, workload0.op_tensor_id_lut) + == std::make_tuple( + workload1.graph, workload1.context, workload1.unit_workloads, workload1.tensors, workload1.op_tensor_id_lut); + } + ClWorkloadContext context{}; /**< Workload context*/ + std::map unit_workloads{}; /**< Unit workloads to run*/ + std::map tensors{}; /**< Workload tensors*/ + std::map op_tensor_id_lut{}; /**< Map from ClWorkloadTensor to SRC and DST Operator Tensors (no need to store "intermediate" Operator Tensors)*/ + Status status{}; /**< For compatibility with the IOperator validate method. Store if the workload is valid or not. */ +}; + +/** Build a @ref ClWorkload from an @ref OperatorGraph. 
+ * + * @param[out] workload + * @param[in] op_graph + * @param[in] ctx + * @return Status + */ +Status build(ClWorkload &workload, const OperatorGraph &op_graph, const ClWorkloadContext &ctx); + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute + +#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLWORKLOAD_H \ No newline at end of file diff --git a/arm_compute/core/experimental/DependencyGraph.h b/arm_compute/core/experimental/DependencyGraph.h new file mode 100644 index 0000000000..794bf0e344 --- /dev/null +++ b/arm_compute/core/experimental/DependencyGraph.h @@ -0,0 +1,278 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ +#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_DEPENDENCYGRAPH_H +#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_DEPENDENCYGRAPH_H + +#include "arm_compute/core/Error.h" + +#include +#include +#include + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +template +bool is_in(const T &v, const std::vector &vec) +{ + return std::find(std::begin(vec), std::end(vec), v) != std::end(vec); +} + +/** The dependency graph of a workload, where the nodes are of 2 types: Tensor or Operator + * Represented as a doubly-linked adjacency list with the differentiation between source and destination + * + * A "Merge Tensor" is an external tensor associated with the tensor within the graph, and serve as a merge point + */ +class DependencyGraph +{ +public: + /** A serial Id allocator + * + */ + class SerialIdAllocator + { + public: + using Id = int; + Id alloc() + { + return _counter++; + } + constexpr static Id empty() + { + return -1; + } + + private: + Id _counter{ 0 }; + }; + using Id = SerialIdAllocator::Id; + /** Adjacency list + * + */ + using AdjList = std::map>; + + /** A pack of operator including its input and output tensors, used by traversing through the graph in topological order + * + */ + struct OpPack + { + Id op{}; + std::vector inputs{}; + std::vector outputs{}; + friend bool operator==(const OpPack &opp0, const OpPack &opp1) + { + return std::make_tuple( + opp0.op, opp0.inputs, opp0.outputs) + == std::make_tuple( + opp1.op, opp1.inputs, opp1.outputs); + } + }; + +public: + constexpr static Id empty_id() + { + return SerialIdAllocator::empty(); + } + + DependencyGraph() = default; + // Used in cases where two DependencyGraphs may want to share the same configuration of tensors + explicit DependencyGraph(const std::vector &imported_tensors); + // Testing only + DependencyGraph(const AdjList &adj_src_tensors, const AdjList &adj_dst_tensors, const AdjList &adj_src_ops, const AdjList &adj_dst_ops, std::map merge_points = {}); + + /** Add a new tensor + * + * @param merge_tensor The external merge point associated with the tensor. Leave empty if not needed. + * @return Id The newly allocated tensor, or a previously added tensor associated with @p merge_tensor + */ + Id add_tensor(Id merge_tensor = empty_id()); + + void remove_tensor(Id tensor); + + /** Add a new operator + * + * @param inputs Input tensors to the operator + * @param outputs Output tensors to the operator + * @return std::pair where id is the newly allocated operator + */ + std::pair add_operator(const std::vector &inputs, const std::vector &outputs); + + void remove_operator(Id op); + /** Sort the graph in a topological order + * + * @return std::pair> + */ + std::pair> topological_sort() const; + + std::vector src_ops(Id op) const; + std::vector dst_ops(Id op) const; + + std::vector src_ops_from_tensor(Id tensor) const; + std::vector dst_ops_from_tensor(Id tensor) const; + /** Get the merge points object + * + * @return std::map + */ + std::map get_merge_points() const; + /** Get all root ops. 
Root ops can also be referred to as "src ops" of the whole graph + * + * @return std::vector + */ + std::vector get_root_ops() const; + /** Get all dst ops of the whole graph + * + * @return std::vector + */ + std::vector get_dst_ops() const; + + /** Get source tensors to an operator + * + * @param op + * @return std::vector + */ + std::vector src_tensors(Id op) const; + /** Get destination tensors to an operator + * + * @param op + * @return std::vector + */ + std::vector dst_tensors(Id op) const; + /** Get source tensors of the whole graph + * + * @return std::vector + */ + std::vector src_tensors() const; + /** Get destination tensors of the whole graph + * + * @return std::vector + */ + std::vector dst_tensors() const; + /** Get all operators + * + * @return std::vector + */ + std::vector all_ops() const; + /** Get all tensors + * + * @return std::vector + */ + std::vector all_tensors() const; + /** Number of operators + * + * @return unsigned int + */ + unsigned int number_of_ops() const; + /** Number of tensors + * + * @return unsigned int + */ + unsigned int number_of_tensors() const; + + /** Update @p merge_point to point to @p t_id + * + * @param t_id + * @param merge_point + */ + Status update_merge_point(Id t_id, Id merge_point); + + /** Strict equality comparison (all internal ids and order of insertion matter). + * In the future this may be replaced with a topological comparison, allowing equivalent graphs with different internal ids to be equal + * + * + * @param g0 + * @param g1 + * @return true + * @return false + */ + friend bool operator==(const DependencyGraph &g0, const DependencyGraph &g1) + { + // Do not compare id allocators + return std::make_tuple( + g0._adj_src_tensors, g0._adj_dst_tensors, g0._adj_src_ops, g0._adj_dst_ops, g0._merge_to_internal) + == std::make_tuple( + g1._adj_src_tensors, g1._adj_dst_tensors, g1._adj_src_ops, g1._adj_dst_ops, g1._merge_to_internal); + } + void link_input(Id op, Id in_tensor); + void link_output(Id op, Id out_tensor); + /** Check if there's a path from @p src_tensor to @p dst_op + * + * @param src_tensor + * @param dst_op + * @return true + * @return false + */ + bool path_exists_from_tensor_to_op(Id src_tensor, Id dst_op) const; + /** Check if there's a path from @p src_op to @p dst_op + * + * @param src_op + * @param dst_op + * @return true + * @return false + */ + bool path_exists_from_op_to_op(Id src_op, Id dst_op) const; + /** Check if tensor is the src tensor of the entire graph + * + * @param tensor + * @return true + * @return false + */ + bool is_src_tensor(Id tensor) const; + /** Check if tensor is the dst tensor of the entire graph + * + * @param tensor + * @return true + * @return false + */ + bool is_dst_tensor(Id tensor) const; + +private: + Id insert_new_tensor(); + Id insert_new_op(); + bool tensor_exists(Id tensor) const; + bool operator_exists(Id op) const; + bool is_src_tensor_of(Id op, Id tensor) const; + bool is_dst_tensor_of(Id op, Id tensor) const; + bool are_connected(Id op, Id tensor) const; + +private: + AdjList _adj_src_tensors{}; + AdjList _adj_dst_tensors{}; + AdjList _adj_src_ops{}; + AdjList _adj_dst_ops{}; + std::map _merge_to_internal{}; // From merge tensor to internal tensor + SerialIdAllocator _operator_id{}; + SerialIdAllocator _tensor_id{}; +}; + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute + +#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_DEPENDENCYGRAPH_H \ No newline at end of file diff --git a/arm_compute/core/experimental/IWorkload.h 
b/arm_compute/core/experimental/IWorkload.h new file mode 100644 index 0000000000..942dbb70bb --- /dev/null +++ b/arm_compute/core/experimental/IWorkload.h @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ +#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IWORKLOAD_H +#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IWORKLOAD_H + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/ITensorInfo.h" +#include "arm_compute/core/experimental/Types.h" + +#include "arm_compute/core/experimental/DependencyGraph.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +/** Describes when a Unit Workload is run. + * + */ +struct UnitWorkloadStage +{ + enum class Stage + { + Prepare, /**< Only run once at the beginning. */ + Run, /**< Run every time after the first time. */ + }; + Stage stage; + friend bool operator==(const UnitWorkloadStage &stage0, const UnitWorkloadStage &stage1) + { + return stage0.stage == stage1.stage; + } +}; +/** Type of memory used by a Workload Tensor + * + */ +enum class MemoryType +{ + Core = 0, /**< Core memory used by the Workload Tensor, e.g. for argument tensors */ + Auxiliary = 1, /**< Auxiliary memory required by the Workload Tensor, e.g. for temporary tensors */ +}; + +using AuxMemoryLifetime = MemoryLifetime; + +/** Memory Info for a @ref WorkloadTensor of Auxiliary memory type. 
This communicates to the user how much additional + * memory is required for auxiliary tensors + */ +struct AuxMemoryInfo +{ + AuxMemoryInfo() = default; + + AuxMemoryInfo(size_t size, size_t alignment = 0) noexcept + : size(size), + alignment(alignment) + { + } + + AuxMemoryInfo(AuxMemoryLifetime lifetime, size_t size, size_t alignment = 0) noexcept + : lifetime(lifetime), + size(size), + alignment(alignment) + { + } + friend bool operator==(const AuxMemoryInfo &info0, const AuxMemoryInfo &info1) + { + return info0.lifetime == info1.lifetime && info0.size == info1.size && info0.alignment == info1.alignment; + } + + AuxMemoryLifetime lifetime{ AuxMemoryLifetime::Temporary }; /**< Memory lifetime*/ + size_t size{ 0 }; /**< Total memory size in bytes */ + size_t alignment{ 64 }; /**< Memory alignment in bytes */ +}; + +/** A descriptor for IWorkload Tensors. + */ +struct WorkloadTensor +{ + using Id = DependencyGraph::Id; + Id id{}; /**< Id of the workload tensor */ + ITensorInfo *info{}; /**< TensorInfo associated with the workload tensor */ + MemoryType memory_type{}; /**< Memory type */ + AuxMemoryInfo memory_info{}; /**< Auxiliary memory information. This can be ignored if the memory type is Core */ +}; +/** The basic atomic unit in an @ref IWorkload. It contains exactly one kernel to run. + * + */ +struct UnitWorkload +{ + using Id = DependencyGraph::Id; + Id id{}; /**< Id of the unit workload */ + UnitWorkloadStage stage{}; /**< Stage */ +}; + +/** Run-time-agnostic, platform-specific graph that describes everything required to run a workload + * It can be configured into an Arm Compute Library runtime, integrated into the runtime of another framework, or integrated into the compilation flow + */ +struct IWorkload +{ + using UnitWorkId = UnitWorkload::Id; + using Tid = WorkloadTensor::Id; + IWorkload() = default; + virtual ~IWorkload() = default; + DependencyGraph graph{}; /**< Dependency graph of the workload tensors and the unit workloads */ +}; + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IWORKLOAD_H \ No newline at end of file diff --git a/arm_compute/core/experimental/OperatorGraph.h b/arm_compute/core/experimental/OperatorGraph.h new file mode 100644 index 0000000000..621a719fe6 --- /dev/null +++ b/arm_compute/core/experimental/OperatorGraph.h @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ + +#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_OPERATORGRAPH +#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_OPERATORGRAPH + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/ITensorInfo.h" + +#include + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +/** Graph of operators to execute within a Workload. This is a pure descriptive construct. + */ +class OperatorGraph final +{ +public: + struct Implementation; + OperatorGraph(); + ~OperatorGraph(); + +public: + Implementation *impl(); + const Implementation *impl() const; + +private: + std::unique_ptr _impl; +}; + +/** Return the validity of @p op_graph, usually after performing an operation (e.g. add_tensor) on it + * + * @param[in,out] op_graph OperatorGraph to be validated + * + * @return Status + */ +Status validate(const OperatorGraph &op_graph); + +/** Operator Tensor Handle + * This can be either an argument tensor, or an intermediate tensor linking 2 @ref Operator s + */ +class OpTensor final +{ +public: + using Id = int; + OpTensor(Id id = {}); + /** Id of the OpTensor + * @return Id + */ + Id id() const; + +private: + Id _id{}; +}; + +/** Provide order of @ref OpTensor by checking if @p t0 is "lower than" @p t1 + * + * @param[in] t0 OpTensor + * @param[in] t1 OpTensor + * + * @return true if @p t0 is lower than @p t1 + * @return false otherwise + */ +bool operator<(const OpTensor &t0, const OpTensor &t1); + +/** Associate a TensorInfo with a newly created @ref OpTensor in the @p graph. + * + * @note @p info needs to remain in scope and valid until the workload has finished building + * @note Can pass in an empty TensorInfo for a destination Tensor, in which case @p info will be inferred from the source tensors + * + * @param[in,out] graph OperatorGraph where the tensor is added + * @param[in] info TensorInfo to be associated + * + * @return OpTensor + */ +OpTensor add_tensor(OperatorGraph &graph, ITensorInfo &info); + +/** Operator Handle + * This can be used to further modify an existing operator + */ +class Operator final +{ +public: + using Id = int; + Operator(Id id = {}); + /** Id of the Operator + * @return Id + */ + Id id() const; + +private: + Id _id{}; +}; + +/** Provide order of @ref Operator by checking if @p op0 is "lower than" @p op1 + * + * @param[in] op0 Operator + * @param[in] op1 Operator + * + * @return true if @p op0 is lower than @p op1 + * @return false otherwise + */ +bool operator<(const Operator &op0, const Operator &op1); + +/** Padding information for 2D operations like Conv2dDescriptor + */ +struct Padding2D +{ + Padding2D() = default; + Padding2D(size_t left, size_t right, size_t top, size_t bottom) + : left(left), right(right), top(top), bottom(bottom) + { + } + size_t left = { 0 }; /**< Padding across the width dimension on the left, in elements. */ + size_t right = { 0 }; /**< Padding across the width dimension on the right, in elements. */ + size_t top = { 0 }; /**< Padding across the height dimension on the top, in elements. 
*/ + size_t bottom = { 0 }; /**< Padding across the height dimension on the bottom, in elements. */ +}; + +/** Descriptor for Conv2dDescriptor operation + */ +struct Conv2dDescriptor +{ + /* TOSA compliant attribute parameters start */ + Padding2D pad{}; + Size2D stride{ 1U, 1U }; + Size2D dilation{ 1U, 1U }; + /* TOSA compliant attribute parameters end */ + /* Non-TOSA compliant attribute parameters start */ + /* Non-TOSA compliant attribute parameters end */ +}; +/** Add op Conv2d to @p graph + * + * @param[in,out] graph OperatorGraph where the operator is added to + * @param[in] desc Operator descriptor + * @param[in] input Input OpTensor + * @param[in] weights Weights OpTensor + * @param[in] bias (Optional) bias OpTensor + * @param[in] dst Destination OpTensor + * + * @return Operator + */ +Operator add_op_conv2d(OperatorGraph &graph, const Conv2dDescriptor &desc, OpTensor input, OpTensor weights, OpTensor bias, OpTensor dst); +Operator add_op_conv2d(OperatorGraph &graph, const Conv2dDescriptor &desc, OpTensor input, OpTensor weights, OpTensor dst); +/** (Only for Debuging and Testing) Force a conv2d method + * + * @param[in,out] graph OperatorGraph where conv2d op is located + * @param[in] conv2d Conv2d Op + * @param[in] method Forced ConvolutionMethod + */ +void force_conv2d_method(OperatorGraph &graph, Operator conv2d, ConvolutionMethod method); + +/** Descriptor for Addition operation + * + */ +struct AddDescriptor +{ + /* TOSA compliant attribute parameters start */ + /* TOSA compliant attribute parameters end */ + /* Non-TOSA compliant attribute parameters start */ + /* Non-TOSA compliant attribute parameters end */ +}; +/** Add op Add to @p graph, and optionally describes fusion through passing of intermediate @ref OpTensor s + * + * @param[in,out] graph OperatorGraph where the operator is added to + * @param[in] desc Operator descriptor + * @param[in] lhs Lhs OpTensor + * @param[in] rhs Rhs OpTensor + * @param[in] dst Destination OpTensor + * + * @return Operator + */ +Operator add_op_elementwise_add(OperatorGraph &graph, const AddDescriptor &desc, OpTensor lhs, OpTensor rhs, OpTensor dst); + +bool operator==(const OpTensor &t0, const OpTensor &t1); +bool operator==(const Padding2D &pad0, const Padding2D &pad1); +bool operator==(const Conv2dDescriptor &conv2d0, const Conv2dDescriptor &conv2d1); +bool operator==(const AddDescriptor &, const AddDescriptor &); + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_OPERATORGRAPH \ No newline at end of file diff --git a/arm_compute/core/experimental/Types.h b/arm_compute/core/experimental/Types.h index c8755dc26c..1995ab045e 100644 --- a/arm_compute/core/experimental/Types.h +++ b/arm_compute/core/experimental/Types.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021 Arm Limited. + * Copyright (c) 2020-2022 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -41,20 +41,22 @@ enum TensorType : int32_t ACL_SRC_DST = 0, // Src - ACL_SRC = 0, - ACL_SRC_0 = 0, - ACL_SRC_1 = 1, - ACL_SRC_2 = 2, - ACL_SRC_3 = 3, - ACL_SRC_4 = 4, - ACL_SRC_5 = 5, - ACL_SRC_6 = 6, + ACL_SRC = 0, + ACL_SRC_0 = 0, + ACL_SRC_1 = 1, + ACL_SRC_2 = 2, + ACL_SRC_3 = 3, + ACL_SRC_4 = 4, + ACL_SRC_5 = 5, + ACL_SRC_6 = 6, + ACL_SRC_END = 6, // Dst - ACL_DST = 30, - ACL_DST_0 = 30, - ACL_DST_1 = 31, - ACL_DST_2 = 32, + ACL_DST = 30, + ACL_DST_0 = 30, + ACL_DST_1 = 31, + ACL_DST_2 = 32, + ACL_DST_END = 32, // Aux ACL_INT = 50, diff --git a/arm_compute/runtime/CL/CLScheduler.h b/arm_compute/runtime/CL/CLScheduler.h index 5bfaaf4b5d..3919635d1b 100644 --- a/arm_compute/runtime/CL/CLScheduler.h +++ b/arm_compute/runtime/CL/CLScheduler.h @@ -42,7 +42,6 @@ namespace experimental { namespace dynamic_fusion { -struct TensorBinding; struct ClExecutionDescriptor; } // namespace dynamic_fusion } // namespace experimental @@ -113,15 +112,13 @@ public: #if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) /** Schedule the execution of the passed kernel if possible. - * Use TensorBinding instead of ITensorPack for working with dynamic fusion - * @note Does not support dynamic tuning yet * * @param[in] kernel Kernel to execute. * @param[in] tensors Map containing the tensors to operate on. * @param[in] exec_desc Execution descriptor * @param[in] flush (Optional) Specifies if the command queue will be flushed after running the kernel. This will be ignored if job chaining is enabled. */ - void enqueue_op(ICLKernel &kernel, experimental::dynamic_fusion::TensorBinding &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc, bool flush = true); + void enqueue_op(ICLKernel &kernel, ITensorPack &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc, bool flush = true); #endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) @@ -218,7 +215,7 @@ private: void flush_queue(bool flush); #if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) - void enqueue_common(ICLKernel &kernel, experimental::dynamic_fusion::TensorBinding &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc, bool flush); + void enqueue_common(ICLKernel &kernel, ITensorPack &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc, bool flush); #endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) /** Flag to ensure symbols initialisation is happening before Scheduler creation */ diff --git a/arm_compute/runtime/CL/CLTuner.h b/arm_compute/runtime/CL/CLTuner.h index e595f8f34b..88933fc2d8 100644 --- a/arm_compute/runtime/CL/CLTuner.h +++ b/arm_compute/runtime/CL/CLTuner.h @@ -125,7 +125,7 @@ public: void tune_kernel_dynamic(ICLKernel &kernel) override; void tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors) override; #if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) - void tune_kernel_dynamic(ICLKernel &kernel, experimental::dynamic_fusion::TensorBinding &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc) override; + void tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc) override; #endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) /** Is the kernel_event set ? 
diff --git a/arm_compute/runtime/CL/ICLTuner.h b/arm_compute/runtime/CL/ICLTuner.h index a327497255..e0ee3ffe71 100644 --- a/arm_compute/runtime/CL/ICLTuner.h +++ b/arm_compute/runtime/CL/ICLTuner.h @@ -35,7 +35,6 @@ namespace experimental { namespace dynamic_fusion { -struct TensorBinding; struct ClExecutionDescriptor; } // namespace dynamic_fusion } // namespace experimental @@ -74,7 +73,7 @@ public: * @param[in, out] tensors Tensors for the kernel to use * @param[in] exec_desc Execution descriptor */ - virtual void tune_kernel_dynamic(ICLKernel &kernel, experimental::dynamic_fusion::TensorBinding &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc) = 0; + virtual void tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc) = 0; #endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) }; } // namespace arm_compute diff --git a/arm_compute/runtime/experimental/ClCompositeOperator.h b/arm_compute/runtime/experimental/ClCompositeOperator.h new file mode 100644 index 0000000000..b903bc0ee6 --- /dev/null +++ b/arm_compute/runtime/experimental/ClCompositeOperator.h @@ -0,0 +1,191 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ +#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMIC_FUSION_CLCOMPOSITEOPERATOR_H +#define ARM_COMPUTE_EXPERIMENTAL_DYNAMIC_FUSION_CLCOMPOSITEOPERATOR_H + +#include "arm_compute/core/CL/CLCompileContext.h" +#include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/IOperator.h" + +#include "arm_compute/core/experimental/ClWorkload.h" + +#include + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +/** Map OpTensor handles to their corresponding ITensor memory + */ +using OpTensorBinding = std::map; + +/** Map a kernel (as identified by its unit workload id) to its corresponding tensor pack + * + * @note External user should not use the add_tensor_pack method to alter this tensor pack map, and should only use the map returned by @ref bind_tensors + */ +class TensorPackMap +{ +public: + /** Find a tensor pack associated with the unit workload Id @p uwk_id + * + * @param[in] uwk_id unit workload Id associated with the tensor pack + * + * @return ITensorPack* + */ + ITensorPack *find_tensor_pack(UnitWorkload::Id uwk_id); + /** Get a tensor pack associated with @p uwk_id. Throws a exception if it cannot be found. + * + * @param[in] uwk_id unit workload Id associated with the tensor pack + * + * @return ITensorPack* + */ + ITensorPack &get_tensor_pack(UnitWorkload::Id uwk_id); + /** Add a tensor pack and associate it with unit workload Id @p uwk_id + * @note Should not be used by external user + * + * @param[in] uwk_id unit workload Id associated with the tensor pack + * @param[in] tensor_pack Tensor Pack to be added + */ + void add_tensor_pack(UnitWorkload::Id uwk_id, const ITensorPack &tensor_pack); + +private: + std::map _tensor_packs{}; +}; + +/** Holder of any auxiliary CLTensors required by a ClWorkload. + * + * @note The tensors are not allocated by default, and require the user to explicitly allocate them using the TensorInfo and AuxMemoryInfo + * + * @note This data holder must remain valid until the ClCompositeOperator that it's passed to is out of scope + * + * @note External user should not use the add_aux_tensor method, and should only use the data returned by @ref bind_tensors + */ +class ClAuxTensorData +{ +public: + /** A view of a single auxiliary data and the associated TensorInfo and AuxMemoryInfo + */ + struct DataView + { + DataView() = default; + DataView(CLTensor *tensor, const TensorInfo &tensor_info, const AuxMemoryInfo &memory_info) + : tensor{ tensor }, tensor_info{ tensor_info }, memory_info{ memory_info } + { + } + ~DataView() = default; + DataView(const DataView &other) = default; + DataView &operator=(const DataView &other) = default; + DataView(DataView &&other) = default; + DataView &operator=(DataView &&other) = default; + CLTensor *tensor{}; /**< Pointer to the auxiliary tensor */ + TensorInfo tensor_info{}; /**< Associated TensorInfo */ + AuxMemoryInfo memory_info{}; /**< Memory requirement */ + }; + + /** Add auxiliary tensor. + * + * @note Should not be used by external user + * + * @param[in] tensor_id Any Id that can uniquely identify an auxiliary tensor. 
Usually ClWorkloadTensor Id + * @param[in] tensor_info TensorInfo associated with the tensor + * @param[in] memory_info Memory requirements + * + * @return CLTensor* if successfully added, otherwise nullptr + */ + CLTensor *add_aux_tensor(int tensor_id, const ITensorInfo &tensor_info, const AuxMemoryInfo &memory_info); + + /** Get views of all auxiliary tensors. This is mainly used for allocating the auxiliary tensors. + * + * @return std::vector& + */ + std::vector &get_tensors(); + +private: + std::map> _owned_tensors{}; + std::vector _tensors{}; +}; + +/** Bind tensor memory to packs used by prepare and run methods. Create auxiliary tensor objects and their memory requirements if needed + * + * @note This is the only method for external user to create ClAuxTensorData, and the prepare and run TensorPackMaps + * + * @param[out] aux_tensor_data Auxiliary Tensors required by the workload + * @param[out] prepare_pack_map TensorPackMap used by the prepare method + * @param[out] run_pack_map TensorPackMap used by the run method + * @param[in] workload ClWorkload to bind the tensors to + * @param[in] op_tensors CLTensor memory objects mapped from Core OpTensors + * + * @return Status + */ +Status bind_tensors(ClAuxTensorData &aux_tensor_data, TensorPackMap &prepare_pack_map, TensorPackMap &run_pack_map, const ClWorkload &workload, const OpTensorBinding &op_tensors); + +/** Operator runtime to run a @ref ClWorkload + * + * @note User must explicitly call prepare before run otherwise run will fail. + * + */ +class ClCompositeOperator +{ +public: + ClCompositeOperator(); + ~ClCompositeOperator(); + /** Configures a @ref ClCompositeOperator with a @ref ClWorkload + * This includes the compilation of Cl kernels inside the @ref ClWorkload + * + * @param[in] ctx CLCompileContext + * @param[in] workload ClWorkload to configure with + */ + void configure(const CLCompileContext &ctx, const ClWorkload &workload); + /** Validate ClWorkload @p workload + * + * @param[in] workload ClWorkload to be validated + * + * @return Status + */ + static Status validate(const ClWorkload &workload); + /** Enqueue prepare workloads + * + * @param tensor_pack_map Tensors required by the prepare workloads + */ + void prepare(TensorPackMap &tensor_pack_map); + /** Enqueue run workloads + * + * @param tensor_pack_map Tensors required by the run workloads + */ + void run(TensorPackMap &tensor_pack_map); + +private: + struct Implementation; + std::unique_ptr _impl; +}; + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMIC_FUSION_CLCOMPOSITEOPERATOR_H \ No newline at end of file diff --git a/docs/DoxygenLayout.xml b/docs/DoxygenLayout.xml index 69bdaf5c71..2d59dbe56c 100644 --- a/docs/DoxygenLayout.xml +++ b/docs/DoxygenLayout.xml @@ -19,7 +19,7 @@ - + diff --git a/examples/SConscript b/examples/SConscript index 8ee688e76d..d456b7246c 100644 --- a/examples/SConscript +++ b/examples/SConscript @@ -1,4 +1,4 @@ -# Copyright (c) 2017 Arm Limited. +# Copyright (c) 2017-2022 Arm Limited. 
# # SPDX-License-Identifier: MIT # @@ -95,6 +95,15 @@ if env['opencl']: prog = install_bin(prog) alias = examples_env.Alias(example, prog) Default(alias) + if env['experimental_dynamic_fusion']: + examples_env.Append(CPPDEFINES = ['ARM_COMPUTE_CL', 'ENABLE_EXPERIMENTAL_DYNAMIC_FUSION']) + for file in Glob("./dynamic_fusion/*.cpp"): + example = os.path.basename(os.path.splitext(str(file))[0]) + prog = examples_env.Program(example, ["./dynamic_fusion/{}.cpp".format(example), utils], LIBS = examples_libs + arm_compute_libs) + Depends(prog, arm_compute_dependency) + prog = install_bin(prog) + alias = examples_env.Alias(example, prog) + Default(alias) if env['gemm_tuner'] and env['opencl']: gemm_tuner_common_options = examples_env.Object("./gemm_tuner/CommonGemmExampleOptions.cpp") diff --git a/examples/dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp b/examples/dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp new file mode 100644 index 0000000000..6048024d30 --- /dev/null +++ b/examples/dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp @@ -0,0 +1,386 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +/// @example dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp +/// @copybrief example_dynamic_fusion_cl_conv2d_elementwise_add +/// +/// @page example_dynamic_fusion_cl_conv2d_elementwise_add Dynamic Fusion Example: Conv2d + Elementwise Addition (OpenCL target) +/// This example demonstrates how to fuse a Conv2d with an Addition using the new OperatorGraph API, and to run it with the Async Composite Operator + +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ +#ifndef ARM_COMPUTE_CL /* Needed by Utils.cpp to handle OpenCL exceptions properly */ +#error "This example needs to be built with -DARM_COMPUTE_CL" +#endif /* ARM_COMPUTE_CL */ + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/experimental/ClWorkload.h" +#include "arm_compute/core/experimental/OperatorGraph.h" +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/CL/CLTuner.h" +#include "arm_compute/runtime/experimental/ClCompositeOperator.h" + +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "utils/TypePrinter.h" + +#include "utils/Utils.h" + +#include + +using namespace arm_compute; +using namespace utils; +using namespace arm_compute::experimental::dynamic_fusion; + +#define TICK(clock_name) \ + auto clock_name##_tick = std::chrono::high_resolution_clock::now(); +#define TOCK(clock_name, measurement_map) \ + auto clock_name##_tock = std::chrono::high_resolution_clock::now(); \ + measurement_map["\"" #clock_name "\""] = duration_cast(clock_name##_tock - clock_name##_tick); +#define TOCK_AVG(clock_name, measurement_map, num_iterations) \ + auto clock_name##_tock = std::chrono::high_resolution_clock::now(); \ + measurement_map["\"" #clock_name "\""] = duration_cast((clock_name##_tock - clock_name##_tick) / (num_iterations)); + +using std::chrono::duration_cast; +using std::chrono::microseconds; + +class ClFusedConv2dEltwiseAddExample : public Example +{ +public: + bool do_setup(int argc, char **argv) override + { + size_t ih; + size_t iw; + size_t ifm; + size_t wh; + size_t ww; + size_t ofm; + size_t tuner_choice; + unsigned int pad_x; + unsigned int pad_y; + if(argc < 10) + { + // Print help + std::cout << "Usage: ./cl_fused_conv2d_elementwise_add ih iw ifm wh ww ofm tuner_choice(0=Disable, 1=Rapid, 2=Normal, 3=Exhaustive) pad_x pad_y\n"; + std::cout << "Too few or no input_matrices provided. 
Using shape config = SRGAN_0, tuner_choice=2\n\n"; + ih = 512; + iw = 512; + ifm = 64; + wh = 1; + ww = 1; + ofm = 3; + tuner_choice = 2; + pad_x = 0; + pad_y = 0; + } + else + { + ih = strtol(argv[1], nullptr, 10); + iw = strtol(argv[2], nullptr, 10); + ifm = strtol(argv[3], nullptr, 10); + wh = strtol(argv[4], nullptr, 10); + ww = strtol(argv[5], nullptr, 10); + ofm = strtol(argv[6], nullptr, 10); + tuner_choice = strtol(argv[7], nullptr, 10); + pad_x = strtol(argv[8], nullptr, 10); + pad_y = strtol(argv[9], nullptr, 10); + } + + CLTuner *tuner_to_use; + switch(tuner_choice) + { + case 0: + { + tuner_to_use = nullptr; + break; + } + case 1: + { + tuner.set_tuner_mode(CLTunerMode::RAPID); + tuner_to_use = &tuner; + break; + } + case 3: + { + tuner.set_tuner_mode(CLTunerMode::EXHAUSTIVE); + tuner_to_use = &tuner; + break; + } + case 2: + default: + { + tuner.set_tuner_mode(CLTunerMode::NORMAL); + tuner_to_use = &tuner; + break; + } + } + CLScheduler::get().default_init(tuner_to_use); + + TICK(startup_time); + /* Computation: + * out = add_desc(addend, conv2d1x1(direct_conv)(input, weights, bias)) + */ + const auto data_type = DataType::F32; + const auto data_layout = DataLayout::NHWC; + + const auto t_input_shape = TensorShape(ifm, iw, ih); + const auto t_weight_shape = TensorShape(ifm, ww, wh, ofm); + const auto t_bias_shape = TensorShape(ofm); + const auto t_l1_addend_shape = TensorShape(ofm, iw); + + std::cout << "input_shape: " << t_input_shape << std::endl; + std::cout << "weight_shape: " << t_weight_shape << std::endl; + std::cout << "bias_shape: " << t_bias_shape << std::endl; + std::cout << "addend_shape: " << t_l1_addend_shape << std::endl; + + /// @page example_dynamic_fusion_cl_conv2d_elementwise_add + /// @section describe_workload_using_operator_graph Describe the workload to run using OperatorGraph + /// OperatorGraph is a graph of Tensors and Operators. Let's first default-construct it + /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Construct OperatorGraph + // [Construct OperatorGraph] + OperatorGraph op_graph; + // [Construct OperatorGraph] + + /// @page example_dynamic_fusion_cl_conv2d_elementwise_add + /// @subsection add_conv2d Add the first operator (root operator) Conv2d + /// The first operator to be added to the graph is called the "root operator" of the entire graph. + /// @note As of now, operators need to be inserted according to their dependency order. This is because output tensor auto-initialization occurs during construction time. + /// Later this might be changed to allow out-of-order insertion. + + /// Before we insert the operator, we need to initialize the required TensorInfo objects. + /// We can choose not to initialize an output TensorInfo; if so, they will be auto-initialized during the construction of the OperatorGraph + /// The "t_acc_info" is the TensorInfo of the accumulator tensor, which is the output tensor of our first operator conv2d + /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Initialize Conv2d TensorInfo + // [Initialize Conv2d TensorInfo] + auto t_input_info = TensorInfo(t_input_shape, 1, data_type, data_layout); + auto t_weight_info = TensorInfo(t_weight_shape, 1, data_type, data_layout); + auto t_bias_info = TensorInfo(t_bias_shape, 1, data_type, data_layout); + auto t_acc_info = TensorInfo(); + // [Initialize Conv2d TensorInfo] + + /// @page example_dynamic_fusion_cl_conv2d_elementwise_add + /// Next we associate the TensorInfo with the OpTensor s created in the op_graph. 
+ /// @note The associated TensorInfo objects must be in scope and remain valid until the ClWorkload building is completed
+
+ /// @note The associated TensorInfo objects must be declared as non-const, since they may be updated during the OperatorGraph construction
+
+ /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Add OpTensors
+ // [Add OpTensors]
+ const auto op_t_input = add_tensor(op_graph, t_input_info);
+ const auto op_t_weight = add_tensor(op_graph, t_weight_info);
+ const auto op_t_bias = add_tensor(op_graph, t_bias_info);
+ const auto op_t_acc = add_tensor(op_graph, t_acc_info);
+ // [Add OpTensors]
+
+ /// @page example_dynamic_fusion_cl_conv2d_elementwise_add
+ /// Finally we add the Conv2d operator to op_graph. The Conv2dDescriptor contains all the TOSA-compliant attribute parameters.
+ /// The add_op... group of functions accepts the OpTensors created by the add_tensor function, and returns an Operator handle.
+ /// This handle can be used to further query and modify the operator inside the OperatorGraph after its creation.
+ /// For example, here we use the handle to force the ConvolutionMethod to be Direct Convolution.
+ /// @note The force_conv2d_method is only for debug purposes for now, as the end user is not expected to decide on the ConvolutionMethod
+
+ /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Add Conv2d Operator
+ // [Add Conv2d Operator]
+ Conv2dDescriptor conv2d_desc{ Padding2D{ pad_x, pad_x, pad_y, pad_y } };
+ auto conv2d = add_op_conv2d(op_graph, conv2d_desc, op_t_input, op_t_weight, op_t_bias, op_t_acc);
+ force_conv2d_method(op_graph, conv2d, ConvolutionMethod::DIRECT); // Only for debug purposes
+ // [Add Conv2d Operator]
+
+ /// @page example_dynamic_fusion_cl_conv2d_elementwise_add
+ /// @subsection add_elementwise_add Add the second operator Elementwise Add
+ /// This is similar to adding the first operator to op_graph, except that we link the two operators together by their common tensor,
+ /// namely the accumulator tensor op_t_acc, which is the output of conv2d and the input (lhs) of the addition.
+ /// @note At the moment, it is recommended to always declare a separate TensorInfo (even if empty) for each OpTensor.
+ /// For example, here op_t_dst could be associated with the same TensorInfo as op_t_acc, since the two infos are identical,
+ /// but we still recommend creating a separate object.
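As an aside on the auto-initialization behaviour noted above: t_acc_info was default-constructed empty, and the notes indicate that the graph populates it when the Conv2d operator is inserted. The following is a minimal sketch, assuming that behaviour and reusing the identifiers declared above, of how this could be observed (the streaming operators come from utils/TypePrinter.h, which this example already includes):

    // t_acc_info started out as an empty TensorInfo(); after add_op_conv2d the graph
    // is expected to have populated it with the inferred accumulator shape and type.
    std::cout << "auto-initialized acc shape: " << t_acc_info.tensor_shape() << std::endl;
    std::cout << "auto-initialized acc type : " << t_acc_info.data_type() << std::endl;

If the shape still prints as empty at this point, the dependency-order requirement mentioned earlier has most likely been violated.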
+ + /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Add Elementwise Add Operator + // [Add Elementwise Add Operator] + auto t_l1_addend_info = TensorInfo(t_l1_addend_shape, 1, data_type, data_layout); + auto t_dst_info = TensorInfo(); + const auto op_t_l1_addend = add_tensor(op_graph, t_l1_addend_info); + const auto op_t_dst = add_tensor(op_graph, t_dst_info); + AddDescriptor add_desc{}; + add_op_elementwise_add(op_graph, add_desc, op_t_acc, op_t_l1_addend, op_t_dst); + // [Add Elementwise Add Operator] + + /// @page example_dynamic_fusion_cl_conv2d_elementwise_add + /// @section build_clworkload Build ClWorkload + /// ClWorkload is an intermediate object which contains all the built kernel codes and all other descriptors on how to schedule them + /// We build ClWorkload from the op_graph object that we just described + /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Build ClWorkload + // [Build ClWorkload] + const ClWorkloadContext workload_ctx + { + GpuInfo{ CLScheduler::get().target() } + }; + ClWorkload workload; + build(workload, op_graph, workload_ctx); + // [Build ClWorkload] + + /// @page example_dynamic_fusion_cl_conv2d_elementwise_add + /// @section run_fused_op_with_clcompositeoperator Run the fused operator workload with ClCompositeOperator + /// @subsection configure_and_validate_clcompositeoperator Validate ClWorkload and Configure ClCompositeOperator + /// After ClWorkload is built, we need to configure it with the Compute Library runtime ClCompositeOperator to run it. + /// Optionally we can explicitly validate the workload to check if the workload has been built successfully. + /// The validate is automatically run inside configure and would throw if it fails. + /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Construct ClCompositeOperator + /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Validate and configure ClCompositeOperator + // [Validate and configure ClCompositeOperator] + const auto success = ClCompositeOperator::validate(workload); // Optional + op.configure(CLKernelLibrary::get().get_compile_context(), workload); + // [Validate and configure ClCompositeOperator] + + /// @page example_dynamic_fusion_cl_conv2d_elementwise_add + /// @subsection run_clcompositeoperator Run ClCompositeOperator + /// Construct the runtime CLTensor s with backing memory + /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Construct CLTensor objects + + /// Initialize, allocate and fill the CLTensor objects + /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Initialize, Allocate and Fill CLTensor objects + // [Initialize, Allocate and Fill CLTensor objects] + t_input.allocator()->init(t_input_info); + t_weight.allocator()->init(t_weight_info); + t_bias.allocator()->init(t_bias_info); + t_l1_addend.allocator()->init(t_dst_info); + t_dst.allocator()->init(t_dst_info); + + t_input.allocator()->allocate(); + t_weight.allocator()->allocate(); + t_bias.allocator()->allocate(); + t_l1_addend.allocator()->allocate(); + t_dst.allocator()->allocate(); + + fill_random_tensor(t_input, -1.f, 1.f); + fill_random_tensor(t_weight, -1.f, 1.f); + fill_random_tensor(t_l1_addend, -1.f, 1.f); + // [Initialize, Allocate and Fill CLTensor objects] + + /// @page example_dynamic_fusion_cl_conv2d_elementwise_add + /// The OpTensorBinding creates a mapping from the OpTensor handles that we created early to the real CLTensors + /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Create OpTensorBinding + // 
[Create OpTensorBinding]
+ OpTensorBinding op_tensors({ { op_t_input, &t_input },
+ { op_t_weight, &t_weight },
+ { op_t_bias, &t_bias },
+ { op_t_l1_addend, &t_l1_addend },
+ { op_t_dst, &t_dst }
+ });
+ // [Create OpTensorBinding]
+
+ /// @page example_dynamic_fusion_cl_conv2d_elementwise_add
+ /// Bind the CLTensor objects to the prepare_pack_map and run_pack_map, which are used to prepare and run the op.
+ /// This step additionally creates empty auxiliary CLTensor objects, if any are needed, and contains them inside a ClAuxTensorData aux_tensor_data
+ /// @note This step associates all the CLTensors contained in op_tensors and aux_tensor_data with prepare_pack_map and run_pack_map.
+ /// Make sure these CLTensors remain valid as long as the two pack_maps are still in use
+
+ /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Construct ClAuxTensorData
+ /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Construct TensorPackMaps
+ /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Bind Tensors
+ // [Bind Tensors]
+ bind_tensors(aux_tensor_data, prepare_pack_map, run_pack_map, workload, op_tensors);
+ // [Bind Tensors]
+
+ /// @page example_dynamic_fusion_cl_conv2d_elementwise_add
+ /// Initialize and Allocate Auxiliary CLTensor objects.
+ /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Initialize and Allocate Auxiliary CLTensor objects
+ // [Initialize and Allocate Auxiliary CLTensor objects]
+ for(auto tensor_data : aux_tensor_data.get_tensors())
+ {
+ tensor_data.tensor->allocator()->init(tensor_data.tensor_info);
+ tensor_data.tensor->allocator()->allocate();
+ }
+ // [Initialize and Allocate Auxiliary CLTensor objects]
+
+ /// @page example_dynamic_fusion_cl_conv2d_elementwise_add
+ /// Run the ClCompositeOperator prepare job, shown below. This performs any jobs that are required for the first run, like
+ /// reshaping tensors into a more performant format.
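Before looking at the prepare and run calls that follow, here is a minimal sketch of how the whole configure / prepare-once / run-many pattern in this example is typically driven end to end. It reuses the members of this example (workload, op, prepare_pack_map, run_pack_map); the explicit Status check is an assumption based on the generic arm_compute::Status interface rather than something this example does, and num_iterations is illustrative only:

    // Optionally inspect the validation result before configuring (assumed pattern).
    const Status status = ClCompositeOperator::validate(workload);
    if(status.error_code() != ErrorCode::OK)
    {
        std::cout << "Workload validation failed: " << status.error_description() << std::endl;
        return false;
    }
    op.configure(CLKernelLibrary::get().get_compile_context(), workload);

    op.prepare(prepare_pack_map);           // one-off work, e.g. tensor reshaping
    for(int i = 0; i < num_iterations; ++i) // num_iterations: illustrative only
    {
        op.run(run_pack_map);               // steady-state execution
    }
    CLScheduler::get().sync();              // wait for the queued OpenCL jobs to finish

In the Example framework used here, do_setup() performs the configure and prepare steps once, while do_run() repeats op.run() followed by a sync.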
+ /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Prepare ClCompositeOperator + // [Prepare ClCompositeOperator] + op.prepare(prepare_pack_map); + // [Prepare ClCompositeOperator] + + /// @page example_dynamic_fusion_cl_conv2d_elementwise_add + /// At last, we run our operator + /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Run ClCompositeOperator + // [Run ClCompositeOperator] + op.run(run_pack_map); + // [Run ClCompositeOperator] + TOCK(startup_time, measurements); + return true; + } + void do_run() override + { + // Run the fused op + op.run(run_pack_map); + + // Make sure all the OpenCL jobs are done executing: + CLScheduler::get().sync(); + } + + void do_teardown() override + { + for(auto m : measurements) + { + std::cout << m.first << ": " << m.second.count() << "us" << std::endl; + } + } + +private: + // [Construct CLTensor objects] + CLTensor t_input{}; + CLTensor t_weight{}; + CLTensor t_bias{}; + CLTensor t_l1_addend{}; + CLTensor t_dst{}; + // [Construct CLTensor objects] + // [Construct ClAuxTensorData] + ClAuxTensorData aux_tensor_data{}; + // [Construct ClAuxTensorData] + // [Construct TensorPackMaps] + TensorPackMap prepare_pack_map{}; + TensorPackMap run_pack_map{}; + // [Construct TensorPackMaps] + // [Construct ClCompositeOperator] + ClCompositeOperator op{}; + // [Construct ClCompositeOperator] + CLTuner tuner{}; + std::map measurements{}; +}; + +/** Main program for sgemm test + * + * @param[in] argc Number of arguments + * @param[in] argv Arguments ( [optional] Matrix A, [optional] Matrix B, [optional] Matrix C, [optional] alpha, [optional] beta ) + */ +int main(int argc, char **argv) +{ + return utils::run_example(argc, argv); +} + +#undef TICK +#undef TOCK +#undef TOCK_AVG \ No newline at end of file diff --git a/examples/dynamic_fusion/cl_ref_conv2d_elementwise_add.cpp b/examples/dynamic_fusion/cl_ref_conv2d_elementwise_add.cpp new file mode 100644 index 0000000000..4f68372b49 --- /dev/null +++ b/examples/dynamic_fusion/cl_ref_conv2d_elementwise_add.cpp @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ARM_COMPUTE_CL /* Needed by Utils.cpp to handle OpenCL exceptions properly */ +#error "This example needs to be built with -DARM_COMPUTE_CL" +#endif /* ARM_COMPUTE_CL */ + +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/CL/CLTuner.h" +#include "arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h" +#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h" + +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "utils/TypePrinter.h" +#include "utils/Utils.h" + +#include + +using namespace arm_compute; +using namespace utils; + +#define TICK(clock_name) \ + auto clock_name##_tick = std::chrono::high_resolution_clock::now(); +#define TOCK(clock_name, measurement_map) \ + auto clock_name##_tock = std::chrono::high_resolution_clock::now(); \ + measurement_map["\"" #clock_name "\""] = duration_cast(clock_name##_tock - clock_name##_tick); +#define TOCK_AVG(clock_name, measurement_map, num_iterations) \ + auto clock_name##_tock = std::chrono::high_resolution_clock::now(); \ + measurement_map["\"" #clock_name "\""] = duration_cast((clock_name##_tock - clock_name##_tick) / (num_iterations)); + +using std::chrono::duration_cast; +using std::chrono::microseconds; +class ClRefConv2dEltwiseAddExample : public Example +{ +public: + bool do_setup(int argc, char **argv) override + { + size_t ih; + size_t iw; + size_t ifm; + size_t wh; + size_t ww; + size_t ofm; + size_t tuner_choice; + unsigned int pad_x; + unsigned int pad_y; + if(argc < 10) + { + // Print help + std::cout << "Usage: ./cl_conv2d_elementwise_add ih iw ifm wh ww ofm tuner_choice(0=Disable, 1=Rapid, 2=Normal, 3=Exhaustive)\n"; + std::cout << "Too few or no input_matrices provided. 
Using shape config = SRGAN_0, tuner_choice=2\n\n"; + ih = 512; + iw = 512; + ifm = 64; + wh = 1; + ww = 1; + ofm = 3; + tuner_choice = 2; + pad_x = 0; + pad_y = 0; + } + else + { + ih = strtol(argv[1], nullptr, 10); + iw = strtol(argv[2], nullptr, 10); + ifm = strtol(argv[3], nullptr, 10); + wh = strtol(argv[4], nullptr, 10); + ww = strtol(argv[5], nullptr, 10); + ofm = strtol(argv[6], nullptr, 10); + tuner_choice = strtol(argv[7], nullptr, 10); + pad_x = strtol(argv[8], nullptr, 10); + pad_y = strtol(argv[9], nullptr, 10); + } + + CLTuner *tuner_to_use; + switch(tuner_choice) + { + case 0: + { + tuner_to_use = nullptr; + break; + } + case 1: + { + tuner.set_tuner_mode(CLTunerMode::RAPID); + tuner_to_use = &tuner; + break; + } + case 3: + { + tuner.set_tuner_mode(CLTunerMode::EXHAUSTIVE); + tuner_to_use = &tuner; + break; + } + case 2: + default: + { + tuner.set_tuner_mode(CLTunerMode::NORMAL); + tuner_to_use = &tuner; + break; + } + } + + CLScheduler::get().default_init(tuner_to_use); + + TICK(startup_time); + + /* Computation: + * out = add_desc(addend, conv2d1x1(direct_conv)(input, weights, bias)) + */ + const auto data_type = DataType::F32; + const auto data_layout = DataLayout::NHWC; + const PadStrideInfo conv_info{ 1, 1, pad_x, pad_y }; + // const auto t_input_shape = TensorShape(384, 12, 12); + // const auto t_weight_shape = TensorShape(384, 1, 1, 64); + // const auto t_dst_shape = TensorShape(64, 12, 12); + const auto t_input_shape = TensorShape(ifm, iw, ih); + const auto t_weight_shape = TensorShape(ifm, ww, wh, ofm); + const auto t_dst_shape = misc::shape_calculator::compute_deep_convolution_shape(t_input_shape, data_layout, t_weight_shape, conv_info); + std::cout << "input_shape: " << t_input_shape << std::endl; + std::cout << "weight_shape: " << t_weight_shape << std::endl; + std::cout << "dst_shape: " << t_dst_shape << std::endl; + auto t_input_info = TensorInfo(t_input_shape, 1, data_type, data_layout); + auto t_weight_info = TensorInfo(t_weight_shape, 1, data_type, data_layout); + auto t_l0_dst_info = TensorInfo(t_dst_shape, 1, data_type, data_layout); // Intermediate tensor for cond3 + auto t_l1_addend_info = TensorInfo(t_dst_shape, 1, data_type, data_layout); + auto t_dst_info = TensorInfo(t_dst_shape, 1, data_type, data_layout); + + // Init tensors + { + t_input.allocator()->init(t_input_info); + t_weight.allocator()->init(t_weight_info); + t_l1_addend.allocator()->init(t_dst_info); + t_l0_dst.allocator()->init(t_l0_dst_info); + t_dst.allocator()->init(t_dst_info); + } + + op0.configure(&t_input, &t_weight, nullptr, &t_l0_dst, conv_info); + op1.configure(&t_l0_dst, &t_l1_addend, &t_dst, ConvertPolicy{}); + + // Construct tensors + // Allocate and fill tensors + { + t_input.allocator()->allocate(); + t_weight.allocator()->allocate(); + t_l1_addend.allocator()->allocate(); + t_l0_dst.allocator()->allocate(); + t_dst.allocator()->allocate(); + fill_random_tensor(t_input, -1.f, 1.f); + fill_random_tensor(t_weight, -1.f, 1.f); + fill_random_tensor(t_l1_addend, -1.f, 1.f); + } + // Dummy run for CLTuner + op0.run(); + op1.run(); + TOCK(startup_time, measurements); + return true; + } + void do_run() override + { + // Run the fused op + op0.run(); + op1.run(); + + // Make sure all the OpenCL jobs are done executing: + CLScheduler::get().sync(); + } + + void do_teardown() override + { + for(auto m : measurements) + { + std::cout << m.first << ": " << m.second.count() << "us" << std::endl; + } + } + +private: + CLTensor t_input{}; + CLTensor t_weight{}; + CLTensor t_l1_addend{}; + 
CLTensor t_l0_dst{}; + CLTensor t_dst{}; + CLDirectConvolutionLayer op0{}; + CLArithmeticAddition op1{}; + CLTuner tuner{}; + std::map measurements{}; +}; + +/** Main program for sgemm test + * + * @param[in] argc Number of arguments + * @param[in] argv Arguments ( [optional] Matrix A, [optional] Matrix B, [optional] Matrix C, [optional] alpha, [optional] beta ) + */ +int main(int argc, char **argv) +{ + return utils::run_example(argc, argv); +} + +#undef TICK +#undef TOCK +#undef TOCK_AVG \ No newline at end of file diff --git a/filelist.json b/filelist.json index 93dfdfff6e..dc4be05f58 100644 --- a/filelist.json +++ b/filelist.json @@ -2074,10 +2074,17 @@ "dynamic_fusion": [ "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp", "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp", - "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp", "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp", "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp", - "src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.cpp" + "src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.cpp", + + "src/core/experimental/dynamic_fusion/OperatorGraph.cpp", + "src/core/experimental/dynamic_fusion/WorkloadImpl/ClWorkload.cpp", + "src/gpu/cl/operators/experimental/dynamic_fusion/ClCompositeOperator.cpp", + "src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.cpp", + "src/core/experimental/dynamic_fusion/WorkloadImpl/DependencyGraph.cpp", + "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.cpp", + "src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.cpp" ] } } diff --git a/src/core/CL/ICLKernel.h b/src/core/CL/ICLKernel.h index 046679e34e..d52b105507 100644 --- a/src/core/CL/ICLKernel.h +++ b/src/core/CL/ICLKernel.h @@ -349,7 +349,7 @@ public: #if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) /// The execution is carried out through run_op method. But the run_op method needs to be extended to include ClExecutionDescriptor as now LWS GWS tuning will be separated from the IKernel - virtual void run_composite_op(experimental::dynamic_fusion::TensorBinding &tensors, const Window &window, cl::CommandQueue &queue, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc) + virtual void run_composite_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc) { ARM_COMPUTE_UNUSED(tensors, window, queue, exec_desc); } diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp index 3e9ed060be..3d49dde5c8 100644 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp @@ -21,7 +21,9 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ #include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h" #include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h" @@ -49,69 +51,46 @@ const ClKernelBlueprint::Implementation &ClKernelBlueprint::impl() const return *_impl; } -Status add_tensor_argument(ClKernelBlueprint &kernel_blueprint, const ClTensorDescriptor &tensor_desc, ArgumentID &id) +Status add_tensor(ClKernelBlueprint &kernel_blueprint, ITensorInfo *tensor_info, ArgumentID &id, ArgumentID merge_point) { - id = kernel_blueprint.impl().add_kernel_argument(tensor_desc); + id = kernel_blueprint.impl().add_kernel_tensor(tensor_info, merge_point); return Status{}; } -Status add_tensor_intermed(ClKernelBlueprint &kernel_blueprint, ArgumentID &id) -{ - id = kernel_blueprint.impl().add_intermediate_tensor(); - return Status{}; -} - -Status add_kcomp_gemm_native(ClKernelBlueprint &kernel_blueprint, const ClKernelComponentDescriptor &, - const GemmNativeDescriptor &gemm_native_desc, - ArgumentID lhs_id, ArgumentID rhs_id, ArgumentID bias_id, ArgumentID &dst_id) -{ - kernel_blueprint.impl().validate_arg_ids({ lhs_id, rhs_id, bias_id, dst_id }); - kernel_blueprint.impl().add_component( - std::make_unique( - &kernel_blueprint, - gemm_native_desc, - SharedVarLink{ lhs_id, SharedVarIO::Input, kernel_blueprint.impl().group(lhs_id) }, - SharedVarLink{ rhs_id, SharedVarIO::Input, kernel_blueprint.impl().group(rhs_id) }, - SharedVarLink{ dst_id, SharedVarIO::Output, kernel_blueprint.impl().group(dst_id) }, - SharedVarLink{ bias_id, SharedVarIO::Input, kernel_blueprint.impl().group(bias_id) })); - - return Status{}; -} - -Status add_kcomp_eltwise_add(ClKernelBlueprint &kernel_blueprint, const ClKernelComponentDescriptor &, const EltwiseAddDescriptor &, +Status add_kcomp_eltwise_add(ClKernelBlueprint &kernel_blueprint, const ClEltwiseAddKernelDescriptor &, ArgumentID src0_id, ArgumentID src1_id, ArgumentID &dst_id) { kernel_blueprint.impl().add_component( std::make_unique( &kernel_blueprint, - SharedVarLink{ src0_id, SharedVarIO::Input, kernel_blueprint.impl().group(src0_id) }, - SharedVarLink{ src1_id, SharedVarIO::Input, kernel_blueprint.impl().group(src1_id) }, - SharedVarLink{ dst_id, SharedVarIO::Output, kernel_blueprint.impl().group(dst_id) })); + SharedVarLink{ src0_id, SharedVarIO::Input }, + SharedVarLink{ src1_id, SharedVarIO::Input }, + SharedVarLink{ dst_id, SharedVarIO::Output })); return Status{}; } -Status add_kcomp_activation(ClKernelBlueprint &, const ClKernelComponentDescriptor &, const ActivationDescriptor &, ArgumentID, ArgumentID &) +Status add_kcomp_activation(ClKernelBlueprint &, const ClActivationKernelDescriptor &, ArgumentID, ArgumentID &) { return Status{}; } -Status add_kcomp_direct_conv(ClKernelBlueprint &kernel_blueprint, const ClKernelComponentDescriptor &, - const DirectConvolutionDescriptor &direct_conv2d_desc, - ArgumentID src_id, ArgumentID weight_id, ArgumentID bias_id, ArgumentID &dst_id) +Status add_kcomp_direct_conv2d(ClKernelBlueprint &kernel_blueprint, + const ClDirectConv2dKernelDescriptor &direct_conv2d_desc, + ArgumentID src_id, ArgumentID weight_id, ArgumentID bias_id, ArgumentID &dst_id) { kernel_blueprint.impl().add_component( std::make_unique( &kernel_blueprint, direct_conv2d_desc, - SharedVarLink{ src_id, SharedVarIO::Input, 
kernel_blueprint.impl().group(src_id) }, - SharedVarLink{ weight_id, SharedVarIO::Input, kernel_blueprint.impl().group(weight_id) }, - SharedVarLink{ dst_id, SharedVarIO::Output, kernel_blueprint.impl().group(dst_id) }, - SharedVarLink{ bias_id, SharedVarIO::Input, kernel_blueprint.impl().group(bias_id) })); + SharedVarLink{ src_id, SharedVarIO::Input }, + SharedVarLink{ weight_id, SharedVarIO::Input }, + SharedVarLink{ dst_id, SharedVarIO::Output }, + SharedVarLink{ bias_id, SharedVarIO::Input })); return Status{}; } -Status add_kcomp_store(ClKernelBlueprint &kernel_blueprint, const ClKernelComponentDescriptor &, ArgumentID src_tile, ArgumentID dst_tile, const StoreType &store_type) +Status add_kcomp_store(ClKernelBlueprint &kernel_blueprint, const StoreType &store_type, ArgumentID src_tile, ArgumentID dst_tile) { switch(store_type) { @@ -119,15 +98,15 @@ Status add_kcomp_store(ClKernelBlueprint &kernel_blueprint, const ClKernelCompon kernel_blueprint.impl().add_component( std::make_unique( &kernel_blueprint, - SharedVarLink{ src_tile, SharedVarIO::Input, kernel_blueprint.impl().group(src_tile) }, - SharedVarLink{ dst_tile, SharedVarIO::Output, kernel_blueprint.impl().group(dst_tile) })); + SharedVarLink{ src_tile, SharedVarIO::Input }, + SharedVarLink{ dst_tile, SharedVarIO::Output })); break; case StoreType::TStoreIndirectWidthSelect: kernel_blueprint.impl().add_component( std::make_unique( &kernel_blueprint, - SharedVarLink{ src_tile, SharedVarIO::Input, kernel_blueprint.impl().group(src_tile) }, - SharedVarLink{ dst_tile, SharedVarIO::Output, kernel_blueprint.impl().group(dst_tile) })); + SharedVarLink{ src_tile, SharedVarIO::Input }, + SharedVarLink{ dst_tile, SharedVarIO::Output })); break; default: ARM_COMPUTE_ERROR("Store mode not yet supported."); @@ -136,6 +115,11 @@ Status add_kcomp_store(ClKernelBlueprint &kernel_blueprint, const ClKernelCompon return Status{}; } +Status update_merge_point(ClKernelBlueprint &bp, ArgumentID t_id, ArgumentID merge_point) +{ + return bp.impl().update_merge_point(t_id, merge_point); +} + Status set_tile_info(ClKernelBlueprint &bp, const TileDescriptor &tile_info) { bp.impl().set_tile_info(tile_info); @@ -143,6 +127,7 @@ Status set_tile_info(ClKernelBlueprint &bp, const TileDescriptor &tile_info) } Status build(ClKernelCode &code, const ClCodeBuilderContext &, ClKernelBlueprint &kernel_blueprint) { + kernel_blueprint.impl().finalize(); code.name = kernel_blueprint.impl().build_kernel_name(); code.code = kernel_blueprint.impl().build_code(); @@ -153,12 +138,14 @@ Status build(ClKernelCode &code, const ClCodeBuilderContext &, ClKernelBlueprint return Status{}; } +DependencyGraph get_dependency_graph(const ClKernelBlueprint &blueprint) +{ + return blueprint.impl().get_graph(); +} Status tune_static(ClExecutionDescriptor &, const ClKernelCode &) { return Status{}; } } // namespace dynamic_fusion } // namespace experimental -} // namespace arm_compute - -#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) \ No newline at end of file +} // namespace arm_compute \ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h index 23629f47bc..3dccdd7351 100644 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h @@ -21,13 +21,18 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ #ifndef ARM_COMPUTE_EXPERIMENTAL_CLKERNELBUILDINGAPI_H #define ARM_COMPUTE_EXPERIMENTAL_CLKERNELBUILDINGAPI_H #include "arm_compute/core/CL/CLCompileContext.h" #include "arm_compute/core/Window.h" +#include "arm_compute/core/experimental/ClWorkload.h" +#include "arm_compute/core/experimental/DependencyGraph.h" +#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelDescriptors.h" namespace arm_compute { @@ -35,46 +40,9 @@ namespace experimental { namespace dynamic_fusion { -using ArgumentID = int32_t; +using ArgumentID = DependencyGraph::Id; -static constexpr ArgumentID g_arg_placeholder = -1; - -/** Verbose and explicit way to enumerate all the tensor arguments variants used by - * all kernel implementations. This avoids any ambiguity in what kernel arguments are passed - */ -enum class TensorArgType : int -{ - Scalar, - - Vector, - - Image, - Image_Reinterpret_As_3D, - Image_Export_To_ClImage2D, - - Image_3D, // 3D Tensor represented as a 2D Image + stride_z - Image_3D_Export_To_ClImage2D, - - Tensor_3D, - Tensor_4D, - - Tensor_4D_t_Buffer, - Tensor_4D_t_Image -}; -/** Describes all the info required to add a kernel argument at run time */ -struct ClKernelArgRuntimeDescriptor -{ - ClKernelArgRuntimeDescriptor(int arg_id, TensorArgType type, bool slide_along_dimz = true) - : arg_id{ arg_id }, tensor_arg_type{ type }, slide_along_dimz{ slide_along_dimz } - { - } - ~ClKernelArgRuntimeDescriptor() = default; - int arg_id{ g_arg_placeholder }; // Arg ID in the blueprint - TensorArgType tensor_arg_type{ TensorArgType::Image }; - bool slide_along_dimz{ true }; -}; - -using ClKernelArgList = std::vector; +static constexpr ArgumentID g_arg_placeholder = DependencyGraph::empty_id(); /** Intermediate representation of the final, complete kernel source. 
*/ class ClKernelBlueprint @@ -93,145 +61,38 @@ public: }; ///// Kernel Components ///// - -/** Meta information about all Cl Kernel Components */ -struct ClKernelComponentDescriptor -{ - int32_t version{ 1 }; /**< Operator version */ -}; - -/** Component: Tensor Argument */ -struct ClTensorDescriptor -{ - ClTensorDescriptor(ITensorInfo *info) - : tensor_info(info) - { - } - - ITensorInfo *tensor_info; -}; - -Status add_tensor_argument(ClKernelBlueprint &, const ClTensorDescriptor &, ArgumentID &); -Status add_tensor_intermed(ClKernelBlueprint &, ArgumentID &); - -/** Component: Gemm Native */ -struct GemmNativeDescriptor -{ - float alpha{}; - float beta{}; - unsigned int m{}; - unsigned int n{}; - unsigned int k{}; - unsigned int depth_output_gemm3d{}; - bool reinterpret_input_as_3d{}; - bool broadcast_bias{}; - bool fp_mixed_precision{}; - bool has_pad_y{}; - int nmult_transpose1xW_width{}; - int mult_interleave4x4_height{}; - GEMMLHSMatrixInfo lhs_info{}; - GEMMRHSMatrixInfo rhs_info{}; - int32_t a_offset{}; - int32_t b_offset{}; -}; - -Status add_kcomp_gemm_native(ClKernelBlueprint &, const ClKernelComponentDescriptor &, const GemmNativeDescriptor &, - ArgumentID lhs_id, ArgumentID rhs_id, ArgumentID bias_id, ArgumentID &dst_id); - /** Component: Eltwise Add */ -struct EltwiseAddDescriptor -{ - ConvertPolicy convert_policy{ ConvertPolicy::SATURATE }; -}; -Status add_kcomp_eltwise_add(ClKernelBlueprint &, const ClKernelComponentDescriptor &, const EltwiseAddDescriptor &, ArgumentID src0_id, +Status add_kcomp_eltwise_add(ClKernelBlueprint &, const ClEltwiseAddKernelDescriptor &, ArgumentID src0_id, ArgumentID src1_id, ArgumentID &dst_id); /** Component: Activation */ -struct ActivationDescriptor -{ -}; -Status add_kcomp_activation(ClKernelBlueprint &, const ClKernelComponentDescriptor &, const ActivationDescriptor &, ArgumentID src_id, ArgumentID &dst_id); +Status add_kcomp_activation(ClKernelBlueprint &, const ClActivationKernelDescriptor &, ArgumentID src_id, ArgumentID &dst_id); /** Component: Direct Convolution **/ -struct DirectConvolutionDescriptor -{ - PadStrideInfo pad_stride_info{}; -}; -Status add_kcomp_direct_conv(ClKernelBlueprint &, const ClKernelComponentDescriptor &, const DirectConvolutionDescriptor &, - ArgumentID src_id, ArgumentID weight_id, ArgumentID bias_id, ArgumentID &dst_id); - -enum class ClippingStrategy -{ - TOP_LEFT, - TOP_RIGHT, - BOTTOM_LEFT, - BOTTOM_RIGHT, -}; +Status add_kcomp_direct_conv2d(ClKernelBlueprint &, const ClDirectConv2dKernelDescriptor &, + ArgumentID src_id, ArgumentID weight_id, ArgumentID bias_id, ArgumentID &dst_id); -/** Component: Store */ -struct TileDescriptor -{ - Size2D tile_dims{}; - Size2D boundaries{}; - ClippingStrategy clipping{ ClippingStrategy::TOP_LEFT }; - - TileDescriptor() - { - } +Status add_kcomp_store(ClKernelBlueprint &, const StoreType &store_type, ArgumentID src_id, ArgumentID dst_id); - TileDescriptor(Size2D dims, const Size2D &bound, const ClippingStrategy &clip) - : tile_dims(dims), boundaries(bound), clipping(clip) - { - } - - bool empty() const - { - return (tile_dims.area() == 0) || (boundaries.area() == 0); - } -}; - -enum class StoreType -{ - VStore, - VStorePartial, - StoreRow, - ConvertStoreRow, - StoreBlock, - ConvertStoreBlock, - StoreRowPartial, - StoreBlockPartial, - StoreBlockBoundaryAware, - StoreVectorSelect, - TStoreIndirectWidthSelect -}; - -Status add_kcomp_store(ClKernelBlueprint &, const ClKernelComponentDescriptor &, ArgumentID src_id, ArgumentID dst_id, const StoreType &store_type); +Status 
add_tensor(ClKernelBlueprint &, ITensorInfo *, ArgumentID &, ArgumentID merge_point = DependencyGraph::empty_id()); ///// Kernel Components ///// ///// Building ///// -/** Information required for kernel compilation. The build results of KernelBlueprint */ -struct ClKernelCode -{ - std::string name{}; /**< Kernel name */ - std::string code{}; /**< Kernel source code */ - std::string config_id{}; /**< Generated from blueprint based on complex component */ - CLBuildOptions build_options{}; /**< Kernel build options */ - Window window{}; /**< Execution window */ - ClKernelArgList arguments{}; /**< Kernel argument specficiations */ - - bool operator==(const ClKernelCode &other) const - { - return name == other.name && code == other.code && build_options == other.build_options; - } -}; +/** Update existing merge tensor @p merge_point to point to @p t_id + * + * @param t_id + * @param merge_point + * @return Status + */ +Status update_merge_point(ClKernelBlueprint &, ArgumentID t_id, ArgumentID merge_point); -/** GPU information for building the @ref ClKernelCode */ -struct GpuInfo -{ - GPUTarget target{ GPUTarget::UNKNOWN }; -}; +/** Get dependency graph + * + * @return DependencyGraph + */ +DependencyGraph get_dependency_graph(const ClKernelBlueprint &blueprint); /** All information required for building the @ref ClKernelCode */ struct ClCodeBuilderContext @@ -247,12 +108,6 @@ Status build(ClKernelCode &code, const ClCodeBuilderContext &, ClKernelBlueprint ///// Building ///// ///// Tuning ///// -struct ClExecutionDescriptor -{ - cl::NDRange suggested_lws{}; /**< Suggested local work-group size for optimal performance if not zero */ - cl::NDRange gws{}; /**< Global work-group to be used */ - bool skip_sliding_window{ false }; /**< Skip sliding window slices during execution loop */ -}; Status tune_static(ClExecutionDescriptor &, const ClKernelCode &); @@ -261,6 +116,4 @@ Status tune_static(ClExecutionDescriptor &, const ClKernelCode &); } // namespace dynamic_fusion } // namespace experimental } // namespace arm_compute -#endif //ARM_COMPUTE_EXPERIMENTAL_CLKERNELBUILDINGAPI_H - -#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) \ No newline at end of file +#endif //ARM_COMPUTE_EXPERIMENTAL_CLKERNELBUILDINGAPI_H \ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h index aa27572746..17437c285d 100644 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h @@ -21,7 +21,9 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ #ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMMON_H #define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMMON_H @@ -36,6 +38,7 @@ #include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h" +#include #include #include #include @@ -63,8 +66,8 @@ enum class SharedVarIO enum class SharedVarGroup { - Argument, // Parameters to a kernel function - Automatic // Automatic variables declared within the kernel body + Argument, // Parameters to a kernel function == dst or src tensors of the whole blueprint graph + Automatic // Automatic variables declared within the kernel body == intermediate tensors of the whole blueprint graph }; /** Specifies a shared variable link for a component. @@ -74,85 +77,151 @@ enum class SharedVarGroup */ struct SharedVarLink { - ArgumentID arg_id{ g_arg_placeholder }; - SharedVarIO io{ SharedVarIO::Input }; - SharedVarGroup group{ SharedVarGroup::Argument }; - bool is_empty() const + ArgumentID arg_id{ g_arg_placeholder }; + SharedVarIO io{ SharedVarIO::Input }; + bool is_empty() const { return arg_id == g_arg_placeholder; } }; /** A table of all the variables used in the kernel / blueprint + * Because we limit the DependencyGraph in the blueprint to a Linear Sequence for now, we only allow ** a single global variable (the accumulator) ** + * * NOTE: the order they appear in the table is the order of their "declaration" in the component code, and is also their ID * NOTE: the variables all have the scope of the full kernel function */ class SharedVarTable { public: + /** A fully realized SharedVarLink + */ struct SharedVar { - SharedVarGroup group; - std::string uniq_name; // Unique name, also the final variable name used in the built code - ClKernelArgRuntimeDescriptor desc; // Automatic variables can and should still be described using this struct + ArgumentID arg_id{ g_arg_placeholder }; + SharedVarIO io{ SharedVarIO::Input }; + SharedVarGroup group{ SharedVarGroup::Argument }; + std::string uniq_name{}; // Unique name, also the final variable name used in the built code + ClKernelArgDescriptor desc{}; // Automatic variables can and should still be described using this struct + bool is_empty() const + { + return arg_id == g_arg_placeholder; + } }; - using Arguments = std::vector; + class Arguments + { + public: + Arguments() = default; + void add_var(const SharedVar &var) + { + ARM_COMPUTE_ERROR_ON(var.group != SharedVarGroup::Argument); + _vars.push_back(var); + } + std::vector get_all_vars() const + { + return _vars; + } + std::vector get_src_vars() const + { + std::vector src_vars; + std::copy_if(_vars.begin(), _vars.end(), std::back_inserter(src_vars), [](const SharedVar & var) + { + return var.io == SharedVarIO::Input; + }); + return src_vars; + } + SharedVar get_dst_var() const + { + std::vector dst_vars; + std::copy_if(_vars.begin(), _vars.end(), std::back_inserter(dst_vars), [](const SharedVar & var) + { + return var.io == SharedVarIO::Output; + }); + ARM_COMPUTE_ERROR_ON(dst_vars.size() != 1); + return dst_vars.at(0); + } + + private: + std::vector _vars{}; + }; - /** @note: The order of insertion is important. There is one precondition: + /** Create a SharedVar for a corresponding SharedVarLink (contains ArgumentID). 
If one has already been created for the SharedVarLink, simply return it instead of creating a new one + * + * @note: The order of insertion is important. There is one precondition: * PRECOND: The components have been sorted topologically / is being traversed in topological order * This ensures that all the consumer var links (Output, Automatic Links) can consume (return) the producer var links when they're referred */ - SharedVar add(SharedVarLink var_link, ClKernelArgRuntimeDescriptor runtime_desc, const std::string &name = "unnamed") + void add(SharedVarLink var_link, SharedVarGroup group, ClKernelArgDescriptor runtime_desc, const std::string &name = "unnamed") { ARM_COMPUTE_ERROR_ON_MSG(var_link.is_empty(), "Non-empty SharedVarLink expected"); + if(!get(var_link).is_empty()) + { + return; + } + auto var_id = _num_var; std::stringstream ss; ss << name << "_" << var_id; const auto uniq_name = ss.str(); - SharedVar var{ var_link.group, uniq_name, runtime_desc }; + SharedVar var{ var_link.arg_id, var_link.io, group, uniq_name, runtime_desc }; - if(var_link.group == SharedVarGroup::Argument) + if(group == SharedVarGroup::Argument) { _arguments.emplace(var_id, var); + _arg_id_map.emplace(var_link.arg_id, var_id); _num_var++; - _var_id_lut[var_link.arg_id] = var_id; } - else if(var_link.group == SharedVarGroup::Automatic) + else if(group == SharedVarGroup::Automatic) { - if(var_link.io == SharedVarIO::Output) + if(_global_vars.empty()) { - _global_vars.emplace(var_id, var); - _num_var++; - _var_id_lut[var_link.arg_id] = var_id; + if(var_link.io == SharedVarIO::Output) + { + _global_vars.emplace(var_id, var); + _arg_id_map.emplace(var_link.arg_id, var_id); + _num_var++; + } + else + { + ARM_COMPUTE_ERROR("Component likely not traversed in topological order"); + } } else { - // For the input link, the var (and thus its arg_id) will always have been added by the time we get here if we traverse components in topological order - var = get_var(var_link.arg_id); + // Associate additional SharedVarLinks with the single global shared variable + const auto global_var_id = _global_vars.begin()->first; + _arg_id_map[var_link.arg_id] = global_var_id; } } else { ARM_COMPUTE_ERROR("Unrecognised SharedVarGroup"); } - return var; } - SharedVar get_var(ArgumentID arg_id) const + /** Get the SharedVar associated with @p var_link + * + * @param var_link + * @return SharedVar + */ + SharedVar get(const SharedVarLink &var_link) const { - const auto var_id = _var_id_lut.at(arg_id); // arg_id has to exist in lut to begin with - auto it = _global_vars.find(var_id); - if(it != _global_vars.end()) - { - return it->second; - } - it = _arguments.find(var_id); - if(it != _arguments.end()) + const SharedVar empty_var{}; + if(_arg_id_map.find(var_link.arg_id) != _arg_id_map.end()) { - return it->second; + const auto var_id = _arg_id_map.at(var_link.arg_id); + const auto arg_var = _arguments.find(var_id); + if(arg_var != _arguments.end()) + { + return arg_var->second; + } + else + { + return _global_vars.at(var_id); + } } - ARM_COMPUTE_ERROR("Cannot find component variable"); + return empty_var; } /** @note The arguments are returned in the order they are added @@ -162,7 +231,7 @@ public: Arguments args{}; for(const auto &a : _arguments) { - args.push_back(a.second); + args.add_var(a.second); } return args; } @@ -171,9 +240,9 @@ private: using VarID = int32_t; private: - std::map _global_vars{}; - std::map _arguments{}; - std::unordered_map _var_id_lut{}; + std::map _global_vars{}; // Shared, global variable + std::map 
_arguments{}; + std::map _arg_id_map{}; // Track ArgumentIDs that have already been added VarID _num_var{ 0 }; }; @@ -184,7 +253,7 @@ enum class ComponentType Store }; -using ComponentID = int32_t; +using ComponentID = DependencyGraph::Id; using ComponentList = std::vector; class IClKernelComponent { @@ -224,7 +293,7 @@ public: }; using TagLUT = std::unordered_map; // Used to instantiating a code template / replacing tags public: - IClKernelComponent(const ClKernelBlueprint *blueprint) + IClKernelComponent(ClKernelBlueprint *blueprint) : _blueprint(blueprint) { } @@ -304,12 +373,18 @@ public: { return Window{}; } - /** "Allocate" all shared variables used in a component to the @p vtable, and generate a TagLUT used to instantiate the component code + /** Get the tag look-up table used to instantiate the component code. * * @param vtable * @return TagLUT */ - virtual TagLUT allocate_vars(SharedVarTable &vtable) const = 0; + virtual TagLUT get_tag_lut(const SharedVarTable &vtable) const = 0; + + /** Allocate all shared variables used by the component in the @p vtable + * + * @param vtable + */ + virtual void allocate_shared_vars(SharedVarTable &vtable) const = 0; virtual std::string get_dst_addr_calculation() const { @@ -331,7 +406,7 @@ public: } protected: - const ClKernelBlueprint *_blueprint; + ClKernelBlueprint *_blueprint; private: ComponentID _id{}; @@ -348,18 +423,19 @@ public: ~Implementation() = default; public: - ArgumentID add_kernel_argument(const ClTensorDescriptor &tensor_desc) + Status update_merge_point(ArgumentID t_id, ArgumentID merge_point) { - _kernel_arguments.insert(std::make_pair(_num_args, tensor_desc)); - _shared_var_group_lut[_num_args] = SharedVarGroup::Argument; - return _num_args++; + return _graph.update_merge_point(t_id, merge_point); } - ArgumentID add_intermediate_tensor() + ArgumentID add_kernel_tensor(ITensorInfo *tensor_info, ArgumentID merge_point = DependencyGraph::empty_id()) { - _intermediate_tensors.insert(_num_args); - _shared_var_group_lut[_num_args] = SharedVarGroup::Automatic; - return _num_args++; + const auto id = _graph.add_tensor(merge_point); + if(_kernel_tensors.find(id) == _kernel_tensors.end()) + { + _kernel_tensors.insert(std::make_pair(id, tensor_info)); + } + return id; } void set_tile_info(const TileDescriptor &tile_info) @@ -382,7 +458,7 @@ public: for(const auto arg_id : args) { ARM_COMPUTE_UNUSED(arg_id); - ARM_COMPUTE_ERROR_ON_MSG(_kernel_arguments.find(arg_id) == _kernel_arguments.end() && _intermediate_tensors.find(arg_id) == _intermediate_tensors.end() && arg_id != g_arg_placeholder, + ARM_COMPUTE_ERROR_ON_MSG(_kernel_tensors.find(arg_id) == _kernel_tensors.end() && arg_id != g_arg_placeholder, "Trying to use an argument that hasn't been added to the blueprint"); } } @@ -395,29 +471,36 @@ public: ARM_COMPUTE_ERROR_ON_MSG(_num_complex_components > 1, "Only one complex component per blueprint is supported."); } - // This flag specifies if the current component is the root of the component graph - // If the root is set to -1, it means that a root hasn't been added yet - bool is_graph_root = true; - // Get an unique ID for the component that's being added - const ComponentID component_id = _num_components++; + std::vector src_tensors; + std::vector dst_tensors; + for(const auto &link : component->get_links()) + { + if(link.is_empty()) + { + continue; + } + if(link.io == SharedVarIO::Input) + { + src_tensors.push_back(link.arg_id); + } + else + { + dst_tensors.push_back(link.arg_id); + } + } + const ComponentID component_id = 
_graph.add_operator(src_tensors, dst_tensors).second; component->set_id(component_id); // Add this component to the component graph. Don't connect it to anything yet _component_graph.emplace(component_id, ComponentList{}); - int32_t positional_arg = 0; - // For every { arg_id, arg_io } passed along with this component... for(const auto &link : component->get_links()) { const ArgumentID &arg_id = link.arg_id; const SharedVarIO &arg_io = link.io; - // A component is considered root only if all its input arguments are kernel arguments (or placeholders, which means nullptr) - // This performs a check on every argument, and if one of them doesn't respect the condition, the component is not considered root - is_graph_root &= (_kernel_arguments.find(arg_id) != _kernel_arguments.end()) || (arg_io == SharedVarIO::Output) || (arg_id == g_arg_placeholder); - // Add the arg_id to the map describing the input/output relationship between an argument and the components that use it, if it doesn't yet exist there if(_outgoing_components.find(arg_id) == _outgoing_components.end()) { @@ -454,15 +537,9 @@ public: _incoming_components[arg_id].push_back(component_id); } - - ++positional_arg; } - if(is_graph_root) - { - ARM_COMPUTE_ERROR_ON_MSG(_graph_root >= 0, "Trying to add more than one root to the graph"); - _graph_root = component_id; - } + ARM_COMPUTE_ERROR_ON_MSG(_graph.get_root_ops().size() != 1, "Trying to add more than one root to the graph"); // Finally, add this component to the dictionary of components _components.insert(std::make_pair(component_id, std::move(component))); @@ -489,17 +566,28 @@ public: std::set additional_macros{}; std::vector component_codes{}; // vector because order matters - // Go through the components graph (topological sort) and fill the data structures above + // Step 1: Allocate all kernel argument shared variables before generating the component code auto stack = topological_sort(); while(!stack.empty()) { auto curr_component_id = stack.top(); auto &curr_component = _components.find(curr_component_id)->second; + curr_component->allocate_shared_vars(_vtable); + + stack.pop(); + } + // Step 2: Generate component codes + stack = topological_sort(); + while(!stack.empty()) + { + auto curr_component_id = stack.top(); + auto &curr_component = _components.find(curr_component_id)->second; + auto curr_headers_list = curr_component->get_headers_list(); auto curr_additional_macros = curr_component->get_additional_macros(); auto curr_component_code = curr_component->get_component_code(); - const auto var_lut = curr_component->allocate_vars(_vtable); // Ideally can be merged with get_component_code once we have finer-grained code generation technique + const auto var_lut = curr_component->get_tag_lut(_vtable); // Ideally can be merged with get_component_code once we have finer-grained code generation technique component_codes.push_back(IClKernelComponent::replace_tags(curr_component_code, var_lut)); headers_list.insert(curr_headers_list.begin(), curr_headers_list.end()); @@ -511,7 +599,7 @@ public: stack.pop(); } - // This section assembles the data gathered by traversing the graph into the string "code" + // Step 3: Assemble the data gathered by traversing the graph into the string "code" std::string code = ""; for(auto &header : headers_list) @@ -596,34 +684,79 @@ public: ClKernelArgList get_arguments() const { ClKernelArgList arg_list{}; - for(const auto &arg_var : _vtable.get_kernel_arguments()) + for(const auto &arg_var : _vtable.get_kernel_arguments().get_all_vars()) { - 
arg_list.push_back(arg_var.desc); + arg_list[arg_var.desc.arg_id] = arg_var.desc; } return arg_list; } - const ClTensorDescriptor *get_kernel_argument(const ArgumentID id) const + /** Get the arguments as shared vars from the vtable + * + * @return SharedVarTable::Arguments + */ + SharedVarTable::Arguments get_argument_shared_vars() const + { + return _vtable.get_kernel_arguments(); + } + + const ITensorInfo *get_kernel_argument_info(const ArgumentID id) const { - auto it = _kernel_arguments.find(id); - if(it != _kernel_arguments.end()) + auto it = _kernel_tensors.find(id); + if(it != _kernel_tensors.end()) { - return &_kernel_arguments.find(id)->second; + return it->second; } return nullptr; } - ITensorInfo *get_kernel_argument_info(const ArgumentID id) const + ITensorInfo *get_kernel_argument_info(const ArgumentID id) { - const ClTensorDescriptor *arg_desc = get_kernel_argument(id); - if(arg_desc != nullptr) + auto it = _kernel_tensors.find(id); + if(it != _kernel_tensors.end()) { - return arg_desc->tensor_info; + return it->second; } return nullptr; } + /** Finalize graph construction. Graph is expected to not mutate after being finalized + */ + void finalize() + { + cache_root_component(); + assign_shared_var_group(); + } + + DependencyGraph get_graph() const + { + return _graph; + } private: + void cache_root_component() + { + const auto roots = _graph.get_root_ops(); + ARM_COMPUTE_ERROR_ON_MSG(roots.size() != 1, "Trying to add more than one root to the graph"); + _graph_root = roots.at(0); + } + /** Assign the group for each shared var. Can only be performed at the end of the graph construction, before building + */ + void assign_shared_var_group() + { + for(const auto &tensor : _kernel_tensors) + { + const auto tensor_id = tensor.first; + if(_graph.is_src_tensor(tensor_id) || _graph.is_dst_tensor(tensor_id)) + { + _shared_var_group_lut[tensor_id] = SharedVarGroup::Argument; + } + else + { + _shared_var_group_lut[tensor_id] = SharedVarGroup::Automatic; + } + } + } + void topological_sort_utility(ComponentID component_id, std::unordered_set &visited, std::stack &stack) const { visited.insert(component_id); @@ -666,41 +799,41 @@ private: std::string code; switch(var.desc.tensor_arg_type) { - case TensorArgType::Vector: + case ClKernelTensorArgType::Vector: { code += "\n VECTOR_DECLARATION(" + var.uniq_name + ")"; break; } - case TensorArgType::Image: + case ClKernelTensorArgType::Image: { code += "\n IMAGE_DECLARATION(" + var.uniq_name + ")"; break; } - case TensorArgType::Image_3D: + case ClKernelTensorArgType::Image_3D: { code += "\n IMAGE_DECLARATION(" + var.uniq_name + "),"; code += "\n uint " + var.uniq_name + "_stride_z"; break; } - case TensorArgType::Image_3D_Export_To_ClImage2D: + case ClKernelTensorArgType::Image_3D_Export_To_ClImage2D: { code += "\n __read_only image2d_t " + var.uniq_name + "_img,"; code += "\n uint " + var.uniq_name + "_stride_z"; break; } - case TensorArgType::Tensor_4D_t_Buffer: + case ClKernelTensorArgType::Tensor_4D_t_Buffer: { code += "\n TENSOR4D_T(" + var.uniq_name + ", BUFFER)"; break; } - case TensorArgType::Tensor_4D_t_Image: + case ClKernelTensorArgType::Tensor_4D_t_Image: { code += "\n TENSOR4D_T(" + var.uniq_name + ", IMAGE)"; break; } default: { - ARM_COMPUTE_ERROR("Unsupported declaration generation for TensorArgType"); + ARM_COMPUTE_ERROR("Unsupported declaration generation for ClKernelTensorArgType"); } } return code; @@ -710,7 +843,7 @@ private: { std::string code = "\n__kernel void " + build_kernel_name() + "("; - for(const auto &arg : 
argument_list) + for(const auto &arg : argument_list.get_all_vars()) { code += generate_argument_declaration(arg) + ","; } @@ -722,54 +855,55 @@ private: std::string generate_global_section() const { - std::string code = ""; - code += " uint g_x = get_global_id(0);\n"; - code += " uint g_y = get_global_id(1);\n"; - code += " uint g_z = get_global_id(2);\n\n"; + auto dst_info = get_kernel_argument_info(_dst_id); + auto dst_w = dst_info->dimension(0); + auto dst_h = dst_info->dimension(1); + const auto tile_w = std::max(1, get_execution_window().x().step()); + const auto tile_h = std::max(1, get_execution_window().y().step()); + auto leftover_w = dst_w % tile_w; + auto leftover_h = dst_h % tile_h; - size_t tile_dim_x = _tile_info.empty() ? 1 : _tile_info.tile_dims.x(); - size_t tile_dim_y = _tile_info.empty() ? 1 : _tile_info.tile_dims.y(); + std::string code = ""; + code += std::string(" int cout = GET_SPATIAL_IDX(0, ") + std::to_string(tile_w) + ", " + std::to_string(leftover_w) + ");\n"; + code += std::string(" int mout = GET_SPATIAL_IDX(1, ") + std::to_string(tile_h) + ", " + std::to_string(leftover_h) + ");\n"; + code += std::string(" int bout = GET_SPATIAL_IDX(2, 1, 0);\n\n"); switch(_tile_info.clipping) { case ClippingStrategy::TOP_LEFT: - code += " const bool g_cond_x = (g_x == 0);\n"; - code += " const bool g_cond_y = (g_y == 0);\n"; + code += " const bool g_cond_x = (cout == 0);\n"; + code += " const bool g_cond_y = (mout == 0);\n"; break; case ClippingStrategy::TOP_RIGHT: - code += " const bool g_cond_x = ((g_x + 1) * " + std::to_string(tile_dim_x) + " >= " + std::to_string(_tile_info.boundaries.x()) + ");\n"; - code += " const bool g_cond_y = (g_y == 0);\n"; + code += " const bool g_cond_x = ((cout + 1) * " + std::to_string(tile_w) + " >= " + std::to_string(_tile_info.boundaries.x()) + ");\n"; + code += " const bool g_cond_y = (mout == 0);\n"; break; case ClippingStrategy::BOTTOM_LEFT: - code += " const bool g_cond_x = (g_x == 0);\n"; - code += " const bool g_cond_y = ((g_y + 1) * " + std::to_string(tile_dim_y) + " >= " + std::to_string(_tile_info.boundaries.y()) + ");\n"; + code += " const bool g_cond_x = (cout == 0);\n"; + code += " const bool g_cond_y = ((mout + 1) * " + std::to_string(tile_h) + " >= " + std::to_string(_tile_info.boundaries.y()) + ");\n"; break; case ClippingStrategy::BOTTOM_RIGHT: - code += " const bool g_cond_x = ((g_x + 1) * " + std::to_string(tile_dim_x) + " >= " + std::to_string(_tile_info.boundaries.x()) + ");\n"; - code += " const bool g_cond_y = ((g_y + 1) * " + std::to_string(tile_dim_y) + " >= " + std::to_string(_tile_info.boundaries.y()) + ");\n"; + code += " const bool g_cond_x = ((cout + 1) * " + std::to_string(tile_w) + " >= " + std::to_string(_tile_info.boundaries.x()) + ");\n"; + code += " const bool g_cond_y = ((mout + 1) * " + std::to_string(tile_h) + " >= " + std::to_string(_tile_info.boundaries.y()) + ");\n"; break; default: ARM_COMPUTE_ERROR("Unsupported clipping strategy"); } - code += "\n REPEAT_VAR_INIT_TO_CONST(" + std::to_string(tile_dim_y) + ", uint, g_zout, 0);\n"; - code += " REPEAT_VAR_INIT_TO_CONST(16, uint, g_zero, 0);\n\n"; - return code; } TileDescriptor _tile_info{}; - int32_t _num_args{}; - int32_t _num_components{}; int32_t _num_complex_components{}; ArgumentID _dst_id{ -1 }; // Initially set to -1, which means the graph has no dst yet, since node IDs are positive numbers - // Argument, components and intermediate tensors IDs with corresponding ptrs (except intermediate) + DependencyGraph _graph{}; + + // Tensors, 
components and IDs with corresponding ptrs (except intermediate) std::unordered_map _components{}; - std::unordered_map _kernel_arguments{}; - std::unordered_set _intermediate_tensors{}; + std::unordered_map _kernel_tensors{}; // Argument group lookup. Can be replaced by extending the ArgumentID type to include group info std::unordered_map _shared_var_group_lut{}; @@ -794,6 +928,4 @@ private: } // namespace dynamic_fusion } // namespace experimental } // namespace arm_compute -#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMMON_H - -#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) \ No newline at end of file +#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMMON_H \ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Utils.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Utils.h index 41ab4e320b..d4feac7da9 100644 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Utils.h +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Utils.h @@ -21,7 +21,9 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ #ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_UTILS #define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_UTILS @@ -72,6 +74,4 @@ inline std::string to_string(const ClKernelCode &code) } // namespace experimental } // namespace arm_compute -#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_UTILS - -#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) \ No newline at end of file +#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_UTILS \ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp index f951ce3d46..11fb1d53d0 100644 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp @@ -21,7 +21,9 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
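// Note on the shared-variable grouping introduced in assign_shared_var_group() above: graph
// source/destination tensors become kernel Arguments, while intermediate tensors become
// Automatic (internal tile) variables. A minimal sketch of that classification, assuming only
// the DependencyGraph queries already used above (is_src_tensor() / is_dst_tensor()); the helper
// name classify_tensor is illustrative and not part of the patch:
inline SharedVarGroup classify_tensor(const DependencyGraph &graph, ArgumentID tensor_id)
{
    // Tensors visible at the boundary of the fused kernel are passed in as kernel arguments;
    // tensors produced and consumed inside the fused kernel stay in tiles (Automatic group).
    return (graph.is_src_tensor(tensor_id) || graph.is_dst_tensor(tensor_id)) ? SharedVarGroup::Argument
                                                                              : SharedVarGroup::Automatic;
}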
*/ -#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ #include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h" @@ -31,6 +33,7 @@ #include "src/core/helpers/WindowHelpers.h" #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" +#include "arm_compute/runtime/CL/CLScheduler.h" namespace arm_compute { namespace experimental @@ -44,7 +47,7 @@ ComponentType ClDirectConvolutionKernelComponent::get_component_type() const std::set ClDirectConvolutionKernelComponent::get_headers_list() const { - return std::set { "helpers.h", "tile_helpers.h", "repeat.h" }; + return std::set { "helpers.h", "tile_helpers.h" }; } Window ClDirectConvolutionKernelComponent::get_window() const @@ -54,7 +57,17 @@ Window ClDirectConvolutionKernelComponent::get_window() const auto dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); // Get dst shape - TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src_info, *weight_info, _desc.pad_stride_info); + PadStrideInfo pad_stride_info + { + static_cast(_desc.conv2d.stride.x()), + static_cast(_desc.conv2d.stride.y()), + static_cast(_desc.conv2d.pad.left), + static_cast(_desc.conv2d.pad.right), + static_cast(_desc.conv2d.pad.top), + static_cast(_desc.conv2d.pad.bottom), + DimensionRoundingType::FLOOR /*default rounding type*/ + }; + TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src_info, *weight_info, pad_stride_info); // Output auto initialization if not yet initialized auto_init_if_empty(*dst_info, output_shape, @@ -64,6 +77,9 @@ Window ClDirectConvolutionKernelComponent::get_window() const const unsigned int vec_size = std::min(static_cast(dst_info->tensor_shape()[0]), 4u); const unsigned int num_rows = (dst_info->tensor_shape()[0] > 16) ? ((src_info->data_type() == DataType::F32) ? 2U : 4U) : 1U; + // const unsigned int num_rows = 1; + // const unsigned int vec_size = tile_info.tile_dims.x(); + // const unsigned int num_rows = tile_info.tile_dims.y(); // Create and configure kernel window Window win = calculate_max_window(output_shape, Steps(vec_size, num_rows)); @@ -95,27 +111,30 @@ std::string ClDirectConvolutionKernelComponent::get_component_code() const //------------------ START KERNEL {{meta_kernel_id}} --------------------- // IN_0(src) {{src}} // IN_1(wei) {{weight}} + )_"; + if(bias_info != nullptr) + { + code += R"_( // IN_1(bia) {{bias}} + )_"; + } + code += R"_( // OUT(dst, accum) {{dst}} - const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM - const int mout = GET_SPATIAL_IDX(1, M0, 0); // WIDTH x HEIGHT - const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX - // Initialize the accumulators TILE({{ACC_DATA_TYPE}}, M0, N0, {{dst}}); { // All the tensor dimensions are passed at compile time. // In case of dynamic tensor support, the following dimensions should be passed as function argument. 
- #define _I{{WEI_WIDTH}} {{WEI_WIDTH}} - #define _I{{WEI_HEIGHT}} {{WEI_HEIGHT}} + #define _IWEI_WIDTH {{WEI_WIDTH}} + #define _IWEI_HEIGHT {{WEI_HEIGHT}} #define _ISRC_WIDTH {{src}}_w #define _ISRC_HEIGHT {{src}}_h #define _ISRC_CHANNELS {{src}}_c - #define _IDST_WIDTH {{dst_w}} - #define _IDST_HEIGHT {{dst_h}} - #define _IDST_CHANNELS {{dst_c}} - #define _IY_MULTIPLIER (_I{{WEI_WIDTH}} * _I{{WEI_HEIGHT}}) + #define _IDST_WIDTH {{arg_dst}}_w + #define _IDST_HEIGHT {{arg_dst}}_h + #define _IDST_CHANNELS {{arg_dst}}_c + #define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT) // .v = access the whole vector (OpenCL vector) // .s[x] = access the vector element at position x (scalar access) @@ -136,13 +155,11 @@ std::string ClDirectConvolutionKernelComponent::get_component_code() const {{dst}}[i].v = 0; }) - uint cond = (get_global_id(0) == 0) && (get_global_id(1) == 0) && (get_global_id(2) == 0); - - for(int i = 0; i < (_I{{WEI_WIDTH}} * _I{{WEI_HEIGHT}}); ++i) + for(int i = 0; i < (_IWEI_WIDTH * _IWEI_HEIGHT); ++i) { int ck = 0; - int xk = i % _I{{WEI_WIDTH}}; - int yk = i / _I{{WEI_WIDTH}}; + int xk = i % _IWEI_WIDTH; + int yk = i / _IWEI_HEIGHT; int k = 0; for(; k <= (_ISRC_CHANNELS - K0); k += K0) @@ -201,6 +218,16 @@ std::string ClDirectConvolutionKernelComponent::get_component_code() const } code += R"_( + #undef _I_WEI_WIDTH + #undef _I_WEI_HEIGHT + #undef _ISRC_WIDTH + #undef _ISRC_HEIGHT + #undef _ISRC_CHANNELS + #undef _IDST_WIDTH + #undef _IDST_HEIGHT + #undef _IDST_CHANNELS + #undef _IY_MULTIPLIER + } )_"; @@ -217,44 +244,7 @@ std::string ClDirectConvolutionKernelComponent::get_component_code() const } code += R"_( - #undef _I{{WEI_WIDTH}} - #undef _I{{WEI_HEIGHT}} - #undef _ISRC_WIDTH - #undef _ISRC_HEIGHT - #undef _ISRC_CHANNELS - #undef _IDST_WIDTH - #undef _IDST_HEIGHT - #undef _IDST_CHANNELS - #undef _IY_MULTIPLIER } - - // Workaround for the discrepancy between tiles and repeats - VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}0 = {{dst}}[0].v; -#if M0 >= 2 - VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}1 = {{dst}}[1].v; -#endif // M0 >= 2 -#if M0 >= 3 - VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}2 = {{dst}}[2].v; -#endif // M0 >= 3 -#if M0 >= 4 - VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}3 = {{dst}}[3].v; -#endif // M0 >= 4 -#if M0 >= 8 - VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}4 = {{dst}}[4].v; - VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}5 = {{dst}}[5].v; - VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}6 = {{dst}}[6].v; - VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}7 = {{dst}}[7].v; -#endif // M0 >= 8 -#if M0 == 16 - VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}8 = {{dst}}[8].v; - VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}9 = {{dst}}[9].v; - VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}A = {{dst}}[10].v; - VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}B = {{dst}}[11].v; - VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}C = {{dst}}[12].v; - VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}D = {{dst}}[13].v; - VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}E = {{dst}}[14].v; - VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0) {{dst}}F = {{dst}}[15].v; -#endif // M0 == 16 //------------------ END KERNEL {{meta_kernel_id}} --------------------- )_"; return code.c_str(); @@ -306,19 +296,18 @@ bool export_to_cl_image_support(const ITensorInfo *tensor, GPUTarget gpu_target, CLBuildOptions ClDirectConvolutionKernelComponent::generate_build_options() const { const auto src_info = _blueprint->impl().get_kernel_argument_info(_src.arg_id); - const auto weight_info = 
_blueprint->impl().get_kernel_argument_info(_weight.arg_id); + auto weight_info = _blueprint->impl().get_kernel_argument_info(_weight.arg_id); const auto dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); + // const auto tile_info = _blueprint->impl().get_tile_info(); const unsigned int channel_idx = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::CHANNEL); const DataType data_type = src_info->data_type(); - const GPUTarget gpu_target = ICLKernel().get_target(); - - Window win = get_window(); + const GPUTarget gpu_target = CLScheduler::get().target(); - const unsigned int n0 = win.x().step(); - const unsigned int m0 = win.y().step(); + const unsigned int n0 = _blueprint->impl().get_execution_window().x().step(); + const unsigned int m0 = _blueprint->impl().get_execution_window().y().step(); const unsigned int k0 = adjust_vec_size(is_data_type_quantized(data_type) ? 16u : 8u, src_info->dimension(channel_idx)); - const unsigned int partial_store_n0 = dst_info->dimension(channel_idx) % n0; + const unsigned int partial_store_n0 = dst_info->dimension(0) % n0; const bool export_to_cl_image = export_to_cl_image_support(weight_info, gpu_target, src_info->data_layout()); // Update the padding for the weights tensor if we can export to cl_image @@ -338,54 +327,79 @@ CLBuildOptions ClDirectConvolutionKernelComponent::generate_build_options() cons return build_opts; } -ClDirectConvolutionKernelComponent::TagLUT ClDirectConvolutionKernelComponent::allocate_vars(SharedVarTable &vtable) const +void ClDirectConvolutionKernelComponent::allocate_shared_vars(SharedVarTable &vtable) const +{ + const auto src_info = _blueprint->impl().get_kernel_argument_info(_src.arg_id); + const auto weight_info = _blueprint->impl().get_kernel_argument_info(_weight.arg_id); + + vtable.add(_src, _blueprint->impl().group(_src.arg_id), ClKernelArgDescriptor(_src.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "src"); + + const GPUTarget gpu_target = CLScheduler::get().target(); + const bool export_to_cl_image = export_to_cl_image_support(weight_info, gpu_target, src_info->data_layout()); + const ClKernelTensorArgType weight_type = export_to_cl_image ? ClKernelTensorArgType::Tensor_4D_t_Image : ClKernelTensorArgType::Tensor_4D_t_Buffer; + vtable.add(_weight, _blueprint->impl().group(_weight.arg_id), ClKernelArgDescriptor(_weight.arg_id, weight_type), "weight"); + + if(!_bias.is_empty()) // optional bias + { + vtable.add(_bias, _blueprint->impl().group(_bias.arg_id), ClKernelArgDescriptor(_bias.arg_id, ClKernelTensorArgType::Vector), "bias"); + } + vtable.add(_dst, _blueprint->impl().group(_dst.arg_id), ClKernelArgDescriptor(_dst.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "dst"); +} + +ClDirectConvolutionKernelComponent::TagLUT ClDirectConvolutionKernelComponent::get_tag_lut(const SharedVarTable &vtable) const { TagLUT lut{}; const auto src_info = _blueprint->impl().get_kernel_argument_info(_src.arg_id); const auto weight_info = _blueprint->impl().get_kernel_argument_info(_weight.arg_id); const auto bias_info = _blueprint->impl().get_kernel_argument_info(_bias.arg_id); - const auto dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); - - const GPUTarget gpu_target = ICLKernel().get_target(); - const bool export_to_cl_image = export_to_cl_image_support(weight_info, gpu_target, src_info->data_layout()); - const TensorArgType weight_type = export_to_cl_image ? 
TensorArgType::Tensor_4D_t_Image : TensorArgType::Tensor_4D_t_Buffer; - lut["meta_kernel_id"] = id(); - lut["src"] = vtable.add(_src, ClKernelArgRuntimeDescriptor(_src.arg_id, TensorArgType::Tensor_4D_t_Buffer), "src"); - lut["weight"] = vtable.add(_weight, ClKernelArgRuntimeDescriptor(_weight.arg_id, weight_type), "weight"); + // Arguments and global shared variables + lut["src"] = vtable.get(_src); + lut["weight"] = vtable.get(_weight); if(!_bias.is_empty()) // optional bias { - lut["bias"] = vtable.add(_bias, ClKernelArgRuntimeDescriptor(_bias.arg_id, TensorArgType::Vector), "bias"); + lut["bias"] = vtable.get(_bias); lut["BIA_DATA_TYPE"] = get_cl_type_from_data_type(bias_info->data_type()); } - lut["dst"] = vtable.add(_dst, ClKernelArgRuntimeDescriptor(_dst.arg_id, TensorArgType::Tensor_4D_t_Buffer), "dst"); - - // Local build options - const auto width_idx = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::WIDTH); - const auto height_idx = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::HEIGHT); - const auto channel_idx = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::CHANNEL); + lut["dst"] = vtable.get(_dst); - lut["dst_w"] = dst_info->dimension(width_idx); - lut["dst_h"] = dst_info->dimension(height_idx); - lut["dst_c"] = dst_info->dimension(channel_idx); + const auto dst_argument = _blueprint->impl().get_argument_shared_vars().get_dst_var(); + lut["arg_dst"] = dst_argument.uniq_name; - lut["ACC_DATA_TYPE"] = src_info->data_type(); - lut["SRC_DATA_TYPE"] = src_info->data_type(); - lut["WEI_DATA_TYPE"] = weight_info->data_type(); + // Local build options + lut["meta_kernel_id"] = id(); + lut["ACC_DATA_TYPE"] = src_info->data_type(); + lut["SRC_DATA_TYPE"] = src_info->data_type(); + lut["WEI_DATA_TYPE"] = weight_info->data_type(); lut["SRC_TENSOR_TYPE"] = "BUFFER"; - lut["WEI_TENSOR_TYPE"] = export_to_cl_image ? 
"IMAGE" : "BUFFER"; - - lut["WEI_WIDTH"] = weight_info->dimension(width_idx); - lut["WEI_HEIGHT"] = weight_info->dimension(height_idx); + switch(vtable.get(_weight).desc.tensor_arg_type) + { + case ClKernelTensorArgType::Image_Export_To_ClImage2D: + case ClKernelTensorArgType::Image_3D_Export_To_ClImage2D: + case ClKernelTensorArgType::Tensor_4D_t_Image: + { + lut["WEI_TENSOR_TYPE"] = "IMAGE"; + break; + } + default: + { + lut["WEI_TENSOR_TYPE"] = "BUFFER"; + break; + } + } + const auto width_idx = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::WIDTH); + const auto height_idx = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::HEIGHT); + lut["WEI_WIDTH"] = weight_info->dimension(width_idx); + lut["WEI_HEIGHT"] = weight_info->dimension(height_idx); - lut["STRIDE_X"] = std::get<0>(_desc.pad_stride_info.stride()); - lut["STRIDE_Y"] = std::get<1>(_desc.pad_stride_info.stride()); + lut["STRIDE_X"] = _desc.conv2d.stride.x(); + lut["STRIDE_Y"] = _desc.conv2d.stride.y(); - lut["PAD_LEFT"] = _desc.pad_stride_info.pad_left(); - lut["PAD_TOP"] = _desc.pad_stride_info.pad_top(); + lut["PAD_LEFT"] = _desc.conv2d.pad.left; + lut["PAD_TOP"] = _desc.conv2d.pad.top; lut["ZERO_VALUE"] = 0; @@ -393,6 +407,4 @@ ClDirectConvolutionKernelComponent::TagLUT ClDirectConvolutionKernelComponent::a } } // namespace dynamic_fusion } // namespace experimental -} // namespace arm_compute - -#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) \ No newline at end of file +} // namespace arm_compute \ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h index 10c0e00a58..af9a65debc 100644 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h @@ -21,7 +21,9 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ #ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLDIRECTCONVOLUTIONKERNELCOMPONENT_H #define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLDIRECTCONVOLUTIONKERNELCOMPONENT_H @@ -39,7 +41,7 @@ namespace dynamic_fusion class ClDirectConvolutionKernelComponent : public IClKernelComponent { public: - ClDirectConvolutionKernelComponent(const ClKernelBlueprint *blueprint, const DirectConvolutionDescriptor &desc, + ClDirectConvolutionKernelComponent(ClKernelBlueprint *blueprint, const ClDirectConv2dKernelDescriptor &desc, const Link &src, const Link &weight, const Link &dst, const Link &bias = Link{}) : IClKernelComponent(blueprint), _desc{ desc }, _src{ src }, _weight{ weight }, _bias{ bias }, _dst{ dst } { @@ -58,7 +60,8 @@ public: return { _src, _weight, _bias, _dst }; } - virtual TagLUT allocate_vars(SharedVarTable &vtable) const override; + virtual TagLUT get_tag_lut(const SharedVarTable &vtable) const override; + virtual void allocate_shared_vars(SharedVarTable &vtable) const override; virtual std::string name() const override { @@ -66,16 +69,14 @@ public: } private: - DirectConvolutionDescriptor _desc{}; - Link _src{}; - Link _weight{}; - Link _bias{}; - Link _dst{}; + ClDirectConv2dKernelDescriptor _desc{}; + Link _src{}; + Link _weight{}; + Link _bias{}; + Link _dst{}; }; } // namespace dynamic_fusion } // namespace experimental } // namespace arm_compute -#endif // ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLDIRECTCONVOLUTIONKERNELCOMPONENT_H - -#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) \ No newline at end of file +#endif // ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLDIRECTCONVOLUTIONKERNELCOMPONENT_H \ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp index 84e4003d5d..2bbea8725d 100644 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.cpp @@ -21,7 +21,9 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
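// With the descriptor change in this header (DirectConvolutionDescriptor ->
// ClDirectConv2dKernelDescriptor), stride and padding travel as plain fields and are converted
// back to a PadStrideInfo only where shape calculation needs it, as done in get_window() earlier.
// A small sketch of that conversion, assuming the conv2d member exposes stride.x()/y() and
// pad.left/right/top/bottom as used there; the type name Conv2dDescriptor and the helper name
// are assumptions for illustration:
inline PadStrideInfo to_pad_stride_info(const Conv2dDescriptor &conv2d)
{
    return PadStrideInfo(static_cast<unsigned int>(conv2d.stride.x()),
                         static_cast<unsigned int>(conv2d.stride.y()),
                         static_cast<unsigned int>(conv2d.pad.left),
                         static_cast<unsigned int>(conv2d.pad.right),
                         static_cast<unsigned int>(conv2d.pad.top),
                         static_cast<unsigned int>(conv2d.pad.bottom),
                         DimensionRoundingType::FLOOR); // default rounding, as in get_window()
}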
*/ -#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ #include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h" #include "arm_compute/core/Validate.h" @@ -41,7 +43,7 @@ ComponentType ClElementwiseAddKernelComponent::get_component_type() const std::set ClElementwiseAddKernelComponent::get_headers_list() const { - return std::set { "common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h", "gemm_helpers.h", "repeat.h", "tile_helpers.h" }; + return std::set { "common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h", "tile_helpers.h" }; } Window ClElementwiseAddKernelComponent::get_window() const @@ -67,63 +69,62 @@ Window ClElementwiseAddKernelComponent::get_window() const std::string ClElementwiseAddKernelComponent::get_component_code() const { std::string code; - return R"_( + const bool is_root = _blueprint->impl().group(_lhs.arg_id) == SharedVarGroup::Argument && _blueprint->impl().group(_rhs.arg_id) == SharedVarGroup::Argument; + + if(is_root) + { + return R"_( //------------------ START KERNEL {{meta_kernel_id}} ELTWISE_ADD --------------------- - // IN_0(Accumulator) {{acc}} - // IN_1(Addend) {{addend}} + // IN_0(LHS) {{lhs}} + // IN_1(RHS) {{rhs}} + // OUT(dst, accum) {{dst}} - // c = addend + c (mix-precision, broadcast, boundary aware) + // dst = lhs + rhs (mix-precision, broadcast, boundary aware) + TILE({{DATA_TYPE}}, M0, N0, {{dst}}); { - __global uchar *addend_addr = {{addend}}_ptr + {{addend}}_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(g_y, M0, PARTIAL_STORE_M0) * {{addend}}_stride_y) + get_global_id(2) * {{addend}}_stride_z; \ - LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, addend, addend_addr, 0, {{addend}}_stride_y, g_zero, PARTIAL_LOAD_M0, PARTIAL_LOAD_N0, PARTIAL_COND_Y, PARTIAL_COND_X); \ - MIXED_PRECISION_ELTWISE_OP_BLOCK(ADD_X_POS_0, M0, N0, {{acc}}, addend, DATA_TYPE_ACCUMULATOR, addend_hp); - } + TILE({{DATA_TYPE}}, M0, N0, lhs_tile); + TILE({{DATA_TYPE}}, M0, N0, rhs_tile); - // Workaround for the discrepancy between tiles and repeats -#if defined(IS_TILED) - {{acc}}[0].v = {{acc}}0; -#if M0 >= 2 - {{acc}}[1].v = {{acc}}1; -#endif // M0 >= 2 -#if M0 >= 3 - {{acc}}[2].v = {{acc}}2; -#endif // M0 >= 3 -#if M0 >= 4 - {{acc}}[3].v = {{acc}}3; -#endif // M0 >= 4 -#if M0 >= 8 - {{acc}}[4].v = {{acc}}4; - {{acc}}[5].v = {{acc}}5; - {{acc}}[6].v = {{acc}}6; - {{acc}}[7].v = {{acc}}7; -#endif // M0 >= 8 -#if M0 == 16 - {{acc}}[8].v = {{acc}}8; - {{acc}}[9].v = {{acc}}9; - {{acc}}[10].v = {{acc}}A; - {{acc}}[11].v = {{acc}}B; - {{acc}}[12].v = {{acc}}C; - {{acc}}[13].v = {{acc}}D; - {{acc}}[14].v = {{acc}}E; - {{acc}}[15].v = {{acc}}F; -#endif // M0 == 16 -#endif // defined(IS_TILED) + T_LOAD({{DATA_TYPE}}, M0, N0, BUFFER, {{lhs}}, cout, mout, 1, {{lhs}}_stride_y, lhs_tile); + T_LOAD({{DATA_TYPE}}, M0, N0, BUFFER, {{rhs}}, cout, mout, 1, {{rhs}}_stride_y, rhs_tile); + + T_ADD_BROADCAST_X({{DATA_TYPE}}, M0, N0, lhs_tile, rhs_tile, {{dst}}); + } //------------------ END KERNEL {{meta_kernel_id}} ELTWISE_ADD --------------------- +)_"; + } + else + { + return R"_( + //------------------ START KERNEL {{meta_kernel_id}} ELTWISE_ADD --------------------- + // IN_0/Out(Accumulator) {{acc}} + // IN_1(Addend) {{addend}} + // acc = addend + acc (mix-precision, 
broadcast, boundary aware) + { + TILE({{DATA_TYPE}}, M0, N0, addend_tile); + + T_LOAD({{DATA_TYPE}}, M0, N0, BUFFER, {{addend}}, cout, mout, 1, {{addend}}_stride_y, addend_tile); + + T_ADD_BROADCAST_X({{DATA_TYPE}}, M0, N0, {{acc}}, addend_tile, {{acc}}); + } + //------------------ END KERNEL {{meta_kernel_id}} ELTWISE_ADD --------------------- )_"; + } } CLBuildOptions ClElementwiseAddKernelComponent::generate_build_options() const { - auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); - auto tile_info = _blueprint->impl().get_tile_info(); + const auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); CLBuildOptions build_opts{}; + const auto n0 = _blueprint->impl().get_execution_window().x().step(); + const auto m0 = _blueprint->impl().get_execution_window().y().step(); + const auto partial_m0 = t_dst_info->dimension(1) % m0; - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(t_dst_info->data_type())); - build_opts.add_option("-DM0=" + support::cpp11::to_string(tile_info.tile_dims.y())); - build_opts.add_option("-DN0=" + support::cpp11::to_string(tile_info.tile_dims.x())); - build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(tile_info.boundaries.y() % tile_info.tile_dims.y())); + build_opts.add_option("-DM0=" + support::cpp11::to_string(m0)); + build_opts.add_option("-DN0=" + support::cpp11::to_string(n0)); + build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_m0)); return build_opts; } @@ -142,34 +143,56 @@ std::string ClElementwiseAddKernelComponent::generate_config_id() const return config_id; } -ClElementwiseAddKernelComponent::TagLUT ClElementwiseAddKernelComponent::allocate_vars(SharedVarTable &vtable) const +void ClElementwiseAddKernelComponent::allocate_shared_vars(SharedVarTable &vtable) const { - // Determine which argument is the accumulator - Link accumulator; - Link addend; - if(_lhs.group == SharedVarGroup::Automatic) + const bool is_root = _blueprint->impl().group(_lhs.arg_id) == SharedVarGroup::Argument && _blueprint->impl().group(_rhs.arg_id) == SharedVarGroup::Argument; + vtable.add(_lhs, _blueprint->impl().group(_lhs.arg_id), ClKernelArgDescriptor(_lhs.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "lhs"); + vtable.add(_rhs, _blueprint->impl().group(_rhs.arg_id), ClKernelArgDescriptor(_rhs.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "rhs"); + if(is_root) { - accumulator = _lhs; - addend = _rhs; + vtable.add(_dst, _blueprint->impl().group(_dst.arg_id), ClKernelArgDescriptor(_dst.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "dst"); } - else if(_rhs.group == SharedVarGroup::Automatic) +} + +ClElementwiseAddKernelComponent::TagLUT ClElementwiseAddKernelComponent::get_tag_lut(const SharedVarTable &vtable) const +{ + TagLUT lut{}; + const auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); + // Arguments and global shared variables + const bool is_root = _blueprint->impl().group(_lhs.arg_id) == SharedVarGroup::Argument && _blueprint->impl().group(_rhs.arg_id) == SharedVarGroup::Argument; + if(is_root) { - accumulator = _rhs; - addend = _lhs; + lut["lhs"] = vtable.get(_lhs); + lut["rhs"] = vtable.get(_rhs); + lut["dst"] = vtable.get(_dst); } else { - ARM_COMPUTE_ERROR("Invalid elementwise component linking"); + // Determine which link is the accumulator + Link accumulator; + Link addend; + if(_blueprint->impl().group(_lhs.arg_id) == SharedVarGroup::Automatic) + { 
+ accumulator = _lhs; + addend = _rhs; + } + else if(_blueprint->impl().group(_rhs.arg_id) == SharedVarGroup::Automatic) + { + accumulator = _rhs; + addend = _lhs; + } + else + { + ARM_COMPUTE_ERROR("Invalid elementwise component linking"); + } + lut["acc"] = vtable.get(accumulator); + lut["addend"] = vtable.get(addend); } - return { - { "meta_kernel_id", id() }, - { "acc", vtable.add(accumulator, ClKernelArgRuntimeDescriptor(accumulator.arg_id, TensorArgType::Image_3D), "add_acc") }, - { "addend", vtable.add(addend, ClKernelArgRuntimeDescriptor(addend.arg_id, TensorArgType::Image_3D), "add_addend") }, - // {"dst", vtable.add(_dst, ClKernelArgRuntimeDescriptor(_dst.arg_id, TensorArgType::Image_3D), "dst")}, // dst is needed for the root version and/or non-inplace version should we need one - }; + // Local build options + lut["meta_kernel_id"] = id(); + lut["DATA_TYPE"] = get_cl_type_from_data_type(t_dst_info->data_type()); + return lut; } } // namespace dynamic_fusion } // namespace experimental -} // namespace arm_compute - -#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) \ No newline at end of file +} // namespace arm_compute \ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h index 35c9538b8d..4f7b69724d 100644 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h @@ -21,7 +21,9 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ #ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLELEMENTWISEADDKERNELCOMPONENT_H #define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLELEMENTWISEADDKERNELCOMPONENT_H @@ -37,7 +39,7 @@ namespace dynamic_fusion class ClElementwiseAddKernelComponent : public IClKernelComponent { public: - ClElementwiseAddKernelComponent(const ClKernelBlueprint *blueprint, const Link &lhs, const Link &rhs, const Link &dst) + ClElementwiseAddKernelComponent(ClKernelBlueprint *blueprint, const Link &lhs, const Link &rhs, const Link &dst) : IClKernelComponent(blueprint), _lhs{ lhs }, _rhs{ rhs }, _dst{ dst } { } @@ -54,7 +56,8 @@ public: return { _lhs, _rhs, _dst }; } - virtual TagLUT allocate_vars(SharedVarTable &vtable) const override; + virtual TagLUT get_tag_lut(const SharedVarTable &vtable) const override; + virtual void allocate_shared_vars(SharedVarTable &vtable) const override; virtual std::string name() const override { @@ -70,6 +73,4 @@ private: } // namespace dynamic_fusion } // namespace experimental } // namespace arm_compute -#endif // ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLELEMENTWISEADDKERNELCOMPONENT_H - -#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) \ No newline at end of file +#endif // ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLELEMENTWISEADDKERNELCOMPONENT_H \ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp 
b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp deleted file mode 100644 index 45b81b424d..0000000000 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.cpp +++ /dev/null @@ -1,555 +0,0 @@ -/* - * Copyright (c) 2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) - -#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h" -#include "arm_compute/core/TensorInfo.h" -#include "src/core/AccessWindowStatic.h" -#include "src/core/helpers/WindowHelpers.h" - -#include "src/core/utils/helpers/float_ops.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace experimental -{ -namespace dynamic_fusion -{ -ComponentType ClGemmNativeKernelComponent::get_component_type() const -{ - return ComponentType::Complex; -} - -std::set ClGemmNativeKernelComponent::get_headers_list() const -{ - return std::set { "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/fp_post_ops_act_eltwise_op_act.h", "gemm_helpers.h", "repeat.h" }; -} - -Window ClGemmNativeKernelComponent::get_window() const -{ - ITensorInfo *lhs_info = _blueprint->impl().get_kernel_argument_info(_lhs.arg_id); - ITensorInfo *rhs_info = _blueprint->impl().get_kernel_argument_info(_rhs.arg_id); - ITensorInfo *bias_info = _blueprint->impl().get_kernel_argument_info(_bias.arg_id); - ITensorInfo *dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); - - ARM_COMPUTE_ERROR_ON_NULLPTR(lhs_info, rhs_info, dst_info); - - bool reinterpret_input_as_3d = _desc.reinterpret_input_as_3d; - bool reinterpret_output_as_3d = _desc.depth_output_gemm3d != 0; - - Window win{}; - Window win_out{}; - bool window_changed = false; - - // In case both input and dst have to be reinterpreted as 3D tensors, - // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. 
- if(reinterpret_input_as_3d == reinterpret_output_as_3d) - { - reinterpret_output_as_3d = false; - } - - // activation_layer is set to dummy because it's required by GEMMKernelInfo, but it's not used in shape calculation - GEMMKernelInfo gemm_info(_desc.m, _desc.n, _desc.k, _desc.depth_output_gemm3d, _desc.reinterpret_input_as_3d, - _desc.broadcast_bias, _desc.fp_mixed_precision, _desc.has_pad_y, ActivationLayerInfo(), _desc.nmult_transpose1xW_width, - _desc.mult_interleave4x4_height, _desc.lhs_info, _desc.rhs_info, _desc.a_offset, _desc.b_offset); - - // dst tensor auto initialization if not yet initialized - auto_init_if_empty(*dst_info, lhs_info->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*lhs_info, *rhs_info, gemm_info))); - - TensorInfo tmp_info(*dst_info); - - if(reinterpret_output_as_3d) - { - // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, - // the window needs to be constructed on the 2D collapsed version of the tensor - TensorShape tmp_shape(dst_info->tensor_shape()); - tmp_shape.collapse(2U, 1U); - tmp_info.set_tensor_shape(tmp_shape); - } - - win = calculate_max_window(tmp_info, Steps(_desc.rhs_info.n0, _desc.lhs_info.m0)); - win_out = calculate_max_window(*dst_info, Steps(_desc.rhs_info.n0, _desc.lhs_info.m0)); - - AccessWindowStatic src0_access(lhs_info, 0, 0, - lhs_info->dimension(0), - lhs_info->dimension(1)); - AccessWindowStatic src1_access(rhs_info, 0, 0, - ceil_to_multiple(rhs_info->dimension(0), _desc.rhs_info.n0), - rhs_info->dimension(1)); - AccessWindowStatic dst_access(dst_info, 0, 0, - dst_info->dimension(0), - dst_info->dimension(1)); - - if(bias_info != nullptr) - { - const int bias_processed_per_iteration_x = _desc.rhs_info.n0; - - AccessWindowStatic src2_access(bias_info, 0, 0, - ceil_to_multiple(bias_info->dimension(0), bias_processed_per_iteration_x), - bias_info->dimension(1)); - - window_changed = update_window_and_padding(win, src0_access, src1_access, src2_access) || // window used by the execute_window_loop - update_window_and_padding(win_out, dst_access); // window used to update the padding requirements of dst tensor - } - else - { - window_changed = update_window_and_padding(win, src0_access, src1_access) || // window used by the execute_window_loop - update_window_and_padding(win_out, dst_access); // window used to update the padding requirements of dst tensor - } - - // Collapse along the Z direction - // This collapse needs to be here in order to tune the Z dimension of LWS - Window collapsed = win; - const unsigned int dimension_to_collapse = std::min(static_cast(dst_info->num_dimensions()), 2u); - collapsed = win.collapse(win, dimension_to_collapse); - - if(window_changed == true) - { - ARM_COMPUTE_ERROR("Insufficient Padding!"); - } - - return collapsed; -} - -std::string ClGemmNativeKernelComponent::get_additional_macros() const -{ - return R"_( -#define VFMA(a, b, c) \ -({ \ - c = fma(a, b, c); \ -}) - -#if M0 == 1 -#define RHS_VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - }) -#elif M0 == 2 // M0 == 2 -#define RHS_VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - }) -#elif M0 == 3 // M0 == 3 -#define RHS_VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, 
N0))((a##2).s##i), b, (c##2)); \ - }) -#elif M0 == 4 // M0 == 4 -#define RHS_VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ - }) -#elif M0 == 5 // M0 == 5 -#define RHS_VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ - }) -#elif M0 == 6 // M0 == 6 -#define RHS_VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \ - }) -#elif M0 == 7 // M0 == 7 -#define RHS_VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \ - }) -#elif M0 == 8 // M0 == 8 -#define RHS_VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \ - }) -#else // M0 not supported -#error "M0 not supported" -#endif // M0 not supported -)_"; -} - -std::string ClGemmNativeKernelComponent::get_component_code() const -{ - auto t_lhs_info = _blueprint->impl().get_kernel_argument_info(_lhs.arg_id); - auto t_rhs_info = _blueprint->impl().get_kernel_argument_info(_rhs.arg_id); - - auto has_alpha = !(helpers::float_ops::is_one(_desc.alpha)); - auto reinterpret_input_as_3d = _desc.reinterpret_input_as_3d && _desc.depth_output_gemm3d == 0; - auto dont_slide_b = t_rhs_info->num_dimensions() < t_lhs_info->num_dimensions(); - - std::string code = R"_( - //------------------ START KERNEL {{meta_kernel_id}} --------------------- - // IN_0(lhs) {{lhs}} - // IN_1(rhs) {{rhs}} - )_"; - - if(!_bias.is_empty()) - { - code += R"_( - // IN_2(bias) {{bias}} - )_"; - } - - code += R"_( - // OUT(dst, accum) {{dst}} - - // Initialize the accumulators - REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), {{dst}}, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... 
c(M0-1)=0; - { -#if defined(DUMMY_WORK_ITEMS) - if((g_x * N0 >= N) || (g_y * M0 >= M)) - { - return; - } -#endif // defined(DUMMY_WORK_ITEMS) - - // Compute LHS matrix address - uint lhs_offset = {{lhs}}_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(g_y, M0, PARTIAL_STORE_M0) * (uint){{lhs}}_stride_y; - - // Compute RHS matrix address - uint rhs_offset = {{rhs}}_offset_first_element_in_bytes + g_x * N0 * sizeof(DATA_TYPE); - )_"; - - if(dont_slide_b) - { - code += R"_( - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - rhs_offset += (g_z % {{MATRIX_B_DEPTH}}) * {{rhs}}_stride_z; - )_"; - } - else - { - code += R"_( - rhs_offset += g_z * {{rhs}}_stride_z; - )_"; - } - - code += R"_( - REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); - )_"; - - if(reinterpret_input_as_3d) - { - code += R"_( - // The plane (zlhs) is calculated dividing M (g_y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(g_y, M0, PARTIAL_STORE_M0), {{HEIGHT_GEMM3D}}, {{DEPTH_GEMM3D}}, {{lhs}}_cross_plane_pad, {{lhs}}_stride_y); - - // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we - // multiply lhs_stride_z by DEPTH_GEMM3D - lhs_offset += g_z * {{lhs}}_stride_z * {{DEPTH_GEMM3D}}; - )_"; - } - else - { - code += R"_( - // Add offset for batched GEMM - lhs_offset += g_z * {{lhs}}_stride_z; - )_"; - } - - code += R"_( - int i = 0; -#if {{K0}} > 1 - for(; i <= (K - {{K0}}); i += {{K0}}) - { - // Supported cases (M0, K0): - // 1,2 - 1,3 - 1,4 - 1,8 - 1,16 - // 2,2 - 2,3 - 2,4 - 2,8 - 2,16 - // 3,2 - 3,3 - 3,4 - 3,8 - 3,16 - // 4,2 - 4,3 - 4,4 - 4,8 - 4,16 - // 5,2 - 5,3 - 5,4 - 5,8 - 5,16 - // 6,2 - 6,3 - 6,4 - 6,8 - 6,16 - // 7,2 - 7,3 - 7,4 - 7,8 - 7,16 - // 8,2 - 8,3 - 8,4 - 8,8 - 8,16 - // Load values from LHS matrix - LOAD_BLOCK(M0, {{K0}}, DATA_TYPE, a, {{lhs}}_ptr, lhs_offset, {{lhs}}_stride_y, zlhs); - - // Load values from RHS matrix - LOAD_BLOCK({{K0}}, N0, DATA_TYPE, b, {{rhs}}_ptr, rhs_offset, {{rhs}}_stride_y, g_zero); - - RHS_VFMA_M0xN0(0, a, b0, {{dst}}); - RHS_VFMA_M0xN0(1, a, b1, {{dst}}); -#if {{K0}} > 2 - RHS_VFMA_M0xN0(2, a, b2, {{dst}}); -#endif // K0 > 2 -#if {{K0}} > 3 - RHS_VFMA_M0xN0(3, a, b3, {{dst}}); -#endif // K0 > 3 -#if {{K0}} > 4 - RHS_VFMA_M0xN0(4, a, b4, {{dst}}); - RHS_VFMA_M0xN0(5, a, b5, {{dst}}); - RHS_VFMA_M0xN0(6, a, b6, {{dst}}); - RHS_VFMA_M0xN0(7, a, b7, {{dst}}); -#endif // K0 > 4 -#if {{K0}} > 8 - RHS_VFMA_M0xN0(8, a, b8, {{dst}}); - RHS_VFMA_M0xN0(9, a, b9, {{dst}}); - RHS_VFMA_M0xN0(A, a, bA, {{dst}}); - RHS_VFMA_M0xN0(B, a, bB, {{dst}}); - RHS_VFMA_M0xN0(C, a, bC, {{dst}}); - RHS_VFMA_M0xN0(D, a, bD, {{dst}}); - RHS_VFMA_M0xN0(E, a, bE, {{dst}}); - RHS_VFMA_M0xN0(F, a, bF, {{dst}}); -#endif // K0 > 8 - - lhs_offset += {{K0}} * sizeof(DATA_TYPE); - rhs_offset += {{K0}} * {{rhs}}_stride_y; - } -#endif // K0 > 1 - // Left-over accumulations - for(; i < K; ++i) - { - // Load values from LHS matrix - VEC_DATA_TYPE(DATA_TYPE, 2) - a0 = *((__global DATA_TYPE *)({{lhs}}_ptr + lhs_offset + 0 * {{lhs}}_stride_y + zlhs0)); -#if M0 > 1 - VEC_DATA_TYPE(DATA_TYPE, 2) - a1 = *((__global DATA_TYPE *)({{lhs}}_ptr + lhs_offset + 1 * {{lhs}}_stride_y + zlhs1)); -#endif // M0 > 1 -#if M0 > 2 - VEC_DATA_TYPE(DATA_TYPE, 2) - a2 = *((__global DATA_TYPE *)({{lhs}}_ptr + lhs_offset + 2 * {{lhs}}_stride_y + zlhs2)); -#endif // M0 > 2 -#if M0 > 3 - VEC_DATA_TYPE(DATA_TYPE, 2) - a3 = *((__global DATA_TYPE *)({{lhs}}_ptr + lhs_offset + 3 * {{lhs}}_stride_y + zlhs3)); -#endif // M0 > 3 -#if 
M0 > 4 - VEC_DATA_TYPE(DATA_TYPE, 2) - a4 = *((__global DATA_TYPE *)({{lhs}}_ptr + lhs_offset + 4 * {{lhs}}_stride_y + zlhs4)); -#endif // M0 > 4 -#if M0 > 5 - VEC_DATA_TYPE(DATA_TYPE, 2) - a5 = *((__global DATA_TYPE *)({{lhs}}_ptr + lhs_offset + 5 * {{lhs}}_stride_y + zlhs5)); -#endif // M0 > 5 -#if M0 > 6 - VEC_DATA_TYPE(DATA_TYPE, 2) - a6 = *((__global DATA_TYPE *)({{lhs}}_ptr + lhs_offset + 6 * {{lhs}}_stride_y + zlhs6)); -#endif // M0 > 6 -#if M0 > 7 - VEC_DATA_TYPE(DATA_TYPE, 2) - a7 = *((__global DATA_TYPE *)({{lhs}}_ptr + lhs_offset + 7 * {{lhs}}_stride_y + zlhs7)); -#endif // M0 > 7 - - VEC_DATA_TYPE(DATA_TYPE, N0) - b = VLOAD(N0)(0, (__global DATA_TYPE *)({{rhs}}_ptr + rhs_offset + 0 * {{rhs}}_stride_y)); - RHS_VFMA_M0xN0(0, a, b, {{dst}}); - - lhs_offset += sizeof(DATA_TYPE); - rhs_offset += {{rhs}}_stride_y; - } - - // Multiply by the weight of matrix-matrix product and store the result - )_"; - if(has_alpha) - { - code += R"_( - SCALE_BLOCK(M0, DATA_TYPE, {{dst}}, {{ALPHA}}); - )_"; - } - - if(!_bias.is_empty()) - { - if(_desc.broadcast_bias) - { - code += R"_( - // Add beta*bias - __global uchar *bias_addr = {{bias}}_ptr + {{bias}}_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)); - - LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, {{bias}}_stride_y, g_zero); - )_"; - - if(helpers::float_ops::is_one(_desc.beta)) - { - code += R"_( - SCALE_BLOCK(1, DATA_TYPE, bias, {{BETA}}); - )_"; - } - - code += R"_( - // c = c + bias[broadcasted] - ADD_BLOCK_BROADCAST(M0, {{dst}}, bias0); - )_"; - } - else - { - code += R"_( - // Add beta*bias - __global uchar *bias_addr = {{bias}}_ptr + {{bias}}_offset_first_element_in_bytes + (g_x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(g_y, M0, - PARTIAL_STORE_M0) - * {{bias}}_stride_y) - + g_z * {{bias}}_stride_z; - - LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, {{bias}}_stride_y, g_zero); - )_"; - - if(helpers::float_ops::is_one(_desc.beta)) - { - code += R"_( - SCALE_BLOCK(M0, DATA_TYPE, bias, {{BETA}}); - )_"; - } - - code += R"_( - // c = c + bias - ADD_BLOCK(M0, {{dst}}, bias); - )_"; - } - } - - code += R"_( - } - //------------------ END KERNEL {{meta_kernel_id}} --------------------- - )_"; - return code.c_str(); -} - -CLBuildOptions ClGemmNativeKernelComponent::generate_build_options() const -{ - auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); - auto tile_info = _blueprint->impl().get_tile_info(); - - CLBuildOptions build_opts{}; - - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(t_dst_info->data_type())); - build_opts.add_option("-DM=" + support::cpp11::to_string(tile_info.boundaries.y())); - build_opts.add_option("-DN=" + support::cpp11::to_string(tile_info.boundaries.x())); - build_opts.add_option("-DK=" + support::cpp11::to_string(_desc.k)); - build_opts.add_option("-DM0=" + support::cpp11::to_string(tile_info.tile_dims.y())); - build_opts.add_option("-DN0=" + support::cpp11::to_string(tile_info.tile_dims.x())); - build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(tile_info.boundaries.y() % tile_info.tile_dims.y())); - build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(tile_info.boundaries.x() % tile_info.tile_dims.x())); - - return build_opts; -} - -std::string ClGemmNativeKernelComponent::generate_config_id() const -{ - auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); - std::string config_id{}; - config_id += (_bias.is_empty() ? 
"add_bias_" : ""); - config_id += (_desc.broadcast_bias ? "broadcast_bias_" : ""); - config_id += (_desc.reinterpret_input_as_3d ? "3di_" : ""); - config_id += (_desc.depth_output_gemm3d > 0 ? "3do_" : ""); - config_id += lower_string(string_from_data_type(t_dst_info->data_type())); - config_id += "_"; - config_id += support::cpp11::to_string(t_dst_info->dimension(1)); - config_id += "_"; - config_id += support::cpp11::to_string(t_dst_info->dimension(0)); - config_id += "_"; - config_id += support::cpp11::to_string(_desc.k); - config_id += "_"; - config_id += support::cpp11::to_string(t_dst_info->dimension(2)); - config_id += "_"; - config_id += support::cpp11::to_string(_desc.lhs_info.m0); - config_id += "_"; - config_id += support::cpp11::to_string(_desc.rhs_info.n0); - config_id += "_"; - config_id += support::cpp11::to_string(_desc.rhs_info.k0); - return config_id; -} - -ClGemmNativeKernelComponent::TagLUT ClGemmNativeKernelComponent::allocate_vars(SharedVarTable &vtable) const -{ - TagLUT lut{}; - - lut["meta_kernel_id"] = id(); - lut["lhs"] = vtable.add(_lhs, ClKernelArgRuntimeDescriptor(_lhs.arg_id, TensorArgType::Image_3D), "lhs"); - lut["rhs"] = vtable.add(_rhs, ClKernelArgRuntimeDescriptor(_rhs.arg_id, TensorArgType::Image_3D), "rhs"); - if(!_bias.is_empty()) // optional bias - { - lut["bias"] = vtable.add(_bias, ClKernelArgRuntimeDescriptor(_bias.arg_id, TensorArgType::Image_3D), "bias"); - } - lut["dst"] = vtable.add(_dst, ClKernelArgRuntimeDescriptor(_dst.arg_id, TensorArgType::Image_3D), "dst"); - - // Local build options - auto t_lhs_info = _blueprint->impl().get_kernel_argument_info(_lhs.arg_id); - auto t_rhs_info = _blueprint->impl().get_kernel_argument_info(_rhs.arg_id); - auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); - - auto has_alpha = !(helpers::float_ops::is_one(_desc.alpha)); - auto has_beta = _blueprint->impl().get_kernel_argument_info(_bias.arg_id) != nullptr; - auto reinterpret_input_as_3d = _desc.reinterpret_input_as_3d && _desc.depth_output_gemm3d == 0; - auto reinterpret_output_as_3d = !_desc.reinterpret_input_as_3d && _desc.depth_output_gemm3d != 0; - auto dont_slide_b = t_rhs_info->num_dimensions() < t_lhs_info->num_dimensions(); - - lut["K0"] = support::cpp11::to_string(_desc.rhs_info.k0); - - if(has_alpha) - { - lut["ALPHA"] = float_to_string_with_full_precision(_desc.alpha); - } - if(has_beta) - { - lut["BETA"] = float_to_string_with_full_precision(_desc.beta); - } - if(dont_slide_b) - { - lut["MATRIX_B_DEPTH"] = support::cpp11::to_string(t_rhs_info->dimension(2)); - } - - if(reinterpret_output_as_3d) - { - lut["HEIGHT_GEMM3D"] = support::cpp11::to_string(t_dst_info->dimension(1)); - lut["DEPTH_GEMM3D"] = support::cpp11::to_string(t_dst_info->dimension(2)); - } - else if(reinterpret_input_as_3d) - { - lut["HEIGHT_GEMM3D"] = support::cpp11::to_string(t_lhs_info->dimension(1)); - lut["DEPTH_GEMM3D"] = support::cpp11::to_string(t_lhs_info->dimension(2)); - } - - return lut; -} -} // namespace dynamic_fusion -} // namespace experimental -} // namespace arm_compute - -#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) \ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h deleted file mode 100644 index b282856b56..0000000000 --- 
a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright (c) 2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) - -#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLGEMMNATIVEKERNELCOMPONENT_H -#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLGEMMNATIVEKERNELCOMPONENT_H - -#include "arm_compute/core/Steps.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h" -#include "src/core/helpers/AutoConfiguration.h" - -namespace arm_compute -{ -namespace experimental -{ -namespace dynamic_fusion -{ -class ClGemmNativeKernelComponent : public IClKernelComponent -{ -public: - ClGemmNativeKernelComponent(const ClKernelBlueprint *blueprint, const GemmNativeDescriptor &desc, - const Link &lhs, const Link &rhs, const Link &dst, const Link &bias = Link{}) - : IClKernelComponent(blueprint), _desc{ desc }, _lhs{ lhs }, _rhs{ rhs }, _bias{ bias }, _dst{ dst } - { - } - - ComponentType get_component_type() const override; - std::set get_headers_list() const override; - std::string get_additional_macros() const override; - std::string get_component_code() const override; - Window get_window() const override; - ClKernelArgList get_args(); - CLBuildOptions generate_build_options() const override; - std::string generate_config_id() const override; - - virtual std::vector get_links() const override - { - return { _lhs, _rhs, _bias, _dst }; - } - - virtual TagLUT allocate_vars(SharedVarTable &vtable) const override; - - virtual std::string name() const override - { - return "gemm_mm_native_" + std::to_string(id()); - } - -private: - GemmNativeDescriptor _desc{}; - Link _lhs{}; - Link _rhs{}; - Link _bias{}; - Link _dst{}; -}; - -} // namespace dynamic_fusion -} // namespace experimental -} // namespace arm_compute -#endif // ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLGEMMNATIVEKERNELCOMPONENT_H - -#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) \ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClKernelComponents.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClKernelComponents.h index de02f948e9..c6716a0a23 100644 --- 
a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClKernelComponents.h +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClKernelComponents.h @@ -21,16 +21,15 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ #ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_CLKERNELCOMPONENTS_H #define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_CLKERNELCOMPONENTS_H #include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h" #include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseAddKernelComponent.h" -#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClGemmNativeKernelComponent.h" #include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h" -#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_CLKERNELCOMPONENTS_H - -#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) \ No newline at end of file +#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_CLKERNELCOMPONENTS_H \ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp index 5f023ba528..e0b210f4ed 100644 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp @@ -21,7 +21,9 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
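// The store components in ClStoreKernelComponents.cpp below derive their tile sizes from the
// blueprint's execution window instead of the old TileDescriptor, so the -DM0/-DN0 and
// -DPARTIAL_STORE_* build options reduce to the window steps and the remainder of the output
// extent. A sketch only: "blueprint", "dst_extent_x" and "dst_extent_y" are placeholders for the
// blueprint reference and the output extents along the windowed x/y dimensions:
const unsigned int n0         = blueprint.impl().get_execution_window().x().step(); // tile size along x
const unsigned int m0         = blueprint.impl().get_execution_window().y().step(); // tile size along y
const unsigned int partial_n0 = dst_extent_x % n0; // leftover elements in the last tile along x
const unsigned int partial_m0 = dst_extent_y % m0; // leftover elements in the last tile along y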
*/ -#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ #include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h" @@ -65,25 +67,36 @@ std::string ClStoreBlockBoundaryAwareKernelComponent::get_component_code() const CLBuildOptions ClStoreBlockBoundaryAwareKernelComponent::generate_build_options() const { auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); - auto tile_info = _blueprint->impl().get_tile_info(); + // auto tile_info = _blueprint->impl().get_tile_info(); CLBuildOptions build_opts{}; + const auto n0 = _blueprint->impl().get_execution_window().x().step(); + const auto m0 = _blueprint->impl().get_execution_window().y().step(); + const auto partial_m0 = t_dst_info->dimension(0) % m0; + const auto partial_n0 = t_dst_info->dimension(1) % n0; + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(t_dst_info->data_type())); - build_opts.add_option("-DM0=" + support::cpp11::to_string(tile_info.tile_dims.y())); - build_opts.add_option("-DN0=" + support::cpp11::to_string(tile_info.tile_dims.x())); - build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(tile_info.boundaries.y() % tile_info.tile_dims.y())); - build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(tile_info.boundaries.x() % tile_info.tile_dims.x())); + build_opts.add_option("-DM0=" + support::cpp11::to_string(m0)); + build_opts.add_option("-DN0=" + support::cpp11::to_string(n0)); + build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_m0)); + build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_n0)); return build_opts; } -ClStoreBlockBoundaryAwareKernelComponent::TagLUT ClStoreBlockBoundaryAwareKernelComponent::allocate_vars(SharedVarTable &vtable) const +void ClStoreBlockBoundaryAwareKernelComponent::allocate_shared_vars(SharedVarTable &vtable) const +{ + vtable.add(_src, _blueprint->impl().group(_src.arg_id), ClKernelArgDescriptor(_src.arg_id, ClKernelTensorArgType::Image_3D), "src"); + vtable.add(_dst, _blueprint->impl().group(_dst.arg_id), ClKernelArgDescriptor(_dst.arg_id, ClKernelTensorArgType::Image_3D), "dst"); +} + +ClStoreBlockBoundaryAwareKernelComponent::TagLUT ClStoreBlockBoundaryAwareKernelComponent::get_tag_lut(const SharedVarTable &vtable) const { return { { "meta_kernel_id", id() }, - { "src", vtable.add(_src, ClKernelArgRuntimeDescriptor(_src.arg_id, TensorArgType::Image_3D), "src") }, - { "dst", vtable.add(_dst, ClKernelArgRuntimeDescriptor(_dst.arg_id, TensorArgType::Image_3D), "dst") }, + { "src", vtable.get(_src) }, + { "dst", vtable.get(_dst) }, }; } @@ -96,19 +109,26 @@ std::string ClStoreIndirectWidthSelectKernelComponent::get_component_code() cons { return R"_( //------------------ START KERNEL {{meta_kernel_id}} STORE --------------------- + { + #define _IDST_WIDTH {{dst}}_w + #define _IDST_HEIGHT {{dst}}_h + TILE(uint, M0, 1, dst_indirect_y); - TILE(uint, M0, 1, dst_indirect_y); + // Calculate the destination indirect Y + LOOP_UNROLLING(int, i, 0, 1, M0, + { + dst_indirect_y[i].v = (uint)min(mout + i, (int)(_IDST_WIDTH * _IDST_HEIGHT) - 1); + dst_indirect_y[i].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT); + }) - // Calculate the destination indirect Y - LOOP_UNROLLING(int, i, 0, 1, M0, - { - dst_indirect_y[i].v = (uint)min(mout 
+ i, (int)({{dst_w}} * {{dst_h}}) - 1); - dst_indirect_y[i].v += bout * (int)({{dst_w}} * {{dst_h}}); - }) + bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0; - T_STORE_INDIRECT_WIDTH_SELECT({{DST_DATA_TYPE}}, M0, N0, PARTIAL_N0, {{DST_TENSOR_TYPE}}, {{dst}}, cout, {{dst}}_stride_y, PARTIAL_N0 != 0 && g_cond_x, {{src}}, dst_indirect_y); + T_STORE_INDIRECT_WIDTH_SELECT({{DST_DATA_TYPE}}, M0, N0, PARTIAL_N0, {{DST_TENSOR_TYPE}}, {{dst}}, cout, {{dst}}_stride_y, x_cond, {{src}}, dst_indirect_y); - //------------------ END KERNEL {{meta_kernel_id}} STORE --------------------- + #undef _IDST_WIDTH + #undef _IDST_HEIGHT + //------------------ END KERNEL {{meta_kernel_id}} STORE --------------------- + } )_"; } @@ -120,21 +140,24 @@ CLBuildOptions ClStoreIndirectWidthSelectKernelComponent::generate_build_options return build_opts; } -ClStoreIndirectWidthSelectKernelComponent::TagLUT ClStoreIndirectWidthSelectKernelComponent::allocate_vars(SharedVarTable &vtable) const +void ClStoreIndirectWidthSelectKernelComponent::allocate_shared_vars(SharedVarTable &vtable) const +{ + vtable.add(_src, _blueprint->impl().group(_src.arg_id), ClKernelArgDescriptor(_src.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "src"); + vtable.add(_dst, _blueprint->impl().group(_dst.arg_id), ClKernelArgDescriptor(_dst.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "dst"); +} + +ClStoreIndirectWidthSelectKernelComponent::TagLUT ClStoreIndirectWidthSelectKernelComponent::get_tag_lut(const SharedVarTable &vtable) const { TagLUT lut{}; - lut["meta_kernel_id"] = id(); - lut["src"] = vtable.add(_src, ClKernelArgRuntimeDescriptor(_src.arg_id, TensorArgType::Image_3D), "src"); - lut["dst"] = vtable.add(_dst, ClKernelArgRuntimeDescriptor(_dst.arg_id, TensorArgType::Tensor_4D_t_Buffer), "dst"); + // Arguments and global shared variables + lut["src"] = vtable.get(_src); + lut["dst"] = vtable.get(_dst); // Local build options - auto dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); - - lut["dst_w"] = dst_info->dimension(1); - lut["dst_h"] = dst_info->dimension(2); - + lut["meta_kernel_id"] = id(); lut["DST_TENSOR_TYPE"] = "BUFFER"; + const auto dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); lut["DST_DATA_TYPE"] = dst_info->data_type(); return lut; @@ -142,6 +165,4 @@ ClStoreIndirectWidthSelectKernelComponent::TagLUT ClStoreIndirectWidthSelectKern } // namespace dynamic_fusion } // namespace experimental -} // namespace arm_compute - -#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) \ No newline at end of file +} // namespace arm_compute \ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h index c7da8bd3e8..26883d7fa0 100644 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h +++ b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h @@ -21,7 +21,9 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
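For clarity on the boundary-aware store options computed above: PARTIAL_STORE_M0/N0 are simply whatever is left over when the destination extent is not a multiple of the block size. A minimal, self-contained sketch of that arithmetic (the helper below is hypothetical and not part of the patch):

// Hypothetical helper mirroring the PARTIAL_STORE_{M0,N0} arithmetic above:
// the boundary block handles whatever remains after tiling the dst extent by the block size.
constexpr unsigned int partial_block(unsigned int extent, unsigned int block_size)
{
    return (block_size == 0) ? 0 : (extent % block_size);
}
static_assert(partial_block(23, 4) == 3, "a 23-element extent tiled by 4 leaves 3 boundary elements");
static_assert(partial_block(24, 4) == 0, "no boundary block is needed when the extent divides evenly");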
*/ -#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ #ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLSTOREKERNELCOMPONENTS_H #define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLSTOREKERNELCOMPONENTS_H @@ -37,21 +39,21 @@ namespace dynamic_fusion class ClStoreBlockBoundaryAwareKernelComponent : public IClKernelComponent { public: - ClStoreBlockBoundaryAwareKernelComponent(const ClKernelBlueprint *blueprint, const Link &src, const Link &dst) + ClStoreBlockBoundaryAwareKernelComponent(ClKernelBlueprint *blueprint, const Link &src, const Link &dst) : IClKernelComponent(blueprint), _src{ src }, _dst{ dst } { } ComponentType get_component_type() const override; std::string get_component_code() const override; CLBuildOptions generate_build_options() const override; + TagLUT get_tag_lut(const SharedVarTable &vtable) const override; + void allocate_shared_vars(SharedVarTable &vtable) const override; virtual std::vector get_links() const override { return { _src, _dst }; } - virtual TagLUT allocate_vars(SharedVarTable &vtable) const override; - virtual std::string name() const override { return ""; @@ -65,21 +67,21 @@ private: class ClStoreIndirectWidthSelectKernelComponent : public IClKernelComponent { public: - ClStoreIndirectWidthSelectKernelComponent(const ClKernelBlueprint *blueprint, const Link &src, const Link &dst) + ClStoreIndirectWidthSelectKernelComponent(ClKernelBlueprint *blueprint, const Link &src, const Link &dst) : IClKernelComponent(blueprint), _src{ src }, _dst{ dst } { } ComponentType get_component_type() const override; std::string get_component_code() const override; CLBuildOptions generate_build_options() const override; + virtual TagLUT get_tag_lut(const SharedVarTable &vtable) const override; + void allocate_shared_vars(SharedVarTable &vtable) const override; virtual std::vector get_links() const override { return { _src, _dst }; } - virtual TagLUT allocate_vars(SharedVarTable &vtable) const override; - virtual std::string name() const override { return ""; @@ -93,6 +95,4 @@ private: } // namespace dynamic_fusion } // namespace experimental } // namespace arm_compute -#endif // ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLSTOREKERNELCOMPONENTS_H - -#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) \ No newline at end of file +#endif // ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLSTOREKERNELCOMPONENTS_H \ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/OperatorGraph.cpp b/src/core/experimental/dynamic_fusion/OperatorGraph.cpp new file mode 100644 index 0000000000..5dbf2f660d --- /dev/null +++ b/src/core/experimental/dynamic_fusion/OperatorGraph.cpp @@ -0,0 +1,236 @@ +/* + * Copyright (c) 2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ +#include "arm_compute/core/experimental/OperatorGraph.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.h" +#include "src/core/helpers/AutoConfiguration.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +namespace +{ +void check_dependency_graph_op_success(OperatorGraph &graph, const Status &status) +{ + if(!bool(status)) + { + graph.impl()->status = Status{ status.error_code(), "Cycles or loops are not allowed" }; + } +} + +// Check if there are more than one roots in the graph +void check_multiple_roots(OperatorGraph &graph) +{ + if(graph.impl()->graph.get_root_ops().size() > 1) + { + graph.impl()->status = Status{ ErrorCode::RUNTIME_ERROR, "Multiple roots are not allowed" }; + } +} + +void check_execution_shape(OperatorGraph &graph, const ITensorInfo &dst_info) +{ + const auto roots = graph.impl()->graph.get_root_ops(); + for(auto root : roots) + { + // We assume exactly 1 dst tensor for all operators + const auto root_info = graph.impl()->tensors[graph.impl()->graph.dst_tensors(root)[0]]->get_tensor_info(); + for(unsigned int dim = 0; dim < root_info->num_dimensions(); ++dim) + { + if(root_info->dimension(dim) != dst_info.dimension(dim)) + { + graph.impl()->status = Status{ ErrorCode::RUNTIME_ERROR, "Cannot change execution space" }; + return; + } + } + } +} +} // namespace + +OpTensor::OpTensor(Id id) + : _id{ id } +{ +} + +OpTensor::Id OpTensor::id() const +{ + return _id; +} + +bool operator<(const OpTensor &t0, const OpTensor &t1) +{ + return t0.id() < t1.id(); +} + +Operator::Operator(Id id) + : _id{ id } +{ +} + +Operator::Id Operator::id() const +{ + return _id; +} + +bool operator<(const Operator &op0, const Operator &op1) +{ + return op0.id() < op1.id(); +} + +OperatorGraph::OperatorGraph() + : _impl{ std::make_unique() } +{ +} + +OperatorGraph::~OperatorGraph() = default; + +OperatorGraph::Implementation *OperatorGraph::impl() +{ + return _impl.get(); +} + +const OperatorGraph::Implementation *OperatorGraph::impl() const +{ + return _impl.get(); +} + +Status validate(const OperatorGraph &graph) +{ + return graph.impl()->status; +} + +OpTensor 
add_tensor(OperatorGraph &graph, ITensorInfo &info) +{ + auto id = graph.impl()->graph.add_tensor(); + OpTensor op_tensor(id); + graph.impl()->add_tensor(id, &info); + return op_tensor; +} + +Operator add_op_conv2d(OperatorGraph &graph, const Conv2dDescriptor &desc, OpTensor input, OpTensor weights, OpTensor bias, OpTensor dst) +{ + // Check if map is empty as a complex operator can only be root + if(!graph.impl()->graph.get_root_ops().empty()) + { + graph.impl()->status = Status{ ErrorCode::RUNTIME_ERROR, "Cannot add multiple complex operators" }; + return Operator{}; + } + + std::pair status_id; + + if(bias.id() == -1) + { + status_id = graph.impl()->graph.add_operator({ input.id(), weights.id() }, { dst.id() }); + } + else + { + status_id = graph.impl()->graph.add_operator({ input.id(), weights.id(), bias.id() }, { dst.id() }); + } + + check_dependency_graph_op_success(graph, status_id.first); + + Operator op_node(status_id.second); + + // Infer TensorInfo + OpTensorContent *dst_tensor = graph.impl()->tensors[dst.id()].get(); + if(dst_tensor->get_tensor_info()->total_size() == 0) + { + auto src = graph.impl()->tensors[input.id()]->get_tensor_info(); + auto wts = graph.impl()->tensors[weights.id()]->get_tensor_info(); + auto shape = misc::shape_calculator::compute_deep_convolution_shape(src->tensor_shape(), src->data_layout(), wts->tensor_shape(), PadStrideInfo(desc.stride.x(), desc.stride.y(), desc.pad.left, + desc.pad.right, + desc.pad.top, desc.pad.bottom, DimensionRoundingType::FLOOR)); // use the default DimensionRoundingType + + auto_init_if_empty(*(dst_tensor->get_tensor_info()), src->clone()->set_tensor_shape(shape)); + } + + // Check execution space + auto dst_info = dst_tensor->get_tensor_info(); + check_execution_shape(graph, *dst_info); + + ITensorDescPack tensors; + tensors.add_const_tensor(ACL_SRC_0, graph.impl()->tensors[input.id()].get()); + tensors.add_const_tensor(ACL_SRC_1, graph.impl()->tensors[weights.id()].get()); + if(bias.id() != -1) + { + tensors.add_const_tensor(ACL_SRC_2, graph.impl()->tensors[bias.id()].get()); + } + tensors.add_const_tensor(ACL_DST_0, graph.impl()->tensors[dst.id()].get()); + + graph.impl()->add_node(status_id.second, desc, tensors); + check_multiple_roots(graph); + + return op_node; +} + +Operator add_op_conv2d(OperatorGraph &graph, const Conv2dDescriptor &desc, OpTensor input, OpTensor weights, OpTensor dst) +{ + return add_op_conv2d(graph, desc, input, weights, OpTensor(-1), dst); +} + +void force_conv2d_method(OperatorGraph &graph, Operator conv2d, ConvolutionMethod method) +{ + auto node = utils::cast::polymorphic_downcast(graph.impl()->operators[conv2d.id()].get()); + node->set_method(method); +} + +Operator add_op_elementwise_add(OperatorGraph &graph, const AddDescriptor &desc, OpTensor lhs, OpTensor rhs, OpTensor dst) +{ + auto id = graph.impl()->graph.add_operator({ rhs.id(), lhs.id() }, { dst.id() }); + check_dependency_graph_op_success(graph, id.first); + + Operator op_node(id.second); + + // Infer TensorInfo + auto node_lhs = graph.impl()->tensors[lhs.id()]->get_tensor_info(); + auto node_rhs = graph.impl()->tensors[rhs.id()]->get_tensor_info(); + OpTensorContent *node_dst = graph.impl()->tensors[dst.id()].get(); + + if(node_dst->get_tensor_info()->total_size() == 0) + { + const std::pair broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*node_rhs, *node_lhs); + auto_init_if_empty(*(node_dst->get_tensor_info()), node_lhs->clone()->set_tensor_shape(broadcast_pair.first)); + } + + // Check execution space + auto 
dst_info = node_dst->get_tensor_info(); + check_execution_shape(graph, *dst_info); + + ITensorDescPack tensors; + tensors.add_const_tensor(ACL_SRC_0, graph.impl()->tensors[lhs.id()].get()); + tensors.add_const_tensor(ACL_SRC_1, graph.impl()->tensors[rhs.id()].get()); + tensors.add_const_tensor(ACL_DST_0, graph.impl()->tensors[dst.id()].get()); + graph.impl()->add_node(id.second, desc, tensors); + check_multiple_roots(graph); + + return op_node; +} +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute \ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.cpp b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.cpp new file mode 100644 index 0000000000..7e9f6b870a --- /dev/null +++ b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.cpp @@ -0,0 +1,233 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
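To make the OperatorGraph front end above concrete, here is a minimal usage sketch for the conv2d + elementwise-add case it targets. Tensor shapes and descriptor values are illustrative assumptions; only the API calls themselves (add_tensor, add_op_conv2d, add_op_elementwise_add, validate, build) are taken from this patch.

void build_fused_conv2d_add_example()
{
    using namespace arm_compute;
    using namespace arm_compute::experimental::dynamic_fusion;

    // Illustrative tensor infos (shapes chosen arbitrarily; NHWC as required by the validators in this patch)
    TensorInfo input_info(TensorShape(32U, 16U, 16U), 1, DataType::F32);
    TensorInfo weights_info(TensorShape(32U, 3U, 3U, 64U), 1, DataType::F32);
    TensorInfo addend_info(TensorShape(64U, 14U, 14U), 1, DataType::F32);
    TensorInfo conv_out_info{}; // left empty: add_op_conv2d infers it via auto_init_if_empty
    TensorInfo dst_info{};      // likewise inferred by add_op_elementwise_add
    input_info.set_data_layout(DataLayout::NHWC);
    weights_info.set_data_layout(DataLayout::NHWC);
    addend_info.set_data_layout(DataLayout::NHWC);

    OperatorGraph op_graph;
    OpTensor t_input   = add_tensor(op_graph, input_info);
    OpTensor t_weights = add_tensor(op_graph, weights_info);
    OpTensor t_addend  = add_tensor(op_graph, addend_info);
    OpTensor t_acc     = add_tensor(op_graph, conv_out_info);
    OpTensor t_dst     = add_tensor(op_graph, dst_info);

    Conv2dDescriptor conv_desc{};
    conv_desc.stride = Size2D(1, 1); // field name as used elsewhere in this patch; value illustrative
    AddDescriptor add_desc{};
    add_op_conv2d(op_graph, conv_desc, t_input, t_weights, t_acc);
    add_op_elementwise_add(op_graph, add_desc, t_acc, t_addend, t_dst);

    Status st = validate(op_graph); // surfaces any error recorded while the graph was being built
    // A ClWorkload can then be built from the validated graph (see ClWorkload.cpp later in this patch), e.g.:
    // ClWorkload workload{}; st = build(workload, op_graph, ClWorkloadContext{});
    ARM_COMPUTE_UNUSED(st);
}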
+ */ + +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ +#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +namespace +{ +std::vector> get_combinations(const std::vector &sorted_fgs) +{ + ARM_COMPUTE_ERROR_ON(sorted_fgs.size() <= 1); + std::vector> combo; + for(size_t i = 0; i < sorted_fgs.size() - 1; ++i) + { + for(size_t j = i + 1; j < sorted_fgs.size(); ++j) + { + combo.push_back(std::make_pair(sorted_fgs.at(i), sorted_fgs.at(j))); + } + } + return combo; +} +} // namespace +std::vector traverse(const ClKernelFusionGroup &group) +{ + std::vector kernels; + const auto sorted = group.graph.topological_sort(); + for(const auto &pack : sorted.second) + { + kernels.push_back(group.fused_kernels.at(pack.op)); + } + return kernels; +} + +std::vector traverse(const ClFusedKernelGraph &graph) +{ + std::vector kernels; + const auto sorted = graph.fg_dependency.topological_sort(); + for(const auto &pack : sorted.second) + { + kernels.push_back(graph.fusion_groups.at(pack.op).get()); + } + return kernels; +} + +std::vector traverse(ClFusedKernelGraph &graph) +{ + std::vector kernels; + const auto sorted = graph.fg_dependency.topological_sort(); + for(const auto &pack : sorted.second) + { + kernels.push_back(graph.fusion_groups.at(pack.op).get()); + } + return kernels; +} + +std::pair init_fusion_graph(const ClKernelGraph &kernel_graph) +{ + ClFusedKernelGraph fused_kernel_graph{}; + fused_kernel_graph.original_graph = &kernel_graph; // Keep a pointer to the original kernel graph + fused_kernel_graph.fg_dependency = DependencyGraph(); + // Initialize all fusion groups + for(const auto &kernel : traverse(kernel_graph)) + { + fused_kernel_graph.add_fusion_group({ kernel }); + } + return { Status{}, fused_kernel_graph }; +} + +Status fuse(ClFusedKernelGraph &fused_kernel_graph) +{ + // A naive fusion algorithm that's guaranteed to find the optimal pattern if there are no branches + // If there are branches, the algorithm cannot guarantee optimality as it doesn't perform any searches + + bool fusion_found = false; + do + { + fusion_found = false; + const auto sorted_fgs = traverse(fused_kernel_graph); + if(sorted_fgs.size() <= 1) + { + // Only one or zero fusion group, thus no need to perform fusion + return Status{}; + } + auto fgs_combo = get_combinations(sorted_fgs); + for(auto fgs : fgs_combo) + { + auto fg0 = fgs.first; + auto fg1 = fgs.second; + const auto st = fused_kernel_graph.can_fuse(*fg0, *fg1); + if(bool(st)) + { + const auto st = fused_kernel_graph.fuse(*fg0, *fg1); + if(!bool(st)) + { + return st; + } + fusion_found = true; + break; + } + } + } + while(fusion_found); + return Status{}; +} +Status generate_store(ClKernelBlueprint &bp, const ClFusedKernelGraph &fused_kernel_graph, const ClKernelFusionGroup &fg) +{ + Status st{}; + for(const auto &dst_t_id : fused_kernel_graph.fg_dependency.dst_tensors(fg.id)) + { + const auto dst_t = fused_kernel_graph.original_graph->get_tensor(dst_t_id); + + /// NOTE: dst tensor must have already been added to the blueprint at this point + ArgumentID dst_id; + st = add_tensor(bp, dst_t->desc, dst_id, dst_t->id); + if(!bool(st)) + { + return st; + } + /// NOTE: the extra dst tensor is needed as the store kcomp requires 2 tensors.
But this is irrelevant to the fused kernel graph + /// since both tensors share the exact same info and kernel arg descriptor + ArgumentID dst_dst_id; + st = add_tensor(bp, dst_t->desc, dst_dst_id); + if(!bool(st)) + { + return st; + } + /// NOTE: Update the merge point map to link dst_dst_id with dst_t->id instead. + /// This is required because the get_arguments() returned by the blueprint returns the dst tensor added by the store component + st = update_merge_point(bp, dst_dst_id, dst_t->id); + if(!bool(st)) + { + return st; + } + st = add_kcomp_store(bp, fg.get_root_kernel()->config().store_type, dst_id, dst_dst_id); + if(!bool(st)) + { + return st; + } + } + return st; +} + +Status generate(ClWorkload &workload, const ClWorkloadContext &ctx, const ClFusedKernelGraph &fused_kernel_graph) +{ + workload.context = ctx; + for(const auto &fg : traverse(fused_kernel_graph)) + { + ClKernelBlueprint bp{}; + for(const auto &kernel : traverse(*fg)) + { + const auto st = kernel->generate(bp); + if(!bool(st)) + { + return st; + } + } + auto st = set_tile_info(bp, fg->get_root_kernel()->config().tile_desc); + if(!bool(st)) + { + return st; + } + st = generate_store(bp, fused_kernel_graph, *fg); + if(!bool(st)) + { + return st; + } + + ClKernelCode code{}; + st = build(code, ClCodeBuilderContext{ ctx.gpu_info }, bp); + if(!bool(st)) + { + return st; + } + const auto bp_graph = get_dependency_graph(bp); + + // Get tensor info + std::vector workload_src_tensors{}; + for(const auto &src_t_id : fused_kernel_graph.fg_dependency.src_tensors(fg->id)) + { + const auto src_t = fused_kernel_graph.original_graph->get_tensor(src_t_id); + // Get corresponding kernel arg descriptor + const auto arg_desc = code.arguments.at(bp_graph.get_merge_points().at(src_t->id)); + const auto kernel_t_id = workload.add_workload_tensor(src_t->desc, src_t->memory_type, src_t->memory_info, arg_desc, src_t->id); + workload_src_tensors.push_back(kernel_t_id); + } + std::vector workload_dst_tensors{}; + for(const auto &dst_t_id : fused_kernel_graph.fg_dependency.dst_tensors(fg->id)) + { + const auto dst_t = fused_kernel_graph.original_graph->get_tensor(dst_t_id); + // Get corresponding kernel arg descriptor + const auto arg_desc = code.arguments.at(bp_graph.get_merge_points().at(dst_t->id)); + const auto kernel_t_id = workload.add_workload_tensor(dst_t->desc, dst_t->memory_type, dst_t->memory_info, arg_desc, dst_t->id); + workload_dst_tensors.push_back(kernel_t_id); + } + + workload.add_unit_workload(fg->get_root_kernel()->config().stage, code, workload_src_tensors, workload_dst_tensors); + } + + return Status{}; +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute \ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.h b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.h new file mode 100644 index 0000000000..4bd3cd9d8b --- /dev/null +++ b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.h @@ -0,0 +1,453 @@ +/* + * Copyright (c) 2022 Arm Limited. 
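As an aside on the fuse() driver above: it repeatedly re-sorts the fusion groups and tries every ordered candidate pair until no further merge succeeds. A standalone illustration of the pair enumeration performed by get_combinations(), with plain ints standing in for fusion-group pointers (a sketch only, not library code):

#include <cstddef>
#include <utility>
#include <vector>

// Same pairing logic as get_combinations() above, on plain ints for readability.
std::vector<std::pair<int, int>> candidate_pairs(const std::vector<int> &sorted_fgs)
{
    std::vector<std::pair<int, int>> combo{};
    for(std::size_t i = 0; i + 1 < sorted_fgs.size(); ++i)
    {
        for(std::size_t j = i + 1; j < sorted_fgs.size(); ++j)
        {
            combo.emplace_back(sorted_fgs[i], sorted_fgs[j]);
        }
    }
    return combo; // e.g. {0, 1, 2} -> (0,1), (0,2), (1,2)
}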
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ +#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLFUSEDKERNELGRAPH_H +#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLFUSEDKERNELGRAPH_H +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/experimental/DependencyGraph.h" +#include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h" +#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h" +#include "support/DeepCopy.h" + +#include + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +struct ClKernelFusionGroup; + +/** A const view of a subgraph of the @ref ClKernelGraph to be fused together + * + */ +struct ClKernelFusionGroup +{ +public: + using Id = DependencyGraph::Id; + + ClKernelFusionGroup() = default; + ClKernelFusionGroup(Id id) + : id{ id }, graph{}, fused_kernels{}, tensors{} + { + } + ~ClKernelFusionGroup() = default; + + void set_id(Id i) + { + id = i; + } + + Id add_fused_kernel(const ClKernel *kernel) + { + /// PRE: Acyclicity ensured by DependencyGraph + /// PRE: Connectedness ensured by DependencyGraph + /// PRE: Single-rootedness ensured by User + std::vector src_tensors; + for(const auto t : kernel->tensors().get_const_src_tensors()) + { + auto id = graph.add_tensor(t->id); + if(tensors.find(id) == tensors.end()) + { + tensors[id] = t; + } + src_tensors.push_back(id); + } + std::vector dst_tensors; + for(const auto t : kernel->tensors().get_const_dst_tensors()) + { + auto id = graph.add_tensor(t->id); + if(tensors.find(id) == tensors.end()) + { + tensors[id] = t; + } + dst_tensors.push_back(id); + } + auto id = graph.add_operator(src_tensors, dst_tensors); + fused_kernels[id.second] = kernel; + return id.second; + } + + const ClKernel *get_root_kernel() const + { + auto root_kernels = graph.get_root_ops(); + ARM_COMPUTE_ERROR_ON(root_kernels.size() != 1); + return fused_kernels.at(root_kernels.at(0)); + } + + std::vector get_src_tensors() const + { + std::vector src_tensors; + for(auto tensor_id : graph.src_tensors()) + { + src_tensors.push_back(tensors.at(tensor_id)); + } + return src_tensors; + } + + std::vector get_dst_tensors() const + { + std::vector dst_tensors; + for(auto tensor_id : 
graph.dst_tensors()) + { + dst_tensors.push_back(tensors.at(tensor_id)); + } + return dst_tensors; + } + + friend bool operator==(const ClKernelFusionGroup &fg0, const ClKernelFusionGroup &fg1) + { + return fg0.id == fg1.id && fg0.graph == fg1.graph && fg0.fused_kernels == fg1.fused_kernels && fg0.tensors == fg1.tensors; + } + + Id id{}; + DependencyGraph graph{}; // A subgraph of the original ClKernelGraph + std::map fused_kernels{}; + std::map tensors{}; +}; + +std::vector traverse(const ClKernelFusionGroup &group); + +struct ClFusedKernelGraph +{ +public: + using Id = DependencyGraph::Id; + + using KernelFusionGroupMap = std::map>; + + ClFusedKernelGraph() = default; + ~ClFusedKernelGraph() = default; + ClFusedKernelGraph(const ClFusedKernelGraph &graph) = default; + ClFusedKernelGraph &operator=(const ClFusedKernelGraph &graph) = default; + ClFusedKernelGraph(ClFusedKernelGraph &&graph) = default; + ClFusedKernelGraph &operator=(ClFusedKernelGraph &&graph) = default; + + friend bool operator==(const ClFusedKernelGraph &graph0, const ClFusedKernelGraph &graph1) + { + /// NOTE: fg_dependency may change based on the order of fusion, and thus is omitted in the comparison. + /// The fusion groups can already guarantee the equivalence of fusion + /// In the future we may want to enforce a stronger equivalence by implementing topological comparison between @ref DependencyGraph s + return graph0.original_graph == graph1.original_graph && graph0.fusion_groups == graph1.fusion_groups; + } + + Id add_fusion_group(const std::vector &fused_kernels) + { + auto fg = utils::memory::make_deep_unique(); + for(const auto k : fused_kernels) + { + fg->add_fused_kernel(k); + } + const auto src_tensors = fg->get_src_tensors(); + const auto dst_tensors = fg->get_dst_tensors(); + std::vector inputs{}; + std::transform(std::begin(src_tensors), std::end(src_tensors), std::back_inserter(inputs), [this](auto kernel) + { + return fg_dependency.add_tensor(kernel->id); + }); + std::vector outputs{}; + std::transform(std::begin(dst_tensors), std::end(dst_tensors), std::back_inserter(outputs), [this](auto kernel) + { + return fg_dependency.add_tensor(kernel->id); + }); + const auto id = fg_dependency.add_operator(inputs, outputs); + fg->set_id(id.second); + fusion_groups[id.second] = std::move(fg); + return id.second; + } + + Status fuse(ClKernelFusionGroup &fg0, ClKernelFusionGroup &fg1) + { + /// PRE: Already checked by can_fuse, and thus all the INVs and ASSUMPTIONS still hold + ClKernelFusionGroup *fg_src{}; + ClKernelFusionGroup *fg_dst{}; + // Find fg_src (parent / root) and fg_dst (child / non-root) + if(is_in(fg1.id, fg_dependency.dst_ops(fg0.id))) + { + fg_src = &fg0; + fg_dst = &fg1; + } + else if(is_in(fg0.id, fg_dependency.dst_ops(fg1.id))) + { + fg_src = &fg1; + fg_dst = &fg0; + } + else + { + return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: Not directly connected fusion groups cannot be fused together" }; + } + + for(const auto &t : fg_dependency.src_tensors(fg_dst->id)) + { + if(!is_in(t, fg_dependency.dst_tensors(fg_src->id))) + { + // Link any incoming tensors of fg_dst, that ARE NOT in between fg_src and fg_dst, to fg_src + + // Before: + // fg_src + // | + // .. 
t1 + // | | + // -> fg_dst <- + // + // After: + // fg_src <---t1 + // + const auto st = link_src_tensors(fg_src->id, { t }); + if(!bool(st)) + { + return st; + } + } + else + { + const auto dst_fgs = fg_dependency.dst_ops_from_tensor(t); + if(dst_fgs.size() == 1U && dst_fgs.at(0) == fg_dst->id) + { + // Remove any incoming tensors of fg_dst, that ARE in between fg_src and fg_dst + // AND that are not connected to any other outgoing fgs (Note that they cannot connect to any other incoming fgs as all tensors can have at most 1 incoming fg (ASSUMPTION 3)) + + // Before: + // fg_src + // | + // t0 + // | + // -> fg_dst + // + // After: + // fg_src + // + const auto st = remove_fg_tensor(t); + if(!bool(st)) + { + return st; + } + } + else + { + // If the tensors ARE in between fg_src and fg_dst + // BUT have any other outgoing fgs than fg_dst, then we leave it as a dst tensor to the fused fg_src + + // Before: + // fg_src + // | + // t0 + // | + // |----------- + // | | + // -> fg_dst -> fg_other + // + // After: + // fg_src + // | + // t0 + // | + // -> fg_other + // + + // Note that this may seem like a case we shouldn't fuse. But actually all it means is that t0 is an + // intermediate tensor between the fused fg_src and fg_dst, but only that we also STORE it to memory + // so that any unfused fg's (fg_other in this case) can read it. + // So all this means that we not only can STORE the tensors at the "end" of a fusion group, + // but also any other tensors that are not source tensors. And all tensors that are STORED (exported), + // can be termed "dst tensors" to a fusion group + void(); + } + } + } + + for(const auto &t : fg_dependency.dst_tensors(fg_dst->id)) + { + // Link any outgoing tensors of fg_dst to fg_src + + // Before: + // fg_src + // | + // .. 
+ // | + // -> fg_dst + // | + // |-------- + // | | + // |-> t0 |-> t1 + // + // After: + // fg_src + // | + // |-------- + // | | + // |-> t0 |-> t1 + // + const auto st = link_dst_tensors(fg_src->id, { t }); + if(!bool(st)) + { + return st; + } + } + + // Merge fg_dst's graph into fg_src's graph + for(const auto kernel : traverse(*fg_dst)) + { + fg_src->add_fused_kernel(kernel); + } + + const auto st = remove_fg(fg_dst->id); + return st; + } + Status can_fuse(const ClKernelFusionGroup &fg0, const ClKernelFusionGroup &fg1) const + { + /// ASSUMPTION0: All tensors have 0 or 1 incoming kernel + /// ASSUMPTION1: All kernels have exactly 1 dst tensor (Temporary, can be lifted once we start supporting multi-dst kernels) + /// Note that this does not apply to fusion groups + /// ASSUMPTION2: Simple kernels' tile infos can be overridden (shared with) the root kernel's + /// ASSUMPTION3: Extension of ASSUMPTION0: All tensors have 0 or 1 incoming fusion group + /// INV0: All Fusion groups have a single root + /// INV1: All Fusion groups have no cycles or loops within themselves <- guaranteed by the underlying ClKernelGraph having no cycles or loops; enforced by DependencyGraph + /// INV2: The ClKernelFusionGroup itself has no cycles or loops <- enforced by DependencyGraph + /// INV3: All non-roots are Simple kernels + /// INV4: All non-roots' dst tensors have the same shape as that of the root kernel + /// INV5: All kernels within a fusion group have the same UnitWorkloadStage + const ClKernelFusionGroup *fg_src {}; + const ClKernelFusionGroup *fg_dst{}; + + // Check 0: Ensure fg0 and fg1 are "directly connected": one of them is a direct parent of the other + // This guarantees INV0 + // This also finds fg_src (parent / root) and fg_dst (child / non-root) + if(is_in(fg1.id, fg_dependency.dst_ops(fg0.id))) + { + fg_src = &fg0; + fg_dst = &fg1; + } + else if(is_in(fg0.id, fg_dependency.dst_ops(fg1.id))) + { + fg_src = &fg1; + fg_dst = &fg0; + } + else + { + return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: Not directly connected fusion groups cannot be fused together" }; + } + + // Find unconnected tensors between fg_src and fg_dst + std::vector unconnected_tensors{}; + for(const auto &t : fg_dependency.dst_tensors(fg_src->id)) + { + if(!is_in(t, fg_dependency.src_tensors(fg_dst->id))) + { + unconnected_tensors.push_back(t); + } + } + + // Check 1: Any unconnected tensor cannot be an ancestor of fg_dst + // This guarantees INV2: That is, the fused graph does not have any cycles or loops between different fusion groups + for(const auto &t : unconnected_tensors) + { + if(fg_dependency.path_exists_from_tensor_to_op(t, fg_dst->id)) + { + return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: the fusion would result in cycles or loops" }; + } + } + + // Check 2: All non-root fgs are simple. Ensure INV3 + if(fg_dst->get_root_kernel()->complexity() != Complexity::Simple) + { + return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: only root kernel can be a complex kernel" }; + } + + // Check 3: All non-roots' dst tensors have the same shape as that of the root kernel.
Ensure INV4 + const auto root_kernel_dst_tensors = fg_dependency.dst_tensors(fg_src->id); + ARM_COMPUTE_ERROR_ON(root_kernel_dst_tensors.size() != 1); // (ASSUMPTION 1: All kernels have exactly 1 dst tensor) + const auto root_kernel_dst_tensor_info = original_graph->get_tensor(root_kernel_dst_tensors[0])->desc; + + for(const auto &t : fg_dependency.dst_tensors(fg_dst->id)) + { + const auto t_info = original_graph->get_tensor(t)->desc; + if(detail::have_different_dimensions(root_kernel_dst_tensor_info->tensor_shape(), t_info->tensor_shape(), 0)) + { + return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: all non roots' dst tensors should have the same shape as that of the root kernel" }; + } + } + + // Check 4: All kernels within a fg have the same UnitWorkloadStage. Ensure INV5 + if(!(fg_src->get_root_kernel()->config().stage == fg_dst->get_root_kernel()->config().stage)) + { + return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: all kernels within a fusion group should have the same UnitWorkloadStage" }; + } + + return Status{}; + } + + const ClKernelGraph *original_graph{}; + DependencyGraph fg_dependency{}; + KernelFusionGroupMap fusion_groups{}; + // Note: no need to store tensor pointers in the ClFusedKernelGraph, as they are stored inside the individual fusion groups. + +private: + Status link_src_tensors(Id fg, const std::vector &src_tensors) + { + for(auto t : src_tensors) + { + fg_dependency.link_input(fg, t); + } + return Status{}; + } + Status link_dst_tensors(Id fg, const std::vector &dst_tensors) + { + for(auto t : dst_tensors) + { + fg_dependency.link_output(fg, t); + } + return Status{}; + } + Status remove_fg(Id fg) + { + fg_dependency.remove_operator(fg); + fusion_groups.erase(fg); + return Status{}; + } + Status remove_fg_tensor(Id tensor) + { + fg_dependency.remove_tensor(tensor); + return Status{}; + } +}; + +std::vector traverse(const ClFusedKernelGraph &graph); +std::vector traverse(ClFusedKernelGraph &graph); + +std::pair init_fusion_graph(const ClKernelGraph &kernel_graph); + +Status fuse(ClFusedKernelGraph &fused_kernel_graph); + +Status generate_store(ClKernelBlueprint &bp, const ClFusedKernelGraph &fused_kernel_graph, const ClKernelFusionGroup &fg); + +Status generate(ClWorkload &workload, const ClWorkloadContext &ctx, const ClFusedKernelGraph &fused_kernel_graph); + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLFUSEDKERNELGRAPH_H \ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelDescriptors.h b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelDescriptors.h new file mode 100644 index 0000000000..cdd2b2e552 --- /dev/null +++ b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelDescriptors.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software.
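Putting the pieces of ClFusedKernelGraph.h together, the intended flow is: build a ClKernelGraph, seed one fusion group per kernel, then let fuse() merge whatever can_fuse() permits. A minimal sketch of that driver (the kernel_graph argument is assumed to be already populated elsewhere):

#include <tuple> // for std::tie

// Sketch only: drives the fusion pass declared above over an existing ClKernelGraph.
Status run_fusion_pass(const ClKernelGraph &kernel_graph, ClFusedKernelGraph &fused_graph)
{
    Status st{};
    std::tie(st, fused_graph) = init_fusion_graph(kernel_graph); // one fusion group per kernel
    if(!bool(st))
    {
        return st;
    }
    return fuse(fused_graph); // greedily merges directly connected groups while can_fuse() allows it
}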
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ +#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLKERNELDESCRIPTORS_H +#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLKERNELDESCRIPTORS_H + +#include "arm_compute/core/experimental/OperatorGraph.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +struct ClDirectConv2dKernelDescriptor +{ + friend bool operator==(const ClDirectConv2dKernelDescriptor &desc0, const ClDirectConv2dKernelDescriptor &desc1) + { + return desc0.conv2d == desc1.conv2d; + } + Conv2dDescriptor conv2d{}; +}; + +struct ClEltwiseAddKernelDescriptor +{ + friend bool operator==(const ClEltwiseAddKernelDescriptor &desc0, const ClEltwiseAddKernelDescriptor &desc1) + { + return desc0.add == desc1.add; + } + AddDescriptor add{}; +}; +struct ClActivationKernelDescriptor +{ + friend bool operator==(const ClActivationKernelDescriptor &, const ClActivationKernelDescriptor &) + { + return true; + } +}; + +enum class ClippingStrategy +{ + TOP_LEFT, + TOP_RIGHT, + BOTTOM_LEFT, + BOTTOM_RIGHT, +}; +/** Component: Store */ +struct TileDescriptor +{ + Size2D tile_dims{}; + Size2D boundaries{}; + ClippingStrategy clipping{ ClippingStrategy::TOP_LEFT }; + + TileDescriptor() + { + } + + TileDescriptor(Size2D dims, const Size2D &bound, const ClippingStrategy &clip) + : tile_dims(dims), boundaries(bound), clipping(clip) + { + } + + bool empty() const + { + return (tile_dims.area() == 0) || (boundaries.area() == 0); + } + friend bool operator==(const TileDescriptor &tile0, const TileDescriptor &tile1) + { + return tile0.tile_dims == tile1.tile_dims && tile0.boundaries == tile1.boundaries && tile0.clipping == tile1.clipping; + } +}; +enum class StoreType +{ + VStore, + VStorePartial, + StoreRow, + ConvertStoreRow, + StoreBlock, + ConvertStoreBlock, + StoreRowPartial, + StoreBlockPartial, + StoreBlockBoundaryAware, + StoreVectorSelect, + TStoreIndirectWidthSelect +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLKERNELDESCRIPTORS_H \ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.cpp b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.cpp new file mode 100644 index 0000000000..8aaf0946bb --- /dev/null +++ b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.cpp @@ -0,0 +1,219 @@ +/* + * Copyright (c) 2022 Arm Limited. 
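A quick illustration of the TileDescriptor defined above (values are arbitrary): tile_dims is the per-iteration processing block and boundaries the output region it tiles, with the clipping strategy deciding which corner the partial blocks fall on.

void tile_descriptor_example()
{
    // Sketch: a 4x4 processing block over a 23x17 output region, clipped from the top-left corner.
    const TileDescriptor tile_desc(Size2D(4, 4), Size2D(23, 17), ClippingStrategy::TOP_LEFT);
    ARM_COMPUTE_ERROR_ON(tile_desc.empty()); // empty() is true only when either area is zero
}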
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ +#include "arm_compute/core/utils/misc/ShapeCalculator.h" + +#include "src/core/CL/CLValidate.h" +#include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h" +#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h" + +#include "support/Cast.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +Status ClDirectConv2dKernel::generate(ClKernelBlueprint &bp) const +{ + const auto input = _tensors.get_const_tensor(TensorType::ACL_SRC_0); + const auto weight = _tensors.get_const_tensor(TensorType::ACL_SRC_1); + const auto bias = _tensors.get_const_tensor(TensorType::ACL_SRC_2); + const auto dst = _tensors.get_const_tensor(TensorType::ACL_DST_0); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weight, dst); + ArgumentID input_id; + add_tensor(bp, input->desc, input_id, input->id); + ArgumentID weight_id; + add_tensor(bp, weight->desc, weight_id, weight->id); + ArgumentID bias_id = g_arg_placeholder; + if(bias != nullptr) + { + add_tensor(bp, bias->desc, bias_id, bias->id); + } + ArgumentID dst_id; + add_tensor(bp, dst->desc, dst_id, dst->id); + + add_kcomp_direct_conv2d(bp, desc, input_id, weight_id, bias_id, dst_id); + return Status{}; +} +Status ClDirectConv2dKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ClDirectConv2dKernelDescriptor &conv2d_desc) +{ + // 1. 
Check validity + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); + // Matching data type + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + if(biases != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases); + } + + // Matching data layout + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst); + if(biases != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, biases); + } + + // All tensor infos are initialized + ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON(weights->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0); + if(biases != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON(biases->tensor_shape().total_size() == 0); + } + // Device requirements are met + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); + // weights shape is correct + const DataLayout data_layout = src->data_layout(); + const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != src->dimension(channel_idx), "Weights feature map dimension should match the respective src's one"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4, "Weights can be at most 4 dimensional"); + + // dst shape is correct + PadStrideInfo legacy_pad_stride(conv2d_desc.conv2d.stride.x(), conv2d_desc.conv2d.stride.y(), conv2d_desc.conv2d.pad.left, conv2d_desc.conv2d.pad.right, conv2d_desc.conv2d.pad.top, + conv2d_desc.conv2d.pad.bottom, DimensionRoundingType{}); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), + misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, legacy_pad_stride)); + + // biases shape is correct + if(biases != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->dimension(0) != weights->dimension(3), + "Biases size and number of dst feature maps should match"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->num_dimensions() > 1, + "Biases should be one dimensional"); + } + + // 2. Check support level + // Data type + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); + // Data layout + ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC); + + return Status{}; +} + +bool ClDirectConv2dKernel::operator==(const ClKernel &other) const +{ + const auto converted = *utils::cast::polymorphic_downcast(&other); + return config() == other.config() && tensors() == other.tensors() && desc == converted.desc; +} + +Status ClAddKernel::generate(ClKernelBlueprint &bp) const +{ + const auto lhs = _tensors.get_const_tensor(TensorType::ACL_SRC_0); + const auto rhs = _tensors.get_const_tensor(TensorType::ACL_SRC_1); + const auto dst = _tensors.get_const_tensor(TensorType::ACL_DST_0); + ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); + ArgumentID lhs_id; + add_tensor(bp, lhs->desc, lhs_id, lhs->id); + ArgumentID rhs_id; + add_tensor(bp, rhs->desc, rhs_id, rhs->id); + ArgumentID dst_id; + add_tensor(bp, dst->desc, dst_id, dst->id); + + add_kcomp_eltwise_add(bp, desc, lhs_id, rhs_id, dst_id); + return Status{}; +} + +Status ClAddKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst) +{ + // 1. 
Check validity + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst); + + // Matching data type + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst); + + // Matching data layout + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(lhs, rhs); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(lhs, dst); + + // All tensor infos are initialized + ARM_COMPUTE_RETURN_ERROR_ON(lhs->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON(rhs->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0); + + // Device requirements are met + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(lhs); + + const bool in_place = (lhs == dst) || (rhs == dst); + const bool src0_in_place = in_place && (lhs == dst); + + // dst shape is correct + const TensorShape out_shape = TensorShape::broadcast_shape(lhs->tensor_shape(), rhs->tensor_shape()); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst"); + if(in_place) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, src0_in_place ? lhs->tensor_shape() : rhs->tensor_shape(), 0), + "Wrong shape for dst, cannot do in_place calculation"); + } + + // 2. Check support level + + // Data type + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16); + + // Data layout + ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(lhs, DataLayout::NHWC); + + return Status{}; +} + +bool ClAddKernel::operator==(const ClKernel &other) const +{ + const auto converted = *utils::cast::polymorphic_downcast(&other); + return config() == other.config() && tensors() == other.tensors() && desc == converted.desc; +} + +std::vector traverse(const ClKernelGraph &graph) +{ + std::vector kernels; + const auto sorted = graph.graph.topological_sort(); + for(const auto &pack : sorted.second) + { + kernels.push_back(graph.kernels.at(pack.op).get()); + } + return kernels; +} +std::vector traverse(ClKernelGraph &graph) +{ + std::vector kernels; + const auto sorted = graph.graph.topological_sort(); + for(const auto &pack : sorted.second) + { + kernels.push_back(graph.kernels.at(pack.op).get()); + } + return kernels; +} +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute \ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h new file mode 100644 index 0000000000..1e14afb266 --- /dev/null +++ b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h @@ -0,0 +1,240 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
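The validators above are static and can be called before any kernel object exists; for example, a caller could pre-check an elementwise add as sketched below (tensor shapes are illustrative assumptions, only the validate() signature is taken from this patch):

void precheck_add_example()
{
    // Sketch: validate an F32 NHWC elementwise add before building the kernel graph.
    TensorInfo lhs(TensorShape(64U, 14U, 14U), 1, DataType::F32);
    lhs.set_data_layout(DataLayout::NHWC);
    TensorInfo rhs = lhs; // same shape: trivially broadcast-compatible
    TensorInfo dst = lhs;
    const Status st = ClAddKernel::validate(&lhs, &rhs, &dst);
    ARM_COMPUTE_ERROR_THROW_ON(st);
}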
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ +#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLKERNELGRAPH_H +#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLKERNELGRAPH_H + +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/experimental/ClWorkload.h" +#include "arm_compute/core/experimental/DependencyGraph.h" +#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelDescriptors.h" +#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ITensorDescPack.h" +#include "support/DeepCopy.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +struct ClKernelGraph; +class ClKernelBlueprint; + +enum class Complexity +{ + Simple, + Complex +}; + +/** Configurations for ClKernel + * + */ +struct ClKernelConfig +{ + UnitWorkloadStage stage{}; + TileDescriptor tile_desc{}; + StoreType store_type{}; + friend bool operator==(const ClKernelConfig &config0, const ClKernelConfig &config1) + { + return config0.stage == config1.stage && config0.tile_desc == config1.tile_desc && config0.store_type == config1.store_type; + } +}; + +struct ClKernelTensor +{ +public: + using Id = DependencyGraph::Id; + ClKernelTensor() = default; + ClKernelTensor(Id id, ITensorInfo *desc, MemoryType memory_type, const AuxMemoryInfo &memory_info) + : id{ id }, desc{ desc }, memory_type{ memory_type }, memory_info{ memory_info } + { + } + bool operator==(const ClKernelTensor &other) const + { + return desc == other.desc; + } + + Id id{}; + ITensorInfo *desc{}; + MemoryType memory_type{}; + AuxMemoryInfo memory_info{}; +}; + +struct ClKernel +{ +public: + using Id = DependencyGraph::Id; + ClKernel() = default; + virtual ~ClKernel() = default; + ClKernel(const ClKernel &kernel) = default; + ClKernel &operator=(const ClKernel &kernel) = default; + ClKernel(ClKernel &&kernel) = default; + ClKernel &operator=(ClKernel &&kernel) = default; + ClKernel(const ClKernelGraph *graph, Id id, const ClKernelConfig &config, const ITensorDescPack &tensors) + : _graph{ graph }, _id{ id }, _config{ config }, _tensors{ tensors } + { + } + virtual bool operator==(const ClKernel &other) const = 0; + virtual Complexity complexity() const = 0; + virtual Status generate(ClKernelBlueprint &bp) const = 0; + Id id() const + { + return _id; + } + ITensorDescPack tensors() const + { + return _tensors; + } + ClKernelConfig config() const + { + return _config; + } + +protected: + const ClKernelGraph *_graph {}; + Id _id{}; + ClKernelConfig _config{}; + ITensorDescPack _tensors{}; +}; + +struct ClDirectConv2dKernel : public ClKernel +{ +public: + Complexity complexity() const override + { + return Complexity::Complex; + } + ClDirectConv2dKernel() = default; + ~ClDirectConv2dKernel() override = default; + ClDirectConv2dKernel(const ClKernelGraph *graph, Id id, const ClKernelConfig config, const ClDirectConv2dKernelDescriptor 
&desc, const ITensorDescPack tensors) + : ClKernel{ graph, id, config, tensors }, desc{ desc } + { + } + static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ClDirectConv2dKernelDescriptor &conv2d_desc); + bool operator==(const ClKernel &other) const override; + Status generate(ClKernelBlueprint &bp) const override; + + ClDirectConv2dKernelDescriptor desc{}; +}; + +struct ClAddKernel : public ClKernel +{ +public: + Complexity complexity() const override + { + return Complexity::Simple; + } + ClAddKernel() = default; + ~ClAddKernel() override = default; + ClAddKernel(const ClKernelGraph *graph, Id id, const ClKernelConfig &config, const ClEltwiseAddKernelDescriptor &desc, const ITensorDescPack tensors) + : ClKernel{ graph, id, config, tensors }, desc{ desc } + { + } + static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst); + bool operator==(const ClKernel &other) const override; + Status generate(ClKernelBlueprint &bp) const override; + + ClEltwiseAddKernelDescriptor desc{}; +}; + +struct ClKernelGraph +{ +public: + using Id = DependencyGraph::Id; + using KernelMap = std::map>; + using KernelTensorMap = std::map>; + + ClKernelGraph() = default; + ~ClKernelGraph() = default; + + friend bool operator==(const ClKernelGraph &graph0, const ClKernelGraph &graph1) + { + return graph0.graph == graph1.graph && graph0.kernels == graph1.kernels && graph0.tensors == graph1.tensors; + } + + Status add_kernel_tensor(ITensorInfo *desc, MemoryType memory_type, const AuxMemoryInfo &memory_info, Id &tensor_id, Id merge_point = DependencyGraph::empty_id()) + { + tensor_id = graph.add_tensor(merge_point); + if(tensors.find(tensor_id) == tensors.end()) + { + tensors[tensor_id] = utils::memory::make_deep_unique(tensor_id, desc, memory_type, memory_info); + } + return Status{}; + } + + template + Status add_kernel(const ClKernelConfig &config, const KernelDescT &desc, const ITensorDescPack &tensors, Id &kernel_id) + { + const auto src_tensors = tensors.get_const_src_tensors(); + const auto dst_tensors = tensors.get_const_dst_tensors(); + std::vector src_tensor_ids{}; + std::vector dst_tensor_ids{}; + for(const auto &t : src_tensors) + { + src_tensor_ids.push_back(t->id); + } + for(const auto &t : dst_tensors) + { + dst_tensor_ids.push_back(t->id); + } + kernel_id = graph.add_operator(src_tensor_ids, dst_tensor_ids).second; + auto k = utils::memory::make_deep_unique(this, kernel_id, config, desc, tensors); + kernels[kernel_id] = std::move(k); + return Status{}; + } + + ClKernel *get_kernel(Id id) + { + return kernels.at(id).get(); + } + const ClKernel *get_kernel(Id id) const + { + return kernels.at(id).get(); + } + + ClKernelTensor *get_tensor(Id id) + { + return tensors.at(id).get(); + } + const ClKernelTensor *get_tensor(Id id) const + { + return tensors.at(id).get(); + } + + DependencyGraph graph{}; + KernelMap kernels{}; + KernelTensorMap tensors{}; +}; +using Id = DependencyGraph::Id; + +std::vector traverse(const ClKernelGraph &graph); +std::vector traverse(ClKernelGraph &graph); + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLKERNELGRAPH_H \ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClWorkload.cpp b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClWorkload.cpp new file mode 100644 index 0000000000..e97cf88b79 --- /dev/null +++ 
b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClWorkload.cpp @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ +#include "arm_compute/core/experimental/ClWorkload.h" +#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.h" +#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h" +#include "src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +Status build(ClWorkload &workload, const OperatorGraph &op_graph, const ClWorkloadContext &ctx) +{ + workload.context = ctx; + ClKernelGraph kernel_graph; + workload.status = validate(op_graph); + ARM_COMPUTE_RETURN_ON_ERROR(workload.status); + workload.status = translate(kernel_graph, *op_graph.impl()); + ARM_COMPUTE_RETURN_ON_ERROR(workload.status); + ClFusedKernelGraph fused_k_graph; + std::tie(workload.status, fused_k_graph) = init_fusion_graph(kernel_graph); + ARM_COMPUTE_RETURN_ON_ERROR(workload.status); + workload.status = fuse(fused_k_graph); + ARM_COMPUTE_RETURN_ON_ERROR(workload.status); + workload.status = generate(workload, ctx, fused_k_graph); + ARM_COMPUTE_RETURN_ON_ERROR(workload.status); + + // Get operator tensor id to workload tensor id map + const auto op_tensor_to_kernel_tensor = fused_k_graph.original_graph->graph.get_merge_points(); + const auto kernel_tensor_to_workload_tensor = workload.graph.get_merge_points(); + for(const auto op_t : op_graph.impl()->graph.src_tensors()) + { + const auto kernel_t = op_tensor_to_kernel_tensor.at(op_t); + const auto workload_t = kernel_tensor_to_workload_tensor.at(kernel_t); + workload.op_tensor_id_lut[workload_t] = op_t; + } + for(const auto op_t : op_graph.impl()->graph.dst_tensors()) + { + const auto kernel_t = op_tensor_to_kernel_tensor.at(op_t); + const auto workload_t = kernel_tensor_to_workload_tensor.at(kernel_t); + workload.op_tensor_id_lut[workload_t] = op_t; + } + return workload.status; +} +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute \ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/DependencyGraph.cpp 
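// Illustrative sketch (not part of this patch): the client-side flow that drives build()
// above. The OperatorGraph helpers (add_tensor, add_op_conv2d, add_op_elementwise_add)
// and the ClWorkloadContext/GpuInfo initialisation are assumptions based on the public
// headers added by this patch; t_input, t_weight, t_acc, t_addend, t_dst are placeholder
// TensorInfo objects describing a fused conv2d + elementwise-add subgraph.
OperatorGraph op_graph;
const auto op_input  = add_tensor(op_graph, t_input);
const auto op_weight = add_tensor(op_graph, t_weight);
const auto op_acc    = add_tensor(op_graph, t_acc); // conv2d output / add lhs
const auto op_addend = add_tensor(op_graph, t_addend);
const auto op_dst    = add_tensor(op_graph, t_dst);

add_op_conv2d(op_graph, Conv2dDescriptor{}, op_input, op_weight, op_acc);
add_op_elementwise_add(op_graph, AddDescriptor{}, op_acc, op_addend, op_dst);

const ClWorkloadContext ctx{ GpuInfo{ CLScheduler::get().target() } };
ClWorkload              workload;
const Status            st = build(workload, op_graph, ctx); // validate -> translate -> fuse -> generate
ARM_COMPUTE_UNUSED(st);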
b/src/core/experimental/dynamic_fusion/WorkloadImpl/DependencyGraph.cpp new file mode 100644 index 0000000000..2e8292bbfb --- /dev/null +++ b/src/core/experimental/dynamic_fusion/WorkloadImpl/DependencyGraph.cpp @@ -0,0 +1,431 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ +#include "arm_compute/core/experimental/DependencyGraph.h" + +#include +#include +#include + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +DependencyGraph::DependencyGraph(const AdjList &adj_src_tensors, const AdjList &adj_dst_tensors, const AdjList &adj_src_ops, const AdjList &adj_dst_ops, std::map merge_points) + : _adj_src_tensors{ adj_src_tensors }, _adj_dst_tensors{ adj_dst_tensors }, _adj_src_ops{ adj_src_ops }, _adj_dst_ops{ adj_dst_ops }, _merge_to_internal{ merge_points }, _operator_id{}, _tensor_id{} +{ +} +DependencyGraph::DependencyGraph(const std::vector &imported_tensors) + : _adj_src_tensors{}, _adj_dst_tensors{}, _adj_src_ops{}, _adj_dst_ops{}, _merge_to_internal{}, _operator_id{}, _tensor_id{} +{ + for(auto t : imported_tensors) + { + _adj_src_ops[t] = {}; + _adj_dst_ops[t] = {}; + } +} + +Status DependencyGraph::update_merge_point(Id t_id, Id merge_point) +{ + if(_merge_to_internal.find(merge_point) == _merge_to_internal.end()) + { + return Status{ ErrorCode::RUNTIME_ERROR, "Merge point does not exist" }; + } + _merge_to_internal[merge_point] = t_id; + return Status{}; +} + +DependencyGraph::Id DependencyGraph::add_tensor(Id merge_tensor) +{ + Id new_tensor{ empty_id() }; + if(merge_tensor != empty_id()) + { + if(_merge_to_internal.find(merge_tensor) != _merge_to_internal.end()) + { + new_tensor = _merge_to_internal[merge_tensor]; + } + else + { + new_tensor = insert_new_tensor(); + _merge_to_internal[merge_tensor] = new_tensor; + } + } + else + { + new_tensor = insert_new_tensor(); + } + return new_tensor; +} + +void DependencyGraph::remove_tensor(Id tensor) +{ + for(auto src_op : _adj_src_ops.at(tensor)) + { + auto &dst_tensors = _adj_dst_tensors.at(src_op); + dst_tensors.erase( + std::remove(std::begin(dst_tensors), std::end(dst_tensors), tensor), + std::end(dst_tensors)); + } + for(auto dst_op : _adj_dst_ops.at(tensor)) + { + auto &src_tensors = 
_adj_src_tensors.at(dst_op); + src_tensors.erase( + std::remove(std::begin(src_tensors), std::end(src_tensors), tensor), + std::end(src_tensors)); + } + _adj_src_ops.erase(tensor); + _adj_dst_ops.erase(tensor); +} + +std::pair DependencyGraph::add_operator(const std::vector &inputs, const std::vector &outputs) +{ + Id new_op = insert_new_op(); + for(Id tensor : inputs) + { + link_input(new_op, tensor); + } + for(Id tensor : outputs) + { + link_output(new_op, tensor); + } + + // Use topological sort in order to detect possible loops / cycles. + // NOTE: This is unscalable. We'll need to have a better way of detecting loops or relax this invariant during operation, and add a validate method instead + return std::pair(topological_sort().first, new_op); +} + +void DependencyGraph::remove_operator(Id op) +{ + for(auto src_tensor : _adj_src_tensors.at(op)) + { + auto &dst_ops = _adj_dst_ops.at(src_tensor); + dst_ops.erase( + std::remove(std::begin(dst_ops), std::end(dst_ops), op), + std::end(dst_ops)); + } + for(auto dst_tensor : _adj_dst_tensors.at(op)) + { + auto &src_ops = _adj_src_ops.at(dst_tensor); + src_ops.erase( + std::remove(std::begin(src_ops), std::end(src_ops), op), + std::end(src_ops)); + } + _adj_src_tensors.erase(op); + _adj_dst_tensors.erase(op); +} + +std::map DependencyGraph::get_merge_points() const +{ + return _merge_to_internal; +} + +std::vector DependencyGraph::get_root_ops() const +{ + std::vector ops{}; + const auto op_list = all_ops(); + + for(auto op : op_list) + { + if(src_ops(op).empty()) + { + ops.emplace_back(op); + } + } + return ops; +} + +std::vector DependencyGraph::get_dst_ops() const +{ + std::vector ops{}; + const auto op_list = all_ops(); + + for(auto op : op_list) + { + if(dst_ops(op).empty()) + { + ops.emplace_back(op); + } + } + return ops; +} + +std::vector DependencyGraph::src_tensors(Id op) const +{ + ARM_COMPUTE_ERROR_ON(!operator_exists(op)); + return _adj_src_tensors.at(op); +} + +std::vector DependencyGraph::dst_tensors(Id op) const +{ + ARM_COMPUTE_ERROR_ON(!operator_exists(op)); + return _adj_dst_tensors.at(op); +} + +std::vector DependencyGraph::src_tensors() const +{ + std::vector tensors; + for(auto tensor_src_ops : _adj_src_ops) + { + if(tensor_src_ops.second.empty()) + tensors.push_back(tensor_src_ops.first); + } + return tensors; +} + +std::vector DependencyGraph::dst_tensors() const +{ + std::vector tensors; + for(auto tensor_dst_ops : _adj_dst_ops) + { + if(tensor_dst_ops.second.empty()) + tensors.push_back(tensor_dst_ops.first); + } + return tensors; +} + +std::vector DependencyGraph::src_ops_from_tensor(Id tensor) const +{ + return _adj_src_ops.at(tensor); +} +std::vector DependencyGraph::dst_ops_from_tensor(Id tensor) const +{ + return _adj_dst_ops.at(tensor); +} + +std::vector DependencyGraph::all_ops() const +{ + std::vector ops{}; + std::transform(std::begin(_adj_src_tensors), std::end(_adj_src_tensors), std::back_inserter(ops), [](const auto & it) + { + return it.first; + }); + return ops; +} + +bool DependencyGraph::path_exists_from_tensor_to_op(Id src_tensor, Id dst_op) const +{ + for(auto child_op : dst_ops_from_tensor(src_tensor)) + { + if(path_exists_from_op_to_op(child_op, dst_op)) + { + return true; + } + } + return false; +} + +bool DependencyGraph::path_exists_from_op_to_op(Id src_op, Id dst_op) const +{ + if(src_op == dst_op) + { + return true; + } + if(is_in(src_op, get_dst_ops())) + { + return false; + } + for(auto child_tensor : dst_tensors(src_op)) + { + if(path_exists_from_tensor_to_op(child_tensor, dst_op)) + { + 
return true; + } + } + return false; +} + +std::vector DependencyGraph::all_tensors() const +{ + std::vector tensors{}; + std::transform(std::begin(_adj_src_ops), std::end(_adj_src_ops), std::back_inserter(tensors), [](const auto & it) + { + return it.first; + }); + return tensors; +} + +unsigned int DependencyGraph::number_of_ops() const +{ + return _adj_src_tensors.size(); +} + +unsigned int DependencyGraph::number_of_tensors() const +{ + return _adj_src_ops.size(); +} + +DependencyGraph::Id DependencyGraph::insert_new_tensor() +{ + Id new_tensor = _tensor_id.alloc(); + _adj_src_ops[new_tensor] = {}; + _adj_dst_ops[new_tensor] = {}; + return new_tensor; +} +DependencyGraph::Id DependencyGraph::insert_new_op() +{ + Id new_op = _operator_id.alloc(); + _adj_src_tensors[new_op] = {}; + _adj_dst_tensors[new_op] = {}; + return new_op; +} +void DependencyGraph::link_input(Id op, Id in_tensor) +{ + ARM_COMPUTE_ERROR_ON(!operator_exists(op)); + ARM_COMPUTE_ERROR_ON(!tensor_exists(in_tensor)); + ARM_COMPUTE_ERROR_ON(are_connected(op, in_tensor)); + _adj_src_tensors[op].push_back(in_tensor); + _adj_dst_ops[in_tensor].push_back(op); +} +void DependencyGraph::link_output(Id op, Id out_tensor) +{ + ARM_COMPUTE_ERROR_ON(!operator_exists(op)); + ARM_COMPUTE_ERROR_ON(!tensor_exists(out_tensor)); + ARM_COMPUTE_ERROR_ON(are_connected(op, out_tensor)); + _adj_dst_tensors[op].push_back(out_tensor); + _adj_src_ops[out_tensor].push_back(op); +} +bool DependencyGraph::tensor_exists(Id tensor) const +{ + return _adj_src_ops.find(tensor) != _adj_src_ops.end() && _adj_dst_ops.find(tensor) != _adj_dst_ops.end(); +} +bool DependencyGraph::operator_exists(Id op) const +{ + return _adj_src_tensors.find(op) != _adj_src_tensors.end() && _adj_dst_tensors.find(op) != _adj_dst_tensors.end(); +} + +bool DependencyGraph::is_src_tensor(Id tensor) const +{ + if(!tensor_exists(tensor)) + { + return false; + } + return _adj_src_ops.at(tensor).empty(); +} + +bool DependencyGraph::is_dst_tensor(Id tensor) const +{ + if(!tensor_exists(tensor)) + { + return false; + } + return _adj_dst_ops.at(tensor).empty(); +} +bool DependencyGraph::is_src_tensor_of(Id op, Id tensor) const +{ + if(!operator_exists(op) || !tensor_exists(tensor)) + { + return false; + } + const auto op_inputs = src_tensors(op); + return std::find(op_inputs.begin(), op_inputs.end(), tensor) != op_inputs.end(); +} +bool DependencyGraph::is_dst_tensor_of(Id op, Id tensor) const +{ + if(!operator_exists(op) || !tensor_exists(tensor)) + { + return false; + } + const auto op_outputs = dst_tensors(op); + return std::find(op_outputs.begin(), op_outputs.end(), tensor) != op_outputs.end(); +} +bool DependencyGraph::are_connected(Id op, Id tensor) const +{ + return is_src_tensor_of(op, tensor) || is_dst_tensor_of(op, tensor); +} +std::vector DependencyGraph::src_ops(Id op) const +{ + ARM_COMPUTE_ERROR_ON(!operator_exists(op)); + std::vector ops{}; + for(Id src_tensor : src_tensors(op)) + { + ops.insert(ops.end(), std::begin(_adj_src_ops.at(src_tensor)), std::end(_adj_src_ops.at(src_tensor))); + } + return ops; +} + +std::vector DependencyGraph::dst_ops(Id op) const +{ + ARM_COMPUTE_ERROR_ON(!operator_exists(op)); + std::vector ops{}; + for(Id dst_tensor : _adj_dst_tensors.at(op)) + { + ops.insert(ops.end(), std::begin(_adj_dst_ops.at(dst_tensor)), std::end(_adj_dst_ops.at(dst_tensor))); + } + return ops; +} + +std::pair> DependencyGraph::topological_sort() const +{ + // Incident degree (number of source operators to an op) + std::map in_degree{}; + std::set visited_ops{}; + 
std::deque zero_in_degree_ops{}; + std::vector sorted_op_packs{}; + for(auto op : all_ops()) + { + const auto degree = src_ops(op).size(); + in_degree[op] = degree; + if(degree == 0) + { + zero_in_degree_ops.push_back(op); + visited_ops.insert(op); + } + } + + while(!zero_in_degree_ops.empty()) + { + const Id op = zero_in_degree_ops.front(); + zero_in_degree_ops.pop_front(); + sorted_op_packs.push_back(OpPack{ op, src_tensors(op), dst_tensors(op) }); + + for(const auto next_op : dst_ops(op)) + { + if(in_degree[next_op] > 0) + { + in_degree[next_op]--; + } + if(in_degree[next_op] == 0 && visited_ops.find(next_op) == visited_ops.end()) + { + zero_in_degree_ops.push_back(next_op); + visited_ops.insert(op); + } + } + } + + // If there are remaining ops with in_degree > 0, then it's indication that there are cycles in the graph + Status st{}; + if(sorted_op_packs.size() != number_of_ops()) + { + st = Status{ ErrorCode::RUNTIME_ERROR, "Cycles or loops are not allowed in a DependencyGraph" }; + } + return std::make_pair(st, sorted_op_packs); +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute \ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/ITensorDescPack.h b/src/core/experimental/dynamic_fusion/WorkloadImpl/ITensorDescPack.h new file mode 100644 index 0000000000..bfa2eacfed --- /dev/null +++ b/src/core/experimental/dynamic_fusion/WorkloadImpl/ITensorDescPack.h @@ -0,0 +1,242 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
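// Illustrative sketch (not part of this patch) of the contract implemented above:
// tensors and operators are plain integer Ids, add_operator() links an operator to its
// src/dst tensors and immediately runs topological_sort() so cycles are rejected, and
// the sorted OpPacks drive the traverse()/translate() passes in the workload layer.
DependencyGraph g{};
const auto t0 = g.add_tensor(DependencyGraph::empty_id()); // graph input, no merge point
const auto t1 = g.add_tensor(DependencyGraph::empty_id()); // intermediate
const auto t2 = g.add_tensor(DependencyGraph::empty_id()); // graph output

const auto op0 = g.add_operator({ t0 }, { t1 }).second; // t0 -> op0 -> t1
const auto op1 = g.add_operator({ t1 }, { t2 }).second; // t1 -> op1 -> t2

const auto sorted = g.topological_sort(); // pair of (Status, sorted OpPacks)
// sorted.second should contain op0 then op1; a cyclic graph would instead report
// ErrorCode::RUNTIME_ERROR, matching the size check at the end of topological_sort().
ARM_COMPUTE_UNUSED(op0, op1, sorted);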
+ */ +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ +#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_ITENSORDESCPACK_H +#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_ITENSORDESCPACK_H + +#include +#include +#include + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +template +class ITensorDescPack +{ +public: + struct PackElement + { + PackElement() = default; + ~PackElement() = default; + PackElement(const PackElement &) = default; + PackElement &operator=(const PackElement &) = default; + PackElement(PackElement &&) = default; + PackElement &operator=(PackElement &&) = default; + PackElement(int id, TDesc *tensor) + : id(id), tensor(tensor), ctensor(nullptr) + { + } + PackElement(int id, const TDesc *ctensor) + : id(id), tensor(nullptr), ctensor(ctensor) + { + } + + int id{ -1 }; + TDesc *tensor{ nullptr }; + const TDesc *ctensor{ nullptr }; + + friend bool operator==(const PackElement &elem0, const PackElement &elem1) + { + const bool same_ctensor = (elem0.tensor == nullptr && elem1.tensor == nullptr && elem0.ctensor != nullptr && elem1.ctensor != nullptr && *elem0.ctensor == *elem1.ctensor); + const bool same_tensor = (elem0.ctensor == nullptr && elem1.ctensor == nullptr && elem0.tensor != nullptr && elem1.tensor != nullptr && *elem0.tensor == *elem1.tensor); + + return elem0.id == elem1.id && (same_ctensor || same_tensor); + } + }; + +public: + /** Default Constructor */ + ITensorDescPack() = default; + ~ITensorDescPack() = default; + ITensorDescPack(const ITensorDescPack &other) = default; + ITensorDescPack &operator=(const ITensorDescPack &other) = default; + ITensorDescPack(ITensorDescPack &&other) = default; + ITensorDescPack &operator=(ITensorDescPack &&other) = default; + /** Initializer list Constructor */ + ITensorDescPack(std::initializer_list l) + : _pack{} + { + for(auto &e : l) + { + _pack[e.id] = e; + } + } + /** Add tensor to the pack + * + * @param[in] id ID/type of the tensor to add + * @param[in] tensor Tensor to add + */ + void add_tensor(int id, TDesc *tensor) + { + _pack[id] = PackElement(id, tensor); + } + + /** Add const tensor to the pack + * + * @param[in] id ID/type of the tensor to add + * @param[in] tensor Tensor to add + */ + void add_const_tensor(int id, const TDesc *tensor) + { + _pack[id] = PackElement(id, tensor); + } + /** Get tensor of a given id from the pac + * + * @param[in] id ID of tensor to extract + * + * @return The pointer to the tensor if exist and is non-const else nullptr + */ + TDesc *get_tensor(int id) + { + auto it = _pack.find(id); + return it != _pack.end() ? it->second.tensor : nullptr; + } + /** Get constant tensor of a given id + * + * @param[in] id ID of tensor to extract + * + * @return The pointer to the tensor if exist and is const else nullptr + */ + const TDesc *get_const_tensor(int id) const + { + auto it = _pack.find(id); + if(it != _pack.end()) + { + return it->second.ctensor != nullptr ? 
it->second.ctensor : it->second.tensor; + } + return nullptr; + } + /** Remove the tensor stored with the given id + * + * @param[in] id ID of tensor to remove + */ + void remove_tensor(int id) + { + _pack.erase(id); + } + /** Pack size accessor + * + * @return Number of tensors registered to the pack + */ + size_t size() const + { + return _pack.size(); + } + /** Checks if pack is empty + * + * @return True if empty else false + */ + bool empty() const + { + return _pack.empty(); + } + + /** Get the ACL_SRC_* tensors + * + * @return std::vector + */ + std::vector get_src_tensors() + { + std::vector src_tensors{}; + for(int id = static_cast(TensorType::ACL_SRC); id <= static_cast(TensorType::ACL_SRC_END); ++id) + { + auto tensor = get_tensor(id); + if(tensor != nullptr) + { + src_tensors.push_back(tensor); + } + } + return src_tensors; + } + /** Get the const ACL_SRC_* tensors + * + * @return std::vector + */ + std::vector get_const_src_tensors() const + { + std::vector src_tensors{}; + for(int id = static_cast(TensorType::ACL_SRC); id <= static_cast(TensorType::ACL_SRC_END); ++id) + { + auto tensor = get_const_tensor(id); + if(tensor != nullptr) + { + src_tensors.push_back(tensor); + } + } + return src_tensors; + } + /** Get the ACL_DST_* tensors + * + * @return std::vector + */ + std::vector get_dst_tensors() + { + std::vector dst_tensors{}; + for(int id = static_cast(TensorType::ACL_DST); id <= static_cast(TensorType::ACL_DST_END); ++id) + { + auto tensor = get_tensor(id); + if(tensor != nullptr) + { + dst_tensors.push_back(tensor); + } + } + return dst_tensors; + } + /** Get the const ACL_DST_* tensors + * + * @return std::vector + */ + std::vector get_const_dst_tensors() const + { + std::vector dst_tensors{}; + for(int id = static_cast(TensorType::ACL_DST); id <= static_cast(TensorType::ACL_DST_END); ++id) + { + auto tensor = get_const_tensor(id); + if(tensor != nullptr) + { + dst_tensors.push_back(tensor); + } + } + return dst_tensors; + } + + friend bool operator==(const ITensorDescPack &pack0, const ITensorDescPack &pack1) + { + return pack0._pack == pack1._pack; + } + +private: + std::unordered_map _pack{}; /**< Container with the packed tensors */ +}; + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_ITENSORDESCPACK_H \ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.cpp b/src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.cpp new file mode 100644 index 0000000000..4b91c0f156 --- /dev/null +++ b/src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.cpp @@ -0,0 +1,387 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
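// Illustrative sketch (not part of this patch): ITensorDescPack groups tensor
// descriptors by argument slot, mirroring ITensorPack for runtime tensors. Slots in
// [ACL_SRC, ACL_SRC_END] and [ACL_DST, ACL_DST_END] are the ranges walked by
// get_const_src_tensors() and get_const_dst_tensors(). lhs/rhs/dst are assumed to be
// ClKernelTensor descriptors owned by a ClKernelGraph; the <ClKernelTensor> template
// argument is inferred.
ITensorDescPack<ClKernelTensor> pack{};
pack.add_const_tensor(TensorType::ACL_SRC_0, &lhs);
pack.add_const_tensor(TensorType::ACL_SRC_1, &rhs);
pack.add_const_tensor(TensorType::ACL_DST_0, &dst);

const auto srcs = pack.get_const_src_tensors(); // { &lhs, &rhs }
const auto dsts = pack.get_const_dst_tensors(); // { &dst }
ARM_COMPUTE_UNUSED(srcs, dsts);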
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h" +#include "src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +namespace +{ +Status add_kernel_tensor(ClKernelGraph &k_graph, const OperatorGraph::Implementation &op_graph, const OpTensorContent &op_tensor, MemoryType memory_type, AuxMemoryInfo memory_info, + DependencyGraph::Id &id) +{ + ARM_COMPUTE_UNUSED(op_graph); + return k_graph.add_kernel_tensor(op_tensor.desc, memory_type, memory_info, id, op_tensor.id); +} + +Status add_kernel_tensor(ClKernelGraph &k_graph, const OperatorGraph::Implementation &op_graph, const OpTensorContent &op_tensor, DependencyGraph::Id &id) +{ + // For a tensor t + // 1. If t is a src tensor of the entire op graph, then it's Core. + // (Optimisation opportunity, if we guanrantee that all translate methods are called in topological order, we can always assign t to Core. + // Because even if the op is non-root (which would mean t should be an Aux tensor), the src tensors would be already be determined by the ancestor ops (topological order), and thus would not be overriden by it) + // 2. If t is a dst tensor of the entire op graph, then it's Core. + // 3. Aux tensor with Persistent and Prepare lifetime is manually specified + // 4. All other ts not captured by the above are assigned Aux, with lifetime of Temporary. + // kernel_graph.add_kernel_tensor(input->desc, ); + bool is_src_tensor_of_graph = is_in(op_tensor.id, op_graph.graph.src_tensors()); + bool is_dst_tensor_of_graph = is_in(op_tensor.id, op_graph.graph.dst_tensors()); + MemoryType memory_type; + AuxMemoryInfo memory_info; + if(is_src_tensor_of_graph || is_dst_tensor_of_graph) + { + memory_type = MemoryType::Core; + } + else + { + memory_type = MemoryType::Auxiliary; + memory_info.lifetime = AuxMemoryLifetime::Temporary; + memory_info.size = op_tensor.desc->total_size(); + } + return add_kernel_tensor(k_graph, op_graph, op_tensor, memory_type, memory_info, id); +} + +/** Get the suitable kernel size for using direct convolution method with NHWC data layout. 
+ * + * @note Duplicate of the function with the same name in src/gpu/cl/operators/ClConv2d.cpp + * + * @note Direct convolution should be executed when the kernel has the spatial dimensions greater than or equal to the value returned by this function + * + * @param[in] gpu_target GPU target + * + * @return the suitable kernel size for using direct convolution method with NHWC data layout + */ +size_t get_direct_conv_kernel_threshold_nhwc(arm_compute::GPUTarget gpu_target) +{ + switch(gpu_target) + { + case arm_compute::GPUTarget::G76: + case arm_compute::GPUTarget::G77: + case arm_compute::GPUTarget::G78: + return 5; + case arm_compute::GPUTarget::G71: + case arm_compute::GPUTarget::G72: + case arm_compute::GPUTarget::MIDGARD: + case arm_compute::GPUTarget::BIFROST: + return 7; + default: + return 5; + } +} +} // namespace + +bool operator==(const OpTensor &t0, const OpTensor &t1) +{ + return std::make_tuple(t0.id()) == std::make_tuple(t1.id()); +} +bool operator==(const Padding2D &pad0, const Padding2D &pad1) +{ + return std::make_tuple(pad0.top, pad0.right, pad0.bottom, pad0.left) == std::make_tuple(pad1.top, pad1.right, pad1.bottom, pad1.left); +} +bool operator==(const Conv2dDescriptor &conv2d0, const Conv2dDescriptor &conv2d1) +{ + return std::make_tuple(conv2d0.pad, conv2d0.stride, conv2d0.dilation) == std::make_tuple(conv2d1.pad, conv2d1.stride, conv2d1.dilation); +} + +bool operator==(const AddDescriptor &, const AddDescriptor &) +{ + return std::make_tuple() == std::make_tuple(); // Currently two Add ops are always the same +} + +bool Conv2dContent::operator==(const OperatorContent &other) const +{ + const auto converted = *utils::cast::polymorphic_downcast(&other); + return desc == converted.desc; +} + +bool AddContent::operator==(const OperatorContent &other) const +{ + const auto converted = *utils::cast::polymorphic_downcast(&other); + return desc == converted.desc; +} + +ConvolutionMethod Conv2dContent::select_conv_method(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const Conv2dDescriptor &conv2d_desc, const GPUTarget gpu_target) +{ + // Modified from ClConv2d::get_convolution_method + + ARM_COMPUTE_ERROR_ON_NULLPTR(src); + ARM_COMPUTE_ERROR_ON_NULLPTR(dst); + ARM_COMPUTE_ERROR_ON_NULLPTR(weights); + + const PadStrideInfo legacy_pad_stride(conv2d_desc.stride.x(), conv2d_desc.stride.y(), conv2d_desc.pad.left, conv2d_desc.pad.right, conv2d_desc.pad.top, conv2d_desc.pad.bottom, DimensionRoundingType{}); + const Size2D dilation = conv2d_desc.dilation; + + const size_t idx_w = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH); + const size_t idx_h = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT); + const size_t idx_c = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL); + + /* Input spatial dims, kernel size, IFM/OFM, conv info*/ + using ConvolutionConfiguration = std::tuple; + using ConfigurationMethod = std::pair; + + const std::vector known_configs = + { + // Alexnet + ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U), PadStrideInfo(1U, 1U, 2U, 2U), DataLayout::NCHW), ConvolutionMethod::DIRECT), + // VGG16 / VGG19 + ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U), PadStrideInfo(1U, 1U, 1U, 1U), DataLayout::NCHW), ConvolutionMethod::DIRECT), + // Mobilenet 224 + ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), 
PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW), ConvolutionMethod::GEMM), + // Mobilenet 160 + ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW), ConvolutionMethod::GEMM), + // Mobilenet 224 + ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC), ConvolutionMethod::GEMM), + // Mobilenet 160 + ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC), ConvolutionMethod::GEMM), + }; + + const auto find_config = [&](ConfigurationMethod c) + { + const ConvolutionConfiguration config = c.first; + const PadStrideInfo info = std::get<3>(config); + const DataLayout data_layout = std::get<4>(config); + + return std::get<0>(config) == Size2D(src->dimension(idx_w), src->dimension(idx_h)) && std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h)) + && std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && info.pad_top() == legacy_pad_stride.pad_top() && info.pad_right() == legacy_pad_stride.pad_right() + && info.pad_bottom() == legacy_pad_stride.pad_bottom() && info.pad_left() == legacy_pad_stride.pad_left() && info.stride() == legacy_pad_stride.stride() && (data_layout == src->data_layout()); + }; + + std::vector::const_iterator found; + if((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end()) + { + return (*found).second; + } + + if(dilation != Size2D(1U, 1U)) + { + return ConvolutionMethod::GEMM; + } + else + { + if(src->data_layout() == DataLayout::NCHW) + { + ARM_COMPUTE_ERROR("NCHW not supported"); + } + else + { + const bool is_direct_valid = bool(ClDirectConv2dKernel::validate(src, weights, nullptr, dst, ClDirectConv2dKernelDescriptor{ conv2d_desc })); + const size_t kernel_sz_direct_conv_thr = get_direct_conv_kernel_threshold_nhwc(gpu_target); + + // SRGAN case + if((src->dimension(idx_h) > 720U) && (dst->dimension(idx_h) > 720U) && (weights->dimension(idx_h) == 9) && (conv2d_desc.pad.top < 3) + && is_direct_valid) + { + return ConvolutionMethod::DIRECT; + } + + // Floating-point case: GeMM/Direct + if(is_data_type_float(src->data_type())) + { + // Get dst shape + TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, legacy_pad_stride); + const bool is_large_kernel_sz = (weights->dimension(idx_w) >= kernel_sz_direct_conv_thr) && (weights->dimension(idx_h) >= kernel_sz_direct_conv_thr); + const bool is_ifm_ge_16 = src->dimension(idx_c) >= 16; + const bool is_ofm_lte_8 = weights->dimension(3U) <= 8; + const bool workload_gte_8192 = (output_shape[0] * output_shape[1] * output_shape[2]) / 16 >= 8192; + const bool is_ifm_gt_ofm = src->dimension(idx_c) > weights->dimension(3U); + + // Direct convolution case + if(is_direct_valid) + { + if((gpu_target == arm_compute::GPUTarget::G71 || gpu_target == arm_compute::GPUTarget::G72 || gpu_target == arm_compute::GPUTarget::MIDGARD)) + { + if(is_large_kernel_sz && is_ifm_ge_16 && is_ifm_gt_ofm) + { + return ConvolutionMethod::DIRECT; + } + } + else + { + if((is_large_kernel_sz && workload_gte_8192 && is_ifm_ge_16) || (is_ofm_lte_8 && is_ifm_ge_16)) + { + return 
ConvolutionMethod::DIRECT; + } + } + } + + // Default case + return ConvolutionMethod::GEMM; + } + + // Generic case for quantized. Only GeMM + return ConvolutionMethod::GEMM; + } + } + return ConvolutionMethod::DIRECT; +} + +Status Conv2dContent::translate(ClKernelGraph &kernel_graph) const +{ + const auto input = _tensors.get_const_tensor(TensorType::ACL_SRC_0); + const auto weight = _tensors.get_const_tensor(TensorType::ACL_SRC_1); + const auto dst = _tensors.get_const_tensor(TensorType::ACL_DST_0); + const auto method = forced_method_enabled ? forced_method : Conv2dContent::select_conv_method(input->desc, weight->desc, dst->desc, desc, CLScheduler::get().target()); + switch(method) + { + case ConvolutionMethod::DIRECT: + { + return translate_direct_conv2d(kernel_graph); + } + default: + { + ARM_COMPUTE_RETURN_ERROR_MSG("Not implemented"); + } + } + return Status{}; +} +Status Conv2dContent::translate_direct_conv2d(ClKernelGraph &kernel_graph) const +{ + const auto input = _tensors.get_const_tensor(TensorType::ACL_SRC_0); + const auto weight = _tensors.get_const_tensor(TensorType::ACL_SRC_1); + const auto bias = _tensors.get_const_tensor(TensorType::ACL_SRC_2); + const auto dst = _tensors.get_const_tensor(TensorType::ACL_DST_0); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weight, dst); + + ITensorDescPack tensors; + + DependencyGraph::Id input_id; + auto st = add_kernel_tensor(kernel_graph, *_graph, *input, input_id); + ARM_COMPUTE_RETURN_ON_ERROR(st); + tensors.add_const_tensor(ACL_SRC_0, kernel_graph.get_tensor(input_id)); + + DependencyGraph::Id weight_id; + st = add_kernel_tensor(kernel_graph, *_graph, *weight, weight_id); + ARM_COMPUTE_RETURN_ON_ERROR(st); + tensors.add_const_tensor(ACL_SRC_1, kernel_graph.get_tensor(weight_id)); + + if(bias != nullptr) + { + DependencyGraph::Id bias_id; + st = add_kernel_tensor(kernel_graph, *_graph, *bias, bias_id); + ARM_COMPUTE_RETURN_ON_ERROR(st); + tensors.add_const_tensor(ACL_SRC_2, kernel_graph.get_tensor(bias_id)); + } + + DependencyGraph::Id dst_id; + st = add_kernel_tensor(kernel_graph, *_graph, *dst, dst_id); + ARM_COMPUTE_RETURN_ON_ERROR(st); + tensors.add_const_tensor(ACL_DST_0, kernel_graph.get_tensor(dst_id)); + + DependencyGraph::Id direct_conv2d_id; + const auto kernel_desc = ClDirectConv2dKernelDescriptor{ desc }; + + st = ClDirectConv2dKernel::validate(input->desc, weight->desc, bias == nullptr ? 
nullptr : bias->desc, dst->desc, kernel_desc); + ARM_COMPUTE_RETURN_ON_ERROR(st); + + ClKernelConfig config{ UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }, TileDescriptor{}, StoreType::TStoreIndirectWidthSelect }; + st = kernel_graph.add_kernel(config, kernel_desc, tensors, direct_conv2d_id); + ARM_COMPUTE_RETURN_ON_ERROR(st); + ARM_COMPUTE_UNUSED(direct_conv2d_id); + + return Status{}; +} + +Status AddContent::translate(ClKernelGraph &kernel_graph) const +{ + const auto lhs = _tensors.get_const_tensor(TensorType::ACL_SRC_0); + const auto rhs = _tensors.get_const_tensor(TensorType::ACL_SRC_1); + const auto dst = _tensors.get_const_tensor(TensorType::ACL_DST_0); + ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); + + ITensorDescPack tensors; + + DependencyGraph::Id lhs_id; + auto st = add_kernel_tensor(kernel_graph, *_graph, *lhs, lhs_id); + ARM_COMPUTE_RETURN_ON_ERROR(st); + tensors.add_const_tensor(ACL_SRC_0, kernel_graph.get_tensor(lhs_id)); + + DependencyGraph::Id rhs_id; + st = add_kernel_tensor(kernel_graph, *_graph, *rhs, rhs_id); + ARM_COMPUTE_RETURN_ON_ERROR(st); + tensors.add_const_tensor(ACL_SRC_1, kernel_graph.get_tensor(rhs_id)); + + DependencyGraph::Id dst_id; + st = add_kernel_tensor(kernel_graph, *_graph, *dst, dst_id); + ARM_COMPUTE_RETURN_ON_ERROR(st); + tensors.add_const_tensor(ACL_DST_0, kernel_graph.get_tensor(dst_id)); + + DependencyGraph::Id add_id; + ClKernelConfig config{ UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }, TileDescriptor{}, StoreType::TStoreIndirectWidthSelect }; + + st = ClAddKernel::validate(lhs->desc, rhs->desc, dst->desc); + ARM_COMPUTE_RETURN_ON_ERROR(st); + + st = kernel_graph.add_kernel(config, ClEltwiseAddKernelDescriptor{ desc }, tensors, add_id); + ARM_COMPUTE_RETURN_ON_ERROR(st); + ARM_COMPUTE_UNUSED(add_id); + + return Status{}; +} + +std::vector traverse(const OperatorGraph::Implementation &graph) +{ + std::vector ops; + const auto sorted = graph.graph.topological_sort(); + for(const auto &pack : sorted.second) + { + ops.push_back(graph.operators.at(pack.op).get()); + } + return ops; +} + +std::vector traverse(OperatorGraph::Implementation &graph) +{ + std::vector ops; + const auto sorted = graph.graph.topological_sort(); + for(const auto &pack : sorted.second) + { + ops.push_back(graph.operators.at(pack.op).get()); + } + return ops; +} + +Status translate(ClKernelGraph &kernel_graph, const OperatorGraph::Implementation &op_graph) +{ + for(const auto &op : traverse(op_graph)) + { + const auto st = op->translate(kernel_graph); + ARM_COMPUTE_RETURN_ON_ERROR(st); + } + return Status{}; +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute \ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.h b/src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.h new file mode 100644 index 0000000000..c33e189797 --- /dev/null +++ b/src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.h @@ -0,0 +1,229 @@ +/* + * Copyright (c) 2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ +#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_OPERATORGRAPHIMPL +#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_OPERATORGRAPHIMPL + +#include "arm_compute/core/experimental/ClWorkload.h" +#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ITensorDescPack.h" + +#include "support/Cast.h" +#include "support/DeepCopy.h" + +#include +#include +#include + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +enum class OperatorComplexity +{ + Complex = 0, + Simple +}; + +struct ClKernelGraph; +struct OpTensorContent +{ +public: + using Id = DependencyGraph::Id; + OpTensorContent() = default; + OpTensorContent(Id id) + : id{ id }, desc{} + { + } + OpTensorContent(Id id, ITensorInfo *desc) + : id{ id }, desc{ desc } + { + } + ~OpTensorContent() = default; + OpTensorContent(const OpTensorContent &) = default; + OpTensorContent &operator=(const OpTensorContent &) = default; + OpTensorContent(OpTensorContent &&) = default; + OpTensorContent &operator=(OpTensorContent &&) = default; + bool operator==(const OpTensorContent &other) const + { + return desc == other.desc; + } + + const ITensorInfo *get_tensor_info() const + { + return desc; + } + ITensorInfo *get_tensor_info() + { + return desc; + } + + Id id{}; + ITensorInfo *desc{}; +}; + +struct OperatorContent +{ +public: + using Id = DependencyGraph::Id; + OperatorContent() = default; + OperatorContent(const OperatorGraph::Implementation *graph, Id id, const ITensorDescPack &tensors) + : _graph{ graph }, _id{ id }, _tensors{ tensors } + { + } + OperatorContent(const OperatorContent &op) = default; + OperatorContent &operator=(const OperatorContent &op) = default; + OperatorContent(OperatorContent &&op) = default; + OperatorContent &operator=(OperatorContent &&op) = default; + virtual ~OperatorContent() = default; + virtual OperatorComplexity complexity() const = 0; + virtual bool operator==(const OperatorContent &other) const = 0; + virtual Status translate(ClKernelGraph &kernel_graph) const = 0; + +protected: + const OperatorGraph::Implementation *_graph {}; + Id _id{}; + ITensorDescPack _tensors{}; +}; + +struct Conv2dContent : public OperatorContent +{ +public: + Conv2dContent() = default; + Conv2dContent(const 
OperatorGraph::Implementation *graph, Id id, const Conv2dDescriptor &desc, const ITensorDescPack &tensors) + : OperatorContent(graph, id, tensors), desc(desc), forced_method(), forced_method_enabled(false) + { + } + // Temporary. Do not need to pass ConvolutionMethod + Conv2dContent(const OperatorGraph::Implementation *graph, Id id, const Conv2dDescriptor &desc, const ITensorDescPack &tensors, ConvolutionMethod method) + : OperatorContent(graph, id, tensors), desc(desc), forced_method(method), forced_method_enabled(true) + { + } + ~Conv2dContent() = default; + Conv2dContent(const Conv2dContent &) = default; + Conv2dContent &operator=(const Conv2dContent &) = default; + Conv2dContent(Conv2dContent &&) = default; + Conv2dContent &operator=(Conv2dContent &&) = default; + bool operator==(const OperatorContent &other) const override; + OperatorComplexity complexity() const override + { + return OperatorComplexity::Complex; + } + void set_method(ConvolutionMethod method) + { + forced_method_enabled = true; + forced_method = method; + } + + Status translate(ClKernelGraph &kernel_graph) const override; + /** Replicate heuristics of @ref ClConv2d::get_convolution_method(), except that non-supported data types and data layouts are removed from the heuristics + * + * @param src + * @param weights + * @param dst + * @param conv2d_desc + * @param gpu_target + * @return ConvolutionMethod + */ + static ConvolutionMethod select_conv_method(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const Conv2dDescriptor &conv2d_desc, const GPUTarget gpu_target); + + Conv2dDescriptor desc{}; + ConvolutionMethod forced_method{ ConvolutionMethod::GEMM_CONV2D }; + bool forced_method_enabled{ false }; + +private: + Status translate_direct_conv2d(ClKernelGraph &kernel_graph) const; +}; + +class AddContent : public OperatorContent +{ +public: + AddContent() = default; + AddContent(const OperatorGraph::Implementation *graph, Id id, const AddDescriptor &desc, const ITensorDescPack &tensors) + : OperatorContent(graph, id, tensors), desc(desc) + { + } + ~AddContent() = default; + AddContent(const AddContent &) = default; + AddContent &operator=(const AddContent &) = default; + AddContent(AddContent &&) = default; + AddContent &operator=(AddContent &&) = default; + bool operator==(const OperatorContent &other) const override; + OperatorComplexity complexity() const override + { + return OperatorComplexity::Simple; + } + Status translate(ClKernelGraph &kernel_graph) const override; + +private: + AddDescriptor desc{}; +}; + +struct OperatorGraph::Implementation +{ +public: + template + void add_node(Operator::Id id, Args &&... args) + { + operators[id] = utils::memory::make_deep_unique(this, id, std::forward(args)...); + } + + template + void add_tensor(OpTensor::Id id, Args &&... 
args) + { + tensors[id] = utils::memory::make_deep_unique(id, std::forward(args)...); + } + + using Dependency = DependencyGraph; + using OperatorMap = std::map>; + using OpTensorMap = std::map>; + + Implementation() = default; + ~Implementation() = default; + + friend bool operator==(const OperatorGraph::Implementation &graph0, const OperatorGraph::Implementation &graph1) + { + return graph0.graph == graph1.graph && graph0.operators == graph1.operators && graph0.tensors == graph1.tensors; + } + + Dependency graph{}; + OperatorMap operators{}; + OpTensorMap tensors{}; + Status status{}; +}; + +std::vector traverse(const OperatorGraph::Implementation &graph); + +std::vector traverse(OperatorGraph::Implementation &graph); + +Status translate(ClKernelGraph &kernel_graph, const OperatorGraph::Implementation &op_graph); + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute + +#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_OPERATORGRAPHIMPL \ No newline at end of file diff --git a/src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.cpp b/src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.cpp index 472cfb9df0..6c8e4abde7 100644 --- a/src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.cpp +++ b/src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.cpp @@ -21,13 +21,18 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ #include "src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.h" + #include "arm_compute/core/CL/ICLTensor.h" #include "src/core/CL/CLUtils.h" +#include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h" #include "src/gpu/cl/ClKernelLibrary.h" +#include "support/Cast.h" namespace arm_compute { namespace experimental @@ -57,81 +62,88 @@ void ClCompositeKernel::configure(const ClCompileContext &compile_ctx, const ClK _arguments = cl_code.arguments; } -inline void ClCompositeKernel::add_tensor_argument(unsigned int &idx, const ClKernelArgRuntimeDescriptor &arg, ICLTensor *tensor, const Window &arg_slice) +inline void ClCompositeKernel::add_tensor_argument(unsigned int &idx, const ClKernelArgDescriptor &arg, const ICLTensor *tensor, const Window &arg_slice, std::vector &cl_images) { switch(arg.tensor_arg_type) { - case TensorArgType::Scalar: + case ClKernelTensorArgType::Scalar: { ARM_COMPUTE_ERROR("Unsupported yet"); break; } - case TensorArgType::Vector: + + case ClKernelTensorArgType::Vector: { add_1D_tensor_argument(idx, tensor, arg_slice); break; } - case TensorArgType::Image: + case ClKernelTensorArgType::Image: { add_2D_tensor_argument(idx, tensor, arg_slice); break; } - case TensorArgType::Image_Reinterpret_As_3D: + case ClKernelTensorArgType::Image_Reinterpret_As_3D: { add_2D_tensor_argument(idx, tensor, arg_slice); const unsigned int total_cross_plane_pad = tensor->info()->padding().top + tensor->info()->padding().bottom; _kernel.setArg(idx++, static_cast(total_cross_plane_pad)); break; } - case TensorArgType::Image_Export_To_ClImage2D: + case ClKernelTensorArgType::Image_Export_To_ClImage2D: { const TensorShape shape2d(tensor->info()->dimension(0) / 4, tensor->info()->dimension(1) * tensor->info()->dimension(2) * tensor->info()->dimension(3)); const size_t image_row_pitch = 
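// Illustrative sketch (not part of this patch): how the public OperatorGraph functions
// (implemented in OperatorGraph.cpp) are expected to populate Implementation. The
// explicit <Conv2dContent> argument to add_node() and the <OpTensorContent> pack
// template argument are inferred; input_info/weight_info/output_info are placeholder
// ITensorInfo objects that must outlive the graph.
OperatorGraph::Implementation impl{};

const auto in_id  = impl.graph.add_tensor(DependencyGraph::empty_id());
const auto w_id   = impl.graph.add_tensor(DependencyGraph::empty_id());
const auto out_id = impl.graph.add_tensor(DependencyGraph::empty_id());
impl.add_tensor(in_id, &input_info);
impl.add_tensor(w_id, &weight_info);
impl.add_tensor(out_id, &output_info);

ITensorDescPack<OpTensorContent> op_tensors{};
op_tensors.add_const_tensor(TensorType::ACL_SRC_0, impl.tensors.at(in_id).get());
op_tensors.add_const_tensor(TensorType::ACL_SRC_1, impl.tensors.at(w_id).get());
op_tensors.add_const_tensor(TensorType::ACL_DST_0, impl.tensors.at(out_id).get());

const auto conv_id = impl.graph.add_operator({ in_id, w_id }, { out_id }).second;
impl.add_node<Conv2dContent>(conv_id, Conv2dDescriptor{}, op_tensors);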
tensor->info()->strides_in_bytes()[1]; cl::Image2D tensor_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), shape2d, tensor->info()->data_type(), image_row_pitch); + cl_images.push_back(tensor_image2d); _kernel.setArg(idx++, tensor_image2d); break; } - case TensorArgType::Image_3D: + + case ClKernelTensorArgType::Image_3D: { add_2D_tensor_argument(idx, tensor, arg_slice); _kernel.setArg(idx++, static_cast(tensor->info()->strides_in_bytes()[2])); break; } - case TensorArgType::Image_3D_Export_To_ClImage2D: + case ClKernelTensorArgType::Image_3D_Export_To_ClImage2D: { const TensorShape shape2d(tensor->info()->dimension(0) / 4, tensor->info()->dimension(1) * tensor->info()->dimension(2) * tensor->info()->dimension(3)); const size_t image_row_pitch = tensor->info()->strides_in_bytes()[1]; cl::Image2D tensor_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), shape2d, tensor->info()->data_type(), image_row_pitch); + cl_images.push_back(tensor_image2d); _kernel.setArg(idx++, tensor_image2d); _kernel.setArg(idx++, static_cast(tensor->info()->strides_in_bytes()[2])); break; } - case TensorArgType::Tensor_3D: + + case ClKernelTensorArgType::Tensor_3D: { add_3D_tensor_argument(idx, tensor, arg_slice); break; } - case TensorArgType::Tensor_4D: + + case ClKernelTensorArgType::Tensor_4D: { add_4D_tensor_argument(idx, tensor, arg_slice); break; } - case TensorArgType::Tensor_4D_t_Buffer: + case ClKernelTensorArgType::Tensor_4D_t_Buffer: { add_4d_tensor_nhwc_argument(idx, tensor); break; } - case TensorArgType::Tensor_4D_t_Image: + case ClKernelTensorArgType::Tensor_4D_t_Image: { const size_t image_w = tensor->info()->dimension(0) / 4; const size_t image_h = tensor->info()->tensor_shape().total_size_upper(1); const size_t image_stride_y = tensor->info()->strides_in_bytes()[1]; - cl::Image2D tensor_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), - TensorShape(image_w, image_h), tensor->info()->data_type(), image_stride_y); + cl::Image2D tensor_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), + TensorShape(image_w, image_h), tensor->info()->data_type(), image_stride_y); + cl_images.push_back(tensor_image2d); - _kernel.setArg(idx++, tensor_cl_image); + _kernel.setArg(idx++, tensor_image2d); add_4d_tensor_nhwc_argument(idx, tensor); break; } @@ -142,7 +154,7 @@ inline void ClCompositeKernel::add_tensor_argument(unsigned int &idx, const ClKe } } -void ClCompositeKernel::run_composite_op(TensorBinding &tensors, const Window &window, cl::CommandQueue &queue, const ClExecutionDescriptor &exec_desc) +void ClCompositeKernel::run_composite_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue, const ClExecutionDescriptor &exec_desc) { ARM_COMPUTE_UNUSED(exec_desc); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); @@ -160,17 +172,21 @@ void ClCompositeKernel::run_composite_op(TensorBinding &tensors, const Window &w { // Set kernel arguments Window arg_slice = slice; - for(auto arg : _arguments) + // CLImages created from tensor arguments. 
Need to be retained until enqueue + std::vector cl_images; + for(auto id_arg : _arguments) { - auto tensor = tensors._binding.at(arg.arg_id); + const auto arg = id_arg.second; + auto tensor = utils::cast::polymorphic_downcast(tensors.get_tensor(arg.arg_id)); ARM_COMPUTE_ERROR_ON_NULLPTR(tensor); + ARM_COMPUTE_ERROR_ON_NULLPTR(tensor->info()); if(!arg.slide_along_dimz) { // The stride_z for matrix must be zero if we do not slice ARM_COMPUTE_ERROR_ON(tensor->info()->strides_in_bytes()[3] != 0); arg_slice = slice_fixed_z; } - add_tensor_argument(idx, arg, tensor, arg_slice); + add_tensor_argument(idx, arg, tensor, arg_slice, cl_images); } // Dispatch kernel @@ -180,12 +196,6 @@ void ClCompositeKernel::run_composite_op(TensorBinding &tensors, const Window &w while(!exec_desc.skip_sliding_window && window.slide_window_slice_3D(slice)); } -Status bind_arguments(ITensorPack &, const ClKernelCode &, const TensorBinding &) -{ - return Status{}; -} } // namespace dynamic_fusion } // namespace experimental -} // namespace arm_compute - -#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) \ No newline at end of file +} // namespace arm_compute \ No newline at end of file diff --git a/src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.h b/src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.h index 19efb505eb..bf70d6a226 100644 --- a/src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.h +++ b/src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.h @@ -21,13 +21,14 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ #ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLCOMPOSITEKERNEL_H #define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLCOMPOSITEKERNEL_H -#include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h" - +#include "arm_compute/core/experimental/ClWorkload.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -37,47 +38,40 @@ namespace experimental { namespace dynamic_fusion { -struct TensorBinding -{ - TensorBinding(const std::map binding) - : _binding{ binding } - { - } - bool empty() const - { - return _binding.empty(); - } - std::map _binding; -}; -class ClCompositeKernel : public opencl::IClKernel +struct ClExecutionDescriptor; +struct ClKernelCode; + +class ClCompositeKernel final : public opencl::IClKernel { public: void configure(const opencl::ClCompileContext &, const ClKernelCode &); /** Run the composite kernel + * @note The slots / keys in ITensorPack are the argument Ids of the tensors in blueprint * - * @param tensors TensorBinding object containing run-time tensors information + * @param tensors ITensorPack object containing run-time tensor memories * @param window Execution window * @param queue OpenCL Command queue * @param exec_desc Descriptor containing execution information */ - virtual void run_composite_op(TensorBinding &tensors, const Window &window, cl::CommandQueue &queue, const ClExecutionDescriptor &exec_desc) override; + virtual void run_composite_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue, const ClExecutionDescriptor &exec_desc) override; private: - inline void add_tensor_argument(unsigned int &idx, const ClKernelArgRuntimeDescriptor &arg, ICLTensor *tensor, const Window 
&arg_slice); + /** Set a kernel tensor argument + * + * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set. + * @param[in] arg Kernel argument descriptor accompanying @p tensor + * @param[in] tensor Tensor to set as an argument of the object's kernel. + * @param[in] arg_slice Window the kernel will be run on. + * @param[out] cl_images Extra cl images created from the tensor (will need to be retained until the kernel is enqueued) + */ + inline void add_tensor_argument(unsigned int &idx, const ClKernelArgDescriptor &arg, const ICLTensor *tensor, const Window &arg_slice, std::vector &cl_images); private: ClKernelArgList _arguments{}; /** All kernel arguments required by runtime */ }; -/** Argument Binding. - * Tensor Arguments to ICLKernel run_op method need to be passed via an ITensorPack. So the bind_arguments is essentially a converter from TensorBinding to ITensorPack - */ -Status bind_arguments(ITensorPack &tensor_pack, const ClKernelCode &, const TensorBinding &); - } // namespace dynamic_fusion } // namespace experimental } // namespace arm_compute -#endif // ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLCOMPOSITEKERNEL_H - -#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) \ No newline at end of file +#endif // ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLCOMPOSITEKERNEL_H \ No newline at end of file diff --git a/src/gpu/cl/operators/experimental/dynamic_fusion/ClCompositeOperator.cpp b/src/gpu/cl/operators/experimental/dynamic_fusion/ClCompositeOperator.cpp new file mode 100644 index 0000000000..984de74249 --- /dev/null +++ b/src/gpu/cl/operators/experimental/dynamic_fusion/ClCompositeOperator.cpp @@ -0,0 +1,242 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
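// Illustrative sketch (not part of this patch): per the note on run_composite_op(), the
// ITensorPack keys are the argument ids assigned to the tensors in the kernel blueprint.
// code (a built ClKernelCode), the execution window win, the argument ids and the
// CLTensor objects src/wei/dst are assumed to come from the kernel-building step.
ClCompositeKernel kernel{};
kernel.configure(CLKernelLibrary::get().get_compile_context(), code);

ITensorPack tensors{ { arg_id_src, &src }, { arg_id_wei, &wei }, { arg_id_dst, &dst } };
kernel.run_composite_op(tensors, win, CLScheduler::get().queue(), ClExecutionDescriptor{});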
+ */ +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ +#include "arm_compute/runtime/experimental/ClCompositeOperator.h" + +#include "arm_compute/core/experimental/ClWorkload.h" +#include "arm_compute/core/experimental/Types.h" +#include "src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.h" +#include "support/Cast.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +namespace +{ +Status add_tensor_to_tensor_pack(int wk_tensor_id, ICLTensor *tensor, const ClWorkload &workload, TensorPackMap &prepare_pack_map, TensorPackMap &run_pack_map) +{ + if(tensor == nullptr) + { + return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Trying to add a nullptr into the tensor packs"); + } + const auto bp_tensor_id = workload.tensors.at(wk_tensor_id).kernel_arg.arg_id; // blueprint tensor id + std::vector uwk_ids{}; + const auto src_uwk_ids = workload.graph.src_ops_from_tensor(wk_tensor_id); + const auto dst_uwk_ids = workload.graph.dst_ops_from_tensor(wk_tensor_id); + uwk_ids.insert(uwk_ids.end(), src_uwk_ids.begin(), src_uwk_ids.end()); + uwk_ids.insert(uwk_ids.end(), dst_uwk_ids.begin(), dst_uwk_ids.end()); + + for(auto uwk_id : uwk_ids) + { + TensorPackMap *pack_map = nullptr; + const auto uwk_stage = workload.unit_workloads.at(uwk_id).stage.stage; + switch(uwk_stage) + { + case UnitWorkloadStage::Stage::Run: + pack_map = &run_pack_map; + break; + case UnitWorkloadStage::Stage::Prepare: + pack_map = &prepare_pack_map; + break; + default: + return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported workload stage"); + } + + ITensorPack *tensor_pack = pack_map->find_tensor_pack(uwk_id); + if(tensor_pack == nullptr) + { + pack_map->add_tensor_pack(uwk_id, ITensorPack{ { bp_tensor_id, tensor } }); + } + else + { + tensor_pack->add_tensor(bp_tensor_id, tensor); + } + } + return Status{}; +} + +} // namespace + +ITensorPack *TensorPackMap::find_tensor_pack(UnitWorkload::Id uwk_id) +{ + auto tensor_pack = _tensor_packs.find(uwk_id); + if(tensor_pack != _tensor_packs.end()) + { + return &(tensor_pack->second); + } + return nullptr; +} + +ITensorPack &TensorPackMap::get_tensor_pack(UnitWorkload::Id uwk_id) +{ + return _tensor_packs.at(uwk_id); +} + +void TensorPackMap::add_tensor_pack(UnitWorkload::Id uwk_id, const ITensorPack &tensor_pack) +{ + _tensor_packs[uwk_id] = tensor_pack; +} + +Status bind_tensors(ClAuxTensorData &aux_tensor_data, TensorPackMap &prepare_pack_map, TensorPackMap &run_pack_map, const ClWorkload &workload, const OpTensorBinding &op_tensors) +{ + for(auto tensor : workload.tensors) + { + const auto wk_tensor_id = tensor.first; // workload tensor id + ICLTensor *tensor_object = nullptr; + if(tensor.second.memory_type == MemoryType::Core) + { + const auto op_tensor_id = workload.op_tensor_id_lut.at(wk_tensor_id); + auto op_tensor_find = op_tensors.find(op_tensor_id); + if(op_tensor_find == op_tensors.end()) + { + return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Cannot find binding for some operator tensor"); + } + tensor_object = utils::cast::polymorphic_downcast(op_tensor_find->second); + } + else if(tensor.second.memory_type == MemoryType::Auxiliary) + { + // Create aux tensor CLTensor object + const TensorInfo tensor_info = *tensor.second.info; + const auto memory_info = tensor.second.memory_info; + tensor_object = aux_tensor_data.add_aux_tensor(wk_tensor_id, tensor_info, 
memory_info);
+        }
+        else
+        {
+            return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported tensor memory type");
+        }
+
+        const auto st = add_tensor_to_tensor_pack(wk_tensor_id, tensor_object, workload, prepare_pack_map, run_pack_map);
+        ARM_COMPUTE_RETURN_ON_ERROR(st);
+    }
+    return Status{};
+}
+
+CLTensor *ClAuxTensorData::add_aux_tensor(int tensor_id, const ITensorInfo &tensor_info, const AuxMemoryInfo &memory_info)
+{
+    // Return the existing auxiliary tensor if one was already added for this id;
+    // otherwise create and register a new one
+    auto find_tensor_pair = _owned_tensors.find(tensor_id);
+    if(find_tensor_pair != _owned_tensors.end())
+    {
+        return find_tensor_pair->second.get();
+    }
+    else
+    {
+        auto tensor        = std::make_unique<CLTensor>();
+        auto inserted_pair = _owned_tensors.emplace(tensor_id, std::move(tensor)).first;
+        auto new_tensor    = inserted_pair->second.get();
+        _tensors.emplace_back(new_tensor, tensor_info, memory_info);
+        return new_tensor;
+    }
+}
+
+std::vector<ClAuxTensorData::DataView> &ClAuxTensorData::get_tensors()
+{
+    return _tensors;
+}
+struct ClCompositeOperator::Implementation
+{
+    std::map<UnitWorkload::Id, std::unique_ptr<ClCompositeKernel>> _kernels{};
+    std::map<UnitWorkload::Id, std::unique_ptr<ClCompositeKernel>> _kernels_prep{};
+    ClWorkload _workload{};
+    bool _is_prepared{ false };
+};
+
+ClCompositeOperator::ClCompositeOperator()
+    : _impl{ std::make_unique<Implementation>() }
+{
+}
+
+ClCompositeOperator::~ClCompositeOperator() = default;
+
+void ClCompositeOperator::configure(const CLCompileContext &ctx, const ClWorkload &workload)
+{
+    ARM_COMPUTE_ERROR_THROW_ON(ClCompositeOperator::validate(workload));
+    _impl->_workload = workload;
+
+    // Traverse workloads in topological order
+    const auto sorted = workload.graph.topological_sort().second;
+    for(const auto &node : sorted)
+    {
+        auto work  = workload.unit_workloads.at(node.op);
+        auto stage = work.stage.stage;
+        auto k     = std::make_unique<ClCompositeKernel>();
+        k->configure(ctx, work.code);
+
+        switch(stage)
+        {
+            case UnitWorkloadStage::Stage::Run:
+                _impl->_kernels.emplace(work.id, std::move(k));
+                break;
+            case UnitWorkloadStage::Stage::Prepare:
+                _impl->_kernels_prep.emplace(work.id, std::move(k));
+                break;
+            default:
+                ARM_COMPUTE_ERROR("Invalid stage");
+        }
+        break;
+    }
+}
+
+Status ClCompositeOperator::validate(const ClWorkload &workload)
+{
+    return workload.status;
+}
+
+void ClCompositeOperator::prepare(TensorPackMap &tensor_pack_map)
+{
+    if(!_impl->_is_prepared)
+    {
+        for(auto &id_kernel_pair : _impl->_kernels_prep)
+        {
+            const bool flush_queue = false;
+            const auto uwk_id      = id_kernel_pair.first;
+            auto       kernel      = id_kernel_pair.second.get();
+            CLScheduler::get().enqueue_op(*kernel, tensor_pack_map.get_tensor_pack(uwk_id), ClExecutionDescriptor{}, flush_queue);
+        }
+
+        _impl->_is_prepared = true;
+    }
+}
+
+void ClCompositeOperator::run(TensorPackMap &tensor_pack_map)
+{
+    ARM_COMPUTE_ERROR_ON_MSG(!_impl->_is_prepared, "Operator is not prepared");
+
+    for(auto &id_kernel_pair : _impl->_kernels)
+    {
+        // Note: the command queue is not flushed here; flushing is left to the caller
+        const bool flush_queue = false;
+        const auto uwk_id      = id_kernel_pair.first;
+        auto       kernel      = id_kernel_pair.second.get();
+        CLScheduler::get().enqueue_op(*kernel, tensor_pack_map.get_tensor_pack(uwk_id), ClExecutionDescriptor{}, flush_queue);
+    }
+}
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/CL/CLScheduler.cpp b/src/runtime/CL/CLScheduler.cpp
index 4cff707f1a..26124ed7e9 100644
--- a/src/runtime/CL/CLScheduler.cpp
+++ b/src/runtime/CL/CLScheduler.cpp
@@ -191,7 +191,7 @@ void CLScheduler::enqueue_common(ICLKernel &kernel, ITensorPack &tensors, bool f
 
 #if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
 
-void 
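Taken together with the integration test added later in this patch, the intended runtime sequence for a fused workload is roughly the sketch below; it assumes `workload` is a ClWorkload built from an OperatorGraph and that the CLTensors are allocated and bound to the OpTensor handles used when the graph was constructed (identifiers are placeholders).

    ClCompositeOperator op;
    op.configure(CLKernelLibrary::get().get_compile_context(), workload);

    // Map user tensors onto the operator graph tensors
    OpTensorBinding op_tensors({ { op_t_input, &t_input },
                                 { op_t_weight, &t_weight },
                                 { op_t_addend, &t_addend },
                                 { op_t_dst, &t_dst } });

    // Distribute the bound tensors (plus any auxiliary tensors required by the
    // workload) over the per-unit-workload tensor packs consumed by prepare()/run()
    ClAuxTensorData aux_tensor_data{};
    TensorPackMap   prepare_pack_map{};
    TensorPackMap   run_pack_map{};
    bind_tensors(aux_tensor_data, prepare_pack_map, run_pack_map, workload, op_tensors);

    op.prepare(prepare_pack_map); // one-off Prepare-stage kernels
    op.run(run_pack_map);         // Run-stage kernels

If the workload declares auxiliary tensors, their backing memory is expected to be allocated from aux_tensor_data.get_tensors() before prepare() is called.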
CLScheduler::enqueue_common(ICLKernel &kernel, experimental::dynamic_fusion::TensorBinding &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc, bool flush) +void CLScheduler::enqueue_common(ICLKernel &kernel, ITensorPack &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc, bool flush) { ARM_COMPUTE_ERROR_ON_MSG(!_is_initialised, "The CLScheduler is not initialised yet! Please call the CLScheduler::get().default_init(), \ @@ -246,7 +246,7 @@ void CLScheduler::enqueue_op(ICLKernel &kernel, ITensorPack &tensors, bool flush #if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) -void CLScheduler::enqueue_op(ICLKernel &kernel, experimental::dynamic_fusion::TensorBinding &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc, bool flush) +void CLScheduler::enqueue_op(ICLKernel &kernel, ITensorPack &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc, bool flush) { enqueue_common(kernel, tensors, exec_desc, flush); } diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp index 81fe7dbde6..8ce5177847 100644 --- a/src/runtime/CL/CLTuner.cpp +++ b/src/runtime/CL/CLTuner.cpp @@ -68,7 +68,7 @@ private: #if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) struct CompositeKernelData : public CLTuner::IKernelData { - CompositeKernelData(experimental::dynamic_fusion::TensorBinding &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc) + CompositeKernelData(ITensorPack &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc) : _tensors{ tensors }, _exec_desc{ exec_desc } { } @@ -80,7 +80,7 @@ struct CompositeKernelData : public CLTuner::IKernelData } private: - experimental::dynamic_fusion::TensorBinding &_tensors; + ITensorPack &_tensors; const experimental::dynamic_fusion::ClExecutionDescriptor &_exec_desc; }; #endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) @@ -166,7 +166,7 @@ void CLTuner::tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors) } #if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) -void CLTuner::tune_kernel_dynamic(ICLKernel &kernel, experimental::dynamic_fusion::TensorBinding &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc) +void CLTuner::tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc) { CompositeKernelData data{ tensors, exec_desc }; diff --git a/support/DeepCopy.h b/support/DeepCopy.h new file mode 100644 index 0000000000..0117897901 --- /dev/null +++ b/support/DeepCopy.h @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef ARM_COMPUTE_MISC_ITERABLE_H +#define ARM_COMPUTE_MISC_ITERABLE_H +namespace arm_compute +{ +namespace utils +{ +namespace memory +{ +namespace +{ +/** Default polymorphic deep copy function, used by deep_unique_ptr + * + * @param ptr Potentially polymorphic object to be deep copied + * @return template * + */ +template +Base *default_polymorphic_copy(const Base *ptr) +{ + static_assert(std::is_base_of::value, + "Derived is not a specialization of Base"); + if(ptr == nullptr) + { + return nullptr; + } + return new Derived(*static_cast(ptr)); +} +} // namespace + +/** A deep-copying unique pointer that also supports polymorphic cloning behavior + * + * @note The == operator compares the dereferenced value instead of the pointer itself. + * + * @tparam Base Base type + */ +template +class deep_unique_ptr +{ +public: + using CopyFunc = std::function; + + deep_unique_ptr(std::nullptr_t val = nullptr) noexcept + : _val{ val }, + _copy{} + { + } + template + deep_unique_ptr(Derived *value, const CopyFuncDerived ©) noexcept + : _val{ value }, + _copy{ std::move(copy) } + { + static_assert(std::is_base_of::value, + "Derived is not a specialization of Base"); + static_assert( + std::is_constructible::value, + "CopyFuncDerived is not valid for a copy functor"); + } + + deep_unique_ptr(const deep_unique_ptr &ptr) + : deep_unique_ptr(ptr.clone()) + { + } + deep_unique_ptr &operator=(const deep_unique_ptr &ptr) + { + deep_unique_ptr tmp(ptr); + swap(*this, tmp); + return *this; + } + + deep_unique_ptr(deep_unique_ptr &&ptr) = default; + deep_unique_ptr &operator=(deep_unique_ptr &&ptr) = default; + ~deep_unique_ptr() = default; + friend void swap(deep_unique_ptr &ptr0, deep_unique_ptr &ptr1) noexcept + { + using std::swap; + swap(ptr0._val, ptr1._val); + swap(ptr0._copy, ptr1._copy); + } + Base &operator*() noexcept + { + return *_val; + } + + const Base &operator*() const noexcept + { + return *_val; + } + + Base *operator->() noexcept + { + return _val.operator->(); + } + + const Base *operator->() const noexcept + { + return _val.operator->(); + } + + Base *get() noexcept + { + return _val.get(); + } + const Base *get() const noexcept + { + return _val.get(); + } + + explicit operator bool() const noexcept + { + return static_cast(_val); + } + + bool operator==(const deep_unique_ptr &rhs) const + { + if(rhs.get() == nullptr && _val == nullptr) + { + return true; + } + else if(rhs.get() == nullptr || _val == nullptr) + { + return false; + } + else + { + return (*_val == *rhs); + } + } + +private: + deep_unique_ptr clone() const + { + return { _copy(_val.get()), CopyFunc(_copy) }; + } + std::unique_ptr _val{ nullptr }; + CopyFunc _copy{}; +}; + +/** Utility function to create a polymorphic deep-copying unique pointer + * + * @tparam Base + * @tparam Derived + * @tparam CopyFunc + * @param temp + * @param copy + * @return deep_unique_ptr + */ +template +deep_unique_ptr make_deep_unique(Derived &&temp, CopyFunc copy) +{ + return + { + new Derived(std::move(temp)), + CopyFunc{ std::move(copy) } + }; +} + +template +deep_unique_ptr make_deep_unique(Derived &&temp) +{ + static_assert(std::is_base_of::value, + "Derived is not a specialization of Base"); + + return make_deep_unique( + std::move(temp), 
default_polymorphic_copy); +} + +template +deep_unique_ptr make_deep_unique(Args &&... args) +{ + static_assert(std::is_constructible::value, + "Cannot instantiate Derived from arguments"); + + return make_deep_unique( + std::move(Derived{ std::forward(args)... })); +} + +} // namespace memory +} // namespace utils +} // namespace arm_compute +#endif // ARM_COMPUTE_MISC_ITERABLE_H \ No newline at end of file diff --git a/tests/SConscript b/tests/SConscript index 62fa4fce11..87907f40fc 100644 --- a/tests/SConscript +++ b/tests/SConscript @@ -281,6 +281,20 @@ if test_env['benchmark_examples']: #-Wl,--allow-shlib-undefined: Ignore dependencies of dependencies prog = test_env.Program(example, [ test_env.Object(source=file, target=example), graph_utils, graph_params]+ files_benchmark_examples, LIBS = test_env["LIBS"] + ["arm_compute_graph"], LINKFLAGS=test_env["LINKFLAGS"]+['-Wl,--allow-shlib-undefined']) arm_compute_benchmark_examples += [ prog ] + + # Dynamic fusion examples + if env['opencl']: + if env['experimental_dynamic_fusion']: + for file in Glob("%s/dynamic_fusion/*.cpp" % examples_folder): + example = "benchmark_" + os.path.basename(os.path.splitext(str(file))[0]) + if env['os'] in ['android', 'macos', 'bare_metal'] or env['standalone']: + prog = test_env.Program(example, [ test_env.Object(source=file, target=example), graph_utils, graph_params]+ files_benchmark_examples, LIBS = test_env["LIBS"], LINKFLAGS=test_env["LINKFLAGS"]+[load_whole_archive, arm_compute_lib, noload_whole_archive] + bm_link_flags + extra_link_flags) + arm_compute_benchmark_examples += [ prog ] + else: + #-Wl,--allow-shlib-undefined: Ignore dependencies of dependencies + prog = test_env.Program(example, [ test_env.Object(source=file, target=example), graph_utils, graph_params]+ files_benchmark_examples, LIBS = test_env["LIBS"] + ["arm_compute_graph"], LINKFLAGS=test_env["LINKFLAGS"]+['-Wl,--allow-shlib-undefined']) + arm_compute_benchmark_examples += [ prog ] + arm_compute_benchmark_examples = install_bin(arm_compute_benchmark_examples) Depends(arm_compute_benchmark_examples, arm_compute_test_framework) Depends(arm_compute_benchmark_examples, arm_compute_lib) diff --git a/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp b/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp index 9e1b4d897b..a6b09ccdea 100644 --- a/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp +++ b/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp @@ -21,9 +21,12 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
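As a usage note for support/DeepCopy.h above, the following sketch (with hypothetical Shape/Square types) shows the polymorphic deep-copy behaviour that deep_unique_ptr and make_deep_unique are meant to provide: copying the pointer clones the pointee rather than sharing it.

    #include "support/DeepCopy.h"

    struct Shape
    {
        virtual ~Shape() = default;
        virtual int area() const = 0;
    };
    struct Square : public Shape
    {
        explicit Square(int side) : _side(side) {}
        int area() const override { return _side * _side; }
        int _side;
    };

    void deep_copy_example()
    {
        using namespace arm_compute::utils::memory;

        // Constructs a Square in place; Base = Shape, Derived = Square
        deep_unique_ptr<Shape> a = make_deep_unique<Shape, Square>(3);

        // Copying `a` clones the owned Square through default_polymorphic_copy,
        // so `a` and `b` end up owning two independent objects
        deep_unique_ptr<Shape> b = a;
    }

This is presumably what lets the dynamic fusion graph classes own polymorphic members while remaining copyable by value.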
*/ -#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ #include "src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.h" +#include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h" #include "src/core/utils/helpers/float_ops.h" #include "src/gpu/cl/kernels/ClElementwiseKernel.h" @@ -42,9 +45,12 @@ #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "tests/validation/CL/UNIT/dynamic_fusion/Utils.h" + #include using namespace arm_compute::experimental::dynamic_fusion; +using namespace arm_compute::test::validation::utils; namespace arm_compute { @@ -52,149 +58,12 @@ namespace test { namespace validation { -namespace -{ -/** Macros which measures the wall clock time, and records it into a map measurement_map with name clock_name */ -#define TICK(clock_name) \ - auto clock_name##_tick = std::chrono::high_resolution_clock::now(); -#define TOCK(clock_name, measurement_map) \ - auto clock_name##_tock = std::chrono::high_resolution_clock::now(); \ - measurement_map["\"" #clock_name "\""] = duration_cast(clock_name##_tock - clock_name##_tick); -#define TOCK_AVG(clock_name, measurement_map, num_iterations) \ - auto clock_name##_tock = std::chrono::high_resolution_clock::now(); \ - measurement_map["\"" #clock_name "\""] = duration_cast((clock_name##_tock - clock_name##_tick) / (num_iterations)); - -template -void fill(U &&tensor, int seed) -{ - static_assert(std::is_floating_point::value || std::is_same::value, "Only floating point data types supported."); - using DistributionType = typename std::conditional::value, arm_compute::utils::uniform_real_distribution_16bit, std::uniform_real_distribution>::type; - - DistributionType distribution{ T(-1.0f), T(1.0f) }; - library->fill(tensor, distribution, seed); - - // Fill border with infinity in order to check the presence of NaN values (i.e. 
inf * 0) - DistributionType distribution_inf{ T(std::numeric_limits::infinity()), T(std::numeric_limits::infinity()) }; - library->fill_borders_with_garbage(tensor, distribution_inf, seed); -} -} // namespace - TEST_SUITE(CL) TEST_SUITE(UNIT) TEST_SUITE(DYNAMIC_FUSION) TEST_SUITE(ClCompositeKernel) TEST_SUITE(Validate) -TEST_CASE(MoveNet_SubGraph_1_Gemm, framework::DatasetMode::ALL) -{ - /* Computation: - * out = add(addend, gemm_native(lhs, rhs, bias)) (non-broadcast) - */ - const auto data_type = DataType::F32; - const auto m = 5U; - const auto n = 4U; - const auto k = 3U; - const auto t_lhs_shape = TensorShape(k, m); - const auto t_rhs_shape = TensorShape(n, k); - const auto t_dst_shape = TensorShape(n, m); - auto t_lhs_info = TensorInfo(t_lhs_shape, 1, data_type); - auto t_rhs_info = TensorInfo(t_rhs_shape, 1, data_type); - auto t_bias_info = TensorInfo(TensorShape(), 1, DataType::F32); - auto t_dst_info = TensorInfo(t_dst_shape, 1, data_type); - - const ClTensorDescriptor t_lhs_desc{ &t_lhs_info }; - const ClTensorDescriptor t_rhs_desc{ &t_rhs_info }; - const ClTensorDescriptor t_bias_desc{ &t_bias_info }; - const ClTensorDescriptor t_addend_desc{ &t_dst_info }; - const ClTensorDescriptor t_dst_desc{ &t_dst_info }; - - ClKernelBlueprint bp; - ArgumentID tid_lhs; - ArgumentID tid_rhs; - ArgumentID tid_l0_bias = g_arg_placeholder; - ArgumentID tid_l1_addend; - ArgumentID tid_dst; - auto st = add_tensor_argument(bp, t_lhs_desc, tid_lhs); - st = add_tensor_argument(bp, t_rhs_desc, tid_rhs); - st = add_tensor_argument(bp, t_addend_desc, tid_l1_addend); - st = add_tensor_argument(bp, t_dst_desc, tid_dst); - - const auto common_kernel_desc = ClKernelComponentDescriptor{}; - const GemmNativeDescriptor gemm_native_desc{ 1.0, 1.0, m, n, k }; - const GEMMKernelInfo gemm_info{ m, n, k, 0, false, false, false, false, ActivationLayerInfo{}, 1, 1, gemm_native_desc.lhs_info, gemm_native_desc.rhs_info, 0, 0 }; - const EltwiseAddDescriptor eltwise_add_desc{ ConvertPolicy::WRAP }; - const TileDescriptor store_tile_info{ Size2D(gemm_info.rhs_info.n0, gemm_info.lhs_info.m0), Size2D(gemm_info.n, gemm_info.m), ClippingStrategy::TOP_LEFT }; - - ArgumentID tid_acc; - st = add_tensor_intermed(bp, tid_acc); - st = add_kcomp_gemm_native(bp, common_kernel_desc, gemm_native_desc, tid_lhs, tid_rhs, tid_l0_bias, tid_acc); - st = add_kcomp_eltwise_add(bp, common_kernel_desc, EltwiseAddDescriptor{}, tid_l1_addend, tid_acc, tid_acc); - st = add_kcomp_store(bp, common_kernel_desc, tid_acc, tid_dst, StoreType::StoreBlockBoundaryAware); - - ClKernelCode cl_code; - - st = set_tile_info(bp, store_tile_info); - st = build(cl_code, ClCodeBuilderContext{ GpuInfo{ GPUTarget::G71 } }, bp); - - ClExecutionDescriptor exec_desc{}; - st = tune_static(exec_desc, cl_code); - - CLScheduler::get().default_reinit(); - ClCompositeKernel kernel; - kernel.configure(CLKernelLibrary::get().get_compile_context(), cl_code); - - // Construct tensors - CLTensor t_lhs{}; - CLTensor t_rhs{}; - CLTensor t_l1_addend{}; - CLTensor t_dst{}; - // Init tensors - { - t_lhs.allocator()->init(t_lhs_info); - t_rhs.allocator()->init(t_rhs_info); - t_l1_addend.allocator()->init(t_dst_info); - t_dst.allocator()->init(t_dst_info); - } - // "Pack" tensors - TensorBinding tensors({ { tid_lhs, &t_lhs }, - { tid_rhs, &t_rhs }, - { tid_l1_addend, &t_l1_addend }, - { tid_dst, &t_dst } - }); - // Allocate and fill tensors - { - t_lhs.allocator()->allocate(); - t_rhs.allocator()->allocate(); - t_l1_addend.allocator()->allocate(); - t_dst.allocator()->allocate(); - 
fill(CLAccessor(t_lhs), 0); - fill(CLAccessor(t_rhs), 1); - fill(CLAccessor(t_l1_addend), 2); - } - - CLScheduler::get().enqueue_op(kernel, tensors, exec_desc, true); - - // Create reference - SimpleTensor ref_t_lhs{ t_lhs_shape, data_type, 1 }; - SimpleTensor ref_t_rhs{ t_rhs_shape, data_type, 1 }; - SimpleTensor ref_t_bias_placeholder{ t_dst_shape, data_type, 1 }; - SimpleTensor ref_t_l1_addend{ t_dst_shape, data_type, 1 }; - - // Fill reference - fill(ref_t_lhs, 0); - fill(ref_t_rhs, 1); - fill(ref_t_l1_addend, 2); - const auto ref_t_dst = reference::arithmetic_operation( - ArithmeticOperation::ADD, - ref_t_l1_addend, - reference::gemm(ref_t_lhs, ref_t_rhs, ref_t_bias_placeholder, gemm_native_desc.alpha, 0.f /* To disable bias */), - data_type, - eltwise_add_desc.convert_policy); - - RelativeTolerance tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */ - validate(CLAccessor(t_dst), ref_t_dst, tolerance_f32); -} - TEST_CASE(MoveNet_SubGraph_1_DirectConv2d, framework::DatasetMode::ALL) { /* Computation: @@ -208,7 +77,7 @@ TEST_CASE(MoveNet_SubGraph_1_DirectConv2d, framework::DatasetMode::ALL) Status st{}; const auto data_type = DataType::F32; - const auto conv_info = PadStrideInfo(1U, 1U, 1U, 1U); + const auto conv_info = Conv2dDescriptor{ Padding2D{ 1U, 1U, 1U, 1U }, { 1U, 1U } /* stride */ }; const auto width = 7U; const auto height = 6U; @@ -216,47 +85,44 @@ TEST_CASE(MoveNet_SubGraph_1_DirectConv2d, framework::DatasetMode::ALL) const auto OFM = 4U; const auto kernel_sz = 3U; - const auto src_shape = TensorShape(IFM, width, height); - const auto wei_shape = TensorShape(IFM, kernel_sz, kernel_sz, OFM); - const auto bia_shape = TensorShape(OFM); - const auto dst_shape = TensorShape(OFM, width, height); + const auto src_shape = TensorShape(IFM, width, height); + const auto wei_shape = TensorShape(IFM, kernel_sz, kernel_sz, OFM); + const auto bia_shape = TensorShape(OFM); + const auto addend_shape = TensorShape(1, 1); + const auto dst_shape = TensorShape(OFM, width, height); - auto src_info = TensorInfo(src_shape, 1, data_type, DataLayout::NHWC); - auto wei_info = TensorInfo(wei_shape, 1, data_type, DataLayout::NHWC); - auto bia_info = TensorInfo(bia_shape, 1, data_type, DataLayout::NHWC); - auto dst_info = TensorInfo(dst_shape, 1, data_type, DataLayout::NHWC); - - const auto src_desc = ClTensorDescriptor(&src_info); - const auto wei_desc = ClTensorDescriptor(&wei_info); - const auto bia_desc = ClTensorDescriptor(&bia_info); - const auto addend_desc = ClTensorDescriptor(&dst_info); - const auto dst_desc = ClTensorDescriptor(&dst_info); + auto src_info = TensorInfo(src_shape, 1, data_type, DataLayout::NHWC); + auto wei_info = TensorInfo(wei_shape, 1, data_type, DataLayout::NHWC); + auto bia_info = TensorInfo(bia_shape, 1, data_type, DataLayout::NHWC); + auto addend_info = TensorInfo(addend_shape, 1, data_type, DataLayout::NHWC); + auto dst_info = TensorInfo(dst_shape, 1, data_type, DataLayout::NHWC); const auto n0 = std::min(OFM, 4u); const auto m0 = (OFM > 16) ? ((data_type == DataType::F32) ? 
2U : 4U) : 1U; - const ClKernelComponentDescriptor common_kernel_desc{}; - const DirectConvolutionDescriptor direct_conv2d_desc{ conv_info }; - const EltwiseAddDescriptor eltwise_add_desc{ ConvertPolicy::WRAP }; - const TileDescriptor store_tile_info{ Size2D(n0, m0), Size2D(width, height), ClippingStrategy::TOP_LEFT }; + const ClDirectConv2dKernelDescriptor direct_conv2d_desc{ conv_info }; + const ClEltwiseAddKernelDescriptor eltwise_add_desc{}; + const TileDescriptor store_tile_info{ Size2D(n0, m0), Size2D(width, height), ClippingStrategy::TOP_LEFT }; ArgumentID src_id{ g_arg_placeholder }; ArgumentID wei_id{ g_arg_placeholder }; ArgumentID bia_id{ g_arg_placeholder }; ArgumentID acc_id{ g_arg_placeholder }; + ArgumentID acc_1_id{ g_arg_placeholder }; ArgumentID addend_id{ g_arg_placeholder }; ArgumentID dst_id{ g_arg_placeholder }; - st = add_tensor_argument(bp, src_desc, src_id); - st = add_tensor_argument(bp, wei_desc, wei_id); - st = add_tensor_argument(bp, bia_desc, bia_id); - st = add_tensor_intermed(bp, acc_id); - st = add_tensor_argument(bp, addend_desc, addend_id); - st = add_tensor_argument(bp, dst_desc, dst_id); + st = add_tensor(bp, &src_info, src_id); + st = add_tensor(bp, &wei_info, wei_id); + st = add_tensor(bp, &bia_info, bia_id); + st = add_tensor(bp, &dst_info, acc_id); + st = add_tensor(bp, &dst_info, acc_1_id); + st = add_tensor(bp, &addend_info, addend_id); + st = add_tensor(bp, &dst_info, dst_id); - st = add_kcomp_direct_conv(bp, common_kernel_desc, direct_conv2d_desc, src_id, wei_id, bia_id, acc_id); - st = add_kcomp_eltwise_add(bp, common_kernel_desc, eltwise_add_desc, addend_id, acc_id, acc_id); - st = add_kcomp_store(bp, common_kernel_desc, acc_id, dst_id, StoreType::TStoreIndirectWidthSelect); + st = add_kcomp_direct_conv2d(bp, direct_conv2d_desc, src_id, wei_id, bia_id, acc_id); + st = add_kcomp_eltwise_add(bp, eltwise_add_desc, addend_id, acc_id, acc_1_id); + st = add_kcomp_store(bp, StoreType::TStoreIndirectWidthSelect, acc_1_id, dst_id); exec_desc.skip_sliding_window = true; @@ -282,12 +148,11 @@ TEST_CASE(MoveNet_SubGraph_1_DirectConv2d, framework::DatasetMode::ALL) dst.allocator()->init(dst_info); // "Pack" tensors - TensorBinding tensors({ { src_id, &src }, + ITensorPack tensors{ { src_id, &src }, { wei_id, &wei }, { bia_id, &bia }, { addend_id, &addend }, - { dst_id, &dst } - }); + { dst_id, &dst } }; // Allocate and fill tensors src.allocator()->allocate(); @@ -296,10 +161,10 @@ TEST_CASE(MoveNet_SubGraph_1_DirectConv2d, framework::DatasetMode::ALL) addend.allocator()->allocate(); dst.allocator()->allocate(); - fill(CLAccessor(src), 0); - fill(CLAccessor(wei), 1); - fill(CLAccessor(bia), 2); - fill(CLAccessor(addend), 3); + fill(CLAccessor(src), 0, library.get()); + fill(CLAccessor(wei), 1, library.get()); + fill(CLAccessor(bia), 2, library.get()); + fill(CLAccessor(addend), 3, library.get()); CLScheduler::get().enqueue_op(kernel, tensors, exec_desc, true); @@ -310,10 +175,10 @@ TEST_CASE(MoveNet_SubGraph_1_DirectConv2d, framework::DatasetMode::ALL) SimpleTensor ref_addend_nhwc{ dst_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC }; // Fill reference - fill(ref_src_nhwc, 0); - fill(ref_wei_nhwc, 1); - fill(ref_bia_nhwc, 2); - fill(ref_addend_nhwc, 3); + fill(ref_src_nhwc, 0, library.get()); + fill(ref_wei_nhwc, 1, library.get()); + fill(ref_bia_nhwc, 2, library.get()); + fill(ref_addend_nhwc, 3, library.get()); auto ref_src = reference::permute(ref_src_nhwc, PermutationVector(1U, 2U, 0U)); auto ref_wei = reference::permute(ref_wei_nhwc, 
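Condensed from the test body above, the blueprint-level flow for the fused direct-conv2d plus elementwise-add kernel is, in sketch form (status checks omitted; descriptors and TensorInfos as defined earlier in the test):

    ClKernelBlueprint bp;
    ArgumentID src_id{ g_arg_placeholder }, wei_id{ g_arg_placeholder }, bia_id{ g_arg_placeholder };
    ArgumentID acc_id{ g_arg_placeholder }, acc_1_id{ g_arg_placeholder }, addend_id{ g_arg_placeholder }, dst_id{ g_arg_placeholder };

    // Declare the tensors taking part in the fused kernel
    add_tensor(bp, &src_info, src_id);
    add_tensor(bp, &wei_info, wei_id);
    add_tensor(bp, &bia_info, bia_id);
    add_tensor(bp, &dst_info, acc_id);    // intermediate accumulator
    add_tensor(bp, &dst_info, acc_1_id);  // result of the fused addition
    add_tensor(bp, &addend_info, addend_id);
    add_tensor(bp, &dst_info, dst_id);

    // Chain the kernel components: direct conv2d -> eltwise add -> store
    add_kcomp_direct_conv2d(bp, direct_conv2d_desc, src_id, wei_id, bia_id, acc_id);
    add_kcomp_eltwise_add(bp, eltwise_add_desc, addend_id, acc_id, acc_1_id);
    add_kcomp_store(bp, StoreType::TStoreIndirectWidthSelect, acc_1_id, dst_id);
    set_tile_info(bp, store_tile_info);

    // Build the fused ClKernelCode and statically tune its execution descriptor
    ClKernelCode cl_code;
    build(cl_code, ClCodeBuilderContext{ GpuInfo{ GPUTarget::G71 } }, bp);

    ClExecutionDescriptor exec_desc{};
    tune_static(exec_desc, cl_code);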
PermutationVector(1U, 2U, 0U)); @@ -326,301 +191,25 @@ TEST_CASE(MoveNet_SubGraph_1_DirectConv2d, framework::DatasetMode::ALL) const auto ref_dst = reference::arithmetic_operation( ArithmeticOperation::ADD, ref_addend, - reference::convolution_layer(ref_src, ref_wei, ref_bia, dst_shape_nchw, conv_info), - data_type, - eltwise_add_desc.convert_policy); + reference::convolution_layer(ref_src, ref_wei, ref_bia, dst_shape_nchw, + PadStrideInfo + { + static_cast(conv_info.stride.x()), + static_cast(conv_info.stride.y()), + static_cast(conv_info.pad.left), + static_cast(conv_info.pad.top) }), + data_type, + ConvertPolicy::SATURATE); RelativeTolerance tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */ validate(CLAccessor(dst), ref_dst, tolerance_f32); } TEST_SUITE_END() // Validate - -TEST_SUITE(Benchmark) -TEST_CASE(MoveNet_SubGraph_1_Gemm, framework::DatasetMode::ALL) -{ - using std::chrono::duration_cast; - using std::chrono::microseconds; - const int num_iterations = 200; - std::map measurements; - /* Computation: - * out = add(addend, gemm_native(lhs, rhs, bias)) - */ - const auto data_type = DataType::F32; - const auto m = 12U * 12U; - const auto n = 64U; - const auto k = 384U; - const auto t_lhs_shape = TensorShape(k, m); - const auto t_rhs_shape = TensorShape(n, k); - const auto t_dst_shape = TensorShape(n, m); - auto t_lhs_info = TensorInfo(t_lhs_shape, 1, data_type); - auto t_rhs_info = TensorInfo(t_rhs_shape, 1, data_type); - auto t_bias_info = TensorInfo(TensorShape(), 1, data_type); - auto t_l0_dst_info = TensorInfo(t_dst_shape, 1, data_type); // Intermediate tensor for cond3 - auto t_l1_rhs_info = TensorInfo(t_dst_shape, 1, data_type); - auto t_dst_info = TensorInfo(t_dst_shape, 1, data_type); - - const auto common_kernel_desc = ClKernelComponentDescriptor{}; - const GemmNativeDescriptor gemm_native_desc{ 1.0, 0.0, m, n, k }; - const GEMMKernelInfo gemm_info{ m, n, k, 0, false, false, false, false, ActivationLayerInfo{}, 1, 1, gemm_native_desc.lhs_info, gemm_native_desc.rhs_info, 0, 0 }; - const EltwiseAddDescriptor eltwise_add_desc{ ConvertPolicy::WRAP }; - const TileDescriptor store_tile_info{ Size2D(gemm_info.rhs_info.n0, gemm_info.lhs_info.m0), Size2D(gemm_info.n, gemm_info.m), ClippingStrategy::TOP_LEFT }; - - // Create reference - SimpleTensor ref_t_lhs{ t_lhs_shape, data_type, 1 }; - SimpleTensor ref_t_rhs{ t_rhs_shape, data_type, 1 }; - SimpleTensor ref_t_bias_placeholder{ t_dst_shape, data_type, 1 }; - SimpleTensor ref_t_l1_addend{ t_dst_shape, data_type, 1 }; - - // Fill reference - fill(ref_t_lhs, 0); - fill(ref_t_rhs, 1); - fill(ref_t_l1_addend, 2); - const auto ref_t_dst = reference::arithmetic_operation( - ArithmeticOperation::ADD, - ref_t_l1_addend, - reference::gemm(ref_t_lhs, ref_t_rhs, ref_t_bias_placeholder, gemm_native_desc.alpha, 0.f /* To disable bias */), - data_type, - eltwise_add_desc.convert_policy); - - CLScheduler::get().default_reinit(); - - /* Condition 0: Dynamic Fused Kernel */ - CLTensor cond0_t_dst{}; - { - TICK(cond0_0_startup_time); - - ClKernelBlueprint bp; - ArgumentID tid_lhs; - ArgumentID tid_rhs; - ArgumentID tid_l0_bias = g_arg_placeholder; - ArgumentID tid_l1_addend; - ArgumentID tid_dst; - - const ClTensorDescriptor t_lhs_desc{ &t_lhs_info }; - const ClTensorDescriptor t_rhs_desc{ &t_rhs_info }; - const ClTensorDescriptor t_bias_desc{ &t_bias_info }; - const ClTensorDescriptor t_addend_desc{ &t_dst_info }; - const ClTensorDescriptor t_dst_desc{ 
&t_dst_info }; - - ClKernelCode cl_code; - TICK(cond0_build_time) - auto st = add_tensor_argument(bp, t_lhs_desc, tid_lhs); - st = add_tensor_argument(bp, t_rhs_desc, tid_rhs); - st = add_tensor_argument(bp, t_addend_desc, tid_l1_addend); - st = add_tensor_argument(bp, t_dst_desc, tid_dst); - - ArgumentID tid_acc; - st = add_tensor_intermed(bp, tid_acc); - st = add_kcomp_gemm_native(bp, common_kernel_desc, gemm_native_desc, tid_lhs, tid_rhs, tid_l0_bias, tid_acc); - - st = add_kcomp_eltwise_add(bp, common_kernel_desc, EltwiseAddDescriptor{}, tid_l1_addend, tid_acc, tid_acc); - - st = add_kcomp_store(bp, common_kernel_desc, tid_acc, tid_dst, StoreType::StoreBlockBoundaryAware); - - st = set_tile_info(bp, store_tile_info); - st = build(cl_code, ClCodeBuilderContext{ GpuInfo{ GPUTarget::G71 } }, bp); - TOCK(cond0_build_time, measurements) - - TICK(cond0_tune_time) - ClExecutionDescriptor exec_desc{}; - st = tune_static(exec_desc, cl_code); - TOCK(cond0_tune_time, measurements) - - TICK(cond0_configure_time) - ClCompositeKernel kernel; - kernel.configure(CLKernelLibrary::get().get_compile_context(), cl_code); - TOCK(cond0_configure_time, measurements) - - // Construct tensors - CLTensor t_lhs{}; - CLTensor t_rhs{}; - CLTensor t_l1_addend{}; - - // Init tensors - { - t_lhs.allocator()->init(t_lhs_info); - t_rhs.allocator()->init(t_rhs_info); - t_l1_addend.allocator()->init(t_dst_info); - cond0_t_dst.allocator()->init(t_dst_info); - } - // Allocate tensors - { - t_lhs.allocator()->allocate(); - t_rhs.allocator()->allocate(); - t_l1_addend.allocator()->allocate(); - cond0_t_dst.allocator()->allocate(); - fill(CLAccessor(t_lhs), 0); - fill(CLAccessor(t_rhs), 1); - fill(CLAccessor(t_l1_addend), 2); - } - - // "Pack" tensors - TensorBinding tensors({ { tid_lhs, &t_lhs }, { tid_rhs, &t_rhs }, { tid_l1_addend, &t_l1_addend }, { tid_dst, &cond0_t_dst } }); - - CLScheduler::get().enqueue_op(kernel, tensors, exec_desc, true); - CLScheduler::get().sync(); - TOCK(cond0_0_startup_time, measurements) - - TICK(cond0_1_latency) - for(int i = 0; i < num_iterations; ++i) - { - CLScheduler::get().enqueue_op(kernel, tensors, exec_desc, true); - } - CLScheduler::get().sync(); - TOCK_AVG(cond0_1_latency, measurements, num_iterations) - } - /* Condition 1: Dynamic Unfused Kernel */ - /* Condition 2: Static Fused Kernel (current) */ - CLTensor cond2_t_dst{}; - { - TICK(cond2_0_startup_time); - arm_compute::opencl::kernels::ClGemmMatrixMultiplyNativeKernel l0_gemm_mm; - - TICK(cond2_configure_time); - experimental::PostOpList post_ops; - post_ops.push_back_op>(&t_dst_info, 1, eltwise_add_desc.convert_policy); - GEMMKernelInfo gemm_info{ m, n, k, 0, false, false, false, false, ActivationLayerInfo{}, 1, 1, gemm_native_desc.lhs_info, gemm_native_desc.rhs_info, 0, 0, post_ops }; - l0_gemm_mm.configure(CLKernelLibrary::get().get_compile_context(), &t_lhs_info, &t_rhs_info, nullptr, &t_dst_info, gemm_native_desc.alpha, gemm_native_desc.beta, gemm_native_desc.lhs_info, - gemm_native_desc.rhs_info, gemm_info); - TOCK(cond2_configure_time, measurements); - - // Construct tensors - CLTensor t_lhs{}; - CLTensor t_rhs{}; - CLTensor t_l1_addend{}; - - // Init tensors - { - t_lhs.allocator()->init(t_lhs_info); - t_rhs.allocator()->init(t_rhs_info); - t_l1_addend.allocator()->init(t_dst_info); - cond2_t_dst.allocator()->init(t_dst_info); - } - // Allocate tensors - { - t_lhs.allocator()->allocate(); - t_rhs.allocator()->allocate(); - t_l1_addend.allocator()->allocate(); - cond2_t_dst.allocator()->allocate(); - fill(CLAccessor(t_lhs), 
0); - fill(CLAccessor(t_rhs), 1); - fill(CLAccessor(t_l1_addend), 2); - } - - // "Pack" tensors - ITensorPack tensors - { - { ACL_SRC_0, &t_lhs }, - { ACL_SRC_1, &t_rhs }, - { EXPERIMENTAL_ACL_POST_OP_ARG_FIRST, &t_l1_addend }, - { ACL_DST, &cond2_t_dst }, - }; - CLScheduler::get().enqueue_op(l0_gemm_mm, tensors, true); - CLScheduler::get().sync(); - TOCK(cond2_0_startup_time, measurements); - - TICK(cond2_1_latency); - for(int i = 0; i < num_iterations; ++i) - { - CLScheduler::get().enqueue_op(l0_gemm_mm, tensors, true); - } - CLScheduler::get().sync(); - TOCK_AVG(cond2_1_latency, measurements, num_iterations); - } - /* Condition 3: Static Unfused Kernel (current) */ - CLTensor cond3_t_dst{}; - { - TICK(cond3_0_startup_time); - arm_compute::opencl::kernels::ClGemmMatrixMultiplyNativeKernel l0_gemm_mm; - arm_compute::opencl::kernels::ClSaturatedArithmeticKernel l1_add; - - TICK(cond3_configure_time); - GEMMKernelInfo gemm_info{ m, n, k, 0, false, false, false, false, ActivationLayerInfo{}, 1, 1, gemm_native_desc.lhs_info, gemm_native_desc.rhs_info, 0, 0 }; - l0_gemm_mm.configure(CLKernelLibrary::get().get_compile_context(), &t_lhs_info, &t_rhs_info, nullptr, &t_l0_dst_info, gemm_native_desc.alpha, gemm_native_desc.beta, gemm_native_desc.lhs_info, - gemm_native_desc.rhs_info, gemm_info); - l1_add.configure(CLKernelLibrary::get().get_compile_context(), ArithmeticOperation::ADD, &t_l0_dst_info, &t_l1_rhs_info, &t_dst_info, eltwise_add_desc.convert_policy); - TOCK(cond3_configure_time, measurements); - - // Construct tensors - CLTensor t_lhs{}; - CLTensor t_rhs{}; - CLTensor t_l0_dst{}; - CLTensor t_l1_addend{}; - - // Init tensors - { - t_lhs.allocator()->init(t_lhs_info); - t_rhs.allocator()->init(t_rhs_info); - t_l0_dst.allocator()->init(t_l0_dst_info); - t_l1_addend.allocator()->init(t_dst_info); - cond3_t_dst.allocator()->init(t_dst_info); - } - // Allocate tensors - { - t_lhs.allocator()->allocate(); - t_rhs.allocator()->allocate(); - t_l0_dst.allocator()->allocate(); - t_l1_addend.allocator()->allocate(); - cond3_t_dst.allocator()->allocate(); - fill(CLAccessor(t_lhs), 0); - fill(CLAccessor(t_rhs), 1); - fill(CLAccessor(t_l1_addend), 2); - } - - // "Pack" tensors - ITensorPack tensors_l0 - { - { ACL_SRC_0, &t_lhs }, - { ACL_SRC_1, &t_rhs }, - { ACL_DST, &t_l0_dst }, - }; - ITensorPack tensors_l1 - { - { ACL_SRC_0, &t_l0_dst }, - { ACL_SRC_1, &t_l1_addend }, - { ACL_DST, &cond3_t_dst }, - }; - CLScheduler::get().enqueue_op(l0_gemm_mm, tensors_l0, true); - CLScheduler::get().enqueue_op(l1_add, tensors_l1, true); - CLScheduler::get().sync(); - TOCK(cond3_0_startup_time, measurements); - - TICK(cond3_1_latency); - for(int i = 0; i < num_iterations; ++i) - { - CLScheduler::get().enqueue_op(l0_gemm_mm, tensors_l0, true); - CLScheduler::get().enqueue_op(l1_add, tensors_l1, true); - } - CLScheduler::get().sync(); - TOCK_AVG(cond3_1_latency, measurements, num_iterations); - } - - RelativeTolerance tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */ - std::cout << "cond0 validation: " << std::endl; - validate(CLAccessor(cond0_t_dst), ref_t_dst, tolerance_f32); - std::cout << "cond2 validation: " << std::endl; - validate(CLAccessor(cond2_t_dst), ref_t_dst, tolerance_f32); - std::cout << "cond3 validation: " << std::endl; - validate(CLAccessor(cond3_t_dst), ref_t_dst, tolerance_f32); - - /* Report */ - std::cout << "Performance comparison (gemm native + add)" << std::endl; - std::cout << "cond0: dynamic fusion 
module" << std::endl; - std::cout << "cond2: static fused with post ops" << std::endl; - std::cout << "cond3: static unfused" << std::endl; - for(auto m : measurements) - { - std::cout << m.first << ": " << m.second.count() << "us" << std::endl; - } -} -TEST_SUITE_END() // Benchmark TEST_SUITE_END() // ClCompositeKernel TEST_SUITE_END() // DYNAMIC_FUSION TEST_SUITE_END() // UNIT TEST_SUITE_END() // CL } // namespace validation } // namespace test -} // namespace arm_compute - -#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) \ No newline at end of file +} // namespace arm_compute \ No newline at end of file diff --git a/tests/validation/CL/UNIT/dynamic_fusion/DependencyGraph.cpp b/tests/validation/CL/UNIT/dynamic_fusion/DependencyGraph.cpp new file mode 100644 index 0000000000..6962f0e6d1 --- /dev/null +++ b/tests/validation/CL/UNIT/dynamic_fusion/DependencyGraph.cpp @@ -0,0 +1,267 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ +#include "arm_compute/core/experimental/DependencyGraph.h" + +#include "tests/framework/Asserts.h" +#include "tests/framework/Macros.h" + +using namespace arm_compute::experimental::dynamic_fusion; + +namespace arm_compute +{ +namespace test +{ +namespace validation +{ +TEST_SUITE(CL) + +TEST_SUITE(UNIT) +TEST_SUITE(DYNAMIC_FUSION) +TEST_SUITE(DependencyGraph) + +TEST_CASE(Correct_Graph_Creation_Should_Pass, framework::DatasetMode::ALL) +{ + DependencyGraph graph{}; + const auto t0 = graph.add_tensor(); + const auto t1 = graph.add_tensor(); + const auto t2 = graph.add_tensor(); + const auto t3 = graph.add_tensor(); + const auto t4 = graph.add_tensor(); + + const auto o0 = graph.add_operator({ t0, t1 }, { t2 }).second; + const auto o1 = graph.add_operator({ t3, t2 }, { t4 }).second; + + ARM_COMPUTE_EXPECT_EQUAL(graph.number_of_ops(), 2U, framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT_EQUAL(graph.number_of_tensors(), 5U, framework::LogLevel::ERRORS); + + const DependencyGraph ref_graph + { + { + // src_tensors + { o0, { t0, t1 } }, + { o1, { t3, t2 } }, + }, + { + // dst_tensors + { o0, { t2 } }, + { o1, { t4 } }, + }, + { + // src_ops + { t0, {} }, + { t1, {} }, + { t2, { o0 } }, + { t3, {} }, + { t4, { o1 } }, + }, + { + // dst_ops + { t0, { o0 } }, + { t1, { o0 } }, + { t2, { o1 } }, + { t3, { o1 } }, + { t4, {} }, + } + + }; + ARM_COMPUTE_EXPECT(graph == ref_graph, framework::LogLevel::ERRORS); +} + +TEST_CASE(Correct_Merge_Points_Should_Enable_Graph_Expansion, framework::DatasetMode::ALL) +{ + // Merge points are a simple way to collapse "graph of graphs" into a single graph + // Suppose we have a top-level graph g0 + DependencyGraph g0{}; + const auto g0_t0 = g0.add_tensor(); + const auto g0_t1 = g0.add_tensor(); + const auto g0_t2 = g0.add_tensor(); + const auto g0_t3 = g0.add_tensor(); + const auto g0_t4 = g0.add_tensor(); + g0.add_operator({ g0_t0, g0_t1 }, { g0_t2 }); // g0_o0 + g0.add_operator({ g0_t3, g0_t2 }, { g0_t4 }); // g0_o1 + + // Then g0 expands into g1, with additional nodes added in-between "merge point tensors" + // Note that the expansion logic may be local to each operator node + DependencyGraph g1{}; + // g0_o0 expands into g1_o0, g1_o1, g1_o2 + const auto g1_t0 = g1.add_tensor(g0_t0); + const auto g1_t1 = g1.add_tensor(g0_t1); + const auto g1_t2 = g1.add_tensor(); + const auto g1_t3 = g1.add_tensor(); + const auto g1_t4 = g1.add_tensor(g0_t2); + const auto g1_o0 = g1.add_operator({ g1_t0 }, { g1_t2 }).second; + const auto g1_o1 = g1.add_operator({ g1_t1 }, { g1_t3 }).second; + const auto g1_o2 = g1.add_operator({ g1_t2, g1_t3 }, { g1_t4 }).second; + + // g0_o1 expands into g1_o3 + const auto g1_t5 = g1.add_tensor(g0_t3); + const auto g1_t6 = g1.add_tensor(g0_t2); + const auto g1_t7 = g1.add_tensor(g0_t4); + ARM_COMPUTE_EXPECT_EQUAL(g1_t4, g1_t6, framework::LogLevel::ERRORS); // both associate with the same merge point g0_t2, thus they should point to the same tensor in g1 + const auto g1_o3 = g1.add_operator({ g1_t5, g1_t6 }, { g1_t7 }).second; + + const DependencyGraph ref_graph + { + { + // src_tensors + { g1_o0, { g1_t0 } }, + { g1_o1, { g1_t1 } }, + { g1_o2, { g1_t2, g1_t3 } }, + { g1_o3, { g1_t5, g1_t4 } }, + }, + { + // dst_tensors + { g1_o0, { g1_t2 } }, + { g1_o1, { g1_t3 } }, + { g1_o2, { g1_t4 } }, + { g1_o3, { g1_t7 } }, + }, + { + // src_ops + { g1_t0, {} }, + 
{ g1_t1, {} }, + { g1_t2, { g1_o0 } }, + { g1_t3, { g1_o1 } }, + { g1_t4, { g1_o2 } }, + { g1_t5, {} }, + { g1_t7, { g1_o3 } }, + }, + { + // dst_ops + { g1_t0, { g1_o0 } }, + { g1_t1, { g1_o1 } }, + { g1_t2, { g1_o2 } }, + { g1_t3, { g1_o2 } }, + { g1_t4, { g1_o3 } }, + { g1_t5, { g1_o3 } }, + { g1_t7, {} }, + }, + { + // merge points + { g0_t0, g1_t0 }, + { g0_t1, g1_t1 }, + { g0_t2, g1_t4 }, + { g0_t3, g1_t5 }, + { g0_t4, g1_t7 }, + } + }; + ARM_COMPUTE_EXPECT(g1 == ref_graph, framework::LogLevel::ERRORS); +} + +TEST_CASE(Path_Existence_Check_0, framework::DatasetMode::ALL) +{ + DependencyGraph graph{}; + const auto t0 = graph.add_tensor(); + const auto t1 = graph.add_tensor(); + const auto t2 = graph.add_tensor(); + const auto t3 = graph.add_tensor(); + const auto t4 = graph.add_tensor(); + const auto t5 = graph.add_tensor(); + const auto t6 = graph.add_tensor(); + const auto t7 = graph.add_tensor(); + const auto o0 = graph.add_operator({ t1 }, { t3, t4 }).second; + const auto o1 = graph.add_operator({ t3 }, { t5 }).second; + const auto o2 = graph.add_operator({ t5, t6 }, { t7 }).second; + const auto o3 = graph.add_operator({ t4 }, { t6 }).second; + const auto o4 = graph.add_operator({ t0, t5 }, { t2 }).second; + + ARM_COMPUTE_UNUSED(o1, o3); + + ARM_COMPUTE_EXPECT((graph.path_exists_from_tensor_to_op(t3, o2)), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT((graph.path_exists_from_tensor_to_op(t1, o4)), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(!(graph.path_exists_from_tensor_to_op(t2, o4)), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(!(graph.path_exists_from_tensor_to_op(t0, o2)), framework::LogLevel::ERRORS); + + ARM_COMPUTE_EXPECT((graph.path_exists_from_op_to_op(o0, o2)), framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(!(graph.path_exists_from_op_to_op(o2, o0)), framework::LogLevel::ERRORS); + + ARM_COMPUTE_EXPECT(!(graph.path_exists_from_op_to_op(o2, o4)), framework::LogLevel::ERRORS); +} + +TEST_CASE(Correct_Topological_Sort_Should_Pass, framework::DatasetMode::ALL) +{ + DependencyGraph graph{}; + const auto t0 = graph.add_tensor(); + const auto t1 = graph.add_tensor(); + const auto t2 = graph.add_tensor(); + const auto t3 = graph.add_tensor(); + const auto t4 = graph.add_tensor(); + const auto t5 = graph.add_tensor(); + const auto t6 = graph.add_tensor(); + const auto t7 = graph.add_tensor(); + const auto o0 = graph.add_operator({ t1 }, { t3, t4 }).second; + const auto o1 = graph.add_operator({ t3 }, { t5 }).second; + const auto o2 = graph.add_operator({ t5, t6 }, { t7 }).second; + const auto o3 = graph.add_operator({ t4 }, { t6 }).second; + const auto o4 = graph.add_operator({ t0, t5 }, { t2 }).second; + + const auto res = graph.topological_sort(); + ARM_COMPUTE_EXPECT(bool(res.first), framework::LogLevel::ERRORS); + std::vector ref_sorted_op_packs + { + { o0, { t1 }, { t3, t4 } }, + { o1, { t3 }, { t5 } }, + { o3, { t4 }, { t6 } }, + { o4, { t0, t5 }, { t2 } }, + { o2, { t5, t6 }, { t7 } }, + + }; + ARM_COMPUTE_EXPECT((res.second == ref_sorted_op_packs), framework::LogLevel::ERRORS); +} + +TEST_CASE(Cycles_Should_Fail, framework::DatasetMode::ALL) +{ + DependencyGraph graph{}; + const auto t0 = graph.add_tensor(); + const auto t1 = graph.add_tensor(); + const auto t2 = graph.add_tensor(); + const auto t3 = graph.add_tensor(); + + graph.add_operator({ t0, t1 }, { t2 }); + graph.add_operator({ t2 }, { t1, t3 }); // Ideally error should occur here + + const auto res = graph.topological_sort(); + ARM_COMPUTE_EXPECT(!bool(res.first), 
framework::LogLevel::ERRORS); +} +TEST_CASE(Loops_Should_Fail, framework::DatasetMode::ALL) +{ + DependencyGraph graph{}; + const auto t0 = graph.add_tensor(); + const auto t1 = graph.add_tensor(); + const auto t2 = graph.add_tensor(); + + ARM_COMPUTE_EXPECT_THROW(graph.add_operator({ t0, t2 }, { t1, t2 }).first, framework::LogLevel::ERRORS); + ARM_COMPUTE_UNUSED(t0, t1, t2); +} +TEST_SUITE_END() // DependencyGraph +TEST_SUITE_END() // DYNAMIC_FUSION +TEST_SUITE_END() // UNIT + +TEST_SUITE_END() // CL +} // namespace validation +} // namespace test +} // namespace arm_compute \ No newline at end of file diff --git a/tests/validation/CL/UNIT/dynamic_fusion/Integration_OperatorFuseMovenetSubGraph1.cpp b/tests/validation/CL/UNIT/dynamic_fusion/Integration_OperatorFuseMovenetSubGraph1.cpp new file mode 100644 index 0000000000..1b04b0cee0 --- /dev/null +++ b/tests/validation/CL/UNIT/dynamic_fusion/Integration_OperatorFuseMovenetSubGraph1.cpp @@ -0,0 +1,403 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
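The runtime introduced earlier in this patch consumes exactly this API via workload.graph.topological_sort() in ClCompositeOperator::configure; a minimal sketch of the pattern (identifiers are placeholders):

    DependencyGraph graph{};
    const auto t0  = graph.add_tensor();
    const auto t1  = graph.add_tensor();
    const auto t2  = graph.add_tensor();
    const auto op0 = graph.add_operator({ t0 }, { t1 }).second; // t1 = op0(t0)
    const auto op1 = graph.add_operator({ t1 }, { t2 }).second; // t2 = op1(t1)
    ARM_COMPUTE_UNUSED(op0, op1);

    // res.first reports success (it fails on cycles); res.second lists the operators
    // with their src/dst tensors in dependency order: op0 before op1 here
    const auto res = graph.topological_sort();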
+ */ + +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ +#include "arm_compute/core/TensorInfo.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/experimental/ClWorkload.h" +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/runtime/experimental/ClCompositeOperator.h" +#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelDescriptors.h" +#include "src/gpu/cl/operators/ClAdd.h" +#include "src/gpu/cl/operators/ClConv2d.h" +#include "tests/CL/CLAccessor.h" +#include "tests/framework/Asserts.h" +#include "tests/framework/Macros.h" +#include "tests/validation/CL/UNIT/dynamic_fusion/Utils.h" +#include "tests/validation/Validation.h" + +#include "tests/validation/reference/ConvolutionLayer.h" +#include "tests/validation/reference/ElementwiseOperations.h" +#include "tests/validation/reference/Permute.h" + +#ifdef ARM_COMPUTE_ASSERTS_ENABLED +#include "tests/SimpleTensorPrinter.h" +#endif /* ARM_COMPUTE_ASSERTS_ENABLED */ + +using namespace arm_compute::experimental::dynamic_fusion; +using namespace arm_compute::test::validation::utils; + +namespace arm_compute +{ +namespace test +{ +namespace validation +{ +TEST_SUITE(CL) +TEST_SUITE(INTEGRATION) +TEST_SUITE(DYNAMIC_FUSION) +TEST_CASE(Operator_Fuse_Movenet_SubGraph_1_F32, framework::DatasetMode::ALL) +{ + // Please refer to: https://confluence.arm.com/pages/viewpage.action?pageId=886243697 + /* Computation: + * out = add_desc(addend, conv2d1x1(direct_conv)(input, weights, bias)) + */ + const auto data_type = DataType::F32; + const auto data_layout = DataLayout::NHWC; + const auto t_input_shape = TensorShape(384, 12, 12); + // const auto t_weight_shape = TensorShape(384, 1, 1, 64); + // const auto t_dst_shape = TensorShape(64, 12, 12); + const auto t_weight_shape = TensorShape(384, 1, 1, 16); + const auto t_dst_shape = TensorShape(16, 12, 12); + auto t_input_info = TensorInfo(t_input_shape, 1, data_type, data_layout); + auto t_weight_info = TensorInfo(t_weight_shape, 1, data_type, data_layout); + auto t_l1_addend_info = TensorInfo(t_dst_shape, 1, data_type, data_layout); + auto t_acc_info = TensorInfo(); // Intermediate tensor for cond3 + auto t_dst_info = TensorInfo(); + + Conv2dDescriptor conv2d_desc{}; + AddDescriptor add_desc{}; + + // Create reference + SimpleTensor ref_t_input{ t_input_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC }; + SimpleTensor ref_t_weight{ t_weight_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC }; + SimpleTensor ref_t_bias_placeholder{ t_dst_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC }; + SimpleTensor ref_t_l1_addend{ t_dst_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC }; + + // Fill reference + fill(ref_t_input, 0, library.get()); + fill(ref_t_weight, 1, library.get()); + fill(ref_t_l1_addend, 2, library.get()); + + auto ref_t_input_nchw = reference::permute(ref_t_input, PermutationVector(1U, 2U, 0U)); + auto ref_t_weight_nchw = reference::permute(ref_t_weight, PermutationVector(1U, 2U, 0U)); + auto ref_t_bias_placeholder_nchw = reference::permute(ref_t_bias_placeholder, PermutationVector(1U, 2U, 0U)); + auto ref_t_l1_addend_nchw = reference::permute(ref_t_l1_addend, PermutationVector(1U, 2U, 0U)); + auto t_dst_shape_nchw = t_dst_shape; + permute(t_dst_shape_nchw, PermutationVector(1U, 2U, 0U)); + + PadStrideInfo legacy_pad_stride(conv2d_desc.stride.x(), 
conv2d_desc.stride.y(), conv2d_desc.pad.left, conv2d_desc.pad.right, conv2d_desc.pad.top, conv2d_desc.pad.bottom, DimensionRoundingType{}); + auto ref_t_dst_nchw = reference::arithmetic_operation( + ArithmeticOperation::ADD, + ref_t_l1_addend_nchw, + reference::convolution_layer(ref_t_input_nchw, ref_t_weight_nchw, ref_t_bias_placeholder_nchw, t_dst_shape_nchw, legacy_pad_stride, conv2d_desc.dilation), + data_type, + ConvertPolicy{}); + const auto ref_t_dst = reference::permute(ref_t_dst_nchw, PermutationVector(2U, 0U, 1U)); + + CLScheduler::get().default_reinit(); + const auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context(); + OperatorGraph op_graph; + + const auto op_t_input = add_tensor(op_graph, t_input_info); + const auto op_t_weight = add_tensor(op_graph, t_weight_info); + const auto op_t_l1_addend = add_tensor(op_graph, t_l1_addend_info); + const auto op_t_acc = add_tensor(op_graph, t_acc_info); // temp accumulator; TensorInfo to be inferred + const auto op_t_dst = add_tensor(op_graph, t_dst_info); + + auto conv2d = add_op_conv2d(op_graph, conv2d_desc, op_t_input, op_t_weight, op_t_acc); + force_conv2d_method(op_graph, conv2d, ConvolutionMethod::DIRECT); + add_op_elementwise_add(op_graph, add_desc, op_t_acc, op_t_l1_addend, op_t_dst); + + const ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } }; + ClWorkload workload; + build(workload, op_graph, workload_ctx); + + ClCompositeOperator op; + op.configure(cl_compile_ctx, workload); + + // Construct tensors + CLTensor t_input{}; + CLTensor t_weight{}; + CLTensor t_l1_addend{}; + CLTensor t_dst{}; + + // Init tensors + t_input.allocator()->init(t_input_info); + t_weight.allocator()->init(t_weight_info); + t_l1_addend.allocator()->init(t_dst_info); + t_dst.allocator()->init(t_dst_info); + + // Allocate and fill tensors + t_input.allocator()->allocate(); + t_weight.allocator()->allocate(); + t_l1_addend.allocator()->allocate(); + t_dst.allocator()->allocate(); + fill(CLAccessor(t_input), 0, library.get()); + fill(CLAccessor(t_weight), 1, library.get()); + fill(CLAccessor(t_l1_addend), 2, library.get()); + // "Pack" tensors + OpTensorBinding bp_tensors({ { op_t_input, &t_input }, + { op_t_weight, &t_weight }, + { op_t_l1_addend, &t_l1_addend }, + { op_t_dst, &t_dst } + }); + + // Populate prepare and run pack-maps (including allocating aux tensors) + ClAuxTensorData aux_tensor_data{}; + TensorPackMap prepare_pack_map{}; + TensorPackMap run_pack_map{}; + bind_tensors(aux_tensor_data, prepare_pack_map, run_pack_map, workload, bp_tensors); + + op.prepare(prepare_pack_map); + op.run(run_pack_map); + RelativeTolerance tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */ + validate(CLAccessor(t_dst), ref_t_dst_nchw, tolerance_f32); +} +TEST_SUITE(Unsupported) +TEST_CASE(DataType_QASYMM8, framework::DatasetMode::ALL) +{ + const auto data_type = DataType::QASYMM8; + const auto data_layout = DataLayout::NHWC; + const auto t_input_shape = TensorShape(384, 12, 12); + const auto t_weight_shape = TensorShape(384, 1, 1, 64); + const auto t_dst_shape = TensorShape(64, 12, 12); + auto t_input_info = TensorInfo(t_input_shape, 1, data_type, data_layout); + auto t_weight_info = TensorInfo(t_weight_shape, 1, data_type, data_layout); + auto t_l1_addend_info = TensorInfo(t_dst_shape, 1, data_type, data_layout); + auto t_acc_info = TensorInfo(t_dst_shape, 1, data_type, data_layout); + auto t_dst_info = TensorInfo(t_dst_shape, 1, data_type, 
+
+    Conv2dDescriptor conv2d_desc{};
+    AddDescriptor add_desc{};
+
+    OperatorGraph op_graph;
+
+    const auto op_t_input = add_tensor(op_graph, t_input_info);
+    const auto op_t_weight = add_tensor(op_graph, t_weight_info);
+    const auto op_t_l1_addend = add_tensor(op_graph, t_l1_addend_info);
+    const auto op_t_acc = add_tensor(op_graph, t_acc_info); // temp accumulator; TensorInfo to be inferred
+    const auto op_t_dst = add_tensor(op_graph, t_dst_info);
+
+    auto conv2d = add_op_conv2d(op_graph, conv2d_desc, op_t_input, op_t_weight, op_t_acc);
+    add_op_elementwise_add(op_graph, add_desc, op_t_acc, op_t_l1_addend, op_t_dst);
+    force_conv2d_method(op_graph, conv2d, ConvolutionMethod::DIRECT);
+
+    const ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } };
+    ClWorkload workload;
+    const auto success = build(workload, op_graph, workload_ctx);
+
+    ARM_COMPUTE_EXPECT(!bool(success), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(!bool(ClCompositeOperator::validate(workload)), framework::LogLevel::ERRORS);
+}
+TEST_CASE(DataLayout_NCHW, framework::DatasetMode::ALL)
+{
+    const auto data_type = DataType::F32;
+    const auto data_layout = DataLayout::NCHW;
+    const auto t_input_shape = TensorShape(384, 12, 12);
+    const auto t_weight_shape = TensorShape(384, 1, 1, 64);
+    const auto t_dst_shape = TensorShape(64, 12, 12);
+    auto t_input_info = TensorInfo(t_input_shape, 1, data_type, data_layout);
+    auto t_weight_info = TensorInfo(t_weight_shape, 1, data_type, data_layout);
+    auto t_dst_info = TensorInfo(t_dst_shape, 1, data_type, data_layout);
+
+    Conv2dDescriptor conv2d_desc{};
+
+    OperatorGraph op_graph;
+
+    const auto op_t_input = add_tensor(op_graph, t_input_info);
+    const auto op_t_weight = add_tensor(op_graph, t_weight_info);
+    const auto op_t_dst = add_tensor(op_graph, t_dst_info);
+
+    auto conv2d = add_op_conv2d(op_graph, conv2d_desc, op_t_input, op_t_weight, op_t_dst);
+    force_conv2d_method(op_graph, conv2d, ConvolutionMethod::DIRECT);
+    const ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } };
+    ClWorkload workload;
+    const auto success = build(workload, op_graph, workload_ctx);
+
+    ARM_COMPUTE_EXPECT(!bool(success), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(!bool(ClCompositeOperator::validate(workload)), framework::LogLevel::ERRORS);
+}
+TEST_SUITE_END() // Unsupported
+
+TEST_SUITE(Invalid)
+TEST_CASE(Multiple_Complex_Ops_0, framework::DatasetMode::ALL)
+{
+    /* Computation:
+     * out = conv2d(conv2d(l0_input, l0_weight), l1_weight)
+     */
+    const auto data_type = DataType::F32;
+    const auto data_layout = DataLayout::NHWC;
+    const auto t_l0_input_shape = TensorShape(1024, 56, 56);
+    const auto t_l0_weight_shape = TensorShape(512, 1024, 1, 1);
+    const auto t_l1_weight_shape = TensorShape(512, 256, 1, 1);
+
+    auto t_l0_input_info = TensorInfo(t_l0_input_shape, 1, data_type, data_layout);
+    auto t_l0_weight_info = TensorInfo(t_l0_weight_shape, 1, data_type, data_layout);
+    auto t_l1_weight_info = TensorInfo(t_l1_weight_shape, 1, data_type, data_layout);
+    auto t_l0_dst_info = TensorInfo();
+    auto t_dst_info = TensorInfo();
+
+    OperatorGraph op_graph;
+    const auto conv2d_desc = Conv2dDescriptor{};
+
+    const auto op_t_l0_input = add_tensor(op_graph, t_l0_input_info);
+    const auto op_t_l0_weight = add_tensor(op_graph, t_l0_weight_info);
+    const auto op_t_l1_weight = add_tensor(op_graph, t_l1_weight_info);
+    const auto op_t_l0_dst = add_tensor(op_graph, t_l0_dst_info); // temp accumulator; TensorInfo to be inferred
+    const auto op_t_dst = add_tensor(op_graph, t_dst_info);
+
+    add_op_conv2d(op_graph, conv2d_desc, op_t_l0_input, op_t_l0_weight, op_t_l0_dst);
+    add_op_conv2d(op_graph, conv2d_desc, op_t_l0_dst, op_t_l1_weight, op_t_dst);
+
+    const ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } };
+    ClWorkload workload;
+    const auto success = build(workload, op_graph, workload_ctx);
+
+    ARM_COMPUTE_EXPECT(!bool(success), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(!bool(ClCompositeOperator::validate(workload)), framework::LogLevel::ERRORS);
+}
+TEST_CASE(Enlarging_Execution_Space, framework::DatasetMode::ALL)
+{
+    /* Computation:
+     * out = add(l2_lhs, add(add(l0_lhs, l0_rhs), l1_rhs))
+     */
+    const auto data_type = DataType::F32;
+    const auto data_layout = DataLayout::NHWC;
+    const auto t_l0_lhs_shape = TensorShape(1, 256, 3);
+    const auto t_l0_rhs_shape = TensorShape(1, 256, 3);
+    const auto t_l1_rhs_shape = TensorShape(1, 1, 3);
+    const auto t_l2_lhs_shape = TensorShape(1024, 1, 3);
+
+    auto t_l0_lhs_info = TensorInfo(t_l0_lhs_shape, 1, data_type, data_layout);
+    auto t_l0_rhs_info = TensorInfo(t_l0_rhs_shape, 1, data_type, data_layout);
+    auto t_l1_rhs_info = TensorInfo(t_l1_rhs_shape, 1, data_type, data_layout);
+    auto t_l2_lhs_info = TensorInfo(t_l2_lhs_shape, 1, data_type, data_layout);
+    auto t_l0_dst_info = TensorInfo();
+    auto t_l1_dst_info = TensorInfo();
+    auto t_dst_info = TensorInfo();
+
+    OperatorGraph op_graph;
+    const auto add_desc = AddDescriptor{};
+
+    const auto op_t_l0_lhs = add_tensor(op_graph, t_l0_lhs_info);
+    const auto op_t_l0_rhs = add_tensor(op_graph, t_l0_rhs_info);
+    const auto op_t_l1_rhs = add_tensor(op_graph, t_l1_rhs_info);
+    const auto op_t_l2_lhs = add_tensor(op_graph, t_l2_lhs_info);
+    const auto op_t_l0_dst = add_tensor(op_graph, t_l0_dst_info); // temp accumulator; TensorInfo to be inferred
+    const auto op_t_l1_dst = add_tensor(op_graph, t_l1_dst_info); // temp accumulator; TensorInfo to be inferred
+    const auto op_t_dst = add_tensor(op_graph, t_dst_info);
+
+    add_op_elementwise_add(op_graph, add_desc, op_t_l0_lhs, op_t_l0_rhs, op_t_l0_dst);
+    add_op_elementwise_add(op_graph, add_desc, op_t_l0_dst, op_t_l1_rhs, op_t_l1_dst);
+    add_op_elementwise_add(op_graph, add_desc, op_t_l1_dst, op_t_l2_lhs, op_t_dst);
+
+    const ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } };
+    ClWorkload workload;
+    const auto success = build(workload, op_graph, workload_ctx);
+
+    ARM_COMPUTE_EXPECT(!bool(success), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(!bool(ClCompositeOperator::validate(workload)), framework::LogLevel::ERRORS);
+}
+TEST_CASE(Root_Simple_And_Complex, framework::DatasetMode::ALL)
+{
+    /* Computation:
+     * out = add(conv(l0_0_input, l0_0_weight), add(l0_1_lhs, l0_1_rhs))
+     */
+    const auto data_type = DataType::F32;
+    const auto data_layout = DataLayout::NHWC;
+
+    const auto t_l0_0_input_shape = TensorShape(128, 21, 21);
+    const auto t_l0_0_weight_shape = TensorShape(144, 128, 1, 1);
+    const auto t_l0_1_lhs_shape = TensorShape(144, 21, 21);
+    const auto t_l0_1_rhs_shape = TensorShape(1, 1, 21);
+
+    auto t_l0_0_input_info = TensorInfo(t_l0_0_input_shape, 1, data_type, data_layout);
+    auto t_l0_0_weight_info = TensorInfo(t_l0_0_weight_shape, 1, data_type, data_layout);
+    auto t_l0_1_lhs_info = TensorInfo(t_l0_1_lhs_shape, 1, data_type, data_layout);
+    auto t_l0_1_rhs_info = TensorInfo(t_l0_1_rhs_shape, 1, data_type, data_layout);
+    auto t_l0_0_dst_info = TensorInfo();
+    auto t_l0_1_dst_info = TensorInfo();
+    auto t_dst_info = TensorInfo();
+
+    OperatorGraph op_graph;
+    const auto conv2d_desc = Conv2dDescriptor{};
+    const auto add_desc = AddDescriptor{};
+
+    const auto op_t_l0_0_input = add_tensor(op_graph, t_l0_0_input_info);
+    const auto op_t_l0_0_weight = add_tensor(op_graph, t_l0_0_weight_info);
+    const auto op_t_l0_1_lhs = add_tensor(op_graph, t_l0_1_lhs_info);
+    const auto op_t_l0_1_rhs = add_tensor(op_graph, t_l0_1_rhs_info);
+    const auto op_t_l0_0_dst = add_tensor(op_graph, t_l0_0_dst_info); // temp accumulator; TensorInfo to be inferred
+    const auto op_t_l0_1_dst = add_tensor(op_graph, t_l0_1_dst_info); // temp accumulator; TensorInfo to be inferred
+    const auto op_t_dst = add_tensor(op_graph, t_dst_info);
+
+    add_op_conv2d(op_graph, conv2d_desc, op_t_l0_0_input, op_t_l0_0_weight, op_t_l0_0_dst);
+    add_op_elementwise_add(op_graph, add_desc, op_t_l0_1_lhs, op_t_l0_1_rhs, op_t_l0_1_dst);
+    add_op_elementwise_add(op_graph, add_desc, op_t_l0_0_dst, op_t_l0_1_dst, op_t_dst);
+
+    const ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } };
+    ClWorkload workload;
+    const auto success = build(workload, op_graph, workload_ctx);
+
+    ARM_COMPUTE_EXPECT(!bool(success), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(!bool(ClCompositeOperator::validate(workload)), framework::LogLevel::ERRORS);
+}
+TEST_CASE(Loop, framework::DatasetMode::ALL)
+{
+    /* Computation:
+     * tensor state0;
+     * state1 = add(l0_lhs, state0)
+     * state0 = add(l1_lhs, state1)
+     */
+    const auto data_type = DataType::F32;
+    const auto data_layout = DataLayout::NHWC;
+
+    const auto t_shape = TensorShape(13, 21);
+
+    auto t_l0_lhs_info = TensorInfo(t_shape, 1, data_type, data_layout);
+    auto t_l1_lhs_info = TensorInfo(t_shape, 1, data_type, data_layout);
+    auto state0_info = TensorInfo(t_shape, 1, data_type, data_layout);
+    auto state1_info = TensorInfo();
+
+    OperatorGraph op_graph;
+    const auto conv2d_desc = Conv2dDescriptor{};
+    const auto add_desc = AddDescriptor{};
+
+    const auto op_t_l0_lhs = add_tensor(op_graph, t_l0_lhs_info);
+    const auto op_t_l1_lhs = add_tensor(op_graph, t_l1_lhs_info);
+    const auto op_t_state0 = add_tensor(op_graph, state0_info);
+    const auto op_t_state1 = add_tensor(op_graph, state1_info);
+
+    add_op_conv2d(op_graph, conv2d_desc, op_t_l0_lhs, op_t_state0, op_t_state1);
+    add_op_elementwise_add(op_graph, add_desc, op_t_l1_lhs, op_t_state1, op_t_state0);
+
+    const ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } };
+    ClWorkload workload;
+    const auto success = build(workload, op_graph, workload_ctx);
+
+    ARM_COMPUTE_EXPECT(!bool(success), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(!bool(ClCompositeOperator::validate(workload)), framework::LogLevel::ERRORS);
+}
+TEST_SUITE_END() // Invalid
+
+TEST_SUITE_END() // DYNAMIC_FUSION
+TEST_SUITE_END() // INTEGRATION
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
\ No newline at end of file
diff --git a/tests/validation/CL/UNIT/dynamic_fusion/Utils.h b/tests/validation/CL/UNIT/dynamic_fusion/Utils.h
new file mode 100644
index 0000000000..4512305c1e
--- /dev/null
+++ b/tests/validation/CL/UNIT/dynamic_fusion/Utils.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef TESTS_VALIDATION_CL_DYNAMICFUSION_UTILS
+#define TESTS_VALIDATION_CL_DYNAMICFUSION_UTILS
+
+#include "tests/AssetsLibrary.h"
+#include "utils/Utils.h"
+
+#include <chrono>
+#include <limits>
+#include <type_traits>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace utils
+{
+/** A pair of macros which measures the wall clock time, and records it into a map measurement_map with name clock_name
+ *
+ */
+#define TICK(clock_name) \
+    auto clock_name##_tick = std::chrono::high_resolution_clock::now();
+#define TOCK(clock_name, measurement_map) \
+    auto clock_name##_tock = std::chrono::high_resolution_clock::now(); \
+    measurement_map["\"" #clock_name "\""] = duration_cast<microseconds>(clock_name##_tock - clock_name##_tick);
+#define TOCK_AVG(clock_name, measurement_map, num_iterations) \
+    auto clock_name##_tock = std::chrono::high_resolution_clock::now(); \
+    measurement_map["\"" #clock_name "\""] = duration_cast<microseconds>((clock_name##_tock - clock_name##_tick) / (num_iterations));
+
+template <typename T, typename U>
+void fill(U &&tensor, int seed, AssetsLibrary *library)
+{
+    static_assert(std::is_floating_point<T>::value || std::is_same<T, half>::value, "Only floating point data types supported.");
+    using DistributionType = typename std::conditional<std::is_same<T, half>::value, arm_compute::utils::uniform_real_distribution_16bit<T>, std::uniform_real_distribution<T>>::type;
+
+    DistributionType distribution{ T(-1.0f), T(1.0f) };
+    library->fill(tensor, distribution, seed);
+
+    // Fill border with infinity in order to check the presence of NaN values (i.e. inf * 0)
+    DistributionType distribution_inf{ T(std::numeric_limits<T>::infinity()), T(std::numeric_limits<T>::infinity()) };
+    library->fill_borders_with_garbage(tensor, distribution_inf, seed);
+}
+} // namespace utils
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif //TESTS_VALIDATION_CL_DYNAMICFUSION_UTILS
\ No newline at end of file
--
cgit v1.2.1
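
Editor's note: the Utils.h header above defines the TICK/TOCK/TOCK_AVG timing macros and the templated fill() helper used by the dynamic fusion tests. Below is a minimal, hedged sketch of how the timing macros might be called from a test translation unit; it is not part of the patch. Assumptions: the include path resolves inside the Compute Library test tree, the measurement map is keyed by std::string with std::chrono::microseconds values, and a `using namespace std::chrono;` is in scope at the call site (the macros expand to an unqualified duration_cast).

```cpp
// Illustrative sketch only; builds only inside the Compute Library test tree.
#include "tests/validation/CL/UNIT/dynamic_fusion/Utils.h"

#include <chrono>
#include <map>
#include <string>

using namespace std::chrono; // assumed: TOCK expands to an unqualified duration_cast<microseconds>

void time_configure_and_run()
{
    // Assumed map type matching what TOCK writes into.
    std::map<std::string, microseconds> measurements;

    TICK(configure);
    // ... op.configure(cl_compile_ctx, workload);   (as in the integration test above)
    TOCK(configure, measurements); // records wall-clock time under the key "\"configure\""

    constexpr int num_iterations = 10;
    TICK(run);
    // ... for(int i = 0; i < num_iterations; ++i) { op.run(run_pack_map); }
    TOCK_AVG(run, measurements, num_iterations); // records the average time per iteration
}
```

The fill() helper in the same header follows the usual ACL test pattern: it is called with an explicit element type, e.g. `fill<float>(CLAccessor(t_input), 0, library.get());` as in the fused conv2d + elementwise add test earlier in this patch.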