From 31df05a1870662a7288fbaeb6fbc7fc458bb5a73 Mon Sep 17 00:00:00 2001 From: SiCong Li Date: Wed, 9 Nov 2022 15:57:48 +0000 Subject: Remove dynamic fusion prototype with tests and examples Public headers of the new experimental dynamic fusion can be found in arm_compute/dynamic_fusion/ New examples on how to use the interface can be found in tests/validation/dynamic_fusion/gpu/Integration.cpp Resolves COMPMID-5683 Change-Id: I7ccb902a227fb487562df15fc3c30118d1d95bbd Signed-off-by: SiCong Li Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8671 Reviewed-by: Jakub Sujak Reviewed-by: Gunes Bayir Comments-Addressed: Arm Jenkins Benchmark: Arm Jenkins Tested-by: Arm Jenkins --- Android.bp | 13 - SConscript | 1 - arm_compute/core/experimental/ClWorkload.h | 219 ----- arm_compute/core/experimental/DependencyGraph.h | 277 ------ arm_compute/core/experimental/IWorkload.h | 132 --- arm_compute/core/experimental/OperatorGraph.h | 217 ----- arm_compute/runtime/CL/CLScheduler.h | 31 - arm_compute/runtime/CL/CLTuner.h | 4 - arm_compute/runtime/CL/ICLTuner.h | 18 - .../runtime/experimental/ClCompositeOperator.h | 190 ----- docs/Doxyfile | 3 +- .../cl_fused_conv2d_elementwise_add.cpp | 392 --------- .../cl_ref_conv2d_elementwise_add.cpp | 238 ------ scripts/clang_tidy_rules.py | 1 - src/core/CL/ICLKernel.h | 21 - .../dynamic_fusion/ClKernelBuildingAPI.cpp | 164 ---- .../dynamic_fusion/ClKernelBuildingAPI.h | 122 --- .../dynamic_fusion/ClKernelBuildingImpl/Common.h | 930 --------------------- .../dynamic_fusion/ClKernelBuildingImpl/Utils.h | 76 -- .../ClDirectConvolutionKernelComponent.cpp | 409 --------- .../ClDirectConvolutionKernelComponent.h | 81 -- .../components/ClElementwiseKernelComponent.cpp | 266 ------ .../components/ClElementwiseKernelComponent.h | 90 -- .../components/ClFloorKernelComponent.cpp | 153 ---- .../components/ClFloorKernelComponent.h | 85 -- .../components/ClKernelComponents.h | 35 - .../components/ClStoreKernelComponents.cpp | 171 ---- 
.../components/ClStoreKernelComponents.h | 97 --- .../experimental/dynamic_fusion/OperatorGraph.cpp | 264 ------ .../WorkloadImpl/ClFusedKernelGraph.cpp | 232 ----- .../WorkloadImpl/ClFusedKernelGraph.h | 452 ---------- .../WorkloadImpl/ClKernelDescriptors.h | 121 --- .../dynamic_fusion/WorkloadImpl/ClKernelGraph.cpp | 271 ------ .../dynamic_fusion/WorkloadImpl/ClKernelGraph.h | 259 ------ .../dynamic_fusion/WorkloadImpl/ClWorkload.cpp | 72 -- .../WorkloadImpl/DependencyGraph.cpp | 430 ---------- .../dynamic_fusion/WorkloadImpl/ITensorDescPack.h | 241 ------ .../WorkloadImpl/OperatorGraphImpl.cpp | 423 ---------- .../WorkloadImpl/OperatorGraphImpl.h | 252 ------ .../dynamic_fusion/ClCompositeKernel.cpp | 200 ----- .../dynamic_fusion/ClCompositeKernel.h | 76 -- .../dynamic_fusion/ClCompositeOperator.cpp | 241 ------ src/runtime/CL/CLScheduler.cpp | 41 - src/runtime/CL/CLTuner.cpp | 32 - tests/SConscript | 1 - .../dynamic_fusion/ArbitraryElementwiseFusion.cpp | 394 --------- .../CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp | 215 ----- .../CL/UNIT/dynamic_fusion/DependencyGraph.cpp | 266 ------ tests/validation/CL/UNIT/dynamic_fusion/Floor.cpp | 135 --- .../Integration_OperatorFuseMovenetSubGraph1.cpp | 402 --------- tests/validation/CL/UNIT/dynamic_fusion/Utils.h | 71 -- tests/validation/dynamic_fusion/Utils.h | 73 ++ .../validation/dynamic_fusion/gpu/Integration.cpp | 2 +- 53 files changed, 75 insertions(+), 9527 deletions(-) delete mode 100644 arm_compute/core/experimental/ClWorkload.h delete mode 100644 arm_compute/core/experimental/DependencyGraph.h delete mode 100644 arm_compute/core/experimental/IWorkload.h delete mode 100644 arm_compute/core/experimental/OperatorGraph.h delete mode 100644 arm_compute/runtime/experimental/ClCompositeOperator.h delete mode 100644 examples/dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp delete mode 100644 examples/dynamic_fusion/cl_ref_conv2d_elementwise_add.cpp delete mode 100644 
src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp delete mode 100644 src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h delete mode 100644 src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h delete mode 100644 src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Utils.h delete mode 100644 src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp delete mode 100644 src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h delete mode 100644 src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseKernelComponent.cpp delete mode 100644 src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseKernelComponent.h delete mode 100644 src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClFloorKernelComponent.cpp delete mode 100644 src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClFloorKernelComponent.h delete mode 100644 src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClKernelComponents.h delete mode 100644 src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp delete mode 100644 src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h delete mode 100644 src/core/experimental/dynamic_fusion/OperatorGraph.cpp delete mode 100644 src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.cpp delete mode 100644 src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.h delete mode 100644 src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelDescriptors.h delete mode 100644 src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.cpp delete mode 100644 src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h delete mode 100644 src/core/experimental/dynamic_fusion/WorkloadImpl/ClWorkload.cpp delete mode 100644 
src/core/experimental/dynamic_fusion/WorkloadImpl/DependencyGraph.cpp delete mode 100644 src/core/experimental/dynamic_fusion/WorkloadImpl/ITensorDescPack.h delete mode 100644 src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.cpp delete mode 100644 src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.h delete mode 100644 src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.cpp delete mode 100644 src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.h delete mode 100644 src/gpu/cl/operators/experimental/dynamic_fusion/ClCompositeOperator.cpp delete mode 100644 tests/validation/CL/UNIT/dynamic_fusion/ArbitraryElementwiseFusion.cpp delete mode 100644 tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp delete mode 100644 tests/validation/CL/UNIT/dynamic_fusion/DependencyGraph.cpp delete mode 100644 tests/validation/CL/UNIT/dynamic_fusion/Floor.cpp delete mode 100644 tests/validation/CL/UNIT/dynamic_fusion/Integration_OperatorFuseMovenetSubGraph1.cpp delete mode 100644 tests/validation/CL/UNIT/dynamic_fusion/Utils.h create mode 100644 tests/validation/dynamic_fusion/Utils.h diff --git a/Android.bp b/Android.bp index 89a7a43060..69a0affdb2 100644 --- a/Android.bp +++ b/Android.bp @@ -373,17 +373,6 @@ cc_library_static { "src/core/Utils.cpp", "src/core/Validate.cpp", "src/core/Version.cpp", - "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp", - "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp", - "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseKernelComponent.cpp", - "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClFloorKernelComponent.cpp", - "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp", - "src/core/experimental/dynamic_fusion/OperatorGraph.cpp", - "src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.cpp", - 
"src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.cpp", - "src/core/experimental/dynamic_fusion/WorkloadImpl/ClWorkload.cpp", - "src/core/experimental/dynamic_fusion/WorkloadImpl/DependencyGraph.cpp", - "src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.cpp", "src/core/helpers/SoftmaxHelpers.cpp", "src/core/helpers/WindowHelpers.cpp", "src/core/utils/AssemblyUtils.cpp", @@ -667,7 +656,6 @@ cc_library_static { "src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp", "src/gpu/cl/kernels/direct_conv/ClDirectConvDefaultConfigBifrost.cpp", "src/gpu/cl/kernels/direct_conv/ClDirectConvDefaultConfigValhall.cpp", - "src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.cpp", "src/gpu/cl/kernels/gemm/ClGemmHelpers.cpp", "src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp", "src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp", @@ -711,7 +699,6 @@ cc_library_static { "src/gpu/cl/operators/ClTranspose.cpp", "src/gpu/cl/operators/ClTransposedConvolution.cpp", "src/gpu/cl/operators/ClWinogradConv2d.cpp", - "src/gpu/cl/operators/experimental/dynamic_fusion/ClCompositeOperator.cpp", "src/runtime/Allocator.cpp", "src/runtime/BlobLifetimeManager.cpp", "src/runtime/BlobMemoryPool.cpp", diff --git a/SConscript b/SConscript index 42a03f0a04..908fbff626 100644 --- a/SConscript +++ b/SConscript @@ -507,7 +507,6 @@ lib_files = filelist['common'] # Dynamic fusion if env['experimental_dynamic_fusion']: lib_files += filelist['experimental']['dynamic_fusion'] - arm_compute_env.Append(CPPDEFINES = ['ENABLE_EXPERIMENTAL_DYNAMIC_FUSION']) # Fixed format GEMM kernels. if env['experimental_fixed_format_kernels']: diff --git a/arm_compute/core/experimental/ClWorkload.h b/arm_compute/core/experimental/ClWorkload.h deleted file mode 100644 index 9b2040a046..0000000000 --- a/arm_compute/core/experimental/ClWorkload.h +++ /dev/null @@ -1,219 +0,0 @@ -/* - * Copyright (c) 2022 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION -#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLWORKLOAD_H -#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLWORKLOAD_H - -#include "arm_compute/core/CL/CLCompileContext.h" -#include "arm_compute/core/GPUTarget.h" -#include "arm_compute/core/Window.h" - -#include "arm_compute/core/experimental/IWorkload.h" -#include "arm_compute/core/experimental/OperatorGraph.h" - -#include - -namespace arm_compute -{ -namespace experimental -{ -namespace dynamic_fusion -{ -/** Verbose and explicit way to enumerate all the tensor arguments variants used by - * all kernel implementations. 
This avoids any ambiguity in what kernel arguments are passed - */ -enum class ClKernelTensorArgType : int -{ - Scalar, - - Vector, - - Image, - Image_Reinterpret_As_3D, - Image_Export_To_ClImage2D, - - Image_3D, // 3D Tensor represented as a 2D Image + stride_z - Image_3D_Export_To_ClImage2D, - - Tensor_3D, - Tensor_4D, - Tensor_4D_t_Buffer, - Tensor_4D_t_Image -}; - -/** Describes all the info required to add a kernel argument at run time - * - * @note This struct can later be expanded into a more concise and formal way to specify how to set up - * arguments for a kernel inside a @ref ClUnitWorkload - */ -struct ClKernelArgDescriptor -{ - ClKernelArgDescriptor() = default; - ClKernelArgDescriptor(int arg_id, ClKernelTensorArgType type, bool slide_along_dimz = true) - : arg_id{ arg_id }, tensor_arg_type{ type }, slide_along_dimz{ slide_along_dimz } - { - } - ~ClKernelArgDescriptor() = default; - friend bool operator==(const ClKernelArgDescriptor &arg0, const ClKernelArgDescriptor &arg1) - { - return (arg0.tensor_arg_type == arg1.tensor_arg_type) && (arg0.slide_along_dimz == arg1.slide_along_dimz); - } - int arg_id{ -1 }; /**< Arg ID in the blueprint, -1 means empty / uninitialized */ - ClKernelTensorArgType tensor_arg_type{ ClKernelTensorArgType::Image }; /**< tensor argument type */ - bool slide_along_dimz{ true }; /**< @note slide_along_dimz will be moved out of this descriptor in later iterations */ -}; - -using ClKernelArgList = std::map; - -/** Descriptor containing information required to run a single ClWorkload - */ -struct ClExecutionDescriptor -{ - cl::NDRange suggested_lws{}; /**< Suggested local work-group size for optimal performance if not zero */ - cl::NDRange gws{}; /**< Global work-group to be used */ - bool skip_sliding_window{ false }; /**< Skip sliding window slices during execution loop */ -}; - -/** Contains kernel code to be compiled and run in a ClUnitWorkload - */ -struct ClKernelCode -{ - friend bool operator==(const ClKernelCode &code0, 
const ClKernelCode &code1) - { - return (code0.name == code1.name) && (code0.code == code1.code) && (code0.config_id == code1.config_id) && (code0.build_options == code1.build_options) && (code0.window == code1.window) - && (code0.arguments == code1.arguments); - } - std::string name{}; /**< Kernel name */ - std::string code{}; /**< Kernel source code */ - std::string config_id{}; /**< Generated from blueprint based on complex component */ - CLBuildOptions build_options{}; /**< Kernel build options */ - Window window{}; /**< Execution window */ - ClKernelArgList arguments{}; /**< Kernel argument descriptors. map key is kernel ArgumentID */ -}; - -/** A descriptor of ClWorkload Tensors. - */ -struct ClWorkloadTensor : public WorkloadTensor -{ - ClWorkloadTensor() = default; - ClWorkloadTensor(Id id, ITensorInfo *info, MemoryType memory_type, const AuxMemoryInfo &memory_info, const ClKernelArgDescriptor &kernel_arg) - : WorkloadTensor{ id, info, memory_type, memory_info }, kernel_arg{ kernel_arg } - { - } - ClKernelArgDescriptor kernel_arg{}; - friend bool operator==(const ClWorkloadTensor &t0, const ClWorkloadTensor &t1) - { - return t0.info == t1.info && t0.memory_info == t1.memory_info && t0.memory_type == t1.memory_type && t0.kernel_arg == t1.kernel_arg; - } -}; - -/** The basic atomic unit in a @ref ClWorkload. It contains exactly one kernel to run. 
- */ -struct ClUnitWorkload : public UnitWorkload -{ - ClUnitWorkload() = default; - ClUnitWorkload(Id id, UnitWorkloadStage stage, const ClKernelCode &code) - : UnitWorkload{ id, stage }, code{ code } - { - } - friend bool operator==(const ClUnitWorkload &uworkload0, const ClUnitWorkload &uworkload1) - { - return uworkload0.stage == uworkload1.stage && uworkload0.code == uworkload1.code; - } - ClKernelCode code{}; -}; - -/** GPU information for @ref ClWorkloadContext - */ -struct GpuInfo -{ - friend bool operator==(const GpuInfo &info0, const GpuInfo &info1) - { - return info0.target == info1.target; - } - GPUTarget target{ GPUTarget::UNKNOWN }; -}; - -/** Context (device capabilities, platform details) associated with a ClWorkload - * - * It is required for building the @ref ClKernelCode and could also be used by the runtime (e.g. schedulers) - */ -struct ClWorkloadContext -{ - friend bool operator==(const ClWorkloadContext &ctx0, const ClWorkloadContext &ctx1) - { - return ctx0.gpu_info == ctx1.gpu_info; - } - GpuInfo gpu_info{}; -}; - -/** Workload for Cl backend - */ -struct ClWorkload : public IWorkload -{ - Tid add_workload_tensor(ITensorInfo *info, MemoryType memory_type, const AuxMemoryInfo &memory_info, const ClKernelArgDescriptor &kernel_arg, Tid merge_point) - { - Tid id = graph.add_tensor(merge_point); - if(tensors.find(id) == tensors.end()) - { - tensors[id] = ClWorkloadTensor(id, info, memory_type, memory_info, kernel_arg); - } - return id; - } - UnitWorkId add_unit_workload(UnitWorkloadStage stage, const ClKernelCode &code, const std::vector &inputs, const std::vector &outputs) - { - auto op = graph.add_operator(inputs, outputs); - auto id = op.second; - unit_workloads[id] = ClUnitWorkload(id, stage, code); - return id; - } - friend bool operator==(const ClWorkload &workload0, const ClWorkload &workload1) - { - return std::make_tuple( - workload0.graph, workload0.context, workload0.unit_workloads, workload0.tensors, workload0.op_tensor_id_lut) - == 
std::make_tuple( - workload1.graph, workload1.context, workload1.unit_workloads, workload1.tensors, workload1.op_tensor_id_lut); - } - ClWorkloadContext context{}; /**< Workload context*/ - std::map unit_workloads{}; /**< Unit workloads to run*/ - std::map tensors{}; /**< Workload tensors*/ - std::map op_tensor_id_lut{}; /**< Map from ClWorkloadTensor to SRC and DST Operator Tensors (no need to store "intermediate" Operator Tensors)*/ - Status status{}; /**< For compatibility with the IOperator validate method. Store if the workload is valid or not. */ -}; - -/** Build a @ref ClWorkload from an @ref OperatorGraph. - * - * @param[out] workload - * @param[in] op_graph - * @param[in] ctx - * @return Status - */ -Status build(ClWorkload &workload, const OperatorGraph &op_graph, const ClWorkloadContext &ctx); - -} // namespace dynamic_fusion -} // namespace experimental -} // namespace arm_compute - -#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLWORKLOAD_H -#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file diff --git a/arm_compute/core/experimental/DependencyGraph.h b/arm_compute/core/experimental/DependencyGraph.h deleted file mode 100644 index e0d6ff9ba9..0000000000 --- a/arm_compute/core/experimental/DependencyGraph.h +++ /dev/null @@ -1,277 +0,0 @@ -/* - * Copyright (c) 2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION -#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_DEPENDENCYGRAPH_H -#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_DEPENDENCYGRAPH_H - -#include "arm_compute/core/Error.h" - -#include -#include -#include - -namespace arm_compute -{ -namespace experimental -{ -namespace dynamic_fusion -{ -template -bool is_in(const T &v, const std::vector &vec) -{ - return std::find(std::begin(vec), std::end(vec), v) != std::end(vec); -} - -/** The dependency graph of a workload, where the nodes are of 2 types: Tensor or Operator - * Represented as a doubly-linked adjacency list with the differentiation between source and destination - * - * A "Merge Tensor" is an external tensor associated with the tensor within the graph, and serve as a merge point - */ -class DependencyGraph -{ -public: - /** A serial Id allocator - * - */ - class SerialIdAllocator - { - public: - using Id = int; - Id alloc() - { - return _counter++; - } - constexpr static Id empty() - { - return -1; - } - - private: - Id _counter{ 0 }; - }; - using Id = SerialIdAllocator::Id; - /** Adjacency list - * - */ - using AdjList = std::map>; - - /** A pack of operator including its input and output tensors, used by traversing through the graph in topological order - * - */ - struct OpPack - { - Id op{}; - std::vector inputs{}; - std::vector outputs{}; - friend bool operator==(const OpPack &opp0, const OpPack &opp1) - { - return std::make_tuple( - opp0.op, opp0.inputs, opp0.outputs) - == 
std::make_tuple( - opp1.op, opp1.inputs, opp1.outputs); - } - }; - -public: - constexpr static Id empty_id() - { - return SerialIdAllocator::empty(); - } - - DependencyGraph() = default; - // Used in cases where two DependencyGraphs may want to share the same configuration of tensors - explicit DependencyGraph(const std::vector &imported_tensors); - // Testing only - DependencyGraph(const AdjList &adj_src_tensors, const AdjList &adj_dst_tensors, const AdjList &adj_src_ops, const AdjList &adj_dst_ops, std::map merge_points = {}); - - /** Add a new tensor - * - * @param merge_tensor The external merge point associated with the tensor. Leave empty if not needed. - * @return Id The newly allocated tensor, or a previously added tensor associated with @p merge_tensor - */ - Id add_tensor(Id merge_tensor = empty_id()); - - void remove_tensor(Id tensor); - - /** Add a new operator - * - * @param inputs Input tensors to the operator - * @param outputs Output tensors to the operator - * @return std::pair where id is the newly allocated operator - */ - std::pair add_operator(const std::vector &inputs, const std::vector &outputs); - - void remove_operator(Id op); - /** Sort the graph in a topological order - * - * @return std::pair> - */ - std::pair> topological_sort() const; - - std::vector src_ops(Id op) const; - std::vector dst_ops(Id op) const; - - std::vector src_ops_from_tensor(Id tensor) const; - std::vector dst_ops_from_tensor(Id tensor) const; - /** Get the merge points object - * - * @return std::map - */ - std::map get_merge_points() const; - /** Get all root ops. 
Root ops can also be referred to as "src ops" of the whole graph - * - * @return std::vector - */ - std::vector get_root_ops() const; - /** Get all dst ops of the whole graph - * - * @return std::vector - */ - std::vector get_dst_ops() const; - - /** Get source tensors to an operator - * - * @param op - * @return std::vector - */ - std::vector src_tensors(Id op) const; - /** Get destination tensors to an operator - * - * @param op - * @return std::vector - */ - std::vector dst_tensors(Id op) const; - /** Get source tensors of the whole graph - * - * @return std::vector - */ - std::vector src_tensors() const; - /** Get destination tensors of the whole graph - * - * @return std::vector - */ - std::vector dst_tensors() const; - /** Get all operators - * - * @return std::vector - */ - std::vector all_ops() const; - /** Get all tensors - * - * @return std::vector - */ - std::vector all_tensors() const; - /** Number of operators - * - * @return unsigned int - */ - unsigned int number_of_ops() const; - /** Number of tensors - * - * @return unsigned int - */ - unsigned int number_of_tensors() const; - - /** Update @p merge_point to point to @p t_id - * - * @param t_id - * @param merge_point - */ - Status update_merge_point(Id t_id, Id merge_point); - - /** Strict equality comparison (all internal ids and order of insertion matter). 
- * In the future this may be replaced with a topological comparison, allowing equivalent graphs with different internal ids to be equal - * - * - * @param g0 - * @param g1 - * @return true - * @return false - */ - friend bool operator==(const DependencyGraph &g0, const DependencyGraph &g1) - { - // Do not compare id allocators - return std::make_tuple( - g0._adj_src_tensors, g0._adj_dst_tensors, g0._adj_src_ops, g0._adj_dst_ops, g0._merge_to_internal) - == std::make_tuple( - g1._adj_src_tensors, g1._adj_dst_tensors, g1._adj_src_ops, g1._adj_dst_ops, g1._merge_to_internal); - } - void link_input(Id op, Id in_tensor); - void link_output(Id op, Id out_tensor); - /** Check if there's a path from @p src_tensor to @p dst_op - * - * @param src_tensor - * @param dst_op - * @return true - * @return false - */ - bool path_exists_from_tensor_to_op(Id src_tensor, Id dst_op) const; - /** Check if there's a path from @p src_op to @p dst_op - * - * @param src_op - * @param dst_op - * @return true - * @return false - */ - bool path_exists_from_op_to_op(Id src_op, Id dst_op) const; - /** Check if tensor is the src tensor of the entire graph - * - * @param tensor - * @return true - * @return false - */ - bool is_src_tensor(Id tensor) const; - /** Check if tensor is the dst tensor of the entire graph - * - * @param tensor - * @return true - * @return false - */ - bool is_dst_tensor(Id tensor) const; - -private: - Id insert_new_tensor(); - Id insert_new_op(); - bool tensor_exists(Id tensor) const; - bool operator_exists(Id op) const; - bool is_src_tensor_of(Id op, Id tensor) const; - bool is_dst_tensor_of(Id op, Id tensor) const; - bool are_connected(Id op, Id tensor) const; - -private: - AdjList _adj_src_tensors{}; - AdjList _adj_dst_tensors{}; - AdjList _adj_src_ops{}; - AdjList _adj_dst_ops{}; - std::map _merge_to_internal{}; // From merge tensor to internal tensor - SerialIdAllocator _operator_id{}; - SerialIdAllocator _tensor_id{}; -}; - -} // namespace dynamic_fusion -} // 
namespace experimental -} // namespace arm_compute - -#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_DEPENDENCYGRAPH_H -#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file diff --git a/arm_compute/core/experimental/IWorkload.h b/arm_compute/core/experimental/IWorkload.h deleted file mode 100644 index 54855c1084..0000000000 --- a/arm_compute/core/experimental/IWorkload.h +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Copyright (c) 2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION -#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IWORKLOAD_H -#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IWORKLOAD_H - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/ITensorInfo.h" -#include "arm_compute/core/experimental/Types.h" - -#include "arm_compute/core/experimental/DependencyGraph.h" - -namespace arm_compute -{ -namespace experimental -{ -namespace dynamic_fusion -{ -/** Describes when a Unit Workload is run. - * - */ -struct UnitWorkloadStage -{ - enum class Stage - { - Prepare, /**< Only run once at the beginning. */ - Run, /**< Run every time after the first time. */ - }; - Stage stage; - friend bool operator==(const UnitWorkloadStage &stage0, const UnitWorkloadStage &stage1) - { - return stage0.stage == stage1.stage; - } -}; -/** Type of memory used by a Workload Tensor - * - */ -enum class MemoryType -{ - Core = 0, /**< Core memory used by the Workload Tensor, e.g. for argument tensors */ - Auxiliary = 1, /**< Auxiliary memory required by the Workload Tensor, e.g. for temporary tensors */ -}; - -using AuxMemoryLifetime = MemoryLifetime; - -/** Memory Info for a @ref WorkloadTensor of Auxiliary memory type. 
This communicates to the user how much additional - * memory is required for auxiliary tensors - */ -struct AuxMemoryInfo -{ - AuxMemoryInfo() = default; - - AuxMemoryInfo(size_t size, size_t alignment = 0) noexcept - : size(size), - alignment(alignment) - { - } - - AuxMemoryInfo(AuxMemoryLifetime lifetime, size_t size, size_t alignment = 0) noexcept - : lifetime(lifetime), - size(size), - alignment(alignment) - { - } - friend bool operator==(const AuxMemoryInfo &info0, const AuxMemoryInfo &info1) - { - return info0.lifetime == info1.lifetime && info0.size == info1.size && info0.alignment == info1.alignment; - } - - AuxMemoryLifetime lifetime{ AuxMemoryLifetime::Temporary }; /**< Memory lifetime*/ - size_t size{ 0 }; /**< Total memory size in bytes */ - size_t alignment{ 64 }; /**< Memory alignment in bytes */ -}; - -/** A descriptor for IWorkload Tensors. - */ -struct WorkloadTensor -{ - using Id = DependencyGraph::Id; - Id id{}; /**< Id of the workload tensor */ - ITensorInfo *info{}; /**< TensorInfo associated with the workload tensor */ - MemoryType memory_type{}; /**< Memory type */ - AuxMemoryInfo memory_info{}; /**< Auxiliary memory information. This can be ignored if the memory type is Core */ -}; -/** The basic atomic unit in an @ref IWorkload. It contains exactly one kernel to run. 
- * - */ -struct UnitWorkload -{ - using Id = DependencyGraph::Id; - Id id{}; /**< Id of the unit workload */ - UnitWorkloadStage stage{}; /**< Stage */ -}; - -/** Run-time-agnostic, platform-specific graph that describes everything required to run a workload - * It can be configured into an Arm Compute Library runtime, integrated into the runtime of another framework, or integrated into the compilation flow - */ -struct IWorkload -{ - using UnitWorkId = UnitWorkload::Id; - using Tid = WorkloadTensor::Id; - IWorkload() = default; - virtual ~IWorkload() = default; - DependencyGraph graph{}; /**< Dependency graph of the workload tensors and the unit workloads */ -}; - -} // namespace dynamic_fusion -} // namespace experimental -} // namespace arm_compute -#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IWORKLOAD_H -#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file diff --git a/arm_compute/core/experimental/OperatorGraph.h b/arm_compute/core/experimental/OperatorGraph.h deleted file mode 100644 index f40ad0d8c5..0000000000 --- a/arm_compute/core/experimental/OperatorGraph.h +++ /dev/null @@ -1,217 +0,0 @@ -/* - * Copyright (c) 2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION - -#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_OPERATORGRAPH -#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_OPERATORGRAPH - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/ITensorInfo.h" - -#include - -namespace arm_compute -{ -namespace experimental -{ -namespace dynamic_fusion -{ -/** Graph of operators to execute within a Workload. This is a pure descriptive construct. - */ -class OperatorGraph final -{ -public: - struct Implementation; - OperatorGraph(); - ~OperatorGraph(); - -public: - Implementation *impl(); - const Implementation *impl() const; - -private: - std::unique_ptr _impl; -}; - -/** Return the validity of @p op_graph, usually after performing an operation (e.g. 
add_tensor) on it - * - * @param[in,out] op_graph OperatorGraph to be validated - * - * @return Status - */ -Status validate(const OperatorGraph &op_graph); - -/** Operator Tensor Handle - * This can be either an argument tensor, or an intermediate tensor linking 2 @ref Operator s - */ -class OpTensor final -{ -public: - using Id = int; - OpTensor(Id id = {}); - /** Id of the OpTensor - * @return Id - */ - Id id() const; - -private: - Id _id{}; -}; - -/** Provide order of @ref OpTensor by checking if @p t0 is "lower than" @p t1 - * - * @param[in] t0 OpTensor - * @param[in] t1 OpTensor - * - * @return true if @p t0 is lower than @p t1 - * @return false otherwise - */ -bool operator<(const OpTensor &t0, const OpTensor &t1); - -/** Associate a TensorInfo with a newly created @ref OpTensor in the @p graph. - * - * @note @p info needs to remain in scope and valid until the workload has finished building - * @note Can pass in an empty TensorInfo for a destination Tensor, in which case @p info will be inferred from the source tensors - * - * @param[in,out] graph OperatorGraph where the tensor is added - * @param[in] info TensorInfo to be associated - * - * @return OpTensor - */ -OpTensor add_tensor(OperatorGraph &graph, ITensorInfo &info); - -/** Operator Handle - * This can be used to further modify an existing operator - */ -class Operator final -{ -public: - using Id = int; - Operator(Id id = {}); - /** Id of the Operator - * @return Id - */ - Id id() const; - -private: - Id _id{}; -}; - -/** Provide order of @ref Operator by checking if @p op0 is "lower than" @p op1 - * - * @param[in] op0 Operator - * @param[in] op1 Operator - * - * @return true if @p op0 is lower than @p op1 - * @return false otherwise - */ -bool operator<(const Operator &op0, const Operator &op1); - -/** Descriptor for Conv2dDescriptor operation - */ -struct Conv2dDescriptor -{ - /* TOSA compliant attribute parameters start */ - Padding2D pad{}; - Size2D stride{ 1U, 1U }; - Size2D dilation{ 1U, 1U 
}; - /* TOSA compliant attribute parameters end */ - /* Non-TOSA compliant attribute parameters start */ - /* Non-TOSA compliant attribute parameters end */ -}; -/** Add op Conv2d to @p graph - * - * @param[in,out] graph OperatorGraph where the operator is added to - * @param[in] desc Operator descriptor - * @param[in] input Input OpTensor - * @param[in] weights Weights OpTensor - * @param[in] bias (Optional) bias OpTensor - * @param[in] dst Destination OpTensor - * - * @return Operator - */ -Operator add_op_conv2d(OperatorGraph &graph, const Conv2dDescriptor &desc, OpTensor input, OpTensor weights, OpTensor bias, OpTensor dst); -Operator add_op_conv2d(OperatorGraph &graph, const Conv2dDescriptor &desc, OpTensor input, OpTensor weights, OpTensor dst); -/** (Only for Debuging and Testing) Force a conv2d method - * - * @param[in,out] graph OperatorGraph where conv2d op is located - * @param[in] conv2d Conv2d Op - * @param[in] method Forced ConvolutionMethod - */ -void force_conv2d_method(OperatorGraph &graph, Operator conv2d, ConvolutionMethod method); - -/** Descriptor for Elementwise binary operation - * - */ -struct ElementwiseDescriptor -{ - /* TOSA compliant attribute parameters start */ - /* TOSA compliant attribute parameters end */ - /* Non-TOSA compliant attribute parameters start */ - ArithmeticOperation op; - /* Non-TOSA compliant attribute parameters end */ -}; -/** Add op Elementwise to @p graph, and optionally describes fusion through passing of intermediate @ref OpTensor s - * - * @param[in,out] graph OperatorGraph where the operator is added to - * @param[in] desc Operator descriptor - * @param[in] lhs Lhs OpTensor - * @param[in] rhs Rhs OpTensor - * @param[in] dst Destination OpTensor - * - * @return Operator - */ -Operator add_op_elementwise_op(OperatorGraph &graph, const ElementwiseDescriptor &desc, OpTensor lhs, OpTensor rhs, OpTensor dst); - -/** Descriptor for Floor operation - * - */ -struct FloorDescriptor -{ - /* TOSA compliant attribute 
parameters start */ - /* TOSA compliant attribute parameters end */ - /* Non-TOSA compliant attribute parameters start */ - /* Non-TOSA compliant attribute parameters end */ -}; -/** Add op Floor to @p graph, and optionally describes fusion through passing of intermediate @ref OpTensor s - * - * @param[in,out] graph OperatorGraph where the operator is added to - * @param[in] desc Operator descriptor - * @param[in] src Source OpTensor - * @param[in] dst Destination OpTensor - * - * @return Operator - */ -Operator add_op_floor(OperatorGraph &graph, const FloorDescriptor &desc, OpTensor src, OpTensor dst); - -bool operator==(const OpTensor &t0, const OpTensor &t1); -bool operator==(const Conv2dDescriptor &conv2d0, const Conv2dDescriptor &conv2d1); -bool operator==(const ElementwiseDescriptor &, const ElementwiseDescriptor &); -bool operator==(const FloorDescriptor &, const FloorDescriptor &); - -} // namespace dynamic_fusion -} // namespace experimental -} // namespace arm_compute -#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_OPERATORGRAPH -#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ diff --git a/arm_compute/runtime/CL/CLScheduler.h b/arm_compute/runtime/CL/CLScheduler.h index 3919635d1b..3030239270 100644 --- a/arm_compute/runtime/CL/CLScheduler.h +++ b/arm_compute/runtime/CL/CLScheduler.h @@ -35,19 +35,6 @@ #include "arm_compute/runtime/CL/CLTypes.h" #include "arm_compute/runtime/CL/ICLTuner.h" -#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) -namespace arm_compute -{ -namespace experimental -{ -namespace dynamic_fusion -{ -struct ClExecutionDescriptor; -} // namespace dynamic_fusion -} // namespace experimental -} // namespace arm_compute -#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) - namespace arm_compute { class ICLKernel; @@ -108,20 +95,6 @@ public: * @param[in] flush (Optional) Specifies if the command queue will be flushed after running the kernel. This will be ignored if job chaining is enabled. 
*/ void enqueue_op(ICLKernel &kernel, ITensorPack &tensors, bool flush = true); - -#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) - - /** Schedule the execution of the passed kernel if possible. - * - * @param[in] kernel Kernel to execute. - * @param[in] tensors Map containing the tensors to operate on. - * @param[in] exec_desc Execution descriptor - * @param[in] flush (Optional) Specifies if the command queue will be flushed after running the kernel. This will be ignored if job chaining is enabled. - */ - void enqueue_op(ICLKernel &kernel, ITensorPack &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc, bool flush = true); - -#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) - /** Initialises the context and command queue to be used by the scheduler. * * @param[in] context A CL context. @@ -214,10 +187,6 @@ private: */ void flush_queue(bool flush); -#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) - void enqueue_common(ICLKernel &kernel, ITensorPack &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc, bool flush); -#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) - /** Flag to ensure symbols initialisation is happening before Scheduler creation */ static std::once_flag _initialize_symbols; diff --git a/arm_compute/runtime/CL/CLTuner.h b/arm_compute/runtime/CL/CLTuner.h index 88933fc2d8..93aa45adc1 100644 --- a/arm_compute/runtime/CL/CLTuner.h +++ b/arm_compute/runtime/CL/CLTuner.h @@ -124,10 +124,6 @@ public: void tune_kernel_static(ICLKernel &kernel) override; void tune_kernel_dynamic(ICLKernel &kernel) override; void tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors) override; -#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) - void tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc) override; -#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) - /** Is the kernel_event set ? 
* * @return true if the kernel_event is set. diff --git a/arm_compute/runtime/CL/ICLTuner.h b/arm_compute/runtime/CL/ICLTuner.h index e0ee3ffe71..fa7a1424b8 100644 --- a/arm_compute/runtime/CL/ICLTuner.h +++ b/arm_compute/runtime/CL/ICLTuner.h @@ -30,15 +30,6 @@ namespace arm_compute { class ICLKernel; -#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) -namespace experimental -{ -namespace dynamic_fusion -{ -struct ClExecutionDescriptor; -} // namespace dynamic_fusion -} // namespace experimental -#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) /** Basic interface for tuning the OpenCL kernels */ class ICLTuner { @@ -66,15 +57,6 @@ public: * @param[in, out] tensors Tensors for the kernel to use */ virtual void tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors) = 0; -#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) - /** Tune OpenCL kernel dynamically for dynamic fusion interface - * - * @param[in] kernel Kernel to tune - * @param[in, out] tensors Tensors for the kernel to use - * @param[in] exec_desc Execution descriptor - */ - virtual void tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc) = 0; -#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) }; } // namespace arm_compute #endif /*ARM_COMPUTE_ICLTUNER_H */ diff --git a/arm_compute/runtime/experimental/ClCompositeOperator.h b/arm_compute/runtime/experimental/ClCompositeOperator.h deleted file mode 100644 index 827629bd82..0000000000 --- a/arm_compute/runtime/experimental/ClCompositeOperator.h +++ /dev/null @@ -1,190 +0,0 @@ -/* - * Copyright (c) 2022 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION -#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMIC_FUSION_CLCOMPOSITEOPERATOR_H -#define ARM_COMPUTE_EXPERIMENTAL_DYNAMIC_FUSION_CLCOMPOSITEOPERATOR_H - -#include "arm_compute/core/CL/CLCompileContext.h" -#include "arm_compute/runtime/CL/CLTensor.h" -#include "arm_compute/runtime/IOperator.h" - -#include "arm_compute/core/experimental/ClWorkload.h" - -#include - -namespace arm_compute -{ -namespace experimental -{ -namespace dynamic_fusion -{ -/** Map OpTensor handles to their corresponding ITensor memory - */ -using OpTensorBinding = std::map; - -/** Map a kernel (as identified by its unit workload id) to its corresponding tensor pack - * - * @note External user should not use the add_tensor_pack method to alter this tensor pack map, and should only use the map returned by @ref bind_tensors - */ -class TensorPackMap -{ -public: - /** Find a tensor pack associated with the unit workload Id @p uwk_id - * - * @param[in] uwk_id unit workload Id associated with the tensor pack - * - * @return ITensorPack* - */ - ITensorPack *find_tensor_pack(UnitWorkload::Id uwk_id); - /** Get a tensor pack associated with @p uwk_id. Throws a exception if it cannot be found. - * - * @param[in] uwk_id unit workload Id associated with the tensor pack - * - * @return ITensorPack* - */ - ITensorPack &get_tensor_pack(UnitWorkload::Id uwk_id); - /** Add a tensor pack and associate it with unit workload Id @p uwk_id - * @note Should not be used by external user - * - * @param[in] uwk_id unit workload Id associated with the tensor pack - * @param[in] tensor_pack Tensor Pack to be added - */ - void add_tensor_pack(UnitWorkload::Id uwk_id, const ITensorPack &tensor_pack); - -private: - std::map _tensor_packs{}; -}; - -/** Holder of any auxiliary CLTensors required by a ClWorkload. 
- * - * @note The tensors are not allocated by default, and require the user to explicitly allocate them using the TensorInfo and AuxMemoryInfo - * - * @note This data holder must remain valid until the ClCompositeOperator that it's passed to is out of scope - * - * @note External user should not use the add_aux_tensor method, and should only use the data returned by @ref bind_tensors - */ -class ClAuxTensorData -{ -public: - /** A view of a single auxiliary data and the associated TensorInfo and AuxMemoryInfo - */ - struct DataView - { - DataView() = default; - DataView(CLTensor *tensor, const TensorInfo &tensor_info, const AuxMemoryInfo &memory_info) - : tensor{ tensor }, tensor_info{ tensor_info }, memory_info{ memory_info } - { - } - ~DataView() = default; - DataView(const DataView &other) = default; - DataView &operator=(const DataView &other) = default; - DataView(DataView &&other) = default; - DataView &operator=(DataView &&other) = default; - CLTensor *tensor{}; /**< Pointer to the auxiliary tensor */ - TensorInfo tensor_info{}; /**< Associated TensorInfo */ - AuxMemoryInfo memory_info{}; /**< Memory requirement */ - }; - - /** Add auxiliary tensor. - * - * @note Should not be used by external user - * - * @param[in] tensor_id Any Id that can uniquely identify an auxiliary tensor. Usually ClWorkloadTensor Id - * @param[in] tensor_info TensorInfo associated with the tensor - * @param[in] memory_info Memory requirements - * - * @return CLTensor* if successfully added, otherwise nullptr - */ - CLTensor *add_aux_tensor(int tensor_id, const ITensorInfo &tensor_info, const AuxMemoryInfo &memory_info); - - /** Get views of all auxiliary tensors. This is mainly used for allocating the auxiliary tensors. - * - * @return std::vector& - */ - std::vector &get_tensors(); - -private: - std::map> _owned_tensors{}; - std::vector _tensors{}; -}; - -/** Bind tensor memory to packs used by prepare and run methods. 
Create auxiliary tensor objects and their memory requirements if needed - * - * @note This is the only method for external user to create ClAuxTensorData, and the prepare and run TensorPackMaps - * - * @param[out] aux_tensor_data Auxiliary Tensors required by the workload - * @param[out] prepare_pack_map TensorPackMap used by the prepare method - * @param[out] run_pack_map TensorPackMap used by the run method - * @param[in] workload ClWorkload to bind the tensors to - * @param[in] op_tensors CLTensor memory objects mapped from Core OpTensors - * - * @return Status - */ -Status bind_tensors(ClAuxTensorData &aux_tensor_data, TensorPackMap &prepare_pack_map, TensorPackMap &run_pack_map, const ClWorkload &workload, const OpTensorBinding &op_tensors); - -/** Operator runtime to run a @ref ClWorkload - * - * @note User must explicitly call prepare before run otherwise run will fail. - * - */ -class ClCompositeOperator -{ -public: - ClCompositeOperator(); - ~ClCompositeOperator(); - /** Configures a @ref ClCompositeOperator with a @ref ClWorkload - * This includes the compilation of Cl kernels inside the @ref ClWorkload - * - * @param[in] ctx CLCompileContext - * @param[in] workload ClWorkload to configure with - */ - void configure(const CLCompileContext &ctx, const ClWorkload &workload); - /** Validate ClWorkload @p workload - * - * @param[in] workload ClWorkload to be validated - * - * @return Status - */ - static Status validate(const ClWorkload &workload); - /** Enqueue prepare workloads - * - * @param tensor_pack_map Tensors required by the prepare workloads - */ - void prepare(TensorPackMap &tensor_pack_map); - /** Enqueue run workloads - * - * @param tensor_pack_map Tensors required by the run workloads - */ - void run(TensorPackMap &tensor_pack_map); - -private: - struct Implementation; - std::unique_ptr _impl; -}; - -} // namespace dynamic_fusion -} // namespace experimental -} // namespace arm_compute -#endif 
//ARM_COMPUTE_EXPERIMENTAL_DYNAMIC_FUSION_CLCOMPOSITEOPERATOR_H -#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file diff --git a/docs/Doxyfile b/docs/Doxyfile index da637abd3e..641ca4f57f 100644 --- a/docs/Doxyfile +++ b/docs/Doxyfile @@ -2097,8 +2097,7 @@ PREDEFINED = DOXYGEN_SKIP_THIS \ LOCATE_MIN \ LOCATE_MAX \ HAS_BIAS \ - POOL_AVG \ - ENABLE_EXPERIMENTAL_DYNAMIC_FUSION + POOL_AVG # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this diff --git a/examples/dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp b/examples/dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp deleted file mode 100644 index afbc55777b..0000000000 --- a/examples/dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp +++ /dev/null @@ -1,392 +0,0 @@ -/* - * Copyright (c) 2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -/// @example dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp -/// @copybrief example_dynamic_fusion_cl_conv2d_elementwise_add -/// -/// @page example_dynamic_fusion_cl_conv2d_elementwise_add Dynamic Fusion Example: Conv2d + Elementwise Addition (OpenCL target) -/// This example demonstrates how to fuse a Conv2d with an Addition using the new OperatorGraph API, and to run it with the Async Composite Operator - -#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION -#ifndef ARM_COMPUTE_CL /* Needed by Utils.cpp to handle OpenCL exceptions properly */ -#error "This example needs to be built with -DARM_COMPUTE_CL" -#endif /* ARM_COMPUTE_CL */ - -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/experimental/ClWorkload.h" -#include "arm_compute/core/experimental/OperatorGraph.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/runtime/CL/CLTensor.h" -#include "arm_compute/runtime/CL/CLTuner.h" -#include "arm_compute/runtime/experimental/ClCompositeOperator.h" - -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "utils/TypePrinter.h" - -#include "utils/Utils.h" - -#include - -using namespace arm_compute; -using namespace utils; -using namespace arm_compute::experimental::dynamic_fusion; - -#define TICK(clock_name) \ - auto clock_name##_tick = std::chrono::high_resolution_clock::now(); -#define TOCK(clock_name, measurement_map) \ - auto clock_name##_tock = std::chrono::high_resolution_clock::now(); \ - measurement_map["\"" #clock_name "\""] = duration_cast(clock_name##_tock - clock_name##_tick); -#define TOCK_AVG(clock_name, measurement_map, num_iterations) \ - auto clock_name##_tock = std::chrono::high_resolution_clock::now(); \ - measurement_map["\"" #clock_name "\""] = duration_cast((clock_name##_tock - clock_name##_tick) / (num_iterations)); - -using std::chrono::duration_cast; -using std::chrono::microseconds; - -class ClFusedConv2dEltwiseAddExample : public Example 
-{ -public: - bool do_setup(int argc, char **argv) override - { - size_t ih; - size_t iw; - size_t ifm; - size_t wh; - size_t ww; - size_t ofm; - size_t tuner_choice; - unsigned int pad_x; - unsigned int pad_y; - if(argc < 10) - { - // Print help - std::cout << "Usage: ./cl_fused_conv2d_elementwise_add ih iw ifm wh ww ofm tuner_choice(0=Disable, 1=Rapid, 2=Normal, 3=Exhaustive) pad_x pad_y\n"; - std::cout << "Too few or no input_matrices provided. Using shape config = SRGAN_0, tuner_choice=2\n\n"; - ih = 512; - iw = 512; - ifm = 64; - wh = 1; - ww = 1; - ofm = 3; - tuner_choice = 2; - pad_x = 0; - pad_y = 0; - } - else - { - ih = strtol(argv[1], nullptr, 10); - iw = strtol(argv[2], nullptr, 10); - ifm = strtol(argv[3], nullptr, 10); - wh = strtol(argv[4], nullptr, 10); - ww = strtol(argv[5], nullptr, 10); - ofm = strtol(argv[6], nullptr, 10); - tuner_choice = strtol(argv[7], nullptr, 10); - pad_x = strtol(argv[8], nullptr, 10); - pad_y = strtol(argv[9], nullptr, 10); - } - - CLTuner *tuner_to_use; - switch(tuner_choice) - { - case 0: - { - tuner_to_use = nullptr; - break; - } - case 1: - { - tuner.set_tuner_mode(CLTunerMode::RAPID); - tuner_to_use = &tuner; - break; - } - case 3: - { - tuner.set_tuner_mode(CLTunerMode::EXHAUSTIVE); - tuner_to_use = &tuner; - break; - } - case 2: - default: - { - tuner.set_tuner_mode(CLTunerMode::NORMAL); - tuner_to_use = &tuner; - break; - } - } - CLScheduler::get().default_init(tuner_to_use); - - TICK(startup_time); - TICK(configure); - /* Computation: - * out = add_desc(addend, conv2d1x1(direct_conv)(input, weights, bias)) - */ - const auto data_type = DataType::F32; - const auto data_layout = DataLayout::NHWC; - - const auto t_input_shape = TensorShape(ifm, iw, ih); - const auto t_weight_shape = TensorShape(ifm, ww, wh, ofm); - const auto t_bias_shape = TensorShape(ofm); - const auto t_l1_addend_shape = TensorShape(ofm, iw); - - std::cout << "input_shape: " << t_input_shape << std::endl; - std::cout << "weight_shape: " << 
t_weight_shape << std::endl; - std::cout << "bias_shape: " << t_bias_shape << std::endl; - std::cout << "addend_shape: " << t_l1_addend_shape << std::endl; - - /// @page example_dynamic_fusion_cl_conv2d_elementwise_add - /// @section describe_workload_using_operator_graph Describe the workload to run using OperatorGraph - /// OperatorGraph is a graph of Tensors and Operators. Let's first default-construct it - /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Construct OperatorGraph - // [Construct OperatorGraph] - OperatorGraph op_graph; - // [Construct OperatorGraph] - - /// @page example_dynamic_fusion_cl_conv2d_elementwise_add - /// @subsection add_conv2d Add the first operator (root operator) Conv2d - /// The first operator to be added to the graph is called the "root operator" of the entire graph. - /// @note As of now, operators need to be inserted according to their dependency order. This is because output tensor auto-initialization occurs during construction time. - /// Later this might be changed to allow out-of-order insertion. - - /// Before we insert the operator, we need to initialize the required TensorInfo objects. 
- /// We can choose not to initialize an output TensorInfo; if so, they will be auto-initialized during the construction of the OperatorGraph - /// The "t_acc_info" is the TensorInfo of the accumulator tensor, which is the output tensor of our first operator conv2d - /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Initialize Conv2d TensorInfo - // [Initialize Conv2d TensorInfo] - auto t_input_info = TensorInfo(t_input_shape, 1, data_type, data_layout); - auto t_weight_info = TensorInfo(t_weight_shape, 1, data_type, data_layout); - auto t_bias_info = TensorInfo(t_bias_shape, 1, data_type, data_layout); - auto t_acc_info = TensorInfo(); - // [Initialize Conv2d TensorInfo] - - /// @page example_dynamic_fusion_cl_conv2d_elementwise_add - /// Next we associate the TensorInfo with the OpTensor s created in the op_graph. - /// @note The associated TensorInfo objects must be in scope and remain valid until the ClWorkload building is completed - - /// @note The associated TensorInfo objects must be declard as non-const, since they may be updated during the OperatorGraph construction - - /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Add OpTensors - // [Add OpTensors] - const auto op_t_input = add_tensor(op_graph, t_input_info); - const auto op_t_weight = add_tensor(op_graph, t_weight_info); - const auto op_t_bias = add_tensor(op_graph, t_bias_info); - const auto op_t_acc = add_tensor(op_graph, t_acc_info); - // [Add OpTensors] - - /// @page example_dynamic_fusion_cl_conv2d_elementwise_add - /// Finally we add the Conv2d operator to op_graph. The Conv2dDescriptor contains all the TOSA-compliant attribute parameters - /// The add_op... group of functions accept the OpTensors created by the add_tensor function, and return an Operator handle. 
- /// This handle can be used to further query and modify the operator inside the OperatorGraph after its creation - /// For example, here we use the handle to force the ConvolutionMethod to be Direct Convolution - /// @note The force_conv2d_method is only for debug purpose for now, as the end user is not expected to decide on the ConvolutionMethod - - /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Add Conv2d Operator - // [Add Conv2d Operator] - Conv2dDescriptor conv2d_desc{ Padding2D{ pad_x, pad_x, pad_y, pad_y } }; - auto conv2d = add_op_conv2d(op_graph, conv2d_desc, op_t_input, op_t_weight, op_t_bias, op_t_acc); - force_conv2d_method(op_graph, conv2d, ConvolutionMethod::DIRECT); // Only for debug purposes - // [Add Conv2d Operator] - - /// @page example_dynamic_fusion_cl_conv2d_elementwise_add - /// @subsection add_elementwise_add Add the second operator Elementwise Add - /// This is similar to adding the first operator to op_graph, except that we link the two operators together by their common tensor, - /// namely the accumulator tensor op_t_acc, which is the output of conv2d and the input (lhs) of the addition - /// @note At the moment, it is recommended to always declare a separate TensorInfo (even if empty) for each OpTensor. - /// For example, here op_t_dst could be associated with op_t_acc info as they are the same, - /// but we still recommend creating a separate object. 
- - /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Add Elementwise Add Operator - // [Add Elementwise Add Operator] - auto t_l1_addend_info = TensorInfo(t_l1_addend_shape, 1, data_type, data_layout); - auto t_dst_info = TensorInfo(); - const auto op_t_l1_addend = add_tensor(op_graph, t_l1_addend_info); - const auto op_t_dst = add_tensor(op_graph, t_dst_info); - ElementwiseDescriptor add_desc{ ArithmeticOperation::ADD }; - add_op_elementwise_op(op_graph, add_desc, op_t_acc, op_t_l1_addend, op_t_dst); - // [Add Elementwise Add Operator] - - /// @page example_dynamic_fusion_cl_conv2d_elementwise_add - /// @section build_clworkload Build ClWorkload - /// ClWorkload is an intermediate object which contains all the built kernel codes and all other descriptors on how to schedule them - /// We build ClWorkload from the op_graph object that we just described - /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Build ClWorkload - // [Build ClWorkload] - const ClWorkloadContext workload_ctx - { - GpuInfo{ CLScheduler::get().target() } - }; - ClWorkload workload; - build(workload, op_graph, workload_ctx); - // [Build ClWorkload] - - /// @page example_dynamic_fusion_cl_conv2d_elementwise_add - /// @section run_fused_op_with_clcompositeoperator Run the fused operator workload with ClCompositeOperator - /// @subsection configure_and_validate_clcompositeoperator Validate ClWorkload and Configure ClCompositeOperator - /// After ClWorkload is built, we need to configure it with the Compute Library runtime ClCompositeOperator to run it. - /// Optionally we can explicitly validate the workload to check if the workload has been built successfully. - /// The validate is automatically run inside configure and would throw if it fails. 
- /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Construct ClCompositeOperator - /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Validate and configure ClCompositeOperator - // [Validate and configure ClCompositeOperator] - const auto success = ClCompositeOperator::validate(workload); // Optional - op.configure(CLKernelLibrary::get().get_compile_context(), workload); - // [Validate and configure ClCompositeOperator] - TOCK(configure, measurements); - - TICK(tensor_allocation); - /// @page example_dynamic_fusion_cl_conv2d_elementwise_add - /// @subsection run_clcompositeoperator Run ClCompositeOperator - /// Construct the runtime CLTensor s with backing memory - /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Construct CLTensor objects - - /// Initialize, allocate and fill the CLTensor objects - /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Initialize, Allocate and Fill CLTensor objects - // [Initialize, Allocate and Fill CLTensor objects] - t_input.allocator()->init(t_input_info); - t_weight.allocator()->init(t_weight_info); - t_bias.allocator()->init(t_bias_info); - t_l1_addend.allocator()->init(t_dst_info); - t_dst.allocator()->init(t_dst_info); - - t_input.allocator()->allocate(); - t_weight.allocator()->allocate(); - t_bias.allocator()->allocate(); - t_l1_addend.allocator()->allocate(); - t_dst.allocator()->allocate(); - - fill_random_tensor(t_input, -1.f, 1.f); - fill_random_tensor(t_weight, -1.f, 1.f); - fill_random_tensor(t_l1_addend, -1.f, 1.f); - // [Initialize, Allocate and Fill CLTensor objects] - - /// @page example_dynamic_fusion_cl_conv2d_elementwise_add - /// The OpTensorBinding creates a mapping from the OpTensor handles that we created early to the real CLTensors - /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Create OpTensorBinding - // [Create OpTensorBinding] - OpTensorBinding op_tensors({ { op_t_input, &t_input }, - { op_t_weight, &t_weight }, - { op_t_bias, 
&t_bias }, - { op_t_l1_addend, &t_l1_addend }, - { op_t_dst, &t_dst } - }); - // [Create OpTensorBinding] - - /// @page example_dynamic_fusion_cl_conv2d_elementwise_add - /// Bind the CLTensor objects to the prepare_pack_map and run_pack_map, which are used to prepare and run the op - /// This step additionally creates empty auxiliary CLTensor objects if any, and contain them inside a ClAuxTensorData aux_tensor_data - /// @note This step associates all the CLTensors contained in op_tensors and aux_tensor_data, with prepare_pack_map and run_pack_map - /// Make sure these CLTensors remain valid as long as the two pack_maps are still in use - - /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Construct ClAuxTensorData - /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Construct TensorPackMaps - /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Bind Tensors - // [Bind Tensors] - bind_tensors(aux_tensor_data, prepare_pack_map, run_pack_map, workload, op_tensors); - // [Bind Tensors] - - /// @page example_dynamic_fusion_cl_conv2d_elementwise_add - /// Initialize and Allocate Auxiliary CLTensor objects. - /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Initialize and Allocate Auxiliary CLTensor objects - // [Initialize and Allocate Auxiliary CLTensor objects] - for(auto tensor_data : aux_tensor_data.get_tensors()) - { - tensor_data.tensor->allocator()->init(tensor_data.tensor_info); - tensor_data.tensor->allocator()->allocate(); - } - // [Initialize and Allocate Auxiliary CLTensor objects] - TOCK(tensor_allocation, measurements); - - TICK(dummy_run); - /// @page example_dynamic_fusion_cl_conv2d_elementwise_add - /// Run the ClCompositeOperator prepare job. This performs any jobs that are required for the first run, like - /// reshaping tensors for a more performant format. 
- /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Prepare ClCompositeOperator - // [Prepare ClCompositeOperator] - op.prepare(prepare_pack_map); - // [Prepare ClCompositeOperator] - - /// @page example_dynamic_fusion_cl_conv2d_elementwise_add - /// At last, we run our operator - /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Run ClCompositeOperator - // [Run ClCompositeOperator] - op.run(run_pack_map); - // [Run ClCompositeOperator] - CLScheduler::get().sync(); - TOCK(dummy_run, measurements); - TOCK(startup_time, measurements); - return true; - } - void do_run() override - { - // Run the fused op - op.run(run_pack_map); - - // Make sure all the OpenCL jobs are done executing: - CLScheduler::get().sync(); - } - - void do_teardown() override - { - for(const auto &m : measurements) - { - std::cout << m.first << ": " << m.second.count() << "us" << std::endl; - } - } - -private: - // [Construct CLTensor objects] - CLTensor t_input{}; - CLTensor t_weight{}; - CLTensor t_bias{}; - CLTensor t_l1_addend{}; - CLTensor t_dst{}; - // [Construct CLTensor objects] - // [Construct ClAuxTensorData] - ClAuxTensorData aux_tensor_data{}; - // [Construct ClAuxTensorData] - // [Construct TensorPackMaps] - TensorPackMap prepare_pack_map{}; - TensorPackMap run_pack_map{}; - // [Construct TensorPackMaps] - // [Construct ClCompositeOperator] - ClCompositeOperator op{}; - // [Construct ClCompositeOperator] - CLTuner tuner{}; - std::map measurements{}; -}; - -/** Main program for sgemm test - * - * @param[in] argc Number of arguments - * @param[in] argv Arguments ( [optional] Matrix A, [optional] Matrix B, [optional] Matrix C, [optional] alpha, [optional] beta ) - */ -int main(int argc, char **argv) -{ - return utils::run_example(argc, argv); -} - -#undef TICK -#undef TOCK -#undef TOCK_AVG -#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ diff --git a/examples/dynamic_fusion/cl_ref_conv2d_elementwise_add.cpp 
b/examples/dynamic_fusion/cl_ref_conv2d_elementwise_add.cpp deleted file mode 100644 index 3aedcc0f41..0000000000 --- a/examples/dynamic_fusion/cl_ref_conv2d_elementwise_add.cpp +++ /dev/null @@ -1,238 +0,0 @@ -/* - * Copyright (c) 2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_CL /* Needed by Utils.cpp to handle OpenCL exceptions properly */ -#error "This example needs to be built with -DARM_COMPUTE_CL" -#endif /* ARM_COMPUTE_CL */ - -#include "arm_compute/core/Types.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/runtime/CL/CLTensor.h" -#include "arm_compute/runtime/CL/CLTuner.h" -#include "arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h" -#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h" - -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "utils/TypePrinter.h" -#include "utils/Utils.h" - -#include - -using namespace arm_compute; -using namespace utils; - -#define TICK(clock_name) \ - auto clock_name##_tick = std::chrono::high_resolution_clock::now(); -#define TOCK(clock_name, measurement_map) \ - auto clock_name##_tock = std::chrono::high_resolution_clock::now(); \ - measurement_map["\"" #clock_name "\""] = duration_cast(clock_name##_tock - clock_name##_tick); -#define TOCK_AVG(clock_name, measurement_map, num_iterations) \ - auto clock_name##_tock = std::chrono::high_resolution_clock::now(); \ - measurement_map["\"" #clock_name "\""] = duration_cast((clock_name##_tock - clock_name##_tick) / (num_iterations)); - -using std::chrono::duration_cast; -using std::chrono::microseconds; -/** A reference for comparing against the fusion of a direct convolution with an elementwise addition: - * examples/dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp - */ -class ClRefConv2dEltwiseAddExample : public Example -{ -public: - bool do_setup(int argc, char **argv) override - { - size_t ih; - size_t iw; - size_t ifm; - size_t wh; - size_t ww; - size_t ofm; - size_t tuner_choice; - unsigned int pad_x; - unsigned int pad_y; - if(argc < 10) - { - // Print help - std::cout << "Usage: ./cl_ref_conv2d_elementwise_add ih iw ifm wh ww ofm tuner_choice(0=Disable, 1=Rapid, 2=Normal, 3=Exhaustive) pad_x pad_y\n"; - std::cout << "Too few or no input_matrices 
provided. Using shape config = SRGAN_0, tuner_choice=2\n\n"; - ih = 512; - iw = 512; - ifm = 64; - wh = 1; - ww = 1; - ofm = 3; - tuner_choice = 2; - pad_x = 0; - pad_y = 0; - } - else - { - ih = strtol(argv[1], nullptr, 10); - iw = strtol(argv[2], nullptr, 10); - ifm = strtol(argv[3], nullptr, 10); - wh = strtol(argv[4], nullptr, 10); - ww = strtol(argv[5], nullptr, 10); - ofm = strtol(argv[6], nullptr, 10); - tuner_choice = strtol(argv[7], nullptr, 10); - pad_x = strtol(argv[8], nullptr, 10); - pad_y = strtol(argv[9], nullptr, 10); - } - - CLTuner *tuner_to_use; - switch(tuner_choice) - { - case 0: - { - tuner_to_use = nullptr; - break; - } - case 1: - { - tuner.set_tuner_mode(CLTunerMode::RAPID); - tuner_to_use = &tuner; - break; - } - case 3: - { - tuner.set_tuner_mode(CLTunerMode::EXHAUSTIVE); - tuner_to_use = &tuner; - break; - } - case 2: - default: - { - tuner.set_tuner_mode(CLTunerMode::NORMAL); - tuner_to_use = &tuner; - break; - } - } - - CLScheduler::get().default_init(tuner_to_use); - - TICK(startup_time); - TICK(configure); - - /* Computation: - * out = add_desc(addend, conv2d1x1(direct_conv)(input, weights, bias)) - */ - const auto data_type = DataType::F32; - const auto data_layout = DataLayout::NHWC; - const PadStrideInfo conv_info{ 1, 1, pad_x, pad_y }; - const auto t_input_shape = TensorShape(ifm, iw, ih); - const auto t_weight_shape = TensorShape(ifm, ww, wh, ofm); - const auto t_bias_shape = TensorShape(ofm); - const auto t_l1_addend_shape = TensorShape(ofm, iw); - const auto t_dst_shape = misc::shape_calculator::compute_deep_convolution_shape(t_input_shape, data_layout, t_weight_shape, conv_info); - std::cout << "input_shape: " << t_input_shape << std::endl; - std::cout << "weight_shape: " << t_weight_shape << std::endl; - std::cout << "bias_shape: " << t_bias_shape << std::endl; - std::cout << "addend_shape: " << t_l1_addend_shape << std::endl; - std::cout << "dst_shape: " << t_dst_shape << std::endl; - auto t_input_info = 
TensorInfo(t_input_shape, 1, data_type, data_layout); - auto t_weight_info = TensorInfo(t_weight_shape, 1, data_type, data_layout); - auto t_bias_info = TensorInfo(t_bias_shape, 1, data_type, data_layout); - auto t_l0_dst_info = TensorInfo(t_dst_shape, 1, data_type, data_layout); // Intermediate tensor for cond3 - auto t_l1_addend_info = TensorInfo(t_l1_addend_shape, 1, data_type, data_layout); - auto t_dst_info = TensorInfo(t_dst_shape, 1, data_type, data_layout); - - // Init tensors - { - t_input.allocator()->init(t_input_info); - t_weight.allocator()->init(t_weight_info); - t_bias.allocator()->init(t_bias_info); - t_l1_addend.allocator()->init(t_dst_info); - t_l0_dst.allocator()->init(t_l0_dst_info); - t_dst.allocator()->init(t_dst_info); - } - - op0.configure(&t_input, &t_weight, &t_bias, &t_l0_dst, conv_info); - op1.configure(&t_l0_dst, &t_l1_addend, &t_dst, ConvertPolicy{}); - TOCK(configure, measurements); - - TICK(tensor_allocation); - // Construct tensors - // Allocate and fill tensors - { - t_input.allocator()->allocate(); - t_weight.allocator()->allocate(); - t_bias.allocator()->allocate(); - t_l1_addend.allocator()->allocate(); - t_l0_dst.allocator()->allocate(); - t_dst.allocator()->allocate(); - fill_random_tensor(t_input, -1.f, 1.f); - fill_random_tensor(t_weight, -1.f, 1.f); - fill_random_tensor(t_bias, -1.f, 1.f); - fill_random_tensor(t_l1_addend, -1.f, 1.f); - } - TOCK(tensor_allocation, measurements); - // Dummy run for CLTuner - TICK(dummy_run); - op0.run(); - CLScheduler::get().sync(); - TOCK(dummy_run, measurements); - TOCK(startup_time, measurements); - return true; - } - void do_run() override - { - // Run the ops - op0.run(); - op1.run(); - - // Make sure all the OpenCL jobs are done executing: - CLScheduler::get().sync(); - } - - void do_teardown() override - { - for(auto m : measurements) - { - std::cout << m.first << ": " << m.second.count() << "us" << std::endl; - } - } - -private: - CLTensor t_input{}; - CLTensor t_weight{}; - CLTensor 
t_bias{}; - CLTensor t_l1_addend{}; - CLTensor t_l0_dst{}; - CLTensor t_dst{}; - CLDirectConvolutionLayer op0{}; - CLArithmeticAddition op1{}; - CLTuner tuner{}; - std::map measurements{}; -}; - -/** Main program for sgemm test - * - * @param[in] argc Number of arguments - * @param[in] argv Arguments ( [optional] Matrix A, [optional] Matrix B, [optional] Matrix C, [optional] alpha, [optional] beta ) - */ -int main(int argc, char **argv) -{ - return utils::run_example(argc, argv); -} - -#undef TICK -#undef TOCK -#undef TOCK_AVG \ No newline at end of file diff --git a/scripts/clang_tidy_rules.py b/scripts/clang_tidy_rules.py index 0a0de84bab..d718af0f21 100755 --- a/scripts/clang_tidy_rules.py +++ b/scripts/clang_tidy_rules.py @@ -15,7 +15,6 @@ def get_list_flags( filename, arch): flags = ["-std=c++14"] flags.append("-DARM_COMPUTE_CPP_SCHEDULER=1") flags.append("-DARM_COMPUTE_CL") - flags.append("-DENABLE_EXPERIMENTAL_DYNAMIC_FUSION") if arch == "aarch64": flags.append("-DARM_COMPUTE_AARCH64_V8_2") return flags diff --git a/src/core/CL/ICLKernel.h b/src/core/CL/ICLKernel.h index 224b68af70..5d5b636cf4 100644 --- a/src/core/CL/ICLKernel.h +++ b/src/core/CL/ICLKernel.h @@ -37,19 +37,6 @@ #include -#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) -namespace arm_compute -{ -namespace experimental -{ -namespace dynamic_fusion -{ -struct ClExecutionDescriptor; -} // namespace dynamic_fusion -} // namespace experimental -} // namespace arm_compute -#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) - namespace arm_compute { namespace @@ -345,14 +332,6 @@ public: { ARM_COMPUTE_UNUSED(tensors, window, queue); } - -#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) - /// The execution is carried out through run_op method. 
But the run_op method needs to be extended to include ClExecutionDescriptor as now LWS GWS tuning will be separated from the IKernel - virtual void run_composite_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc) - { - ARM_COMPUTE_UNUSED(tensors, window, queue, exec_desc); - } -#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) /** Add the passed parameters to the object's kernel's arguments starting from the index idx. * * @param[in,out] idx Index at which to start adding the arguments. Will be incremented by the number of kernel arguments set. diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp deleted file mode 100644 index 9b6daae619..0000000000 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp +++ /dev/null @@ -1,164 +0,0 @@ -/* - * Copyright (c) 2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION - -#include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h" -#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h" -#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClKernelComponents.h" - -namespace arm_compute -{ -namespace experimental -{ -namespace dynamic_fusion -{ -ClKernelBlueprint::ClKernelBlueprint() - : _impl{ std::make_unique() } -{ -} - -ClKernelBlueprint::~ClKernelBlueprint() = default; - -ClKernelBlueprint::Implementation &ClKernelBlueprint::impl() -{ - return *_impl; -} -const ClKernelBlueprint::Implementation &ClKernelBlueprint::impl() const -{ - return *_impl; -} - -Status add_tensor(ClKernelBlueprint &kernel_blueprint, ITensorInfo *tensor_info, ArgumentID &id, ArgumentID merge_point) -{ - id = kernel_blueprint.impl().add_kernel_tensor(tensor_info, merge_point); - return Status{}; -} - -Status add_kcomp_eltwise_op(ClKernelBlueprint &kernel_blueprint, const ClElementwiseKernelDescriptor &desc, - ArgumentID src0_id, ArgumentID src1_id, ArgumentID &dst_id) -{ - kernel_blueprint.impl().add_component( - std::make_unique( - &kernel_blueprint, - desc, - SharedVarLink{ src0_id, SharedVarIO::Input }, - SharedVarLink{ src1_id, SharedVarIO::Input }, - SharedVarLink{ dst_id, SharedVarIO::Output })); - - return Status{}; -} - -Status add_kcomp_floor(ClKernelBlueprint &kernel_blueprint, const ClFloorKernelDescriptor &, - ArgumentID src_id, ArgumentID &dst_id) -{ - kernel_blueprint.impl().add_component( - std::make_unique( - &kernel_blueprint, - SharedVarLink{ src_id, SharedVarIO::Input }, - SharedVarLink{ dst_id, SharedVarIO::Output })); - - return Status{}; -} - -Status 
add_kcomp_activation(ClKernelBlueprint &, const ClActivationKernelDescriptor &, ArgumentID, ArgumentID &) -{ - return Status{}; -} - -Status add_kcomp_direct_conv2d(ClKernelBlueprint &kernel_blueprint, - const ClDirectConv2dKernelDescriptor &direct_conv2d_desc, - ArgumentID src_id, ArgumentID weight_id, ArgumentID bias_id, ArgumentID &dst_id) -{ - kernel_blueprint.impl().add_component( - std::make_unique( - &kernel_blueprint, - direct_conv2d_desc, - SharedVarLink{ src_id, SharedVarIO::Input }, - SharedVarLink{ weight_id, SharedVarIO::Input }, - SharedVarLink{ dst_id, SharedVarIO::Output }, - SharedVarLink{ bias_id, SharedVarIO::Input })); - - return Status{}; -} - -Status add_kcomp_store(ClKernelBlueprint &kernel_blueprint, const StoreType &store_type, ArgumentID src_tile, ArgumentID dst_tile) -{ - switch(store_type) - { - case StoreType::StoreBlockBoundaryAware: - kernel_blueprint.impl().add_component( - std::make_unique( - &kernel_blueprint, - SharedVarLink{ src_tile, SharedVarIO::Input }, - SharedVarLink{ dst_tile, SharedVarIO::Output })); - break; - case StoreType::TStoreIndirectWidthSelect: - kernel_blueprint.impl().add_component( - std::make_unique( - &kernel_blueprint, - SharedVarLink{ src_tile, SharedVarIO::Input }, - SharedVarLink{ dst_tile, SharedVarIO::Output })); - break; - default: - ARM_COMPUTE_ERROR("Store mode not yet supported."); - } - - return Status{}; -} - -Status update_merge_point(ClKernelBlueprint &bp, ArgumentID t_id, ArgumentID merge_point) -{ - return bp.impl().update_merge_point(t_id, merge_point); -} - -Status set_tile_info(ClKernelBlueprint &bp, const TileDescriptor &tile_info) -{ - bp.impl().set_tile_info(tile_info); - return Status{}; -} -Status build(ClKernelCode &code, const ClCodeBuilderContext &, ClKernelBlueprint &kernel_blueprint) -{ - kernel_blueprint.impl().finalize(); - code.name = kernel_blueprint.impl().build_kernel_name(); - code.code = kernel_blueprint.impl().build_code(); - - code.config_id = 
kernel_blueprint.impl().build_config_id(); - code.build_options = kernel_blueprint.impl().build_options(); - code.window = kernel_blueprint.impl().get_execution_window(); - code.arguments = kernel_blueprint.impl().get_arguments(); - - return Status{}; -} -DependencyGraph get_dependency_graph(const ClKernelBlueprint &blueprint) -{ - return blueprint.impl().get_graph(); -} -Status tune_static(ClExecutionDescriptor &, const ClKernelCode &) -{ - return Status{}; -} -} // namespace dynamic_fusion -} // namespace experimental -} // namespace arm_compute -#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h deleted file mode 100644 index 463fc5e7cf..0000000000 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Copyright (c) 2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION - -#ifndef ARM_COMPUTE_EXPERIMENTAL_CLKERNELBUILDINGAPI_H -#define ARM_COMPUTE_EXPERIMENTAL_CLKERNELBUILDINGAPI_H - -#include "arm_compute/core/CL/CLCompileContext.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/experimental/ClWorkload.h" -#include "arm_compute/core/experimental/DependencyGraph.h" -#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelDescriptors.h" - -namespace arm_compute -{ -namespace experimental -{ -namespace dynamic_fusion -{ -using ArgumentID = DependencyGraph::Id; - -static constexpr ArgumentID g_arg_placeholder = DependencyGraph::empty_id(); - -/** Intermediate representation of the final, complete kernel source. 
*/ -class ClKernelBlueprint -{ -public: - ClKernelBlueprint(); - ~ClKernelBlueprint(); - -private: - struct Implementation; - std::unique_ptr _impl; - -public: - Implementation &impl(); - const Implementation &impl() const; -}; - -///// Kernel Components ///// -/** Component: Eltwise Operator */ -Status add_kcomp_eltwise_op(ClKernelBlueprint &, const ClElementwiseKernelDescriptor &, ArgumentID src0_id, - ArgumentID src1_id, ArgumentID &dst_id); - -/** Component: Floor */ -Status add_kcomp_floor(ClKernelBlueprint &, const ClFloorKernelDescriptor &, ArgumentID src_id, - ArgumentID &dst_id); - -/** Component: Activation */ -Status add_kcomp_activation(ClKernelBlueprint &, const ClActivationKernelDescriptor &, ArgumentID src_id, ArgumentID &dst_id); - -/** Component: Direct Convolution **/ -Status add_kcomp_direct_conv2d(ClKernelBlueprint &, const ClDirectConv2dKernelDescriptor &, - ArgumentID src_id, ArgumentID weight_id, ArgumentID bias_id, ArgumentID &dst_id); - -Status add_kcomp_store(ClKernelBlueprint &, const StoreType &store_type, ArgumentID src_id, ArgumentID dst_id); - -Status add_tensor(ClKernelBlueprint &, ITensorInfo *, ArgumentID &, ArgumentID merge_point = DependencyGraph::empty_id()); - -///// Kernel Components ///// - -///// Building ///// - -/** Update existing merge tensor @p merge_point to point to @p t_id - * - * @param t_id - * @param merge_point - * @return Status - */ -Status update_merge_point(ClKernelBlueprint &, ArgumentID t_id, ArgumentID merge_point); - -/** Get dependency graph - * - * @return DependencyGraph - */ -DependencyGraph get_dependency_graph(const ClKernelBlueprint &blueprint); - -/** All information required for building the @ref ClKernelCode */ -struct ClCodeBuilderContext -{ - GpuInfo gpu_info{}; -}; - -Status set_tile_info(ClKernelBlueprint &, const TileDescriptor &); - -/** Build final kernel source from KernelBlueprint */ -Status build(ClKernelCode &code, const ClCodeBuilderContext &, ClKernelBlueprint &); - -///// Building 
///// - -///// Tuning ///// - -Status tune_static(ClExecutionDescriptor &, const ClKernelCode &); - -///// Tuning ///// - -} // namespace dynamic_fusion -} // namespace experimental -} // namespace arm_compute -#endif //ARM_COMPUTE_EXPERIMENTAL_CLKERNELBUILDINGAPI_H -#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h deleted file mode 100644 index 04919acb83..0000000000 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h +++ /dev/null @@ -1,930 +0,0 @@ -/* - * Copyright (c) 2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION - -#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMMON_H -#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMMON_H - -#include "arm_compute/core/CL/CLCompileContext.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/GPUTarget.h" -#include "src/core/common/Macros.h" -#include "support/Requires.h" -#include "support/StringSupport.h" - -#include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h" - -#include -#include -#include -#include -#include - -namespace arm_compute -{ -namespace experimental -{ -namespace dynamic_fusion -{ -/** We introduce the concept of *Shared Variables* in the context of kernel building. - * They are variables that can be accessed / shared among all the kernel components within a single kernel. - * For now we consider 2 groups of shared variables: - * Argument: The argument variables (parameters) of a kernel - * Automatic: The automatic variables declared inside a kernel - * All Shared Variables have the same kernel scope, and are thus visible to all kernel components -*/ - -enum class SharedVarIO -{ - Input, - Output -}; - -enum class SharedVarGroup -{ - Argument, // Parameters to a kernel function == dst or src tensors of the whole blueprint graph - Automatic // Automatic variables declared within the kernel body == intermediate tensors of the whole blueprint graph -}; - -/** Specifies a shared variable link for a component. - * It describes all the information that's available when a component is constructed / added: - * e.g. 
its linkage (via ArgumentID and io) and its group - * This is not shared variable on its own, but is used for instantiating a SharedVar when building the code - */ -struct SharedVarLink -{ - ArgumentID arg_id{ g_arg_placeholder }; - SharedVarIO io{ SharedVarIO::Input }; - bool is_empty() const - { - return arg_id == g_arg_placeholder; - } -}; - -/** A table of all the variables used in the kernel / blueprint - * Because we limit the DependencyGraph in the blueprint to a Linear Sequence for now, we only allow ** a single global variable (the accumulator) ** - * - * NOTE: the order they appear in the table is the order of their "declaration" in the component code, and is also their ID - * NOTE: the variables all have the scope of the full kernel function - */ -class SharedVarTable -{ -public: - /** A fully realized SharedVarLink - */ - struct SharedVar - { - ArgumentID arg_id{ g_arg_placeholder }; - SharedVarIO io{ SharedVarIO::Input }; - SharedVarGroup group{ SharedVarGroup::Argument }; - std::string uniq_name{}; // Unique name, also the final variable name used in the built code - ClKernelArgDescriptor desc{}; // Automatic variables can and should still be described using this struct - bool is_empty() const - { - return arg_id == g_arg_placeholder; - } - }; - - class Arguments - { - public: - Arguments() = default; - void add_var(const SharedVar &var) - { - ARM_COMPUTE_ERROR_ON(var.group != SharedVarGroup::Argument); - _vars.push_back(var); - } - std::vector get_all_vars() const - { - return _vars; - } - std::vector get_src_vars() const - { - std::vector src_vars; - std::copy_if(_vars.begin(), _vars.end(), std::back_inserter(src_vars), [](const SharedVar & var) - { - return var.io == SharedVarIO::Input; - }); - return src_vars; - } - SharedVar get_dst_var() const - { - std::vector dst_vars; - std::copy_if(_vars.begin(), _vars.end(), std::back_inserter(dst_vars), [](const SharedVar & var) - { - return var.io == SharedVarIO::Output; - }); - 
ARM_COMPUTE_ERROR_ON(dst_vars.size() != 1); - return dst_vars.at(0); - } - - private: - std::vector _vars{}; - }; - - /** Create a SharedVar for a corresponding SharedVarLink (contains ArgumentID). If one has already been created for the SharedVarLink, simply return it instead of creating a new one - * - * @note: The order of insertion is important. There is one precondition: - * PRECOND: The components have been sorted topologically / is being traversed in topological order - * This ensures that all the consumer var links (Output, Automatic Links) can consume (return) the producer var links when they're referred - */ - void add(SharedVarLink var_link, SharedVarGroup group, ClKernelArgDescriptor runtime_desc, const std::string &name = "unnamed") - { - ARM_COMPUTE_ERROR_ON_MSG(var_link.is_empty(), "Non-empty SharedVarLink expected"); - if(!get(var_link).is_empty()) - { - return; - } - - auto var_id = _num_var; - std::stringstream ss; - ss << name << "_" << var_id; - const auto uniq_name = ss.str(); - SharedVar var{ var_link.arg_id, var_link.io, group, uniq_name, runtime_desc }; - - if(group == SharedVarGroup::Argument) - { - _arguments.emplace(var_id, var); - _arg_id_map.emplace(var_link.arg_id, var_id); - _num_var++; - } - else if(group == SharedVarGroup::Automatic) - { - if(_global_vars.empty()) - { - if(var_link.io == SharedVarIO::Output) - { - _global_vars.emplace(var_id, var); - _arg_id_map.emplace(var_link.arg_id, var_id); - _num_var++; - } - else - { - ARM_COMPUTE_ERROR("Component likely not traversed in topological order"); - } - } - else - { - // Associate additional SharedVarLinks with the single global shared variable - const auto global_var_id = _global_vars.begin()->first; - _arg_id_map[var_link.arg_id] = global_var_id; - } - } - else - { - ARM_COMPUTE_ERROR("Unrecognised SharedVarGroup"); - } - } - - /** Get the SharedVar associated with @p var_link - * - * @param var_link - * @return SharedVar - */ - SharedVar get(const SharedVarLink &var_link) const 
- { - const SharedVar empty_var{}; - if(_arg_id_map.find(var_link.arg_id) != _arg_id_map.end()) - { - const auto var_id = _arg_id_map.at(var_link.arg_id); - const auto arg_var = _arguments.find(var_id); - if(arg_var != _arguments.end()) - { - return arg_var->second; - } - else - { - return _global_vars.at(var_id); - } - } - return empty_var; - } - - /** @note The arguments are returned in the order they are added - */ - Arguments get_kernel_arguments() const - { - Arguments args{}; - for(const auto &a : _arguments) - { - args.add_var(a.second); - } - return args; - } - -private: - using VarID = int32_t; - -private: - std::map _global_vars{}; // Shared, global variable - std::map _arguments{}; - std::map _arg_id_map{}; // Track ArgumentIDs that have already been added - VarID _num_var{ 0 }; -}; - -enum class ComponentType -{ - Simple, - Complex, - Store -}; - -using ComponentID = DependencyGraph::Id; -using ComponentList = std::vector; -class IClKernelComponent -{ -public: - using Link = SharedVarLink; - using Tag = std::string; - struct TagVal - { - TagVal() = default; - TagVal(const SharedVarTable::SharedVar &var) - : value{ var.uniq_name } - { - } - - template ::value)> - TagVal(T val) - : value{ support::cpp11::to_string(val) } - { - } - - TagVal(const std::string &val) - : value{ val } - { - } - - TagVal(const char *val) - : value{ std::string(val) } - { - } - - TagVal(const DataType &data_type) - : value{ get_cl_type_from_data_type(data_type) } - { - } - - std::string value{}; - }; - using TagLUT = std::unordered_map; // Used to instantiating a code template / replacing tags -public: - IClKernelComponent(ClKernelBlueprint *blueprint) - : _blueprint(blueprint) - { - } - - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClKernelComponent); - - virtual ~IClKernelComponent() = default; - virtual ComponentType get_component_type() const = 0; - virtual std::vector get_links() const = 0; - virtual std::string name() const = 0; - - // @note: some tags can be unused since they 
could be used only for the macros, or only for the component code - static std::string replace_tags(const std::string &code_template, const TagLUT &tags) - { - std::string replaced_code = ""; - bool scanning_pattern = false; - std::string pattern_found = ""; - for(size_t i = 0; i < code_template.size() - 1; ++i) - { - if(!scanning_pattern) - { - if(code_template[i] == '{' && code_template[i + 1] == '{') - { - i += 1; - scanning_pattern = true; - pattern_found = ""; - } - else - { - replaced_code += code_template[i]; - } - } - else - { - if(code_template[i] == '}' && code_template[i + 1] == '}') - { - i += 1; - scanning_pattern = false; - std::string err = "Pattern " + pattern_found + " not found in tags"; - ARM_COMPUTE_ERROR_ON_MSG(tags.find(pattern_found) == tags.end(), err.c_str()); - replaced_code += tags.find(pattern_found)->second.value; - } - else - { - pattern_found += code_template[i]; - } - } - } - - return replaced_code; - } - ComponentID id() const - { - return _id; - } - void set_id(ComponentID id) - { - _id = id; - } - - virtual std::set get_headers_list() const - { - return std::set {}; - } - - virtual std::string get_additional_macros() const - { - return ""; - } - - virtual std::string get_component_code() const - { - return ""; - } - - virtual Window get_window() const - { - return Window{}; - } - - /** Get the tag look-up table used to instantiate the component code. 
- * - * @param vtable - * @return TagLUT - */ - virtual TagLUT get_tag_lut(const SharedVarTable &vtable) const = 0; - - /** Allocate all shared variables used by the component in the @p vtable - * - * @param vtable - */ - virtual void allocate_shared_vars(SharedVarTable &vtable) const = 0; - - virtual std::string get_dst_addr_calculation() const - { - return ""; - } - - /** Generate config id of the component - * - * @return std::string - */ - virtual std::string generate_config_id() const - { - return ""; - } - - virtual CLBuildOptions generate_build_options() const - { - return CLBuildOptions{}; - } - -protected: - ClKernelBlueprint *_blueprint; - -private: - ComponentID _id{}; -}; - -using ComponentUniquePtr = std::unique_ptr; - -/** Intermediate representation of the final, complete kernel source. - */ -struct ClKernelBlueprint::Implementation -{ -public: - Implementation() = default; - ~Implementation() = default; - -public: - Status update_merge_point(ArgumentID t_id, ArgumentID merge_point) - { - return _graph.update_merge_point(t_id, merge_point); - } - - ArgumentID add_kernel_tensor(ITensorInfo *tensor_info, ArgumentID merge_point = DependencyGraph::empty_id()) - { - const auto id = _graph.add_tensor(merge_point); - if(_kernel_tensors.find(id) == _kernel_tensors.end()) - { - _kernel_tensors.insert(std::make_pair(id, tensor_info)); - } - return id; - } - - void set_tile_info(const TileDescriptor &tile_info) - { - _tile_info = tile_info; - } - - SharedVarGroup group(ArgumentID arg_id) const - { - if(arg_id == g_arg_placeholder) - { - // In case of placeholder, don't care what we return; - return SharedVarGroup::Argument; - } - return _shared_var_group_lut.at(arg_id); - } - - void validate_arg_ids(std::initializer_list args) const - { - for(const auto arg_id : args) - { - ARM_COMPUTE_UNUSED(arg_id); - ARM_COMPUTE_ERROR_ON_MSG(_kernel_tensors.find(arg_id) == _kernel_tensors.end() && arg_id != g_arg_placeholder, - "Trying to use an argument that hasn't been 
added to the blueprint"); - } - } - - void add_component(ComponentUniquePtr component) - { - if(component->get_component_type() == ComponentType::Complex) - { - ++_num_complex_components; - ARM_COMPUTE_ERROR_ON_MSG(_num_complex_components > 1, "Only one complex component per blueprint is supported."); - } - - // Get an unique ID for the component that's being added - std::vector src_tensors; - std::vector dst_tensors; - for(const auto &link : component->get_links()) - { - if(link.is_empty()) - { - continue; - } - if(link.io == SharedVarIO::Input) - { - src_tensors.push_back(link.arg_id); - } - else - { - dst_tensors.push_back(link.arg_id); - } - } - const ComponentID component_id = _graph.add_operator(src_tensors, dst_tensors).second; - component->set_id(component_id); - - // Add this component to the component graph. Don't connect it to anything yet - _component_graph.emplace(component_id, ComponentList{}); - - // For every { arg_id, arg_io } passed along with this component... - for(const auto &link : component->get_links()) - { - const ArgumentID &arg_id = link.arg_id; - const SharedVarIO &arg_io = link.io; - - // Add the arg_id to the map describing the input/output relationship between an argument and the components that use it, if it doesn't yet exist there - if(_outgoing_components.find(arg_id) == _outgoing_components.end()) - { - _outgoing_components.emplace(arg_id, ComponentList{}); - _incoming_components.emplace(arg_id, ComponentList{}); - } - - // If it's an input argument, connect any other component that has it as output with this component - // Additionally, set this component as one that treats this argument as "Input" (append to index 0) - // This is used so that we keep track of whether two components use the same argument, one as input and one as output - if(arg_io == SharedVarIO::Input) - { - for(const auto &prev_component : _incoming_components[arg_id]) - { - _component_graph[prev_component].push_back(component_id); - } - - 
_outgoing_components[arg_id].push_back(component_id); - } - // If it's an output argument, connect this component with any other component that has it as input - // Additionally, set this component as one that treats this argument as "Output" (append to index 1) - else - { - if(component->get_component_type() == ComponentType::Store) - { - ARM_COMPUTE_ERROR_ON_MSG(_dst_id >= 0, "Trying to add more than one dst argument to the graph"); - _dst_id = arg_id; - } - - for(const auto &subseq_component : _outgoing_components[arg_id]) - { - _component_graph[component_id].push_back(subseq_component); - } - - _incoming_components[arg_id].push_back(component_id); - } - } - - ARM_COMPUTE_ERROR_ON_MSG(_graph.get_root_ops().size() != 1, "Trying to add more than one root to the graph"); - - // Finally, add this component to the dictionary of components - _components.insert(std::make_pair(component_id, std::move(component))); - } - - std::string build_kernel_name() const - { - std::string name = ""; - - traverse([&](std::stack stack) - { - name += _components.find(stack.top())->second->name() + (stack.size() > 2 ? 
"___" : ""); - }); - - return name; - } - - std::string build_code() - { - ARM_COMPUTE_ERROR_ON_MSG(_graph_root == -1, "No root found in the component graph"); - - // These data structures will hold the data from all the components in the blueprint - std::set headers_list{}; - std::set additional_macros{}; - std::vector component_codes{}; // vector because order matters - - // Step 1: Allocate all kernel argument shared variables before generating the component code - auto stack = topological_sort(); - while(!stack.empty()) - { - auto curr_component_id = stack.top(); - auto &curr_component = _components.find(curr_component_id)->second; - - curr_component->allocate_shared_vars(_vtable); - - stack.pop(); - } - // Step 2: Generate component codes - stack = topological_sort(); - while(!stack.empty()) - { - auto curr_component_id = stack.top(); - auto &curr_component = _components.find(curr_component_id)->second; - - auto curr_headers_list = curr_component->get_headers_list(); - auto curr_additional_macros = curr_component->get_additional_macros(); - auto curr_component_code = curr_component->get_component_code(); - const auto var_lut = curr_component->get_tag_lut(_vtable); // Ideally can be merged with get_component_code once we have finer-grained code generation technique - component_codes.push_back(IClKernelComponent::replace_tags(curr_component_code, var_lut)); - - headers_list.insert(curr_headers_list.begin(), curr_headers_list.end()); - if(!curr_additional_macros.empty()) // Some components might not have any - { - additional_macros.insert(IClKernelComponent::replace_tags(curr_additional_macros, var_lut)); - } - - stack.pop(); - } - - // Step 3: Assemble the data gathered by traversing the graph into the string "code" - std::string code = ""; - - for(auto &header : headers_list) - { -#if defined(EMBEDDED_KERNELS) - code += CLKernelLibrary::get().get_program(header).first; -#else // defined(EMBEDDED_KERNELS) - code += "#include \"" + header + "\"\n"; -#endif // 
defined(EMBEDDED_KERNELS) - } - - for(auto ¯os : additional_macros) - { - code += macros; - } - - code += generate_kernel_signature(_vtable.get_kernel_arguments()); - - code += "\n{\n\n"; - - code += " //------------------ START KERNEL_BUILDER_COORDINATE ---------------------\n\n"; - code += generate_global_section(); - code += " //------------------ END KERNEL_BUILDER_COORDINATE ---------------------\n"; - - for(auto &component_code : component_codes) - { - code += component_code; - } - - code += "}\n"; - - return code; - } - - /** Generate config id of the entire kernel - * - * Format: kernel_name--comp0_config_id--comp1_config_id--... - * - * @return std::string - */ - std::string build_config_id() const - { - std::string config_id = build_kernel_name(); - traverse([&](std::stack stack) - { - config_id += "--" + _components.find(stack.top())->second->generate_config_id() + "--"; - }); - - return config_id; - } - - CLBuildOptions build_options() const - { - CLBuildOptions build_opts{}; - - traverse([&](std::stack stack) - { - build_opts.add_options(_components.find(stack.top())->second->generate_build_options().options()); - }); - - return build_opts; - } - - TileDescriptor get_tile_info() const - { - return _tile_info; - } - - // Get the global execution window, i.e. 
that of the root component - Window get_execution_window() const - { - ARM_COMPUTE_ERROR_ON_MSG(_graph_root == -1, "No root found in the component graph"); - ARM_COMPUTE_ERROR_ON_MSG(_dst_id == -1, "Destination Tensor Id should be ready before calling get_execution_window()"); - - return _components.find(_graph_root)->second->get_window(); - } - - ArgumentID get_dst_id() const - { - return _dst_id; - } - - ClKernelArgList get_arguments() const - { - ClKernelArgList arg_list{}; - for(const auto &arg_var : _vtable.get_kernel_arguments().get_all_vars()) - { - arg_list[arg_var.desc.arg_id] = arg_var.desc; - } - return arg_list; - } - - /** Get the arguments as shared vars from the vtable - * - * @return SharedVarTable::Arguments - */ - SharedVarTable::Arguments get_argument_shared_vars() const - { - return _vtable.get_kernel_arguments(); - } - - const ITensorInfo *get_kernel_argument_info(const ArgumentID id) const - { - auto it = _kernel_tensors.find(id); - if(it != _kernel_tensors.end()) - { - return it->second; - } - return nullptr; - } - - ITensorInfo *get_kernel_argument_info(const ArgumentID id) - { - auto it = _kernel_tensors.find(id); - if(it != _kernel_tensors.end()) - { - return it->second; - } - return nullptr; - } - /** Finalize graph construction. Graph is expected to not mutate after being finalized - */ - void finalize() - { - cache_root_component(); - assign_shared_var_group(); - } - - DependencyGraph get_graph() const - { - return _graph; - } - -private: - void cache_root_component() - { - const auto roots = _graph.get_root_ops(); - ARM_COMPUTE_ERROR_ON_MSG(roots.size() != 1, "Trying to add more than one root to the graph"); - _graph_root = roots.at(0); - } - /** Assign the group for each shared var. 
Can only be performed at the end of the graph construction, before building - */ - void assign_shared_var_group() - { - for(const auto &tensor : _kernel_tensors) - { - const auto tensor_id = tensor.first; - if(_graph.is_src_tensor(tensor_id) || _graph.is_dst_tensor(tensor_id)) - { - _shared_var_group_lut[tensor_id] = SharedVarGroup::Argument; - } - else - { - _shared_var_group_lut[tensor_id] = SharedVarGroup::Automatic; - } - } - } - - void topological_sort_utility(ComponentID component_id, std::unordered_set &visited, std::stack &stack) const - { - visited.insert(component_id); - - for(auto connected_component : _component_graph.find(component_id)->second) - { - if(visited.find(connected_component) == visited.end()) - { - topological_sort_utility(connected_component, visited, stack); - } - } - - stack.push(component_id); - } - - std::stack topological_sort() const - { - std::stack stack{}; - std::unordered_set visited{}; - - topological_sort_utility(_graph_root, visited, stack); - - return stack; - } - - void traverse(const std::function)> &func) const - { - std::stack stack = topological_sort(); - - while(!stack.empty()) - { - func(stack); - stack.pop(); - } - } - - std::string generate_argument_declaration(const SharedVarTable::SharedVar &var) const - { - ARM_COMPUTE_ERROR_ON_MSG(var.group != SharedVarGroup::Argument, "An argument declaration can only be generated from a kernel argument"); - std::string code; - switch(var.desc.tensor_arg_type) - { - case ClKernelTensorArgType::Vector: - { - code += "\n VECTOR_DECLARATION(" + var.uniq_name + ")"; - break; - } - case ClKernelTensorArgType::Image: - { - code += "\n IMAGE_DECLARATION(" + var.uniq_name + ")"; - break; - } - case ClKernelTensorArgType::Image_3D: - { - code += "\n IMAGE_DECLARATION(" + var.uniq_name + "),"; - code += "\n uint " + var.uniq_name + "_stride_z"; - break; - } - case ClKernelTensorArgType::Image_3D_Export_To_ClImage2D: - { - code += "\n __read_only image2d_t " + var.uniq_name + "_img,"; - 
code += "\n uint " + var.uniq_name + "_stride_z"; - break; - } - case ClKernelTensorArgType::Tensor_4D_t_Buffer: - { - code += "\n TENSOR4D_T(" + var.uniq_name + ", BUFFER)"; - break; - } - case ClKernelTensorArgType::Tensor_4D_t_Image: - { - code += "\n TENSOR4D_T(" + var.uniq_name + ", IMAGE)"; - break; - } - default: - { - ARM_COMPUTE_ERROR("Unsupported declaration generation for ClKernelTensorArgType"); - } - } - return code; - } - - std::string generate_kernel_signature(const SharedVarTable::Arguments &argument_list) const - { - std::string code = "\n__kernel void " + build_kernel_name() + "("; - - for(const auto &arg : argument_list.get_all_vars()) - { - code += generate_argument_declaration(arg) + ","; - } - - code[code.length() - 1] = ')'; - - return code; - } - - std::string generate_global_section() const - { - auto dst_info = get_kernel_argument_info(_dst_id); - auto dst_w = dst_info->dimension(0); - const auto tile_w = std::max(1, get_execution_window().x().step()); - const auto tile_h = std::max(1, get_execution_window().y().step()); - auto leftover_w = dst_w % tile_w; - - std::string code = ""; - code += std::string(" int cout = GET_SPATIAL_IDX(0, ") + std::to_string(tile_w) + ", " + std::to_string(leftover_w) + ");\n"; - code += std::string(" int mout = GET_SPATIAL_IDX(1, ") + std::to_string(tile_h) + ", " + "0);\n"; - code += std::string(" int bout = GET_SPATIAL_IDX(2, 1, 0);\n\n"); - - switch(_tile_info.clipping) - { - case ClippingStrategy::TOP_LEFT: - code += " const bool g_cond_x = (cout == 0);\n"; - code += " const bool g_cond_y = (mout == 0);\n"; - break; - case ClippingStrategy::TOP_RIGHT: - code += " const bool g_cond_x = ((cout + 1) * " + std::to_string(tile_w) + " >= " + std::to_string(_tile_info.boundaries.x()) + ");\n"; - code += " const bool g_cond_y = (mout == 0);\n"; - break; - case ClippingStrategy::BOTTOM_LEFT: - code += " const bool g_cond_x = (cout == 0);\n"; - code += " const bool g_cond_y = ((mout + 1) * " + 
std::to_string(tile_h) + " >= " + std::to_string(_tile_info.boundaries.y()) + ");\n"; - break; - case ClippingStrategy::BOTTOM_RIGHT: - code += " const bool g_cond_x = ((cout + 1) * " + std::to_string(tile_w) + " >= " + std::to_string(_tile_info.boundaries.x()) + ");\n"; - code += " const bool g_cond_y = ((mout + 1) * " + std::to_string(tile_h) + " >= " + std::to_string(_tile_info.boundaries.y()) + ");\n"; - break; - default: - ARM_COMPUTE_ERROR("Unsupported clipping strategy"); - } - - return code; - } - - TileDescriptor _tile_info{}; - - int32_t _num_complex_components{}; - - ArgumentID _dst_id{ -1 }; // Initially set to -1, which means the graph has no dst yet, since node IDs are positive numbers - - DependencyGraph _graph{}; - - // Tensors, components and IDs with corresponding ptrs (except intermediate) - std::unordered_map _components{}; - std::unordered_map _kernel_tensors{}; - // Argument group lookup. Can be replaced by extending the ArgumentID type to include group info - std::unordered_map _shared_var_group_lut{}; - - // Tracks all variables (e.g.: kernel arguments, kernel "global variables") - SharedVarTable _vtable{}; - - // Component directed graph (represented by an adjecency list of Component IDs) - // This is used to understand the ordering and bindings between components when generating the kernel - // It's initially set to -1 which means the graph has no root yet, since node IDs are positive numbers - ComponentID _graph_root{ -1 }; - std::unordered_map _component_graph{}; - - // Additional data structures used to define the relationships between components and arguments - // For each argument, it contains the list of components that consider it as an incoming or an outgoing argument - // E.g. tensor0 -> component0 -> tensor1 - // _outgoing_components[tensor0] == {component0} (component0 is the outgoing component of tensor0. 
Component0 treats tensor0 as an input tensor) - // _incoming_components[tensor1] == {component0} (component0 is the incoming component of tensor1. Component1 treats tensor1 as an output tensor) - std::unordered_map _outgoing_components{}; - std::unordered_map _incoming_components{}; -}; - -} // namespace dynamic_fusion -} // namespace experimental -} // namespace arm_compute -#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMMON_H -#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Utils.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Utils.h deleted file mode 100644 index 1b10050559..0000000000 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Utils.h +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright (c) 2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION - -#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_UTILS -#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_UTILS - -#include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h" - -namespace arm_compute -{ -namespace experimental -{ -namespace dynamic_fusion -{ -inline ::std::ostream &operator<<(::std::ostream &os, const CLBuildOptions::StringSet &build_opts) -{ - for(const auto &opt : build_opts) - { - os << opt << ","; - } - return os; -} -inline ::std::ostream &operator<<(::std::ostream &os, const CLBuildOptions &cl_build_opts) -{ - os << cl_build_opts.options(); - return os; -} - -inline std::string to_string(const CLBuildOptions &cl_build_opts) -{ - std::stringstream str; - str << cl_build_opts; - return str.str(); -} -inline ::std::ostream &operator<<(::std::ostream &os, const ClKernelCode &code) -{ - os << "name: " << code.name << std::endl; - os << "code: " << code.code << std::endl; - os << "build_opts: " << code.build_options << std::endl; - return os; -} -inline std::string to_string(const ClKernelCode &code) -{ - std::stringstream str; - str << code; - return str.str(); -} - -} // namespace dynamic_fusion -} // namespace experimental -} // namespace arm_compute - -#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_UTILS -#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp deleted file mode 100644 index 811cd79811..0000000000 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp +++ /dev/null @@ -1,409 +0,0 @@ -/* - * Copyright (c) 2022 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION - -#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h" - -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/CL/ICLKernel.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" - -#include "arm_compute/runtime/CL/CLScheduler.h" -namespace arm_compute -{ -namespace experimental -{ -namespace dynamic_fusion -{ -ComponentType ClDirectConvolutionKernelComponent::get_component_type() const -{ - return ComponentType::Complex; -} - -std::set ClDirectConvolutionKernelComponent::get_headers_list() const -{ - return std::set { "helpers.h", "tile_helpers.h" }; -} - -Window ClDirectConvolutionKernelComponent::get_window() const -{ - const auto src_info = _blueprint->impl().get_kernel_argument_info(_src.arg_id); - const auto weight_info = _blueprint->impl().get_kernel_argument_info(_weight.arg_id); - auto dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); - - // Get dst shape - PadStrideInfo pad_stride_info - { - static_cast(_desc.conv2d.stride.x()), - static_cast(_desc.conv2d.stride.y()), - static_cast(_desc.conv2d.pad.left), - static_cast(_desc.conv2d.pad.right), - static_cast(_desc.conv2d.pad.top), - static_cast(_desc.conv2d.pad.bottom), - DimensionRoundingType::FLOOR /*default rounding type*/ - }; - TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src_info, *weight_info, pad_stride_info); - - // Output auto initialization if not yet initialized - auto_init_if_empty(*dst_info, output_shape, - 1, - src_info->data_type(), - src_info->quantization_info()); - - const unsigned int vec_size = std::min(static_cast(dst_info->tensor_shape()[0]), 4u); - const unsigned int num_rows = (dst_info->tensor_shape()[0] > 16) ? ((src_info->data_type() == DataType::F32) ? 
2U : 4U) : 1U; - // const unsigned int num_rows = 1; - // const unsigned int vec_size = tile_info.tile_dims.x(); - // const unsigned int num_rows = tile_info.tile_dims.y(); - - // Create and configure kernel window - Window win = calculate_max_window(output_shape, Steps(vec_size, num_rows)); - - const size_t dim_y_collapsed = ceil_to_multiple(output_shape[1] * output_shape[2], num_rows); - win.set(Window::DimY, Window::Dimension(0, dim_y_collapsed, num_rows)); - win.set(Window::DimZ, Window::Dimension(0, output_shape.total_size_upper(3), 1)); - - return win; -} - -std::string ClDirectConvolutionKernelComponent::get_additional_macros() const -{ - return R"_()_"; // no macros -} - -std::string ClDirectConvolutionKernelComponent::get_component_code() const -{ - const auto src_info = _blueprint->impl().get_kernel_argument_info(_src.arg_id); - const auto bias_info = _blueprint->impl().get_kernel_argument_info(_bias.arg_id); - - ARM_COMPUTE_ERROR_ON_MSG(src_info->data_layout() != DataLayout::NHWC, "Only NHWC data layout is supported by this component."); - - const auto channel_idx = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::CHANNEL); - const auto k0 = adjust_vec_size(is_data_type_quantized(src_info->data_type()) ? 16u : 8u, src_info->dimension(channel_idx)); - const bool leftover_loop = (src_info->dimension(channel_idx) % k0) != 0; - - std::string code = R"_( - //------------------ START KERNEL {{meta_kernel_id}} --------------------- - // IN_0(src) {{src}} - // IN_1(wei) {{weight}} - )_"; - if(bias_info != nullptr) - { - code += R"_( - // IN_1(bia) {{bias}} - )_"; - } - code += R"_( - // OUT(dst, accum) {{dst}} - - // Initialize the accumulators - TILE({{ACC_DATA_TYPE}}, M0, N0, {{dst}}); - { - // All the tensor dimensions are passed at compile time. - // In case of dynamic tensor support, the following dimensions should be passed as function argument. 
- #define _IWEI_WIDTH {{WEI_WIDTH}} - #define _IWEI_HEIGHT {{WEI_HEIGHT}} - #define _ISRC_WIDTH {{src}}_w - #define _ISRC_HEIGHT {{src}}_h - #define _ISRC_CHANNELS {{src}}_c - #define _IDST_WIDTH {{arg_dst}}_w - #define _IDST_HEIGHT {{arg_dst}}_h - #define _IDST_CHANNELS {{arg_dst}}_c - #define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT) - - // .v = access the whole vector (OpenCL vector) - // .s[x] = access the vector element at position x (scalar access) - TILE(int, M0, 1, xi); - TILE(int, M0, 1, yi); - - // Convert the linear index to coordinate - LOOP_UNROLLING(int, i, 0, 1, M0, - { - xi[i].v = ((mout + i) % _IDST_WIDTH) * {{STRIDE_X}}; - yi[i].v = ((mout + i) / _IDST_WIDTH) * {{STRIDE_Y}}; - xi[i].v -= {{PAD_LEFT}}; - yi[i].v -= {{PAD_TOP}}; - }) - - LOOP_UNROLLING(int, i, 0, 1, M0, - { - {{dst}}[i].v = 0; - }) - - for(int i = 0; i < (_IWEI_WIDTH * _IWEI_HEIGHT); ++i) - { - int ck = 0; - int xk = i % _IWEI_WIDTH; - int yk = i / _IWEI_HEIGHT; - - int k = 0; - for(; k <= (_ISRC_CHANNELS - K0); k += K0) - { - TILE({{SRC_DATA_TYPE}}, M0, K0, a); - TILE({{WEI_DATA_TYPE}}, N0, K0, b); - - LOOP_UNROLLING(int, i, 0, 1, M0, - { - a[i].v = {{ZERO_VALUE}}; - }) - - // Load tile from the src tensor - T_LOAD_NHWC_INDIRECT({{SRC_DATA_TYPE}}, M0, K0, {{SRC_TENSOR_TYPE}}, {{src}}, bout, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, {{src}}_stride_y, xi, yi, a); - - // Load tile from the weights tensor - T_LOAD({{WEI_DATA_TYPE}}, N0, K0, {{WEI_TENSOR_TYPE}}, {{weight}}, ck, cout * _IY_MULTIPLIER + i, _IY_MULTIPLIER, {{weight}}_stride_y, b); - - // Compute the matrix multiplication between two tiles - T_MMUL({{SRC_DATA_TYPE}}, {{WEI_DATA_TYPE}}, {{ACC_DATA_TYPE}}, M0, N0, K0, NT, T, a, b, {{dst}}); - - ck += K0; - } - - // We voluntarily use SRC_CHANNELS rather than _DSRC_CHANNELS - // This #if directive should be removed in case of dynamic tensor support - )_"; - - if(leftover_loop) - { - code += R"_( - // Left-over accumulations - for(; k < _ISRC_CHANNELS; ++k) - { - 
TILE({{SRC_DATA_TYPE}}, M0, 1, a); - TILE({{WEI_DATA_TYPE}}, N0, 1, b); - - LOOP_UNROLLING(int, i, 0, 1, M0, - { - a[i].v = {{ZERO_VALUE}}; - }) - - // Load tile from the src tensor - T_LOAD_NHWC_INDIRECT({{SRC_DATA_TYPE}}, M0, 1, {{SRC_TENSOR_TYPE}}, {{src}}, bout, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, {{src}}_stride_y, xi, yi, a); - - // Load tile from the weights tensor - // The T_LOAD for the left-over elements can only use BUFFER because we load one element per iteration - T_LOAD({{WEI_DATA_TYPE}}, N0, 1, BUFFER, {{weight}}, ck, cout * _IY_MULTIPLIER + i, _IY_MULTIPLIER, {{weight}}_stride_y, b); - - // Compute the matrix multiplication between two tiles - T_MMUL({{SRC_DATA_TYPE}}, {{WEI_DATA_TYPE}}, {{ACC_DATA_TYPE}}, M0, N0, 1, NT, T, a, b, {{dst}}); - - ++ck; - } - )_"; - } - - code += R"_( - #undef _I_WEI_WIDTH - #undef _I_WEI_HEIGHT - #undef _ISRC_WIDTH - #undef _ISRC_HEIGHT - #undef _ISRC_CHANNELS - #undef _IDST_WIDTH - #undef _IDST_HEIGHT - #undef _IDST_CHANNELS - #undef _IY_MULTIPLIER - - } - )_"; - - if(bias_info != nullptr) - { - code += R"_( - TILE({{BIA_DATA_TYPE}}, 1, N0, bias0); - - T_LOAD({{BIA_DATA_TYPE}}, 1, N0, BUFFER, {{bias}}, cout, 0, 1, 0, bias0); - - // c = c + bias[broadcasted] - T_ELTWISE_BROADCAST_ADD_X({{ACC_DATA_TYPE}}, M0, N0, {{dst}}, bias0, {{dst}}); - )_"; - } - - code += R"_( - } -//------------------ END KERNEL {{meta_kernel_id}} --------------------- - )_"; - return code.c_str(); -} - -bool export_to_cl_image_support(const ITensorInfo *tensor, GPUTarget gpu_target, DataLayout data_layout) -{ - if(tensor->tensor_shape()[0] % 4 || (data_layout != DataLayout::NHWC)) - { - return false; - } - - // If not floating point - if(!is_data_type_float(tensor->data_type())) - { - return false; - } - - if(gpu_target == GPUTarget::G71 || get_arch_from_target(gpu_target) == GPUTarget::MIDGARD) - { - return false; - } - - // Check if the cl_khr_image2d_from_buffer extension is supported on the target platform - 
if(!image2d_from_buffer_supported(CLKernelLibrary::get().get_device())) - { - return false; - } - - // Check cl image pitch alignment - if(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0) - { - return false; - } - - const size_t image_w = tensor->tensor_shape()[0] / 4; - const size_t image_h = tensor->tensor_shape()[1] * tensor->tensor_shape()[2] * tensor->tensor_shape()[3]; - const size_t max_image_w = CLKernelLibrary::get().get_device().getInfo(); - const size_t max_image_h = CLKernelLibrary::get().get_device().getInfo(); - - if(image_w > max_image_w || image_h > max_image_h) - { - return false; - } - - return true; -} - -CLBuildOptions ClDirectConvolutionKernelComponent::generate_build_options() const -{ - const auto src_info = _blueprint->impl().get_kernel_argument_info(_src.arg_id); - auto weight_info = _blueprint->impl().get_kernel_argument_info(_weight.arg_id); - const auto dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); - // const auto tile_info = _blueprint->impl().get_tile_info(); - - const unsigned int channel_idx = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::CHANNEL); - const DataType data_type = src_info->data_type(); - const GPUTarget gpu_target = CLScheduler::get().target(); - - const unsigned int n0 = _blueprint->impl().get_execution_window().x().step(); - const unsigned int m0 = _blueprint->impl().get_execution_window().y().step(); - const unsigned int k0 = adjust_vec_size(is_data_type_quantized(data_type) ? 
16u : 8u, src_info->dimension(channel_idx)); - const unsigned int partial_store_n0 = dst_info->dimension(0) % n0; - const bool export_to_cl_image = export_to_cl_image_support(weight_info, gpu_target, src_info->data_layout()); - - // Update the padding for the weights tensor if we can export to cl_image - if(export_to_cl_image) - { - arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(weight_info); - } - - CLBuildOptions build_opts{}; - build_opts.add_option("-cl-fast-relaxed-math"); - build_opts.add_option("-DIS_TILED"); - build_opts.add_option("-DN0=" + support::cpp11::to_string(n0)); - build_opts.add_option("-DM0=" + support::cpp11::to_string(m0)); - build_opts.add_option("-DK0=" + support::cpp11::to_string(k0)); - build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0)); - - return build_opts; -} - -void ClDirectConvolutionKernelComponent::allocate_shared_vars(SharedVarTable &vtable) const -{ - const auto src_info = _blueprint->impl().get_kernel_argument_info(_src.arg_id); - const auto weight_info = _blueprint->impl().get_kernel_argument_info(_weight.arg_id); - - vtable.add(_src, _blueprint->impl().group(_src.arg_id), ClKernelArgDescriptor(_src.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "src"); - - const GPUTarget gpu_target = CLScheduler::get().target(); - const bool export_to_cl_image = export_to_cl_image_support(weight_info, gpu_target, src_info->data_layout()); - const ClKernelTensorArgType weight_type = export_to_cl_image ? 
ClKernelTensorArgType::Tensor_4D_t_Image : ClKernelTensorArgType::Tensor_4D_t_Buffer; - vtable.add(_weight, _blueprint->impl().group(_weight.arg_id), ClKernelArgDescriptor(_weight.arg_id, weight_type), "weight"); - - if(!_bias.is_empty()) // optional bias - { - vtable.add(_bias, _blueprint->impl().group(_bias.arg_id), ClKernelArgDescriptor(_bias.arg_id, ClKernelTensorArgType::Vector), "bias"); - } - vtable.add(_dst, _blueprint->impl().group(_dst.arg_id), ClKernelArgDescriptor(_dst.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "dst"); -} - -ClDirectConvolutionKernelComponent::TagLUT ClDirectConvolutionKernelComponent::get_tag_lut(const SharedVarTable &vtable) const -{ - TagLUT lut{}; - - const auto src_info = _blueprint->impl().get_kernel_argument_info(_src.arg_id); - const auto weight_info = _blueprint->impl().get_kernel_argument_info(_weight.arg_id); - const auto bias_info = _blueprint->impl().get_kernel_argument_info(_bias.arg_id); - - // Arguments and global shared variables - lut["src"] = vtable.get(_src); - lut["weight"] = vtable.get(_weight); - - if(!_bias.is_empty()) // optional bias - { - lut["bias"] = vtable.get(_bias); - lut["BIA_DATA_TYPE"] = get_cl_type_from_data_type(bias_info->data_type()); - } - lut["dst"] = vtable.get(_dst); - - const auto dst_argument = _blueprint->impl().get_argument_shared_vars().get_dst_var(); - lut["arg_dst"] = dst_argument.uniq_name; - - // Local build options - lut["meta_kernel_id"] = id(); - lut["ACC_DATA_TYPE"] = src_info->data_type(); - lut["SRC_DATA_TYPE"] = src_info->data_type(); - lut["WEI_DATA_TYPE"] = weight_info->data_type(); - - lut["SRC_TENSOR_TYPE"] = "BUFFER"; - switch(vtable.get(_weight).desc.tensor_arg_type) - { - case ClKernelTensorArgType::Image_Export_To_ClImage2D: - case ClKernelTensorArgType::Image_3D_Export_To_ClImage2D: - case ClKernelTensorArgType::Tensor_4D_t_Image: - { - lut["WEI_TENSOR_TYPE"] = "IMAGE"; - break; - } - default: - { - lut["WEI_TENSOR_TYPE"] = "BUFFER"; - break; - } - } - const 
auto width_idx = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::WIDTH); - const auto height_idx = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::HEIGHT); - lut["WEI_WIDTH"] = weight_info->dimension(width_idx); - lut["WEI_HEIGHT"] = weight_info->dimension(height_idx); - - lut["STRIDE_X"] = _desc.conv2d.stride.x(); - lut["STRIDE_Y"] = _desc.conv2d.stride.y(); - - lut["PAD_LEFT"] = _desc.conv2d.pad.left; - lut["PAD_TOP"] = _desc.conv2d.pad.top; - - lut["ZERO_VALUE"] = 0; - - return lut; -} -} // namespace dynamic_fusion -} // namespace experimental -} // namespace arm_compute -#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h deleted file mode 100644 index 5babdbab51..0000000000 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (c) 2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION - -#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLDIRECTCONVOLUTIONKERNELCOMPONENT_H -#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLDIRECTCONVOLUTIONKERNELCOMPONENT_H - -#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h" - -#include "utils/TypePrinter.h" - -namespace arm_compute -{ -namespace experimental -{ -namespace dynamic_fusion -{ -class ClDirectConvolutionKernelComponent : public IClKernelComponent -{ -public: - ClDirectConvolutionKernelComponent(ClKernelBlueprint *blueprint, const ClDirectConv2dKernelDescriptor &desc, - const Link &src, const Link &weight, const Link &dst, const Link &bias = Link{}) - : IClKernelComponent(blueprint), _desc{ desc }, _src{ src }, _weight{ weight }, _bias{ bias }, _dst{ dst } - { - } - - ComponentType get_component_type() const override; - std::set get_headers_list() const override; - std::string get_additional_macros() const override; - std::string get_component_code() const override; - Window get_window() const override; - ClKernelArgList get_args(); - CLBuildOptions generate_build_options() const override; - - virtual std::vector get_links() const override - { - return { _src, _weight, _bias, _dst }; - } - - virtual TagLUT get_tag_lut(const SharedVarTable &vtable) const override; - virtual void allocate_shared_vars(SharedVarTable &vtable) const override; - - virtual std::string name() const override - { - return 
"direct_convolution_" + to_string(_blueprint->impl().get_kernel_argument_info(_src.arg_id)->data_layout()) + "_" + std::to_string(id()); - } - -private: - ClDirectConv2dKernelDescriptor _desc{}; - Link _src{}; - Link _weight{}; - Link _bias{}; - Link _dst{}; -}; - -} // namespace dynamic_fusion -} // namespace experimental -} // namespace arm_compute -#endif // ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLDIRECTCONVOLUTIONKERNELCOMPONENT_H -#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseKernelComponent.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseKernelComponent.cpp deleted file mode 100644 index e2eba68a63..0000000000 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseKernelComponent.cpp +++ /dev/null @@ -1,266 +0,0 @@ -/* - * Copyright (c) 2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION - -#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseKernelComponent.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Validate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -namespace arm_compute -{ -namespace experimental -{ -namespace dynamic_fusion -{ -ComponentType ClElementwiseKernelComponent::get_component_type() const -{ - return ComponentType::Simple; -} - -std::set ClElementwiseKernelComponent::get_headers_list() const -{ - return std::set { "common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h", "tile_helpers.h" }; -} - -Window ClElementwiseKernelComponent::get_window() const -{ - const ITensorInfo *lhs_info = _blueprint->impl().get_kernel_argument_info(_lhs.arg_id); - const ITensorInfo *rhs_info = _blueprint->impl().get_kernel_argument_info(_rhs.arg_id); - ITensorInfo *dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); - - ARM_COMPUTE_ERROR_ON_NULLPTR(lhs_info, rhs_info, dst_info); - - const std::pair broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*lhs_info, *rhs_info); - const TensorShape &out_shape = broadcast_pair.first; - - auto_init_if_empty(*dst_info, out_shape, 1, lhs_info->data_type()); - - TensorShape output_shape = dst_info->tensor_shape(); - // Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) and upper dimensions unchanged - // This is in line with the collapsing convention used by Conv2d - output_shape.collapse(2U, 1U); - const unsigned int vector_size_byte_opencl = 16; - const unsigned int num_elems_processed_per_iteration = 
adjust_vec_size(vector_size_byte_opencl / dst_info->element_size(), dst_info->dimension(0)); - Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration)); - - return win; -} - -std::string ClElementwiseKernelComponent::get_component_code() const -{ - std::string code; - const bool is_root = _blueprint->impl().group(_lhs.arg_id) == SharedVarGroup::Argument && _blueprint->impl().group(_rhs.arg_id) == SharedVarGroup::Argument; - - if(is_root) - { - return R"_( - //------------------ START KERNEL {{meta_kernel_id}} ELTWISE_OP --------------------- - // IN_0(LHS) {{lhs}} - // IN_1(RHS) {{rhs}} - // OUT(dst, accum) {{dst}} - - // dst = lhs + rhs (mix-precision, broadcast, boundary aware) - TILE({{DATA_TYPE}}, M0, N0, {{dst}}); - { - TILE({{DATA_TYPE}}, M0, N0, lhs_tile); - TILE({{DATA_TYPE}}, M0, N0, rhs_tile); - - // Since mout maps to dimensions 1 (y) and dimension 2 (z) of the input tensor because of the collapsed window, bout maps to dimension 3 (w) - {{lhs}}_offset_first_element_in_bytes += bout * {{lhs}}_stride_w; - {{rhs}}_offset_first_element_in_bytes += bout * {{rhs}}_stride_w; - - T_LOAD({{DATA_TYPE}}, M0, N0, BUFFER, {{lhs}}, cout, mout, 1, {{lhs}}_stride_y, lhs_tile); - T_LOAD({{DATA_TYPE}}, {{rhs_m0}}, {{rhs_n0}}, BUFFER, {{rhs}}, {{rhs_start_x}}, {{rhs_start_y}}, 1, {{rhs}}_stride_y, rhs_tile); - -#if defined(IS_BROADCAST) - T_ELTWISE_BROADCAST_{{ELTWISE_OP}}_X({{DATA_TYPE}}, M0, N0, lhs_tile, rhs_tile, {{dst}}); -#else // !defined(IS_BROADCAST) - T_ELTWISE_{{ELTWISE_OP}}({{DATA_TYPE}}, M0, N0, lhs_tile, rhs_tile, {{dst}}); -#endif // defined(IS_BROADCAST) - - } - //------------------ END KERNEL {{meta_kernel_id}} ELTWISE_OP --------------------- -)_"; - } - else - { - return R"_( - //------------------ START KERNEL {{meta_kernel_id}} ELTWISE_OP --------------------- - // IN_0/Out(Accumulator) {{acc}} - // IN_1(Addend) {{addend}} - - // acc = addend + acc (mix-precision, broadcast, boundary aware) - { - TILE({{DATA_TYPE}}, M0, 
N0, addend_tile); - - T_LOAD({{DATA_TYPE}}, {{rhs_m0}}, {{rhs_n0}}, BUFFER, {{addend}}, {{rhs_start_x}}, {{rhs_start_y}}, 1, {{addend}}_stride_y, addend_tile); - -#if defined(IS_BROADCAST) - T_ELTWISE_BROADCAST_{{ELTWISE_OP}}_X({{DATA_TYPE}}, M0, N0, {{acc}}, addend_tile, {{acc}}); -#else // !defined(IS_BROADCAST) - T_ELTWISE_{{ELTWISE_OP}}({{DATA_TYPE}}, M0, N0, {{acc}}, addend_tile, {{acc}}); -#endif // defined(IS_BROADCAST) - } - //------------------ END KERNEL {{meta_kernel_id}} ELTWISE_OP --------------------- -)_"; - } -} - -CLBuildOptions ClElementwiseKernelComponent::generate_build_options() const -{ - const auto t_rhs_info = _blueprint->impl().get_kernel_argument_info(_rhs.arg_id); - const auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); - - CLBuildOptions build_opts{}; - const auto n0 = _blueprint->impl().get_execution_window().x().step(); - const auto m0 = _blueprint->impl().get_execution_window().y().step(); - const unsigned int partial_store_n0 = t_dst_info->dimension(0) % n0; - const bool is_broadcast = t_rhs_info->tensor_shape() != t_dst_info->tensor_shape(); - - build_opts.add_option("-DM0=" + support::cpp11::to_string(m0)); - build_opts.add_option("-DN0=" + support::cpp11::to_string(n0)); - build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0)); - build_opts.add_option_if(is_broadcast, "-DIS_BROADCAST"); - - return build_opts; -} - -std::string ClElementwiseKernelComponent::generate_config_id() const -{ - auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); - std::string config_id{}; - config_id += lower_string(string_from_data_type(t_dst_info->data_type())); - config_id += "_"; - config_id += support::cpp11::to_string(t_dst_info->dimension(0)); - config_id += "_"; - config_id += support::cpp11::to_string(t_dst_info->dimension(1)); - config_id += "_"; - config_id += lower_string(string_from_data_layout(t_dst_info->data_layout())); 
- return config_id; -} - -void ClElementwiseKernelComponent::allocate_shared_vars(SharedVarTable &vtable) const -{ - const bool is_root = _blueprint->impl().group(_lhs.arg_id) == SharedVarGroup::Argument && _blueprint->impl().group(_rhs.arg_id) == SharedVarGroup::Argument; - vtable.add(_lhs, _blueprint->impl().group(_lhs.arg_id), ClKernelArgDescriptor(_lhs.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "lhs"); - vtable.add(_rhs, _blueprint->impl().group(_rhs.arg_id), ClKernelArgDescriptor(_rhs.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "rhs"); - if(is_root) - { - vtable.add(_dst, _blueprint->impl().group(_dst.arg_id), ClKernelArgDescriptor(_dst.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "dst"); - } -} - -ClElementwiseKernelComponent::TagLUT ClElementwiseKernelComponent::get_tag_lut(const SharedVarTable &vtable) const -{ - TagLUT lut{}; - const auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); - ITensorInfo *t_addend_info = nullptr; - // Arguments and global shared variables - const bool is_root = _blueprint->impl().group(_lhs.arg_id) == SharedVarGroup::Argument && _blueprint->impl().group(_rhs.arg_id) == SharedVarGroup::Argument; - if(is_root) - { - lut["lhs"] = vtable.get(_lhs); - lut["rhs"] = vtable.get(_rhs); - lut["dst"] = vtable.get(_dst); - t_addend_info = _blueprint->impl().get_kernel_argument_info(_rhs.arg_id); - } - else - { - // Determine which link is the accumulator - Link accumulator; - Link addend; - if(_blueprint->impl().group(_lhs.arg_id) == SharedVarGroup::Automatic) - { - accumulator = _lhs; - addend = _rhs; - } - else if(_blueprint->impl().group(_rhs.arg_id) == SharedVarGroup::Automatic) - { - accumulator = _rhs; - addend = _lhs; - } - else - { - ARM_COMPUTE_ERROR("Invalid elementwise component linking"); - } - lut["acc"] = vtable.get(accumulator); - lut["addend"] = vtable.get(addend); - t_addend_info = _blueprint->impl().get_kernel_argument_info(addend.arg_id); - } - // Local 
build options - lut["meta_kernel_id"] = id(); - lut["DATA_TYPE"] = get_cl_type_from_data_type(t_dst_info->data_type()); - - switch(_desc.eltwise.op) - { - case ArithmeticOperation::DIV: - lut["ELTWISE_OP"] = "DIV"; - break; - case ArithmeticOperation::ADD: - lut["ELTWISE_OP"] = "ADD"; - break; - default: - ARM_COMPUTE_ERROR("Arithmetic Operation not supported"); - } - - // Set broadcast parameters - // PRE: All tensors are broadcast-compatible - const bool is_broadcast = t_addend_info->tensor_shape() != t_dst_info->tensor_shape(); - if(is_broadcast) - { - // Note that n0 maps to input tensor dimension 0, m0 maps to input dimensions 1 and 2 because of our collapse strategy - if(t_addend_info->dimension(0) == 1U && t_addend_info->dimension(1) == 1U && t_addend_info->dimension(2) == 1U) // Broadcast in X, Y, Z: collapsed rhs win [M0xN0] = [1x1] - { - lut["rhs_m0"] = "1"; - lut["rhs_n0"] = "1"; - lut["rhs_start_y"] = "0"; - lut["rhs_start_x"] = "0"; - } - else if(t_addend_info->dimension(1) == 1U && t_addend_info->dimension(2) == 1U) // Broadcast in Y and Z: collapsed rhs win [M0xN0] = [1xN] - { - lut["rhs_m0"] = "1"; - lut["rhs_n0"] = "N0"; - lut["rhs_start_y"] = "0"; - lut["rhs_start_x"] = "cout"; - } - else - { - ARM_COMPUTE_ERROR("Only support rhs broadcasting in all X, Y, Z dimensions, or just in Y and Z dimensions"); - } - } - else - { - lut["rhs_m0"] = "M0"; - lut["rhs_n0"] = "N0"; - lut["rhs_start_y"] = "mout"; - lut["rhs_start_x"] = "cout"; - } - return lut; -} -} // namespace dynamic_fusion -} // namespace experimental -} // namespace arm_compute -#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseKernelComponent.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseKernelComponent.h deleted file mode 100644 index f8377457d3..0000000000 --- 
a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseKernelComponent.h +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Copyright (c) 2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION - -#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLELEMENTWISEADDKERNELCOMPONENT_H -#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLELEMENTWISEADDKERNELCOMPONENT_H - -#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h" - -namespace arm_compute -{ -namespace experimental -{ -namespace dynamic_fusion -{ -class ClElementwiseKernelComponent : public IClKernelComponent -{ -public: - /** Construct a new Cl Elementwise Kernel Component object - * - * @param[in] blueprint Blueprint to which this component is added - * @param[in] desc Component descriptor - * @param[in] lhs Link to LHS tensor - * @param[in] rhs Link to RHS tensor - * @param[out] dst Link to DST tensor - * - * Support Level - * Data Type: F16, F32 - * Tensor Shape: Any shape of arbitrary dimension >= 1 and <= 4 - * Value Range: All - * Broadcasting: Only RHS tensor can be broadcasted into LHS. Only support broadcasting in dimension 1 and dimension 2 or all dimension 0, 1 and 2 - */ - ClElementwiseKernelComponent(ClKernelBlueprint *blueprint, const ClElementwiseKernelDescriptor &desc, const Link &lhs, const Link &rhs, const Link &dst) - : IClKernelComponent(blueprint), _desc{ desc }, _lhs{ lhs }, _rhs{ rhs }, _dst{ dst } - { - } - - ComponentType get_component_type() const override; - std::set get_headers_list() const override; - std::string get_component_code() const override; - Window get_window() const override; - CLBuildOptions generate_build_options() const override; - std::string generate_config_id() const override; - - virtual std::vector get_links() const override - { - return { _lhs, _rhs, _dst }; - } - - virtual TagLUT get_tag_lut(const SharedVarTable &vtable) const override; - virtual void allocate_shared_vars(SharedVarTable &vtable) const override; - - virtual std::string name() const override - { - return "eltwise_add_" + std::to_string(id()); - } - -private: - ClElementwiseKernelDescriptor 
_desc{}; - Link _lhs{}; - Link _rhs{}; - Link _dst{}; -}; - -} // namespace dynamic_fusion -} // namespace experimental -} // namespace arm_compute -#endif // ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLELEMENTWISEADDKERNELCOMPONENT_H -#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClFloorKernelComponent.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClFloorKernelComponent.cpp deleted file mode 100644 index 0a20a8f600..0000000000 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClFloorKernelComponent.cpp +++ /dev/null @@ -1,153 +0,0 @@ -/* - * Copyright (c) 2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION -#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClFloorKernelComponent.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Validate.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -namespace arm_compute -{ -namespace experimental -{ -namespace dynamic_fusion -{ -ComponentType ClFloorKernelComponent::get_component_type() const -{ - return ComponentType::Simple; -} -std::set ClFloorKernelComponent::get_headers_list() const -{ - return std::set { "common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h", "tile_helpers.h" }; -} -Window ClFloorKernelComponent::get_window() const -{ - const ITensorInfo *src_info = _blueprint->impl().get_kernel_argument_info(_src.arg_id); - ITensorInfo *dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); - - ARM_COMPUTE_ERROR_ON_NULLPTR(src_info, dst_info); - auto_init_if_empty(*dst_info, src_info->tensor_shape(), 1, src_info->data_type()); - - TensorShape output_shape = dst_info->tensor_shape(); - // Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) and upper dimensions unchanged - // This is in line with the collapsing convention used by Conv2d - output_shape.collapse(2U, 1U); - const unsigned int vector_size_byte_opencl = 16; - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / dst_info->element_size(), dst_info->dimension(0)); - Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration)); - - return win; -} -std::string ClFloorKernelComponent::get_component_code() const -{ - bool is_root = _blueprint->impl().group(_src.arg_id) == SharedVarGroup::Argument; - if(is_root) - { - return R"_( - //------------------ START KERNEL {{meta_kernel_id}} FLOOR --------------------- - // IN_0(src) {{src}} - // OUT(dst, accum) {{dst}} - TILE({{DATA_TYPE}}, M0, N0, 
{{dst}}); - { - TILE({{DATA_TYPE}}, M0, N0, src_tile); - - // Since mout maps to dimensions 1 (y) and dimension 2 (z) of the input tensor because of the collapsed window, bout maps to dimension 3 (w) - {{src}}_offset_first_element_in_bytes += bout * {{src}}_stride_w; - T_LOAD({{DATA_TYPE}}, M0, N0, BUFFER, {{src}}, cout, mout, 1, {{src}}_stride_y, src_tile); - - T_FLOOR({{DATA_TYPE}}, M0, N0, src_tile, {{dst}}); - } - //------------------ END KERNEL {{meta_kernel_id}} FLOOR --------------------- -)_"; - } - else - { - return R"_( - //------------------ START KERNEL {{meta_kernel_id}} FLOOR --------------------- - // IN_0/Out(Accumulator) {{acc}} - // output = floor(input) - { - T_FLOOR({{DATA_TYPE}}, M0, N0, {{acc}}, {{acc}}); - } - //------------------ END KERNEL {{meta_kernel_id}} FLOOR --------------------- -)_"; - } -} -CLBuildOptions ClFloorKernelComponent::generate_build_options() const -{ - CLBuildOptions build_opts{}; - const auto n0 = _blueprint->impl().get_execution_window().x().step(); - const auto m0 = _blueprint->impl().get_execution_window().y().step(); - const auto dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); - const unsigned int partial_store_n0 = dst_info->dimension(0) % n0; - build_opts.add_option("-DM0=" + support::cpp11::to_string(m0)); - build_opts.add_option("-DN0=" + support::cpp11::to_string(n0)); - build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0)); - return build_opts; -} -std::string ClFloorKernelComponent::generate_config_id() const -{ - auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); - std::string config_id{}; - config_id += lower_string(string_from_data_type(t_dst_info->data_type())); - config_id += "_"; - config_id += support::cpp11::to_string(t_dst_info->dimension(0)); - config_id += "_"; - config_id += support::cpp11::to_string(t_dst_info->dimension(1)); - config_id += "_"; - config_id += 
lower_string(string_from_data_layout(t_dst_info->data_layout())); - return config_id; -} -void ClFloorKernelComponent::allocate_shared_vars(SharedVarTable &vtable) const -{ - vtable.add(_src, _blueprint->impl().group(_src.arg_id), ClKernelArgDescriptor(_src.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "src"); - vtable.add(_dst, _blueprint->impl().group(_dst.arg_id), ClKernelArgDescriptor(_dst.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "dst"); -} -ClFloorKernelComponent::TagLUT ClFloorKernelComponent::get_tag_lut(const SharedVarTable &vtable) const -{ - TagLUT lut{}; - const auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); - // Arguments and global shared variables - const bool is_root = _blueprint->impl().group(_src.arg_id) == SharedVarGroup::Argument; - - if(is_root) - { - lut["src"] = vtable.get(_src); - lut["dst"] = vtable.get(_dst); - } - else - { - lut["acc"] = vtable.get(_src); - } - - lut["meta_kernel_id"] = id(); - lut["DATA_TYPE"] = get_cl_type_from_data_type(t_dst_info->data_type()); - return lut; -} -} // namespace dynamic_fusion -} // namespace experimental -} // namespace arm_compute -#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClFloorKernelComponent.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClFloorKernelComponent.h deleted file mode 100644 index e791b36382..0000000000 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClFloorKernelComponent.h +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright (c) 2022 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION - -#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLFLOORKERNELCOMPONENT_H -#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLFLOORKERNELCOMPONENT_H - -#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h" - -namespace arm_compute -{ -namespace experimental -{ -namespace dynamic_fusion -{ -class ClFloorKernelComponent : public IClKernelComponent -{ -public: - /** Construct a new Cl Floor Kernel Component object - * - * @param blueprint Blueprint to which this component is added - * @param src Link to SRC tensor - * @param dst Link to DST tensor - * - * Support Level - * Data Type: F16, F32 - * Tensor Shape: Any shape of arbitrary dimension >= 1 and <= 4 - * Value Range: All - */ - ClFloorKernelComponent(ClKernelBlueprint *blueprint, const Link &src, const Link &dst) - : IClKernelComponent(blueprint), _src{ src }, _dst{ dst } - { - } - - ComponentType get_component_type() const override; - std::set get_headers_list() const override; - std::string get_component_code() const override; - Window get_window() const override; - CLBuildOptions generate_build_options() const override; - std::string generate_config_id() const override; - - virtual std::vector get_links() const override - { - return { _src, _dst }; - } - - virtual TagLUT get_tag_lut(const SharedVarTable &vtable) const override; - virtual void allocate_shared_vars(SharedVarTable &vtable) const override; - - virtual std::string name() const override - { - return "floor_" + std::to_string(id()); - } - -private: - Link _src{}; - Link _dst{}; -}; - -} // namespace dynamic_fusion -} // namespace experimental -} // namespace arm_compute -#endif // ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLFLOORKERNELCOMPONENT_H -#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClKernelComponents.h 
b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClKernelComponents.h deleted file mode 100644 index 3f99dd5553..0000000000 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClKernelComponents.h +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Copyright (c) 2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION - -#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_CLKERNELCOMPONENTS_H -#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_CLKERNELCOMPONENTS_H - -#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h" -#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseKernelComponent.h" -#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClFloorKernelComponent.h" -#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h" - -#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_CLKERNELCOMPONENTS_H -#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp deleted file mode 100644 index 7c805d5368..0000000000 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp +++ /dev/null @@ -1,171 +0,0 @@ -/* - * Copyright (c) 2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION - -#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h" - -namespace arm_compute -{ -namespace experimental -{ -namespace dynamic_fusion -{ -ComponentType ClStoreBlockBoundaryAwareKernelComponent::get_component_type() const -{ - return ComponentType::Store; -} - -std::string ClStoreBlockBoundaryAwareKernelComponent::get_component_code() const -{ - return R"_( - //------------------ START KERNEL {{meta_kernel_id}} STORE --------------------- - - __global uchar *dst_addr = {{dst}}_ptr + {{dst}}_offset_first_element_in_bytes + (g_x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(g_y, M0, PARTIAL_STORE_M0) * {{dst}}_stride_y); - -#if defined(REINTERPRET_OUTPUT_AS_3D) - // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we - // multiply dst_stride_z by DEPTH_GEMM3D - dst_addr += g_z * {{dst}}_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_OUTPUT_AS_3D) - - // Add offset for batched GEMM - dst_addr += g_z * {{dst}}_stride_z; - -#endif // defined(REINTERPRET_OUTPUT_AS_3D) - - STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, {{src}}, dst_addr, {{dst}}_stride_y, g_zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, g_cond_y, g_cond_x); - - //------------------ END KERNEL {{meta_kernel_id}} STORE --------------------- - -)_"; -} - -CLBuildOptions ClStoreBlockBoundaryAwareKernelComponent::generate_build_options() const -{ - auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); - // auto tile_info = _blueprint->impl().get_tile_info(); - - CLBuildOptions build_opts{}; - - const auto n0 = _blueprint->impl().get_execution_window().x().step(); - const auto m0 = _blueprint->impl().get_execution_window().y().step(); - const auto partial_m0 = t_dst_info->dimension(0) % m0; - const auto partial_n0 = t_dst_info->dimension(1) % n0; - - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(t_dst_info->data_type())); - build_opts.add_option("-DM0=" + support::cpp11::to_string(m0)); - build_opts.add_option("-DN0=" + support::cpp11::to_string(n0)); - build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_m0)); - build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_n0)); - - return build_opts; -} - -void ClStoreBlockBoundaryAwareKernelComponent::allocate_shared_vars(SharedVarTable &vtable) const -{ - vtable.add(_src, _blueprint->impl().group(_src.arg_id), ClKernelArgDescriptor(_src.arg_id, ClKernelTensorArgType::Image_3D), "src"); - vtable.add(_dst, _blueprint->impl().group(_dst.arg_id), ClKernelArgDescriptor(_dst.arg_id, ClKernelTensorArgType::Image_3D), "dst"); -} - -ClStoreBlockBoundaryAwareKernelComponent::TagLUT 
ClStoreBlockBoundaryAwareKernelComponent::get_tag_lut(const SharedVarTable &vtable) const -{ - return { - { "meta_kernel_id", id() }, - { "src", vtable.get(_src) }, - { "dst", vtable.get(_dst) }, - }; -} - -ComponentType ClStoreIndirectWidthSelectKernelComponent::get_component_type() const -{ - return ComponentType::Store; -} - -std::string ClStoreIndirectWidthSelectKernelComponent::get_component_code() const -{ - return R"_( - //------------------ START KERNEL {{meta_kernel_id}} STORE --------------------- - { - // This also follows NHWC layout - // cout maps to global_id(0) maps to Channel - // mout maps to global_id(1) maps to Height and Weight (Collapsed Window) - // bout maps to global_id(3) maps to N / Batch - #define _IDST_WIDTH {{dst}}_w - #define _IDST_HEIGHT {{dst}}_h - TILE(uint, M0, 1, dst_indirect_y); - - // Calculate the destination indirect Y - LOOP_UNROLLING(int, i, 0, 1, M0, - { - dst_indirect_y[i].v = (uint)min(mout + i, (int)(_IDST_WIDTH * _IDST_HEIGHT) - 1); - dst_indirect_y[i].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT); - }) - - bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0; - - T_STORE_INDIRECT_WIDTH_SELECT({{DST_DATA_TYPE}}, M0, N0, PARTIAL_N0, {{DST_TENSOR_TYPE}}, {{dst}}, cout, {{dst}}_stride_y, x_cond, {{src}}, dst_indirect_y); - - #undef _IDST_WIDTH - #undef _IDST_HEIGHT - //------------------ END KERNEL {{meta_kernel_id}} STORE --------------------- - } - -)_"; -} - -CLBuildOptions ClStoreIndirectWidthSelectKernelComponent::generate_build_options() const -{ - CLBuildOptions build_opts{}; - - return build_opts; -} - -void ClStoreIndirectWidthSelectKernelComponent::allocate_shared_vars(SharedVarTable &vtable) const -{ - vtable.add(_src, _blueprint->impl().group(_src.arg_id), ClKernelArgDescriptor(_src.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "src"); - vtable.add(_dst, _blueprint->impl().group(_dst.arg_id), ClKernelArgDescriptor(_dst.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "dst"); -} - 
-ClStoreIndirectWidthSelectKernelComponent::TagLUT ClStoreIndirectWidthSelectKernelComponent::get_tag_lut(const SharedVarTable &vtable) const -{ - TagLUT lut{}; - - // Arguments and global shared variables - lut["src"] = vtable.get(_src); - lut["dst"] = vtable.get(_dst); - - // Local build options - lut["meta_kernel_id"] = id(); - lut["DST_TENSOR_TYPE"] = "BUFFER"; - const auto dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); - lut["DST_DATA_TYPE"] = dst_info->data_type(); - - return lut; -} - -} // namespace dynamic_fusion -} // namespace experimental -} // namespace arm_compute -#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h deleted file mode 100644 index e0b188dc8d..0000000000 --- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright (c) 2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION - -#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLSTOREKERNELCOMPONENTS_H -#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLSTOREKERNELCOMPONENTS_H - -#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h" - -namespace arm_compute -{ -namespace experimental -{ -namespace dynamic_fusion -{ -class ClStoreBlockBoundaryAwareKernelComponent : public IClKernelComponent -{ -public: - ClStoreBlockBoundaryAwareKernelComponent(ClKernelBlueprint *blueprint, const Link &src, const Link &dst) - : IClKernelComponent(blueprint), _src{ src }, _dst{ dst } - { - } - ComponentType get_component_type() const override; - std::string get_component_code() const override; - CLBuildOptions generate_build_options() const override; - TagLUT get_tag_lut(const SharedVarTable &vtable) const override; - void allocate_shared_vars(SharedVarTable &vtable) const override; - - virtual std::vector get_links() const override - { - return { _src, _dst }; - } - - virtual std::string name() const override - { - return ""; - } - -private: - Link _src{}; - Link _dst{}; -}; - -class ClStoreIndirectWidthSelectKernelComponent : public IClKernelComponent -{ -public: - ClStoreIndirectWidthSelectKernelComponent(ClKernelBlueprint *blueprint, const Link &src, const Link &dst) - : IClKernelComponent(blueprint), _src{ src }, _dst{ dst } - { - } - ComponentType get_component_type() const override; - std::string get_component_code() const override; - CLBuildOptions generate_build_options() const override; - virtual TagLUT get_tag_lut(const SharedVarTable &vtable) const override; - void allocate_shared_vars(SharedVarTable &vtable) const override; 
- - virtual std::vector get_links() const override - { - return { _src, _dst }; - } - - virtual std::string name() const override - { - return ""; - } - -private: - Link _src{}; - Link _dst{}; -}; - -} // namespace dynamic_fusion -} // namespace experimental -} // namespace arm_compute -#endif // ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLSTOREKERNELCOMPONENTS_H -#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/OperatorGraph.cpp b/src/core/experimental/dynamic_fusion/OperatorGraph.cpp deleted file mode 100644 index bd88afdb47..0000000000 --- a/src/core/experimental/dynamic_fusion/OperatorGraph.cpp +++ /dev/null @@ -1,264 +0,0 @@ -/* - * Copyright (c) 2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION -#include "arm_compute/core/experimental/OperatorGraph.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.h" -#include "src/core/helpers/AutoConfiguration.h" - -namespace arm_compute -{ -namespace experimental -{ -namespace dynamic_fusion -{ -namespace -{ -void check_dependency_graph_op_success(OperatorGraph &graph, const Status &status) -{ - if(!bool(status)) - { - graph.impl()->status = Status{ status.error_code(), "Cycles or loops are not allowed" }; - } -} - -// Check if there are more than one roots in the graph -void check_multiple_roots(OperatorGraph &graph) -{ - if(graph.impl()->graph.get_root_ops().size() > 1) - { - graph.impl()->status = Status{ ErrorCode::RUNTIME_ERROR, "Multiple roots are not allowed" }; - } -} - -void check_execution_shape(OperatorGraph &graph, const ITensorInfo &dst_info) -{ - const auto roots = graph.impl()->graph.get_root_ops(); - for(auto root : roots) - { - // We assume exactly 1 dst tensor for all operators - const auto root_info = graph.impl()->tensors[graph.impl()->graph.dst_tensors(root)[0]]->get_tensor_info(); - for(unsigned int dim = 0; dim < root_info->num_dimensions(); ++dim) - { - if(root_info->dimension(dim) != dst_info.dimension(dim)) - { - graph.impl()->status = Status{ ErrorCode::RUNTIME_ERROR, "Cannot change execution space" }; - return; - } - } - } -} -} // namespace - -OpTensor::OpTensor(Id id) - : _id{ id } -{ -} - -OpTensor::Id OpTensor::id() const -{ - return _id; -} - -bool operator<(const OpTensor &t0, const OpTensor &t1) -{ - return t0.id() < t1.id(); -} - -Operator::Operator(Id id) - : _id{ id } -{ -} - -Operator::Id Operator::id() const -{ - return _id; -} - -bool operator<(const Operator &op0, const Operator &op1) -{ - return op0.id() < op1.id(); -} - -OperatorGraph::OperatorGraph() - : _impl{ std::make_unique() } -{ -} - -OperatorGraph::~OperatorGraph() = default; - 
-OperatorGraph::Implementation *OperatorGraph::impl() -{ - return _impl.get(); -} - -const OperatorGraph::Implementation *OperatorGraph::impl() const -{ - return _impl.get(); -} - -Status validate(const OperatorGraph &graph) -{ - return graph.impl()->status; -} - -OpTensor add_tensor(OperatorGraph &graph, ITensorInfo &info) -{ - auto id = graph.impl()->graph.add_tensor(); - OpTensor op_tensor(id); - graph.impl()->add_tensor(id, &info); - return op_tensor; -} - -Operator add_op_conv2d(OperatorGraph &graph, const Conv2dDescriptor &desc, OpTensor input, OpTensor weights, OpTensor bias, OpTensor dst) -{ - // Check if map is empty as a complex operator can only be root - if(!graph.impl()->graph.get_root_ops().empty()) - { - graph.impl()->status = Status{ ErrorCode::RUNTIME_ERROR, "Cannot add multiple complex operators" }; - return Operator{}; - } - - std::pair status_id; - - if(bias.id() == -1) - { - status_id = graph.impl()->graph.add_operator({ input.id(), weights.id() }, { dst.id() }); - } - else - { - status_id = graph.impl()->graph.add_operator({ input.id(), weights.id(), bias.id() }, { dst.id() }); - } - - check_dependency_graph_op_success(graph, status_id.first); - - Operator op_node(status_id.second); - - // Infer TensorInfo - OpTensorContent *dst_tensor = graph.impl()->tensors[dst.id()].get(); - if(dst_tensor->get_tensor_info()->total_size() == 0) - { - auto src = graph.impl()->tensors[input.id()]->get_tensor_info(); - auto wts = graph.impl()->tensors[weights.id()]->get_tensor_info(); - auto shape = misc::shape_calculator::compute_deep_convolution_shape(src->tensor_shape(), src->data_layout(), wts->tensor_shape(), PadStrideInfo(desc.stride.x(), desc.stride.y(), desc.pad.left, - desc.pad.right, - desc.pad.top, desc.pad.bottom, DimensionRoundingType::FLOOR)); // use the default DimensionRoundingType - - auto_init_if_empty(*(dst_tensor->get_tensor_info()), src->clone()->set_tensor_shape(shape)); - } - - // Check execution space - auto dst_info = 
dst_tensor->get_tensor_info(); - check_execution_shape(graph, *dst_info); - - ITensorDescPack tensors; - tensors.add_const_tensor(ACL_SRC_0, graph.impl()->tensors[input.id()].get()); - tensors.add_const_tensor(ACL_SRC_1, graph.impl()->tensors[weights.id()].get()); - if(bias.id() != -1) - { - tensors.add_const_tensor(ACL_SRC_2, graph.impl()->tensors[bias.id()].get()); - } - tensors.add_const_tensor(ACL_DST_0, graph.impl()->tensors[dst.id()].get()); - - graph.impl()->add_node(status_id.second, desc, tensors); - check_multiple_roots(graph); - - return op_node; -} - -Operator add_op_conv2d(OperatorGraph &graph, const Conv2dDescriptor &desc, OpTensor input, OpTensor weights, OpTensor dst) -{ - return add_op_conv2d(graph, desc, input, weights, OpTensor(-1), dst); -} - -void force_conv2d_method(OperatorGraph &graph, Operator conv2d, ConvolutionMethod method) -{ - auto node = utils::cast::polymorphic_downcast(graph.impl()->operators[conv2d.id()].get()); - node->set_method(method); -} - -Operator add_op_elementwise_op(OperatorGraph &graph, const ElementwiseDescriptor &desc, OpTensor lhs, OpTensor rhs, OpTensor dst) -{ - auto id = graph.impl()->graph.add_operator({ rhs.id(), lhs.id() }, { dst.id() }); - check_dependency_graph_op_success(graph, id.first); - - Operator op_node(id.second); - - // Infer TensorInfo - auto node_lhs = graph.impl()->tensors[lhs.id()]->get_tensor_info(); - auto node_rhs = graph.impl()->tensors[rhs.id()]->get_tensor_info(); - OpTensorContent *node_dst = graph.impl()->tensors[dst.id()].get(); - - if(node_dst->get_tensor_info()->total_size() == 0) - { - const std::pair broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*node_rhs, *node_lhs); - auto_init_if_empty(*(node_dst->get_tensor_info()), node_lhs->clone()->set_tensor_shape(broadcast_pair.first)); - } - - // Check execution space - auto dst_info = node_dst->get_tensor_info(); - check_execution_shape(graph, *dst_info); - - ITensorDescPack tensors; - tensors.add_const_tensor(ACL_SRC_0, 
graph.impl()->tensors[lhs.id()].get()); - tensors.add_const_tensor(ACL_SRC_1, graph.impl()->tensors[rhs.id()].get()); - tensors.add_const_tensor(ACL_DST_0, graph.impl()->tensors[dst.id()].get()); - graph.impl()->add_node(id.second, desc, tensors); - check_multiple_roots(graph); - - return op_node; -} - -Operator add_op_floor(OperatorGraph &graph, const FloorDescriptor &desc, OpTensor src, OpTensor dst) -{ - auto id = graph.impl()->graph.add_operator({ src.id() }, { dst.id() }); - check_dependency_graph_op_success(graph, id.first); - - Operator op_node(id.second); - - // Infer TensorInfo - auto node_src = graph.impl()->tensors[src.id()]->get_tensor_info(); - OpTensorContent *node_dst = graph.impl()->tensors[dst.id()].get(); - - if(node_dst->get_tensor_info()->total_size() == 0) - { - auto_init_if_empty(*(node_dst->get_tensor_info()), *node_src); - } - - // Check execution space - auto dst_info = node_dst->get_tensor_info(); - check_execution_shape(graph, *dst_info); - - ITensorDescPack tensors; - tensors.add_const_tensor(ACL_SRC_0, graph.impl()->tensors[src.id()].get()); - tensors.add_const_tensor(ACL_DST_0, graph.impl()->tensors[dst.id()].get()); - graph.impl()->add_node(id.second, desc, tensors); - check_multiple_roots(graph); - - return op_node; -} -} // namespace dynamic_fusion -} // namespace experimental -} // namespace arm_compute -#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.cpp b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.cpp deleted file mode 100644 index 4e57d66a1c..0000000000 --- a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.cpp +++ /dev/null @@ -1,232 +0,0 @@ -/* - * Copyright (c) 2022 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION -#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.h" - -namespace arm_compute -{ -namespace experimental -{ -namespace dynamic_fusion -{ -namespace -{ -std::vector> get_combinations(const std::vector &sorted_fgs) -{ - ARM_COMPUTE_ERROR_ON(sorted_fgs.size() <= 1); - std::vector> combo; - for(size_t i = 0; i < sorted_fgs.size() - 1; ++i) - { - for(size_t j = i + 1; j < sorted_fgs.size(); ++j) - { - combo.push_back(std::make_pair(sorted_fgs.at(i), sorted_fgs.at(j))); - } - } - return combo; -} -} // namespace -std::vector traverse(const ClKernelFusionGroup &group) -{ - std::vector kernels; - const auto sorted = group.graph.topological_sort(); - for(const auto &pack : sorted.second) - { - kernels.push_back(group.fused_kernels.at(pack.op)); - } - return kernels; -} - -std::vector traverse(const ClFusedKernelGraph &graph) -{ - std::vector kernels; - const auto sorted = graph.fg_dependency.topological_sort(); - for(const auto &pack : sorted.second) - { - kernels.push_back(graph.fusion_groups.at(pack.op).get()); - } - return kernels; -} - -std::vector traverse(ClFusedKernelGraph &graph) -{ - std::vector kernels; - const auto sorted = graph.fg_dependency.topological_sort(); - for(const auto &pack : sorted.second) - { - kernels.push_back(graph.fusion_groups.at(pack.op).get()); - } - return kernels; -} - -std::pair init_fusion_graph(const ClKernelGraph &kernel_graph) -{ - ClFusedKernelGraph fused_kernel_graph{}; - fused_kernel_graph.original_graph = &kernel_graph; // Create a copy of the original kernel graph - fused_kernel_graph.fg_dependency = DependencyGraph(); - // Initialize all fusion groups - for(const auto &kernel : traverse(kernel_graph)) - { - fused_kernel_graph.add_fusion_group({ kernel }); - } - return { Status{}, fused_kernel_graph }; -} - -Status fuse(ClFusedKernelGraph &fused_kernel_graph) -{ - // A naive fusion algorithm that's guaranteed to find optimal pattern if there are no 
branches - // If there are branches, the algorithm cannot guanrantee optimality as it doesn't perform any searches - - bool fusion_found = false; - do - { - fusion_found = false; - const auto sorted_fgs = traverse(fused_kernel_graph); - if(sorted_fgs.size() <= 1) - { - // Only one or zero fusion group, thus no need to perform fusion - return Status{}; - } - auto fgs_combo = get_combinations(sorted_fgs); - for(auto fgs : fgs_combo) - { - auto fg0 = fgs.first; - auto fg1 = fgs.second; - const auto st = fused_kernel_graph.can_fuse(*fg0, *fg1); - if(bool(st)) - { - const auto st = fused_kernel_graph.fuse(*fg0, *fg1); - if(!bool(st)) - { - return st; - } - fusion_found = true; - break; - } - } - } - while(fusion_found); - return Status{}; -} -Status generate_store(ClKernelBlueprint &bp, const ClFusedKernelGraph &fused_kernel_graph, const ClKernelFusionGroup &fg) -{ - Status st{}; - for(const auto &dst_t_id : fused_kernel_graph.fg_dependency.dst_tensors(fg.id)) - { - const auto dst_t = fused_kernel_graph.original_graph->get_tensor(dst_t_id); - - /// NOTE: dst tensor must have already been added to the blueprint at this point - ArgumentID dst_id; - st = add_tensor(bp, dst_t->desc, dst_id, dst_t->id); - if(!bool(st)) - { - return st; - } - /// NOTE: the extra dst tensor is needed as the store kcomp requires 2 tensors. But this is irrelevant to the fused kernel graph - /// since both tensors share the exact same info and kernel arg descriptor - ArgumentID dst_dst_id; - st = add_tensor(bp, dst_t->desc, dst_dst_id); - if(!bool(st)) - { - return st; - } - /// NOTE: Update the merge point map to link dst_dst_id with dst_t->id instead. 
- /// This is required because the get_arguments() returned by the blueprint returns the dst tensor added by the store component - st = update_merge_point(bp, dst_dst_id, dst_t->id); - if(!bool(st)) - { - return st; - } - st = add_kcomp_store(bp, fg.get_root_kernel()->config().store_type, dst_id, dst_dst_id); - if(!bool(st)) - { - return st; - } - } - return st; -} - -Status generate(ClWorkload &workload, const ClWorkloadContext &ctx, const ClFusedKernelGraph &fused_kernel_graph) -{ - workload.context = ctx; - for(const auto &fg : traverse(fused_kernel_graph)) - { - ClKernelBlueprint bp{}; - for(const auto &kernel : traverse(*fg)) - { - const auto st = kernel->generate(bp); - if(!bool(st)) - { - return st; - } - } - auto st = set_tile_info(bp, fg->get_root_kernel()->config().tile_desc); - if(!bool(st)) - { - return st; - } - st = generate_store(bp, fused_kernel_graph, *fg); - if(!bool(st)) - { - return st; - } - - ClKernelCode code{}; - st = build(code, ClCodeBuilderContext{ ctx.gpu_info }, bp); - if(!bool(st)) - { - return st; - } - const auto bp_graph = get_dependency_graph(bp); - - // Get tensor info - std::vector workload_src_tensors{}; - for(const auto &src_t_id : fused_kernel_graph.fg_dependency.src_tensors(fg->id)) - { - const auto src_t = fused_kernel_graph.original_graph->get_tensor(src_t_id); - // Get corresponding kernel arg descriptor - const auto arg_desc = code.arguments.at(bp_graph.get_merge_points().at(src_t->id)); - const auto kernel_t_id = workload.add_workload_tensor(src_t->desc, src_t->memory_type, src_t->memory_info, arg_desc, src_t->id); - workload_src_tensors.push_back(kernel_t_id); - } - std::vector workload_dst_tensors{}; - for(const auto &dst_t_id : fused_kernel_graph.fg_dependency.dst_tensors(fg->id)) - { - const auto dst_t = fused_kernel_graph.original_graph->get_tensor(dst_t_id); - // Get corresponding kernel arg descriptor - const auto arg_desc = code.arguments.at(bp_graph.get_merge_points().at(dst_t->id)); - const auto kernel_t_id = 
workload.add_workload_tensor(dst_t->desc, dst_t->memory_type, dst_t->memory_info, arg_desc, dst_t->id); - workload_dst_tensors.push_back(kernel_t_id); - } - - workload.add_unit_workload(fg->get_root_kernel()->config().stage, code, workload_src_tensors, workload_dst_tensors); - } - - return Status{}; -} - -} // namespace dynamic_fusion -} // namespace experimental -} // namespace arm_compute -#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.h b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.h deleted file mode 100644 index 2051f1b62f..0000000000 --- a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.h +++ /dev/null @@ -1,452 +0,0 @@ -/* - * Copyright (c) 2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION -#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLFUSEDKERNELGRAPH_H -#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLFUSEDKERNELGRAPH_H -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/experimental/DependencyGraph.h" -#include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h" -#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h" -#include "support/DeepCopy.h" - -#include - -namespace arm_compute -{ -namespace experimental -{ -namespace dynamic_fusion -{ -struct ClKernelFusionGroup; - -/** A const view of a subgraph of the @ref ClKernelGraph to be fused together - * - */ -struct ClKernelFusionGroup -{ -public: - using Id = DependencyGraph::Id; - - ClKernelFusionGroup() = default; - ClKernelFusionGroup(Id id) - : id{ id }, graph{}, fused_kernels{}, tensors{} - { - } - ~ClKernelFusionGroup() = default; - - void set_id(Id i) - { - id = i; - } - - Id add_fused_kernel(const ClKernel *kernel) - { - /// PRE: Acyclicity ensured by DependencyGraph - /// PRE: Connectedness ensured by DependencyGraph - /// PRE: Single-rootedness ensured by User - std::vector src_tensors; - for(const auto t : kernel->tensors().get_const_src_tensors()) - { - auto id = graph.add_tensor(t->id); - if(tensors.find(id) == tensors.end()) - { - tensors[id] = t; - } - src_tensors.push_back(id); - } - std::vector dst_tensors; - for(const auto t : kernel->tensors().get_const_dst_tensors()) - { - auto id = graph.add_tensor(t->id); - if(tensors.find(id) == tensors.end()) - { - tensors[id] = t; - } - dst_tensors.push_back(id); - } - auto id = graph.add_operator(src_tensors, dst_tensors); - fused_kernels[id.second] = kernel; - return id.second; - } - - const ClKernel *get_root_kernel() const - { - auto root_kernels = graph.get_root_ops(); - ARM_COMPUTE_ERROR_ON(root_kernels.size() != 1); - return fused_kernels.at(root_kernels.at(0)); - } - - std::vector 
get_src_tensors() const - { - std::vector src_tensors; - for(auto tensor_id : graph.src_tensors()) - { - src_tensors.push_back(tensors.at(tensor_id)); - } - return src_tensors; - } - - std::vector get_dst_tensors() const - { - std::vector dst_tensors; - for(auto tensor_id : graph.dst_tensors()) - { - dst_tensors.push_back(tensors.at(tensor_id)); - } - return dst_tensors; - } - - friend bool operator==(const ClKernelFusionGroup &fg0, const ClKernelFusionGroup &fg1) - { - return fg0.id == fg1.id && fg0.graph == fg1.graph && fg0.fused_kernels == fg1.fused_kernels && fg0.tensors == fg1.tensors; - } - - Id id{}; - DependencyGraph graph{}; // A subgraph of the original ClKernelGraph - std::map fused_kernels{}; - std::map tensors{}; -}; - -std::vector traverse(const ClKernelFusionGroup &group); - -struct ClFusedKernelGraph -{ -public: - using Id = DependencyGraph::Id; - - using KernelFusionGroupMap = std::map>; - - ClFusedKernelGraph() = default; - ~ClFusedKernelGraph() = default; - ClFusedKernelGraph(const ClFusedKernelGraph &graph) = default; - ClFusedKernelGraph &operator=(const ClFusedKernelGraph &graph) = default; - ClFusedKernelGraph(ClFusedKernelGraph &&graph) = default; - ClFusedKernelGraph &operator=(ClFusedKernelGraph &&graph) = default; - - friend bool operator==(const ClFusedKernelGraph &graph0, const ClFusedKernelGraph &graph1) - { - /// NOTE: fg_dependency may change based on the order of fusion, and thus is omitted in the comparison. 
- /// The fusion groups can already guarantee the equivalence of fusion - /// In the future we may want to enforce a stronger equivalence by implementing topological comparison between @ref DependencyGraph s - return graph0.original_graph == graph1.original_graph && graph0.fusion_groups == graph1.fusion_groups; - } - - Id add_fusion_group(const std::vector &fused_kernels) - { - auto fg = utils::memory::make_deep_unique(); - for(const auto k : fused_kernels) - { - fg->add_fused_kernel(k); - } - const auto src_tensors = fg->get_src_tensors(); - const auto dst_tensors = fg->get_dst_tensors(); - std::vector inputs{}; - std::transform(std::begin(src_tensors), std::end(src_tensors), std::back_inserter(inputs), [this](auto kernel) - { - return fg_dependency.add_tensor(kernel->id); - }); - std::vector outputs{}; - std::transform(std::begin(dst_tensors), std::end(dst_tensors), std::back_inserter(outputs), [this](auto kernel) - { - return fg_dependency.add_tensor(kernel->id); - }); - const auto id = fg_dependency.add_operator(inputs, outputs); - fg->set_id(id.second); - fusion_groups[id.second] = std::move(fg); - return id.second; - } - - Status fuse(ClKernelFusionGroup &fg0, ClKernelFusionGroup &fg1) - { - /// PRE: Already checked by can_fuse, and thus all the INVs and ASSUMPTIONS still hold - ClKernelFusionGroup *fg_src{}; - ClKernelFusionGroup *fg_dst{}; - // Find fg_src (parent / root) and fg_dst (child / non-root) - if(is_in(fg1.id, fg_dependency.dst_ops(fg0.id))) - { - fg_src = &fg0; - fg_dst = &fg1; - } - else if(is_in(fg0.id, fg_dependency.dst_ops(fg1.id))) - { - fg_src = &fg1; - fg_dst = &fg0; - } - else - { - return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: Not directly connected fusion groups cannot be fused together" }; - } - - for(const auto &t : fg_dependency.src_tensors(fg_dst->id)) - { - if(!is_in(t, fg_dependency.dst_tensors(fg_src->id))) - { - // Link any incoming tensors of fg_dst, that ARE NOT in between fg_src and fg_dst, to fg_src - - // 
Before: - // fg_src - // | - // .. t1 - // | | - // -> fg_dst <- - // - // After: - // fg_src <---t1 - // - const auto st = link_src_tensors(fg_src->id, { t }); - if(!bool(st)) - { - return st; - } - } - else - { - const auto dst_fgs = fg_dependency.dst_ops_from_tensor(t); - if(dst_fgs.size() == 1U && dst_fgs.at(0) == fg_dst->id) - { - // Remove any incoming tensors of fg_dst, that ARE in between fg_src and fg_dst - // AND that are not connected to any other outgoing fgs (Note that they cannot connect to any other incoming fgs as all tensors can have at most 1 incoming fg (ASSUMPTION 3)) - - // Before: - // fg_src - // | - // t0 - // | - // -> fg_dst - // - // After: - // fg_src - // - const auto st = remove_fg_tensor(t); - if(!bool(st)) - { - return st; - } - } - else - { - // If the tensors ARE in between fg_src and fg_dst - // BUT have any other outgoing fgs than fg_dst, then we leave it as a dst tensor to the fused fg_src - - // Before: - // fg_src - // | - // t0 - // | - // |----------- - // | | - // -> fg_dst -> fg_other - // - // After: - // fg_src - // | - // t0 - // | - // -> fg_other - // - - // Note that this may seem like a case we shouldn't fuse. But actually all it means is that t0 is an - // intermediate tensor between the fused fg_src and fg_dst, but only that we also STORE it to memory - // so that any unfused fg's (fg_other in this case) can read it. - // So all this means that we not only can STORE the tensors at the "end" of a fusion group, - // but also any other tensors that are not source tensors. And all tensors that are STORED (exported), - // can be termed "dst tensors" to a fusion group - void(); - } - } - } - - for(const auto &t : fg_dependency.dst_tensors(fg_dst->id)) - { - // Link any outgoing tensors of fg_dst to fg_src - - // Before: - // fg_src - // | - // .. 
- // | - // -> fg_dst - // | - // |-------- - // | | - // |-> t0 |-> t1 - // - // After: - // fg_src - // | - // |-------- - // | | - // |-> t0 |-> t1 - // - const auto st = link_dst_tensors(fg_src->id, { t }); - if(!bool(st)) - { - return st; - } - } - - // Merge fg_dst's graph into fg_src's graph - for(const auto kernel : traverse(*fg_dst)) - { - fg_src->add_fused_kernel(kernel); - } - - const auto st = remove_fg(fg_dst->id); - return st; - } - Status can_fuse(const ClKernelFusionGroup &fg0, const ClKernelFusionGroup &fg1) const - { - /// ASSUMPTION0: All tensors have 0 or 1 incoming kernel - /// ASSUMPTION1: All kernels have exactly 1 dst tensor (Temporary, can be lifted once we start supporting multi-dst kernels) - /// Note that this does not apply to fusion groups - /// ASSUMPTION2: Simple kernels' tile infos can be overriden (share with) that of the root kernel's - /// ASSUMPTION3: Extension of ASSUMPTION0: All tensors have 0 or 1 incoming fusion group - /// INV0: All Fusion groups have a single root - /// INV1: All Fusion groups have no cycles or loops within themselves <- guaranteed by the underlying ClKernelGraph having no cycles or loops; enforced by DependencyGraph - /// INV2: The ClKernelFusionGroup itself has no cycles or loops <- enforced by DependencyGraph - /// INV3: All non-roots are Simple kernels - /// INV4: All non roots' dst tensors have the same shape as that of the root kernel - /// INV5: All kernels within a fusion group have the same UnitWorkloadStage - const ClKernelFusionGroup *fg_src {}; - const ClKernelFusionGroup *fg_dst{}; - - // Check 0: Ensure fg0 and fg1 are "directly connected": one of them is a direct parent of the other - // This guarantess INV0 - // This also finds fg_src (parent / root) and fg_dst (child / non-root) - if(is_in(fg1.id, fg_dependency.dst_ops(fg0.id))) - { - fg_src = &fg0; - fg_dst = &fg1; - } - else if(is_in(fg0.id, fg_dependency.dst_ops(fg1.id))) - { - fg_src = &fg1; - fg_dst = &fg0; - } - else - { - return 
Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: Not directly connected fusion groups cannot be fused together" }; - } - - // Find unconnected tensors between fg_src and fg_dst - std::vector unconnected_tensors{}; - for(const auto &t : fg_dependency.dst_tensors(fg_src->id)) - { - if(!is_in(t, fg_dependency.src_tensors(fg_dst->id))) - { - unconnected_tensors.push_back(t); - } - } - - // Check 1: Any unconnected tensor cannot be an ancestor of fg_dst - // This guarantees INV2: That is, the fused graph does not have any cycles or loops between different fusion groups - for(const auto &t : unconnected_tensors) - { - if(fg_dependency.path_exists_from_tensor_to_op(t, fg_dst->id)) - { - return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: the fusion would result in cycles or loops" }; - } - } - - // Check 2: All non-root fgs are simple. Ensure INV3 - if(fg_dst->get_root_kernel()->complexity() != Complexity::Simple) - { - return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: only root kernel can be a complex kernel" }; - } - - // Check 3: All non roots' dst tensors have the same shape as that of the root kernel. Ensure INV4 - const auto root_kernel_dst_tensors = fg_dependency.dst_tensors(fg_src->id); - ARM_COMPUTE_ERROR_ON(root_kernel_dst_tensors.size() != 1); // (ASSUMPTION 1: All kernels have exactly 1 dst tensor) - const auto root_kernel_dst_tensor_info = original_graph->get_tensor(root_kernel_dst_tensors[0])->desc; - - for(const auto &t : fg_dependency.dst_tensors(fg_dst->id)) - { - const auto t_info = original_graph->get_tensor(t)->desc; - if(detail::have_different_dimensions(root_kernel_dst_tensor_info->tensor_shape(), t_info->tensor_shape(), 0)) - { - return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: all non roots' dst tensors should have the same shape as that of the root kernel" }; - } - } - - // Check 4: All kernels within a fg have the same UnitWorkloadStage. 
Ensure INV5 - if(!(fg_src->get_root_kernel()->config().stage == fg_dst->get_root_kernel()->config().stage)) - { - return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: all kernels within a fusion group should have the same UnitWorkloadStage" }; - } - - return Status{}; - } - - const ClKernelGraph *original_graph{}; - DependencyGraph fg_dependency{}; - KernelFusionGroupMap fusion_groups{}; - // Note: no need to store tensors pointers in the ClFusedKernelGraph, as they are stored in side the individual fusion groups. - -private: - Status link_src_tensors(Id fg, const std::vector &src_tensors) - { - for(auto t : src_tensors) - { - fg_dependency.link_input(fg, t); - } - return Status{}; - } - Status link_dst_tensors(Id fg, const std::vector &dst_tensors) - { - for(auto t : dst_tensors) - { - fg_dependency.link_output(fg, t); - } - return Status{}; - } - Status remove_fg(Id fg) - { - fg_dependency.remove_operator(fg); - fusion_groups.erase(fg); - return Status{}; - } - Status remove_fg_tensor(Id tensor) - { - fg_dependency.remove_tensor(tensor); - return Status{}; - } -}; - -std::vector traverse(const ClFusedKernelGraph &graph); -std::vector traverse(ClFusedKernelGraph &graph); - -std::pair init_fusion_graph(const ClKernelGraph &kernel_graph); - -Status fuse(ClFusedKernelGraph &fused_kernel_graph); - -Status generate_store(ClKernelBlueprint &bp, const ClFusedKernelGraph &fused_kernel_graph, const ClKernelFusionGroup &fg); - -Status generate(ClWorkload &workload, const ClWorkloadContext &ctx, const ClFusedKernelGraph &fused_kernel_graph); - -} // namespace dynamic_fusion -} // namespace experimental -} // namespace arm_compute -#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLFUSEDKERNELGRAPH_H -#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelDescriptors.h b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelDescriptors.h deleted file mode 100644 index 
f10e97e3e9..0000000000 --- a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelDescriptors.h +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright (c) 2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION -#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLKERNELDESCRIPTORS_H -#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLKERNELDESCRIPTORS_H - -#include "arm_compute/core/experimental/OperatorGraph.h" - -namespace arm_compute -{ -namespace experimental -{ -namespace dynamic_fusion -{ -struct ClDirectConv2dKernelDescriptor -{ - friend bool operator==(const ClDirectConv2dKernelDescriptor &desc0, const ClDirectConv2dKernelDescriptor &desc1) - { - return desc0.conv2d == desc1.conv2d; - } - Conv2dDescriptor conv2d{}; -}; - -struct ClElementwiseKernelDescriptor -{ - friend bool operator==(const ClElementwiseKernelDescriptor &desc0, const ClElementwiseKernelDescriptor &desc1) - { - return desc0.eltwise == desc1.eltwise; - } - ElementwiseDescriptor eltwise{}; -}; - -struct ClFloorKernelDescriptor -{ - friend bool operator==(const ClFloorKernelDescriptor &desc0, const ClFloorKernelDescriptor &desc1) - { - return desc0.floor == desc1.floor; - } - FloorDescriptor floor{}; -}; - -struct ClActivationKernelDescriptor -{ - friend bool operator==(const ClActivationKernelDescriptor &, const ClActivationKernelDescriptor &) - { - return true; - } -}; - -enum class ClippingStrategy -{ - TOP_LEFT, - TOP_RIGHT, - BOTTOM_LEFT, - BOTTOM_RIGHT, -}; -/** Component: Store */ -struct TileDescriptor -{ - Size2D tile_dims{}; - Size2D boundaries{}; - ClippingStrategy clipping{ ClippingStrategy::TOP_LEFT }; - - TileDescriptor() - { - } - - TileDescriptor(Size2D dims, const Size2D &bound, const ClippingStrategy &clip) - : tile_dims(dims), boundaries(bound), clipping(clip) - { - } - - bool empty() const - { - return (tile_dims.area() == 0) || (boundaries.area() == 0); - } - friend bool operator==(const TileDescriptor &tile0, const TileDescriptor &tile1) - { - return tile0.tile_dims == tile1.tile_dims && tile0.boundaries == tile1.boundaries && tile0.clipping == tile1.clipping; - } -}; -enum class StoreType -{ - VStore, - VStorePartial, - StoreRow, - 
ConvertStoreRow, - StoreBlock, - ConvertStoreBlock, - StoreRowPartial, - StoreBlockPartial, - StoreBlockBoundaryAware, - StoreVectorSelect, - TStoreIndirectWidthSelect -}; -} // namespace dynamic_fusion -} // namespace experimental -} // namespace arm_compute -#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLKERNELDESCRIPTORS_H -#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.cpp b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.cpp deleted file mode 100644 index cab51a2ce6..0000000000 --- a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.cpp +++ /dev/null @@ -1,271 +0,0 @@ -/* - * Copyright (c) 2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION -#include "arm_compute/core/utils/misc/ShapeCalculator.h" - -#include "src/core/CL/CLValidate.h" -#include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h" -#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h" - -#include "support/Cast.h" - -namespace arm_compute -{ -namespace experimental -{ -namespace dynamic_fusion -{ -Status ClDirectConv2dKernel::generate(ClKernelBlueprint &bp) const -{ - const auto input = _tensors.get_const_tensor(TensorType::ACL_SRC_0); - const auto weight = _tensors.get_const_tensor(TensorType::ACL_SRC_1); - const auto bias = _tensors.get_const_tensor(TensorType::ACL_SRC_2); - const auto dst = _tensors.get_const_tensor(TensorType::ACL_DST_0); - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weight, dst); - ArgumentID input_id; - add_tensor(bp, input->desc, input_id, input->id); - ArgumentID weight_id; - add_tensor(bp, weight->desc, weight_id, weight->id); - ArgumentID bias_id = g_arg_placeholder; - if(bias != nullptr) - { - add_tensor(bp, bias->desc, bias_id, bias->id); - } - ArgumentID dst_id; - add_tensor(bp, dst->desc, dst_id, dst->id); - - add_kcomp_direct_conv2d(bp, desc, input_id, weight_id, bias_id, dst_id); - return Status{}; -} -Status ClDirectConv2dKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ClDirectConv2dKernelDescriptor &conv2d_desc) -{ - // 1. 
Check validity - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); - // Matching data type - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - if(biases != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases); - } - - // Matching data layout - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst); - if(biases != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, biases); - } - - // All tensor infos are initialized - ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() == 0); - ARM_COMPUTE_RETURN_ERROR_ON(weights->tensor_shape().total_size() == 0); - ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0); - if(biases != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON(biases->tensor_shape().total_size() == 0); - } - // Device requirements are met - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - // weights shape is correct - const DataLayout data_layout = src->data_layout(); - const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != src->dimension(channel_idx), "Weights feature map dimension should match the respective src's one"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4, "Weights can be at most 4 dimensional"); - - // dst shape is correct - PadStrideInfo legacy_pad_stride(conv2d_desc.conv2d.stride.x(), conv2d_desc.conv2d.stride.y(), conv2d_desc.conv2d.pad.left, conv2d_desc.conv2d.pad.right, conv2d_desc.conv2d.pad.top, - conv2d_desc.conv2d.pad.bottom, DimensionRoundingType{}); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), - misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, legacy_pad_stride)); - - // biases shape is correct - if(biases != nullptr) - { - 
ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->dimension(0) != weights->dimension(3), - "Biases size and number of dst feature maps should match"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->num_dimensions() > 1, - "Biases should be one dimensional"); - } - - // 2. Check support level - // Data type - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); - // Data layout - ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC); - - return Status{}; -} - -bool ClDirectConv2dKernel::operator==(const ClKernel &other) const -{ - const auto converted = *utils::cast::polymorphic_downcast(&other); - return config() == other.config() && tensors() == other.tensors() && desc == converted.desc; -} - -Status ClElementwiseKernel::generate(ClKernelBlueprint &bp) const -{ - const auto lhs = _tensors.get_const_tensor(TensorType::ACL_SRC_0); - const auto rhs = _tensors.get_const_tensor(TensorType::ACL_SRC_1); - const auto dst = _tensors.get_const_tensor(TensorType::ACL_DST_0); - ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); - ArgumentID lhs_id; - add_tensor(bp, lhs->desc, lhs_id, lhs->id); - ArgumentID rhs_id; - add_tensor(bp, rhs->desc, rhs_id, rhs->id); - ArgumentID dst_id; - add_tensor(bp, dst->desc, dst_id, dst->id); - - add_kcomp_eltwise_op(bp, desc, lhs_id, rhs_id, dst_id); - return Status{}; -} - -Status ClElementwiseKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst) -{ - // 1. 
Check validity - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst); - - // Matching data type - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst); - - // Matching data layout - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(lhs, rhs); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(lhs, dst); - - // All tensor infos are initialized - ARM_COMPUTE_RETURN_ERROR_ON(lhs->tensor_shape().total_size() == 0); - ARM_COMPUTE_RETURN_ERROR_ON(rhs->tensor_shape().total_size() == 0); - ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0); - - // Device requirements are met - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(lhs); - - const bool in_place = (lhs == dst) || (rhs == dst); - const bool src0_in_place = in_place && (lhs == dst); - - // dst shape is correct - const TensorShape out_shape = TensorShape::broadcast_shape(lhs->tensor_shape(), rhs->tensor_shape()); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst"); - if(in_place) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, src0_in_place ? lhs->tensor_shape() : rhs->tensor_shape(), 0), - "Wrong shape for dst, cannot do in_place calculation"); - } - - // 2. 
Check support level - - // Data type - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16); - - // Data layout - ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(lhs, DataLayout::NHWC); - - return Status{}; -} - -bool ClElementwiseKernel::operator==(const ClKernel &other) const -{ - const auto converted = *utils::cast::polymorphic_downcast(&other); - return config() == other.config() && tensors() == other.tensors() && desc == converted.desc; -} - -Status ClFloorKernel::generate(ClKernelBlueprint &bp) const -{ - const auto src = _tensors.get_const_tensor(TensorType::ACL_SRC_0); - const auto dst = _tensors.get_const_tensor(TensorType::ACL_DST_0); - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ArgumentID src_id; - add_tensor(bp, src->desc, src_id, src->id); - ArgumentID dst_id; - add_tensor(bp, dst->desc, dst_id, dst->id); - - add_kcomp_floor(bp, desc, src_id, dst_id); - return Status{}; -} - -Status ClFloorKernel::validate(const ITensorInfo *src, const ITensorInfo *dst) -{ - // 1. Check validity - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - - // Matching data type - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - - // Matching data layout - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst); - - // All tensor infos are initialized - ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() == 0); - ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0); - - // Device requirements are met - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - - // dst shape is correct - ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(src->tensor_shape(), dst->tensor_shape(), 0), "Wrong shape for dst"); - - // 2. 
Check support level - - // Data type - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32, DataType::F16); - - // Data layout - ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC); - - return Status{}; -} - -bool ClFloorKernel::operator==(const ClKernel &other) const -{ - const auto converted = *utils::cast::polymorphic_downcast(&other); - return config() == other.config() && tensors() == other.tensors() && desc == converted.desc; -} - -std::vector traverse(const ClKernelGraph &graph) -{ - std::vector kernels; - const auto sorted = graph.graph.topological_sort(); - for(const auto &pack : sorted.second) - { - kernels.push_back(graph.kernels.at(pack.op).get()); - } - return kernels; -} - -std::vector traverse(ClKernelGraph &graph) -{ - std::vector kernels; - const auto sorted = graph.graph.topological_sort(); - for(const auto &pack : sorted.second) - { - kernels.push_back(graph.kernels.at(pack.op).get()); - } - return kernels; -} -} // namespace dynamic_fusion -} // namespace experimental -} // namespace arm_compute -#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h deleted file mode 100644 index c3580cfaca..0000000000 --- a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h +++ /dev/null @@ -1,259 +0,0 @@ -/* - * Copyright (c) 2022 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION -#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLKERNELGRAPH_H -#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLKERNELGRAPH_H - -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/experimental/ClWorkload.h" -#include "arm_compute/core/experimental/DependencyGraph.h" -#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelDescriptors.h" -#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ITensorDescPack.h" -#include "support/DeepCopy.h" - -namespace arm_compute -{ -namespace experimental -{ -namespace dynamic_fusion -{ -struct ClKernelGraph; -class ClKernelBlueprint; - -enum class Complexity -{ - Simple, - Complex -}; - -/** Configurations for ClKernel - * - */ -struct ClKernelConfig -{ - UnitWorkloadStage stage{}; - TileDescriptor tile_desc{}; - StoreType store_type{}; - friend bool operator==(const ClKernelConfig &config0, const ClKernelConfig &config1) - { - return config0.stage == config1.stage && config0.tile_desc == config1.tile_desc && config0.store_type == config1.store_type; - } -}; - -struct ClKernelTensor -{ -public: - using Id = DependencyGraph::Id; - ClKernelTensor() = default; - ClKernelTensor(Id id, ITensorInfo *desc, MemoryType memory_type, const AuxMemoryInfo &memory_info) - : id{ id }, desc{ desc }, memory_type{ memory_type }, memory_info{ memory_info } - { - } - bool operator==(const ClKernelTensor &other) const - { - return desc == other.desc; - } - - Id id{}; - ITensorInfo *desc{}; - MemoryType memory_type{}; - AuxMemoryInfo memory_info{}; -}; - -struct ClKernel -{ -public: - using Id = DependencyGraph::Id; - ClKernel() = default; - virtual ~ClKernel() = default; - ClKernel(const ClKernel &kernel) = default; - ClKernel &operator=(const ClKernel &kernel) = default; - ClKernel(ClKernel &&kernel) = default; - ClKernel &operator=(ClKernel &&kernel) = default; - ClKernel(const ClKernelGraph *graph, Id id, const 
ClKernelConfig &config, const ITensorDescPack &tensors) - : _graph{ graph }, _id{ id }, _config{ config }, _tensors{ tensors } - { - } - virtual bool operator==(const ClKernel &other) const = 0; - virtual Complexity complexity() const = 0; - virtual Status generate(ClKernelBlueprint &bp) const = 0; - Id id() const - { - return _id; - } - ITensorDescPack tensors() const - { - return _tensors; - } - ClKernelConfig config() const - { - return _config; - } - -protected: - const ClKernelGraph *_graph {}; - Id _id{}; - ClKernelConfig _config{}; - ITensorDescPack _tensors{}; -}; - -struct ClDirectConv2dKernel : public ClKernel -{ -public: - Complexity complexity() const override - { - return Complexity::Complex; - } - ClDirectConv2dKernel() = default; - ~ClDirectConv2dKernel() override = default; - ClDirectConv2dKernel(const ClKernelGraph *graph, Id id, const ClKernelConfig config, const ClDirectConv2dKernelDescriptor &desc, const ITensorDescPack tensors) - : ClKernel{ graph, id, config, tensors }, desc{ desc } - { - } - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ClDirectConv2dKernelDescriptor &conv2d_desc); - bool operator==(const ClKernel &other) const override; - Status generate(ClKernelBlueprint &bp) const override; - - ClDirectConv2dKernelDescriptor desc{}; -}; - -struct ClElementwiseKernel : public ClKernel -{ -public: - Complexity complexity() const override - { - return Complexity::Simple; - } - ClElementwiseKernel() = default; - ~ClElementwiseKernel() override = default; - ClElementwiseKernel(const ClKernelGraph *graph, Id id, const ClKernelConfig &config, const ClElementwiseKernelDescriptor &desc, const ITensorDescPack tensors) - : ClKernel{ graph, id, config, tensors }, desc{ desc } - { - } - static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst); - bool operator==(const ClKernel &other) const override; - Status 
generate(ClKernelBlueprint &bp) const override; - - ClElementwiseKernelDescriptor desc{}; -}; - -struct ClFloorKernel : public ClKernel -{ -public: - Complexity complexity() const override - { - return Complexity::Simple; - } - ClFloorKernel() = default; - ~ClFloorKernel() override = default; - ClFloorKernel(const ClKernelGraph *graph, Id id, const ClKernelConfig &config, const ClFloorKernelDescriptor &desc, const ITensorDescPack tensors) - : ClKernel{ graph, id, config, tensors }, desc{ desc } - { - } - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); - bool operator==(const ClKernel &other) const override; - Status generate(ClKernelBlueprint &bp) const override; - - ClFloorKernelDescriptor desc{}; -}; - -struct ClKernelGraph -{ -public: - using Id = DependencyGraph::Id; - using KernelMap = std::map>; - using KernelTensorMap = std::map>; - - ClKernelGraph() = default; - ~ClKernelGraph() = default; - - friend bool operator==(const ClKernelGraph &graph0, const ClKernelGraph &graph1) - { - return graph0.graph == graph1.graph && graph0.kernels == graph1.kernels && graph0.tensors == graph1.tensors; - } - - Status add_kernel_tensor(ITensorInfo *desc, MemoryType memory_type, const AuxMemoryInfo &memory_info, Id &tensor_id, Id merge_point = DependencyGraph::empty_id()) - { - tensor_id = graph.add_tensor(merge_point); - if(tensors.find(tensor_id) == tensors.end()) - { - tensors[tensor_id] = utils::memory::make_deep_unique(tensor_id, desc, memory_type, memory_info); - } - return Status{}; - } - - template - Status add_kernel(const ClKernelConfig &config, const KernelDescT &desc, const ITensorDescPack &tensors, Id &kernel_id) - { - const auto src_tensors = tensors.get_const_src_tensors(); - const auto dst_tensors = tensors.get_const_dst_tensors(); - std::vector src_tensor_ids{}; - std::vector dst_tensor_ids{}; - for(const auto &t : src_tensors) - { - src_tensor_ids.push_back(t->id); - } - for(const auto &t : dst_tensors) - { - 
dst_tensor_ids.push_back(t->id); - } - kernel_id = graph.add_operator(src_tensor_ids, dst_tensor_ids).second; - auto k = utils::memory::make_deep_unique(this, kernel_id, config, desc, tensors); - kernels[kernel_id] = std::move(k); - return Status{}; - } - - ClKernel *get_kernel(Id id) - { - return kernels.at(id).get(); - } - const ClKernel *get_kernel(Id id) const - { - return kernels.at(id).get(); - } - - ClKernelTensor *get_tensor(Id id) - { - return tensors.at(id).get(); - } - const ClKernelTensor *get_tensor(Id id) const - { - return tensors.at(id).get(); - } - - DependencyGraph graph{}; - KernelMap kernels{}; - KernelTensorMap tensors{}; -}; -using Id = DependencyGraph::Id; - -std::vector traverse(const ClKernelGraph &graph); -std::vector traverse(ClKernelGraph &graph); - -} // namespace dynamic_fusion -} // namespace experimental -} // namespace arm_compute -#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLKERNELGRAPH_H -#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClWorkload.cpp b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClWorkload.cpp deleted file mode 100644 index dcada4f64b..0000000000 --- a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClWorkload.cpp +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright (c) 2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION -#include "arm_compute/core/experimental/ClWorkload.h" -#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.h" -#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h" -#include "src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.h" - -namespace arm_compute -{ -namespace experimental -{ -namespace dynamic_fusion -{ -Status build(ClWorkload &workload, const OperatorGraph &op_graph, const ClWorkloadContext &ctx) -{ - workload.context = ctx; - ClKernelGraph kernel_graph; - workload.status = validate(op_graph); - ARM_COMPUTE_RETURN_ON_ERROR(workload.status); - workload.status = translate(kernel_graph, *op_graph.impl()); - ARM_COMPUTE_RETURN_ON_ERROR(workload.status); - ClFusedKernelGraph fused_k_graph; - std::tie(workload.status, fused_k_graph) = init_fusion_graph(kernel_graph); - ARM_COMPUTE_RETURN_ON_ERROR(workload.status); - workload.status = fuse(fused_k_graph); - ARM_COMPUTE_RETURN_ON_ERROR(workload.status); - workload.status = generate(workload, ctx, fused_k_graph); - ARM_COMPUTE_RETURN_ON_ERROR(workload.status); - - // Get operator tensor id to workload tensor id map - const auto op_tensor_to_kernel_tensor = fused_k_graph.original_graph->graph.get_merge_points(); - const auto kernel_tensor_to_workload_tensor = workload.graph.get_merge_points(); - for(const auto op_t : op_graph.impl()->graph.src_tensors()) - { - const auto kernel_t = 
op_tensor_to_kernel_tensor.at(op_t); - const auto workload_t = kernel_tensor_to_workload_tensor.at(kernel_t); - workload.op_tensor_id_lut[workload_t] = op_t; - } - for(const auto op_t : op_graph.impl()->graph.dst_tensors()) - { - const auto kernel_t = op_tensor_to_kernel_tensor.at(op_t); - const auto workload_t = kernel_tensor_to_workload_tensor.at(kernel_t); - workload.op_tensor_id_lut[workload_t] = op_t; - } - return workload.status; -} -} // namespace dynamic_fusion -} // namespace experimental -} // namespace arm_compute -#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/DependencyGraph.cpp b/src/core/experimental/dynamic_fusion/WorkloadImpl/DependencyGraph.cpp deleted file mode 100644 index 7350255ebe..0000000000 --- a/src/core/experimental/dynamic_fusion/WorkloadImpl/DependencyGraph.cpp +++ /dev/null @@ -1,430 +0,0 @@ -/* - * Copyright (c) 2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION -#include "arm_compute/core/experimental/DependencyGraph.h" - -#include -#include -#include - -namespace arm_compute -{ -namespace experimental -{ -namespace dynamic_fusion -{ -DependencyGraph::DependencyGraph(const AdjList &adj_src_tensors, const AdjList &adj_dst_tensors, const AdjList &adj_src_ops, const AdjList &adj_dst_ops, std::map merge_points) - : _adj_src_tensors{ adj_src_tensors }, _adj_dst_tensors{ adj_dst_tensors }, _adj_src_ops{ adj_src_ops }, _adj_dst_ops{ adj_dst_ops }, _merge_to_internal{ merge_points }, _operator_id{}, _tensor_id{} -{ -} -DependencyGraph::DependencyGraph(const std::vector &imported_tensors) - : _adj_src_tensors{}, _adj_dst_tensors{}, _adj_src_ops{}, _adj_dst_ops{}, _merge_to_internal{}, _operator_id{}, _tensor_id{} -{ - for(auto t : imported_tensors) - { - _adj_src_ops[t] = {}; - _adj_dst_ops[t] = {}; - } -} - -Status DependencyGraph::update_merge_point(Id t_id, Id merge_point) -{ - if(_merge_to_internal.find(merge_point) == _merge_to_internal.end()) - { - return Status{ ErrorCode::RUNTIME_ERROR, "Merge point does not exist" }; - } - _merge_to_internal[merge_point] = t_id; - return Status{}; -} - -DependencyGraph::Id DependencyGraph::add_tensor(Id merge_tensor) -{ - Id new_tensor{ empty_id() }; - if(merge_tensor != empty_id()) - { - if(_merge_to_internal.find(merge_tensor) != _merge_to_internal.end()) - { - new_tensor = _merge_to_internal[merge_tensor]; - } - else - { - new_tensor = insert_new_tensor(); - _merge_to_internal[merge_tensor] = new_tensor; - } - } - else - { - new_tensor = insert_new_tensor(); - } - return new_tensor; -} - -void DependencyGraph::remove_tensor(Id tensor) -{ - for(auto src_op : 
_adj_src_ops.at(tensor)) - { - auto &dst_tensors = _adj_dst_tensors.at(src_op); - dst_tensors.erase( - std::remove(std::begin(dst_tensors), std::end(dst_tensors), tensor), - std::end(dst_tensors)); - } - for(auto dst_op : _adj_dst_ops.at(tensor)) - { - auto &src_tensors = _adj_src_tensors.at(dst_op); - src_tensors.erase( - std::remove(std::begin(src_tensors), std::end(src_tensors), tensor), - std::end(src_tensors)); - } - _adj_src_ops.erase(tensor); - _adj_dst_ops.erase(tensor); -} - -std::pair DependencyGraph::add_operator(const std::vector &inputs, const std::vector &outputs) -{ - Id new_op = insert_new_op(); - for(Id tensor : inputs) - { - link_input(new_op, tensor); - } - for(Id tensor : outputs) - { - link_output(new_op, tensor); - } - - // Use topological sort in order to detect possible loops / cycles. - // NOTE: This is unscalable. We'll need to have a better way of detecting loops or relax this invariant during operation, and add a validate method instead - return std::pair(topological_sort().first, new_op); -} - -void DependencyGraph::remove_operator(Id op) -{ - for(auto src_tensor : _adj_src_tensors.at(op)) - { - auto &dst_ops = _adj_dst_ops.at(src_tensor); - dst_ops.erase( - std::remove(std::begin(dst_ops), std::end(dst_ops), op), - std::end(dst_ops)); - } - for(auto dst_tensor : _adj_dst_tensors.at(op)) - { - auto &src_ops = _adj_src_ops.at(dst_tensor); - src_ops.erase( - std::remove(std::begin(src_ops), std::end(src_ops), op), - std::end(src_ops)); - } - _adj_src_tensors.erase(op); - _adj_dst_tensors.erase(op); -} - -std::map DependencyGraph::get_merge_points() const -{ - return _merge_to_internal; -} - -std::vector DependencyGraph::get_root_ops() const -{ - std::vector ops{}; - const auto op_list = all_ops(); - - for(auto op : op_list) - { - if(src_ops(op).empty()) - { - ops.emplace_back(op); - } - } - return ops; -} - -std::vector DependencyGraph::get_dst_ops() const -{ - std::vector ops{}; - const auto op_list = all_ops(); - - for(auto op : 
op_list) - { - if(dst_ops(op).empty()) - { - ops.emplace_back(op); - } - } - return ops; -} - -std::vector DependencyGraph::src_tensors(Id op) const -{ - ARM_COMPUTE_ERROR_ON(!operator_exists(op)); - return _adj_src_tensors.at(op); -} - -std::vector DependencyGraph::dst_tensors(Id op) const -{ - ARM_COMPUTE_ERROR_ON(!operator_exists(op)); - return _adj_dst_tensors.at(op); -} - -std::vector DependencyGraph::src_tensors() const -{ - std::vector tensors; - for(auto tensor_src_ops : _adj_src_ops) - { - if(tensor_src_ops.second.empty()) - tensors.push_back(tensor_src_ops.first); - } - return tensors; -} - -std::vector DependencyGraph::dst_tensors() const -{ - std::vector tensors; - for(auto tensor_dst_ops : _adj_dst_ops) - { - if(tensor_dst_ops.second.empty()) - tensors.push_back(tensor_dst_ops.first); - } - return tensors; -} - -std::vector DependencyGraph::src_ops_from_tensor(Id tensor) const -{ - return _adj_src_ops.at(tensor); -} -std::vector DependencyGraph::dst_ops_from_tensor(Id tensor) const -{ - return _adj_dst_ops.at(tensor); -} - -std::vector DependencyGraph::all_ops() const -{ - std::vector ops{}; - std::transform(std::begin(_adj_src_tensors), std::end(_adj_src_tensors), std::back_inserter(ops), [](const auto & it) - { - return it.first; - }); - return ops; -} - -bool DependencyGraph::path_exists_from_tensor_to_op(Id src_tensor, Id dst_op) const -{ - for(auto child_op : dst_ops_from_tensor(src_tensor)) - { - if(path_exists_from_op_to_op(child_op, dst_op)) - { - return true; - } - } - return false; -} - -bool DependencyGraph::path_exists_from_op_to_op(Id src_op, Id dst_op) const -{ - if(src_op == dst_op) - { - return true; - } - if(is_in(src_op, get_dst_ops())) - { - return false; - } - for(auto child_tensor : dst_tensors(src_op)) - { - if(path_exists_from_tensor_to_op(child_tensor, dst_op)) - { - return true; - } - } - return false; -} - -std::vector DependencyGraph::all_tensors() const -{ - std::vector tensors{}; - std::transform(std::begin(_adj_src_ops), 
std::end(_adj_src_ops), std::back_inserter(tensors), [](const auto & it) - { - return it.first; - }); - return tensors; -} - -unsigned int DependencyGraph::number_of_ops() const -{ - return _adj_src_tensors.size(); -} - -unsigned int DependencyGraph::number_of_tensors() const -{ - return _adj_src_ops.size(); -} - -DependencyGraph::Id DependencyGraph::insert_new_tensor() -{ - Id new_tensor = _tensor_id.alloc(); - _adj_src_ops[new_tensor] = {}; - _adj_dst_ops[new_tensor] = {}; - return new_tensor; -} -DependencyGraph::Id DependencyGraph::insert_new_op() -{ - Id new_op = _operator_id.alloc(); - _adj_src_tensors[new_op] = {}; - _adj_dst_tensors[new_op] = {}; - return new_op; -} -void DependencyGraph::link_input(Id op, Id in_tensor) -{ - ARM_COMPUTE_ERROR_ON(!operator_exists(op)); - ARM_COMPUTE_ERROR_ON(!tensor_exists(in_tensor)); - ARM_COMPUTE_ERROR_ON(are_connected(op, in_tensor)); - _adj_src_tensors[op].push_back(in_tensor); - _adj_dst_ops[in_tensor].push_back(op); -} -void DependencyGraph::link_output(Id op, Id out_tensor) -{ - ARM_COMPUTE_ERROR_ON(!operator_exists(op)); - ARM_COMPUTE_ERROR_ON(!tensor_exists(out_tensor)); - ARM_COMPUTE_ERROR_ON(are_connected(op, out_tensor)); - _adj_dst_tensors[op].push_back(out_tensor); - _adj_src_ops[out_tensor].push_back(op); -} -bool DependencyGraph::tensor_exists(Id tensor) const -{ - return _adj_src_ops.find(tensor) != _adj_src_ops.end() && _adj_dst_ops.find(tensor) != _adj_dst_ops.end(); -} -bool DependencyGraph::operator_exists(Id op) const -{ - return _adj_src_tensors.find(op) != _adj_src_tensors.end() && _adj_dst_tensors.find(op) != _adj_dst_tensors.end(); -} - -bool DependencyGraph::is_src_tensor(Id tensor) const -{ - if(!tensor_exists(tensor)) - { - return false; - } - return _adj_src_ops.at(tensor).empty(); -} - -bool DependencyGraph::is_dst_tensor(Id tensor) const -{ - if(!tensor_exists(tensor)) - { - return false; - } - return _adj_dst_ops.at(tensor).empty(); -} -bool DependencyGraph::is_src_tensor_of(Id op, Id 
tensor) const -{ - if(!operator_exists(op) || !tensor_exists(tensor)) - { - return false; - } - const auto op_inputs = src_tensors(op); - return std::find(op_inputs.begin(), op_inputs.end(), tensor) != op_inputs.end(); -} -bool DependencyGraph::is_dst_tensor_of(Id op, Id tensor) const -{ - if(!operator_exists(op) || !tensor_exists(tensor)) - { - return false; - } - const auto op_outputs = dst_tensors(op); - return std::find(op_outputs.begin(), op_outputs.end(), tensor) != op_outputs.end(); -} -bool DependencyGraph::are_connected(Id op, Id tensor) const -{ - return is_src_tensor_of(op, tensor) || is_dst_tensor_of(op, tensor); -} -std::vector DependencyGraph::src_ops(Id op) const -{ - ARM_COMPUTE_ERROR_ON(!operator_exists(op)); - std::vector ops{}; - for(Id src_tensor : src_tensors(op)) - { - ops.insert(ops.end(), std::begin(_adj_src_ops.at(src_tensor)), std::end(_adj_src_ops.at(src_tensor))); - } - return ops; -} - -std::vector DependencyGraph::dst_ops(Id op) const -{ - ARM_COMPUTE_ERROR_ON(!operator_exists(op)); - std::vector ops{}; - for(Id dst_tensor : _adj_dst_tensors.at(op)) - { - ops.insert(ops.end(), std::begin(_adj_dst_ops.at(dst_tensor)), std::end(_adj_dst_ops.at(dst_tensor))); - } - return ops; -} - -std::pair> DependencyGraph::topological_sort() const -{ - // Incident degree (number of source operators to an op) - std::map in_degree{}; - std::set visited_ops{}; - std::deque zero_in_degree_ops{}; - std::vector sorted_op_packs{}; - for(auto op : all_ops()) - { - const auto degree = src_ops(op).size(); - in_degree[op] = degree; - if(degree == 0) - { - zero_in_degree_ops.push_back(op); - visited_ops.insert(op); - } - } - - while(!zero_in_degree_ops.empty()) - { - const Id op = zero_in_degree_ops.front(); - zero_in_degree_ops.pop_front(); - sorted_op_packs.push_back(OpPack{ op, src_tensors(op), dst_tensors(op) }); - - for(const auto next_op : dst_ops(op)) - { - if(in_degree[next_op] > 0) - { - in_degree[next_op]--; - } - if(in_degree[next_op] == 0 && 
visited_ops.find(next_op) == visited_ops.end()) - { - zero_in_degree_ops.push_back(next_op); - visited_ops.insert(op); - } - } - } - - // If there are remaining ops with in_degree > 0, then it's indication that there are cycles in the graph - Status st{}; - if(sorted_op_packs.size() != number_of_ops()) - { - st = Status{ ErrorCode::RUNTIME_ERROR, "Cycles or loops are not allowed in a DependencyGraph" }; - } - return std::make_pair(st, sorted_op_packs); -} - -} // namespace dynamic_fusion -} // namespace experimental -} // namespace arm_compute -#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/ITensorDescPack.h b/src/core/experimental/dynamic_fusion/WorkloadImpl/ITensorDescPack.h deleted file mode 100644 index a4e4eaa3bb..0000000000 --- a/src/core/experimental/dynamic_fusion/WorkloadImpl/ITensorDescPack.h +++ /dev/null @@ -1,241 +0,0 @@ -/* - * Copyright (c) 2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION -#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_ITENSORDESCPACK_H -#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_ITENSORDESCPACK_H - -#include -#include -#include - -namespace arm_compute -{ -namespace experimental -{ -namespace dynamic_fusion -{ -template -class ITensorDescPack -{ -public: - struct PackElement - { - PackElement() = default; - ~PackElement() = default; - PackElement(const PackElement &) = default; - PackElement &operator=(const PackElement &) = default; - PackElement(PackElement &&) = default; - PackElement &operator=(PackElement &&) = default; - PackElement(int id, TDesc *tensor) - : id(id), tensor(tensor), ctensor(nullptr) - { - } - PackElement(int id, const TDesc *ctensor) - : id(id), tensor(nullptr), ctensor(ctensor) - { - } - - int id{ -1 }; - TDesc *tensor{ nullptr }; - const TDesc *ctensor{ nullptr }; - - friend bool operator==(const PackElement &elem0, const PackElement &elem1) - { - const bool same_ctensor = (elem0.tensor == nullptr && elem1.tensor == nullptr && elem0.ctensor != nullptr && elem1.ctensor != nullptr && *elem0.ctensor == *elem1.ctensor); - const bool same_tensor = (elem0.ctensor == nullptr && elem1.ctensor == nullptr && elem0.tensor != nullptr && elem1.tensor != nullptr && *elem0.tensor == *elem1.tensor); - - return elem0.id == elem1.id && (same_ctensor || same_tensor); - } - }; - -public: - /** Default Constructor */ - ITensorDescPack() = default; - ~ITensorDescPack() = default; - ITensorDescPack(const ITensorDescPack &other) = default; - ITensorDescPack &operator=(const ITensorDescPack &other) = default; - ITensorDescPack(ITensorDescPack &&other) = default; - ITensorDescPack &operator=(ITensorDescPack 
&&other) = default; - /** Initializer list Constructor */ - ITensorDescPack(std::initializer_list l) - : _pack{} - { - for(auto &e : l) - { - _pack[e.id] = e; - } - } - /** Add tensor to the pack - * - * @param[in] id ID/type of the tensor to add - * @param[in] tensor Tensor to add - */ - void add_tensor(int id, TDesc *tensor) - { - _pack[id] = PackElement(id, tensor); - } - - /** Add const tensor to the pack - * - * @param[in] id ID/type of the tensor to add - * @param[in] tensor Tensor to add - */ - void add_const_tensor(int id, const TDesc *tensor) - { - _pack[id] = PackElement(id, tensor); - } - /** Get tensor of a given id from the pac - * - * @param[in] id ID of tensor to extract - * - * @return The pointer to the tensor if exist and is non-const else nullptr - */ - TDesc *get_tensor(int id) - { - auto it = _pack.find(id); - return it != _pack.end() ? it->second.tensor : nullptr; - } - /** Get constant tensor of a given id - * - * @param[in] id ID of tensor to extract - * - * @return The pointer to the tensor if exist and is const else nullptr - */ - const TDesc *get_const_tensor(int id) const - { - auto it = _pack.find(id); - if(it != _pack.end()) - { - return it->second.ctensor != nullptr ? 
it->second.ctensor : it->second.tensor; - } - return nullptr; - } - /** Remove the tensor stored with the given id - * - * @param[in] id ID of tensor to remove - */ - void remove_tensor(int id) - { - _pack.erase(id); - } - /** Pack size accessor - * - * @return Number of tensors registered to the pack - */ - size_t size() const - { - return _pack.size(); - } - /** Checks if pack is empty - * - * @return True if empty else false - */ - bool empty() const - { - return _pack.empty(); - } - - /** Get the ACL_SRC_* tensors - * - * @return std::vector - */ - std::vector get_src_tensors() - { - std::vector src_tensors{}; - for(int id = static_cast(TensorType::ACL_SRC); id <= static_cast(TensorType::ACL_SRC_END); ++id) - { - auto tensor = get_tensor(id); - if(tensor != nullptr) - { - src_tensors.push_back(tensor); - } - } - return src_tensors; - } - /** Get the const ACL_SRC_* tensors - * - * @return std::vector - */ - std::vector get_const_src_tensors() const - { - std::vector src_tensors{}; - for(int id = static_cast(TensorType::ACL_SRC); id <= static_cast(TensorType::ACL_SRC_END); ++id) - { - auto tensor = get_const_tensor(id); - if(tensor != nullptr) - { - src_tensors.push_back(tensor); - } - } - return src_tensors; - } - /** Get the ACL_DST_* tensors - * - * @return std::vector - */ - std::vector get_dst_tensors() - { - std::vector dst_tensors{}; - for(int id = static_cast(TensorType::ACL_DST); id <= static_cast(TensorType::ACL_DST_END); ++id) - { - auto tensor = get_tensor(id); - if(tensor != nullptr) - { - dst_tensors.push_back(tensor); - } - } - return dst_tensors; - } - /** Get the const ACL_DST_* tensors - * - * @return std::vector - */ - std::vector get_const_dst_tensors() const - { - std::vector dst_tensors{}; - for(int id = static_cast(TensorType::ACL_DST); id <= static_cast(TensorType::ACL_DST_END); ++id) - { - auto tensor = get_const_tensor(id); - if(tensor != nullptr) - { - dst_tensors.push_back(tensor); - } - } - return dst_tensors; - } - - friend bool 
operator==(const ITensorDescPack &pack0, const ITensorDescPack &pack1) - { - return pack0._pack == pack1._pack; - } - -private: - std::unordered_map _pack{}; /**< Container with the packed tensors */ -}; - -} // namespace dynamic_fusion -} // namespace experimental -} // namespace arm_compute -#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_ITENSORDESCPACK_H -#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.cpp b/src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.cpp deleted file mode 100644 index 663b89e235..0000000000 --- a/src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.cpp +++ /dev/null @@ -1,423 +0,0 @@ -/* - * Copyright (c) 2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/CL/CLScheduler.h" - -#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h" -#include "src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.h" - -namespace arm_compute -{ -namespace experimental -{ -namespace dynamic_fusion -{ -namespace -{ -Status add_kernel_tensor(ClKernelGraph &k_graph, const OperatorGraph::Implementation &op_graph, const OpTensorContent &op_tensor, MemoryType memory_type, AuxMemoryInfo memory_info, - DependencyGraph::Id &id) -{ - ARM_COMPUTE_UNUSED(op_graph); - return k_graph.add_kernel_tensor(op_tensor.desc, memory_type, memory_info, id, op_tensor.id); -} - -Status add_kernel_tensor(ClKernelGraph &k_graph, const OperatorGraph::Implementation &op_graph, const OpTensorContent &op_tensor, DependencyGraph::Id &id) -{ - // For a tensor t - // 1. If t is a src tensor of the entire op graph, then it's Core. - // (Optimisation opportunity, if we guanrantee that all translate methods are called in topological order, we can always assign t to Core. - // Because even if the op is non-root (which would mean t should be an Aux tensor), the src tensors would be already be determined by the ancestor ops (topological order), and thus would not be overriden by it) - // 2. If t is a dst tensor of the entire op graph, then it's Core. - // 3. Aux tensor with Persistent and Prepare lifetime is manually specified - // 4. All other ts not captured by the above are assigned Aux, with lifetime of Temporary. 
- // kernel_graph.add_kernel_tensor(input->desc, ); - bool is_src_tensor_of_graph = is_in(op_tensor.id, op_graph.graph.src_tensors()); - bool is_dst_tensor_of_graph = is_in(op_tensor.id, op_graph.graph.dst_tensors()); - MemoryType memory_type; - AuxMemoryInfo memory_info; - if(is_src_tensor_of_graph || is_dst_tensor_of_graph) - { - memory_type = MemoryType::Core; - } - else - { - memory_type = MemoryType::Auxiliary; - memory_info.lifetime = AuxMemoryLifetime::Temporary; - memory_info.size = op_tensor.desc->total_size(); - } - return add_kernel_tensor(k_graph, op_graph, op_tensor, memory_type, memory_info, id); -} - -/** Get the suitable kernel size for using direct convolution method with NHWC data layout. - * - * @note Duplicate of the function with the same name in src/gpu/cl/operators/ClConv2d.cpp - * - * @note Direct convolution should be executed when the kernel has the spatial dimensions greater than or equal to the value returned by this function - * - * @param[in] gpu_target GPU target - * - * @return the suitable kernel size for using direct convolution method with NHWC data layout - */ -size_t get_direct_conv_kernel_threshold_nhwc(arm_compute::GPUTarget gpu_target) -{ - switch(gpu_target) - { - case arm_compute::GPUTarget::G76: - case arm_compute::GPUTarget::G77: - case arm_compute::GPUTarget::G78: - return 5; - case arm_compute::GPUTarget::G71: - case arm_compute::GPUTarget::G72: - case arm_compute::GPUTarget::MIDGARD: - case arm_compute::GPUTarget::BIFROST: - return 7; - default: - return 5; - } -} -} // namespace - -bool operator==(const OpTensor &t0, const OpTensor &t1) -{ - return std::make_tuple(t0.id()) == std::make_tuple(t1.id()); -} -bool operator==(const Conv2dDescriptor &conv2d0, const Conv2dDescriptor &conv2d1) -{ - return std::make_tuple(conv2d0.stride, conv2d0.dilation) == std::make_tuple(conv2d1.stride, conv2d1.dilation); -} - -bool operator==(const ElementwiseDescriptor &ed0, const ElementwiseDescriptor &ed1) -{ - return ed0.op == ed1.op; 
// Compare Arithmatic Operations of two ElementwiseDescriptor objects -} - -bool operator==(const FloorDescriptor &, const FloorDescriptor &) -{ - return std::make_tuple() == std::make_tuple(); // Currently two Floor ops are always the same -} - -bool Conv2dContent::operator==(const OperatorContent &other) const -{ - const auto converted = *utils::cast::polymorphic_downcast(&other); - return desc == converted.desc; -} - -bool ElementwiseContent::operator==(const OperatorContent &other) const -{ - const auto converted = *utils::cast::polymorphic_downcast(&other); - return desc == converted.desc; -} - -bool FloorContent::operator==(const OperatorContent &other) const -{ - const auto converted = *utils::cast::polymorphic_downcast(&other); - return desc == converted.desc; -} - -ConvolutionMethod Conv2dContent::select_conv_method(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const Conv2dDescriptor &conv2d_desc, const GPUTarget gpu_target) -{ - // Modified from ClConv2d::get_convolution_method - - ARM_COMPUTE_ERROR_ON_NULLPTR(src); - ARM_COMPUTE_ERROR_ON_NULLPTR(dst); - ARM_COMPUTE_ERROR_ON_NULLPTR(weights); - - const PadStrideInfo legacy_pad_stride(conv2d_desc.stride.x(), conv2d_desc.stride.y(), conv2d_desc.pad.left, conv2d_desc.pad.right, conv2d_desc.pad.top, conv2d_desc.pad.bottom, DimensionRoundingType{}); - const Size2D dilation = conv2d_desc.dilation; - - const size_t idx_w = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH); - const size_t idx_h = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT); - const size_t idx_c = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL); - - /* Input spatial dims, kernel size, IFM/OFM, conv info*/ - using ConvolutionConfiguration = std::tuple; - using ConfigurationMethod = std::pair; - - const std::vector known_configs = - { - // Alexnet - ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), 
Size2D(5U, 5U), Size2D(48U, 128U), PadStrideInfo(1U, 1U, 2U, 2U), DataLayout::NCHW), ConvolutionMethod::DIRECT), - // VGG16 / VGG19 - ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U), PadStrideInfo(1U, 1U, 1U, 1U), DataLayout::NCHW), ConvolutionMethod::DIRECT), - // Mobilenet 224 - ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW), ConvolutionMethod::GEMM), - // Mobilenet 160 - ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW), ConvolutionMethod::GEMM), - // Mobilenet 224 - ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC), ConvolutionMethod::GEMM), - // Mobilenet 160 - ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC), ConvolutionMethod::GEMM), - }; - - const auto find_config = [&](ConfigurationMethod c) - { - const ConvolutionConfiguration config = c.first; - const PadStrideInfo info = std::get<3>(config); - const DataLayout data_layout = std::get<4>(config); - - return std::get<0>(config) == Size2D(src->dimension(idx_w), src->dimension(idx_h)) && std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h)) - && std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && info.pad_top() == legacy_pad_stride.pad_top() && info.pad_right() == legacy_pad_stride.pad_right() - && info.pad_bottom() == legacy_pad_stride.pad_bottom() && info.pad_left() == legacy_pad_stride.pad_left() && info.stride() == legacy_pad_stride.stride() && (data_layout == 
src->data_layout()); - }; - - std::vector::const_iterator found; - if((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end()) - { - return (*found).second; - } - - if(dilation != Size2D(1U, 1U)) - { - return ConvolutionMethod::GEMM; - } - else - { - if(src->data_layout() == DataLayout::NCHW) - { - ARM_COMPUTE_ERROR("NCHW not supported"); - } - else - { - const bool is_direct_valid = bool(ClDirectConv2dKernel::validate(src, weights, nullptr, dst, ClDirectConv2dKernelDescriptor{ conv2d_desc })); - const size_t kernel_sz_direct_conv_thr = get_direct_conv_kernel_threshold_nhwc(gpu_target); - - // SRGAN case - if((src->dimension(idx_h) > 720U) && (dst->dimension(idx_h) > 720U) && (weights->dimension(idx_h) == 9) && (conv2d_desc.pad.top < 3) - && is_direct_valid) - { - return ConvolutionMethod::DIRECT; - } - - // Floating-point case: GeMM/Direct - if(is_data_type_float(src->data_type())) - { - // Get dst shape - TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, legacy_pad_stride); - const bool is_large_kernel_sz = (weights->dimension(idx_w) >= kernel_sz_direct_conv_thr) && (weights->dimension(idx_h) >= kernel_sz_direct_conv_thr); - const bool is_ifm_ge_16 = src->dimension(idx_c) >= 16; - const bool is_ofm_lte_8 = weights->dimension(3U) <= 8; - const bool workload_gte_8192 = (output_shape[0] * output_shape[1] * output_shape[2]) / 16 >= 8192; - const bool is_ifm_gt_ofm = src->dimension(idx_c) > weights->dimension(3U); - - // Direct convolution case - if(is_direct_valid) - { - if((gpu_target == arm_compute::GPUTarget::G71 || gpu_target == arm_compute::GPUTarget::G72 || gpu_target == arm_compute::GPUTarget::MIDGARD)) - { - if(is_large_kernel_sz && is_ifm_ge_16 && is_ifm_gt_ofm) - { - return ConvolutionMethod::DIRECT; - } - } - else - { - if((is_large_kernel_sz && workload_gte_8192 && is_ifm_ge_16) || (is_ofm_lte_8 && is_ifm_ge_16)) - { - return ConvolutionMethod::DIRECT; - } - } - 
} - - // Default case - return ConvolutionMethod::GEMM; - } - - // Generic case for quantized. Only GeMM - return ConvolutionMethod::GEMM; - } - } - return ConvolutionMethod::DIRECT; -} - -Status Conv2dContent::translate(ClKernelGraph &kernel_graph) const -{ - const auto input = _tensors.get_const_tensor(TensorType::ACL_SRC_0); - const auto weight = _tensors.get_const_tensor(TensorType::ACL_SRC_1); - const auto dst = _tensors.get_const_tensor(TensorType::ACL_DST_0); - const auto method = forced_method_enabled ? forced_method : Conv2dContent::select_conv_method(input->desc, weight->desc, dst->desc, desc, CLScheduler::get().target()); - switch(method) - { - case ConvolutionMethod::DIRECT: - { - return translate_direct_conv2d(kernel_graph); - } - default: - { - ARM_COMPUTE_RETURN_ERROR_MSG("Not implemented"); - } - } - return Status{}; -} -Status Conv2dContent::translate_direct_conv2d(ClKernelGraph &kernel_graph) const -{ - const auto input = _tensors.get_const_tensor(TensorType::ACL_SRC_0); - const auto weight = _tensors.get_const_tensor(TensorType::ACL_SRC_1); - const auto bias = _tensors.get_const_tensor(TensorType::ACL_SRC_2); - const auto dst = _tensors.get_const_tensor(TensorType::ACL_DST_0); - ARM_COMPUTE_ERROR_ON_NULLPTR(input, weight, dst); - - ITensorDescPack tensors; - - DependencyGraph::Id input_id; - auto st = add_kernel_tensor(kernel_graph, *_graph, *input, input_id); - ARM_COMPUTE_RETURN_ON_ERROR(st); - tensors.add_const_tensor(ACL_SRC_0, kernel_graph.get_tensor(input_id)); - - DependencyGraph::Id weight_id; - st = add_kernel_tensor(kernel_graph, *_graph, *weight, weight_id); - ARM_COMPUTE_RETURN_ON_ERROR(st); - tensors.add_const_tensor(ACL_SRC_1, kernel_graph.get_tensor(weight_id)); - - if(bias != nullptr) - { - DependencyGraph::Id bias_id; - st = add_kernel_tensor(kernel_graph, *_graph, *bias, bias_id); - ARM_COMPUTE_RETURN_ON_ERROR(st); - tensors.add_const_tensor(ACL_SRC_2, kernel_graph.get_tensor(bias_id)); - } - - DependencyGraph::Id dst_id; - st = 
add_kernel_tensor(kernel_graph, *_graph, *dst, dst_id); - ARM_COMPUTE_RETURN_ON_ERROR(st); - tensors.add_const_tensor(ACL_DST_0, kernel_graph.get_tensor(dst_id)); - - DependencyGraph::Id direct_conv2d_id; - const auto kernel_desc = ClDirectConv2dKernelDescriptor{ desc }; - - st = ClDirectConv2dKernel::validate(input->desc, weight->desc, bias == nullptr ? nullptr : bias->desc, dst->desc, kernel_desc); - ARM_COMPUTE_RETURN_ON_ERROR(st); - - ClKernelConfig config{ UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }, TileDescriptor{}, StoreType::TStoreIndirectWidthSelect }; - st = kernel_graph.add_kernel(config, kernel_desc, tensors, direct_conv2d_id); - ARM_COMPUTE_RETURN_ON_ERROR(st); - ARM_COMPUTE_UNUSED(direct_conv2d_id); - - return Status{}; -} - -Status ElementwiseContent::translate(ClKernelGraph &kernel_graph) const -{ - const auto lhs = _tensors.get_const_tensor(TensorType::ACL_SRC_0); - const auto rhs = _tensors.get_const_tensor(TensorType::ACL_SRC_1); - const auto dst = _tensors.get_const_tensor(TensorType::ACL_DST_0); - ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); - - ITensorDescPack tensors; - - DependencyGraph::Id lhs_id; - auto st = add_kernel_tensor(kernel_graph, *_graph, *lhs, lhs_id); - ARM_COMPUTE_RETURN_ON_ERROR(st); - tensors.add_const_tensor(ACL_SRC_0, kernel_graph.get_tensor(lhs_id)); - - DependencyGraph::Id rhs_id; - st = add_kernel_tensor(kernel_graph, *_graph, *rhs, rhs_id); - ARM_COMPUTE_RETURN_ON_ERROR(st); - tensors.add_const_tensor(ACL_SRC_1, kernel_graph.get_tensor(rhs_id)); - - DependencyGraph::Id dst_id; - st = add_kernel_tensor(kernel_graph, *_graph, *dst, dst_id); - ARM_COMPUTE_RETURN_ON_ERROR(st); - tensors.add_const_tensor(ACL_DST_0, kernel_graph.get_tensor(dst_id)); - - DependencyGraph::Id add_id; - ClKernelConfig config{ UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }, TileDescriptor{}, StoreType::TStoreIndirectWidthSelect }; - - st = ClElementwiseKernel::validate(lhs->desc, rhs->desc, dst->desc); - 
ARM_COMPUTE_RETURN_ON_ERROR(st); - - st = kernel_graph.add_kernel(config, ClElementwiseKernelDescriptor{ desc }, tensors, add_id); - ARM_COMPUTE_RETURN_ON_ERROR(st); - ARM_COMPUTE_UNUSED(add_id); - - return Status{}; -} - -Status FloorContent::translate(ClKernelGraph &kernel_graph) const -{ - const auto src = _tensors.get_const_tensor(TensorType::ACL_SRC_0); - const auto dst = _tensors.get_const_tensor(TensorType::ACL_DST_0); - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - ITensorDescPack tensors; - - DependencyGraph::Id src_id; - auto st = add_kernel_tensor(kernel_graph, *_graph, *src, src_id); - ARM_COMPUTE_RETURN_ON_ERROR(st); - tensors.add_const_tensor(ACL_SRC_0, kernel_graph.get_tensor(src_id)); - - DependencyGraph::Id dst_id; - st = add_kernel_tensor(kernel_graph, *_graph, *dst, dst_id); - ARM_COMPUTE_RETURN_ON_ERROR(st); - tensors.add_const_tensor(ACL_DST_0, kernel_graph.get_tensor(dst_id)); - - DependencyGraph::Id add_id; - ClKernelConfig config{ UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }, TileDescriptor{}, StoreType::TStoreIndirectWidthSelect }; - - st = ClFloorKernel::validate(src->desc, dst->desc); - ARM_COMPUTE_RETURN_ON_ERROR(st); - - st = kernel_graph.add_kernel(config, ClFloorKernelDescriptor{ desc }, tensors, add_id); - ARM_COMPUTE_RETURN_ON_ERROR(st); - - return Status{}; -} - -std::vector traverse(const OperatorGraph::Implementation &graph) -{ - std::vector ops; - const auto sorted = graph.graph.topological_sort(); - for(const auto &pack : sorted.second) - { - ops.push_back(graph.operators.at(pack.op).get()); - } - return ops; -} - -std::vector traverse(OperatorGraph::Implementation &graph) -{ - std::vector ops; - const auto sorted = graph.graph.topological_sort(); - for(const auto &pack : sorted.second) - { - ops.push_back(graph.operators.at(pack.op).get()); - } - return ops; -} - -Status translate(ClKernelGraph &kernel_graph, const OperatorGraph::Implementation &op_graph) -{ - for(const auto &op : traverse(op_graph)) - { - const auto st = 
op->translate(kernel_graph); - ARM_COMPUTE_RETURN_ON_ERROR(st); - } - return Status{}; -} - -} // namespace dynamic_fusion -} // namespace experimental -} // namespace arm_compute -#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.h b/src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.h deleted file mode 100644 index b303cdb9fc..0000000000 --- a/src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.h +++ /dev/null @@ -1,252 +0,0 @@ -/* - * Copyright (c) 2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION -#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_OPERATORGRAPHIMPL -#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_OPERATORGRAPHIMPL - -#include "arm_compute/core/experimental/ClWorkload.h" -#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ITensorDescPack.h" - -#include "support/Cast.h" -#include "support/DeepCopy.h" - -#include -#include -#include - -namespace arm_compute -{ -namespace experimental -{ -namespace dynamic_fusion -{ -enum class OperatorComplexity -{ - Complex = 0, - Simple -}; - -struct ClKernelGraph; -struct OpTensorContent -{ -public: - using Id = DependencyGraph::Id; - OpTensorContent() = default; - OpTensorContent(Id id) - : id{ id }, desc{} - { - } - OpTensorContent(Id id, ITensorInfo *desc) - : id{ id }, desc{ desc } - { - } - ~OpTensorContent() = default; - OpTensorContent(const OpTensorContent &) = default; - OpTensorContent &operator=(const OpTensorContent &) = default; - OpTensorContent(OpTensorContent &&) = default; - OpTensorContent &operator=(OpTensorContent &&) = default; - bool operator==(const OpTensorContent &other) const - { - return desc == other.desc; - } - - const ITensorInfo *get_tensor_info() const - { - return desc; - } - ITensorInfo *get_tensor_info() - { - return desc; - } - - Id id{}; - ITensorInfo *desc{}; -}; - -struct OperatorContent -{ -public: - using Id = DependencyGraph::Id; - OperatorContent() = default; - OperatorContent(const OperatorGraph::Implementation *graph, Id id, const ITensorDescPack &tensors) - : _graph{ graph }, _id{ id }, _tensors{ tensors } - { - } - OperatorContent(const OperatorContent &op) = default; - OperatorContent &operator=(const OperatorContent &op) = default; - OperatorContent(OperatorContent &&op) = default; - OperatorContent &operator=(OperatorContent &&op) = default; - virtual ~OperatorContent() = default; - virtual OperatorComplexity complexity() const = 0; - virtual bool operator==(const OperatorContent &other) const = 0; - 
virtual Status translate(ClKernelGraph &kernel_graph) const = 0; - -protected: - const OperatorGraph::Implementation *_graph {}; - Id _id{}; - ITensorDescPack _tensors{}; -}; - -struct Conv2dContent : public OperatorContent -{ -public: - Conv2dContent() = default; - Conv2dContent(const OperatorGraph::Implementation *graph, Id id, const Conv2dDescriptor &desc, const ITensorDescPack &tensors) - : OperatorContent(graph, id, tensors), desc(desc), forced_method(), forced_method_enabled(false) - { - } - // Temporary. Do not need to pass ConvolutionMethod - Conv2dContent(const OperatorGraph::Implementation *graph, Id id, const Conv2dDescriptor &desc, const ITensorDescPack &tensors, ConvolutionMethod method) - : OperatorContent(graph, id, tensors), desc(desc), forced_method(method), forced_method_enabled(true) - { - } - ~Conv2dContent() = default; - Conv2dContent(const Conv2dContent &) = default; - Conv2dContent &operator=(const Conv2dContent &) = default; - Conv2dContent(Conv2dContent &&) = default; - Conv2dContent &operator=(Conv2dContent &&) = default; - bool operator==(const OperatorContent &other) const override; - OperatorComplexity complexity() const override - { - return OperatorComplexity::Complex; - } - void set_method(ConvolutionMethod method) - { - forced_method_enabled = true; - forced_method = method; - } - - Status translate(ClKernelGraph &kernel_graph) const override; - /** Replicate heuristics of @ref ClConv2d::get_convolution_method(), except that non-supported data types and data layouts are removed from the heuristics - * - * @param src - * @param weights - * @param dst - * @param conv2d_desc - * @param gpu_target - * @return ConvolutionMethod - */ - static ConvolutionMethod select_conv_method(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const Conv2dDescriptor &conv2d_desc, const GPUTarget gpu_target); - - Conv2dDescriptor desc{}; - ConvolutionMethod forced_method{ ConvolutionMethod::GEMM_CONV2D }; - bool 
forced_method_enabled{ false }; - -private: - Status translate_direct_conv2d(ClKernelGraph &kernel_graph) const; -}; - -class ElementwiseContent : public OperatorContent -{ -public: - ElementwiseContent() = default; - ElementwiseContent(const OperatorGraph::Implementation *graph, Id id, const ElementwiseDescriptor &desc, const ITensorDescPack &tensors) - : OperatorContent(graph, id, tensors), desc(desc) - { - } - ~ElementwiseContent() = default; - ElementwiseContent(const ElementwiseContent &) = default; - ElementwiseContent &operator=(const ElementwiseContent &) = default; - ElementwiseContent(ElementwiseContent &&) = default; - ElementwiseContent &operator=(ElementwiseContent &&) = default; - bool operator==(const OperatorContent &other) const override; - OperatorComplexity complexity() const override - { - return OperatorComplexity::Simple; - } - Status translate(ClKernelGraph &kernel_graph) const override; - -private: - ElementwiseDescriptor desc{}; -}; - -class FloorContent : public OperatorContent -{ -public: - FloorContent() = default; - FloorContent(const OperatorGraph::Implementation *graph, Id id, const FloorDescriptor &desc, const ITensorDescPack &tensors) - : OperatorContent(graph, id, tensors), desc(desc) - { - } - ~FloorContent() = default; - FloorContent(const FloorContent &) = default; - FloorContent &operator=(const FloorContent &) = default; - FloorContent(FloorContent &&) = default; - FloorContent &operator=(FloorContent &&) = default; - bool operator==(const OperatorContent &other) const override; - OperatorComplexity complexity() const override - { - return OperatorComplexity::Simple; - } - Status translate(ClKernelGraph &kernel_graph) const override; - -private: - FloorDescriptor desc{}; -}; - -struct OperatorGraph::Implementation -{ -public: - template - void add_node(Operator::Id id, Args &&... 
args) - { - operators[id] = utils::memory::make_deep_unique(this, id, std::forward(args)...); - } - - template - void add_tensor(OpTensor::Id id, Args &&... args) - { - tensors[id] = utils::memory::make_deep_unique(id, std::forward(args)...); - } - - using Dependency = DependencyGraph; - using OperatorMap = std::map>; - using OpTensorMap = std::map>; - - Implementation() = default; - ~Implementation() = default; - - friend bool operator==(const OperatorGraph::Implementation &graph0, const OperatorGraph::Implementation &graph1) - { - return graph0.graph == graph1.graph && graph0.operators == graph1.operators && graph0.tensors == graph1.tensors; - } - - Dependency graph{}; - OperatorMap operators{}; - OpTensorMap tensors{}; - Status status{}; -}; - -std::vector traverse(const OperatorGraph::Implementation &graph); - -std::vector traverse(OperatorGraph::Implementation &graph); - -Status translate(ClKernelGraph &kernel_graph, const OperatorGraph::Implementation &op_graph); - -} // namespace dynamic_fusion -} // namespace experimental -} // namespace arm_compute - -#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_OPERATORGRAPHIMPL -#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file diff --git a/src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.cpp b/src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.cpp deleted file mode 100644 index 30e19d5907..0000000000 --- a/src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.cpp +++ /dev/null @@ -1,200 +0,0 @@ -/* - * Copyright (c) 2022 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION - -#include "src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.h" - -#include "arm_compute/core/CL/ICLTensor.h" -#include "src/core/CL/CLUtils.h" -#include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h" -#include "src/gpu/cl/ClKernelLibrary.h" - -#include "support/Cast.h" -namespace arm_compute -{ -namespace experimental -{ -namespace dynamic_fusion -{ -using namespace arm_compute::opencl; - -void ClCompositeKernel::configure(const ClCompileContext &compile_ctx, const ClKernelCode &cl_code) -{ - // Create kernel from kernel source string - opencl::ClKernelLibrary &klib = opencl::ClKernelLibrary::get(); - _kernel = static_cast(compile_ctx.create_kernel(cl_code.name, - "" /* Program name: Used to as part of a unique string for built kernel cache. 
Not needed */, - cl_code.code, - klib.kernel_path() /* Kernel path: Used in cases of embedded kernels */, - cl_code.build_options.options(), - false /* Is source binary */)); - - // Configure execution window - IClKernel::configure_internal(cl_code.window); - - // Set config id for lws tuning - _config_id = cl_code.config_id; - - // Set kernel arguments - _arguments = cl_code.arguments; -} - -inline void ClCompositeKernel::add_tensor_argument(unsigned int &idx, const ClKernelArgDescriptor &arg, const ICLTensor *tensor, const Window &arg_slice, std::vector &cl_images) -{ - switch(arg.tensor_arg_type) - { - case ClKernelTensorArgType::Scalar: - { - ARM_COMPUTE_ERROR("Unsupported yet"); - break; - } - - case ClKernelTensorArgType::Vector: - { - add_1D_tensor_argument(idx, tensor, arg_slice); - break; - } - - case ClKernelTensorArgType::Image: - { - add_2D_tensor_argument(idx, tensor, arg_slice); - break; - } - case ClKernelTensorArgType::Image_Reinterpret_As_3D: - { - add_2D_tensor_argument(idx, tensor, arg_slice); - const unsigned int total_cross_plane_pad = tensor->info()->padding().top + tensor->info()->padding().bottom; - _kernel.setArg(idx++, static_cast(total_cross_plane_pad)); - break; - } - case ClKernelTensorArgType::Image_Export_To_ClImage2D: - { - const TensorShape shape2d(tensor->info()->dimension(0) / 4, tensor->info()->dimension(1) * tensor->info()->dimension(2) * tensor->info()->dimension(3)); - const size_t image_row_pitch = tensor->info()->strides_in_bytes()[1]; - cl::Image2D tensor_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), shape2d, tensor->info()->data_type(), image_row_pitch); - cl_images.push_back(tensor_image2d); - _kernel.setArg(idx++, tensor_image2d); - break; - } - - case ClKernelTensorArgType::Image_3D: - { - add_2D_tensor_argument(idx, tensor, arg_slice); - _kernel.setArg(idx++, static_cast(tensor->info()->strides_in_bytes()[2])); - break; - } - case 
ClKernelTensorArgType::Image_3D_Export_To_ClImage2D: - { - const TensorShape shape2d(tensor->info()->dimension(0) / 4, tensor->info()->dimension(1) * tensor->info()->dimension(2) * tensor->info()->dimension(3)); - const size_t image_row_pitch = tensor->info()->strides_in_bytes()[1]; - cl::Image2D tensor_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), shape2d, tensor->info()->data_type(), image_row_pitch); - cl_images.push_back(tensor_image2d); - _kernel.setArg(idx++, tensor_image2d); - _kernel.setArg(idx++, static_cast(tensor->info()->strides_in_bytes()[2])); - break; - } - - case ClKernelTensorArgType::Tensor_3D: - { - add_3D_tensor_argument(idx, tensor, arg_slice); - break; - } - - case ClKernelTensorArgType::Tensor_4D: - { - add_4D_tensor_argument(idx, tensor, arg_slice); - break; - } - case ClKernelTensorArgType::Tensor_4D_t_Buffer: - { - add_4d_tensor_nhwc_argument(idx, tensor); - break; - } - case ClKernelTensorArgType::Tensor_4D_t_Image: - { - const size_t image_w = tensor->info()->dimension(0) / 4; - const size_t image_h = tensor->info()->tensor_shape().total_size_upper(1); - const size_t image_stride_y = tensor->info()->strides_in_bytes()[1]; - - cl::Image2D tensor_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), - TensorShape(image_w, image_h), tensor->info()->data_type(), image_stride_y); - cl_images.push_back(tensor_image2d); - - _kernel.setArg(idx++, tensor_image2d); - add_4d_tensor_nhwc_argument(idx, tensor); - break; - } - default: - { - ARM_COMPUTE_ERROR("Unsupported"); - } - } -} - -void ClCompositeKernel::run_composite_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue, const ClExecutionDescriptor &exec_desc) -{ - ARM_COMPUTE_UNUSED(exec_desc); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - Window slice = window.first_slice_window_3D(); - // Don't slice matrix along 
the z dimension if matrix has just 2 dimensions and matrix A more than 2 - // This scenario can happen when the matrix multiplication is used to perform a convolution operation - Window slice_fixed_z = slice; - slice_fixed_z.set(Window::DimX, Window::Dimension(0, 1, 1)); - slice_fixed_z.set(Window::DimY, Window::Dimension(0, 1, 1)); - - unsigned int idx = 0; - do - { - // Set kernel arguments - Window arg_slice = slice; - // CLImages created from tensor arguments. Need to be retained until enqueue - std::vector cl_images; - for(auto id_arg : _arguments) - { - const auto arg = id_arg.second; - auto tensor = utils::cast::polymorphic_downcast(tensors.get_tensor(arg.arg_id)); - ARM_COMPUTE_ERROR_ON_NULLPTR(tensor); - ARM_COMPUTE_ERROR_ON_NULLPTR(tensor->info()); - if(!arg.slide_along_dimz) - { - // The stride_z for matrix must be zero if we do not slice - ARM_COMPUTE_ERROR_ON(tensor->info()->strides_in_bytes()[3] != 0); - arg_slice = slice_fixed_z; - } - add_tensor_argument(idx, arg, tensor, arg_slice, cl_images); - } - - // Dispatch kernel - bool use_dummy_work_items = false; - enqueue(queue, *this, slice, lws_hint(), use_dummy_work_items); - } - while(!exec_desc.skip_sliding_window && window.slide_window_slice_3D(slice)); -} - -} // namespace dynamic_fusion -} // namespace experimental -} // namespace arm_compute -#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file diff --git a/src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.h b/src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.h deleted file mode 100644 index 52b92be568..0000000000 --- a/src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.h +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright (c) 2022 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION - -#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLCOMPOSITEKERNEL_H -#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLCOMPOSITEKERNEL_H - -#include "arm_compute/core/experimental/ClWorkload.h" -#include "src/gpu/cl/ClCompileContext.h" -#include "src/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace experimental -{ -namespace dynamic_fusion -{ -struct ClExecutionDescriptor; -struct ClKernelCode; - -class ClCompositeKernel final : public opencl::IClKernel -{ -public: - void configure(const opencl::ClCompileContext &, const ClKernelCode &); - - /** Run the composite kernel - * @note The slots / keys in ITensorPack are the argument Ids of the tensors in blueprint - * - * @param tensors ITensorPack object containing run-time tensor memories - * @param window Execution window - * @param queue OpenCL Command queue - * @param exec_desc Descriptor containing execution information - */ - virtual void run_composite_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue, const ClExecutionDescriptor &exec_desc) override; - -private: - /** Set a kernel tensor argument - * - * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set. - * @param[in] arg Kernel argument descriptor accompanying @p tensor - * @param[in] tensor Tensor to set as an argument of the object's kernel. - * @param[in] arg_slice Window the kernel will be run on. 
- * @param[out] cl_images Extra cl images created from the tensor (will need to be retained until the kernel is enqueued) - */ - inline void add_tensor_argument(unsigned int &idx, const ClKernelArgDescriptor &arg, const ICLTensor *tensor, const Window &arg_slice, std::vector &cl_images); - -private: - ClKernelArgList _arguments{}; /** All kernel arguments required by runtime */ -}; - -} // namespace dynamic_fusion -} // namespace experimental -} // namespace arm_compute -#endif // ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLCOMPOSITEKERNEL_H -#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file diff --git a/src/gpu/cl/operators/experimental/dynamic_fusion/ClCompositeOperator.cpp b/src/gpu/cl/operators/experimental/dynamic_fusion/ClCompositeOperator.cpp deleted file mode 100644 index a53a73e4ec..0000000000 --- a/src/gpu/cl/operators/experimental/dynamic_fusion/ClCompositeOperator.cpp +++ /dev/null @@ -1,241 +0,0 @@ -/* - * Copyright (c) 2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION -#include "arm_compute/runtime/experimental/ClCompositeOperator.h" - -#include "arm_compute/core/experimental/ClWorkload.h" -#include "arm_compute/core/experimental/Types.h" -#include "src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.h" -#include "support/Cast.h" - -namespace arm_compute -{ -namespace experimental -{ -namespace dynamic_fusion -{ -namespace -{ -Status add_tensor_to_tensor_pack(int wk_tensor_id, ICLTensor *tensor, const ClWorkload &workload, TensorPackMap &prepare_pack_map, TensorPackMap &run_pack_map) -{ - if(tensor == nullptr) - { - return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Trying to add a nullptr into the tensor packs"); - } - const auto bp_tensor_id = workload.tensors.at(wk_tensor_id).kernel_arg.arg_id; // blueprint tensor id - std::vector uwk_ids{}; - const auto src_uwk_ids = workload.graph.src_ops_from_tensor(wk_tensor_id); - const auto dst_uwk_ids = workload.graph.dst_ops_from_tensor(wk_tensor_id); - uwk_ids.insert(uwk_ids.end(), src_uwk_ids.begin(), src_uwk_ids.end()); - uwk_ids.insert(uwk_ids.end(), dst_uwk_ids.begin(), dst_uwk_ids.end()); - - for(auto uwk_id : uwk_ids) - { - TensorPackMap *pack_map = nullptr; - const auto uwk_stage = workload.unit_workloads.at(uwk_id).stage.stage; - switch(uwk_stage) - { - case UnitWorkloadStage::Stage::Run: - pack_map = &run_pack_map; - break; - case UnitWorkloadStage::Stage::Prepare: - pack_map = &prepare_pack_map; - break; - default: - return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported workload stage"); - } - - ITensorPack *tensor_pack = pack_map->find_tensor_pack(uwk_id); - if(tensor_pack == nullptr) - { - 
pack_map->add_tensor_pack(uwk_id, ITensorPack{ { bp_tensor_id, tensor } }); - } - else - { - tensor_pack->add_tensor(bp_tensor_id, tensor); - } - } - return Status{}; -} - -} // namespace - -ITensorPack *TensorPackMap::find_tensor_pack(UnitWorkload::Id uwk_id) -{ - auto tensor_pack = _tensor_packs.find(uwk_id); - if(tensor_pack != _tensor_packs.end()) - { - return &(tensor_pack->second); - } - return nullptr; -} - -ITensorPack &TensorPackMap::get_tensor_pack(UnitWorkload::Id uwk_id) -{ - return _tensor_packs.at(uwk_id); -} - -void TensorPackMap::add_tensor_pack(UnitWorkload::Id uwk_id, const ITensorPack &tensor_pack) -{ - _tensor_packs[uwk_id] = tensor_pack; -} - -Status bind_tensors(ClAuxTensorData &aux_tensor_data, TensorPackMap &prepare_pack_map, TensorPackMap &run_pack_map, const ClWorkload &workload, const OpTensorBinding &op_tensors) -{ - for(auto tensor : workload.tensors) - { - const auto wk_tensor_id = tensor.first; // workload tensor id - ICLTensor *tensor_object = nullptr; - if(tensor.second.memory_type == MemoryType::Core) - { - const auto op_tensor_id = workload.op_tensor_id_lut.at(wk_tensor_id); - auto op_tensor_find = op_tensors.find(op_tensor_id); - if(op_tensor_find == op_tensors.end()) - { - return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Cannot find binding for some operator tensor"); - } - tensor_object = utils::cast::polymorphic_downcast(op_tensor_find->second); - } - else if(tensor.second.memory_type == MemoryType::Auxiliary) - { - // Create aux tensor CLTensor object - const TensorInfo tensor_info = *tensor.second.info; - const auto memory_info = tensor.second.memory_info; - tensor_object = aux_tensor_data.add_aux_tensor(wk_tensor_id, tensor_info, memory_info); - } - else - { - return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported tensor memory type"); - } - - const auto st = add_tensor_to_tensor_pack(wk_tensor_id, tensor_object, workload, prepare_pack_map, run_pack_map); - ARM_COMPUTE_RETURN_ON_ERROR(st); - } - 
return Status{}; -} - -CLTensor *ClAuxTensorData::add_aux_tensor(int tensor_id, const ITensorInfo &tensor_info, const AuxMemoryInfo &memory_info) -{ - auto find_tensor_pair = _owned_tensors.find(tensor_id); - if(find_tensor_pair == _owned_tensors.end()) - { - return find_tensor_pair->second.get(); - } - else - { - auto tensor = std::make_unique(); - auto inserted_pair = _owned_tensors.emplace(tensor_id, std::move(tensor)).first; - auto new_tensor = inserted_pair->second.get(); - _tensors.emplace_back(new_tensor, tensor_info, memory_info); - return new_tensor; - } -} - -std::vector &ClAuxTensorData::get_tensors() -{ - return _tensors; -} -struct ClCompositeOperator::Implementation -{ - std::map> _kernels{}; - std::map> _kernels_prep{}; - ClWorkload _workload{}; - bool _is_prepared{ false }; -}; - -ClCompositeOperator::ClCompositeOperator() - : _impl{ std::make_unique() } -{ -} - -ClCompositeOperator::~ClCompositeOperator() = default; - -void ClCompositeOperator::configure(const CLCompileContext &ctx, const ClWorkload &workload) -{ - ARM_COMPUTE_ERROR_THROW_ON(ClCompositeOperator::validate(workload)); - _impl->_workload = workload; - - // Traverse workloads in topological order - const auto sorted = workload.graph.topological_sort().second; - for(const auto &node : sorted) - { - auto work = workload.unit_workloads.at(node.op); - auto stage = work.stage.stage; - auto k = std::make_unique(); - k->configure(ctx, work.code); - - switch(stage) - { - case UnitWorkloadStage::Stage::Run: - _impl->_kernels.emplace(work.id, std::move(k)); - break; - case UnitWorkloadStage::Stage::Prepare: - _impl->_kernels_prep.emplace(work.id, std::move(k)); - break; - default: - ARM_COMPUTE_ERROR("Invalid stage"); - } - break; - } -} - -Status ClCompositeOperator::validate(const ClWorkload &workload) -{ - return workload.status; -} - -void ClCompositeOperator::prepare(TensorPackMap &tensor_pack_map) -{ - if(!_impl->_is_prepared) - { - for(auto &id_kernel_pair : _impl->_kernels_prep) - { - 
const bool flush_queue = false; - const auto uwk_id = id_kernel_pair.first; - auto kernel = id_kernel_pair.second.get(); - CLScheduler::get().enqueue_op(*kernel, tensor_pack_map.get_tensor_pack(uwk_id), ClExecutionDescriptor{}, flush_queue); - } - - _impl->_is_prepared = true; - } -} - -void ClCompositeOperator::run(TensorPackMap &tensor_pack_map) -{ - ARM_COMPUTE_ERROR_ON_MSG(!_impl->_is_prepared, "Operator is not prepared"); - - for(auto &id_kernel_pair : _impl->_kernels) - { - // Flush the command queue on the last kernel - const bool flush_queue = false; - const auto uwk_id = id_kernel_pair.first; - auto kernel = id_kernel_pair.second.get(); - CLScheduler::get().enqueue_op(*kernel, tensor_pack_map.get_tensor_pack(uwk_id), ClExecutionDescriptor{}, flush_queue); - } -} - -} // namespace dynamic_fusion -} // namespace experimental -} // namespace arm_compute -#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file diff --git a/src/runtime/CL/CLScheduler.cpp b/src/runtime/CL/CLScheduler.cpp index 8d30c05361..49fb724cdb 100644 --- a/src/runtime/CL/CLScheduler.cpp +++ b/src/runtime/CL/CLScheduler.cpp @@ -27,10 +27,6 @@ #include "arm_compute/runtime/CL/CLTuner.h" #include "src/core/CL/ICLKernel.h" -#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) -#include "src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.h" -#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) - namespace arm_compute { cl::Context &CLScheduler::context() @@ -190,34 +186,6 @@ void CLScheduler::enqueue_common(ICLKernel &kernel, ITensorPack &tensors, bool f flush_queue(flush); } -#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) - -void CLScheduler::enqueue_common(ICLKernel &kernel, ITensorPack &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc, bool flush) -{ - ARM_COMPUTE_ERROR_ON_MSG(!_is_initialised, - "The CLScheduler is not initialised yet! 
Please call the CLScheduler::get().default_init(), \ - or CLScheduler::get()::init() and CLKernelLibrary::get()::init() function before running functions!"); - - // ClCompositeKernel is stateless thus alway requires memory injection - - // Tune the kernel if the CLTuner has been provided - if(_cl_tuner != nullptr) - { - _cl_tuner->tune_kernel_dynamic(kernel, tensors, exec_desc); - } - - // Run kernel - kernel.run_composite_op(tensors, kernel.window(), _queue, exec_desc); - if(_job_chaining_enabled) - { - ++_job_chaining_count; - } - - flush_queue(flush); -} - -#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) - void CLScheduler::flush_queue(bool flush) { if(_job_chaining_enabled) @@ -245,15 +213,6 @@ void CLScheduler::enqueue_op(ICLKernel &kernel, ITensorPack &tensors, bool flush enqueue_common(kernel, tensors, flush); } -#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) - -void CLScheduler::enqueue_op(ICLKernel &kernel, ITensorPack &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc, bool flush) -{ - enqueue_common(kernel, tensors, exec_desc, flush); -} - -#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) - void CLScheduler::enable_job_chaining(int job_chaining_size) { _job_chaining_enabled = true; diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp index 8ce5177847..1cc20f0c1e 100644 --- a/src/runtime/CL/CLTuner.cpp +++ b/src/runtime/CL/CLTuner.cpp @@ -28,9 +28,6 @@ #include "arm_compute/runtime/CL/CLScheduler.h" #include "src/core/CL/ICLKernel.h" #include "support/StringSupport.h" -#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) -#include "src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.h" -#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) #include #include @@ -65,26 +62,6 @@ private: ITensorPack &_tensors; }; -#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) -struct CompositeKernelData : public CLTuner::IKernelData -{ - CompositeKernelData(ITensorPack &tensors, const 
experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc) - : _tensors{ tensors }, _exec_desc{ exec_desc } - { - } - ~CompositeKernelData() override = default; - void do_run(ICLKernel &kernel, cl::CommandQueue &queue) override - { - // ClCompositeKernel is purely stateless, and thus always requires memory injection - kernel.run_composite_op(_tensors, kernel.window(), queue, _exec_desc); - } - -private: - ITensorPack &_tensors; - const experimental::dynamic_fusion::ClExecutionDescriptor &_exec_desc; -}; -#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) - bool CLTuner::kernel_event_is_set() const { return _kernel_event() != nullptr; @@ -165,15 +142,6 @@ void CLTuner::tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors) do_tune_kernel_dynamic(kernel, &data); } -#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) -void CLTuner::tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc) -{ - CompositeKernelData data{ tensors, exec_desc }; - - do_tune_kernel_dynamic(kernel, &data); -} -#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) - void CLTuner::add_tuning_params(const std::string &kernel_id, CLTuningParams optimal_tuning_params) { _tuning_params_table.emplace(kernel_id, optimal_tuning_params); diff --git a/tests/SConscript b/tests/SConscript index 87b654385a..8596cfa042 100644 --- a/tests/SConscript +++ b/tests/SConscript @@ -120,7 +120,6 @@ files_validation += Glob('validation/CPP/' + filter_pattern) if env['opencl']: if env['experimental_dynamic_fusion']: - test_env.Append(CPPDEFINES = ['ENABLE_EXPERIMENTAL_DYNAMIC_FUSION']) files_validation += Glob('validation/dynamic_fusion/gpu/' + filter_pattern) files_validation += Glob('validation/dynamic_fusion/gpu/cl/' + filter_pattern) diff --git a/tests/validation/CL/UNIT/dynamic_fusion/ArbitraryElementwiseFusion.cpp b/tests/validation/CL/UNIT/dynamic_fusion/ArbitraryElementwiseFusion.cpp deleted file mode 100644 index 
1b1e8aa761..0000000000 --- a/tests/validation/CL/UNIT/dynamic_fusion/ArbitraryElementwiseFusion.cpp +++ /dev/null @@ -1,394 +0,0 @@ -/* - * Copyright (c) 2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION - -#include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h" -#include "src/core/utils/helpers/float_ops.h" -#include "tests/CL/CLAccessor.h" -#include "tests/framework/Macros.h" -#include "tests/validation/Validation.h" -#include "tests/validation/reference/ConvolutionLayer.h" -#include "tests/validation/reference/ElementwiseOperations.h" -#include "tests/validation/reference/Permute.h" - -#include "arm_compute/runtime/experimental/ClCompositeOperator.h" -#include "tests/validation/reference/Floor.h" - -#include "arm_compute/core/ITensor.h" -#include "arm_compute/runtime/CL/CLTensor.h" -#include "tests/validation/CL/UNIT/dynamic_fusion/Utils.h" - -using namespace arm_compute::experimental::dynamic_fusion; -using namespace arm_compute::test::validation::utils; - -namespace arm_compute -{ -namespace test -{ -namespace validation -{ -TEST_SUITE(CL) -TEST_SUITE(UNIT) -TEST_SUITE(DYNAMIC_FUSION) -TEST_SUITE(ArbitraryFusion) - -TEST_CASE(ElementwiseBroadcasting, framework::DatasetMode::ALL) -{ - // Test elementwise broadcasting - const auto data_type = DataType::F32; - const auto data_layout = DataLayout::NHWC; - - const auto input_shape = TensorShape(7, 9, 5); - const auto rhs_shape = TensorShape(7, 1, 1); - const auto dst_shape = TensorShape(7, 9, 5); - - // Tensor Info - auto input_info = TensorInfo(input_shape, 1, data_type, data_layout); - auto addend_info = TensorInfo(rhs_shape, 1, data_type, data_layout); - auto dst_info = TensorInfo(); - - ElementwiseDescriptor add_desc{ ArithmeticOperation::ADD }; - - CLScheduler::get().default_reinit(); - const auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context(); - OperatorGraph op_graph; - - const auto op_input = add_tensor(op_graph, input_info); - const auto op_addend = add_tensor(op_graph, addend_info); - const auto op_dst = add_tensor(op_graph, dst_info); - - add_op_elementwise_op(op_graph, add_desc, op_input, op_addend, op_dst); - - const 
ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } }; - ClWorkload workload; - build(workload, op_graph, workload_ctx); - - ClCompositeOperator op; - op.configure(cl_compile_ctx, workload); - - // Construct tensors - CLTensor t_input{}; - CLTensor t_addend{}; - CLTensor t_dst{}; - - // Init tensors - t_input.allocator()->init(input_info); - t_addend.allocator()->init(addend_info); - t_dst.allocator()->init(dst_info); - - // Allocate and fill tensors - t_input.allocator()->allocate(); - t_addend.allocator()->allocate(); - t_dst.allocator()->allocate(); - - // Fill - fill(CLAccessor(t_input), 0, library.get()); - fill(CLAccessor(t_addend), 1, library.get()); - - // Pack tensors - OpTensorBinding bp_tensors({ { op_input, &t_input }, - { op_addend, &t_addend }, - { op_dst, &t_dst } - }); - - // Populate prepare and run pack-maps (including allocating aux tensors) - ClAuxTensorData aux_tensor_data{}; - TensorPackMap prepare_pack_map{}; - TensorPackMap run_pack_map{}; - bind_tensors(aux_tensor_data, prepare_pack_map, run_pack_map, workload, bp_tensors); - - op.prepare(prepare_pack_map); - op.run(run_pack_map); - - // Create reference - SimpleTensor ref_input{ input_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC }; - SimpleTensor ref_addend{ rhs_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC }; - - // Fill reference - fill(ref_input, 0, library.get()); - fill(ref_addend, 1, library.get()); - - auto ref_input_nchw = reference::permute(ref_input, PermutationVector(1U, 2U, 0U)); - auto ref_addend_nchw = reference::permute(ref_addend, PermutationVector(1U, 2U, 0U)); - - auto dst_shape_nchw = dst_shape; - permute(dst_shape_nchw, PermutationVector(1U, 2U, 0U)); - - auto ref_t_dst_nchw = reference::arithmetic_operation( - ArithmeticOperation::ADD, - ref_input_nchw, - ref_addend_nchw, - data_type, - ConvertPolicy{}); - - const auto ref_t_dst = reference::permute(ref_t_dst_nchw, PermutationVector(2U, 0U, 1U)); - - RelativeTolerance 
tolerance_f32(0.001f); - validate(CLAccessor(t_dst), ref_t_dst_nchw, tolerance_f32); -} -TEST_CASE(DivFloor, framework::DatasetMode::ALL) -{ - // x = floor(div(input, input2)) - const auto data_type = DataType::F32; - const auto eltwise_info = ElementwiseDescriptor{ ArithmeticOperation::DIV }; - - // Tensor Values - const auto width = 7U; - const auto height = 6U; - - // Shapes - const auto input1_shape = TensorShape(width, height); - const auto input2_shape = TensorShape(width, height); - const auto dst_shape = TensorShape(width, height); - - // Create reference - SimpleTensor ref_src_nhwc{ input1_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC }; - SimpleTensor ref_src2_nhwc{ input2_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC }; - - // Fill reference - fill(ref_src_nhwc, 0, library.get()); - fill(ref_src2_nhwc, 1, library.get()); - - auto ref_src = reference::permute(ref_src_nhwc, PermutationVector(1U, 2U, 0U)); - auto ref_src2 = reference::permute(ref_src2_nhwc, PermutationVector(1U, 2U, 0U)); - - TensorShape dst_shape_nchw{ dst_shape }; - permute(dst_shape_nchw, PermutationVector(1U, 2U, 0U)); - - const auto ref_dst_nchw = reference::floor_layer(reference::arithmetic_operation( - ArithmeticOperation::DIV, - ref_src, - ref_src2, - data_type, - ConvertPolicy::SATURATE)); - - const auto ref_t_dst = reference::permute(ref_dst_nchw, PermutationVector(2U, 0U, 1U)); - - // Tensor Info - auto input1_info = TensorInfo(input1_shape, 1, data_type, DataLayout::NHWC); - auto input2_info = TensorInfo(input2_shape, 1, data_type, DataLayout::NHWC); - auto dst_info = TensorInfo(); - auto acc_info = TensorInfo(); // Intermediate tensor for division - - // Initialise Scheduler - CLScheduler::get().default_reinit(); - const auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context(); - OperatorGraph op_graph; - - // add tensors - auto op_input1 = add_tensor(op_graph, input1_info); - auto op_input2 = add_tensor(op_graph, input2_info); - auto op_acc = 
add_tensor(op_graph, acc_info); - auto op_dst = add_tensor(op_graph, dst_info); - - add_op_elementwise_op(op_graph, eltwise_info, op_input1, op_input2, op_acc); - add_op_floor(op_graph, FloorDescriptor(), op_acc, op_dst); - - const ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } }; - ClWorkload workload; - build(workload, op_graph, workload_ctx); - - ClCompositeOperator op; - op.configure(cl_compile_ctx, workload); - - // Configure and add tensors. - CLTensor t_input1{}; - CLTensor t_input2{}; - CLTensor t_dst{}; - - // Init Tensors - t_input1.allocator()->init(input1_info); - t_input2.allocator()->init(input2_info); - t_dst.allocator()->init(dst_info); - - // Allocate and fill tensors - t_input1.allocator()->allocate(); - t_input2.allocator()->allocate(); - t_dst.allocator()->allocate(); - - fill(CLAccessor(t_input1), 0, library.get()); - fill(CLAccessor(t_input2), 1, library.get()); - - // "Pack" tensors - OpTensorBinding bp_tensors({ { op_input1, &t_input1 }, - { op_input2, &t_input2 }, - { op_dst, &t_dst } - }); - - // Populate prepare and run pack-maps (including allocating aux tensors) - ClAuxTensorData aux_tensor_data{}; - TensorPackMap prepare_pack_map{}; - TensorPackMap run_pack_map{}; - bind_tensors(aux_tensor_data, prepare_pack_map, run_pack_map, workload, bp_tensors); - - op.prepare(prepare_pack_map); - op.run(run_pack_map); - - RelativeTolerance tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */ - validate(CLAccessor(t_dst), ref_dst_nchw, tolerance_f32); -} -TEST_CASE(Dconv2dAddDiv, framework::DatasetMode::ALL) -{ - // output = div(divend, add(addend, conv2d1x1(direct_conv)(input, weights, bias))) - const auto data_type = DataType::F32; - const auto data_layout = DataLayout::NHWC; - - const auto input_shape = TensorShape(384, 12, 12); - const auto weight_shape = TensorShape(384, 1, 1, 16); - const auto dst_shape = TensorShape(16, 12, 12); - 
- // Tensor Info - auto input_info = TensorInfo(input_shape, 1, data_type, data_layout); - auto weight_info = TensorInfo(weight_shape, 1, data_type, data_layout); - auto addend_info = TensorInfo(dst_shape, 1, data_type, data_layout); - auto divend_info = TensorInfo(dst_shape, 1, data_type, data_layout); - auto acc_info = TensorInfo(); // Intermediate tensor for conv - auto acc_1_info = TensorInfo(); - auto dst_info = TensorInfo(); - - Conv2dDescriptor conv2d_desc{}; - ElementwiseDescriptor add_desc{ ArithmeticOperation::ADD }; - ElementwiseDescriptor div_desc{ ArithmeticOperation::DIV }; - - CLScheduler::get().default_reinit(); - const auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context(); - OperatorGraph op_graph; - - const auto op_input = add_tensor(op_graph, input_info); - const auto op_weight = add_tensor(op_graph, weight_info); - const auto op_addend = add_tensor(op_graph, addend_info); - const auto op_divend = add_tensor(op_graph, divend_info); - const auto op_acc = add_tensor(op_graph, acc_info); // temp accumulator; TensorInfo to be inferred - const auto op_acc_1 = add_tensor(op_graph, acc_1_info); // temp accumulator; TensorInfo to be inferred - const auto op_dst = add_tensor(op_graph, dst_info); - - auto conv2d = add_op_conv2d(op_graph, conv2d_desc, op_input, op_weight, op_acc); - force_conv2d_method(op_graph, conv2d, ConvolutionMethod::DIRECT); - add_op_elementwise_op(op_graph, add_desc, op_acc, op_addend, op_acc_1); - add_op_elementwise_op(op_graph, div_desc, op_acc_1, op_divend, op_dst); - - const ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } }; - ClWorkload workload; - build(workload, op_graph, workload_ctx); - - ClCompositeOperator op; - op.configure(cl_compile_ctx, workload); - - // Construct tensors - CLTensor t_input{}; - CLTensor t_weight{}; - CLTensor t_addend{}; - CLTensor t_divend{}; - CLTensor t_dst{}; - - // Init tensors - t_input.allocator()->init(input_info); - t_weight.allocator()->init(weight_info); 
- t_divend.allocator()->init(divend_info); - t_addend.allocator()->init(addend_info); - t_dst.allocator()->init(dst_info); - - // Allocate and fill tensors - t_input.allocator()->allocate(); - t_weight.allocator()->allocate(); - t_divend.allocator()->allocate(); - t_addend.allocator()->allocate(); - t_dst.allocator()->allocate(); - - // Fill - fill(CLAccessor(t_input), 0, library.get()); - fill(CLAccessor(t_weight), 1, library.get()); - fill(CLAccessor(t_addend), 2, library.get()); - fill(CLAccessor(t_divend), 3, library.get()); - - // Pack tensors - OpTensorBinding bp_tensors({ { op_input, &t_input }, - { op_weight, &t_weight }, - { op_addend, &t_addend }, - { op_divend, &t_divend }, - { op_dst, &t_dst } - }); - - // Populate prepare and run pack-maps (including allocating aux tensors) - ClAuxTensorData aux_tensor_data{}; - TensorPackMap prepare_pack_map{}; - TensorPackMap run_pack_map{}; - bind_tensors(aux_tensor_data, prepare_pack_map, run_pack_map, workload, bp_tensors); - - op.prepare(prepare_pack_map); - op.run(run_pack_map); - - // Create reference - SimpleTensor ref_input{ input_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC }; - SimpleTensor ref_weight{ weight_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC }; - SimpleTensor ref_bias_placeholder{ dst_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC }; - SimpleTensor ref_addend{ dst_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC }; - SimpleTensor ref_divend{ dst_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC }; - - // Fill reference - fill(ref_input, 0, library.get()); - fill(ref_weight, 1, library.get()); - fill(ref_addend, 2, library.get()); - fill(ref_divend, 3, library.get()); - - auto ref_input_nchw = reference::permute(ref_input, PermutationVector(1U, 2U, 0U)); - auto ref_weight_nchw = reference::permute(ref_weight, PermutationVector(1U, 2U, 0U)); - auto ref_bias_placeholder_nchw = reference::permute(ref_bias_placeholder, PermutationVector(1U, 2U, 
0U)); - auto ref_addend_nchw = reference::permute(ref_addend, PermutationVector(1U, 2U, 0U)); - auto ref_divend_nchw = reference::permute(ref_divend, PermutationVector(1U, 2U, 0U)); - - auto dst_shape_nchw = dst_shape; - permute(dst_shape_nchw, PermutationVector(1U, 2U, 0U)); - - PadStrideInfo legacy_pad_stride(conv2d_desc.stride.x(), conv2d_desc.stride.y(), conv2d_desc.pad.left, conv2d_desc.pad.right, conv2d_desc.pad.top, conv2d_desc.pad.bottom, DimensionRoundingType{}); - auto ref_acc_nchw = reference::arithmetic_operation( - ArithmeticOperation::ADD, - ref_addend_nchw, - reference::convolution_layer(ref_input_nchw, ref_weight_nchw, ref_bias_placeholder_nchw, dst_shape_nchw, legacy_pad_stride, conv2d_desc.dilation), - data_type, - ConvertPolicy{}); - - auto ref_t_dst_nchw = reference::arithmetic_operation( - ArithmeticOperation::DIV, - ref_acc_nchw, - ref_divend_nchw, - data_type, - ConvertPolicy{}); - - const auto ref_t_dst = reference::permute(ref_t_dst_nchw, PermutationVector(2U, 0U, 1U)); - - RelativeTolerance tolerance_f32(0.001f); - validate(CLAccessor(t_dst), ref_t_dst_nchw, tolerance_f32); -} - -TEST_SUITE_END() // ArbitraryFusion -TEST_SUITE_END() // DYNAMIC_FUSION -TEST_SUITE_END() // UNIT -TEST_SUITE_END() // CL - -} // namespace validation -} // namespace test -} // namespace arm_compute - -#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ diff --git a/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp b/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp deleted file mode 100644 index dc98d72f4b..0000000000 --- a/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp +++ /dev/null @@ -1,215 +0,0 @@ -/* - * Copyright (c) 2022 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION - -#include "src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.h" -#include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h" - -#include "src/core/utils/helpers/float_ops.h" -#include "src/gpu/cl/kernels/ClElementwiseKernel.h" -#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h" -#include "tests/CL/CLAccessor.h" -#include "tests/framework/Macros.h" -#include "tests/framework/datasets/Datasets.h" -#include "tests/validation/Validation.h" -#include "tests/validation/reference/ConvolutionLayer.h" -#include "tests/validation/reference/ElementwiseOperations.h" -#include "tests/validation/reference/GEMM.h" -#include "tests/validation/reference/Permute.h" - -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/AccessWindowStatic.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include "tests/validation/CL/UNIT/dynamic_fusion/Utils.h" - -#include - -using namespace arm_compute::experimental::dynamic_fusion; -using namespace arm_compute::test::validation::utils; - -namespace arm_compute -{ -namespace test -{ -namespace validation -{ -TEST_SUITE(CL) -TEST_SUITE(UNIT) -TEST_SUITE(DYNAMIC_FUSION) -TEST_SUITE(ClCompositeKernel) -TEST_SUITE(Validate) - -TEST_CASE(MoveNet_SubGraph_1_DirectConv2d, framework::DatasetMode::ALL) -{ - /* Computation: - * out = add(addend, direct_conv2d(lhs, rhs, bias)) (non-broadcast) - */ - - ClCompositeKernel kernel{}; - ClKernelBlueprint bp{}; - ClKernelCode cl_code{}; - ClExecutionDescriptor exec_desc{}; - Status st{}; - - const auto data_type = DataType::F32; - const auto conv_info = Conv2dDescriptor{ Padding2D{ 1U, 1U, 1U, 1U }, { 1U, 1U } /* stride */ }; - const auto eltwise_info = ElementwiseDescriptor{ ArithmeticOperation::ADD }; - - const auto width = 7U; - const auto height = 6U; - const auto IFM = 5U; - const auto OFM = 4U; - const auto kernel_sz = 3U; - - const auto src_shape = 
TensorShape(IFM, width, height); - const auto wei_shape = TensorShape(IFM, kernel_sz, kernel_sz, OFM); - const auto bia_shape = TensorShape(OFM); - const auto addend_shape = TensorShape(1, 1); - const auto dst_shape = TensorShape(OFM, width, height); - - auto src_info = TensorInfo(src_shape, 1, data_type, DataLayout::NHWC); - auto wei_info = TensorInfo(wei_shape, 1, data_type, DataLayout::NHWC); - auto bia_info = TensorInfo(bia_shape, 1, data_type, DataLayout::NHWC); - auto addend_info = TensorInfo(addend_shape, 1, data_type, DataLayout::NHWC); - auto dst_info = TensorInfo(dst_shape, 1, data_type, DataLayout::NHWC); - - const auto n0 = std::min(OFM, 4u); - const auto m0 = (OFM > 16) ? ((data_type == DataType::F32) ? 2U : 4U) : 1U; - - const ClDirectConv2dKernelDescriptor direct_conv2d_desc{ conv_info }; - const ClElementwiseKernelDescriptor eltwise_add_desc{ eltwise_info }; - const TileDescriptor store_tile_info{ Size2D(n0, m0), Size2D(width, height), ClippingStrategy::TOP_LEFT }; - - ArgumentID src_id{ g_arg_placeholder }; - ArgumentID wei_id{ g_arg_placeholder }; - ArgumentID bia_id{ g_arg_placeholder }; - ArgumentID acc_id{ g_arg_placeholder }; - ArgumentID acc_1_id{ g_arg_placeholder }; - ArgumentID addend_id{ g_arg_placeholder }; - ArgumentID dst_id{ g_arg_placeholder }; - - st = add_tensor(bp, &src_info, src_id); - st = add_tensor(bp, &wei_info, wei_id); - st = add_tensor(bp, &bia_info, bia_id); - st = add_tensor(bp, &dst_info, acc_id); - st = add_tensor(bp, &dst_info, acc_1_id); - st = add_tensor(bp, &addend_info, addend_id); - st = add_tensor(bp, &dst_info, dst_id); - - st = add_kcomp_direct_conv2d(bp, direct_conv2d_desc, src_id, wei_id, bia_id, acc_id); - st = add_kcomp_eltwise_op(bp, eltwise_add_desc, addend_id, acc_id, acc_1_id); - st = add_kcomp_store(bp, StoreType::TStoreIndirectWidthSelect, acc_1_id, dst_id); - - exec_desc.skip_sliding_window = true; - - st = set_tile_info(bp, store_tile_info); - st = build(cl_code, ClCodeBuilderContext{ GpuInfo{ 
GPUTarget::G71 } }, bp); - st = tune_static(exec_desc, cl_code); - - CLScheduler::get().default_reinit(); - kernel.configure(CLKernelLibrary::get().get_compile_context(), cl_code); - - // Construct tensors - CLTensor src{}; - CLTensor wei{}; - CLTensor bia{}; - CLTensor addend{}; - CLTensor dst{}; - - // Init tensors - src.allocator()->init(src_info); - wei.allocator()->init(wei_info); - bia.allocator()->init(bia_info); - addend.allocator()->init(dst_info); - dst.allocator()->init(dst_info); - - // "Pack" tensors - ITensorPack tensors{ { src_id, &src }, - { wei_id, &wei }, - { bia_id, &bia }, - { addend_id, &addend }, - { dst_id, &dst } }; - - // Allocate and fill tensors - src.allocator()->allocate(); - wei.allocator()->allocate(); - bia.allocator()->allocate(); - addend.allocator()->allocate(); - dst.allocator()->allocate(); - - fill(CLAccessor(src), 0, library.get()); - fill(CLAccessor(wei), 1, library.get()); - fill(CLAccessor(bia), 2, library.get()); - fill(CLAccessor(addend), 3, library.get()); - - CLScheduler::get().enqueue_op(kernel, tensors, exec_desc, true); - - // Create reference - SimpleTensor ref_src_nhwc{ src_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC }; - SimpleTensor ref_wei_nhwc{ wei_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC }; - SimpleTensor ref_bia_nhwc{ bia_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC }; - SimpleTensor ref_addend_nhwc{ addend_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC }; - - // Fill reference - fill(ref_src_nhwc, 0, library.get()); - fill(ref_wei_nhwc, 1, library.get()); - fill(ref_bia_nhwc, 2, library.get()); - fill(ref_addend_nhwc, 3, library.get()); - - auto ref_src = reference::permute(ref_src_nhwc, PermutationVector(1U, 2U, 0U)); - auto ref_wei = reference::permute(ref_wei_nhwc, PermutationVector(1U, 2U, 0U)); - auto ref_bia = reference::permute(ref_bia_nhwc, PermutationVector(1U, 2U, 0U)); - auto ref_addend = reference::permute(ref_addend_nhwc, 
PermutationVector(1U, 2U, 0U)); - - TensorShape dst_shape_nchw{ dst_shape }; - permute(dst_shape_nchw, PermutationVector(1U, 2U, 0U)); - - const auto ref_dst = reference::arithmetic_operation( - ArithmeticOperation::ADD, - ref_addend, - reference::convolution_layer(ref_src, ref_wei, ref_bia, dst_shape_nchw, - PadStrideInfo - { - static_cast(conv_info.stride.x()), - static_cast(conv_info.stride.y()), - static_cast(conv_info.pad.left), - static_cast(conv_info.pad.top) }), - data_type, - ConvertPolicy::SATURATE); - - RelativeTolerance tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */ - validate(CLAccessor(dst), ref_dst, tolerance_f32); -} - -TEST_SUITE_END() // Validate -TEST_SUITE_END() // ClCompositeKernel -TEST_SUITE_END() // DYNAMIC_FUSION -TEST_SUITE_END() // UNIT -TEST_SUITE_END() // CL -} // namespace validation -} // namespace test -} // namespace arm_compute -#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file diff --git a/tests/validation/CL/UNIT/dynamic_fusion/DependencyGraph.cpp b/tests/validation/CL/UNIT/dynamic_fusion/DependencyGraph.cpp deleted file mode 100644 index 1824efff99..0000000000 --- a/tests/validation/CL/UNIT/dynamic_fusion/DependencyGraph.cpp +++ /dev/null @@ -1,266 +0,0 @@ -/* - * Copyright (c) 2022 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION -#include "arm_compute/core/experimental/DependencyGraph.h" - -#include "tests/framework/Asserts.h" -#include "tests/framework/Macros.h" - -using namespace arm_compute::experimental::dynamic_fusion; - -namespace arm_compute -{ -namespace test -{ -namespace validation -{ -TEST_SUITE(CL) - -TEST_SUITE(UNIT) -TEST_SUITE(DYNAMIC_FUSION) -TEST_SUITE(DependencyGraph) - -TEST_CASE(Correct_Graph_Creation_Should_Pass, framework::DatasetMode::ALL) -{ - DependencyGraph graph{}; - const auto t0 = graph.add_tensor(); - const auto t1 = graph.add_tensor(); - const auto t2 = graph.add_tensor(); - const auto t3 = graph.add_tensor(); - const auto t4 = graph.add_tensor(); - - const auto o0 = graph.add_operator({ t0, t1 }, { t2 }).second; - const auto o1 = graph.add_operator({ t3, t2 }, { t4 }).second; - - ARM_COMPUTE_EXPECT_EQUAL(graph.number_of_ops(), 2U, framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT_EQUAL(graph.number_of_tensors(), 5U, framework::LogLevel::ERRORS); - - const DependencyGraph ref_graph - { - { - // src_tensors - { o0, { t0, t1 } }, - { o1, { t3, t2 } }, - }, - { - // dst_tensors - { o0, { t2 } }, - { o1, { t4 } }, - }, - { - // src_ops - { t0, {} }, - { t1, {} }, - { t2, { o0 } }, - { t3, {} }, - { t4, { o1 } }, - }, - { - // dst_ops - { t0, { o0 } }, - { t1, { o0 } }, - { t2, { o1 } }, - { t3, { o1 } }, - { t4, {} }, - } - - }; - ARM_COMPUTE_EXPECT(graph == ref_graph, framework::LogLevel::ERRORS); -} - -TEST_CASE(Correct_Merge_Points_Should_Enable_Graph_Expansion, framework::DatasetMode::ALL) -{ - // Merge points are a simple way to collapse "graph of graphs" into a single graph - // Suppose we have a top-level graph g0 - DependencyGraph g0{}; - const auto g0_t0 = g0.add_tensor(); - const auto g0_t1 = g0.add_tensor(); - const auto g0_t2 = g0.add_tensor(); - const auto g0_t3 = g0.add_tensor(); - const auto g0_t4 = g0.add_tensor(); - g0.add_operator({ g0_t0, g0_t1 }, { g0_t2 }); // g0_o0 - g0.add_operator({ g0_t3, 
g0_t2 }, { g0_t4 }); // g0_o1 - - // Then g0 expands into g1, with additional nodes added in-between "merge point tensors" - // Note that the expansion logic may be local to each operator node - DependencyGraph g1{}; - // g0_o0 expands into g1_o0, g1_o1, g1_o2 - const auto g1_t0 = g1.add_tensor(g0_t0); - const auto g1_t1 = g1.add_tensor(g0_t1); - const auto g1_t2 = g1.add_tensor(); - const auto g1_t3 = g1.add_tensor(); - const auto g1_t4 = g1.add_tensor(g0_t2); - const auto g1_o0 = g1.add_operator({ g1_t0 }, { g1_t2 }).second; - const auto g1_o1 = g1.add_operator({ g1_t1 }, { g1_t3 }).second; - const auto g1_o2 = g1.add_operator({ g1_t2, g1_t3 }, { g1_t4 }).second; - - // g0_o1 expands into g1_o3 - const auto g1_t5 = g1.add_tensor(g0_t3); - const auto g1_t6 = g1.add_tensor(g0_t2); - const auto g1_t7 = g1.add_tensor(g0_t4); - ARM_COMPUTE_EXPECT_EQUAL(g1_t4, g1_t6, framework::LogLevel::ERRORS); // both associate with the same merge point g0_t2, thus they should point to the same tensor in g1 - const auto g1_o3 = g1.add_operator({ g1_t5, g1_t6 }, { g1_t7 }).second; - - const DependencyGraph ref_graph - { - { - // src_tensors - { g1_o0, { g1_t0 } }, - { g1_o1, { g1_t1 } }, - { g1_o2, { g1_t2, g1_t3 } }, - { g1_o3, { g1_t5, g1_t4 } }, - }, - { - // dst_tensors - { g1_o0, { g1_t2 } }, - { g1_o1, { g1_t3 } }, - { g1_o2, { g1_t4 } }, - { g1_o3, { g1_t7 } }, - }, - { - // src_ops - { g1_t0, {} }, - { g1_t1, {} }, - { g1_t2, { g1_o0 } }, - { g1_t3, { g1_o1 } }, - { g1_t4, { g1_o2 } }, - { g1_t5, {} }, - { g1_t7, { g1_o3 } }, - }, - { - // dst_ops - { g1_t0, { g1_o0 } }, - { g1_t1, { g1_o1 } }, - { g1_t2, { g1_o2 } }, - { g1_t3, { g1_o2 } }, - { g1_t4, { g1_o3 } }, - { g1_t5, { g1_o3 } }, - { g1_t7, {} }, - }, - { - // merge points - { g0_t0, g1_t0 }, - { g0_t1, g1_t1 }, - { g0_t2, g1_t4 }, - { g0_t3, g1_t5 }, - { g0_t4, g1_t7 }, - } - }; - ARM_COMPUTE_EXPECT(g1 == ref_graph, framework::LogLevel::ERRORS); -} - -TEST_CASE(Path_Existence_Check_0, framework::DatasetMode::ALL) -{ 
- DependencyGraph graph{}; - const auto t0 = graph.add_tensor(); - const auto t1 = graph.add_tensor(); - const auto t2 = graph.add_tensor(); - const auto t3 = graph.add_tensor(); - const auto t4 = graph.add_tensor(); - const auto t5 = graph.add_tensor(); - const auto t6 = graph.add_tensor(); - const auto t7 = graph.add_tensor(); - const auto o0 = graph.add_operator({ t1 }, { t3, t4 }).second; - const auto o1 = graph.add_operator({ t3 }, { t5 }).second; - const auto o2 = graph.add_operator({ t5, t6 }, { t7 }).second; - const auto o3 = graph.add_operator({ t4 }, { t6 }).second; - const auto o4 = graph.add_operator({ t0, t5 }, { t2 }).second; - - ARM_COMPUTE_UNUSED(o1, o3); - - ARM_COMPUTE_EXPECT((graph.path_exists_from_tensor_to_op(t3, o2)), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT((graph.path_exists_from_tensor_to_op(t1, o4)), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(!(graph.path_exists_from_tensor_to_op(t2, o4)), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(!(graph.path_exists_from_tensor_to_op(t0, o2)), framework::LogLevel::ERRORS); - - ARM_COMPUTE_EXPECT((graph.path_exists_from_op_to_op(o0, o2)), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(!(graph.path_exists_from_op_to_op(o2, o0)), framework::LogLevel::ERRORS); - - ARM_COMPUTE_EXPECT(!(graph.path_exists_from_op_to_op(o2, o4)), framework::LogLevel::ERRORS); -} - -TEST_CASE(Correct_Topological_Sort_Should_Pass, framework::DatasetMode::ALL) -{ - DependencyGraph graph{}; - const auto t0 = graph.add_tensor(); - const auto t1 = graph.add_tensor(); - const auto t2 = graph.add_tensor(); - const auto t3 = graph.add_tensor(); - const auto t4 = graph.add_tensor(); - const auto t5 = graph.add_tensor(); - const auto t6 = graph.add_tensor(); - const auto t7 = graph.add_tensor(); - const auto o0 = graph.add_operator({ t1 }, { t3, t4 }).second; - const auto o1 = graph.add_operator({ t3 }, { t5 }).second; - const auto o2 = graph.add_operator({ t5, t6 }, { t7 }).second; - const auto o3 = 
graph.add_operator({ t4 }, { t6 }).second; - const auto o4 = graph.add_operator({ t0, t5 }, { t2 }).second; - - const auto res = graph.topological_sort(); - ARM_COMPUTE_EXPECT(bool(res.first), framework::LogLevel::ERRORS); - std::vector ref_sorted_op_packs - { - { o0, { t1 }, { t3, t4 } }, - { o1, { t3 }, { t5 } }, - { o3, { t4 }, { t6 } }, - { o4, { t0, t5 }, { t2 } }, - { o2, { t5, t6 }, { t7 } }, - - }; - ARM_COMPUTE_EXPECT((res.second == ref_sorted_op_packs), framework::LogLevel::ERRORS); -} - -TEST_CASE(Cycles_Should_Fail, framework::DatasetMode::ALL) -{ - DependencyGraph graph{}; - const auto t0 = graph.add_tensor(); - const auto t1 = graph.add_tensor(); - const auto t2 = graph.add_tensor(); - const auto t3 = graph.add_tensor(); - - graph.add_operator({ t0, t1 }, { t2 }); - graph.add_operator({ t2 }, { t1, t3 }); // Ideally error should occur here - - const auto res = graph.topological_sort(); - ARM_COMPUTE_EXPECT(!bool(res.first), framework::LogLevel::ERRORS); -} -TEST_CASE(Loops_Should_Fail, framework::DatasetMode::ALL) -{ - DependencyGraph graph{}; - const auto t0 = graph.add_tensor(); - const auto t1 = graph.add_tensor(); - const auto t2 = graph.add_tensor(); - - ARM_COMPUTE_EXPECT_THROW(graph.add_operator({ t0, t2 }, { t1, t2 }).first, framework::LogLevel::ERRORS); - ARM_COMPUTE_UNUSED(t0, t1, t2); -} -TEST_SUITE_END() // DependencyGraph -TEST_SUITE_END() // DYNAMIC_FUSION -TEST_SUITE_END() // UNIT - -TEST_SUITE_END() // CL -} // namespace validation -} // namespace test -} // namespace arm_compute -#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file diff --git a/tests/validation/CL/UNIT/dynamic_fusion/Floor.cpp b/tests/validation/CL/UNIT/dynamic_fusion/Floor.cpp deleted file mode 100644 index 2b8f69e5e7..0000000000 --- a/tests/validation/CL/UNIT/dynamic_fusion/Floor.cpp +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Copyright (c) 2022 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION -#include "arm_compute/core/TensorInfo.h" - -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/experimental/ClWorkload.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/runtime/experimental/ClCompositeOperator.h" -#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelDescriptors.h" -#include "tests/CL/CLAccessor.h" -#include "tests/framework/Asserts.h" -#include "tests/framework/Macros.h" -#include "tests/validation/CL/UNIT/dynamic_fusion/Utils.h" -#include "tests/validation/Validation.h" - -#include "tests/validation/reference/Floor.h" -#include "tests/validation/reference/Permute.h" - -#ifdef ARM_COMPUTE_ASSERTS_ENABLED -#include "tests/SimpleTensorPrinter.h" -#endif /* ARM_COMPUTE_ASSERTS_ENABLED */ - -using namespace arm_compute::experimental::dynamic_fusion; -using namespace arm_compute::test::validation::utils; - -namespace arm_compute -{ -namespace test -{ -namespace validation -{ -TEST_SUITE(CL) -TEST_SUITE(UNIT) -TEST_SUITE(DYNAMIC_FUSION) -TEST_CASE(Operator_Floor_1_F32, framework::DatasetMode::ALL) -{ - /* Computation: - * out = floor(input) - */ - const auto data_type = DataType::F32; - const auto data_layout = DataLayout::NHWC; - const auto t_shape = TensorShape(32, 16); - auto t_input_info = TensorInfo(t_shape, 1, data_type, data_layout); - auto t_dst_info = TensorInfo(); - - FloorDescriptor floor_desc{}; - - // Create reference - SimpleTensor ref_t_input{ t_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC }; - - // Fill reference - fill(ref_t_input, 0, library.get()); - - auto ref_t_input_nchw = reference::permute(ref_t_input, PermutationVector(1U, 2U, 0U)); - auto t_dst_shape_nchw = t_shape; - permute(t_dst_shape_nchw, PermutationVector(1U, 2U, 0U)); - - auto ref_t_dst_nchw = reference::floor_layer(ref_t_input_nchw); - const auto ref_t_dst = reference::permute(ref_t_dst_nchw, PermutationVector(2U, 0U, 1U)); - - 
CLScheduler::get().default_reinit(); - const auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context(); - OperatorGraph op_graph; - - const auto op_t_input = add_tensor(op_graph, t_input_info); - const auto op_t_dst = add_tensor(op_graph, t_dst_info); - - add_op_floor(op_graph, floor_desc, op_t_input, op_t_dst); - - const ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } }; - ClWorkload workload; - build(workload, op_graph, workload_ctx); - - ClCompositeOperator op; - op.configure(cl_compile_ctx, workload); - - // Construct tensors - CLTensor t_input{}; - CLTensor t_dst{}; - - // Init tensors - t_input.allocator()->init(t_input_info); - t_dst.allocator()->init(t_dst_info); - - // Allocate and fill tensors - t_input.allocator()->allocate(); - t_dst.allocator()->allocate(); - fill(CLAccessor(t_input), 0, library.get()); - // "Pack" tensors - OpTensorBinding bp_tensors({ { op_t_input, &t_input }, - { op_t_dst, &t_dst } - }); - - // Populate prepare and run pack-maps (including allocating aux tensors) - ClAuxTensorData aux_tensor_data{}; - TensorPackMap prepare_pack_map{}; - TensorPackMap run_pack_map{}; - bind_tensors(aux_tensor_data, prepare_pack_map, run_pack_map, workload, bp_tensors); - - op.prepare(prepare_pack_map); - op.run(run_pack_map); - RelativeTolerance tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */ - validate(CLAccessor(t_dst), ref_t_dst_nchw, tolerance_f32); -} - -TEST_SUITE_END() // DYNAMIC_FUSION -TEST_SUITE_END() // UNIT -TEST_SUITE_END() // CL -} // namespace validation -} // namespace test -} // namespace arm_compute -#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file diff --git a/tests/validation/CL/UNIT/dynamic_fusion/Integration_OperatorFuseMovenetSubGraph1.cpp b/tests/validation/CL/UNIT/dynamic_fusion/Integration_OperatorFuseMovenetSubGraph1.cpp deleted file mode 100644 index 3a8b7c8ce8..0000000000 
--- a/tests/validation/CL/UNIT/dynamic_fusion/Integration_OperatorFuseMovenetSubGraph1.cpp +++ /dev/null @@ -1,402 +0,0 @@ -/* - * Copyright (c) 2022 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION -#include "arm_compute/core/TensorInfo.h" - -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/core/experimental/ClWorkload.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/runtime/experimental/ClCompositeOperator.h" -#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelDescriptors.h" -#include "src/gpu/cl/operators/ClAdd.h" -#include "src/gpu/cl/operators/ClConv2d.h" -#include "tests/CL/CLAccessor.h" -#include "tests/framework/Asserts.h" -#include "tests/framework/Macros.h" -#include "tests/validation/CL/UNIT/dynamic_fusion/Utils.h" -#include "tests/validation/Validation.h" - -#include "tests/validation/reference/ConvolutionLayer.h" -#include "tests/validation/reference/ElementwiseOperations.h" -#include "tests/validation/reference/Permute.h" - -#ifdef ARM_COMPUTE_ASSERTS_ENABLED -#include "tests/SimpleTensorPrinter.h" -#endif /* ARM_COMPUTE_ASSERTS_ENABLED */ - -using namespace arm_compute::experimental::dynamic_fusion; -using namespace arm_compute::test::validation::utils; - -namespace arm_compute -{ -namespace test -{ -namespace validation -{ -TEST_SUITE(CL) -TEST_SUITE(INTEGRATION) -TEST_SUITE(DYNAMIC_FUSION) -TEST_CASE(Operator_Fuse_Movenet_SubGraph_1_F32, framework::DatasetMode::ALL) -{ - // Please refer to: https://confluence.arm.com/pages/viewpage.action?pageId=886243697 - /* Computation: - * out = add_desc(addend, conv2d1x1(direct_conv)(input, weights, bias)) - */ - const auto data_type = DataType::F32; - const auto data_layout = DataLayout::NHWC; - const auto t_input_shape = TensorShape(384, 12, 12); - // const auto t_weight_shape = TensorShape(384, 1, 1, 64); - // const auto t_dst_shape = TensorShape(64, 12, 12); - const auto t_weight_shape = TensorShape(384, 1, 1, 16); - const auto t_dst_shape = TensorShape(16, 12, 12); - auto t_input_info = TensorInfo(t_input_shape, 1, data_type, data_layout); - auto t_weight_info = 
TensorInfo(t_weight_shape, 1, data_type, data_layout); - auto t_l1_addend_info = TensorInfo(t_dst_shape, 1, data_type, data_layout); - auto t_acc_info = TensorInfo(); // Intermediate tensor for cond3 - auto t_dst_info = TensorInfo(); - - Conv2dDescriptor conv2d_desc{}; - ElementwiseDescriptor add_desc{ ArithmeticOperation::ADD }; - - // Create reference - SimpleTensor ref_t_input{ t_input_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC }; - SimpleTensor ref_t_weight{ t_weight_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC }; - SimpleTensor ref_t_bias_placeholder{ t_dst_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC }; - SimpleTensor ref_t_l1_addend{ t_dst_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC }; - - // Fill reference - fill(ref_t_input, 0, library.get()); - fill(ref_t_weight, 1, library.get()); - fill(ref_t_l1_addend, 2, library.get()); - - auto ref_t_input_nchw = reference::permute(ref_t_input, PermutationVector(1U, 2U, 0U)); - auto ref_t_weight_nchw = reference::permute(ref_t_weight, PermutationVector(1U, 2U, 0U)); - auto ref_t_bias_placeholder_nchw = reference::permute(ref_t_bias_placeholder, PermutationVector(1U, 2U, 0U)); - auto ref_t_l1_addend_nchw = reference::permute(ref_t_l1_addend, PermutationVector(1U, 2U, 0U)); - auto t_dst_shape_nchw = t_dst_shape; - permute(t_dst_shape_nchw, PermutationVector(1U, 2U, 0U)); - - PadStrideInfo legacy_pad_stride(conv2d_desc.stride.x(), conv2d_desc.stride.y(), conv2d_desc.pad.left, conv2d_desc.pad.right, conv2d_desc.pad.top, conv2d_desc.pad.bottom, DimensionRoundingType{}); - auto ref_t_dst_nchw = reference::arithmetic_operation( - ArithmeticOperation::ADD, - ref_t_l1_addend_nchw, - reference::convolution_layer(ref_t_input_nchw, ref_t_weight_nchw, ref_t_bias_placeholder_nchw, t_dst_shape_nchw, legacy_pad_stride, conv2d_desc.dilation), - data_type, - ConvertPolicy{}); - const auto ref_t_dst = reference::permute(ref_t_dst_nchw, PermutationVector(2U, 0U, 1U)); - - 
CLScheduler::get().default_reinit(); - const auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context(); - OperatorGraph op_graph; - - const auto op_t_input = add_tensor(op_graph, t_input_info); - const auto op_t_weight = add_tensor(op_graph, t_weight_info); - const auto op_t_l1_addend = add_tensor(op_graph, t_l1_addend_info); - const auto op_t_acc = add_tensor(op_graph, t_acc_info); // temp accumulator; TensorInfo to be inferred - const auto op_t_dst = add_tensor(op_graph, t_dst_info); - - auto conv2d = add_op_conv2d(op_graph, conv2d_desc, op_t_input, op_t_weight, op_t_acc); - force_conv2d_method(op_graph, conv2d, ConvolutionMethod::DIRECT); - add_op_elementwise_op(op_graph, add_desc, op_t_acc, op_t_l1_addend, op_t_dst); - - const ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } }; - ClWorkload workload; - build(workload, op_graph, workload_ctx); - - ClCompositeOperator op; - op.configure(cl_compile_ctx, workload); - - // Construct tensors - CLTensor t_input{}; - CLTensor t_weight{}; - CLTensor t_l1_addend{}; - CLTensor t_dst{}; - - // Init tensors - t_input.allocator()->init(t_input_info); - t_weight.allocator()->init(t_weight_info); - t_l1_addend.allocator()->init(t_dst_info); - t_dst.allocator()->init(t_dst_info); - - // Allocate and fill tensors - t_input.allocator()->allocate(); - t_weight.allocator()->allocate(); - t_l1_addend.allocator()->allocate(); - t_dst.allocator()->allocate(); - fill(CLAccessor(t_input), 0, library.get()); - fill(CLAccessor(t_weight), 1, library.get()); - fill(CLAccessor(t_l1_addend), 2, library.get()); - // "Pack" tensors - OpTensorBinding bp_tensors({ { op_t_input, &t_input }, - { op_t_weight, &t_weight }, - { op_t_l1_addend, &t_l1_addend }, - { op_t_dst, &t_dst } - }); - - // Populate prepare and run pack-maps (including allocating aux tensors) - ClAuxTensorData aux_tensor_data{}; - TensorPackMap prepare_pack_map{}; - TensorPackMap run_pack_map{}; - bind_tensors(aux_tensor_data, prepare_pack_map, 
run_pack_map, workload, bp_tensors); - - op.prepare(prepare_pack_map); - op.run(run_pack_map); - RelativeTolerance tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */ - validate(CLAccessor(t_dst), ref_t_dst_nchw, tolerance_f32); -} -TEST_SUITE(Unsupported) -TEST_CASE(DataType_QASYMM8, framework::DatasetMode::ALL) -{ - const auto data_type = DataType::QASYMM8; - const auto data_layout = DataLayout::NHWC; - const auto t_input_shape = TensorShape(384, 12, 12); - const auto t_weight_shape = TensorShape(384, 1, 1, 64); - const auto t_dst_shape = TensorShape(64, 12, 12); - auto t_input_info = TensorInfo(t_input_shape, 1, data_type, data_layout); - auto t_weight_info = TensorInfo(t_weight_shape, 1, data_type, data_layout); - auto t_l1_addend_info = TensorInfo(t_dst_shape, 1, data_type, data_layout); - auto t_acc_info = TensorInfo(t_dst_shape, 1, data_type, data_layout); - auto t_dst_info = TensorInfo(t_dst_shape, 1, data_type, data_layout); - - Conv2dDescriptor conv2d_desc{}; - ElementwiseDescriptor add_desc{}; - - OperatorGraph op_graph; - - const auto op_t_input = add_tensor(op_graph, t_input_info); - const auto op_t_weight = add_tensor(op_graph, t_weight_info); - const auto op_t_l1_addend = add_tensor(op_graph, t_l1_addend_info); - const auto op_t_acc = add_tensor(op_graph, t_acc_info); // temp accumulator; TensorInfo to be inferred - const auto op_t_dst = add_tensor(op_graph, t_dst_info); - - auto conv2d = add_op_conv2d(op_graph, conv2d_desc, op_t_input, op_t_weight, op_t_acc); - add_op_elementwise_op(op_graph, add_desc, op_t_acc, op_t_l1_addend, op_t_dst); - force_conv2d_method(op_graph, conv2d, ConvolutionMethod::DIRECT); - - const ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } }; - ClWorkload workload; - const auto success = build(workload, op_graph, workload_ctx); - - ARM_COMPUTE_EXPECT(!bool(success), framework::LogLevel::ERRORS); - 
ARM_COMPUTE_EXPECT(!bool(ClCompositeOperator::validate(workload)), framework::LogLevel::ERRORS); -} -TEST_CASE(DataLayout_NCHW, framework::DatasetMode::ALL) -{ - const auto data_type = DataType::F32; - const auto data_layout = DataLayout::NCHW; - const auto t_input_shape = TensorShape(384, 12, 12); - const auto t_weight_shape = TensorShape(384, 1, 1, 64); - const auto t_dst_shape = TensorShape(64, 12, 12); - auto t_input_info = TensorInfo(t_input_shape, 1, data_type, data_layout); - auto t_weight_info = TensorInfo(t_weight_shape, 1, data_type, data_layout); - auto t_dst_info = TensorInfo(t_dst_shape, 1, data_type, data_layout); - - Conv2dDescriptor conv2d_desc{}; - - OperatorGraph op_graph; - - const auto op_t_input = add_tensor(op_graph, t_input_info); - const auto op_t_weight = add_tensor(op_graph, t_weight_info); - const auto op_t_dst = add_tensor(op_graph, t_dst_info); - - auto conv2d = add_op_conv2d(op_graph, conv2d_desc, op_t_input, op_t_weight, op_t_dst); - force_conv2d_method(op_graph, conv2d, ConvolutionMethod::DIRECT); - const ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } }; - ClWorkload workload; - const auto success = build(workload, op_graph, workload_ctx); - - ARM_COMPUTE_EXPECT(!bool(success), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(!bool(ClCompositeOperator::validate(workload)), framework::LogLevel::ERRORS); -} -TEST_SUITE_END() // Unsupported - -TEST_SUITE(Invalid) -TEST_CASE(Multiple_Complex_Ops_0, framework::DatasetMode::ALL) -{ - /* Computation: - * out = conv2d(conv2d(l0_input, l0_weight), l1_weight) - */ - const auto data_type = DataType::F32; - const auto data_layout = DataLayout::NHWC; - const auto t_l0_input_shape = TensorShape(1024, 56, 56); - const auto t_l0_weight_shape = TensorShape(512, 1024, 1, 1); - const auto t_l1_weight_shape = TensorShape(512, 256, 1, 1); - - auto t_l0_input_info = TensorInfo(t_l0_input_shape, 1, data_type, data_layout); - auto t_l0_weight_info = TensorInfo(t_l0_weight_shape, 1, 
data_type, data_layout); - auto t_l1_weight_info = TensorInfo(t_l1_weight_shape, 1, data_type, data_layout); - auto t_l0_dst_info = TensorInfo(); - auto t_dst_info = TensorInfo(); - - OperatorGraph op_graph; - const auto conv2d_desc = Conv2dDescriptor{}; - - const auto op_t_l0_input = add_tensor(op_graph, t_l0_input_info); - const auto op_t_l0_weight = add_tensor(op_graph, t_l0_weight_info); - const auto op_t_l1_weight = add_tensor(op_graph, t_l1_weight_info); - const auto op_t_l0_dst = add_tensor(op_graph, t_l0_dst_info); // temp accumulator; TensorInfo to be inferred - const auto op_t_dst = add_tensor(op_graph, t_dst_info); - - add_op_conv2d(op_graph, conv2d_desc, op_t_l0_input, op_t_l0_weight, op_t_l0_dst); - add_op_conv2d(op_graph, conv2d_desc, op_t_l0_dst, op_t_l1_weight, op_t_dst); - - const ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } }; - ClWorkload workload; - const auto success = build(workload, op_graph, workload_ctx); - - ARM_COMPUTE_EXPECT(!bool(success), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(!bool(ClCompositeOperator::validate(workload)), framework::LogLevel::ERRORS); -} -TEST_CASE(Enlarging_Execution_Space, framework::DatasetMode::ALL) -{ - /* Computation: - * out = add(l2_lhs, add(add(l0_lhs, l0_rhs), l1_rhs)) - */ - const auto data_type = DataType::F32; - const auto data_layout = DataLayout::NHWC; - const auto t_l0_lhs_shape = TensorShape(1, 256, 3); - const auto t_l0_rhs_shape = TensorShape(1, 256, 3); - const auto t_l1_rhs_shape = TensorShape(1, 1, 3); - const auto t_l2_lhs_shape = TensorShape(1024, 1, 3); - - auto t_l0_lhs_info = TensorInfo(t_l0_lhs_shape, 1, data_type, data_layout); - auto t_l0_rhs_info = TensorInfo(t_l0_rhs_shape, 1, data_type, data_layout); - auto t_l1_rhs_info = TensorInfo(t_l1_rhs_shape, 1, data_type, data_layout); - auto t_l2_lhs_info = TensorInfo(t_l2_lhs_shape, 1, data_type, data_layout); - auto t_l0_dst_info = TensorInfo(); - auto t_l1_dst_info = TensorInfo(); - auto t_dst_info = 
TensorInfo(); - - OperatorGraph op_graph; - const auto add_desc = ElementwiseDescriptor{}; - - const auto op_t_l0_lhs = add_tensor(op_graph, t_l0_lhs_info); - const auto op_t_l0_rhs = add_tensor(op_graph, t_l0_rhs_info); - const auto op_t_l1_rhs = add_tensor(op_graph, t_l1_rhs_info); - const auto op_t_l2_lhs = add_tensor(op_graph, t_l2_lhs_info); - const auto op_t_l0_dst = add_tensor(op_graph, t_l0_dst_info); // temp accumulator; TensorInfo to be inferred - const auto op_t_l1_dst = add_tensor(op_graph, t_l1_dst_info); // temp accumulator; TensorInfo to be inferred - const auto op_t_dst = add_tensor(op_graph, t_dst_info); - - add_op_elementwise_op(op_graph, add_desc, op_t_l0_lhs, op_t_l0_rhs, op_t_l0_dst); - add_op_elementwise_op(op_graph, add_desc, op_t_l0_dst, op_t_l1_rhs, op_t_l1_dst); - add_op_elementwise_op(op_graph, add_desc, op_t_l1_dst, op_t_l2_lhs, op_t_dst); - - const ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } }; - ClWorkload workload; - const auto success = build(workload, op_graph, workload_ctx); - - ARM_COMPUTE_EXPECT(!bool(success), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(!bool(ClCompositeOperator::validate(workload)), framework::LogLevel::ERRORS); -} -TEST_CASE(Root_Simple_And_Complex, framework::DatasetMode::ALL) -{ - /* Computation: - * out = add(conv(l0_0_input, l0_0_weight), add(l0_1_lhs, l0_1_rhs)) - */ - const auto data_type = DataType::F32; - const auto data_layout = DataLayout::NHWC; - - const auto t_l0_0_input_shape = TensorShape(128, 21, 21); - const auto t_l0_0_weight_shape = TensorShape(144, 128, 1, 1); - const auto t_l0_1_lhs_shape = TensorShape(144, 21, 21); - const auto t_l0_1_rhs_shape = TensorShape(1, 1, 21); - - auto t_l0_0_input_info = TensorInfo(t_l0_0_input_shape, 1, data_type, data_layout); - auto t_l0_0_weight_info = TensorInfo(t_l0_0_weight_shape, 1, data_type, data_layout); - auto t_l0_1_lhs_info = TensorInfo(t_l0_1_lhs_shape, 1, data_type, data_layout); - auto t_l0_1_rhs_info = 
TensorInfo(t_l0_1_rhs_shape, 1, data_type, data_layout); - auto t_l0_0_dst_info = TensorInfo(); - auto t_l0_1_dst_info = TensorInfo(); - auto t_dst_info = TensorInfo(); - - OperatorGraph op_graph; - const auto conv2d_desc = Conv2dDescriptor{}; - const auto add_desc = ElementwiseDescriptor{}; - - const auto op_t_l0_0_input = add_tensor(op_graph, t_l0_0_input_info); - const auto op_t_l0_0_weight = add_tensor(op_graph, t_l0_0_weight_info); - const auto op_t_l0_1_lhs = add_tensor(op_graph, t_l0_1_lhs_info); - const auto op_t_l0_1_rhs = add_tensor(op_graph, t_l0_1_rhs_info); - const auto op_t_l0_0_dst = add_tensor(op_graph, t_l0_0_dst_info); // temp accumulator; TensorInfo to be inferred - const auto op_t_l0_1_dst = add_tensor(op_graph, t_l0_1_dst_info); // temp accumulator; TensorInfo to be inferred - const auto op_t_dst = add_tensor(op_graph, t_dst_info); - - add_op_conv2d(op_graph, conv2d_desc, op_t_l0_0_input, op_t_l0_0_weight, op_t_l0_0_dst); - add_op_elementwise_op(op_graph, add_desc, op_t_l0_1_lhs, op_t_l0_1_rhs, op_t_l0_1_dst); - add_op_elementwise_op(op_graph, add_desc, op_t_l0_0_dst, op_t_l0_1_dst, op_t_dst); - - const ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } }; - ClWorkload workload; - const auto success = build(workload, op_graph, workload_ctx); - - ARM_COMPUTE_EXPECT(!bool(success), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(!bool(ClCompositeOperator::validate(workload)), framework::LogLevel::ERRORS); -} -TEST_CASE(Loop, framework::DatasetMode::ALL) -{ - /* Computation: - * tensor state0; - * state1 = add(l0_lhs, state0) - * state0 = add(l1_lhs, state1) - */ - const auto data_type = DataType::F32; - const auto data_layout = DataLayout::NHWC; - - const auto t_shape = TensorShape(13, 21); - - auto t_l0_lhs_info = TensorInfo(t_shape, 1, data_type, data_layout); - auto t_l1_lhs_info = TensorInfo(t_shape, 1, data_type, data_layout); - auto state0_info = TensorInfo(t_shape, 1, data_type, data_layout); - auto state1_info = 
TensorInfo(); - - OperatorGraph op_graph; - const auto conv2d_desc = Conv2dDescriptor{}; - const auto add_desc = ElementwiseDescriptor{}; - - const auto op_t_l0_lhs = add_tensor(op_graph, t_l0_lhs_info); - const auto op_t_l1_lhs = add_tensor(op_graph, t_l1_lhs_info); - const auto op_t_state0 = add_tensor(op_graph, state0_info); - const auto op_t_state1 = add_tensor(op_graph, state1_info); - - add_op_conv2d(op_graph, conv2d_desc, op_t_l0_lhs, op_t_state0, op_t_state1); - add_op_elementwise_op(op_graph, add_desc, op_t_l1_lhs, op_t_state1, op_t_state0); - - const ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } }; - ClWorkload workload; - const auto success = build(workload, op_graph, workload_ctx); - - ARM_COMPUTE_EXPECT(!bool(success), framework::LogLevel::ERRORS); - ARM_COMPUTE_EXPECT(!bool(ClCompositeOperator::validate(workload)), framework::LogLevel::ERRORS); -} -TEST_SUITE_END() // Invalid - -TEST_SUITE_END() // DYNAMIC_FUSION -TEST_SUITE_END() // INTEGRATION -TEST_SUITE_END() // CL -} // namespace validation -} // namespace test -} // namespace arm_compute -#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file diff --git a/tests/validation/CL/UNIT/dynamic_fusion/Utils.h b/tests/validation/CL/UNIT/dynamic_fusion/Utils.h deleted file mode 100644 index 4512305c1e..0000000000 --- a/tests/validation/CL/UNIT/dynamic_fusion/Utils.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright (c) 2022 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef TESTS_VALIDATION_CL_DYNAMICFUSION_UTILS -#define TESTS_VALIDATION_CL_DYNAMICFUSION_UTILS - -#include "tests/AssetsLibrary.h" -#include "utils/Utils.h" - -#include -#include -#include - -namespace arm_compute -{ -namespace test -{ -namespace validation -{ -namespace utils -{ -/** A pair of macros which measures the wall clock time, and records it into a map measurement_map with name clock_name - * - */ -#define TICK(clock_name) \ - auto clock_name##_tick = std::chrono::high_resolution_clock::now(); -#define TOCK(clock_name, measurement_map) \ - auto clock_name##_tock = std::chrono::high_resolution_clock::now(); \ - measurement_map["\"" #clock_name "\""] = duration_cast(clock_name##_tock - clock_name##_tick); -#define TOCK_AVG(clock_name, measurement_map, num_iterations) \ - auto clock_name##_tock = std::chrono::high_resolution_clock::now(); \ - measurement_map["\"" #clock_name "\""] = duration_cast((clock_name##_tock - clock_name##_tick) / (num_iterations)); - -template -void fill(U &&tensor, int seed, AssetsLibrary *library) -{ - static_assert(std::is_floating_point::value || std::is_same::value, "Only floating point data types supported."); - using DistributionType = typename std::conditional::value, arm_compute::utils::uniform_real_distribution_16bit, std::uniform_real_distribution>::type; - - DistributionType distribution{ T(-1.0f), T(1.0f) }; - library->fill(tensor, distribution, seed); - - // Fill border with infinity in order to check the presence of NaN values (i.e. 
inf * 0) - DistributionType distribution_inf{ T(std::numeric_limits<T>::infinity()), T(std::numeric_limits<T>::infinity()) }; - library->fill_borders_with_garbage(tensor, distribution_inf, seed); -} -} // namespace utils -} // namespace validation -} // namespace test -} // namespace arm_compute -#endif //TESTS_VALIDATION_CL_DYNAMICFUSION_UTILS \ No newline at end of file diff --git a/tests/validation/dynamic_fusion/Utils.h b/tests/validation/dynamic_fusion/Utils.h new file mode 100644 index 0000000000..72e9ec5955 --- /dev/null +++ b/tests/validation/dynamic_fusion/Utils.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ */ + +#ifndef TESTS_VALIDATION_DYNAMIC_FUSION_UTILS +#define TESTS_VALIDATION_DYNAMIC_FUSION_UTILS + +#include "tests/AssetsLibrary.h" +#include "utils/Utils.h" + +#include <chrono> +#include <limits> +#include <type_traits> + +namespace arm_compute +{ +namespace test +{ +namespace validation +{ +namespace utils +{ +/** A pair of macros which measures the wall clock time, and records it into a map measurement_map with name clock_name + * + */ +#define TICK(clock_name) \ + auto clock_name##_tick = std::chrono::high_resolution_clock::now(); +#define TOCK(clock_name, measurement_map) \ + auto clock_name##_tock = std::chrono::high_resolution_clock::now(); \ + measurement_map["\"" #clock_name "\""] = duration_cast<microseconds>(clock_name##_tock - clock_name##_tick); +#define TOCK_AVG(clock_name, measurement_map, num_iterations) \ + auto clock_name##_tock = std::chrono::high_resolution_clock::now(); \ + measurement_map["\"" #clock_name "\""] = duration_cast<microseconds>((clock_name##_tock - clock_name##_tick) / (num_iterations)); + +template <typename T, typename U> +void fill(U &&tensor, int seed, AssetsLibrary *library) +{ + static_assert(std::is_floating_point<T>::value || std::is_same<T, half>::value, "Only floating point data types supported."); + using DistributionType = typename std::conditional<std::is_same<T, half>::value, arm_compute::utils::uniform_real_distribution_16bit<T>, std::uniform_real_distribution<T>>::type; + + DistributionType distribution{ T(-1.0f), T(1.0f) }; + library->fill(tensor, distribution, seed); + + // Fill border with infinity in order to check the presence of NaN values (i.e.
inf * 0) + DistributionType distribution_inf{ T(std::numeric_limits<T>::infinity()), T(std::numeric_limits<T>::infinity()) }; + library->fill_borders_with_garbage(tensor, distribution_inf, seed); +} +} // namespace utils +} // namespace validation +} // namespace test +} // namespace arm_compute + +#endif /* TESTS_VALIDATION_DYNAMIC_FUSION_UTILS */ diff --git a/tests/validation/dynamic_fusion/gpu/Integration.cpp b/tests/validation/dynamic_fusion/gpu/Integration.cpp index 6ee2e20d35..036f28b29f 100644 --- a/tests/validation/dynamic_fusion/gpu/Integration.cpp +++ b/tests/validation/dynamic_fusion/gpu/Integration.cpp @@ -36,8 +36,8 @@ #include "tests/CL/CLAccessor.h" #include "tests/framework/Asserts.h" #include "tests/framework/Macros.h" -#include "tests/validation/CL/UNIT/dynamic_fusion/Utils.h" #include "tests/validation/Validation.h" +#include "tests/validation/dynamic_fusion/Utils.h" #include "tests/validation/reference/ConvolutionLayer.h" #include "tests/validation/reference/ElementwiseOperations.h" #include "tests/validation/reference/Permute.h" -- cgit v1.2.1