aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author SiCong Li <sicong.li@arm.com> 2022-11-09 15:57:48 +0000
committer SiCong Li <sicong.li@arm.com> 2022-11-22 14:09:34 +0000
commit 31df05a1870662a7288fbaeb6fbc7fc458bb5a73 (patch)
tree e75a132b8b5fd21cbceec8d0aa88da893e9c4f43
parent 73bb6b7ad80801e56633ad4ea12b0404b586a979 (diff)
download ComputeLibrary-31df05a1870662a7288fbaeb6fbc7fc458bb5a73.tar.gz
Remove dynamic fusion prototype with tests and examples
Public headers of the new experimental dynamic fusion can be found in arm_compute/dynamic_fusion/
New examples on how to use the interface can be found in tests/validation/dynamic_fusion/gpu/Integration.cpp

Resolves COMPMID-5683

Change-Id: I7ccb902a227fb487562df15fc3c30118d1d95bbd
Signed-off-by: SiCong Li <sicong.li@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8671
Reviewed-by: Jakub Sujak <jakub.sujak@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r-- Android.bp 13
-rw-r--r-- SConscript 1
-rw-r--r-- arm_compute/core/experimental/ClWorkload.h 219
-rw-r--r-- arm_compute/core/experimental/DependencyGraph.h 277
-rw-r--r-- arm_compute/core/experimental/IWorkload.h 132
-rw-r--r-- arm_compute/core/experimental/OperatorGraph.h 217
-rw-r--r-- arm_compute/runtime/CL/CLScheduler.h 31
-rw-r--r-- arm_compute/runtime/CL/CLTuner.h 4
-rw-r--r-- arm_compute/runtime/CL/ICLTuner.h 18
-rw-r--r-- arm_compute/runtime/experimental/ClCompositeOperator.h 190
-rw-r--r-- docs/Doxyfile 3
-rw-r--r-- examples/dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp 392
-rw-r--r-- examples/dynamic_fusion/cl_ref_conv2d_elementwise_add.cpp 238
-rwxr-xr-x scripts/clang_tidy_rules.py 1
-rw-r--r-- src/core/CL/ICLKernel.h 21
-rw-r--r-- src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp 164
-rw-r--r-- src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h 122
-rw-r--r-- src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h 930
-rw-r--r-- src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Utils.h 76
-rw-r--r-- src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp 409
-rw-r--r-- src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h 81
-rw-r--r-- src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseKernelComponent.cpp 266
-rw-r--r-- src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseKernelComponent.h 90
-rw-r--r-- src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClFloorKernelComponent.cpp 153
-rw-r--r-- src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClFloorKernelComponent.h 85
-rw-r--r-- src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClKernelComponents.h 35
-rw-r--r-- src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp 171
-rw-r--r-- src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h 97
-rw-r--r-- src/core/experimental/dynamic_fusion/OperatorGraph.cpp 264
-rw-r--r-- src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.cpp 232
-rw-r--r-- src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.h 452
-rw-r--r-- src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelDescriptors.h 121
-rw-r--r-- src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.cpp 271
-rw-r--r-- src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h 259
-rw-r--r-- src/core/experimental/dynamic_fusion/WorkloadImpl/ClWorkload.cpp 72
-rw-r--r-- src/core/experimental/dynamic_fusion/WorkloadImpl/DependencyGraph.cpp 430
-rw-r--r-- src/core/experimental/dynamic_fusion/WorkloadImpl/ITensorDescPack.h 241
-rw-r--r-- src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.cpp 423
-rw-r--r-- src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.h 252
-rw-r--r-- src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.cpp 200
-rw-r--r-- src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.h 76
-rw-r--r-- src/gpu/cl/operators/experimental/dynamic_fusion/ClCompositeOperator.cpp 241
-rw-r--r-- src/runtime/CL/CLScheduler.cpp 41
-rw-r--r-- src/runtime/CL/CLTuner.cpp 32
-rw-r--r-- tests/SConscript 1
-rw-r--r-- tests/validation/CL/UNIT/dynamic_fusion/ArbitraryElementwiseFusion.cpp 394
-rw-r--r-- tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp 215
-rw-r--r-- tests/validation/CL/UNIT/dynamic_fusion/DependencyGraph.cpp 266
-rw-r--r-- tests/validation/CL/UNIT/dynamic_fusion/Floor.cpp 135
-rw-r--r-- tests/validation/CL/UNIT/dynamic_fusion/Integration_OperatorFuseMovenetSubGraph1.cpp 402
-rw-r--r-- tests/validation/dynamic_fusion/Utils.h (renamed from tests/validation/CL/UNIT/dynamic_fusion/Utils.h) 8
-rw-r--r-- tests/validation/dynamic_fusion/gpu/Integration.cpp 2
52 files changed, 7 insertions, 9459 deletions
diff --git a/Android.bp b/Android.bp
index 89a7a43060..69a0affdb2 100644
--- a/Android.bp
+++ b/Android.bp
@@ -373,17 +373,6 @@ cc_library_static {
"src/core/Utils.cpp",
"src/core/Validate.cpp",
"src/core/Version.cpp",
- "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp",
- "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp",
- "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseKernelComponent.cpp",
- "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClFloorKernelComponent.cpp",
- "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp",
- "src/core/experimental/dynamic_fusion/OperatorGraph.cpp",
- "src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.cpp",
- "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.cpp",
- "src/core/experimental/dynamic_fusion/WorkloadImpl/ClWorkload.cpp",
- "src/core/experimental/dynamic_fusion/WorkloadImpl/DependencyGraph.cpp",
- "src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.cpp",
"src/core/helpers/SoftmaxHelpers.cpp",
"src/core/helpers/WindowHelpers.cpp",
"src/core/utils/AssemblyUtils.cpp",
@@ -667,7 +656,6 @@ cc_library_static {
"src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp",
"src/gpu/cl/kernels/direct_conv/ClDirectConvDefaultConfigBifrost.cpp",
"src/gpu/cl/kernels/direct_conv/ClDirectConvDefaultConfigValhall.cpp",
- "src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.cpp",
"src/gpu/cl/kernels/gemm/ClGemmHelpers.cpp",
"src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp",
"src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp",
@@ -711,7 +699,6 @@ cc_library_static {
"src/gpu/cl/operators/ClTranspose.cpp",
"src/gpu/cl/operators/ClTransposedConvolution.cpp",
"src/gpu/cl/operators/ClWinogradConv2d.cpp",
- "src/gpu/cl/operators/experimental/dynamic_fusion/ClCompositeOperator.cpp",
"src/runtime/Allocator.cpp",
"src/runtime/BlobLifetimeManager.cpp",
"src/runtime/BlobMemoryPool.cpp",
diff --git a/SConscript b/SConscript
index 42a03f0a04..908fbff626 100644
--- a/SConscript
+++ b/SConscript
@@ -507,7 +507,6 @@ lib_files = filelist['common']
# Dynamic fusion
if env['experimental_dynamic_fusion']:
lib_files += filelist['experimental']['dynamic_fusion']
- arm_compute_env.Append(CPPDEFINES = ['ENABLE_EXPERIMENTAL_DYNAMIC_FUSION'])
# Fixed format GEMM kernels.
if env['experimental_fixed_format_kernels']:
diff --git a/arm_compute/core/experimental/ClWorkload.h b/arm_compute/core/experimental/ClWorkload.h
deleted file mode 100644
index 9b2040a046..0000000000
--- a/arm_compute/core/experimental/ClWorkload.h
+++ /dev/null
@@ -1,219 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
-#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLWORKLOAD_H
-#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLWORKLOAD_H
-
-#include "arm_compute/core/CL/CLCompileContext.h"
-#include "arm_compute/core/GPUTarget.h"
-#include "arm_compute/core/Window.h"
-
-#include "arm_compute/core/experimental/IWorkload.h"
-#include "arm_compute/core/experimental/OperatorGraph.h"
-
-#include <map>
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-/** Verbose and explicit way to enumerate all the tensor arguments variants used by
- * all kernel implementations. This avoids any ambiguity in what kernel arguments are passed
- */
-enum class ClKernelTensorArgType : int
-{
- Scalar,
-
- Vector,
-
- Image,
- Image_Reinterpret_As_3D,
- Image_Export_To_ClImage2D,
-
- Image_3D, // 3D Tensor represented as a 2D Image + stride_z
- Image_3D_Export_To_ClImage2D,
-
- Tensor_3D,
- Tensor_4D,
- Tensor_4D_t_Buffer,
- Tensor_4D_t_Image
-};
-
-/** Describes all the info required to add a kernel argument at run time
- *
- * @note This struct can later be expanded into a more concise and formal way to specify how to set up
- * arguments for a kernel inside a @ref ClUnitWorkload
- */
-struct ClKernelArgDescriptor
-{
- ClKernelArgDescriptor() = default;
- ClKernelArgDescriptor(int arg_id, ClKernelTensorArgType type, bool slide_along_dimz = true)
- : arg_id{ arg_id }, tensor_arg_type{ type }, slide_along_dimz{ slide_along_dimz }
- {
- }
- ~ClKernelArgDescriptor() = default;
- friend bool operator==(const ClKernelArgDescriptor &arg0, const ClKernelArgDescriptor &arg1)
- {
- return (arg0.tensor_arg_type == arg1.tensor_arg_type) && (arg0.slide_along_dimz == arg1.slide_along_dimz);
- }
- int arg_id{ -1 }; /**< Arg ID in the blueprint, -1 means empty / uninitialized */
- ClKernelTensorArgType tensor_arg_type{ ClKernelTensorArgType::Image }; /**< tensor argument type */
- bool slide_along_dimz{ true }; /**< @note slide_along_dimz will be moved out of this descriptor in later iterations */
-};
-
-using ClKernelArgList = std::map<int, ClKernelArgDescriptor>;
-
-/** Descriptor containing information required to run a single ClWorkload
- */
-struct ClExecutionDescriptor
-{
- cl::NDRange suggested_lws{}; /**< Suggested local work-group size for optimal performance if not zero */
- cl::NDRange gws{}; /**< Global work-group to be used */
- bool skip_sliding_window{ false }; /**< Skip sliding window slices during execution loop */
-};
-
-/** Contains kernel code to be compiled and run in a ClUnitWorkload
- */
-struct ClKernelCode
-{
- friend bool operator==(const ClKernelCode &code0, const ClKernelCode &code1)
- {
- return (code0.name == code1.name) && (code0.code == code1.code) && (code0.config_id == code1.config_id) && (code0.build_options == code1.build_options) && (code0.window == code1.window)
- && (code0.arguments == code1.arguments);
- }
- std::string name{}; /**< Kernel name */
- std::string code{}; /**< Kernel source code */
- std::string config_id{}; /**< Generated from blueprint based on complex component */
- CLBuildOptions build_options{}; /**< Kernel build options */
- Window window{}; /**< Execution window */
- ClKernelArgList arguments{}; /**< Kernel argument descriptors. map key is kernel ArgumentID */
-};
-
-/** A descriptor of ClWorkload Tensors.
- */
-struct ClWorkloadTensor : public WorkloadTensor
-{
- ClWorkloadTensor() = default;
- ClWorkloadTensor(Id id, ITensorInfo *info, MemoryType memory_type, const AuxMemoryInfo &memory_info, const ClKernelArgDescriptor &kernel_arg)
- : WorkloadTensor{ id, info, memory_type, memory_info }, kernel_arg{ kernel_arg }
- {
- }
- ClKernelArgDescriptor kernel_arg{};
- friend bool operator==(const ClWorkloadTensor &t0, const ClWorkloadTensor &t1)
- {
- return t0.info == t1.info && t0.memory_info == t1.memory_info && t0.memory_type == t1.memory_type && t0.kernel_arg == t1.kernel_arg;
- }
-};
-
-/** The basic atomic unit in a @ref ClWorkload. It contains exactly one kernel to run.
- */
-struct ClUnitWorkload : public UnitWorkload
-{
- ClUnitWorkload() = default;
- ClUnitWorkload(Id id, UnitWorkloadStage stage, const ClKernelCode &code)
- : UnitWorkload{ id, stage }, code{ code }
- {
- }
- friend bool operator==(const ClUnitWorkload &uworkload0, const ClUnitWorkload &uworkload1)
- {
- return uworkload0.stage == uworkload1.stage && uworkload0.code == uworkload1.code;
- }
- ClKernelCode code{};
-};
-
-/** GPU information for @ref ClWorkloadContext
- */
-struct GpuInfo
-{
- friend bool operator==(const GpuInfo &info0, const GpuInfo &info1)
- {
- return info0.target == info1.target;
- }
- GPUTarget target{ GPUTarget::UNKNOWN };
-};
-
-/** Context (device capabilities, platform details) associated with a ClWorkload
- *
- * It is required for building the @ref ClKernelCode and could also be used by the runtime (e.g. schedulers)
- */
-struct ClWorkloadContext
-{
- friend bool operator==(const ClWorkloadContext &ctx0, const ClWorkloadContext &ctx1)
- {
- return ctx0.gpu_info == ctx1.gpu_info;
- }
- GpuInfo gpu_info{};
-};
-
-/** Workload for Cl backend
- */
-struct ClWorkload : public IWorkload
-{
- Tid add_workload_tensor(ITensorInfo *info, MemoryType memory_type, const AuxMemoryInfo &memory_info, const ClKernelArgDescriptor &kernel_arg, Tid merge_point)
- {
- Tid id = graph.add_tensor(merge_point);
- if(tensors.find(id) == tensors.end())
- {
- tensors[id] = ClWorkloadTensor(id, info, memory_type, memory_info, kernel_arg);
- }
- return id;
- }
- UnitWorkId add_unit_workload(UnitWorkloadStage stage, const ClKernelCode &code, const std::vector<Tid> &inputs, const std::vector<Tid> &outputs)
- {
- auto op = graph.add_operator(inputs, outputs);
- auto id = op.second;
- unit_workloads[id] = ClUnitWorkload(id, stage, code);
- return id;
- }
- friend bool operator==(const ClWorkload &workload0, const ClWorkload &workload1)
- {
- return std::make_tuple(
- workload0.graph, workload0.context, workload0.unit_workloads, workload0.tensors, workload0.op_tensor_id_lut)
- == std::make_tuple(
- workload1.graph, workload1.context, workload1.unit_workloads, workload1.tensors, workload1.op_tensor_id_lut);
- }
- ClWorkloadContext context{}; /**< Workload context*/
- std::map<UnitWorkId, ClUnitWorkload> unit_workloads{}; /**< Unit workloads to run*/
- std::map<Tid, ClWorkloadTensor> tensors{}; /**< Workload tensors*/
- std::map<Tid, OpTensor::Id> op_tensor_id_lut{}; /**< Map from ClWorkloadTensor to SRC and DST Operator Tensors (no need to store "intermediate" Operator Tensors)*/
- Status status{}; /**< For compatibility with the IOperator validate method. Store if the workload is valid or not. */
-};
-
-/** Build a @ref ClWorkload from an @ref OperatorGraph.
- *
- * @param[out] workload
- * @param[in] op_graph
- * @param[in] ctx
- * @return Status
- */
-Status build(ClWorkload &workload, const OperatorGraph &op_graph, const ClWorkloadContext &ctx);
-
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-
-#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLWORKLOAD_H
-#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file
diff --git a/arm_compute/core/experimental/DependencyGraph.h b/arm_compute/core/experimental/DependencyGraph.h
deleted file mode 100644
index e0d6ff9ba9..0000000000
--- a/arm_compute/core/experimental/DependencyGraph.h
+++ /dev/null
@@ -1,277 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
-#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_DEPENDENCYGRAPH_H
-#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_DEPENDENCYGRAPH_H
-
-#include "arm_compute/core/Error.h"
-
-#include <algorithm>
-#include <map>
-#include <vector>
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-template <typename T>
-bool is_in(const T &v, const std::vector<T> &vec)
-{
- return std::find(std::begin(vec), std::end(vec), v) != std::end(vec);
-}
-
-/** The dependency graph of a workload, where the nodes are of 2 types: Tensor or Operator
- * Represented as a doubly-linked adjacency list with the differentiation between source and destination
- *
- * A "Merge Tensor" is an external tensor associated with the tensor within the graph, and serve as a merge point
- */
-class DependencyGraph
-{
-public:
- /** A serial Id allocator
- *
- */
- class SerialIdAllocator
- {
- public:
- using Id = int;
- Id alloc()
- {
- return _counter++;
- }
- constexpr static Id empty()
- {
- return -1;
- }
-
- private:
- Id _counter{ 0 };
- };
- using Id = SerialIdAllocator::Id;
- /** Adjacency list
- *
- */
- using AdjList = std::map<Id, std::vector<Id>>;
-
- /** A pack of operator including its input and output tensors, used by traversing through the graph in topological order
- *
- */
- struct OpPack
- {
- Id op{};
- std::vector<Id> inputs{};
- std::vector<Id> outputs{};
- friend bool operator==(const OpPack &opp0, const OpPack &opp1)
- {
- return std::make_tuple(
- opp0.op, opp0.inputs, opp0.outputs)
- == std::make_tuple(
- opp1.op, opp1.inputs, opp1.outputs);
- }
- };
-
-public:
- constexpr static Id empty_id()
- {
- return SerialIdAllocator::empty();
- }
-
- DependencyGraph() = default;
- // Used in cases where two DependencyGraphs may want to share the same configuration of tensors
- explicit DependencyGraph(const std::vector<Id> &imported_tensors);
- // Testing only
- DependencyGraph(const AdjList &adj_src_tensors, const AdjList &adj_dst_tensors, const AdjList &adj_src_ops, const AdjList &adj_dst_ops, std::map<Id, Id> merge_points = {});
-
- /** Add a new tensor
- *
- * @param merge_tensor The external merge point associated with the tensor. Leave empty if not needed.
- * @return Id The newly allocated tensor, or a previously added tensor associated with @p merge_tensor
- */
- Id add_tensor(Id merge_tensor = empty_id());
-
- void remove_tensor(Id tensor);
-
- /** Add a new operator
- *
- * @param inputs Input tensors to the operator
- * @param outputs Output tensors to the operator
- * @return std::pair<Status, DependencyGraph::Id> where id is the newly allocated operator
- */
- std::pair<Status, DependencyGraph::Id> add_operator(const std::vector<Id> &inputs, const std::vector<Id> &outputs);
-
- void remove_operator(Id op);
- /** Sort the graph in a topological order
- *
- * @return std::pair<Status, std::vector<OpPack>>
- */
- std::pair<Status, std::vector<OpPack>> topological_sort() const;
-
- std::vector<Id> src_ops(Id op) const;
- std::vector<Id> dst_ops(Id op) const;
-
- std::vector<Id> src_ops_from_tensor(Id tensor) const;
- std::vector<Id> dst_ops_from_tensor(Id tensor) const;
- /** Get the merge points object
- *
- * @return std::map<Id, Id>
- */
- std::map<Id, Id> get_merge_points() const;
- /** Get all root ops. Root ops can also be referred to as "src ops" of the whole graph
- *
- * @return std::vector<Id>
- */
- std::vector<Id> get_root_ops() const;
- /** Get all dst ops of the whole graph
- *
- * @return std::vector<Id>
- */
- std::vector<Id> get_dst_ops() const;
-
- /** Get source tensors to an operator
- *
- * @param op
- * @return std::vector<Id>
- */
- std::vector<Id> src_tensors(Id op) const;
- /** Get destination tensors to an operator
- *
- * @param op
- * @return std::vector<Id>
- */
- std::vector<Id> dst_tensors(Id op) const;
- /** Get source tensors of the whole graph
- *
- * @return std::vector<Id>
- */
- std::vector<Id> src_tensors() const;
- /** Get destination tensors of the whole graph
- *
- * @return std::vector<Id>
- */
- std::vector<Id> dst_tensors() const;
- /** Get all operators
- *
- * @return std::vector<Id>
- */
- std::vector<Id> all_ops() const;
- /** Get all tensors
- *
- * @return std::vector<Id>
- */
- std::vector<Id> all_tensors() const;
- /** Number of operators
- *
- * @return unsigned int
- */
- unsigned int number_of_ops() const;
- /** Number of tensors
- *
- * @return unsigned int
- */
- unsigned int number_of_tensors() const;
-
- /** Update @p merge_point to point to @p t_id
- *
- * @param t_id
- * @param merge_point
- */
- Status update_merge_point(Id t_id, Id merge_point);
-
- /** Strict equality comparison (all internal ids and order of insertion matter).
- * In the future this may be replaced with a topological comparison, allowing equivalent graphs with different internal ids to be equal
- *
- *
- * @param g0
- * @param g1
- * @return true
- * @return false
- */
- friend bool operator==(const DependencyGraph &g0, const DependencyGraph &g1)
- {
- // Do not compare id allocators
- return std::make_tuple(
- g0._adj_src_tensors, g0._adj_dst_tensors, g0._adj_src_ops, g0._adj_dst_ops, g0._merge_to_internal)
- == std::make_tuple(
- g1._adj_src_tensors, g1._adj_dst_tensors, g1._adj_src_ops, g1._adj_dst_ops, g1._merge_to_internal);
- }
- void link_input(Id op, Id in_tensor);
- void link_output(Id op, Id out_tensor);
- /** Check if there's a path from @p src_tensor to @p dst_op
- *
- * @param src_tensor
- * @param dst_op
- * @return true
- * @return false
- */
- bool path_exists_from_tensor_to_op(Id src_tensor, Id dst_op) const;
- /** Check if there's a path from @p src_op to @p dst_op
- *
- * @param src_op
- * @param dst_op
- * @return true
- * @return false
- */
- bool path_exists_from_op_to_op(Id src_op, Id dst_op) const;
- /** Check if tensor is the src tensor of the entire graph
- *
- * @param tensor
- * @return true
- * @return false
- */
- bool is_src_tensor(Id tensor) const;
- /** Check if tensor is the dst tensor of the entire graph
- *
- * @param tensor
- * @return true
- * @return false
- */
- bool is_dst_tensor(Id tensor) const;
-
-private:
- Id insert_new_tensor();
- Id insert_new_op();
- bool tensor_exists(Id tensor) const;
- bool operator_exists(Id op) const;
- bool is_src_tensor_of(Id op, Id tensor) const;
- bool is_dst_tensor_of(Id op, Id tensor) const;
- bool are_connected(Id op, Id tensor) const;
-
-private:
- AdjList _adj_src_tensors{};
- AdjList _adj_dst_tensors{};
- AdjList _adj_src_ops{};
- AdjList _adj_dst_ops{};
- std::map<Id, Id> _merge_to_internal{}; // From merge tensor to internal tensor
- SerialIdAllocator _operator_id{};
- SerialIdAllocator _tensor_id{};
-};
-
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-
-#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_DEPENDENCYGRAPH_H
-#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file
diff --git a/arm_compute/core/experimental/IWorkload.h b/arm_compute/core/experimental/IWorkload.h
deleted file mode 100644
index 54855c1084..0000000000
--- a/arm_compute/core/experimental/IWorkload.h
+++ /dev/null
@@ -1,132 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
-#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IWORKLOAD_H
-#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IWORKLOAD_H
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/ITensorInfo.h"
-#include "arm_compute/core/experimental/Types.h"
-
-#include "arm_compute/core/experimental/DependencyGraph.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-/** Describes when a Unit Workload is run.
- *
- */
-struct UnitWorkloadStage
-{
- enum class Stage
- {
- Prepare, /**< Only run once at the beginning. */
- Run, /**< Run every time after the first time. */
- };
- Stage stage;
- friend bool operator==(const UnitWorkloadStage &stage0, const UnitWorkloadStage &stage1)
- {
- return stage0.stage == stage1.stage;
- }
-};
-/** Type of memory used by a Workload Tensor
- *
- */
-enum class MemoryType
-{
- Core = 0, /**< Core memory used by the Workload Tensor, e.g. for argument tensors */
- Auxiliary = 1, /**< Auxiliary memory required by the Workload Tensor, e.g. for temporary tensors */
-};
-
-using AuxMemoryLifetime = MemoryLifetime;
-
-/** Memory Info for a @ref WorkloadTensor of Auxiliary memory type. This communicates to the user how much additional
- * memory is required for auxiliary tensors
- */
-struct AuxMemoryInfo
-{
- AuxMemoryInfo() = default;
-
- AuxMemoryInfo(size_t size, size_t alignment = 0) noexcept
- : size(size),
- alignment(alignment)
- {
- }
-
- AuxMemoryInfo(AuxMemoryLifetime lifetime, size_t size, size_t alignment = 0) noexcept
- : lifetime(lifetime),
- size(size),
- alignment(alignment)
- {
- }
- friend bool operator==(const AuxMemoryInfo &info0, const AuxMemoryInfo &info1)
- {
- return info0.lifetime == info1.lifetime && info0.size == info1.size && info0.alignment == info1.alignment;
- }
-
- AuxMemoryLifetime lifetime{ AuxMemoryLifetime::Temporary }; /**< Memory lifetime*/
- size_t size{ 0 }; /**< Total memory size in bytes */
- size_t alignment{ 64 }; /**< Memory alignment in bytes */
-};
-
-/** A descriptor for IWorkload Tensors.
- */
-struct WorkloadTensor
-{
- using Id = DependencyGraph::Id;
- Id id{}; /**< Id of the workload tensor */
- ITensorInfo *info{}; /**< TensorInfo associated with the workload tensor */
- MemoryType memory_type{}; /**< Memory type */
- AuxMemoryInfo memory_info{}; /**< Auxiliary memory information. This can be ignored if the memory type is Core */
-};
-/** The basic atomic unit in an @ref IWorkload. It contains exactly one kernel to run.
- *
- */
-struct UnitWorkload
-{
- using Id = DependencyGraph::Id;
- Id id{}; /**< Id of the unit workload */
- UnitWorkloadStage stage{}; /**< Stage */
-};
-
-/** Run-time-agnostic, platform-specific graph that describes everything required to run a workload
- * It can be configured into an Arm Compute Library runtime, integrated into the runtime of another framework, or integrated into the compilation flow
- */
-struct IWorkload
-{
- using UnitWorkId = UnitWorkload::Id;
- using Tid = WorkloadTensor::Id;
- IWorkload() = default;
- virtual ~IWorkload() = default;
- DependencyGraph graph{}; /**< Dependency graph of the workload tensors and the unit workloads */
-};
-
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IWORKLOAD_H
-#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file
diff --git a/arm_compute/core/experimental/OperatorGraph.h b/arm_compute/core/experimental/OperatorGraph.h
deleted file mode 100644
index f40ad0d8c5..0000000000
--- a/arm_compute/core/experimental/OperatorGraph.h
+++ /dev/null
@@ -1,217 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
-
-#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_OPERATORGRAPH
-#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_OPERATORGRAPH
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/ITensorInfo.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-/** Graph of operators to execute within a Workload. This is a pure descriptive construct.
- */
-class OperatorGraph final
-{
-public:
- struct Implementation;
- OperatorGraph();
- ~OperatorGraph();
-
-public:
- Implementation *impl();
- const Implementation *impl() const;
-
-private:
- std::unique_ptr<Implementation> _impl;
-};
-
-/** Return the validity of @p op_graph, usually after performing an operation (e.g. add_tensor) on it
- *
- * @param[in] op_graph OperatorGraph to be validated
- *
- * @return Status
- */
-Status validate(const OperatorGraph &op_graph);
-
-/** Operator Tensor Handle
- * This can be either an argument tensor, or an intermediate tensor linking 2 @ref Operator s
- */
-class OpTensor final
-{
-public:
- using Id = int;
- OpTensor(Id id = {});
- /** Id of the OpTensor
- * @return Id
- */
- Id id() const;
-
-private:
- Id _id{};
-};
-
-/** Provide order of @ref OpTensor by checking if @p t0 is "lower than" @p t1
- *
- * @param[in] t0 OpTensor
- * @param[in] t1 OpTensor
- *
- * @return true if @p t0 is lower than @p t1
- * @return false otherwise
- */
-bool operator<(const OpTensor &t0, const OpTensor &t1);
-
-/** Associate a TensorInfo with a newly created @ref OpTensor in the @p graph.
- *
- * @note @p info needs to remain in scope and valid until the workload has finished building
- * @note Can pass in an empty TensorInfo for a destination Tensor, in which case @p info will be inferred from the source tensors
- *
- * @param[in,out] graph OperatorGraph where the tensor is added
- * @param[in] info TensorInfo to be associated
- *
- * @return OpTensor
- */
-OpTensor add_tensor(OperatorGraph &graph, ITensorInfo &info);
-
-/** Operator Handle
- * This can be used to further modify an existing operator
- */
-class Operator final
-{
-public:
- using Id = int;
- Operator(Id id = {});
- /** Id of the Operator
- * @return Id
- */
- Id id() const;
-
-private:
- Id _id{};
-};
-
-/** Provide order of @ref Operator by checking if @p op0 is "lower than" @p op1
- *
- * @param[in] op0 Operator
- * @param[in] op1 Operator
- *
- * @return true if @p op0 is lower than @p op1
- * @return false otherwise
- */
-bool operator<(const Operator &op0, const Operator &op1);
-
-/** Descriptor for Conv2d operation
- */
-struct Conv2dDescriptor
-{
- /* TOSA compliant attribute parameters start */
- Padding2D pad{};
- Size2D stride{ 1U, 1U };
- Size2D dilation{ 1U, 1U };
- /* TOSA compliant attribute parameters end */
- /* Non-TOSA compliant attribute parameters start */
- /* Non-TOSA compliant attribute parameters end */
-};
-/** Add op Conv2d to @p graph
- *
- * @param[in,out] graph OperatorGraph where the operator is added to
- * @param[in] desc Operator descriptor
- * @param[in] input Input OpTensor
- * @param[in] weights Weights OpTensor
- * @param[in] bias (Optional) bias OpTensor
- * @param[in] dst Destination OpTensor
- *
- * @return Operator
- */
-Operator add_op_conv2d(OperatorGraph &graph, const Conv2dDescriptor &desc, OpTensor input, OpTensor weights, OpTensor bias, OpTensor dst);
-Operator add_op_conv2d(OperatorGraph &graph, const Conv2dDescriptor &desc, OpTensor input, OpTensor weights, OpTensor dst);
-/** (Only for Debugging and Testing) Force a conv2d method
- *
- * @param[in,out] graph OperatorGraph where conv2d op is located
- * @param[in] conv2d Conv2d Op
- * @param[in] method Forced ConvolutionMethod
- */
-void force_conv2d_method(OperatorGraph &graph, Operator conv2d, ConvolutionMethod method);
-
-/** Descriptor for Elementwise binary operation
- *
- */
-struct ElementwiseDescriptor
-{
- /* TOSA compliant attribute parameters start */
- /* TOSA compliant attribute parameters end */
- /* Non-TOSA compliant attribute parameters start */
- ArithmeticOperation op;
- /* Non-TOSA compliant attribute parameters end */
-};
-/** Add op Elementwise to @p graph, and optionally describes fusion through passing of intermediate @ref OpTensor s
- *
- * @param[in,out] graph OperatorGraph where the operator is added to
- * @param[in] desc Operator descriptor
- * @param[in] lhs Lhs OpTensor
- * @param[in] rhs Rhs OpTensor
- * @param[in] dst Destination OpTensor
- *
- * @return Operator
- */
-Operator add_op_elementwise_op(OperatorGraph &graph, const ElementwiseDescriptor &desc, OpTensor lhs, OpTensor rhs, OpTensor dst);
-
-/** Descriptor for Floor operation
- *
- */
-struct FloorDescriptor
-{
- /* TOSA compliant attribute parameters start */
- /* TOSA compliant attribute parameters end */
- /* Non-TOSA compliant attribute parameters start */
- /* Non-TOSA compliant attribute parameters end */
-};
-/** Add op Floor to @p graph, and optionally describes fusion through passing of intermediate @ref OpTensor s
- *
- * @param[in,out] graph OperatorGraph where the operator is added to
- * @param[in] desc Operator descriptor
- * @param[in] src Source OpTensor
- * @param[in] dst Destination OpTensor
- *
- * @return Operator
- */
-Operator add_op_floor(OperatorGraph &graph, const FloorDescriptor &desc, OpTensor src, OpTensor dst);
-
-bool operator==(const OpTensor &t0, const OpTensor &t1);
-bool operator==(const Conv2dDescriptor &conv2d0, const Conv2dDescriptor &conv2d1);
-bool operator==(const ElementwiseDescriptor &, const ElementwiseDescriptor &);
-bool operator==(const FloorDescriptor &, const FloorDescriptor &);
-
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_OPERATORGRAPH
-#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
diff --git a/arm_compute/runtime/CL/CLScheduler.h b/arm_compute/runtime/CL/CLScheduler.h
index 3919635d1b..3030239270 100644
--- a/arm_compute/runtime/CL/CLScheduler.h
+++ b/arm_compute/runtime/CL/CLScheduler.h
@@ -35,19 +35,6 @@
#include "arm_compute/runtime/CL/CLTypes.h"
#include "arm_compute/runtime/CL/ICLTuner.h"
-#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-struct ClExecutionDescriptor;
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
-
namespace arm_compute
{
class ICLKernel;
@@ -108,20 +95,6 @@ public:
* @param[in] flush (Optional) Specifies if the command queue will be flushed after running the kernel. This will be ignored if job chaining is enabled.
*/
void enqueue_op(ICLKernel &kernel, ITensorPack &tensors, bool flush = true);
-
-#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
-
- /** Schedule the execution of the passed kernel if possible.
- *
- * @param[in] kernel Kernel to execute.
- * @param[in] tensors Map containing the tensors to operate on.
- * @param[in] exec_desc Execution descriptor
- * @param[in] flush (Optional) Specifies if the command queue will be flushed after running the kernel. This will be ignored if job chaining is enabled.
- */
- void enqueue_op(ICLKernel &kernel, ITensorPack &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc, bool flush = true);
-
-#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
-
/** Initialises the context and command queue to be used by the scheduler.
*
* @param[in] context A CL context.
@@ -214,10 +187,6 @@ private:
*/
void flush_queue(bool flush);
-#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
- void enqueue_common(ICLKernel &kernel, ITensorPack &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc, bool flush);
-#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
-
/** Flag to ensure symbols initialisation is happening before Scheduler creation */
static std::once_flag _initialize_symbols;
diff --git a/arm_compute/runtime/CL/CLTuner.h b/arm_compute/runtime/CL/CLTuner.h
index 88933fc2d8..93aa45adc1 100644
--- a/arm_compute/runtime/CL/CLTuner.h
+++ b/arm_compute/runtime/CL/CLTuner.h
@@ -124,10 +124,6 @@ public:
void tune_kernel_static(ICLKernel &kernel) override;
void tune_kernel_dynamic(ICLKernel &kernel) override;
void tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors) override;
-#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
- void tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc) override;
-#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
-
/** Is the kernel_event set ?
*
* @return true if the kernel_event is set.
diff --git a/arm_compute/runtime/CL/ICLTuner.h b/arm_compute/runtime/CL/ICLTuner.h
index e0ee3ffe71..fa7a1424b8 100644
--- a/arm_compute/runtime/CL/ICLTuner.h
+++ b/arm_compute/runtime/CL/ICLTuner.h
@@ -30,15 +30,6 @@ namespace arm_compute
{
class ICLKernel;
-#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
-namespace experimental
-{
-namespace dynamic_fusion
-{
-struct ClExecutionDescriptor;
-} // namespace dynamic_fusion
-} // namespace experimental
-#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
/** Basic interface for tuning the OpenCL kernels */
class ICLTuner
{
@@ -66,15 +57,6 @@ public:
* @param[in, out] tensors Tensors for the kernel to use
*/
virtual void tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors) = 0;
-#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
- /** Tune OpenCL kernel dynamically for dynamic fusion interface
- *
- * @param[in] kernel Kernel to tune
- * @param[in, out] tensors Tensors for the kernel to use
- * @param[in] exec_desc Execution descriptor
- */
- virtual void tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc) = 0;
-#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_ICLTUNER_H */
diff --git a/arm_compute/runtime/experimental/ClCompositeOperator.h b/arm_compute/runtime/experimental/ClCompositeOperator.h
deleted file mode 100644
index 827629bd82..0000000000
--- a/arm_compute/runtime/experimental/ClCompositeOperator.h
+++ /dev/null
@@ -1,190 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
-#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMIC_FUSION_CLCOMPOSITEOPERATOR_H
-#define ARM_COMPUTE_EXPERIMENTAL_DYNAMIC_FUSION_CLCOMPOSITEOPERATOR_H
-
-#include "arm_compute/core/CL/CLCompileContext.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/IOperator.h"
-
-#include "arm_compute/core/experimental/ClWorkload.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-/** Map OpTensor handles to their corresponding ITensor memory
- */
-using OpTensorBinding = std::map<OpTensor, ITensor *>;
-
-/** Map a kernel (as identified by its unit workload id) to its corresponding tensor pack
- *
- * @note External user should not use the add_tensor_pack method to alter this tensor pack map, and should only use the map returned by @ref bind_tensors
- */
-class TensorPackMap
-{
-public:
- /** Find a tensor pack associated with the unit workload Id @p uwk_id
- *
- * @param[in] uwk_id unit workload Id associated with the tensor pack
- *
- * @return ITensorPack*
- */
- ITensorPack *find_tensor_pack(UnitWorkload::Id uwk_id);
- /** Get a tensor pack associated with @p uwk_id. Throws an exception if it cannot be found.
- *
- * @param[in] uwk_id unit workload Id associated with the tensor pack
- *
- * @return ITensorPack&
- */
- ITensorPack &get_tensor_pack(UnitWorkload::Id uwk_id);
- /** Add a tensor pack and associate it with unit workload Id @p uwk_id
- * @note Should not be used by external user
- *
- * @param[in] uwk_id unit workload Id associated with the tensor pack
- * @param[in] tensor_pack Tensor Pack to be added
- */
- void add_tensor_pack(UnitWorkload::Id uwk_id, const ITensorPack &tensor_pack);
-
-private:
- std::map<UnitWorkload::Id, ITensorPack> _tensor_packs{};
-};
-
-/** Holder of any auxiliary CLTensors required by a ClWorkload.
- *
- * @note The tensors are not allocated by default, and require the user to explicitly allocate them using the TensorInfo and AuxMemoryInfo
- *
- * @note This data holder must remain valid until the ClCompositeOperator that it's passed to is out of scope
- *
- * @note External user should not use the add_aux_tensor method, and should only use the data returned by @ref bind_tensors
- */
-class ClAuxTensorData
-{
-public:
- /** A view of a single auxiliary data and the associated TensorInfo and AuxMemoryInfo
- */
- struct DataView
- {
- DataView() = default;
- DataView(CLTensor *tensor, const TensorInfo &tensor_info, const AuxMemoryInfo &memory_info)
- : tensor{ tensor }, tensor_info{ tensor_info }, memory_info{ memory_info }
- {
- }
- ~DataView() = default;
- DataView(const DataView &other) = default;
- DataView &operator=(const DataView &other) = default;
- DataView(DataView &&other) = default;
- DataView &operator=(DataView &&other) = default;
- CLTensor *tensor{}; /**< Pointer to the auxiliary tensor */
- TensorInfo tensor_info{}; /**< Associated TensorInfo */
- AuxMemoryInfo memory_info{}; /**< Memory requirement */
- };
-
- /** Add auxiliary tensor.
- *
- * @note Should not be used by external user
- *
- * @param[in] tensor_id Any Id that can uniquely identify an auxiliary tensor. Usually ClWorkloadTensor Id
- * @param[in] tensor_info TensorInfo associated with the tensor
- * @param[in] memory_info Memory requirements
- *
- * @return CLTensor* if successfully added, otherwise nullptr
- */
- CLTensor *add_aux_tensor(int tensor_id, const ITensorInfo &tensor_info, const AuxMemoryInfo &memory_info);
-
- /** Get views of all auxiliary tensors. This is mainly used for allocating the auxiliary tensors.
- *
- * @return std::vector<DataView>&
- */
- std::vector<DataView> &get_tensors();
-
-private:
- std::map<int, std::unique_ptr<CLTensor>> _owned_tensors{};
- std::vector<DataView> _tensors{};
-};
-
-/** Bind tensor memory to packs used by prepare and run methods. Create auxiliary tensor objects and their memory requirements if needed
- *
- * @note This is the only method for external user to create ClAuxTensorData, and the prepare and run TensorPackMaps
- *
- * @param[out] aux_tensor_data Auxiliary Tensors required by the workload
- * @param[out] prepare_pack_map TensorPackMap used by the prepare method
- * @param[out] run_pack_map TensorPackMap used by the run method
- * @param[in] workload ClWorkload to bind the tensors to
- * @param[in] op_tensors CLTensor memory objects mapped from Core OpTensors
- *
- * @return Status
- */
-Status bind_tensors(ClAuxTensorData &aux_tensor_data, TensorPackMap &prepare_pack_map, TensorPackMap &run_pack_map, const ClWorkload &workload, const OpTensorBinding &op_tensors);
-
-/** Operator runtime to run a @ref ClWorkload
- *
- * @note User must explicitly call prepare before run otherwise run will fail.
- *
- */
-class ClCompositeOperator
-{
-public:
- ClCompositeOperator();
- ~ClCompositeOperator();
- /** Configures a @ref ClCompositeOperator with a @ref ClWorkload
- * This includes the compilation of Cl kernels inside the @ref ClWorkload
- *
- * @param[in] ctx CLCompileContext
- * @param[in] workload ClWorkload to configure with
- */
- void configure(const CLCompileContext &ctx, const ClWorkload &workload);
- /** Validate ClWorkload @p workload
- *
- * @param[in] workload ClWorkload to be validated
- *
- * @return Status
- */
- static Status validate(const ClWorkload &workload);
- /** Enqueue prepare workloads
- *
- * @param tensor_pack_map Tensors required by the prepare workloads
- */
- void prepare(TensorPackMap &tensor_pack_map);
- /** Enqueue run workloads
- *
- * @param tensor_pack_map Tensors required by the run workloads
- */
- void run(TensorPackMap &tensor_pack_map);
-
-private:
- struct Implementation;
- std::unique_ptr<Implementation> _impl;
-};
-
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMIC_FUSION_CLCOMPOSITEOPERATOR_H
-#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
\ No newline at end of file
diff --git a/docs/Doxyfile b/docs/Doxyfile
index da637abd3e..641ca4f57f 100644
--- a/docs/Doxyfile
+++ b/docs/Doxyfile
@@ -2097,8 +2097,7 @@ PREDEFINED = DOXYGEN_SKIP_THIS \
LOCATE_MIN \
LOCATE_MAX \
HAS_BIAS \
- POOL_AVG \
- ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+ POOL_AVG
# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
diff --git a/examples/dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp b/examples/dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp
deleted file mode 100644
index afbc55777b..0000000000
--- a/examples/dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp
+++ /dev/null
@@ -1,392 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/// @example dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp
-/// @copybrief example_dynamic_fusion_cl_conv2d_elementwise_add
-///
-/// @page example_dynamic_fusion_cl_conv2d_elementwise_add Dynamic Fusion Example: Conv2d + Elementwise Addition (OpenCL target)
-/// This example demonstrates how to fuse a Conv2d with an Addition using the new OperatorGraph API, and to run it with the Async Composite Operator
-
-#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
-#ifndef ARM_COMPUTE_CL /* Needed by Utils.cpp to handle OpenCL exceptions properly */
-#error "This example needs to be built with -DARM_COMPUTE_CL"
-#endif /* ARM_COMPUTE_CL */
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/experimental/ClWorkload.h"
-#include "arm_compute/core/experimental/OperatorGraph.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/CLTuner.h"
-#include "arm_compute/runtime/experimental/ClCompositeOperator.h"
-
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "utils/TypePrinter.h"
-
-#include "utils/Utils.h"
-
-#include <cstdlib>
-
-using namespace arm_compute;
-using namespace utils;
-using namespace arm_compute::experimental::dynamic_fusion;
-
-#define TICK(clock_name) \
- auto clock_name##_tick = std::chrono::high_resolution_clock::now();
-#define TOCK(clock_name, measurement_map) \
- auto clock_name##_tock = std::chrono::high_resolution_clock::now(); \
- measurement_map["\"" #clock_name "\""] = duration_cast<microseconds>(clock_name##_tock - clock_name##_tick);
-#define TOCK_AVG(clock_name, measurement_map, num_iterations) \
- auto clock_name##_tock = std::chrono::high_resolution_clock::now(); \
- measurement_map["\"" #clock_name "\""] = duration_cast<microseconds>((clock_name##_tock - clock_name##_tick) / (num_iterations));
-
-using std::chrono::duration_cast;
-using std::chrono::microseconds;
-
-class ClFusedConv2dEltwiseAddExample : public Example
-{
-public:
- bool do_setup(int argc, char **argv) override
- {
- size_t ih;
- size_t iw;
- size_t ifm;
- size_t wh;
- size_t ww;
- size_t ofm;
- size_t tuner_choice;
- unsigned int pad_x;
- unsigned int pad_y;
- if(argc < 10)
- {
- // Print help
- std::cout << "Usage: ./cl_fused_conv2d_elementwise_add ih iw ifm wh ww ofm tuner_choice(0=Disable, 1=Rapid, 2=Normal, 3=Exhaustive) pad_x pad_y\n";
- std::cout << "Too few or no input_matrices provided. Using shape config = SRGAN_0, tuner_choice=2\n\n";
- ih = 512;
- iw = 512;
- ifm = 64;
- wh = 1;
- ww = 1;
- ofm = 3;
- tuner_choice = 2;
- pad_x = 0;
- pad_y = 0;
- }
- else
- {
- ih = strtol(argv[1], nullptr, 10);
- iw = strtol(argv[2], nullptr, 10);
- ifm = strtol(argv[3], nullptr, 10);
- wh = strtol(argv[4], nullptr, 10);
- ww = strtol(argv[5], nullptr, 10);
- ofm = strtol(argv[6], nullptr, 10);
- tuner_choice = strtol(argv[7], nullptr, 10);
- pad_x = strtol(argv[8], nullptr, 10);
- pad_y = strtol(argv[9], nullptr, 10);
- }
-
- CLTuner *tuner_to_use;
- switch(tuner_choice)
- {
- case 0:
- {
- tuner_to_use = nullptr;
- break;
- }
- case 1:
- {
- tuner.set_tuner_mode(CLTunerMode::RAPID);
- tuner_to_use = &tuner;
- break;
- }
- case 3:
- {
- tuner.set_tuner_mode(CLTunerMode::EXHAUSTIVE);
- tuner_to_use = &tuner;
- break;
- }
- case 2:
- default:
- {
- tuner.set_tuner_mode(CLTunerMode::NORMAL);
- tuner_to_use = &tuner;
- break;
- }
- }
- CLScheduler::get().default_init(tuner_to_use);
-
- TICK(startup_time);
- TICK(configure);
- /* Computation:
- * out = add_desc(addend, conv2d1x1(direct_conv)(input, weights, bias))
- */
- const auto data_type = DataType::F32;
- const auto data_layout = DataLayout::NHWC;
-
- const auto t_input_shape = TensorShape(ifm, iw, ih);
- const auto t_weight_shape = TensorShape(ifm, ww, wh, ofm);
- const auto t_bias_shape = TensorShape(ofm);
- const auto t_l1_addend_shape = TensorShape(ofm, iw);
-
- std::cout << "input_shape: " << t_input_shape << std::endl;
- std::cout << "weight_shape: " << t_weight_shape << std::endl;
- std::cout << "bias_shape: " << t_bias_shape << std::endl;
- std::cout << "addend_shape: " << t_l1_addend_shape << std::endl;
-
- /// @page example_dynamic_fusion_cl_conv2d_elementwise_add
- /// @section describe_workload_using_operator_graph Describe the workload to run using OperatorGraph
- /// OperatorGraph is a graph of Tensors and Operators. Let's first default-construct it
- /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Construct OperatorGraph
- // [Construct OperatorGraph]
- OperatorGraph op_graph;
- // [Construct OperatorGraph]
-
- /// @page example_dynamic_fusion_cl_conv2d_elementwise_add
- /// @subsection add_conv2d Add the first operator (root operator) Conv2d
- /// The first operator to be added to the graph is called the "root operator" of the entire graph.
- /// @note As of now, operators need to be inserted according to their dependency order. This is because output tensor auto-initialization occurs during construction time.
- /// Later this might be changed to allow out-of-order insertion.
-
- /// Before we insert the operator, we need to initialize the required TensorInfo objects.
- /// We can choose not to initialize an output TensorInfo; if so, they will be auto-initialized during the construction of the OperatorGraph
- /// The "t_acc_info" is the TensorInfo of the accumulator tensor, which is the output tensor of our first operator conv2d
- /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Initialize Conv2d TensorInfo
- // [Initialize Conv2d TensorInfo]
- auto t_input_info = TensorInfo(t_input_shape, 1, data_type, data_layout);
- auto t_weight_info = TensorInfo(t_weight_shape, 1, data_type, data_layout);
- auto t_bias_info = TensorInfo(t_bias_shape, 1, data_type, data_layout);
- auto t_acc_info = TensorInfo();
- // [Initialize Conv2d TensorInfo]
-
- /// @page example_dynamic_fusion_cl_conv2d_elementwise_add
- /// Next we associate the TensorInfo with the OpTensor s created in the op_graph.
- /// @note The associated TensorInfo objects must be in scope and remain valid until the ClWorkload building is completed
-
- /// @note The associated TensorInfo objects must be declared as non-const, since they may be updated during the OperatorGraph construction
-
- /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Add OpTensors
- // [Add OpTensors]
- const auto op_t_input = add_tensor(op_graph, t_input_info);
- const auto op_t_weight = add_tensor(op_graph, t_weight_info);
- const auto op_t_bias = add_tensor(op_graph, t_bias_info);
- const auto op_t_acc = add_tensor(op_graph, t_acc_info);
- // [Add OpTensors]
-
- /// @page example_dynamic_fusion_cl_conv2d_elementwise_add
- /// Finally we add the Conv2d operator to op_graph. The Conv2dDescriptor contains all the TOSA-compliant attribute parameters
- /// The add_op... group of functions accept the OpTensors created by the add_tensor function, and return an Operator handle.
- /// This handle can be used to further query and modify the operator inside the OperatorGraph after its creation
- /// For example, here we use the handle to force the ConvolutionMethod to be Direct Convolution
- /// @note The force_conv2d_method is only for debug purpose for now, as the end user is not expected to decide on the ConvolutionMethod
-
- /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Add Conv2d Operator
- // [Add Conv2d Operator]
- Conv2dDescriptor conv2d_desc{ Padding2D{ pad_x, pad_x, pad_y, pad_y } };
- auto conv2d = add_op_conv2d(op_graph, conv2d_desc, op_t_input, op_t_weight, op_t_bias, op_t_acc);
- force_conv2d_method(op_graph, conv2d, ConvolutionMethod::DIRECT); // Only for debug purposes
- // [Add Conv2d Operator]
-
- /// @page example_dynamic_fusion_cl_conv2d_elementwise_add
- /// @subsection add_elementwise_add Add the second operator Elementwise Add
- /// This is similar to adding the first operator to op_graph, except that we link the two operators together by their common tensor,
- /// namely the accumulator tensor op_t_acc, which is the output of conv2d and the input (lhs) of the addition
- /// @note At the moment, it is recommended to always declare a separate TensorInfo (even if empty) for each OpTensor.
- /// For example, here op_t_dst could be associated with op_t_acc info as they are the same,
- /// but we still recommend creating a separate object.
-
- /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Add Elementwise Add Operator
- // [Add Elementwise Add Operator]
- auto t_l1_addend_info = TensorInfo(t_l1_addend_shape, 1, data_type, data_layout);
- auto t_dst_info = TensorInfo();
- const auto op_t_l1_addend = add_tensor(op_graph, t_l1_addend_info);
- const auto op_t_dst = add_tensor(op_graph, t_dst_info);
- ElementwiseDescriptor add_desc{ ArithmeticOperation::ADD };
- add_op_elementwise_op(op_graph, add_desc, op_t_acc, op_t_l1_addend, op_t_dst);
- // [Add Elementwise Add Operator]
-
- /// @page example_dynamic_fusion_cl_conv2d_elementwise_add
- /// @section build_clworkload Build ClWorkload
- /// ClWorkload is an intermediate object which contains all the built kernel codes and all other descriptors on how to schedule them
- /// We build ClWorkload from the op_graph object that we just described
- /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Build ClWorkload
- // [Build ClWorkload]
- const ClWorkloadContext workload_ctx
- {
- GpuInfo{ CLScheduler::get().target() }
- };
- ClWorkload workload;
- build(workload, op_graph, workload_ctx);
- // [Build ClWorkload]
-
- /// @page example_dynamic_fusion_cl_conv2d_elementwise_add
- /// @section run_fused_op_with_clcompositeoperator Run the fused operator workload with ClCompositeOperator
- /// @subsection configure_and_validate_clcompositeoperator Validate ClWorkload and Configure ClCompositeOperator
- /// After ClWorkload is built, we need to configure it with the Compute Library runtime ClCompositeOperator to run it.
- /// Optionally we can explicitly validate the workload to check if the workload has been built successfully.
- /// The validate is automatically run inside configure and would throw if it fails.
- /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Construct ClCompositeOperator
- /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Validate and configure ClCompositeOperator
- // [Validate and configure ClCompositeOperator]
- const auto success = ClCompositeOperator::validate(workload); // Optional
- op.configure(CLKernelLibrary::get().get_compile_context(), workload);
- // [Validate and configure ClCompositeOperator]
- TOCK(configure, measurements);
-
- TICK(tensor_allocation);
- /// @page example_dynamic_fusion_cl_conv2d_elementwise_add
- /// @subsection run_clcompositeoperator Run ClCompositeOperator
- /// Construct the runtime CLTensor s with backing memory
- /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Construct CLTensor objects
-
- /// Initialize, allocate and fill the CLTensor objects
- /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Initialize, Allocate and Fill CLTensor objects
- // [Initialize, Allocate and Fill CLTensor objects]
- t_input.allocator()->init(t_input_info);
- t_weight.allocator()->init(t_weight_info);
- t_bias.allocator()->init(t_bias_info);
- t_l1_addend.allocator()->init(t_dst_info);
- t_dst.allocator()->init(t_dst_info);
-
- t_input.allocator()->allocate();
- t_weight.allocator()->allocate();
- t_bias.allocator()->allocate();
- t_l1_addend.allocator()->allocate();
- t_dst.allocator()->allocate();
-
- fill_random_tensor(t_input, -1.f, 1.f);
- fill_random_tensor(t_weight, -1.f, 1.f);
- fill_random_tensor(t_l1_addend, -1.f, 1.f);
- // [Initialize, Allocate and Fill CLTensor objects]
-
- /// @page example_dynamic_fusion_cl_conv2d_elementwise_add
- /// The OpTensorBinding creates a mapping from the OpTensor handles that we created early to the real CLTensors
- /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Create OpTensorBinding
- // [Create OpTensorBinding]
- OpTensorBinding op_tensors({ { op_t_input, &t_input },
- { op_t_weight, &t_weight },
- { op_t_bias, &t_bias },
- { op_t_l1_addend, &t_l1_addend },
- { op_t_dst, &t_dst }
- });
- // [Create OpTensorBinding]
-
- /// @page example_dynamic_fusion_cl_conv2d_elementwise_add
- /// Bind the CLTensor objects to the prepare_pack_map and run_pack_map, which are used to prepare and run the op
- /// This step additionally creates empty auxiliary CLTensor objects if any, and contain them inside a ClAuxTensorData aux_tensor_data
- /// @note This step associates all the CLTensors contained in op_tensors and aux_tensor_data, with prepare_pack_map and run_pack_map
- /// Make sure these CLTensors remain valid as long as the two pack_maps are still in use
-
- /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Construct ClAuxTensorData
- /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Construct TensorPackMaps
- /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Bind Tensors
- // [Bind Tensors]
- bind_tensors(aux_tensor_data, prepare_pack_map, run_pack_map, workload, op_tensors);
- // [Bind Tensors]
-
- /// @page example_dynamic_fusion_cl_conv2d_elementwise_add
- /// Initialize and Allocate Auxiliary CLTensor objects.
- /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Initialize and Allocate Auxiliary CLTensor objects
- // [Initialize and Allocate Auxiliary CLTensor objects]
- for(auto tensor_data : aux_tensor_data.get_tensors())
- {
- tensor_data.tensor->allocator()->init(tensor_data.tensor_info);
- tensor_data.tensor->allocator()->allocate();
- }
- // [Initialize and Allocate Auxiliary CLTensor objects]
- TOCK(tensor_allocation, measurements);
-
- TICK(dummy_run);
- /// @page example_dynamic_fusion_cl_conv2d_elementwise_add
- /// Run the ClCompositeOperator prepare job. This performs any jobs that are required for the first run, like
- /// reshaping tensors for a more performant format.
- /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Prepare ClCompositeOperator
- // [Prepare ClCompositeOperator]
- op.prepare(prepare_pack_map);
- // [Prepare ClCompositeOperator]
-
- /// @page example_dynamic_fusion_cl_conv2d_elementwise_add
- /// At last, we run our operator
- /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Run ClCompositeOperator
- // [Run ClCompositeOperator]
- op.run(run_pack_map);
- // [Run ClCompositeOperator]
- CLScheduler::get().sync();
- TOCK(dummy_run, measurements);
- TOCK(startup_time, measurements);
- return true;
- }
- void do_run() override
- {
- // Run the fused op
- op.run(run_pack_map);
-
- // Make sure all the OpenCL jobs are done executing:
- CLScheduler::get().sync();
- }
-
- void do_teardown() override
- {
- for(const auto &m : measurements)
- {
- std::cout << m.first << ": " << m.second.count() << "us" << std::endl;
- }
- }
-
-private:
- // [Construct CLTensor objects]
- CLTensor t_input{};
- CLTensor t_weight{};
- CLTensor t_bias{};
- CLTensor t_l1_addend{};
- CLTensor t_dst{};
- // [Construct CLTensor objects]
- // [Construct ClAuxTensorData]
- ClAuxTensorData aux_tensor_data{};
- // [Construct ClAuxTensorData]
- // [Construct TensorPackMaps]
- TensorPackMap prepare_pack_map{};
- TensorPackMap run_pack_map{};
- // [Construct TensorPackMaps]
- // [Construct ClCompositeOperator]
- ClCompositeOperator op{};
- // [Construct ClCompositeOperator]
- CLTuner tuner{};
- std::map<std::string, std::chrono::microseconds> measurements{};
-};
-
-/** Main program for sgemm test
- *
- * @param[in] argc Number of arguments
- * @param[in] argv Arguments ( [optional] Matrix A, [optional] Matrix B, [optional] Matrix C, [optional] alpha, [optional] beta )
- */
-int main(int argc, char **argv)
-{
- return utils::run_example<ClFusedConv2dEltwiseAddExample>(argc, argv);
-}
-
-#undef TICK
-#undef TOCK
-#undef TOCK_AVG
-#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
diff --git a/examples/dynamic_fusion/cl_ref_conv2d_elementwise_add.cpp b/examples/dynamic_fusion/cl_ref_conv2d_elementwise_add.cpp
deleted file mode 100644
index 3aedcc0f41..0000000000
--- a/examples/dynamic_fusion/cl_ref_conv2d_elementwise_add.cpp
+++ /dev/null
@@ -1,238 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL /* Needed by Utils.cpp to handle OpenCL exceptions properly */
-#error "This example needs to be built with -DARM_COMPUTE_CL"
-#endif /* ARM_COMPUTE_CL */
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/CLTuner.h"
-#include "arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h"
-#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"
-
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "utils/TypePrinter.h"
-#include "utils/Utils.h"
-
-#include <cstdlib>
-
-using namespace arm_compute;
-using namespace utils;
-
-#define TICK(clock_name) \
- auto clock_name##_tick = std::chrono::high_resolution_clock::now();
-#define TOCK(clock_name, measurement_map) \
- auto clock_name##_tock = std::chrono::high_resolution_clock::now(); \
- measurement_map["\"" #clock_name "\""] = duration_cast<microseconds>(clock_name##_tock - clock_name##_tick);
-#define TOCK_AVG(clock_name, measurement_map, num_iterations) \
- auto clock_name##_tock = std::chrono::high_resolution_clock::now(); \
- measurement_map["\"" #clock_name "\""] = duration_cast<microseconds>((clock_name##_tock - clock_name##_tick) / (num_iterations));
-
-using std::chrono::duration_cast;
-using std::chrono::microseconds;
-/** A reference for comparing against the fusion of a direct convolution with an elementwise addition:
- * examples/dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp
- */
-class ClRefConv2dEltwiseAddExample : public Example
-{
-public:
- bool do_setup(int argc, char **argv) override
- {
- size_t ih;
- size_t iw;
- size_t ifm;
- size_t wh;
- size_t ww;
- size_t ofm;
- size_t tuner_choice;
- unsigned int pad_x;
- unsigned int pad_y;
- if(argc < 10)
- {
- // Print help
- std::cout << "Usage: ./cl_ref_conv2d_elementwise_add ih iw ifm wh ww ofm tuner_choice(0=Disable, 1=Rapid, 2=Normal, 3=Exhaustive) pad_x pad_y\n";
- std::cout << "Too few or no input_matrices provided. Using shape config = SRGAN_0, tuner_choice=2\n\n";
- ih = 512;
- iw = 512;
- ifm = 64;
- wh = 1;
- ww = 1;
- ofm = 3;
- tuner_choice = 2;
- pad_x = 0;
- pad_y = 0;
- }
- else
- {
- ih = strtol(argv[1], nullptr, 10);
- iw = strtol(argv[2], nullptr, 10);
- ifm = strtol(argv[3], nullptr, 10);
- wh = strtol(argv[4], nullptr, 10);
- ww = strtol(argv[5], nullptr, 10);
- ofm = strtol(argv[6], nullptr, 10);
- tuner_choice = strtol(argv[7], nullptr, 10);
- pad_x = strtol(argv[8], nullptr, 10);
- pad_y = strtol(argv[9], nullptr, 10);
- }
-
- CLTuner *tuner_to_use;
- switch(tuner_choice)
- {
- case 0:
- {
- tuner_to_use = nullptr;
- break;
- }
- case 1:
- {
- tuner.set_tuner_mode(CLTunerMode::RAPID);
- tuner_to_use = &tuner;
- break;
- }
- case 3:
- {
- tuner.set_tuner_mode(CLTunerMode::EXHAUSTIVE);
- tuner_to_use = &tuner;
- break;
- }
- case 2:
- default:
- {
- tuner.set_tuner_mode(CLTunerMode::NORMAL);
- tuner_to_use = &tuner;
- break;
- }
- }
-
- CLScheduler::get().default_init(tuner_to_use);
-
- TICK(startup_time);
- TICK(configure);
-
- /* Computation:
- * out = add_desc(addend, conv2d1x1(direct_conv)(input, weights, bias))
- */
- const auto data_type = DataType::F32;
- const auto data_layout = DataLayout::NHWC;
- const PadStrideInfo conv_info{ 1, 1, pad_x, pad_y };
- const auto t_input_shape = TensorShape(ifm, iw, ih);
- const auto t_weight_shape = TensorShape(ifm, ww, wh, ofm);
- const auto t_bias_shape = TensorShape(ofm);
- const auto t_l1_addend_shape = TensorShape(ofm, iw);
- const auto t_dst_shape = misc::shape_calculator::compute_deep_convolution_shape(t_input_shape, data_layout, t_weight_shape, conv_info);
- std::cout << "input_shape: " << t_input_shape << std::endl;
- std::cout << "weight_shape: " << t_weight_shape << std::endl;
- std::cout << "bias_shape: " << t_bias_shape << std::endl;
- std::cout << "addend_shape: " << t_l1_addend_shape << std::endl;
- std::cout << "dst_shape: " << t_dst_shape << std::endl;
- auto t_input_info = TensorInfo(t_input_shape, 1, data_type, data_layout);
- auto t_weight_info = TensorInfo(t_weight_shape, 1, data_type, data_layout);
- auto t_bias_info = TensorInfo(t_bias_shape, 1, data_type, data_layout);
- auto t_l0_dst_info = TensorInfo(t_dst_shape, 1, data_type, data_layout); // Intermediate tensor for cond3
- auto t_l1_addend_info = TensorInfo(t_l1_addend_shape, 1, data_type, data_layout);
- auto t_dst_info = TensorInfo(t_dst_shape, 1, data_type, data_layout);
-
- // Init tensors
- {
- t_input.allocator()->init(t_input_info);
- t_weight.allocator()->init(t_weight_info);
- t_bias.allocator()->init(t_bias_info);
- t_l1_addend.allocator()->init(t_dst_info);
- t_l0_dst.allocator()->init(t_l0_dst_info);
- t_dst.allocator()->init(t_dst_info);
- }
-
- op0.configure(&t_input, &t_weight, &t_bias, &t_l0_dst, conv_info);
- op1.configure(&t_l0_dst, &t_l1_addend, &t_dst, ConvertPolicy{});
- TOCK(configure, measurements);
-
- TICK(tensor_allocation);
- // Construct tensors
- // Allocate and fill tensors
- {
- t_input.allocator()->allocate();
- t_weight.allocator()->allocate();
- t_bias.allocator()->allocate();
- t_l1_addend.allocator()->allocate();
- t_l0_dst.allocator()->allocate();
- t_dst.allocator()->allocate();
- fill_random_tensor(t_input, -1.f, 1.f);
- fill_random_tensor(t_weight, -1.f, 1.f);
- fill_random_tensor(t_bias, -1.f, 1.f);
- fill_random_tensor(t_l1_addend, -1.f, 1.f);
- }
- TOCK(tensor_allocation, measurements);
- // Dummy run for CLTuner
- TICK(dummy_run);
- op0.run();
- CLScheduler::get().sync();
- TOCK(dummy_run, measurements);
- TOCK(startup_time, measurements);
- return true;
- }
- void do_run() override
- {
- // Run the ops
- op0.run();
- op1.run();
-
- // Make sure all the OpenCL jobs are done executing:
- CLScheduler::get().sync();
- }
-
- void do_teardown() override
- {
- for(auto m : measurements)
- {
- std::cout << m.first << ": " << m.second.count() << "us" << std::endl;
- }
- }
-
-private:
- CLTensor t_input{};
- CLTensor t_weight{};
- CLTensor t_bias{};
- CLTensor t_l1_addend{};
- CLTensor t_l0_dst{};
- CLTensor t_dst{};
- CLDirectConvolutionLayer op0{};
- CLArithmeticAddition op1{};
- CLTuner tuner{};
- std::map<std::string, std::chrono::microseconds> measurements{};
-};
-
-/** Main program for sgemm test
- *
- * @param[in] argc Number of arguments
- * @param[in] argv Arguments ( [optional] Matrix A, [optional] Matrix B, [optional] Matrix C, [optional] alpha, [optional] beta )
- */
-int main(int argc, char **argv)
-{
- return utils::run_example<ClRefConv2dEltwiseAddExample>(argc, argv);
-}
-
-#undef TICK
-#undef TOCK
-#undef TOCK_AVG \ No newline at end of file
diff --git a/scripts/clang_tidy_rules.py b/scripts/clang_tidy_rules.py
index 0a0de84bab..d718af0f21 100755
--- a/scripts/clang_tidy_rules.py
+++ b/scripts/clang_tidy_rules.py
@@ -15,7 +15,6 @@ def get_list_flags( filename, arch):
flags = ["-std=c++14"]
flags.append("-DARM_COMPUTE_CPP_SCHEDULER=1")
flags.append("-DARM_COMPUTE_CL")
- flags.append("-DENABLE_EXPERIMENTAL_DYNAMIC_FUSION")
if arch == "aarch64":
flags.append("-DARM_COMPUTE_AARCH64_V8_2")
return flags
diff --git a/src/core/CL/ICLKernel.h b/src/core/CL/ICLKernel.h
index 224b68af70..5d5b636cf4 100644
--- a/src/core/CL/ICLKernel.h
+++ b/src/core/CL/ICLKernel.h
@@ -37,19 +37,6 @@
#include <string>
-#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-struct ClExecutionDescriptor;
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
-
namespace arm_compute
{
namespace
@@ -345,14 +332,6 @@ public:
{
ARM_COMPUTE_UNUSED(tensors, window, queue);
}
-
-#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
- /// The execution is carried out through run_op method. But the run_op method needs to be extended to include ClExecutionDescriptor as now LWS GWS tuning will be separated from the IKernel
- virtual void run_composite_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc)
- {
- ARM_COMPUTE_UNUSED(tensors, window, queue, exec_desc);
- }
-#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
/** Add the passed parameters to the object's kernel's arguments starting from the index idx.
*
* @param[in,out] idx Index at which to start adding the arguments. Will be incremented by the number of kernel arguments set.
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp
deleted file mode 100644
index 9b6daae619..0000000000
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.cpp
+++ /dev/null
@@ -1,164 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
-
-#include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h"
-#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h"
-#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClKernelComponents.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-ClKernelBlueprint::ClKernelBlueprint()
- : _impl{ std::make_unique<ClKernelBlueprint::Implementation>() }
-{
-}
-
-ClKernelBlueprint::~ClKernelBlueprint() = default;
-
-ClKernelBlueprint::Implementation &ClKernelBlueprint::impl()
-{
- return *_impl;
-}
-const ClKernelBlueprint::Implementation &ClKernelBlueprint::impl() const
-{
- return *_impl;
-}
-
-Status add_tensor(ClKernelBlueprint &kernel_blueprint, ITensorInfo *tensor_info, ArgumentID &id, ArgumentID merge_point)
-{
- id = kernel_blueprint.impl().add_kernel_tensor(tensor_info, merge_point);
- return Status{};
-}
-
-Status add_kcomp_eltwise_op(ClKernelBlueprint &kernel_blueprint, const ClElementwiseKernelDescriptor &desc,
- ArgumentID src0_id, ArgumentID src1_id, ArgumentID &dst_id)
-{
- kernel_blueprint.impl().add_component(
- std::make_unique<ClElementwiseKernelComponent>(
- &kernel_blueprint,
- desc,
- SharedVarLink{ src0_id, SharedVarIO::Input },
- SharedVarLink{ src1_id, SharedVarIO::Input },
- SharedVarLink{ dst_id, SharedVarIO::Output }));
-
- return Status{};
-}
-
-Status add_kcomp_floor(ClKernelBlueprint &kernel_blueprint, const ClFloorKernelDescriptor &,
- ArgumentID src_id, ArgumentID &dst_id)
-{
- kernel_blueprint.impl().add_component(
- std::make_unique<ClFloorKernelComponent>(
- &kernel_blueprint,
- SharedVarLink{ src_id, SharedVarIO::Input },
- SharedVarLink{ dst_id, SharedVarIO::Output }));
-
- return Status{};
-}
-
-Status add_kcomp_activation(ClKernelBlueprint &, const ClActivationKernelDescriptor &, ArgumentID, ArgumentID &)
-{
- return Status{};
-}
-
-Status add_kcomp_direct_conv2d(ClKernelBlueprint &kernel_blueprint,
- const ClDirectConv2dKernelDescriptor &direct_conv2d_desc,
- ArgumentID src_id, ArgumentID weight_id, ArgumentID bias_id, ArgumentID &dst_id)
-{
- kernel_blueprint.impl().add_component(
- std::make_unique<ClDirectConvolutionKernelComponent>(
- &kernel_blueprint,
- direct_conv2d_desc,
- SharedVarLink{ src_id, SharedVarIO::Input },
- SharedVarLink{ weight_id, SharedVarIO::Input },
- SharedVarLink{ dst_id, SharedVarIO::Output },
- SharedVarLink{ bias_id, SharedVarIO::Input }));
-
- return Status{};
-}
-
-Status add_kcomp_store(ClKernelBlueprint &kernel_blueprint, const StoreType &store_type, ArgumentID src_tile, ArgumentID dst_tile)
-{
- switch(store_type)
- {
- case StoreType::StoreBlockBoundaryAware:
- kernel_blueprint.impl().add_component(
- std::make_unique<ClStoreBlockBoundaryAwareKernelComponent>(
- &kernel_blueprint,
- SharedVarLink{ src_tile, SharedVarIO::Input },
- SharedVarLink{ dst_tile, SharedVarIO::Output }));
- break;
- case StoreType::TStoreIndirectWidthSelect:
- kernel_blueprint.impl().add_component(
- std::make_unique<ClStoreIndirectWidthSelectKernelComponent>(
- &kernel_blueprint,
- SharedVarLink{ src_tile, SharedVarIO::Input },
- SharedVarLink{ dst_tile, SharedVarIO::Output }));
- break;
- default:
- ARM_COMPUTE_ERROR("Store mode not yet supported.");
- }
-
- return Status{};
-}
-
-Status update_merge_point(ClKernelBlueprint &bp, ArgumentID t_id, ArgumentID merge_point)
-{
- return bp.impl().update_merge_point(t_id, merge_point);
-}
-
-Status set_tile_info(ClKernelBlueprint &bp, const TileDescriptor &tile_info)
-{
- bp.impl().set_tile_info(tile_info);
- return Status{};
-}
-Status build(ClKernelCode &code, const ClCodeBuilderContext &, ClKernelBlueprint &kernel_blueprint)
-{
- kernel_blueprint.impl().finalize();
- code.name = kernel_blueprint.impl().build_kernel_name();
- code.code = kernel_blueprint.impl().build_code();
-
- code.config_id = kernel_blueprint.impl().build_config_id();
- code.build_options = kernel_blueprint.impl().build_options();
- code.window = kernel_blueprint.impl().get_execution_window();
- code.arguments = kernel_blueprint.impl().get_arguments();
-
- return Status{};
-}
-DependencyGraph get_dependency_graph(const ClKernelBlueprint &blueprint)
-{
- return blueprint.impl().get_graph();
-}
-Status tune_static(ClExecutionDescriptor &, const ClKernelCode &)
-{
- return Status{};
-}
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h
deleted file mode 100644
index 463fc5e7cf..0000000000
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
-
-#ifndef ARM_COMPUTE_EXPERIMENTAL_CLKERNELBUILDINGAPI_H
-#define ARM_COMPUTE_EXPERIMENTAL_CLKERNELBUILDINGAPI_H
-
-#include "arm_compute/core/CL/CLCompileContext.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/experimental/ClWorkload.h"
-#include "arm_compute/core/experimental/DependencyGraph.h"
-#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelDescriptors.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-using ArgumentID = DependencyGraph::Id;
-
-static constexpr ArgumentID g_arg_placeholder = DependencyGraph::empty_id();
-
-/** Intermediate representation of the final, complete kernel source. */
-class ClKernelBlueprint
-{
-public:
- ClKernelBlueprint();
- ~ClKernelBlueprint();
-
-private:
- struct Implementation;
- std::unique_ptr<Implementation> _impl;
-
-public:
- Implementation &impl();
- const Implementation &impl() const;
-};
-
-///// Kernel Components /////
-/** Component: Eltwise Operator */
-Status add_kcomp_eltwise_op(ClKernelBlueprint &, const ClElementwiseKernelDescriptor &, ArgumentID src0_id,
- ArgumentID src1_id, ArgumentID &dst_id);
-
-/** Component: Floor */
-Status add_kcomp_floor(ClKernelBlueprint &, const ClFloorKernelDescriptor &, ArgumentID src_id,
- ArgumentID &dst_id);
-
-/** Component: Activation */
-Status add_kcomp_activation(ClKernelBlueprint &, const ClActivationKernelDescriptor &, ArgumentID src_id, ArgumentID &dst_id);
-
-/** Component: Direct Convolution **/
-Status add_kcomp_direct_conv2d(ClKernelBlueprint &, const ClDirectConv2dKernelDescriptor &,
- ArgumentID src_id, ArgumentID weight_id, ArgumentID bias_id, ArgumentID &dst_id);
-
-Status add_kcomp_store(ClKernelBlueprint &, const StoreType &store_type, ArgumentID src_id, ArgumentID dst_id);
-
-Status add_tensor(ClKernelBlueprint &, ITensorInfo *, ArgumentID &, ArgumentID merge_point = DependencyGraph::empty_id());
-
-///// Kernel Components /////
-
-///// Building /////
-
-/** Update existing merge tensor @p merge_point to point to @p t_id
- *
- * @param t_id
- * @param merge_point
- * @return Status
- */
-Status update_merge_point(ClKernelBlueprint &, ArgumentID t_id, ArgumentID merge_point);
-
-/** Get dependency graph
- *
- * @return DependencyGraph
- */
-DependencyGraph get_dependency_graph(const ClKernelBlueprint &blueprint);
-
-/** All information required for building the @ref ClKernelCode */
-struct ClCodeBuilderContext
-{
- GpuInfo gpu_info{};
-};
-
-Status set_tile_info(ClKernelBlueprint &, const TileDescriptor &);
-
-/** Build final kernel source from KernelBlueprint */
-Status build(ClKernelCode &code, const ClCodeBuilderContext &, ClKernelBlueprint &);
-
-///// Building /////
-
-///// Tuning /////
-
-Status tune_static(ClExecutionDescriptor &, const ClKernelCode &);
-
-///// Tuning /////
-
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-#endif //ARM_COMPUTE_EXPERIMENTAL_CLKERNELBUILDINGAPI_H
-#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h
deleted file mode 100644
index 04919acb83..0000000000
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h
+++ /dev/null
@@ -1,930 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
-
-#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMMON_H
-#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMMON_H
-
-#include "arm_compute/core/CL/CLCompileContext.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/GPUTarget.h"
-#include "src/core/common/Macros.h"
-#include "support/Requires.h"
-#include "support/StringSupport.h"
-
-#include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h"
-
-#include <iostream>
-#include <queue>
-#include <stack>
-#include <string>
-#include <unordered_set>
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-/** We introduce the concept of *Shared Variables* in the context of kernel building.
- * They are variables that can be accessed / shared among all the kernel components within a single kernel.
- * For now we consider 2 groups of shared variables:
- * Argument: The argument variables (parameters) of a kernel
- * Automatic: The automatic variables declared inside a kernel
- * All Shared Variables have the same kernel scope, and are thus visible to all kernel components
-*/
-
-enum class SharedVarIO
-{
- Input,
- Output
-};
-
-enum class SharedVarGroup
-{
- Argument, // Parameters to a kernel function == dst or src tensors of the whole blueprint graph
- Automatic // Automatic variables declared within the kernel body == intermediate tensors of the whole blueprint graph
-};
-
-/** Specifies a shared variable link for a component.
- * It describes all the information that's available when a component is constructed / added:
- * e.g. its linkage (via ArgumentID and io) and its group
- * This is not shared variable on its own, but is used for instantiating a SharedVar when building the code
- */
-struct SharedVarLink
-{
- ArgumentID arg_id{ g_arg_placeholder };
- SharedVarIO io{ SharedVarIO::Input };
- bool is_empty() const
- {
- return arg_id == g_arg_placeholder;
- }
-};
-
-/** A table of all the variables used in the kernel / blueprint
- * Because we limit the DependencyGraph in the blueprint to a Linear Sequence for now, we only allow ** a single global variable (the accumulator) **
- *
- * NOTE: the order they appear in the table is the order of their "declaration" in the component code, and is also their ID
- * NOTE: the variables all have the scope of the full kernel function
- */
-class SharedVarTable
-{
-public:
- /** A fully realized SharedVarLink
- */
- struct SharedVar
- {
- ArgumentID arg_id{ g_arg_placeholder };
- SharedVarIO io{ SharedVarIO::Input };
- SharedVarGroup group{ SharedVarGroup::Argument };
- std::string uniq_name{}; // Unique name, also the final variable name used in the built code
- ClKernelArgDescriptor desc{}; // Automatic variables can and should still be described using this struct
- bool is_empty() const
- {
- return arg_id == g_arg_placeholder;
- }
- };
-
- class Arguments
- {
- public:
- Arguments() = default;
- void add_var(const SharedVar &var)
- {
- ARM_COMPUTE_ERROR_ON(var.group != SharedVarGroup::Argument);
- _vars.push_back(var);
- }
- std::vector<SharedVar> get_all_vars() const
- {
- return _vars;
- }
- std::vector<SharedVar> get_src_vars() const
- {
- std::vector<SharedVar> src_vars;
- std::copy_if(_vars.begin(), _vars.end(), std::back_inserter(src_vars), [](const SharedVar & var)
- {
- return var.io == SharedVarIO::Input;
- });
- return src_vars;
- }
- SharedVar get_dst_var() const
- {
- std::vector<SharedVar> dst_vars;
- std::copy_if(_vars.begin(), _vars.end(), std::back_inserter(dst_vars), [](const SharedVar & var)
- {
- return var.io == SharedVarIO::Output;
- });
- ARM_COMPUTE_ERROR_ON(dst_vars.size() != 1);
- return dst_vars.at(0);
- }
-
- private:
- std::vector<SharedVar> _vars{};
- };
-
- /** Create a SharedVar for a corresponding SharedVarLink (contains ArgumentID). If one has already been created for the SharedVarLink, simply return it instead of creating a new one
- *
- * @note: The order of insertion is important. There is one precondition:
- * PRECOND: The components have been sorted topologically / is being traversed in topological order
- * This ensures that all the consumer var links (Output, Automatic Links) can consume (return) the producer var links when they're referred
- */
- void add(SharedVarLink var_link, SharedVarGroup group, ClKernelArgDescriptor runtime_desc, const std::string &name = "unnamed")
- {
- ARM_COMPUTE_ERROR_ON_MSG(var_link.is_empty(), "Non-empty SharedVarLink expected");
- if(!get(var_link).is_empty())
- {
- return;
- }
-
- auto var_id = _num_var;
- std::stringstream ss;
- ss << name << "_" << var_id;
- const auto uniq_name = ss.str();
- SharedVar var{ var_link.arg_id, var_link.io, group, uniq_name, runtime_desc };
-
- if(group == SharedVarGroup::Argument)
- {
- _arguments.emplace(var_id, var);
- _arg_id_map.emplace(var_link.arg_id, var_id);
- _num_var++;
- }
- else if(group == SharedVarGroup::Automatic)
- {
- if(_global_vars.empty())
- {
- if(var_link.io == SharedVarIO::Output)
- {
- _global_vars.emplace(var_id, var);
- _arg_id_map.emplace(var_link.arg_id, var_id);
- _num_var++;
- }
- else
- {
- ARM_COMPUTE_ERROR("Component likely not traversed in topological order");
- }
- }
- else
- {
- // Associate additional SharedVarLinks with the single global shared variable
- const auto global_var_id = _global_vars.begin()->first;
- _arg_id_map[var_link.arg_id] = global_var_id;
- }
- }
- else
- {
- ARM_COMPUTE_ERROR("Unrecognised SharedVarGroup");
- }
- }
-
-    /** Look up the SharedVar registered for @p var_link
-     *
-     * @param[in] var_link Link whose ArgumentID is looked up
-     *
-     * @return The argument or global variable mapped to the link's ArgumentID, or a default-constructed
-     *         SharedVar if the ArgumentID has not been added to the table
-     */
-    SharedVar get(const SharedVarLink &var_link) const
-    {
-        const auto id_it = _arg_id_map.find(var_link.arg_id);
-        if(id_it == _arg_id_map.end())
-        {
-            return SharedVar{};
-        }
-
-        // A known id lives either in the argument table or, failing that, in the global variable table
-        const VarID var_id = id_it->second;
-        const auto  arg_it = _arguments.find(var_id);
-        if(arg_it == _arguments.end())
-        {
-            return _global_vars.at(var_id);
-        }
-        return arg_it->second;
-    }
-
-    /** Collect every variable of the Argument group
-     *
-     * @note The arguments are returned in the order they are added (the table is keyed by the increasing variable id)
-     *
-     * @return Arguments holding a copy of each kernel argument variable
-     */
-    Arguments get_kernel_arguments() const
-    {
-        Arguments kernel_args{};
-        for(auto it = _arguments.cbegin(); it != _arguments.cend(); ++it)
-        {
-            kernel_args.add_var(it->second);
-        }
-        return kernel_args;
-    }
-
-private:
- // Identifier of a variable inside this table; new ids are taken from the running counter _num_var
- using VarID = int32_t;
-
-private:
- std::map<VarID, SharedVar> _global_vars{}; // Shared, global variable
- // Variables of the Argument group, keyed by variable id
- std::map<VarID, SharedVar> _arguments{};
- std::map<ArgumentID, VarID> _arg_id_map{}; // Track ArgumentIDs that have already been added
- // Next variable id to hand out; incremented each time add() stores a new variable
- VarID _num_var{ 0 };
-};
-
-/** Category of a kernel component, as reported by IClKernelComponent::get_component_type() */
-enum class ComponentType
-{
- Simple, // NOTE(review): presumably any component that is neither Complex nor Store - confirm
- Complex, // The blueprint's add_component() asserts that at most one Complex component is added
- Store // The output argument of a Store component is recorded as the blueprint's destination
-};
-
-using ComponentID = DependencyGraph::Id;
-using ComponentList = std::vector<ComponentID>;
-/** Interface of a kernel component: a piece of templated OpenCL code plus the information needed to fuse it into a blueprint */
-class IClKernelComponent
-{
-public:
-    using Link = SharedVarLink;
-    using Tag  = std::string;
-    /** Textual value a tag is replaced with. Implicitly constructible from the value kinds used when filling a TagLUT */
-    struct TagVal
-    {
-        TagVal() = default;
-        /** Use the unique name of a shared variable */
-        TagVal(const SharedVarTable::SharedVar &var)
-            : value{ var.uniq_name }
-        {
-        }
-
-        /** Use the decimal string representation of an integral value */
-        template <typename T, ARM_COMPUTE_REQUIRES_TA(std::is_integral<T>::value)>
-        TagVal(T val)
-            : value{ support::cpp11::to_string(val) }
-        {
-        }
-
-        /** Use the string as is */
-        TagVal(const std::string &val)
-            : value{ val }
-        {
-        }
-
-        /** Use the C string as is */
-        TagVal(const char *val)
-            : value{ std::string(val) }
-        {
-        }
-
-        /** Use the OpenCL type name of @p data_type, as returned by get_cl_type_from_data_type() */
-        TagVal(const DataType &data_type)
-            : value{ get_cl_type_from_data_type(data_type) }
-        {
-        }
-
-        std::string value{};
-    };
-    using TagLUT = std::unordered_map<Tag, TagVal>; // Used for instantiating a code template / replacing tags
-public:
-    /** Constructor
-     *
-     * @param[in] blueprint Blueprint this component belongs to. Stored as a raw, non-owning pointer
-     */
-    IClKernelComponent(ClKernelBlueprint *blueprint)
-        : _blueprint(blueprint)
-    {
-    }
-
-    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClKernelComponent);
-
-    virtual ~IClKernelComponent()                        = default;
-    virtual ComponentType     get_component_type() const = 0;
-    virtual std::vector<Link> get_links() const          = 0;
-    virtual std::string       name() const               = 0;
-
-    /** Instantiate a code template by replacing every "{{tag}}" with the value registered for "tag"
-     *
-     * Text outside of tags is copied verbatim, including the very last character of the template.
-     * A tag name is everything between a "{{" and the next "}}".
-     *
-     * @note Some tags can be unused since they could be used only for the macros, or only for the component code
-     * @note A tag missing from @p tags, or a "{{" without a closing "}}", is reported through
-     *       ARM_COMPUTE_ERROR_ON_MSG. If that assertion macro is compiled out, a missing tag expands to nothing
-     *       and an unterminated tag discards the remainder of the template.
-     *
-     * @param[in] code_template Code containing "{{tag}}" placeholders
-     * @param[in] tags          Look-up table from tag name to replacement value
-     *
-     * @return The code with all the placeholders replaced
-     */
-    static std::string replace_tags(const std::string &code_template, const TagLUT &tags)
-    {
-        const std::string tag_open{ "{{" };
-        const std::string tag_close{ "}}" };
-
-        std::string replaced_code{};
-        replaced_code.reserve(code_template.size());
-
-        size_t cursor = 0;
-        while(cursor < code_template.size())
-        {
-            const size_t open_pos = code_template.find(tag_open, cursor);
-            if(open_pos == std::string::npos)
-            {
-                // No more tags: copy the tail, last character included
-                replaced_code.append(code_template, cursor, std::string::npos);
-                break;
-            }
-            replaced_code.append(code_template, cursor, open_pos - cursor);
-
-            const size_t name_pos  = open_pos + tag_open.size();
-            const size_t close_pos = code_template.find(tag_close, name_pos);
-            ARM_COMPUTE_ERROR_ON_MSG(close_pos == std::string::npos, "Unterminated tag in code template");
-            if(close_pos == std::string::npos)
-            {
-                break;
-            }
-
-            const std::string tag_name = code_template.substr(name_pos, close_pos - name_pos);
-            const auto        tag_it   = tags.find(tag_name);
-            if(tag_it != tags.end())
-            {
-                replaced_code += tag_it->second.value;
-            }
-            else
-            {
-                // Only build the message on the failure path, and never dereference an end iterator
-                const std::string err = "Pattern " + tag_name + " not found in tags";
-                ARM_COMPUTE_UNUSED(err);
-                ARM_COMPUTE_ERROR_ON_MSG(true, err.c_str());
-            }
-            cursor = close_pos + tag_close.size();
-        }
-
-        return replaced_code;
-    }
-    /** Id of the component, as set through set_id() */
-    ComponentID id() const
-    {
-        return _id;
-    }
-    /** Set the id of the component */
-    void set_id(ComponentID id)
-    {
-        _id = id;
-    }
-
-    /** Headers required by the component code. Empty by default */
-    virtual std::set<std::string> get_headers_list() const
-    {
-        return std::set<std::string> {};
-    }
-
-    /** Macro definitions (as a code template) required by the component code. Empty by default */
-    virtual std::string get_additional_macros() const
-    {
-        return "";
-    }
-
-    /** Code template of the component. Empty by default */
-    virtual std::string get_component_code() const
-    {
-        return "";
-    }
-
-    /** Execution window of the component. Default-constructed Window by default */
-    virtual Window get_window() const
-    {
-        return Window{};
-    }
-
-    /** Get the tag look-up table used to instantiate the component code.
-     *
-     * @param vtable Table of shared variables the tags can refer to
-     * @return TagLUT
-     */
-    virtual TagLUT get_tag_lut(const SharedVarTable &vtable) const = 0;
-
-    /** Allocate all shared variables used by the component in the @p vtable
-     *
-     * @param vtable Table of shared variables to add to
-     */
-    virtual void allocate_shared_vars(SharedVarTable &vtable) const = 0;
-
-    /** Code computing the destination address. Empty by default */
-    virtual std::string get_dst_addr_calculation() const
-    {
-        return "";
-    }
-
-    /** Generate config id of the component
-     *
-     * @return std::string Empty by default
-     */
-    virtual std::string generate_config_id() const
-    {
-        return "";
-    }
-
-    /** Build options required by the component. Empty by default */
-    virtual CLBuildOptions generate_build_options() const
-    {
-        return CLBuildOptions{};
-    }
-
-protected:
-    ClKernelBlueprint *_blueprint;
-
-private:
-    ComponentID _id{};
-};
-
-using ComponentUniquePtr = std::unique_ptr<IClKernelComponent>;
-
-/** Intermediate representation of the final, complete kernel source.
- *
- * Holds the tensors and components added to a blueprint, the dependency graph between them and the shared
- * variable table, and generates the kernel name, code, config id and build options from them.
- */
-struct ClKernelBlueprint::Implementation
-{
-public:
-    Implementation()  = default;
-    ~Implementation() = default;
-
-public:
-    /** Forward a merge point update for tensor @p t_id to the dependency graph */
-    Status update_merge_point(ArgumentID t_id, ArgumentID merge_point)
-    {
-        return _graph.update_merge_point(t_id, merge_point);
-    }
-
-    /** Add a tensor to the dependency graph and remember its info
-     *
-     * @param[in] tensor_info Info of the tensor. Stored as a raw, non-owning pointer
-     * @param[in] merge_point Merge point passed on to the dependency graph
-     *
-     * @return Id given to the tensor by the dependency graph. An info already stored under that id is kept
-     */
-    ArgumentID add_kernel_tensor(ITensorInfo *tensor_info, ArgumentID merge_point = DependencyGraph::empty_id())
-    {
-        const auto id = _graph.add_tensor(merge_point);
-        if(_kernel_tensors.find(id) == _kernel_tensors.end())
-        {
-            _kernel_tensors.insert(std::make_pair(id, tensor_info));
-        }
-        return id;
-    }
-
-    /** Set the tile descriptor used when generating the global section of the kernel */
-    void set_tile_info(const TileDescriptor &tile_info)
-    {
-        _tile_info = tile_info;
-    }
-
-    /** Group assigned to @p arg_id by finalize(). Throws (via at()) for an id without an assigned group */
-    SharedVarGroup group(ArgumentID arg_id) const
-    {
-        if(arg_id == g_arg_placeholder)
-        {
-            // In case of placeholder, don't care what we return;
-            return SharedVarGroup::Argument;
-        }
-        return _shared_var_group_lut.at(arg_id);
-    }
-
-    /** Assert that every id in @p args is either the placeholder or a tensor added to the blueprint */
-    void validate_arg_ids(std::initializer_list<ArgumentID> args) const
-    {
-        for(const auto arg_id : args)
-        {
-            ARM_COMPUTE_UNUSED(arg_id);
-            ARM_COMPUTE_ERROR_ON_MSG(_kernel_tensors.find(arg_id) == _kernel_tensors.end() && arg_id != g_arg_placeholder,
-                                     "Trying to use an argument that hasn't been added to the blueprint");
-        }
-    }
-
-    /** Add a component to the blueprint and connect it to the components sharing its arguments
-     *
-     * @param[in] component Component to add. Ownership is taken over by the blueprint
-     */
-    void add_component(ComponentUniquePtr component)
-    {
-        if(component->get_component_type() == ComponentType::Complex)
-        {
-            ++_num_complex_components;
-            ARM_COMPUTE_ERROR_ON_MSG(_num_complex_components > 1, "Only one complex component per blueprint is supported.");
-        }
-
-        // Register the component as an operator in the dependency graph; this also yields its unique ID
-        std::vector<ArgumentID> src_tensors;
-        std::vector<ArgumentID> dst_tensors;
-        for(const auto &link : component->get_links())
-        {
-            if(link.is_empty())
-            {
-                continue;
-            }
-            if(link.io == SharedVarIO::Input)
-            {
-                src_tensors.push_back(link.arg_id);
-            }
-            else
-            {
-                dst_tensors.push_back(link.arg_id);
-            }
-        }
-        const ComponentID component_id = _graph.add_operator(src_tensors, dst_tensors).second;
-        component->set_id(component_id);
-
-        // Add this component to the component graph. Don't connect it to anything yet
-        _component_graph.emplace(component_id, ComponentList{});
-
-        // For every { arg_id, arg_io } passed along with this component...
-        for(const auto &link : component->get_links())
-        {
-            const ArgumentID  &arg_id = link.arg_id;
-            const SharedVarIO &arg_io = link.io;
-
-            // Add the arg_id to the map describing the input/output relationship between an argument and the components that use it, if it doesn't yet exist there
-            if(_outgoing_components.find(arg_id) == _outgoing_components.end())
-            {
-                _outgoing_components.emplace(arg_id, ComponentList{});
-                _incoming_components.emplace(arg_id, ComponentList{});
-            }
-
-            // If it's an input argument, connect any other component that has it as output with this component
-            // Additionally, record this component as one that treats this argument as "Input" (in _outgoing_components)
-            // This is used so that we keep track of whether two components use the same argument, one as input and one as output
-            if(arg_io == SharedVarIO::Input)
-            {
-                for(const auto &prev_component : _incoming_components[arg_id])
-                {
-                    _component_graph[prev_component].push_back(component_id);
-                }
-
-                _outgoing_components[arg_id].push_back(component_id);
-            }
-            // If it's an output argument, connect this component with any other component that has it as input
-            // Additionally, record this component as one that treats this argument as "Output" (in _incoming_components)
-            else
-            {
-                if(component->get_component_type() == ComponentType::Store)
-                {
-                    // The output of the Store component is the destination of the whole kernel
-                    ARM_COMPUTE_ERROR_ON_MSG(_dst_id >= 0, "Trying to add more than one dst argument to the graph");
-                    _dst_id = arg_id;
-                }
-
-                for(const auto &subseq_component : _outgoing_components[arg_id])
-                {
-                    _component_graph[component_id].push_back(subseq_component);
-                }
-
-                _incoming_components[arg_id].push_back(component_id);
-            }
-        }
-
-        ARM_COMPUTE_ERROR_ON_MSG(_graph.get_root_ops().size() != 1, "Trying to add more than one root to the graph");
-
-        // Finally, add this component to the dictionary of components
-        _components.insert(std::make_pair(component_id, std::move(component)));
-    }
-
-    /** Build the kernel name by concatenating the component names in topological order
-     *
-     * "___" is appended after a component name while more than two components (that one included) are still
-     * left to visit, i.e. no separator is placed between the last two names.
-     */
-    std::string build_kernel_name() const
-    {
-        std::string name = "";
-
-        traverse([&](std::stack<ComponentID> stack)
-        {
-            name += _components.find(stack.top())->second->name() + (stack.size() > 2 ? "___" : "");
-        });
-
-        return name;
-    }
-
-    /** Generate the complete kernel source: headers, macros, signature, global section and component codes
-     *
-     * @return The kernel source code
-     */
-    std::string build_code()
-    {
-        ARM_COMPUTE_ERROR_ON_MSG(_graph_root == -1, "No root found in the component graph");
-
-        // These data structures will hold the data from all the components in the blueprint
-        std::set<std::string>    headers_list{};
-        std::set<std::string>    additional_macros{};
-        std::vector<std::string> component_codes{}; // vector because order matters
-
-        // Step 1: Allocate all kernel argument shared variables before generating the component code
-        auto stack = topological_sort();
-        while(!stack.empty())
-        {
-            auto  curr_component_id = stack.top();
-            auto &curr_component    = _components.find(curr_component_id)->second;
-
-            curr_component->allocate_shared_vars(_vtable);
-
-            stack.pop();
-        }
-        // Step 2: Generate component codes
-        stack = topological_sort();
-        while(!stack.empty())
-        {
-            auto  curr_component_id = stack.top();
-            auto &curr_component    = _components.find(curr_component_id)->second;
-
-            auto       curr_headers_list      = curr_component->get_headers_list();
-            auto       curr_additional_macros = curr_component->get_additional_macros();
-            auto       curr_component_code    = curr_component->get_component_code();
-            const auto var_lut                = curr_component->get_tag_lut(_vtable); // Ideally can be merged with get_component_code once we have finer-grained code generation technique
-            component_codes.push_back(IClKernelComponent::replace_tags(curr_component_code, var_lut));
-
-            headers_list.insert(curr_headers_list.begin(), curr_headers_list.end());
-            if(!curr_additional_macros.empty()) // Some components might not have any
-            {
-                additional_macros.insert(IClKernelComponent::replace_tags(curr_additional_macros, var_lut));
-            }
-
-            stack.pop();
-        }
-
-        // Step 3: Assemble the data gathered by traversing the graph into the string "code"
-        std::string code = "";
-
-        for(const auto &header : headers_list)
-        {
-#if defined(EMBEDDED_KERNELS)
-            code += CLKernelLibrary::get().get_program(header).first;
-#else  // defined(EMBEDDED_KERNELS)
-            code += "#include \"" + header + "\"\n";
-#endif // defined(EMBEDDED_KERNELS)
-        }
-
-        for(const auto &macros : additional_macros)
-        {
-            code += macros;
-        }
-
-        code += generate_kernel_signature(_vtable.get_kernel_arguments());
-
-        code += "\n{\n\n";
-
-        code += " //------------------ START KERNEL_BUILDER_COORDINATE ---------------------\n\n";
-        code += generate_global_section();
-        code += " //------------------ END KERNEL_BUILDER_COORDINATE ---------------------\n";
-
-        for(const auto &component_code : component_codes)
-        {
-            code += component_code;
-        }
-
-        code += "}\n";
-
-        return code;
-    }
-
-    /** Generate config id of the entire kernel
-     *
-     * Format: kernel_name--comp0_config_id----comp1_config_id--...
-     * (each component id is wrapped in a leading and a trailing "--")
-     *
-     * @return std::string
-     */
-    std::string build_config_id() const
-    {
-        std::string config_id = build_kernel_name();
-        traverse([&](std::stack<ComponentID> stack)
-        {
-            config_id += "--" + _components.find(stack.top())->second->generate_config_id() + "--";
-        });
-
-        return config_id;
-    }
-
-    /** Collect the build options of all the components, in topological order */
-    CLBuildOptions build_options() const
-    {
-        CLBuildOptions build_opts{};
-
-        traverse([&](std::stack<ComponentID> stack)
-        {
-            build_opts.add_options(_components.find(stack.top())->second->generate_build_options().options());
-        });
-
-        return build_opts;
-    }
-
-    /** Tile descriptor set through set_tile_info() */
-    TileDescriptor get_tile_info() const
-    {
-        return _tile_info;
-    }
-
-    // Get the global execution window, i.e. that of the root component
-    Window get_execution_window() const
-    {
-        ARM_COMPUTE_ERROR_ON_MSG(_graph_root == -1, "No root found in the component graph");
-        ARM_COMPUTE_ERROR_ON_MSG(_dst_id == -1, "Destination Tensor Id should be ready before calling get_execution_window()");
-
-        return _components.find(_graph_root)->second->get_window();
-    }
-
-    /** Id of the destination argument, or -1 if no Store component has been added yet */
-    ArgumentID get_dst_id() const
-    {
-        return _dst_id;
-    }
-
-    /** Runtime descriptors of all the kernel arguments, indexed by their argument id */
-    ClKernelArgList get_arguments() const
-    {
-        ClKernelArgList arg_list{};
-        for(const auto &arg_var : _vtable.get_kernel_arguments().get_all_vars())
-        {
-            arg_list[arg_var.desc.arg_id] = arg_var.desc;
-        }
-        return arg_list;
-    }
-
-    /** Get the arguments as shared vars from the vtable
-     *
-     * @return SharedVarTable::Arguments
-     */
-    SharedVarTable::Arguments get_argument_shared_vars() const
-    {
-        return _vtable.get_kernel_arguments();
-    }
-
-    /** Info of the tensor added under @p id, or nullptr if there is none */
-    const ITensorInfo *get_kernel_argument_info(const ArgumentID id) const
-    {
-        auto it = _kernel_tensors.find(id);
-        if(it != _kernel_tensors.end())
-        {
-            return it->second;
-        }
-        return nullptr;
-    }
-
-    /** Info of the tensor added under @p id, or nullptr if there is none */
-    ITensorInfo *get_kernel_argument_info(const ArgumentID id)
-    {
-        auto it = _kernel_tensors.find(id);
-        if(it != _kernel_tensors.end())
-        {
-            return it->second;
-        }
-        return nullptr;
-    }
-    /** Finalize graph construction. Graph is expected to not mutate after being finalized
-     */
-    void finalize()
-    {
-        cache_root_component();
-        assign_shared_var_group();
-    }
-
-    /** Copy of the dependency graph */
-    DependencyGraph get_graph() const
-    {
-        return _graph;
-    }
-
-private:
-    /** Cache the id of the single root operator of the dependency graph in _graph_root */
-    void cache_root_component()
-    {
-        const auto roots = _graph.get_root_ops();
-        ARM_COMPUTE_ERROR_ON_MSG(roots.size() != 1, "Trying to add more than one root to the graph");
-        _graph_root = roots.at(0);
-    }
-    /** Assign the group for each shared var. Can only be performed at the end of the graph construction, before building
-     *
-     * Tensors the dependency graph reports as source or destination become Argument, all the others Automatic
-     */
-    void assign_shared_var_group()
-    {
-        for(const auto &tensor : _kernel_tensors)
-        {
-            const auto tensor_id = tensor.first;
-            if(_graph.is_src_tensor(tensor_id) || _graph.is_dst_tensor(tensor_id))
-            {
-                _shared_var_group_lut[tensor_id] = SharedVarGroup::Argument;
-            }
-            else
-            {
-                _shared_var_group_lut[tensor_id] = SharedVarGroup::Automatic;
-            }
-        }
-    }
-
-    /** Depth-first visit used by topological_sort(): push @p component_id after all its unvisited successors */
-    void topological_sort_utility(ComponentID component_id, std::unordered_set<ComponentID> &visited, std::stack<ComponentID> &stack) const
-    {
-        visited.insert(component_id);
-
-        for(auto connected_component : _component_graph.find(component_id)->second)
-        {
-            if(visited.find(connected_component) == visited.end())
-            {
-                topological_sort_utility(connected_component, visited, stack);
-            }
-        }
-
-        stack.push(component_id);
-    }
-
-    /** Topologically sort the components reachable from the root. The top of the returned stack comes first */
-    std::stack<ComponentID> topological_sort() const
-    {
-        std::stack<ComponentID>         stack{};
-        std::unordered_set<ComponentID> visited{};
-
-        topological_sort_utility(_graph_root, visited, stack);
-
-        return stack;
-    }
-
-    /** Call @p func once per component, in topological order
-     *
-     * @param[in] func Callback receiving the stack of components still to visit; its top is the current component
-     */
-    void traverse(const std::function<void(std::stack<ComponentID>)> &func) const
-    {
-        std::stack<ComponentID> stack = topological_sort();
-
-        while(!stack.empty())
-        {
-            func(stack);
-            stack.pop();
-        }
-    }
-
-    /** Generate the declaration of a single kernel argument, according to its tensor argument type
-     *
-     * @param[in] var Shared variable of the Argument group
-     *
-     * @return The declaration, starting with a new line and without a trailing separator
-     */
-    std::string generate_argument_declaration(const SharedVarTable::SharedVar &var) const
-    {
-        ARM_COMPUTE_ERROR_ON_MSG(var.group != SharedVarGroup::Argument, "An argument declaration can only be generated from a kernel argument");
-        std::string code;
-        switch(var.desc.tensor_arg_type)
-        {
-            case ClKernelTensorArgType::Vector:
-            {
-                code += "\n VECTOR_DECLARATION(" + var.uniq_name + ")";
-                break;
-            }
-            case ClKernelTensorArgType::Image:
-            {
-                code += "\n IMAGE_DECLARATION(" + var.uniq_name + ")";
-                break;
-            }
-            case ClKernelTensorArgType::Image_3D:
-            {
-                code += "\n IMAGE_DECLARATION(" + var.uniq_name + "),";
-                code += "\n uint " + var.uniq_name + "_stride_z";
-                break;
-            }
-            case ClKernelTensorArgType::Image_3D_Export_To_ClImage2D:
-            {
-                code += "\n __read_only image2d_t " + var.uniq_name + "_img,";
-                code += "\n uint " + var.uniq_name + "_stride_z";
-                break;
-            }
-            case ClKernelTensorArgType::Tensor_4D_t_Buffer:
-            {
-                code += "\n TENSOR4D_T(" + var.uniq_name + ", BUFFER)";
-                break;
-            }
-            case ClKernelTensorArgType::Tensor_4D_t_Image:
-            {
-                code += "\n TENSOR4D_T(" + var.uniq_name + ", IMAGE)";
-                break;
-            }
-            default:
-            {
-                ARM_COMPUTE_ERROR("Unsupported declaration generation for ClKernelTensorArgType");
-            }
-        }
-        return code;
-    }
-
-    /** Generate the kernel signature: "__kernel void <name>(<comma separated argument declarations>)"
-     *
-     * @param[in] argument_list Kernel arguments, declared in the order they are stored. May be empty,
-     *                          in which case an empty parameter list "()" is generated
-     */
-    std::string generate_kernel_signature(const SharedVarTable::Arguments &argument_list) const
-    {
-        std::string code = "\n__kernel void " + build_kernel_name() + "(";
-
-        // Put the separator in front of every declaration but the first, so that the opening
-        // parenthesis is never overwritten when there are no arguments at all
-        bool is_first_arg = true;
-        for(const auto &arg : argument_list.get_all_vars())
-        {
-            if(!is_first_arg)
-            {
-                code += ",";
-            }
-            code += generate_argument_declaration(arg);
-            is_first_arg = false;
-        }
-
-        code += ")";
-
-        return code;
-    }
-
-    /** Generate the code at the start of the kernel body: the cout/mout/bout indices and the clipping conditions
-     *
-     * The tile size is taken from the x and y steps of the execution window (at least 1 each).
-     */
-    std::string generate_global_section() const
-    {
-        const auto dst_info = get_kernel_argument_info(_dst_id);
-        ARM_COMPUTE_ERROR_ON_MSG(dst_info == nullptr, "Destination tensor info should be ready before generating the global section");
-        const auto dst_w = dst_info->dimension(0);
-        // Query the execution window once: it is recomputed by the root component on every call
-        const Window exec_window = get_execution_window();
-        const auto   tile_w      = std::max(1, exec_window.x().step());
-        const auto   tile_h      = std::max(1, exec_window.y().step());
-        const auto   leftover_w  = dst_w % tile_w;
-
-        std::string code = "";
-        code += std::string(" int cout = GET_SPATIAL_IDX(0, ") + std::to_string(tile_w) + ", " + std::to_string(leftover_w) + ");\n";
-        code += std::string(" int mout = GET_SPATIAL_IDX(1, ") + std::to_string(tile_h) + ", " + "0);\n";
-        code += std::string(" int bout = GET_SPATIAL_IDX(2, 1, 0);\n\n");
-
-        switch(_tile_info.clipping)
-        {
-            case ClippingStrategy::TOP_LEFT:
-                code += " const bool g_cond_x = (cout == 0);\n";
-                code += " const bool g_cond_y = (mout == 0);\n";
-                break;
-            case ClippingStrategy::TOP_RIGHT:
-                code += " const bool g_cond_x = ((cout + 1) * " + std::to_string(tile_w) + " >= " + std::to_string(_tile_info.boundaries.x()) + ");\n";
-                code += " const bool g_cond_y = (mout == 0);\n";
-                break;
-            case ClippingStrategy::BOTTOM_LEFT:
-                code += " const bool g_cond_x = (cout == 0);\n";
-                code += " const bool g_cond_y = ((mout + 1) * " + std::to_string(tile_h) + " >= " + std::to_string(_tile_info.boundaries.y()) + ");\n";
-                break;
-            case ClippingStrategy::BOTTOM_RIGHT:
-                code += " const bool g_cond_x = ((cout + 1) * " + std::to_string(tile_w) + " >= " + std::to_string(_tile_info.boundaries.x()) + ");\n";
-                code += " const bool g_cond_y = ((mout + 1) * " + std::to_string(tile_h) + " >= " + std::to_string(_tile_info.boundaries.y()) + ");\n";
-                break;
-            default:
-                ARM_COMPUTE_ERROR("Unsupported clipping strategy");
-        }
-
-        return code;
-    }
-
-    TileDescriptor _tile_info{};
-
-    // Number of Complex components added so far (at most one is supported)
-    int32_t _num_complex_components{};
-
-    ArgumentID _dst_id{ -1 }; // Initially set to -1, which means the graph has no dst yet, since node IDs are positive numbers
-
-    DependencyGraph _graph{};
-
-    // Tensors, components and IDs with corresponding ptrs (except intermediate)
-    std::unordered_map<ComponentID, ComponentUniquePtr> _components{};
-    std::unordered_map<ArgumentID, ITensorInfo *>       _kernel_tensors{};
-    // Argument group lookup. Can be replaced by extending the ArgumentID type to include group info
-    std::unordered_map<ArgumentID, SharedVarGroup> _shared_var_group_lut{};
-
-    // Tracks all variables (e.g.: kernel arguments, kernel "global variables")
-    SharedVarTable _vtable{};
-
-    // Component directed graph (represented by an adjacency list of Component IDs)
-    // This is used to understand the ordering and bindings between components when generating the kernel
-    // The root is initially set to -1 which means the graph has no root yet, since node IDs are positive numbers
-    ComponentID _graph_root{ -1 };
-    std::unordered_map<ComponentID, ComponentList> _component_graph{};
-
-    // Additional data structures used to define the relationships between components and arguments
-    // For each argument, it contains the list of components that consider it as an incoming or an outgoing argument
-    // E.g. tensor0 -> component0 -> tensor1
-    // _outgoing_components[tensor0] == {component0} (component0 is the outgoing component of tensor0. Component0 treats tensor0 as an input tensor)
-    // _incoming_components[tensor1] == {component0} (component0 is the incoming component of tensor1. Component0 treats tensor1 as an output tensor)
-    std::unordered_map<ArgumentID, ComponentList> _outgoing_components{};
-    std::unordered_map<ArgumentID, ComponentList> _incoming_components{};
-};
-
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMMON_H
-#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Utils.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Utils.h
deleted file mode 100644
index 1b10050559..0000000000
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Utils.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
-
-#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_UTILS
-#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_UTILS
-
-#include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-/** Stream every option of @p build_opts to @p os, each one followed by a comma */
-inline ::std::ostream &operator<<(::std::ostream &os, const CLBuildOptions::StringSet &build_opts)
-{
-    for(auto opt_it = build_opts.begin(); opt_it != build_opts.end(); ++opt_it)
-    {
-        os << *opt_it << ",";
-    }
-    return os;
-}
-/** Stream the option set of @p cl_build_opts to @p os */
-inline ::std::ostream &operator<<(::std::ostream &os, const CLBuildOptions &cl_build_opts)
-{
-    return os << cl_build_opts.options();
-}
-
-/** String form of @p cl_build_opts, identical to what the stream operator writes */
-inline std::string to_string(const CLBuildOptions &cl_build_opts)
-{
-    std::stringstream buffer;
-    buffer << cl_build_opts;
-    return buffer.str();
-}
-/** Stream the name, code and build options of @p code to @p os, one labelled line each */
-inline ::std::ostream &operator<<(::std::ostream &os, const ClKernelCode &code)
-{
-    os << "name: " << code.name << std::endl
-       << "code: " << code.code << std::endl
-       << "build_opts: " << code.build_options << std::endl;
-    return os;
-}
-/** String form of @p code, identical to what the stream operator writes */
-inline std::string to_string(const ClKernelCode &code)
-{
-    std::stringstream buffer;
-    buffer << code;
-    return buffer.str();
-}
-
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-
-#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_UTILS
-#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp
deleted file mode 100644
index 811cd79811..0000000000
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.cpp
+++ /dev/null
@@ -1,409 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
-
-#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h"
-
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/CL/ICLKernel.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
-
-#include "arm_compute/runtime/CL/CLScheduler.h"
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-/** Direct convolution is the Complex component of a blueprint */
-ComponentType ClDirectConvolutionKernelComponent::get_component_type() const
-{
-    const ComponentType type = ComponentType::Complex;
-    return type;
-}
-
-/** Headers the component code relies on */
-std::set<std::string> ClDirectConvolutionKernelComponent::get_headers_list() const
-{
-    std::set<std::string> headers{};
-    headers.insert("helpers.h");
-    headers.insert("tile_helpers.h");
-    return headers;
-}
-
-/** Compute the execution window of the direct convolution
- *
- * The destination info is auto-initialised from the computed output shape if it is still empty.
- * The Y dimension of the window covers output dimensions 1 and 2 collapsed together (rounded up to a multiple
- * of the row step); the Z dimension covers all the dimensions from 3 upwards.
- *
- * @return The execution window
- */
-Window ClDirectConvolutionKernelComponent::get_window() const
-{
-    auto      &blueprint_impl = _blueprint->impl();
-    const auto src_info       = blueprint_impl.get_kernel_argument_info(_src.arg_id);
-    const auto weight_info    = blueprint_impl.get_kernel_argument_info(_weight.arg_id);
-    auto       dst_info       = blueprint_impl.get_kernel_argument_info(blueprint_impl.get_dst_id());
-
-    // Get dst shape
-    const auto   &conv = _desc.conv2d;
-    PadStrideInfo pad_stride_info
-    {
-        static_cast<unsigned int>(conv.stride.x()),
-        static_cast<unsigned int>(conv.stride.y()),
-        static_cast<unsigned int>(conv.pad.left),
-        static_cast<unsigned int>(conv.pad.right),
-        static_cast<unsigned int>(conv.pad.top),
-        static_cast<unsigned int>(conv.pad.bottom),
-        DimensionRoundingType::FLOOR /*default rounding type*/
-    };
-    TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src_info, *weight_info, pad_stride_info);
-
-    // Output auto initialization if not yet initialized
-    auto_init_if_empty(*dst_info, output_shape, 1, src_info->data_type(), src_info->quantization_info());
-
-    // Steps: up to 4 elements along dimension 0; several rows only when dimension 0 is larger than 16
-    const auto         dst_dim0 = dst_info->tensor_shape()[0];
-    const unsigned int vec_size = std::min(static_cast<unsigned int>(dst_dim0), 4u);
-    unsigned int       num_rows = 1U;
-    if(dst_dim0 > 16)
-    {
-        num_rows = (src_info->data_type() == DataType::F32) ? 2U : 4U;
-    }
-
-    // Create and configure kernel window
-    Window win = calculate_max_window(output_shape, Steps(vec_size, num_rows));
-
-    const size_t dim_y_collapsed = ceil_to_multiple(output_shape[1] * output_shape[2], num_rows);
-    win.set(Window::DimY, Window::Dimension(0, dim_y_collapsed, num_rows));
-    win.set(Window::DimZ, Window::Dimension(0, output_shape.total_size_upper(3), 1));
-
-    return win;
-}
-
-/** This component does not need any additional macro */
-std::string ClDirectConvolutionKernelComponent::get_additional_macros() const
-{
-    return ""; // no macros
-}
-
-/** Generate the code template of the direct convolution
- *
- * The template is assembled from fixed pieces: the main accumulation loop over the weight elements and the
- * source channels (K0 channels at a time), a left-over loop emitted only when the number of source channels
- * is not a multiple of K0, and the bias addition emitted only when a bias tensor is present.
- *
- * @note Only NHWC data layout is supported.
- * @note The linear weight index i is decomposed as xk = i % width, yk = i / width. Dividing by the height
- *       instead is only equivalent for square kernels.
- *
- * @return The code template, with "{{tag}}" placeholders still to be replaced
- */
-std::string ClDirectConvolutionKernelComponent::get_component_code() const
-{
-    const auto src_info  = _blueprint->impl().get_kernel_argument_info(_src.arg_id);
-    const auto bias_info = _blueprint->impl().get_kernel_argument_info(_bias.arg_id);
-
-    ARM_COMPUTE_ERROR_ON_MSG(src_info->data_layout() != DataLayout::NHWC, "Only NHWC data layout is supported by this component.");
-
-    const auto channel_idx   = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::CHANNEL);
-    const auto k0            = adjust_vec_size(is_data_type_quantized(src_info->data_type()) ? 16u : 8u, src_info->dimension(channel_idx));
-    const bool leftover_loop = (src_info->dimension(channel_idx) % k0) != 0;
-
-    std::string code = R"_(
- //------------------ START KERNEL {{meta_kernel_id}} ---------------------
- // IN_0(src) {{src}}
- // IN_1(wei) {{weight}}
- )_";
-    if(bias_info != nullptr)
-    {
-        code += R"_(
- // IN_1(bia) {{bias}}
- )_";
-    }
-    code += R"_(
- // OUT(dst, accum) {{dst}}
-
- // Initialize the accumulators
- TILE({{ACC_DATA_TYPE}}, M0, N0, {{dst}});
- {
- // All the tensor dimensions are passed at compile time.
- // In case of dynamic tensor support, the following dimensions should be passed as function argument.
- #define _IWEI_WIDTH {{WEI_WIDTH}}
- #define _IWEI_HEIGHT {{WEI_HEIGHT}}
- #define _ISRC_WIDTH {{src}}_w
- #define _ISRC_HEIGHT {{src}}_h
- #define _ISRC_CHANNELS {{src}}_c
- #define _IDST_WIDTH {{arg_dst}}_w
- #define _IDST_HEIGHT {{arg_dst}}_h
- #define _IDST_CHANNELS {{arg_dst}}_c
- #define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT)
-
- // .v = access the whole vector (OpenCL vector)
- // .s[x] = access the vector element at position x (scalar access)
- TILE(int, M0, 1, xi);
- TILE(int, M0, 1, yi);
-
- // Convert the linear index to coordinate
- LOOP_UNROLLING(int, i, 0, 1, M0,
- {
- xi[i].v = ((mout + i) % _IDST_WIDTH) * {{STRIDE_X}};
- yi[i].v = ((mout + i) / _IDST_WIDTH) * {{STRIDE_Y}};
- xi[i].v -= {{PAD_LEFT}};
- yi[i].v -= {{PAD_TOP}};
- })
-
- LOOP_UNROLLING(int, i, 0, 1, M0,
- {
- {{dst}}[i].v = 0;
- })
-
- for(int i = 0; i < (_IWEI_WIDTH * _IWEI_HEIGHT); ++i)
- {
- int ck = 0;
- int xk = i % _IWEI_WIDTH;
- int yk = i / _IWEI_WIDTH;
-
- int k = 0;
- for(; k <= (_ISRC_CHANNELS - K0); k += K0)
- {
- TILE({{SRC_DATA_TYPE}}, M0, K0, a);
- TILE({{WEI_DATA_TYPE}}, N0, K0, b);
-
- LOOP_UNROLLING(int, i, 0, 1, M0,
- {
- a[i].v = {{ZERO_VALUE}};
- })
-
- // Load tile from the src tensor
- T_LOAD_NHWC_INDIRECT({{SRC_DATA_TYPE}}, M0, K0, {{SRC_TENSOR_TYPE}}, {{src}}, bout, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, {{src}}_stride_y, xi, yi, a);
-
- // Load tile from the weights tensor
- T_LOAD({{WEI_DATA_TYPE}}, N0, K0, {{WEI_TENSOR_TYPE}}, {{weight}}, ck, cout * _IY_MULTIPLIER + i, _IY_MULTIPLIER, {{weight}}_stride_y, b);
-
- // Compute the matrix multiplication between two tiles
- T_MMUL({{SRC_DATA_TYPE}}, {{WEI_DATA_TYPE}}, {{ACC_DATA_TYPE}}, M0, N0, K0, NT, T, a, b, {{dst}});
-
- ck += K0;
- }
-
- // We voluntarily use SRC_CHANNELS rather than _DSRC_CHANNELS
- // This #if directive should be removed in case of dynamic tensor support
- )_";
-
-    if(leftover_loop)
-    {
-        code += R"_(
- // Left-over accumulations
- for(; k < _ISRC_CHANNELS; ++k)
- {
- TILE({{SRC_DATA_TYPE}}, M0, 1, a);
- TILE({{WEI_DATA_TYPE}}, N0, 1, b);
-
- LOOP_UNROLLING(int, i, 0, 1, M0,
- {
- a[i].v = {{ZERO_VALUE}};
- })
-
- // Load tile from the src tensor
- T_LOAD_NHWC_INDIRECT({{SRC_DATA_TYPE}}, M0, 1, {{SRC_TENSOR_TYPE}}, {{src}}, bout, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, {{src}}_stride_y, xi, yi, a);
-
- // Load tile from the weights tensor
- // The T_LOAD for the left-over elements can only use BUFFER because we load one element per iteration
- T_LOAD({{WEI_DATA_TYPE}}, N0, 1, BUFFER, {{weight}}, ck, cout * _IY_MULTIPLIER + i, _IY_MULTIPLIER, {{weight}}_stride_y, b);
-
- // Compute the matrix multiplication between two tiles
- T_MMUL({{SRC_DATA_TYPE}}, {{WEI_DATA_TYPE}}, {{ACC_DATA_TYPE}}, M0, N0, 1, NT, T, a, b, {{dst}});
-
- ++ck;
- }
- )_";
-    }
-
-    // Undefine exactly the macros defined at the top of the template so that none of them leaks
-    code += R"_(
- #undef _IWEI_WIDTH
- #undef _IWEI_HEIGHT
- #undef _ISRC_WIDTH
- #undef _ISRC_HEIGHT
- #undef _ISRC_CHANNELS
- #undef _IDST_WIDTH
- #undef _IDST_HEIGHT
- #undef _IDST_CHANNELS
- #undef _IY_MULTIPLIER
-
- }
- )_";
-
-    if(bias_info != nullptr)
-    {
-        code += R"_(
- TILE({{BIA_DATA_TYPE}}, 1, N0, bias0);
-
- T_LOAD({{BIA_DATA_TYPE}}, 1, N0, BUFFER, {{bias}}, cout, 0, 1, 0, bias0);
-
- // c = c + bias[broadcasted]
- T_ELTWISE_BROADCAST_ADD_X({{ACC_DATA_TYPE}}, M0, N0, {{dst}}, bias0, {{dst}});
- )_";
-    }
-
-    code += R"_(
- }
-//------------------ END KERNEL {{meta_kernel_id}} ---------------------
- )_";
-    return code;
-}
-
-bool export_to_cl_image_support(const ITensorInfo *tensor, GPUTarget gpu_target, DataLayout data_layout) // Returns true only if `tensor` passes every check below; the first failing check returns false
-{
- if(tensor->tensor_shape()[0] % 4 || (data_layout != DataLayout::NHWC)) // Dimension 0 must be a multiple of 4 and the layout must be NHWC
- {
- return false;
- }
-
- // Only floating point data types are accepted
- if(!is_data_type_float(tensor->data_type()))
- {
- return false;
- }
-
- if(gpu_target == GPUTarget::G71 || get_arch_from_target(gpu_target) == GPUTarget::MIDGARD) // G71 and Midgard-architecture targets are rejected
- {
- return false;
- }
-
- // Check if the cl_khr_image2d_from_buffer extension is supported on the target platform
- if(!image2d_from_buffer_supported(CLKernelLibrary::get().get_device()))
- {
- return false;
- }
-
- // A reported cl image pitch alignment of 0 is treated as unsupported
- if(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0)
- {
- return false;
- }
-
- const size_t image_w = tensor->tensor_shape()[0] / 4; // Image width: dimension 0 divided by 4
- const size_t image_h = tensor->tensor_shape()[1] * tensor->tensor_shape()[2] * tensor->tensor_shape()[3]; // Image height: product of dimensions 1, 2 and 3
- const size_t max_image_w = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>();
- const size_t max_image_h = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>();
-
- if(image_w > max_image_w || image_h > max_image_h) // The image must not exceed the device's maximum 2D image width/height
- {
- return false;
- }
-
- return true;
-}
-
-CLBuildOptions ClDirectConvolutionKernelComponent::generate_build_options() const // Builds the compile options: fast-math flag, IS_TILED and the N0/M0/K0/PARTIAL_N0 defines
-{
- const auto src_info = _blueprint->impl().get_kernel_argument_info(_src.arg_id);
- auto weight_info = _blueprint->impl().get_kernel_argument_info(_weight.arg_id); // Passed to update_padding_for_cl_image() below
- const auto dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
- // const auto tile_info = _blueprint->impl().get_tile_info();
-
- const unsigned int channel_idx = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::CHANNEL);
- const DataType data_type = src_info->data_type();
- const GPUTarget gpu_target = CLScheduler::get().target();
-
- const unsigned int n0 = _blueprint->impl().get_execution_window().x().step(); // N0: step of the execution window along x
- const unsigned int m0 = _blueprint->impl().get_execution_window().y().step(); // M0: step of the execution window along y
- const unsigned int k0 = adjust_vec_size(is_data_type_quantized(data_type) ? 16u : 8u, src_info->dimension(channel_idx)); // K0: 16 (quantized) or 8, passed through adjust_vec_size() with the src channel dimension
- const unsigned int partial_store_n0 = dst_info->dimension(0) % n0; // Remainder of dst dimension 0 divided by N0
- const bool export_to_cl_image = export_to_cl_image_support(weight_info, gpu_target, src_info->data_layout());
-
- // Update the padding for the weights tensor if we can export to cl_image
- if(export_to_cl_image)
- {
- arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(weight_info);
- }
-
- CLBuildOptions build_opts{};
- build_opts.add_option("-cl-fast-relaxed-math");
- build_opts.add_option("-DIS_TILED");
- build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
- build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
- build_opts.add_option("-DK0=" + support::cpp11::to_string(k0));
- build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));
-
- return build_opts;
-}
-
-void ClDirectConvolutionKernelComponent::allocate_shared_vars(SharedVarTable &vtable) const // Adds src, weight, optional bias and dst to the shared variable table
-{
- const auto src_info = _blueprint->impl().get_kernel_argument_info(_src.arg_id);
- const auto weight_info = _blueprint->impl().get_kernel_argument_info(_weight.arg_id);
-
- vtable.add(_src, _blueprint->impl().group(_src.arg_id), ClKernelArgDescriptor(_src.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "src");
-
- const GPUTarget gpu_target = CLScheduler::get().target();
- const bool export_to_cl_image = export_to_cl_image_support(weight_info, gpu_target, src_info->data_layout());
- const ClKernelTensorArgType weight_type = export_to_cl_image ? ClKernelTensorArgType::Tensor_4D_t_Image : ClKernelTensorArgType::Tensor_4D_t_Buffer; // Image argument when the weights can be exported to cl_image, buffer otherwise
- vtable.add(_weight, _blueprint->impl().group(_weight.arg_id), ClKernelArgDescriptor(_weight.arg_id, weight_type), "weight");
-
- if(!_bias.is_empty()) // optional bias, added as a Vector argument
- {
- vtable.add(_bias, _blueprint->impl().group(_bias.arg_id), ClKernelArgDescriptor(_bias.arg_id, ClKernelTensorArgType::Vector), "bias");
- }
- vtable.add(_dst, _blueprint->impl().group(_dst.arg_id), ClKernelArgDescriptor(_dst.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "dst");
-}
-
-ClDirectConvolutionKernelComponent::TagLUT ClDirectConvolutionKernelComponent::get_tag_lut(const SharedVarTable &vtable) const // Fills the tag lookup table; keys presumably match the {{tag}} placeholders of the component code (TODO confirm)
-{
- TagLUT lut{};
-
- const auto src_info = _blueprint->impl().get_kernel_argument_info(_src.arg_id);
- const auto weight_info = _blueprint->impl().get_kernel_argument_info(_weight.arg_id);
- const auto bias_info = _blueprint->impl().get_kernel_argument_info(_bias.arg_id); // Only dereferenced below when the bias link is not empty
-
- // Arguments and global shared variables
- lut["src"] = vtable.get(_src);
- lut["weight"] = vtable.get(_weight);
-
- if(!_bias.is_empty()) // optional bias
- {
- lut["bias"] = vtable.get(_bias);
- lut["BIA_DATA_TYPE"] = get_cl_type_from_data_type(bias_info->data_type());
- }
- lut["dst"] = vtable.get(_dst);
-
- const auto dst_argument = _blueprint->impl().get_argument_shared_vars().get_dst_var();
- lut["arg_dst"] = dst_argument.uniq_name;
-
- // Local build options
- lut["meta_kernel_id"] = id();
- lut["ACC_DATA_TYPE"] = src_info->data_type(); // The accumulator uses the same data type as src
- lut["SRC_DATA_TYPE"] = src_info->data_type();
- lut["WEI_DATA_TYPE"] = weight_info->data_type();
-
- lut["SRC_TENSOR_TYPE"] = "BUFFER";
- switch(vtable.get(_weight).desc.tensor_arg_type) // Weights are tagged IMAGE for the three image argument types below, BUFFER otherwise
- {
- case ClKernelTensorArgType::Image_Export_To_ClImage2D:
- case ClKernelTensorArgType::Image_3D_Export_To_ClImage2D:
- case ClKernelTensorArgType::Tensor_4D_t_Image:
- {
- lut["WEI_TENSOR_TYPE"] = "IMAGE";
- break;
- }
- default:
- {
- lut["WEI_TENSOR_TYPE"] = "BUFFER";
- break;
- }
- }
- const auto width_idx = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::WIDTH); // Width/height indices are taken from the src data layout
- const auto height_idx = get_data_layout_dimension_index(src_info->data_layout(), DataLayoutDimension::HEIGHT);
- lut["WEI_WIDTH"] = weight_info->dimension(width_idx);
- lut["WEI_HEIGHT"] = weight_info->dimension(height_idx);
-
- lut["STRIDE_X"] = _desc.conv2d.stride.x();
- lut["STRIDE_Y"] = _desc.conv2d.stride.y();
-
- lut["PAD_LEFT"] = _desc.conv2d.pad.left;
- lut["PAD_TOP"] = _desc.conv2d.pad.top;
-
- lut["ZERO_VALUE"] = 0;
-
- return lut;
-}
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h
deleted file mode 100644
index 5babdbab51..0000000000
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
-
-#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLDIRECTCONVOLUTIONKERNELCOMPONENT_H
-#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLDIRECTCONVOLUTIONKERNELCOMPONENT_H
-
-#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h"
-
-#include "utils/TypePrinter.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-class ClDirectConvolutionKernelComponent : public IClKernelComponent // Direct convolution kernel component; stores the conv2d descriptor and the src/weight/bias/dst links
-{
-public:
- ClDirectConvolutionKernelComponent(ClKernelBlueprint *blueprint, const ClDirectConv2dKernelDescriptor &desc,
- const Link &src, const Link &weight, const Link &dst, const Link &bias = Link{}) // bias is optional and defaults to a default-constructed Link
- : IClKernelComponent(blueprint), _desc{ desc }, _src{ src }, _weight{ weight }, _bias{ bias }, _dst{ dst }
- {
- }
-
- ComponentType get_component_type() const override;
- std::set<std::string> get_headers_list() const override;
- std::string get_additional_macros() const override;
- std::string get_component_code() const override;
- Window get_window() const override;
- ClKernelArgList get_args();
- CLBuildOptions generate_build_options() const override;
-
- virtual std::vector<Link> get_links() const override // Links are returned in the order src, weight, bias, dst
- {
- return { _src, _weight, _bias, _dst };
- }
-
- virtual TagLUT get_tag_lut(const SharedVarTable &vtable) const override;
- virtual void allocate_shared_vars(SharedVarTable &vtable) const override;
-
- virtual std::string name() const override // "direct_convolution_" + src data layout string + "_" + component id
- {
- return "direct_convolution_" + to_string(_blueprint->impl().get_kernel_argument_info(_src.arg_id)->data_layout()) + "_" + std::to_string(id());
- }
-
-private:
- ClDirectConv2dKernelDescriptor _desc{};
- Link _src{};
- Link _weight{};
- Link _bias{};
- Link _dst{};
-};
-
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-#endif // ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLDIRECTCONVOLUTIONKERNELCOMPONENT_H
-#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseKernelComponent.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseKernelComponent.cpp
deleted file mode 100644
index e2eba68a63..0000000000
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseKernelComponent.cpp
+++ /dev/null
@@ -1,266 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
-
-#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseKernelComponent.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-ComponentType ClElementwiseKernelComponent::get_component_type() const // The elementwise component always reports ComponentType::Simple
-{
- return ComponentType::Simple;
-}
-
-std::set<std::string> ClElementwiseKernelComponent::get_headers_list() const // Returns the fixed set of two header names listed below
-{
- return std::set<std::string> { "common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h", "tile_helpers.h" };
-}
-
-Window ClElementwiseKernelComponent::get_window() const // Computes the execution window from the destination tensor shape with dimensions 1 and 2 collapsed
-{
- const ITensorInfo *lhs_info = _blueprint->impl().get_kernel_argument_info(_lhs.arg_id);
- const ITensorInfo *rhs_info = _blueprint->impl().get_kernel_argument_info(_rhs.arg_id);
- ITensorInfo *dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); // Non-const: passed to auto_init_if_empty() below
-
- ARM_COMPUTE_ERROR_ON_NULLPTR(lhs_info, rhs_info, dst_info);
-
- const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*lhs_info, *rhs_info); // Only the shape (first) of the pair is used
- const TensorShape &out_shape = broadcast_pair.first;
-
- auto_init_if_empty(*dst_info, out_shape, 1, lhs_info->data_type()); // dst gets the broadcast shape and the lhs data type (presumably only when not yet initialised)
-
- TensorShape output_shape = dst_info->tensor_shape();
- // Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) and upper dimensions unchanged
- // This is in line with the collapsing convention used by Conv2d
- output_shape.collapse(2U, 1U);
- const unsigned int vector_size_byte_opencl = 16;
- const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / dst_info->element_size(), dst_info->dimension(0)); // 16 bytes / element size, passed through adjust_vec_size() with dst dimension 0
- Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration));
-
- return win;
-}
-
-std::string ClElementwiseKernelComponent::get_component_code() const // Returns one of two code templates: root (loads lhs and rhs into a new dst tile) or non-root (combines an addend into an existing accumulator tile)
-{
- std::string code; // NOTE(review): never used in this function
- const bool is_root = _blueprint->impl().group(_lhs.arg_id) == SharedVarGroup::Argument && _blueprint->impl().group(_rhs.arg_id) == SharedVarGroup::Argument; // Root: both lhs and rhs are in the Argument group
-
- if(is_root)
- {
- return R"_(
- //------------------ START KERNEL {{meta_kernel_id}} ELTWISE_OP ---------------------
- // IN_0(LHS) {{lhs}}
- // IN_1(RHS) {{rhs}}
- // OUT(dst, accum) {{dst}}
-
- // dst = lhs + rhs (mix-precision, broadcast, boundary aware)
- TILE({{DATA_TYPE}}, M0, N0, {{dst}});
- {
- TILE({{DATA_TYPE}}, M0, N0, lhs_tile);
- TILE({{DATA_TYPE}}, M0, N0, rhs_tile);
-
- // Since mout maps to dimensions 1 (y) and dimension 2 (z) of the input tensor because of the collapsed window, bout maps to dimension 3 (w)
- {{lhs}}_offset_first_element_in_bytes += bout * {{lhs}}_stride_w;
- {{rhs}}_offset_first_element_in_bytes += bout * {{rhs}}_stride_w;
-
- T_LOAD({{DATA_TYPE}}, M0, N0, BUFFER, {{lhs}}, cout, mout, 1, {{lhs}}_stride_y, lhs_tile);
- T_LOAD({{DATA_TYPE}}, {{rhs_m0}}, {{rhs_n0}}, BUFFER, {{rhs}}, {{rhs_start_x}}, {{rhs_start_y}}, 1, {{rhs}}_stride_y, rhs_tile);
-
-#if defined(IS_BROADCAST)
- T_ELTWISE_BROADCAST_{{ELTWISE_OP}}_X({{DATA_TYPE}}, M0, N0, lhs_tile, rhs_tile, {{dst}});
-#else // !defined(IS_BROADCAST)
- T_ELTWISE_{{ELTWISE_OP}}({{DATA_TYPE}}, M0, N0, lhs_tile, rhs_tile, {{dst}});
-#endif // defined(IS_BROADCAST)
-
- }
- //------------------ END KERNEL {{meta_kernel_id}} ELTWISE_OP ---------------------
-)_";
- }
- else
- {
- return R"_(
- //------------------ START KERNEL {{meta_kernel_id}} ELTWISE_OP ---------------------
- // IN_0/Out(Accumulator) {{acc}}
- // IN_1(Addend) {{addend}}
-
- // acc = addend + acc (mix-precision, broadcast, boundary aware)
- {
- TILE({{DATA_TYPE}}, M0, N0, addend_tile);
-
- T_LOAD({{DATA_TYPE}}, {{rhs_m0}}, {{rhs_n0}}, BUFFER, {{addend}}, {{rhs_start_x}}, {{rhs_start_y}}, 1, {{addend}}_stride_y, addend_tile);
-
-#if defined(IS_BROADCAST)
- T_ELTWISE_BROADCAST_{{ELTWISE_OP}}_X({{DATA_TYPE}}, M0, N0, {{acc}}, addend_tile, {{acc}});
-#else // !defined(IS_BROADCAST)
- T_ELTWISE_{{ELTWISE_OP}}({{DATA_TYPE}}, M0, N0, {{acc}}, addend_tile, {{acc}});
-#endif // defined(IS_BROADCAST)
- }
- //------------------ END KERNEL {{meta_kernel_id}} ELTWISE_OP ---------------------
-)_";
- }
-}
-
-CLBuildOptions ClElementwiseKernelComponent::generate_build_options() const // Builds the M0/N0/PARTIAL_N0 defines and adds IS_BROADCAST when the rhs and dst shapes differ
-{
- const auto t_rhs_info = _blueprint->impl().get_kernel_argument_info(_rhs.arg_id);
- const auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
-
- CLBuildOptions build_opts{};
- const auto n0 = _blueprint->impl().get_execution_window().x().step(); // N0: step of the execution window along x
- const auto m0 = _blueprint->impl().get_execution_window().y().step(); // M0: step of the execution window along y
- const unsigned int partial_store_n0 = t_dst_info->dimension(0) % n0; // Remainder of dst dimension 0 divided by N0
- const bool is_broadcast = t_rhs_info->tensor_shape() != t_dst_info->tensor_shape(); // Broadcast is detected by comparing the rhs shape with the dst shape
-
- build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
- build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
- build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));
- build_opts.add_option_if(is_broadcast, "-DIS_BROADCAST");
-
- return build_opts;
-}
-
-std::string ClElementwiseKernelComponent::generate_config_id() const // Builds "<data type>_<dim 0>_<dim 1>_<data layout>" from the destination tensor info
-{
- auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
- std::string config_id{};
- config_id += lower_string(string_from_data_type(t_dst_info->data_type())); // Data type name, passed through lower_string()
- config_id += "_";
- config_id += support::cpp11::to_string(t_dst_info->dimension(0));
- config_id += "_";
- config_id += support::cpp11::to_string(t_dst_info->dimension(1));
- config_id += "_";
- config_id += lower_string(string_from_data_layout(t_dst_info->data_layout())); // Data layout name, passed through lower_string()
- return config_id;
-}
-
-void ClElementwiseKernelComponent::allocate_shared_vars(SharedVarTable &vtable) const // Adds lhs and rhs to the shared variable table, plus dst when this component is the root
-{
- const bool is_root = _blueprint->impl().group(_lhs.arg_id) == SharedVarGroup::Argument && _blueprint->impl().group(_rhs.arg_id) == SharedVarGroup::Argument; // Root: both lhs and rhs are in the Argument group
- vtable.add(_lhs, _blueprint->impl().group(_lhs.arg_id), ClKernelArgDescriptor(_lhs.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "lhs");
- vtable.add(_rhs, _blueprint->impl().group(_rhs.arg_id), ClKernelArgDescriptor(_rhs.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "rhs");
- if(is_root) // dst is only added for the root component
- {
- vtable.add(_dst, _blueprint->impl().group(_dst.arg_id), ClKernelArgDescriptor(_dst.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "dst");
- }
-}
-
-ClElementwiseKernelComponent::TagLUT ClElementwiseKernelComponent::get_tag_lut(const SharedVarTable &vtable) const // Fills the tag lookup table: operand tags, data type, operation name and the rhs_* load parameters
-{
- TagLUT lut{};
- const auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
- ITensorInfo *t_addend_info = nullptr; // Set in both branches below; its shape is compared with dst to detect broadcasting
- // Arguments and global shared variables
- const bool is_root = _blueprint->impl().group(_lhs.arg_id) == SharedVarGroup::Argument && _blueprint->impl().group(_rhs.arg_id) == SharedVarGroup::Argument;
- if(is_root)
- {
- lut["lhs"] = vtable.get(_lhs);
- lut["rhs"] = vtable.get(_rhs);
- lut["dst"] = vtable.get(_dst);
- t_addend_info = _blueprint->impl().get_kernel_argument_info(_rhs.arg_id); // For the root component the rhs is the tensor checked for broadcasting
- }
- else
- {
- // Determine which link is the accumulator: the operand in the Automatic group (lhs is checked first)
- Link accumulator;
- Link addend;
- if(_blueprint->impl().group(_lhs.arg_id) == SharedVarGroup::Automatic)
- {
- accumulator = _lhs;
- addend = _rhs;
- }
- else if(_blueprint->impl().group(_rhs.arg_id) == SharedVarGroup::Automatic)
- {
- accumulator = _rhs;
- addend = _lhs;
- }
- else
- {
- ARM_COMPUTE_ERROR("Invalid elementwise component linking");
- }
- lut["acc"] = vtable.get(accumulator);
- lut["addend"] = vtable.get(addend);
- t_addend_info = _blueprint->impl().get_kernel_argument_info(addend.arg_id);
- }
- // Local build options
- lut["meta_kernel_id"] = id();
- lut["DATA_TYPE"] = get_cl_type_from_data_type(t_dst_info->data_type());
-
- switch(_desc.eltwise.op) // Only DIV and ADD are handled; any other operation raises an error
- {
- case ArithmeticOperation::DIV:
- lut["ELTWISE_OP"] = "DIV";
- break;
- case ArithmeticOperation::ADD:
- lut["ELTWISE_OP"] = "ADD";
- break;
- default:
- ARM_COMPUTE_ERROR("Arithmetic Operation not supported");
- }
-
- // Set broadcast parameters
- // PRE: All tensors are broadcast-compatible
- const bool is_broadcast = t_addend_info->tensor_shape() != t_dst_info->tensor_shape();
- if(is_broadcast)
- {
- // Note that n0 maps to input tensor dimension 0, m0 maps to input dimensions 1 and 2 because of our collapse strategy
- if(t_addend_info->dimension(0) == 1U && t_addend_info->dimension(1) == 1U && t_addend_info->dimension(2) == 1U) // Broadcast in X, Y, Z: collapsed rhs win [M0xN0] = [1x1]
- {
- lut["rhs_m0"] = "1";
- lut["rhs_n0"] = "1";
- lut["rhs_start_y"] = "0";
- lut["rhs_start_x"] = "0";
- }
- else if(t_addend_info->dimension(1) == 1U && t_addend_info->dimension(2) == 1U) // Broadcast in Y and Z: collapsed rhs win [M0xN0] = [1xN]
- {
- lut["rhs_m0"] = "1";
- lut["rhs_n0"] = "N0";
- lut["rhs_start_y"] = "0";
- lut["rhs_start_x"] = "cout";
- }
- else
- {
- ARM_COMPUTE_ERROR("Only support rhs broadcasting in all X, Y, Z dimensions, or just in Y and Z dimensions");
- }
- }
- else // No broadcast: the addend is loaded as a full M0 x N0 tile starting at (cout, mout)
- {
- lut["rhs_m0"] = "M0";
- lut["rhs_n0"] = "N0";
- lut["rhs_start_y"] = "mout";
- lut["rhs_start_x"] = "cout";
- }
- return lut;
-}
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseKernelComponent.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseKernelComponent.h
deleted file mode 100644
index f8377457d3..0000000000
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseKernelComponent.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
-
-#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLELEMENTWISEADDKERNELCOMPONENT_H
-#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLELEMENTWISEADDKERNELCOMPONENT_H
-
-#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-class ClElementwiseKernelComponent : public IClKernelComponent // Elementwise kernel component; stores the descriptor and the lhs/rhs/dst links
-{
-public:
- /** Construct a new Cl Elementwise Kernel Component object
- *
- * @param[in] blueprint Blueprint to which this component is added
- * @param[in] desc Component descriptor
- * @param[in] lhs Link to LHS tensor
- * @param[in] rhs Link to RHS tensor
- * @param[out] dst Link to DST tensor
- *
- * Support Level
- * Data Type: F16, F32
- * Tensor Shape: Any shape of arbitrary dimension >= 1 and <= 4
- * Value Range: All
- * Broadcasting: Only the RHS tensor can be broadcast into the LHS. Broadcasting is supported either in dimensions 1 and 2, or in all of dimensions 0, 1 and 2
- */
- ClElementwiseKernelComponent(ClKernelBlueprint *blueprint, const ClElementwiseKernelDescriptor &desc, const Link &lhs, const Link &rhs, const Link &dst)
- : IClKernelComponent(blueprint), _desc{ desc }, _lhs{ lhs }, _rhs{ rhs }, _dst{ dst }
- {
- }
-
- ComponentType get_component_type() const override;
- std::set<std::string> get_headers_list() const override;
- std::string get_component_code() const override;
- Window get_window() const override;
- CLBuildOptions generate_build_options() const override;
- std::string generate_config_id() const override;
-
- virtual std::vector<Link> get_links() const override // Links are returned in the order lhs, rhs, dst
- {
- return { _lhs, _rhs, _dst };
- }
-
- virtual TagLUT get_tag_lut(const SharedVarTable &vtable) const override;
- virtual void allocate_shared_vars(SharedVarTable &vtable) const override;
-
- virtual std::string name() const override // NOTE(review): the "eltwise_add_" prefix is used for every operation in the descriptor, not only ADD
- {
- return "eltwise_add_" + std::to_string(id());
- }
-
-private:
- ClElementwiseKernelDescriptor _desc{};
- Link _lhs{};
- Link _rhs{};
- Link _dst{};
-};
-
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-#endif // ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLELEMENTWISEADDKERNELCOMPONENT_H
-#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClFloorKernelComponent.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClFloorKernelComponent.cpp
deleted file mode 100644
index 0a20a8f600..0000000000
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClFloorKernelComponent.cpp
+++ /dev/null
@@ -1,153 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
-#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClFloorKernelComponent.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-ComponentType ClFloorKernelComponent::get_component_type() const // The floor component always reports ComponentType::Simple
-{
- return ComponentType::Simple;
-}
-std::set<std::string> ClFloorKernelComponent::get_headers_list() const // Returns the fixed set of two header names listed below
-{
- return std::set<std::string> { "common/experimental/gemm_fused_post_ops/fp_mixed_precision_helpers.h", "tile_helpers.h" };
-}
-Window ClFloorKernelComponent::get_window() const // Computes the execution window from the destination tensor shape with dimensions 1 and 2 collapsed
-{
- const ITensorInfo *src_info = _blueprint->impl().get_kernel_argument_info(_src.arg_id);
- ITensorInfo *dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id()); // Non-const: passed to auto_init_if_empty() below
-
- ARM_COMPUTE_ERROR_ON_NULLPTR(src_info, dst_info);
- auto_init_if_empty(*dst_info, src_info->tensor_shape(), 1, src_info->data_type()); // dst gets the src shape and data type (presumably only when not yet initialised)
-
- TensorShape output_shape = dst_info->tensor_shape();
- // Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) and upper dimensions unchanged
- // This is in line with the collapsing convention used by Conv2d
- output_shape.collapse(2U, 1U);
- const unsigned int vector_size_byte_opencl = 16;
- const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / dst_info->element_size(), dst_info->dimension(0)); // 16 bytes / element size, passed through adjust_vec_size() with dst dimension 0
- Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration));
-
- return win;
-}
-std::string ClFloorKernelComponent::get_component_code() const // Returns one of two code templates: root (loads src into a tile and writes the floor to dst) or non-root (applies floor in place on the accumulator)
-{
- bool is_root = _blueprint->impl().group(_src.arg_id) == SharedVarGroup::Argument; // Root: src is in the Argument group
- if(is_root)
- {
- return R"_(
- //------------------ START KERNEL {{meta_kernel_id}} FLOOR ---------------------
- // IN_0(src) {{src}}
- // OUT(dst, accum) {{dst}}
- TILE({{DATA_TYPE}}, M0, N0, {{dst}});
- {
- TILE({{DATA_TYPE}}, M0, N0, src_tile);
-
- // Since mout maps to dimensions 1 (y) and dimension 2 (z) of the input tensor because of the collapsed window, bout maps to dimension 3 (w)
- {{src}}_offset_first_element_in_bytes += bout * {{src}}_stride_w;
- T_LOAD({{DATA_TYPE}}, M0, N0, BUFFER, {{src}}, cout, mout, 1, {{src}}_stride_y, src_tile);
-
- T_FLOOR({{DATA_TYPE}}, M0, N0, src_tile, {{dst}});
- }
- //------------------ END KERNEL {{meta_kernel_id}} FLOOR ---------------------
-)_";
- }
- else
- {
- return R"_(
- //------------------ START KERNEL {{meta_kernel_id}} FLOOR ---------------------
- // IN_0/Out(Accumulator) {{acc}}
- // output = floor(input)
- {
- T_FLOOR({{DATA_TYPE}}, M0, N0, {{acc}}, {{acc}});
- }
- //------------------ END KERNEL {{meta_kernel_id}} FLOOR ---------------------
-)_";
- }
-}
-CLBuildOptions ClFloorKernelComponent::generate_build_options() const // Builds the M0, N0 and PARTIAL_N0 defines
-{
- CLBuildOptions build_opts{};
- const auto n0 = _blueprint->impl().get_execution_window().x().step(); // N0: step of the execution window along x
- const auto m0 = _blueprint->impl().get_execution_window().y().step(); // M0: step of the execution window along y
- const auto dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
- const unsigned int partial_store_n0 = dst_info->dimension(0) % n0; // Remainder of dst dimension 0 divided by N0
- build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
- build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
- build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));
- return build_opts;
-}
-std::string ClFloorKernelComponent::generate_config_id() const // Builds "<data type>_<dim 0>_<dim 1>_<data layout>" from the destination tensor info
-{
- auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
- std::string config_id{};
- config_id += lower_string(string_from_data_type(t_dst_info->data_type())); // Data type name, passed through lower_string()
- config_id += "_";
- config_id += support::cpp11::to_string(t_dst_info->dimension(0));
- config_id += "_";
- config_id += support::cpp11::to_string(t_dst_info->dimension(1));
- config_id += "_";
- config_id += lower_string(string_from_data_layout(t_dst_info->data_layout())); // Data layout name, passed through lower_string()
- return config_id;
-}
-void ClFloorKernelComponent::allocate_shared_vars(SharedVarTable &vtable) const // Adds src and dst to the shared variable table, both as 4D buffer tensors
-{
- vtable.add(_src, _blueprint->impl().group(_src.arg_id), ClKernelArgDescriptor(_src.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "src");
- vtable.add(_dst, _blueprint->impl().group(_dst.arg_id), ClKernelArgDescriptor(_dst.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "dst");
-}
-ClFloorKernelComponent::TagLUT ClFloorKernelComponent::get_tag_lut(const SharedVarTable &vtable) const // Fills the tag lookup table: src/dst (root) or acc (non-root), plus kernel id and data type
-{
- TagLUT lut{};
- const auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
- // Arguments and global shared variables
- const bool is_root = _blueprint->impl().group(_src.arg_id) == SharedVarGroup::Argument; // Root: src is in the Argument group
-
- if(is_root)
- {
- lut["src"] = vtable.get(_src);
- lut["dst"] = vtable.get(_dst);
- }
- else
- {
- lut["acc"] = vtable.get(_src); // Non-root: the src variable is exposed under the "acc" tag
- }
-
- lut["meta_kernel_id"] = id();
- lut["DATA_TYPE"] = get_cl_type_from_data_type(t_dst_info->data_type()); // The data type tag is derived from the destination tensor
- return lut;
-}
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClFloorKernelComponent.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClFloorKernelComponent.h
deleted file mode 100644
index e791b36382..0000000000
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClFloorKernelComponent.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
-
-#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLFLOORKERNELCOMPONENT_H
-#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLFLOORKERNELCOMPONENT_H
-
-#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-class ClFloorKernelComponent : public IClKernelComponent
-{
-public:
- /** Construct a new Cl Floor Kernel Component object
- *
- * @param blueprint Blueprint to which this component is added
- * @param src Link to SRC tensor
- * @param dst Link to DST tensor
- *
- * Support Level
- * Data Type: F16, F32
- * Tensor Shape: Any shape of arbitrary dimension >= 1 and <= 4
- * Value Range: All
- */
- ClFloorKernelComponent(ClKernelBlueprint *blueprint, const Link &src, const Link &dst)
- : IClKernelComponent(blueprint), _src{ src }, _dst{ dst }
- {
- }
-
- ComponentType get_component_type() const override;
- std::set<std::string> get_headers_list() const override;
- std::string get_component_code() const override;
- Window get_window() const override;
- CLBuildOptions generate_build_options() const override;
- std::string generate_config_id() const override;
-
- virtual std::vector<Link> get_links() const override
- {
- return { _src, _dst };
- }
-
- virtual TagLUT get_tag_lut(const SharedVarTable &vtable) const override;
- virtual void allocate_shared_vars(SharedVarTable &vtable) const override;
-
- virtual std::string name() const override
- {
- return "floor_" + std::to_string(id());
- }
-
-private:
- Link _src{};
- Link _dst{};
-};
-
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-#endif // ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLFLOORKERNELCOMPONENT_H
-#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClKernelComponents.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClKernelComponents.h
deleted file mode 100644
index 3f99dd5553..0000000000
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClKernelComponents.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
-
-#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_CLKERNELCOMPONENTS_H
-#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_CLKERNELCOMPONENTS_H
-
-#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClDirectConvolutionKernelComponent.h"
-#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClElementwiseKernelComponent.h"
-#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClFloorKernelComponent.h"
-#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h"
-
-#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_CLKERNELCOMPONENTS_H
-#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp
deleted file mode 100644
index 7c805d5368..0000000000
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.cpp
+++ /dev/null
@@ -1,171 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
-
-#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-ComponentType ClStoreBlockBoundaryAwareKernelComponent::get_component_type() const
-{
- return ComponentType::Store;
-}
-
-std::string ClStoreBlockBoundaryAwareKernelComponent::get_component_code() const
-{
- return R"_(
- //------------------ START KERNEL {{meta_kernel_id}} STORE ---------------------
-
- __global uchar *dst_addr = {{dst}}_ptr + {{dst}}_offset_first_element_in_bytes + (g_x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(g_y, M0, PARTIAL_STORE_M0) * {{dst}}_stride_y);
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply dst_stride_z by DEPTH_GEMM3D
- dst_addr += g_z * {{dst}}_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
-
- // Add offset for batched GEMM
- dst_addr += g_z * {{dst}}_stride_z;
-
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
- STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, {{src}}, dst_addr, {{dst}}_stride_y, g_zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, g_cond_y, g_cond_x);
-
- //------------------ END KERNEL {{meta_kernel_id}} STORE ---------------------
-
-)_";
-}
-
-CLBuildOptions ClStoreBlockBoundaryAwareKernelComponent::generate_build_options() const
-{
- auto t_dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
- // auto tile_info = _blueprint->impl().get_tile_info();
-
- CLBuildOptions build_opts{};
-
- const auto n0 = _blueprint->impl().get_execution_window().x().step();
- const auto m0 = _blueprint->impl().get_execution_window().y().step();
- const auto partial_m0 = t_dst_info->dimension(0) % m0;
- const auto partial_n0 = t_dst_info->dimension(1) % n0;
-
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(t_dst_info->data_type()));
- build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
- build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
- build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_m0));
- build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_n0));
-
- return build_opts;
-}
-
-void ClStoreBlockBoundaryAwareKernelComponent::allocate_shared_vars(SharedVarTable &vtable) const
-{
- vtable.add(_src, _blueprint->impl().group(_src.arg_id), ClKernelArgDescriptor(_src.arg_id, ClKernelTensorArgType::Image_3D), "src");
- vtable.add(_dst, _blueprint->impl().group(_dst.arg_id), ClKernelArgDescriptor(_dst.arg_id, ClKernelTensorArgType::Image_3D), "dst");
-}
-
-ClStoreBlockBoundaryAwareKernelComponent::TagLUT ClStoreBlockBoundaryAwareKernelComponent::get_tag_lut(const SharedVarTable &vtable) const
-{
- return {
- { "meta_kernel_id", id() },
- { "src", vtable.get(_src) },
- { "dst", vtable.get(_dst) },
- };
-}
-
-ComponentType ClStoreIndirectWidthSelectKernelComponent::get_component_type() const
-{
- return ComponentType::Store;
-}
-
-std::string ClStoreIndirectWidthSelectKernelComponent::get_component_code() const
-{
- return R"_(
- //------------------ START KERNEL {{meta_kernel_id}} STORE ---------------------
- {
- // This also follows NHWC layout
- // cout maps to global_id(0) maps to Channel
- // mout maps to global_id(1) maps to Height and Weight (Collapsed Window)
- // bout maps to global_id(3) maps to N / Batch
- #define _IDST_WIDTH {{dst}}_w
- #define _IDST_HEIGHT {{dst}}_h
- TILE(uint, M0, 1, dst_indirect_y);
-
- // Calculate the destination indirect Y
- LOOP_UNROLLING(int, i, 0, 1, M0,
- {
- dst_indirect_y[i].v = (uint)min(mout + i, (int)(_IDST_WIDTH * _IDST_HEIGHT) - 1);
- dst_indirect_y[i].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT);
- })
-
- bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
-
- T_STORE_INDIRECT_WIDTH_SELECT({{DST_DATA_TYPE}}, M0, N0, PARTIAL_N0, {{DST_TENSOR_TYPE}}, {{dst}}, cout, {{dst}}_stride_y, x_cond, {{src}}, dst_indirect_y);
-
- #undef _IDST_WIDTH
- #undef _IDST_HEIGHT
- //------------------ END KERNEL {{meta_kernel_id}} STORE ---------------------
- }
-
-)_";
-}
-
-CLBuildOptions ClStoreIndirectWidthSelectKernelComponent::generate_build_options() const
-{
- CLBuildOptions build_opts{};
-
- return build_opts;
-}
-
-void ClStoreIndirectWidthSelectKernelComponent::allocate_shared_vars(SharedVarTable &vtable) const
-{
- vtable.add(_src, _blueprint->impl().group(_src.arg_id), ClKernelArgDescriptor(_src.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "src");
- vtable.add(_dst, _blueprint->impl().group(_dst.arg_id), ClKernelArgDescriptor(_dst.arg_id, ClKernelTensorArgType::Tensor_4D_t_Buffer), "dst");
-}
-
-ClStoreIndirectWidthSelectKernelComponent::TagLUT ClStoreIndirectWidthSelectKernelComponent::get_tag_lut(const SharedVarTable &vtable) const
-{
- TagLUT lut{};
-
- // Arguments and global shared variables
- lut["src"] = vtable.get(_src);
- lut["dst"] = vtable.get(_dst);
-
- // Local build options
- lut["meta_kernel_id"] = id();
- lut["DST_TENSOR_TYPE"] = "BUFFER";
- const auto dst_info = _blueprint->impl().get_kernel_argument_info(_blueprint->impl().get_dst_id());
- lut["DST_DATA_TYPE"] = dst_info->data_type();
-
- return lut;
-}
-
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h b/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h
deleted file mode 100644
index e0b188dc8d..0000000000
--- a/src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/components/ClStoreKernelComponents.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
-
-#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLSTOREKERNELCOMPONENTS_H
-#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLSTOREKERNELCOMPONENTS_H
-
-#include "src/core/experimental/dynamic_fusion/ClKernelBuildingImpl/Common.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-class ClStoreBlockBoundaryAwareKernelComponent : public IClKernelComponent
-{
-public:
- ClStoreBlockBoundaryAwareKernelComponent(ClKernelBlueprint *blueprint, const Link &src, const Link &dst)
- : IClKernelComponent(blueprint), _src{ src }, _dst{ dst }
- {
- }
- ComponentType get_component_type() const override;
- std::string get_component_code() const override;
- CLBuildOptions generate_build_options() const override;
- TagLUT get_tag_lut(const SharedVarTable &vtable) const override;
- void allocate_shared_vars(SharedVarTable &vtable) const override;
-
- virtual std::vector<Link> get_links() const override
- {
- return { _src, _dst };
- }
-
- virtual std::string name() const override
- {
- return "";
- }
-
-private:
- Link _src{};
- Link _dst{};
-};
-
-class ClStoreIndirectWidthSelectKernelComponent : public IClKernelComponent
-{
-public:
- ClStoreIndirectWidthSelectKernelComponent(ClKernelBlueprint *blueprint, const Link &src, const Link &dst)
- : IClKernelComponent(blueprint), _src{ src }, _dst{ dst }
- {
- }
- ComponentType get_component_type() const override;
- std::string get_component_code() const override;
- CLBuildOptions generate_build_options() const override;
- virtual TagLUT get_tag_lut(const SharedVarTable &vtable) const override;
- void allocate_shared_vars(SharedVarTable &vtable) const override;
-
- virtual std::vector<Link> get_links() const override
- {
- return { _src, _dst };
- }
-
- virtual std::string name() const override
- {
- return "";
- }
-
-private:
- Link _src{};
- Link _dst{};
-};
-
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-#endif // ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_IMPL_COMPONENTS_CLSTOREKERNELCOMPONENTS_H
-#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/OperatorGraph.cpp b/src/core/experimental/dynamic_fusion/OperatorGraph.cpp
deleted file mode 100644
index bd88afdb47..0000000000
--- a/src/core/experimental/dynamic_fusion/OperatorGraph.cpp
+++ /dev/null
@@ -1,264 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
-#include "arm_compute/core/experimental/OperatorGraph.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.h"
-#include "src/core/helpers/AutoConfiguration.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-namespace
-{
-void check_dependency_graph_op_success(OperatorGraph &graph, const Status &status)
-{
- if(!bool(status))
- {
- graph.impl()->status = Status{ status.error_code(), "Cycles or loops are not allowed" };
- }
-}
-
-// Check if there are more than one roots in the graph
-void check_multiple_roots(OperatorGraph &graph)
-{
- if(graph.impl()->graph.get_root_ops().size() > 1)
- {
- graph.impl()->status = Status{ ErrorCode::RUNTIME_ERROR, "Multiple roots are not allowed" };
- }
-}
-
-void check_execution_shape(OperatorGraph &graph, const ITensorInfo &dst_info)
-{
- const auto roots = graph.impl()->graph.get_root_ops();
- for(auto root : roots)
- {
- // We assume exactly 1 dst tensor for all operators
- const auto root_info = graph.impl()->tensors[graph.impl()->graph.dst_tensors(root)[0]]->get_tensor_info();
- for(unsigned int dim = 0; dim < root_info->num_dimensions(); ++dim)
- {
- if(root_info->dimension(dim) != dst_info.dimension(dim))
- {
- graph.impl()->status = Status{ ErrorCode::RUNTIME_ERROR, "Cannot change execution space" };
- return;
- }
- }
- }
-}
-} // namespace
-
-OpTensor::OpTensor(Id id)
- : _id{ id }
-{
-}
-
-OpTensor::Id OpTensor::id() const
-{
- return _id;
-}
-
-bool operator<(const OpTensor &t0, const OpTensor &t1)
-{
- return t0.id() < t1.id();
-}
-
-Operator::Operator(Id id)
- : _id{ id }
-{
-}
-
-Operator::Id Operator::id() const
-{
- return _id;
-}
-
-bool operator<(const Operator &op0, const Operator &op1)
-{
- return op0.id() < op1.id();
-}
-
-OperatorGraph::OperatorGraph()
- : _impl{ std::make_unique<Implementation>() }
-{
-}
-
-OperatorGraph::~OperatorGraph() = default;
-
-OperatorGraph::Implementation *OperatorGraph::impl()
-{
- return _impl.get();
-}
-
-const OperatorGraph::Implementation *OperatorGraph::impl() const
-{
- return _impl.get();
-}
-
-Status validate(const OperatorGraph &graph)
-{
- return graph.impl()->status;
-}
-
-OpTensor add_tensor(OperatorGraph &graph, ITensorInfo &info)
-{
- auto id = graph.impl()->graph.add_tensor();
- OpTensor op_tensor(id);
- graph.impl()->add_tensor(id, &info);
- return op_tensor;
-}
-
-Operator add_op_conv2d(OperatorGraph &graph, const Conv2dDescriptor &desc, OpTensor input, OpTensor weights, OpTensor bias, OpTensor dst)
-{
- // Check if map is empty as a complex operator can only be root
- if(!graph.impl()->graph.get_root_ops().empty())
- {
- graph.impl()->status = Status{ ErrorCode::RUNTIME_ERROR, "Cannot add multiple complex operators" };
- return Operator{};
- }
-
- std::pair<Status, DependencyGraph::Id> status_id;
-
- if(bias.id() == -1)
- {
- status_id = graph.impl()->graph.add_operator({ input.id(), weights.id() }, { dst.id() });
- }
- else
- {
- status_id = graph.impl()->graph.add_operator({ input.id(), weights.id(), bias.id() }, { dst.id() });
- }
-
- check_dependency_graph_op_success(graph, status_id.first);
-
- Operator op_node(status_id.second);
-
- // Infer TensorInfo
- OpTensorContent *dst_tensor = graph.impl()->tensors[dst.id()].get();
- if(dst_tensor->get_tensor_info()->total_size() == 0)
- {
- auto src = graph.impl()->tensors[input.id()]->get_tensor_info();
- auto wts = graph.impl()->tensors[weights.id()]->get_tensor_info();
- auto shape = misc::shape_calculator::compute_deep_convolution_shape(src->tensor_shape(), src->data_layout(), wts->tensor_shape(), PadStrideInfo(desc.stride.x(), desc.stride.y(), desc.pad.left,
- desc.pad.right,
- desc.pad.top, desc.pad.bottom, DimensionRoundingType::FLOOR)); // use the default DimensionRoundingType
-
- auto_init_if_empty(*(dst_tensor->get_tensor_info()), src->clone()->set_tensor_shape(shape));
- }
-
- // Check execution space
- auto dst_info = dst_tensor->get_tensor_info();
- check_execution_shape(graph, *dst_info);
-
- ITensorDescPack<OpTensorContent> tensors;
- tensors.add_const_tensor(ACL_SRC_0, graph.impl()->tensors[input.id()].get());
- tensors.add_const_tensor(ACL_SRC_1, graph.impl()->tensors[weights.id()].get());
- if(bias.id() != -1)
- {
- tensors.add_const_tensor(ACL_SRC_2, graph.impl()->tensors[bias.id()].get());
- }
- tensors.add_const_tensor(ACL_DST_0, graph.impl()->tensors[dst.id()].get());
-
- graph.impl()->add_node<Conv2dContent>(status_id.second, desc, tensors);
- check_multiple_roots(graph);
-
- return op_node;
-}
-
-Operator add_op_conv2d(OperatorGraph &graph, const Conv2dDescriptor &desc, OpTensor input, OpTensor weights, OpTensor dst)
-{
- return add_op_conv2d(graph, desc, input, weights, OpTensor(-1), dst);
-}
-
-void force_conv2d_method(OperatorGraph &graph, Operator conv2d, ConvolutionMethod method)
-{
- auto node = utils::cast::polymorphic_downcast<Conv2dContent *>(graph.impl()->operators[conv2d.id()].get());
- node->set_method(method);
-}
-
-Operator add_op_elementwise_op(OperatorGraph &graph, const ElementwiseDescriptor &desc, OpTensor lhs, OpTensor rhs, OpTensor dst)
-{
- auto id = graph.impl()->graph.add_operator({ rhs.id(), lhs.id() }, { dst.id() });
- check_dependency_graph_op_success(graph, id.first);
-
- Operator op_node(id.second);
-
- // Infer TensorInfo
- auto node_lhs = graph.impl()->tensors[lhs.id()]->get_tensor_info();
- auto node_rhs = graph.impl()->tensors[rhs.id()]->get_tensor_info();
- OpTensorContent *node_dst = graph.impl()->tensors[dst.id()].get();
-
- if(node_dst->get_tensor_info()->total_size() == 0)
- {
- const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*node_rhs, *node_lhs);
- auto_init_if_empty(*(node_dst->get_tensor_info()), node_lhs->clone()->set_tensor_shape(broadcast_pair.first));
- }
-
- // Check execution space
- auto dst_info = node_dst->get_tensor_info();
- check_execution_shape(graph, *dst_info);
-
- ITensorDescPack<OpTensorContent> tensors;
- tensors.add_const_tensor(ACL_SRC_0, graph.impl()->tensors[lhs.id()].get());
- tensors.add_const_tensor(ACL_SRC_1, graph.impl()->tensors[rhs.id()].get());
- tensors.add_const_tensor(ACL_DST_0, graph.impl()->tensors[dst.id()].get());
- graph.impl()->add_node<ElementwiseContent>(id.second, desc, tensors);
- check_multiple_roots(graph);
-
- return op_node;
-}
-
-Operator add_op_floor(OperatorGraph &graph, const FloorDescriptor &desc, OpTensor src, OpTensor dst)
-{
- auto id = graph.impl()->graph.add_operator({ src.id() }, { dst.id() });
- check_dependency_graph_op_success(graph, id.first);
-
- Operator op_node(id.second);
-
- // Infer TensorInfo
- auto node_src = graph.impl()->tensors[src.id()]->get_tensor_info();
- OpTensorContent *node_dst = graph.impl()->tensors[dst.id()].get();
-
- if(node_dst->get_tensor_info()->total_size() == 0)
- {
- auto_init_if_empty(*(node_dst->get_tensor_info()), *node_src);
- }
-
- // Check execution space
- auto dst_info = node_dst->get_tensor_info();
- check_execution_shape(graph, *dst_info);
-
- ITensorDescPack<OpTensorContent> tensors;
- tensors.add_const_tensor(ACL_SRC_0, graph.impl()->tensors[src.id()].get());
- tensors.add_const_tensor(ACL_DST_0, graph.impl()->tensors[dst.id()].get());
- graph.impl()->add_node<FloorContent>(id.second, desc, tensors);
- check_multiple_roots(graph);
-
- return op_node;
-}
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.cpp b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.cpp
deleted file mode 100644
index 4e57d66a1c..0000000000
--- a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.cpp
+++ /dev/null
@@ -1,232 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
-#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-namespace
-{
-std::vector<std::pair<ClKernelFusionGroup *, ClKernelFusionGroup *>> get_combinations(const std::vector<ClKernelFusionGroup *> &sorted_fgs)
-{
- ARM_COMPUTE_ERROR_ON(sorted_fgs.size() <= 1);
- std::vector<std::pair<ClKernelFusionGroup *, ClKernelFusionGroup *>> combo;
- for(size_t i = 0; i < sorted_fgs.size() - 1; ++i)
- {
- for(size_t j = i + 1; j < sorted_fgs.size(); ++j)
- {
- combo.push_back(std::make_pair(sorted_fgs.at(i), sorted_fgs.at(j)));
- }
- }
- return combo;
-}
-} // namespace
-std::vector<const ClKernel *> traverse(const ClKernelFusionGroup &group)
-{
- std::vector<const ClKernel *> kernels;
- const auto sorted = group.graph.topological_sort();
- for(const auto &pack : sorted.second)
- {
- kernels.push_back(group.fused_kernels.at(pack.op));
- }
- return kernels;
-}
-
-std::vector<const ClKernelFusionGroup *> traverse(const ClFusedKernelGraph &graph)
-{
- std::vector<const ClKernelFusionGroup *> kernels;
- const auto sorted = graph.fg_dependency.topological_sort();
- for(const auto &pack : sorted.second)
- {
- kernels.push_back(graph.fusion_groups.at(pack.op).get());
- }
- return kernels;
-}
-
-std::vector<ClKernelFusionGroup *> traverse(ClFusedKernelGraph &graph)
-{
- std::vector<ClKernelFusionGroup *> kernels;
- const auto sorted = graph.fg_dependency.topological_sort();
- for(const auto &pack : sorted.second)
- {
- kernels.push_back(graph.fusion_groups.at(pack.op).get());
- }
- return kernels;
-}
-
-std::pair<Status, ClFusedKernelGraph> init_fusion_graph(const ClKernelGraph &kernel_graph)
-{
- ClFusedKernelGraph fused_kernel_graph{};
- fused_kernel_graph.original_graph = &kernel_graph; // Create a copy of the original kernel graph
- fused_kernel_graph.fg_dependency = DependencyGraph();
- // Initialize all fusion groups
- for(const auto &kernel : traverse(kernel_graph))
- {
- fused_kernel_graph.add_fusion_group({ kernel });
- }
- return { Status{}, fused_kernel_graph };
-}
-
-Status fuse(ClFusedKernelGraph &fused_kernel_graph)
-{
- // A naive fusion algorithm that's guaranteed to find optimal pattern if there are no branches
- // If there are branches, the algorithm cannot guanrantee optimality as it doesn't perform any searches
-
- bool fusion_found = false;
- do
- {
- fusion_found = false;
- const auto sorted_fgs = traverse(fused_kernel_graph);
- if(sorted_fgs.size() <= 1)
- {
- // Only one or zero fusion group, thus no need to perform fusion
- return Status{};
- }
- auto fgs_combo = get_combinations(sorted_fgs);
- for(auto fgs : fgs_combo)
- {
- auto fg0 = fgs.first;
- auto fg1 = fgs.second;
- const auto st = fused_kernel_graph.can_fuse(*fg0, *fg1);
- if(bool(st))
- {
- const auto st = fused_kernel_graph.fuse(*fg0, *fg1);
- if(!bool(st))
- {
- return st;
- }
- fusion_found = true;
- break;
- }
- }
- }
- while(fusion_found);
- return Status{};
-}
-Status generate_store(ClKernelBlueprint &bp, const ClFusedKernelGraph &fused_kernel_graph, const ClKernelFusionGroup &fg)
-{
- Status st{};
- for(const auto &dst_t_id : fused_kernel_graph.fg_dependency.dst_tensors(fg.id))
- {
- const auto dst_t = fused_kernel_graph.original_graph->get_tensor(dst_t_id);
-
- /// NOTE: dst tensor must have already been added to the blueprint at this point
- ArgumentID dst_id;
- st = add_tensor(bp, dst_t->desc, dst_id, dst_t->id);
- if(!bool(st))
- {
- return st;
- }
- /// NOTE: the extra dst tensor is needed as the store kcomp requires 2 tensors. But this is irrelevant to the fused kernel graph
- /// since both tensors share the exact same info and kernel arg descriptor
- ArgumentID dst_dst_id;
- st = add_tensor(bp, dst_t->desc, dst_dst_id);
- if(!bool(st))
- {
- return st;
- }
- /// NOTE: Update the merge point map to link dst_dst_id with dst_t->id instead.
- /// This is required because the get_arguments() returned by the blueprint returns the dst tensor added by the store component
- st = update_merge_point(bp, dst_dst_id, dst_t->id);
- if(!bool(st))
- {
- return st;
- }
- st = add_kcomp_store(bp, fg.get_root_kernel()->config().store_type, dst_id, dst_dst_id);
- if(!bool(st))
- {
- return st;
- }
- }
- return st;
-}
-
-Status generate(ClWorkload &workload, const ClWorkloadContext &ctx, const ClFusedKernelGraph &fused_kernel_graph)
-{
- workload.context = ctx;
- for(const auto &fg : traverse(fused_kernel_graph))
- {
- ClKernelBlueprint bp{};
- for(const auto &kernel : traverse(*fg))
- {
- const auto st = kernel->generate(bp);
- if(!bool(st))
- {
- return st;
- }
- }
- auto st = set_tile_info(bp, fg->get_root_kernel()->config().tile_desc);
- if(!bool(st))
- {
- return st;
- }
- st = generate_store(bp, fused_kernel_graph, *fg);
- if(!bool(st))
- {
- return st;
- }
-
- ClKernelCode code{};
- st = build(code, ClCodeBuilderContext{ ctx.gpu_info }, bp);
- if(!bool(st))
- {
- return st;
- }
- const auto bp_graph = get_dependency_graph(bp);
-
- // Get tensor info
- std::vector<Id> workload_src_tensors{};
- for(const auto &src_t_id : fused_kernel_graph.fg_dependency.src_tensors(fg->id))
- {
- const auto src_t = fused_kernel_graph.original_graph->get_tensor(src_t_id);
- // Get corresponding kernel arg descriptor
- const auto arg_desc = code.arguments.at(bp_graph.get_merge_points().at(src_t->id));
- const auto kernel_t_id = workload.add_workload_tensor(src_t->desc, src_t->memory_type, src_t->memory_info, arg_desc, src_t->id);
- workload_src_tensors.push_back(kernel_t_id);
- }
- std::vector<Id> workload_dst_tensors{};
- for(const auto &dst_t_id : fused_kernel_graph.fg_dependency.dst_tensors(fg->id))
- {
- const auto dst_t = fused_kernel_graph.original_graph->get_tensor(dst_t_id);
- // Get corresponding kernel arg descriptor
- const auto arg_desc = code.arguments.at(bp_graph.get_merge_points().at(dst_t->id));
- const auto kernel_t_id = workload.add_workload_tensor(dst_t->desc, dst_t->memory_type, dst_t->memory_info, arg_desc, dst_t->id);
- workload_dst_tensors.push_back(kernel_t_id);
- }
-
- workload.add_unit_workload(fg->get_root_kernel()->config().stage, code, workload_src_tensors, workload_dst_tensors);
- }
-
- return Status{};
-}
-
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.h b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.h
deleted file mode 100644
index 2051f1b62f..0000000000
--- a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.h
+++ /dev/null
@@ -1,452 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
-#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLFUSEDKERNELGRAPH_H
-#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLFUSEDKERNELGRAPH_H
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/experimental/DependencyGraph.h"
-#include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h"
-#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h"
-#include "support/DeepCopy.h"
-
-#include <vector>
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-struct ClKernelFusionGroup;
-
-/** A const view of a subgraph of the @ref ClKernelGraph to be fused together
- *
- */
-struct ClKernelFusionGroup
-{
-public:
- using Id = DependencyGraph::Id;
-
- ClKernelFusionGroup() = default;
- ClKernelFusionGroup(Id id)
- : id{ id }, graph{}, fused_kernels{}, tensors{}
- {
- }
- ~ClKernelFusionGroup() = default;
-
- void set_id(Id i)
- {
- id = i;
- }
-
- Id add_fused_kernel(const ClKernel *kernel)
- {
- /// PRE: Acyclicity ensured by DependencyGraph
- /// PRE: Connectedness ensured by DependencyGraph
- /// PRE: Single-rootedness ensured by User
- std::vector<Id> src_tensors;
- for(const auto t : kernel->tensors().get_const_src_tensors())
- {
- auto id = graph.add_tensor(t->id);
- if(tensors.find(id) == tensors.end())
- {
- tensors[id] = t;
- }
- src_tensors.push_back(id);
- }
- std::vector<Id> dst_tensors;
- for(const auto t : kernel->tensors().get_const_dst_tensors())
- {
- auto id = graph.add_tensor(t->id);
- if(tensors.find(id) == tensors.end())
- {
- tensors[id] = t;
- }
- dst_tensors.push_back(id);
- }
- auto id = graph.add_operator(src_tensors, dst_tensors);
- fused_kernels[id.second] = kernel;
- return id.second;
- }
-
- const ClKernel *get_root_kernel() const
- {
- auto root_kernels = graph.get_root_ops();
- ARM_COMPUTE_ERROR_ON(root_kernels.size() != 1);
- return fused_kernels.at(root_kernels.at(0));
- }
-
- std::vector<const ClKernelTensor *> get_src_tensors() const
- {
- std::vector<const ClKernelTensor *> src_tensors;
- for(auto tensor_id : graph.src_tensors())
- {
- src_tensors.push_back(tensors.at(tensor_id));
- }
- return src_tensors;
- }
-
- std::vector<const ClKernelTensor *> get_dst_tensors() const
- {
- std::vector<const ClKernelTensor *> dst_tensors;
- for(auto tensor_id : graph.dst_tensors())
- {
- dst_tensors.push_back(tensors.at(tensor_id));
- }
- return dst_tensors;
- }
-
- friend bool operator==(const ClKernelFusionGroup &fg0, const ClKernelFusionGroup &fg1)
- {
- return fg0.id == fg1.id && fg0.graph == fg1.graph && fg0.fused_kernels == fg1.fused_kernels && fg0.tensors == fg1.tensors;
- }
-
- Id id{};
- DependencyGraph graph{}; // A subgraph of the original ClKernelGraph
- std::map<Id, const ClKernel *> fused_kernels{};
- std::map<Id, const ClKernelTensor *> tensors{};
-};
-
-std::vector<const ClKernel *> traverse(const ClKernelFusionGroup &group);
-
-struct ClFusedKernelGraph
-{
-public:
- using Id = DependencyGraph::Id;
-
- using KernelFusionGroupMap = std::map<Id, utils::memory::deep_unique_ptr<ClKernelFusionGroup>>;
-
- ClFusedKernelGraph() = default;
- ~ClFusedKernelGraph() = default;
- ClFusedKernelGraph(const ClFusedKernelGraph &graph) = default;
- ClFusedKernelGraph &operator=(const ClFusedKernelGraph &graph) = default;
- ClFusedKernelGraph(ClFusedKernelGraph &&graph) = default;
- ClFusedKernelGraph &operator=(ClFusedKernelGraph &&graph) = default;
-
- friend bool operator==(const ClFusedKernelGraph &graph0, const ClFusedKernelGraph &graph1)
- {
- /// NOTE: fg_dependency may change based on the order of fusion, and thus is omitted in the comparison.
- /// The fusion groups can already guarantee the equivalence of fusion
- /// In the future we may want to enforce a stronger equivalence by implementing topological comparison between @ref DependencyGraph s
- return graph0.original_graph == graph1.original_graph && graph0.fusion_groups == graph1.fusion_groups;
- }
-
- Id add_fusion_group(const std::vector<const ClKernel *> &fused_kernels)
- {
- auto fg = utils::memory::make_deep_unique<ClKernelFusionGroup, ClKernelFusionGroup>();
- for(const auto k : fused_kernels)
- {
- fg->add_fused_kernel(k);
- }
- const auto src_tensors = fg->get_src_tensors();
- const auto dst_tensors = fg->get_dst_tensors();
- std::vector<Id> inputs{};
- std::transform(std::begin(src_tensors), std::end(src_tensors), std::back_inserter(inputs), [this](auto kernel)
- {
- return fg_dependency.add_tensor(kernel->id);
- });
- std::vector<Id> outputs{};
- std::transform(std::begin(dst_tensors), std::end(dst_tensors), std::back_inserter(outputs), [this](auto kernel)
- {
- return fg_dependency.add_tensor(kernel->id);
- });
- const auto id = fg_dependency.add_operator(inputs, outputs);
- fg->set_id(id.second);
- fusion_groups[id.second] = std::move(fg);
- return id.second;
- }
-
- Status fuse(ClKernelFusionGroup &fg0, ClKernelFusionGroup &fg1)
- {
- /// PRE: Already checked by can_fuse, and thus all the INVs and ASSUMPTIONS still hold
- ClKernelFusionGroup *fg_src{};
- ClKernelFusionGroup *fg_dst{};
- // Find fg_src (parent / root) and fg_dst (child / non-root)
- if(is_in(fg1.id, fg_dependency.dst_ops(fg0.id)))
- {
- fg_src = &fg0;
- fg_dst = &fg1;
- }
- else if(is_in(fg0.id, fg_dependency.dst_ops(fg1.id)))
- {
- fg_src = &fg1;
- fg_dst = &fg0;
- }
- else
- {
- return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: Not directly connected fusion groups cannot be fused together" };
- }
-
- for(const auto &t : fg_dependency.src_tensors(fg_dst->id))
- {
- if(!is_in(t, fg_dependency.dst_tensors(fg_src->id)))
- {
- // Link any incoming tensors of fg_dst, that ARE NOT in between fg_src and fg_dst, to fg_src
-
- // Before:
- // fg_src
- // |
- // .. t1
- // | |
- // -> fg_dst <-
- //
- // After:
- // fg_src <---t1
- //
- const auto st = link_src_tensors(fg_src->id, { t });
- if(!bool(st))
- {
- return st;
- }
- }
- else
- {
- const auto dst_fgs = fg_dependency.dst_ops_from_tensor(t);
- if(dst_fgs.size() == 1U && dst_fgs.at(0) == fg_dst->id)
- {
- // Remove any incoming tensors of fg_dst, that ARE in between fg_src and fg_dst
- // AND that are not connected to any other outgoing fgs (Note that they cannot connect to any other incoming fgs as all tensors can have at most 1 incoming fg (ASSUMPTION 3))
-
- // Before:
- // fg_src
- // |
- // t0
- // |
- // -> fg_dst
- //
- // After:
- // fg_src
- //
- const auto st = remove_fg_tensor(t);
- if(!bool(st))
- {
- return st;
- }
- }
- else
- {
- // If the tensors ARE in between fg_src and fg_dst
- // BUT have any other outgoing fgs than fg_dst, then we leave it as a dst tensor to the fused fg_src
-
- // Before:
- // fg_src
- // |
- // t0
- // |
- // |-----------
- // | |
- // -> fg_dst -> fg_other
- //
- // After:
- // fg_src
- // |
- // t0
- // |
- // -> fg_other
- //
-
- // Note that this may seem like a case we shouldn't fuse. But actually all it means is that t0 is an
- // intermediate tensor between the fused fg_src and fg_dst, but only that we also STORE it to memory
- // so that any unfused fg's (fg_other in this case) can read it.
- // So all this means that we not only can STORE the tensors at the "end" of a fusion group,
- // but also any other tensors that are not source tensors. And all tensors that are STORED (exported),
- // can be termed "dst tensors" to a fusion group
- void();
- }
- }
- }
-
- for(const auto &t : fg_dependency.dst_tensors(fg_dst->id))
- {
- // Link any outgoing tensors of fg_dst to fg_src
-
- // Before:
- // fg_src
- // |
- // ..
- // |
- // -> fg_dst
- // |
- // |--------
- // | |
- // |-> t0 |-> t1
- //
- // After:
- // fg_src
- // |
- // |--------
- // | |
- // |-> t0 |-> t1
- //
- const auto st = link_dst_tensors(fg_src->id, { t });
- if(!bool(st))
- {
- return st;
- }
- }
-
- // Merge fg_dst's graph into fg_src's graph
- for(const auto kernel : traverse(*fg_dst))
- {
- fg_src->add_fused_kernel(kernel);
- }
-
- const auto st = remove_fg(fg_dst->id);
- return st;
- }
- Status can_fuse(const ClKernelFusionGroup &fg0, const ClKernelFusionGroup &fg1) const
- {
- /// ASSUMPTION0: All tensors have 0 or 1 incoming kernel
- /// ASSUMPTION1: All kernels have exactly 1 dst tensor (Temporary, can be lifted once we start supporting multi-dst kernels)
- /// Note that this does not apply to fusion groups
- /// ASSUMPTION2: Simple kernels' tile infos can be overriden (share with) that of the root kernel's
- /// ASSUMPTION3: Extension of ASSUMPTION0: All tensors have 0 or 1 incoming fusion group
- /// INV0: All Fusion groups have a single root
- /// INV1: All Fusion groups have no cycles or loops within themselves <- guaranteed by the underlying ClKernelGraph having no cycles or loops; enforced by DependencyGraph
- /// INV2: The ClKernelFusionGroup itself has no cycles or loops <- enforced by DependencyGraph
- /// INV3: All non-roots are Simple kernels
- /// INV4: All non roots' dst tensors have the same shape as that of the root kernel
- /// INV5: All kernels within a fusion group have the same UnitWorkloadStage
- const ClKernelFusionGroup *fg_src {};
- const ClKernelFusionGroup *fg_dst{};
-
- // Check 0: Ensure fg0 and fg1 are "directly connected": one of them is a direct parent of the other
- // This guarantess INV0
- // This also finds fg_src (parent / root) and fg_dst (child / non-root)
- if(is_in(fg1.id, fg_dependency.dst_ops(fg0.id)))
- {
- fg_src = &fg0;
- fg_dst = &fg1;
- }
- else if(is_in(fg0.id, fg_dependency.dst_ops(fg1.id)))
- {
- fg_src = &fg1;
- fg_dst = &fg0;
- }
- else
- {
- return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: Not directly connected fusion groups cannot be fused together" };
- }
-
- // Find unconnected tensors between fg_src and fg_dst
- std::vector<Id> unconnected_tensors{};
- for(const auto &t : fg_dependency.dst_tensors(fg_src->id))
- {
- if(!is_in(t, fg_dependency.src_tensors(fg_dst->id)))
- {
- unconnected_tensors.push_back(t);
- }
- }
-
- // Check 1: Any unconnected tensor cannot be an ancestor of fg_dst
- // This guarantees INV2: That is, the fused graph does not have any cycles or loops between different fusion groups
- for(const auto &t : unconnected_tensors)
- {
- if(fg_dependency.path_exists_from_tensor_to_op(t, fg_dst->id))
- {
- return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: the fusion would result in cycles or loops" };
- }
- }
-
- // Check 2: All non-root fgs are simple. Ensure INV3
- if(fg_dst->get_root_kernel()->complexity() != Complexity::Simple)
- {
- return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: only root kernel can be a complex kernel" };
- }
-
- // Check 3: All non roots' dst tensors have the same shape as that of the root kernel. Ensure INV4
- const auto root_kernel_dst_tensors = fg_dependency.dst_tensors(fg_src->id);
- ARM_COMPUTE_ERROR_ON(root_kernel_dst_tensors.size() != 1); // (ASSUMPTION 1: All kernels have exactly 1 dst tensor)
- const auto root_kernel_dst_tensor_info = original_graph->get_tensor(root_kernel_dst_tensors[0])->desc;
-
- for(const auto &t : fg_dependency.dst_tensors(fg_dst->id))
- {
- const auto t_info = original_graph->get_tensor(t)->desc;
- if(detail::have_different_dimensions(root_kernel_dst_tensor_info->tensor_shape(), t_info->tensor_shape(), 0))
- {
- return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: all non roots' dst tensors should have the same shape as that of the root kernel" };
- }
- }
-
- // Check 4: All kernels within a fg have the same UnitWorkloadStage. Ensure INV5
- if(!(fg_src->get_root_kernel()->config().stage == fg_dst->get_root_kernel()->config().stage))
- {
- return Status{ ErrorCode::RUNTIME_ERROR, "Invalid fusion: all kernels within a fusion group should have the same UnitWorkloadStage" };
- }
-
- return Status{};
- }
-
- const ClKernelGraph *original_graph{};
- DependencyGraph fg_dependency{};
- KernelFusionGroupMap fusion_groups{};
- // Note: no need to store tensors pointers in the ClFusedKernelGraph, as they are stored in side the individual fusion groups.
-
-private:
- Status link_src_tensors(Id fg, const std::vector<Id> &src_tensors)
- {
- for(auto t : src_tensors)
- {
- fg_dependency.link_input(fg, t);
- }
- return Status{};
- }
- Status link_dst_tensors(Id fg, const std::vector<Id> &dst_tensors)
- {
- for(auto t : dst_tensors)
- {
- fg_dependency.link_output(fg, t);
- }
- return Status{};
- }
- Status remove_fg(Id fg)
- {
- fg_dependency.remove_operator(fg);
- fusion_groups.erase(fg);
- return Status{};
- }
- Status remove_fg_tensor(Id tensor)
- {
- fg_dependency.remove_tensor(tensor);
- return Status{};
- }
-};
-
-std::vector<const ClKernelFusionGroup *> traverse(const ClFusedKernelGraph &graph);
-std::vector<ClKernelFusionGroup *> traverse(ClFusedKernelGraph &graph);
-
-std::pair<Status, ClFusedKernelGraph> init_fusion_graph(const ClKernelGraph &kernel_graph);
-
-Status fuse(ClFusedKernelGraph &fused_kernel_graph);
-
-Status generate_store(ClKernelBlueprint &bp, const ClFusedKernelGraph &fused_kernel_graph, const ClKernelFusionGroup &fg);
-
-Status generate(ClWorkload &workload, const ClWorkloadContext &ctx, const ClFusedKernelGraph &fused_kernel_graph);
-
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLFUSEDKERNELGRAPH_H
-#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelDescriptors.h b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelDescriptors.h
deleted file mode 100644
index f10e97e3e9..0000000000
--- a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelDescriptors.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
-#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLKERNELDESCRIPTORS_H
-#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLKERNELDESCRIPTORS_H
-
-#include "arm_compute/core/experimental/OperatorGraph.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-struct ClDirectConv2dKernelDescriptor
-{
- friend bool operator==(const ClDirectConv2dKernelDescriptor &desc0, const ClDirectConv2dKernelDescriptor &desc1)
- {
- return desc0.conv2d == desc1.conv2d;
- }
- Conv2dDescriptor conv2d{};
-};
-
-struct ClElementwiseKernelDescriptor
-{
- friend bool operator==(const ClElementwiseKernelDescriptor &desc0, const ClElementwiseKernelDescriptor &desc1)
- {
- return desc0.eltwise == desc1.eltwise;
- }
- ElementwiseDescriptor eltwise{};
-};
-
-struct ClFloorKernelDescriptor
-{
- friend bool operator==(const ClFloorKernelDescriptor &desc0, const ClFloorKernelDescriptor &desc1)
- {
- return desc0.floor == desc1.floor;
- }
- FloorDescriptor floor{};
-};
-
-struct ClActivationKernelDescriptor
-{
- friend bool operator==(const ClActivationKernelDescriptor &, const ClActivationKernelDescriptor &)
- {
- return true;
- }
-};
-
-enum class ClippingStrategy
-{
- TOP_LEFT,
- TOP_RIGHT,
- BOTTOM_LEFT,
- BOTTOM_RIGHT,
-};
-/** Component: Store */
-struct TileDescriptor
-{
- Size2D tile_dims{};
- Size2D boundaries{};
- ClippingStrategy clipping{ ClippingStrategy::TOP_LEFT };
-
- TileDescriptor()
- {
- }
-
- TileDescriptor(Size2D dims, const Size2D &bound, const ClippingStrategy &clip)
- : tile_dims(dims), boundaries(bound), clipping(clip)
- {
- }
-
- bool empty() const
- {
- return (tile_dims.area() == 0) || (boundaries.area() == 0);
- }
- friend bool operator==(const TileDescriptor &tile0, const TileDescriptor &tile1)
- {
- return tile0.tile_dims == tile1.tile_dims && tile0.boundaries == tile1.boundaries && tile0.clipping == tile1.clipping;
- }
-};
-enum class StoreType
-{
- VStore,
- VStorePartial,
- StoreRow,
- ConvertStoreRow,
- StoreBlock,
- ConvertStoreBlock,
- StoreRowPartial,
- StoreBlockPartial,
- StoreBlockBoundaryAware,
- StoreVectorSelect,
- TStoreIndirectWidthSelect
-};
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLKERNELDESCRIPTORS_H
-#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.cpp b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.cpp
deleted file mode 100644
index cab51a2ce6..0000000000
--- a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.cpp
+++ /dev/null
@@ -1,271 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-
-#include "src/core/CL/CLValidate.h"
-#include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h"
-#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h"
-
-#include "support/Cast.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-Status ClDirectConv2dKernel::generate(ClKernelBlueprint &bp) const
-{
- const auto input = _tensors.get_const_tensor(TensorType::ACL_SRC_0);
- const auto weight = _tensors.get_const_tensor(TensorType::ACL_SRC_1);
- const auto bias = _tensors.get_const_tensor(TensorType::ACL_SRC_2);
- const auto dst = _tensors.get_const_tensor(TensorType::ACL_DST_0);
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weight, dst);
- ArgumentID input_id;
- add_tensor(bp, input->desc, input_id, input->id);
- ArgumentID weight_id;
- add_tensor(bp, weight->desc, weight_id, weight->id);
- ArgumentID bias_id = g_arg_placeholder;
- if(bias != nullptr)
- {
- add_tensor(bp, bias->desc, bias_id, bias->id);
- }
- ArgumentID dst_id;
- add_tensor(bp, dst->desc, dst_id, dst->id);
-
- add_kcomp_direct_conv2d(bp, desc, input_id, weight_id, bias_id, dst_id);
- return Status{};
-}
-Status ClDirectConv2dKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ClDirectConv2dKernelDescriptor &conv2d_desc)
-{
- // 1. Check validity
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
- // Matching data type
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
- if(biases != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases);
- }
-
- // Matching data layout
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
- if(biases != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, biases);
- }
-
- // All tensor infos are initialized
- ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() == 0);
- ARM_COMPUTE_RETURN_ERROR_ON(weights->tensor_shape().total_size() == 0);
- ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
- if(biases != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(biases->tensor_shape().total_size() == 0);
- }
- // Device requirements are met
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
- // weights shape is correct
- const DataLayout data_layout = src->data_layout();
- const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != src->dimension(channel_idx), "Weights feature map dimension should match the respective src's one");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4, "Weights can be at most 4 dimensional");
-
- // dst shape is correct
- PadStrideInfo legacy_pad_stride(conv2d_desc.conv2d.stride.x(), conv2d_desc.conv2d.stride.y(), conv2d_desc.conv2d.pad.left, conv2d_desc.conv2d.pad.right, conv2d_desc.conv2d.pad.top,
- conv2d_desc.conv2d.pad.bottom, DimensionRoundingType{});
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(),
- misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, legacy_pad_stride));
-
- // biases shape is correct
- if(biases != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->dimension(0) != weights->dimension(3),
- "Biases size and number of dst feature maps should match");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->num_dimensions() > 1,
- "Biases should be one dimensional");
- }
-
- // 2. Check support level
- // Data type
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
- // Data layout
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC);
-
- return Status{};
-}
-
-bool ClDirectConv2dKernel::operator==(const ClKernel &other) const
-{
- const auto converted = *utils::cast::polymorphic_downcast<const ClDirectConv2dKernel *>(&other);
- return config() == other.config() && tensors() == other.tensors() && desc == converted.desc;
-}
-
-Status ClElementwiseKernel::generate(ClKernelBlueprint &bp) const
-{
- const auto lhs = _tensors.get_const_tensor(TensorType::ACL_SRC_0);
- const auto rhs = _tensors.get_const_tensor(TensorType::ACL_SRC_1);
- const auto dst = _tensors.get_const_tensor(TensorType::ACL_DST_0);
- ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
- ArgumentID lhs_id;
- add_tensor(bp, lhs->desc, lhs_id, lhs->id);
- ArgumentID rhs_id;
- add_tensor(bp, rhs->desc, rhs_id, rhs->id);
- ArgumentID dst_id;
- add_tensor(bp, dst->desc, dst_id, dst->id);
-
- add_kcomp_eltwise_op(bp, desc, lhs_id, rhs_id, dst_id);
- return Status{};
-}
-
-Status ClElementwiseKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst)
-{
- // 1. Check validity
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst);
-
- // Matching data type
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst);
-
- // Matching data layout
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(lhs, rhs);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(lhs, dst);
-
- // All tensor infos are initialized
- ARM_COMPUTE_RETURN_ERROR_ON(lhs->tensor_shape().total_size() == 0);
- ARM_COMPUTE_RETURN_ERROR_ON(rhs->tensor_shape().total_size() == 0);
- ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
-
- // Device requirements are met
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(lhs);
-
- const bool in_place = (lhs == dst) || (rhs == dst);
- const bool src0_in_place = in_place && (lhs == dst);
-
- // dst shape is correct
- const TensorShape out_shape = TensorShape::broadcast_shape(lhs->tensor_shape(), rhs->tensor_shape());
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst");
- if(in_place)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, src0_in_place ? lhs->tensor_shape() : rhs->tensor_shape(), 0),
- "Wrong shape for dst, cannot do in_place calculation");
- }
-
- // 2. Check support level
-
- // Data type
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16);
-
- // Data layout
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(lhs, DataLayout::NHWC);
-
- return Status{};
-}
-
-bool ClElementwiseKernel::operator==(const ClKernel &other) const
-{
- const auto converted = *utils::cast::polymorphic_downcast<const ClElementwiseKernel *>(&other);
- return config() == other.config() && tensors() == other.tensors() && desc == converted.desc;
-}
-
-Status ClFloorKernel::generate(ClKernelBlueprint &bp) const
-{
- const auto src = _tensors.get_const_tensor(TensorType::ACL_SRC_0);
- const auto dst = _tensors.get_const_tensor(TensorType::ACL_DST_0);
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
- ArgumentID src_id;
- add_tensor(bp, src->desc, src_id, src->id);
- ArgumentID dst_id;
- add_tensor(bp, dst->desc, dst_id, dst->id);
-
- add_kcomp_floor(bp, desc, src_id, dst_id);
- return Status{};
-}
-
-Status ClFloorKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- // 1. Check validity
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
-
- // Matching data type
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
-
- // Matching data layout
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
-
- // All tensor infos are initialized
- ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() == 0);
- ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
-
- // Device requirements are met
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
-
- // dst shape is correct
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(src->tensor_shape(), dst->tensor_shape(), 0), "Wrong shape for dst");
-
- // 2. Check support level
-
- // Data type
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32, DataType::F16);
-
- // Data layout
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC);
-
- return Status{};
-}
-
-bool ClFloorKernel::operator==(const ClKernel &other) const
-{
- const auto converted = *utils::cast::polymorphic_downcast<const ClFloorKernel *>(&other);
- return config() == other.config() && tensors() == other.tensors() && desc == converted.desc;
-}
-
-std::vector<const ClKernel *> traverse(const ClKernelGraph &graph)
-{
- std::vector<const ClKernel *> kernels;
- const auto sorted = graph.graph.topological_sort();
- for(const auto &pack : sorted.second)
- {
- kernels.push_back(graph.kernels.at(pack.op).get());
- }
- return kernels;
-}
-
-std::vector<ClKernel *> traverse(ClKernelGraph &graph)
-{
- std::vector<ClKernel *> kernels;
- const auto sorted = graph.graph.topological_sort();
- for(const auto &pack : sorted.second)
- {
- kernels.push_back(graph.kernels.at(pack.op).get());
- }
- return kernels;
-}
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h
deleted file mode 100644
index c3580cfaca..0000000000
--- a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h
+++ /dev/null
@@ -1,259 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
-#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLKERNELGRAPH_H
-#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLKERNELGRAPH_H
-
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/experimental/ClWorkload.h"
-#include "arm_compute/core/experimental/DependencyGraph.h"
-#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelDescriptors.h"
-#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ITensorDescPack.h"
-#include "support/DeepCopy.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-struct ClKernelGraph;
-class ClKernelBlueprint;
-
-enum class Complexity
-{
- Simple,
- Complex
-};
-
-/** Configurations for ClKernel
- *
- */
-struct ClKernelConfig
-{
- UnitWorkloadStage stage{};
- TileDescriptor tile_desc{};
- StoreType store_type{};
- friend bool operator==(const ClKernelConfig &config0, const ClKernelConfig &config1)
- {
- return config0.stage == config1.stage && config0.tile_desc == config1.tile_desc && config0.store_type == config1.store_type;
- }
-};
-
-struct ClKernelTensor
-{
-public:
- using Id = DependencyGraph::Id;
- ClKernelTensor() = default;
- ClKernelTensor(Id id, ITensorInfo *desc, MemoryType memory_type, const AuxMemoryInfo &memory_info)
- : id{ id }, desc{ desc }, memory_type{ memory_type }, memory_info{ memory_info }
- {
- }
- bool operator==(const ClKernelTensor &other) const
- {
- return desc == other.desc;
- }
-
- Id id{};
- ITensorInfo *desc{};
- MemoryType memory_type{};
- AuxMemoryInfo memory_info{};
-};
-
-struct ClKernel
-{
-public:
- using Id = DependencyGraph::Id;
- ClKernel() = default;
- virtual ~ClKernel() = default;
- ClKernel(const ClKernel &kernel) = default;
- ClKernel &operator=(const ClKernel &kernel) = default;
- ClKernel(ClKernel &&kernel) = default;
- ClKernel &operator=(ClKernel &&kernel) = default;
- ClKernel(const ClKernelGraph *graph, Id id, const ClKernelConfig &config, const ITensorDescPack<ClKernelTensor> &tensors)
- : _graph{ graph }, _id{ id }, _config{ config }, _tensors{ tensors }
- {
- }
- virtual bool operator==(const ClKernel &other) const = 0;
- virtual Complexity complexity() const = 0;
- virtual Status generate(ClKernelBlueprint &bp) const = 0;
- Id id() const
- {
- return _id;
- }
- ITensorDescPack<ClKernelTensor> tensors() const
- {
- return _tensors;
- }
- ClKernelConfig config() const
- {
- return _config;
- }
-
-protected:
- const ClKernelGraph *_graph {};
- Id _id{};
- ClKernelConfig _config{};
- ITensorDescPack<ClKernelTensor> _tensors{};
-};
-
-struct ClDirectConv2dKernel : public ClKernel
-{
-public:
- Complexity complexity() const override
- {
- return Complexity::Complex;
- }
- ClDirectConv2dKernel() = default;
- ~ClDirectConv2dKernel() override = default;
- ClDirectConv2dKernel(const ClKernelGraph *graph, Id id, const ClKernelConfig config, const ClDirectConv2dKernelDescriptor &desc, const ITensorDescPack<ClKernelTensor> tensors)
- : ClKernel{ graph, id, config, tensors }, desc{ desc }
- {
- }
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ClDirectConv2dKernelDescriptor &conv2d_desc);
- bool operator==(const ClKernel &other) const override;
- Status generate(ClKernelBlueprint &bp) const override;
-
- ClDirectConv2dKernelDescriptor desc{};
-};
-
-struct ClElementwiseKernel : public ClKernel
-{
-public:
- Complexity complexity() const override
- {
- return Complexity::Simple;
- }
- ClElementwiseKernel() = default;
- ~ClElementwiseKernel() override = default;
- ClElementwiseKernel(const ClKernelGraph *graph, Id id, const ClKernelConfig &config, const ClElementwiseKernelDescriptor &desc, const ITensorDescPack<ClKernelTensor> tensors)
- : ClKernel{ graph, id, config, tensors }, desc{ desc }
- {
- }
- static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst);
- bool operator==(const ClKernel &other) const override;
- Status generate(ClKernelBlueprint &bp) const override;
-
- ClElementwiseKernelDescriptor desc{};
-};
-
-struct ClFloorKernel : public ClKernel
-{
-public:
- Complexity complexity() const override
- {
- return Complexity::Simple;
- }
- ClFloorKernel() = default;
- ~ClFloorKernel() override = default;
- ClFloorKernel(const ClKernelGraph *graph, Id id, const ClKernelConfig &config, const ClFloorKernelDescriptor &desc, const ITensorDescPack<ClKernelTensor> tensors)
- : ClKernel{ graph, id, config, tensors }, desc{ desc }
- {
- }
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
- bool operator==(const ClKernel &other) const override;
- Status generate(ClKernelBlueprint &bp) const override;
-
- ClFloorKernelDescriptor desc{};
-};
-
-struct ClKernelGraph
-{
-public:
- using Id = DependencyGraph::Id;
- using KernelMap = std::map<Id, utils::memory::deep_unique_ptr<ClKernel>>;
- using KernelTensorMap = std::map<Id, utils::memory::deep_unique_ptr<ClKernelTensor>>;
-
- ClKernelGraph() = default;
- ~ClKernelGraph() = default;
-
- friend bool operator==(const ClKernelGraph &graph0, const ClKernelGraph &graph1)
- {
- return graph0.graph == graph1.graph && graph0.kernels == graph1.kernels && graph0.tensors == graph1.tensors;
- }
-
- Status add_kernel_tensor(ITensorInfo *desc, MemoryType memory_type, const AuxMemoryInfo &memory_info, Id &tensor_id, Id merge_point = DependencyGraph::empty_id())
- {
- tensor_id = graph.add_tensor(merge_point);
- if(tensors.find(tensor_id) == tensors.end())
- {
- tensors[tensor_id] = utils::memory::make_deep_unique<ClKernelTensor, ClKernelTensor>(tensor_id, desc, memory_type, memory_info);
- }
- return Status{};
- }
-
- template <typename ContentT, typename KernelDescT>
- Status add_kernel(const ClKernelConfig &config, const KernelDescT &desc, const ITensorDescPack<ClKernelTensor> &tensors, Id &kernel_id)
- {
- const auto src_tensors = tensors.get_const_src_tensors();
- const auto dst_tensors = tensors.get_const_dst_tensors();
- std::vector<Id> src_tensor_ids{};
- std::vector<Id> dst_tensor_ids{};
- for(const auto &t : src_tensors)
- {
- src_tensor_ids.push_back(t->id);
- }
- for(const auto &t : dst_tensors)
- {
- dst_tensor_ids.push_back(t->id);
- }
- kernel_id = graph.add_operator(src_tensor_ids, dst_tensor_ids).second;
- auto k = utils::memory::make_deep_unique<ClKernel, ContentT>(this, kernel_id, config, desc, tensors);
- kernels[kernel_id] = std::move(k);
- return Status{};
- }
-
- ClKernel *get_kernel(Id id)
- {
- return kernels.at(id).get();
- }
- const ClKernel *get_kernel(Id id) const
- {
- return kernels.at(id).get();
- }
-
- ClKernelTensor *get_tensor(Id id)
- {
- return tensors.at(id).get();
- }
- const ClKernelTensor *get_tensor(Id id) const
- {
- return tensors.at(id).get();
- }
-
- DependencyGraph graph{};
- KernelMap kernels{};
- KernelTensorMap tensors{};
-};
-using Id = DependencyGraph::Id;
-
-std::vector<const ClKernel *> traverse(const ClKernelGraph &graph);
-std::vector<ClKernel *> traverse(ClKernelGraph &graph);
-
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLKERNELGRAPH_H
-#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClWorkload.cpp b/src/core/experimental/dynamic_fusion/WorkloadImpl/ClWorkload.cpp
deleted file mode 100644
index dcada4f64b..0000000000
--- a/src/core/experimental/dynamic_fusion/WorkloadImpl/ClWorkload.cpp
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
-#include "arm_compute/core/experimental/ClWorkload.h"
-#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClFusedKernelGraph.h"
-#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h"
-#include "src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-Status build(ClWorkload &workload, const OperatorGraph &op_graph, const ClWorkloadContext &ctx)
-{
- workload.context = ctx;
- ClKernelGraph kernel_graph;
- workload.status = validate(op_graph);
- ARM_COMPUTE_RETURN_ON_ERROR(workload.status);
- workload.status = translate(kernel_graph, *op_graph.impl());
- ARM_COMPUTE_RETURN_ON_ERROR(workload.status);
- ClFusedKernelGraph fused_k_graph;
- std::tie(workload.status, fused_k_graph) = init_fusion_graph(kernel_graph);
- ARM_COMPUTE_RETURN_ON_ERROR(workload.status);
- workload.status = fuse(fused_k_graph);
- ARM_COMPUTE_RETURN_ON_ERROR(workload.status);
- workload.status = generate(workload, ctx, fused_k_graph);
- ARM_COMPUTE_RETURN_ON_ERROR(workload.status);
-
- // Get operator tensor id to workload tensor id map
- const auto op_tensor_to_kernel_tensor = fused_k_graph.original_graph->graph.get_merge_points();
- const auto kernel_tensor_to_workload_tensor = workload.graph.get_merge_points();
- for(const auto op_t : op_graph.impl()->graph.src_tensors())
- {
- const auto kernel_t = op_tensor_to_kernel_tensor.at(op_t);
- const auto workload_t = kernel_tensor_to_workload_tensor.at(kernel_t);
- workload.op_tensor_id_lut[workload_t] = op_t;
- }
- for(const auto op_t : op_graph.impl()->graph.dst_tensors())
- {
- const auto kernel_t = op_tensor_to_kernel_tensor.at(op_t);
- const auto workload_t = kernel_tensor_to_workload_tensor.at(kernel_t);
- workload.op_tensor_id_lut[workload_t] = op_t;
- }
- return workload.status;
-}
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/DependencyGraph.cpp b/src/core/experimental/dynamic_fusion/WorkloadImpl/DependencyGraph.cpp
deleted file mode 100644
index 7350255ebe..0000000000
--- a/src/core/experimental/dynamic_fusion/WorkloadImpl/DependencyGraph.cpp
+++ /dev/null
@@ -1,430 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
-#include "arm_compute/core/experimental/DependencyGraph.h"
-
-#include <algorithm>
-#include <deque>
-#include <set>
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-DependencyGraph::DependencyGraph(const AdjList &adj_src_tensors, const AdjList &adj_dst_tensors, const AdjList &adj_src_ops, const AdjList &adj_dst_ops, std::map<Id, Id> merge_points)
- : _adj_src_tensors{ adj_src_tensors }, _adj_dst_tensors{ adj_dst_tensors }, _adj_src_ops{ adj_src_ops }, _adj_dst_ops{ adj_dst_ops }, _merge_to_internal{ merge_points }, _operator_id{}, _tensor_id{}
-{
-}
-DependencyGraph::DependencyGraph(const std::vector<Id> &imported_tensors)
- : _adj_src_tensors{}, _adj_dst_tensors{}, _adj_src_ops{}, _adj_dst_ops{}, _merge_to_internal{}, _operator_id{}, _tensor_id{}
-{
- for(auto t : imported_tensors)
- {
- _adj_src_ops[t] = {};
- _adj_dst_ops[t] = {};
- }
-}
-
-Status DependencyGraph::update_merge_point(Id t_id, Id merge_point)
-{
- if(_merge_to_internal.find(merge_point) == _merge_to_internal.end())
- {
- return Status{ ErrorCode::RUNTIME_ERROR, "Merge point does not exist" };
- }
- _merge_to_internal[merge_point] = t_id;
- return Status{};
-}
-
-DependencyGraph::Id DependencyGraph::add_tensor(Id merge_tensor)
-{
- Id new_tensor{ empty_id() };
- if(merge_tensor != empty_id())
- {
- if(_merge_to_internal.find(merge_tensor) != _merge_to_internal.end())
- {
- new_tensor = _merge_to_internal[merge_tensor];
- }
- else
- {
- new_tensor = insert_new_tensor();
- _merge_to_internal[merge_tensor] = new_tensor;
- }
- }
- else
- {
- new_tensor = insert_new_tensor();
- }
- return new_tensor;
-}
-
-void DependencyGraph::remove_tensor(Id tensor)
-{
- for(auto src_op : _adj_src_ops.at(tensor))
- {
- auto &dst_tensors = _adj_dst_tensors.at(src_op);
- dst_tensors.erase(
- std::remove(std::begin(dst_tensors), std::end(dst_tensors), tensor),
- std::end(dst_tensors));
- }
- for(auto dst_op : _adj_dst_ops.at(tensor))
- {
- auto &src_tensors = _adj_src_tensors.at(dst_op);
- src_tensors.erase(
- std::remove(std::begin(src_tensors), std::end(src_tensors), tensor),
- std::end(src_tensors));
- }
- _adj_src_ops.erase(tensor);
- _adj_dst_ops.erase(tensor);
-}
-
-std::pair<Status, DependencyGraph::Id> DependencyGraph::add_operator(const std::vector<Id> &inputs, const std::vector<Id> &outputs)
-{
- Id new_op = insert_new_op();
- for(Id tensor : inputs)
- {
- link_input(new_op, tensor);
- }
- for(Id tensor : outputs)
- {
- link_output(new_op, tensor);
- }
-
- // Use topological sort in order to detect possible loops / cycles.
- // NOTE: This is unscalable. We'll need to have a better way of detecting loops or relax this invariant during operation, and add a validate method instead
- return std::pair<Status, DependencyGraph::Id>(topological_sort().first, new_op);
-}
-
-void DependencyGraph::remove_operator(Id op)
-{
- for(auto src_tensor : _adj_src_tensors.at(op))
- {
- auto &dst_ops = _adj_dst_ops.at(src_tensor);
- dst_ops.erase(
- std::remove(std::begin(dst_ops), std::end(dst_ops), op),
- std::end(dst_ops));
- }
- for(auto dst_tensor : _adj_dst_tensors.at(op))
- {
- auto &src_ops = _adj_src_ops.at(dst_tensor);
- src_ops.erase(
- std::remove(std::begin(src_ops), std::end(src_ops), op),
- std::end(src_ops));
- }
- _adj_src_tensors.erase(op);
- _adj_dst_tensors.erase(op);
-}
-
-std::map<DependencyGraph::Id, DependencyGraph::Id> DependencyGraph::get_merge_points() const
-{
- return _merge_to_internal;
-}
-
-std::vector<DependencyGraph::Id> DependencyGraph::get_root_ops() const
-{
- std::vector<Id> ops{};
- const auto op_list = all_ops();
-
- for(auto op : op_list)
- {
- if(src_ops(op).empty())
- {
- ops.emplace_back(op);
- }
- }
- return ops;
-}
-
-std::vector<DependencyGraph::Id> DependencyGraph::get_dst_ops() const
-{
- std::vector<Id> ops{};
- const auto op_list = all_ops();
-
- for(auto op : op_list)
- {
- if(dst_ops(op).empty())
- {
- ops.emplace_back(op);
- }
- }
- return ops;
-}
-
-std::vector<DependencyGraph::Id> DependencyGraph::src_tensors(Id op) const
-{
- ARM_COMPUTE_ERROR_ON(!operator_exists(op));
- return _adj_src_tensors.at(op);
-}
-
-std::vector<DependencyGraph::Id> DependencyGraph::dst_tensors(Id op) const
-{
- ARM_COMPUTE_ERROR_ON(!operator_exists(op));
- return _adj_dst_tensors.at(op);
-}
-
-std::vector<DependencyGraph::Id> DependencyGraph::src_tensors() const
-{
- std::vector<Id> tensors;
- for(auto tensor_src_ops : _adj_src_ops)
- {
- if(tensor_src_ops.second.empty())
- tensors.push_back(tensor_src_ops.first);
- }
- return tensors;
-}
-
-std::vector<DependencyGraph::Id> DependencyGraph::dst_tensors() const
-{
- std::vector<Id> tensors;
- for(auto tensor_dst_ops : _adj_dst_ops)
- {
- if(tensor_dst_ops.second.empty())
- tensors.push_back(tensor_dst_ops.first);
- }
- return tensors;
-}
-
-std::vector<DependencyGraph::Id> DependencyGraph::src_ops_from_tensor(Id tensor) const
-{
- return _adj_src_ops.at(tensor);
-}
-std::vector<DependencyGraph::Id> DependencyGraph::dst_ops_from_tensor(Id tensor) const
-{
- return _adj_dst_ops.at(tensor);
-}
-
-std::vector<DependencyGraph::Id> DependencyGraph::all_ops() const
-{
- std::vector<Id> ops{};
- std::transform(std::begin(_adj_src_tensors), std::end(_adj_src_tensors), std::back_inserter(ops), [](const auto & it)
- {
- return it.first;
- });
- return ops;
-}
-
-bool DependencyGraph::path_exists_from_tensor_to_op(Id src_tensor, Id dst_op) const
-{
- for(auto child_op : dst_ops_from_tensor(src_tensor))
- {
- if(path_exists_from_op_to_op(child_op, dst_op))
- {
- return true;
- }
- }
- return false;
-}
-
-bool DependencyGraph::path_exists_from_op_to_op(Id src_op, Id dst_op) const
-{
- if(src_op == dst_op)
- {
- return true;
- }
- if(is_in(src_op, get_dst_ops()))
- {
- return false;
- }
- for(auto child_tensor : dst_tensors(src_op))
- {
- if(path_exists_from_tensor_to_op(child_tensor, dst_op))
- {
- return true;
- }
- }
- return false;
-}
-
-std::vector<DependencyGraph::Id> DependencyGraph::all_tensors() const
-{
- std::vector<Id> tensors{};
- std::transform(std::begin(_adj_src_ops), std::end(_adj_src_ops), std::back_inserter(tensors), [](const auto & it)
- {
- return it.first;
- });
- return tensors;
-}
-
-unsigned int DependencyGraph::number_of_ops() const
-{
- return _adj_src_tensors.size();
-}
-
-unsigned int DependencyGraph::number_of_tensors() const
-{
- return _adj_src_ops.size();
-}
-
-DependencyGraph::Id DependencyGraph::insert_new_tensor()
-{
- Id new_tensor = _tensor_id.alloc();
- _adj_src_ops[new_tensor] = {};
- _adj_dst_ops[new_tensor] = {};
- return new_tensor;
-}
-DependencyGraph::Id DependencyGraph::insert_new_op()
-{
- Id new_op = _operator_id.alloc();
- _adj_src_tensors[new_op] = {};
- _adj_dst_tensors[new_op] = {};
- return new_op;
-}
-void DependencyGraph::link_input(Id op, Id in_tensor)
-{
- ARM_COMPUTE_ERROR_ON(!operator_exists(op));
- ARM_COMPUTE_ERROR_ON(!tensor_exists(in_tensor));
- ARM_COMPUTE_ERROR_ON(are_connected(op, in_tensor));
- _adj_src_tensors[op].push_back(in_tensor);
- _adj_dst_ops[in_tensor].push_back(op);
-}
-void DependencyGraph::link_output(Id op, Id out_tensor)
-{
- ARM_COMPUTE_ERROR_ON(!operator_exists(op));
- ARM_COMPUTE_ERROR_ON(!tensor_exists(out_tensor));
- ARM_COMPUTE_ERROR_ON(are_connected(op, out_tensor));
- _adj_dst_tensors[op].push_back(out_tensor);
- _adj_src_ops[out_tensor].push_back(op);
-}
-bool DependencyGraph::tensor_exists(Id tensor) const
-{
- return _adj_src_ops.find(tensor) != _adj_src_ops.end() && _adj_dst_ops.find(tensor) != _adj_dst_ops.end();
-}
-bool DependencyGraph::operator_exists(Id op) const
-{
- return _adj_src_tensors.find(op) != _adj_src_tensors.end() && _adj_dst_tensors.find(op) != _adj_dst_tensors.end();
-}
-
-bool DependencyGraph::is_src_tensor(Id tensor) const
-{
- if(!tensor_exists(tensor))
- {
- return false;
- }
- return _adj_src_ops.at(tensor).empty();
-}
-
-bool DependencyGraph::is_dst_tensor(Id tensor) const
-{
- if(!tensor_exists(tensor))
- {
- return false;
- }
- return _adj_dst_ops.at(tensor).empty();
-}
-bool DependencyGraph::is_src_tensor_of(Id op, Id tensor) const
-{
- if(!operator_exists(op) || !tensor_exists(tensor))
- {
- return false;
- }
- const auto op_inputs = src_tensors(op);
- return std::find(op_inputs.begin(), op_inputs.end(), tensor) != op_inputs.end();
-}
-bool DependencyGraph::is_dst_tensor_of(Id op, Id tensor) const
-{
- if(!operator_exists(op) || !tensor_exists(tensor))
- {
- return false;
- }
- const auto op_outputs = dst_tensors(op);
- return std::find(op_outputs.begin(), op_outputs.end(), tensor) != op_outputs.end();
-}
-bool DependencyGraph::are_connected(Id op, Id tensor) const
-{
- return is_src_tensor_of(op, tensor) || is_dst_tensor_of(op, tensor);
-}
-std::vector<DependencyGraph::Id> DependencyGraph::src_ops(Id op) const
-{
- ARM_COMPUTE_ERROR_ON(!operator_exists(op));
- std::vector<Id> ops{};
- for(Id src_tensor : src_tensors(op))
- {
- ops.insert(ops.end(), std::begin(_adj_src_ops.at(src_tensor)), std::end(_adj_src_ops.at(src_tensor)));
- }
- return ops;
-}
-
-std::vector<DependencyGraph::Id> DependencyGraph::dst_ops(Id op) const
-{
- ARM_COMPUTE_ERROR_ON(!operator_exists(op));
- std::vector<Id> ops{};
- for(Id dst_tensor : _adj_dst_tensors.at(op))
- {
- ops.insert(ops.end(), std::begin(_adj_dst_ops.at(dst_tensor)), std::end(_adj_dst_ops.at(dst_tensor)));
- }
- return ops;
-}
-
-std::pair<Status, std::vector<DependencyGraph::OpPack>> DependencyGraph::topological_sort() const
-{
- // Incident degree (number of source operators to an op)
- std::map<Id, unsigned int> in_degree{};
- std::set<Id> visited_ops{};
- std::deque<Id> zero_in_degree_ops{};
- std::vector<OpPack> sorted_op_packs{};
- for(auto op : all_ops())
- {
- const auto degree = src_ops(op).size();
- in_degree[op] = degree;
- if(degree == 0)
- {
- zero_in_degree_ops.push_back(op);
- visited_ops.insert(op);
- }
- }
-
- while(!zero_in_degree_ops.empty())
- {
- const Id op = zero_in_degree_ops.front();
- zero_in_degree_ops.pop_front();
- sorted_op_packs.push_back(OpPack{ op, src_tensors(op), dst_tensors(op) });
-
- for(const auto next_op : dst_ops(op))
- {
- if(in_degree[next_op] > 0)
- {
- in_degree[next_op]--;
- }
- if(in_degree[next_op] == 0 && visited_ops.find(next_op) == visited_ops.end())
- {
- zero_in_degree_ops.push_back(next_op);
- visited_ops.insert(op);
- }
- }
- }
-
- // If there are remaining ops with in_degree > 0, then it's indication that there are cycles in the graph
- Status st{};
- if(sorted_op_packs.size() != number_of_ops())
- {
- st = Status{ ErrorCode::RUNTIME_ERROR, "Cycles or loops are not allowed in a DependencyGraph" };
- }
- return std::make_pair(st, sorted_op_packs);
-}
-
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/ITensorDescPack.h b/src/core/experimental/dynamic_fusion/WorkloadImpl/ITensorDescPack.h
deleted file mode 100644
index a4e4eaa3bb..0000000000
--- a/src/core/experimental/dynamic_fusion/WorkloadImpl/ITensorDescPack.h
+++ /dev/null
@@ -1,241 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
-#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_ITENSORDESCPACK_H
-#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_ITENSORDESCPACK_H
-
-#include <cstddef>
-#include <unordered_map>
-#include <vector>
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-template <typename TDesc>
-class ITensorDescPack
-{
-public:
- struct PackElement
- {
- PackElement() = default;
- ~PackElement() = default;
- PackElement(const PackElement &) = default;
- PackElement &operator=(const PackElement &) = default;
- PackElement(PackElement &&) = default;
- PackElement &operator=(PackElement &&) = default;
- PackElement(int id, TDesc *tensor)
- : id(id), tensor(tensor), ctensor(nullptr)
- {
- }
- PackElement(int id, const TDesc *ctensor)
- : id(id), tensor(nullptr), ctensor(ctensor)
- {
- }
-
- int id{ -1 };
- TDesc *tensor{ nullptr };
- const TDesc *ctensor{ nullptr };
-
- friend bool operator==(const PackElement &elem0, const PackElement &elem1)
- {
- const bool same_ctensor = (elem0.tensor == nullptr && elem1.tensor == nullptr && elem0.ctensor != nullptr && elem1.ctensor != nullptr && *elem0.ctensor == *elem1.ctensor);
- const bool same_tensor = (elem0.ctensor == nullptr && elem1.ctensor == nullptr && elem0.tensor != nullptr && elem1.tensor != nullptr && *elem0.tensor == *elem1.tensor);
-
- return elem0.id == elem1.id && (same_ctensor || same_tensor);
- }
- };
-
-public:
- /** Default Constructor */
- ITensorDescPack() = default;
- ~ITensorDescPack() = default;
- ITensorDescPack<TDesc>(const ITensorDescPack<TDesc> &other) = default;
- ITensorDescPack<TDesc> &operator=(const ITensorDescPack<TDesc> &other) = default;
- ITensorDescPack<TDesc>(ITensorDescPack<TDesc> &&other) = default;
- ITensorDescPack<TDesc> &operator=(ITensorDescPack<TDesc> &&other) = default;
- /** Initializer list Constructor */
- ITensorDescPack(std::initializer_list<PackElement> l)
- : _pack{}
- {
- for(auto &e : l)
- {
- _pack[e.id] = e;
- }
- }
- /** Add tensor to the pack
- *
- * @param[in] id ID/type of the tensor to add
- * @param[in] tensor Tensor to add
- */
- void add_tensor(int id, TDesc *tensor)
- {
- _pack[id] = PackElement(id, tensor);
- }
-
- /** Add const tensor to the pack
- *
- * @param[in] id ID/type of the tensor to add
- * @param[in] tensor Tensor to add
- */
- void add_const_tensor(int id, const TDesc *tensor)
- {
- _pack[id] = PackElement(id, tensor);
- }
- /** Get tensor of a given id from the pac
- *
- * @param[in] id ID of tensor to extract
- *
- * @return The pointer to the tensor if exist and is non-const else nullptr
- */
- TDesc *get_tensor(int id)
- {
- auto it = _pack.find(id);
- return it != _pack.end() ? it->second.tensor : nullptr;
- }
- /** Get constant tensor of a given id
- *
- * @param[in] id ID of tensor to extract
- *
- * @return The pointer to the tensor if exist and is const else nullptr
- */
- const TDesc *get_const_tensor(int id) const
- {
- auto it = _pack.find(id);
- if(it != _pack.end())
- {
- return it->second.ctensor != nullptr ? it->second.ctensor : it->second.tensor;
- }
- return nullptr;
- }
- /** Remove the tensor stored with the given id
- *
- * @param[in] id ID of tensor to remove
- */
- void remove_tensor(int id)
- {
- _pack.erase(id);
- }
- /** Pack size accessor
- *
- * @return Number of tensors registered to the pack
- */
- size_t size() const
- {
- return _pack.size();
- }
- /** Checks if pack is empty
- *
- * @return True if empty else false
- */
- bool empty() const
- {
- return _pack.empty();
- }
-
- /** Get the ACL_SRC_* tensors
- *
- * @return std::vector<TDesc *>
- */
- std::vector<TDesc *> get_src_tensors()
- {
- std::vector<TDesc *> src_tensors{};
- for(int id = static_cast<int>(TensorType::ACL_SRC); id <= static_cast<int>(TensorType::ACL_SRC_END); ++id)
- {
- auto tensor = get_tensor(id);
- if(tensor != nullptr)
- {
- src_tensors.push_back(tensor);
- }
- }
- return src_tensors;
- }
- /** Get the const ACL_SRC_* tensors
- *
- * @return std::vector<const TDesc *>
- */
- std::vector<const TDesc *> get_const_src_tensors() const
- {
- std::vector<const TDesc *> src_tensors{};
- for(int id = static_cast<int>(TensorType::ACL_SRC); id <= static_cast<int>(TensorType::ACL_SRC_END); ++id)
- {
- auto tensor = get_const_tensor(id);
- if(tensor != nullptr)
- {
- src_tensors.push_back(tensor);
- }
- }
- return src_tensors;
- }
- /** Get the ACL_DST_* tensors
- *
- * @return std::vector<TDesc *>
- */
- std::vector<TDesc *> get_dst_tensors()
- {
- std::vector<TDesc *> dst_tensors{};
- for(int id = static_cast<int>(TensorType::ACL_DST); id <= static_cast<int>(TensorType::ACL_DST_END); ++id)
- {
- auto tensor = get_tensor(id);
- if(tensor != nullptr)
- {
- dst_tensors.push_back(tensor);
- }
- }
- return dst_tensors;
- }
- /** Get the const ACL_DST_* tensors
- *
- * @return std::vector<const TDesc *>
- */
- std::vector<const TDesc *> get_const_dst_tensors() const
- {
- std::vector<const TDesc *> dst_tensors{};
- for(int id = static_cast<int>(TensorType::ACL_DST); id <= static_cast<int>(TensorType::ACL_DST_END); ++id)
- {
- auto tensor = get_const_tensor(id);
- if(tensor != nullptr)
- {
- dst_tensors.push_back(tensor);
- }
- }
- return dst_tensors;
- }
-
- friend bool operator==(const ITensorDescPack<TDesc> &pack0, const ITensorDescPack<TDesc> &pack1)
- {
- return pack0._pack == pack1._pack;
- }
-
-private:
- std::unordered_map<int, PackElement> _pack{}; /**< Container with the packed tensors */
-};
-
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_ITENSORDESCPACK_H
-#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file
diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.cpp b/src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.cpp
deleted file mode 100644
index 663b89e235..0000000000
--- a/src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.cpp
+++ /dev/null
@@ -1,423 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-
-#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelGraph.h"
-#include "src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-namespace
-{
-/** Add an operator tensor to the kernel graph with a caller-chosen memory type and auxiliary memory info
- *
- * @param[in,out] k_graph     Kernel graph receiving the tensor
- * @param[in]     op_graph    Operator graph; not consulted by this overload
- * @param[in]     op_tensor   Operator tensor whose desc and id are forwarded
- * @param[in]     memory_type Memory type forwarded to the kernel graph
- * @param[in]     memory_info Auxiliary memory info forwarded to the kernel graph
- * @param[out]    id          Id argument forwarded to the kernel graph call
- *
- * @return Status returned by ClKernelGraph::add_kernel_tensor
- */
-Status add_kernel_tensor(ClKernelGraph &k_graph, const OperatorGraph::Implementation &op_graph, const OpTensorContent &op_tensor, MemoryType memory_type, AuxMemoryInfo memory_info,
-                         DependencyGraph::Id &id)
-{
-    ARM_COMPUTE_UNUSED(op_graph);
-
-    const auto &tensor_desc  = op_tensor.desc;
-    const auto &op_tensor_id = op_tensor.id;
-    return k_graph.add_kernel_tensor(tensor_desc, memory_type, memory_info, id, op_tensor_id);
-}
-
-/** Add an operator tensor to the kernel graph, deriving its memory type from its position in the operator graph
- *
- * For a tensor t:
- *  1. If t is a src tensor of the entire op graph, it is Core.
- *     (Optimisation opportunity: if all translate methods were guaranteed to be called in topological order, t could
- *     always be assigned Core. Even if the op is non-root (meaning t should be an Aux tensor), the src tensors would
- *     already have been determined by the ancestor ops, and thus would not be overridden by it.)
- *  2. If t is a dst tensor of the entire op graph, it is Core.
- *  3. Aux tensors with Persistent or Prepare lifetime are specified manually through the explicit overload.
- *  4. Every other t is Auxiliary with Temporary lifetime, sized by its desc's total_size().
- */
-Status add_kernel_tensor(ClKernelGraph &k_graph, const OperatorGraph::Implementation &op_graph, const OpTensorContent &op_tensor, DependencyGraph::Id &id)
-{
-    const bool is_graph_src = is_in(op_tensor.id, op_graph.graph.src_tensors());
-    const bool is_graph_dst = is_in(op_tensor.id, op_graph.graph.dst_tensors());
-    const bool is_internal  = !(is_graph_src || is_graph_dst);
-
-    MemoryType    memory_type = MemoryType::Core;
-    AuxMemoryInfo memory_info;
-    if(is_internal)
-    {
-        memory_type          = MemoryType::Auxiliary;
-        memory_info.lifetime = AuxMemoryLifetime::Temporary;
-        memory_info.size     = op_tensor.desc->total_size();
-    }
-    return add_kernel_tensor(k_graph, op_graph, op_tensor, memory_type, memory_info, id);
-}
-
-/** Kernel-size threshold for choosing direct convolution with the NHWC data layout.
- *
- * @note Duplicate of the function with the same name in src/gpu/cl/operators/ClConv2d.cpp
- *
- * @note Direct convolution should be executed when the kernel's spatial dimensions are greater than or equal to the returned value
- *
- * @param[in] gpu_target GPU target
- *
- * @return 7 for G71, G72, MIDGARD and BIFROST; 5 for every other target (including G76, G77 and G78)
- */
-size_t get_direct_conv_kernel_threshold_nhwc(arm_compute::GPUTarget gpu_target)
-{
-    constexpr size_t default_threshold = 5;
-    constexpr size_t raised_threshold  = 7;
-
-    switch(gpu_target)
-    {
-        case arm_compute::GPUTarget::G71:
-        case arm_compute::GPUTarget::G72:
-        case arm_compute::GPUTarget::MIDGARD:
-        case arm_compute::GPUTarget::BIFROST:
-            return raised_threshold;
-        default:
-            // G76, G77 and G78 land here together with every other target
-            return default_threshold;
-    }
-}
-} // namespace
-
-/** Two OpTensor handles are equal when their ids are equal */
-bool operator==(const OpTensor &t0, const OpTensor &t1)
-{
-    return t0.id() == t1.id();
-}
-/** Compare two Conv2d descriptors on padding, stride and dilation
- *
- * Padding used to be left out of the comparison, so descriptors differing only in padding compared equal.
- * It is compared field by field because only its left/right/top/bottom members are relied upon here.
- */
-bool operator==(const Conv2dDescriptor &conv2d0, const Conv2dDescriptor &conv2d1)
-{
-    const auto &pad0 = conv2d0.pad;
-    const auto &pad1 = conv2d1.pad;
-
-    const bool same_pad = pad0.left == pad1.left && pad0.right == pad1.right && pad0.top == pad1.top && pad0.bottom == pad1.bottom;
-    return same_pad && conv2d0.stride == conv2d1.stride && conv2d0.dilation == conv2d1.dilation;
-}
-
-/** Two elementwise descriptors are equal when they carry the same arithmetic operation */
-bool operator==(const ElementwiseDescriptor &ed0, const ElementwiseDescriptor &ed1)
-{
-    const bool same_operation = (ed0.op == ed1.op);
-    return same_operation;
-}
-
-/** Floor descriptors have no fields taking part in the comparison, so any two of them are currently equal */
-bool operator==(const FloorDescriptor &, const FloorDescriptor &)
-{
-    return true;
-}
-
-/** Compare this content with another operator content by their Conv2d descriptors
- *
- * @param[in] other Content to compare against; it is downcast to Conv2dContent
- *
- * @return true if both descriptors compare equal
- */
-bool Conv2dContent::operator==(const OperatorContent &other) const
-{
-    // Bind by reference: a by-value `auto` here copied the entire content object just to read its descriptor
-    const auto &converted = *utils::cast::polymorphic_downcast<const Conv2dContent *>(&other);
-    return desc == converted.desc;
-}
-
-/** Compare this content with another operator content by their elementwise descriptors
- *
- * @param[in] other Content to compare against; it is downcast to ElementwiseContent
- *
- * @return true if both descriptors compare equal
- */
-bool ElementwiseContent::operator==(const OperatorContent &other) const
-{
-    // Bind by reference: a by-value `auto` here copied the entire content object just to read its descriptor
-    const auto &converted = *utils::cast::polymorphic_downcast<const ElementwiseContent *>(&other);
-    return desc == converted.desc;
-}
-
-/** Compare this content with another operator content by their floor descriptors
- *
- * @param[in] other Content to compare against; it is downcast to FloorContent
- *
- * @return true if both descriptors compare equal
- */
-bool FloorContent::operator==(const OperatorContent &other) const
-{
-    // Bind by reference: a by-value `auto` here copied the entire content object just to read its descriptor
-    const auto &converted = *utils::cast::polymorphic_downcast<const FloorContent *>(&other);
-    return desc == converted.desc;
-}
-
-// Choose the convolution method for a Conv2d operator.
-// Decision order, as implemented below:
-//   1. an exact match against the table of known configurations returns the method stored in the table;
-//   2. any dilation other than 1x1 returns GEMM;
-//   3. NCHW input raises an error;
-//   4. otherwise (NHWC) the SRGAN special case and the floating-point heuristics may return DIRECT, else GEMM.
-ConvolutionMethod Conv2dContent::select_conv_method(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const Conv2dDescriptor &conv2d_desc, const GPUTarget gpu_target)
-{
- // Modified from ClConv2d::get_convolution_method
-
- ARM_COMPUTE_ERROR_ON_NULLPTR(src);
- ARM_COMPUTE_ERROR_ON_NULLPTR(dst);
- ARM_COMPUTE_ERROR_ON_NULLPTR(weights);
-
- // Re-express the descriptor's stride and padding as a PadStrideInfo so it can be compared with the table entries
- const PadStrideInfo legacy_pad_stride(conv2d_desc.stride.x(), conv2d_desc.stride.y(), conv2d_desc.pad.left, conv2d_desc.pad.right, conv2d_desc.pad.top, conv2d_desc.pad.bottom, DimensionRoundingType{});
- const Size2D dilation = conv2d_desc.dilation;
-
- // Dimension indices are resolved against the data layout of src; the same indices are applied to weights below
- const size_t idx_w = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH);
- const size_t idx_h = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT);
- const size_t idx_c = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL);
-
- /* Input spatial dims, kernel size, IFM/OFM, conv info*/
- using ConvolutionConfiguration = std::tuple<Size2D, Size2D, Size2D, PadStrideInfo, DataLayout>;
- using ConfigurationMethod = std::pair<ConvolutionConfiguration, ConvolutionMethod>;
-
- const std::vector<ConfigurationMethod> known_configs =
- {
- // Alexnet
- ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U), PadStrideInfo(1U, 1U, 2U, 2U), DataLayout::NCHW), ConvolutionMethod::DIRECT),
- // VGG16 / VGG19
- ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U), PadStrideInfo(1U, 1U, 1U, 1U), DataLayout::NCHW), ConvolutionMethod::DIRECT),
- // Mobilenet 224
- ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW), ConvolutionMethod::GEMM),
- // Mobilenet 160
- ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW), ConvolutionMethod::GEMM),
- // Mobilenet 224
- ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC), ConvolutionMethod::GEMM),
- // Mobilenet 160
- ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC), ConvolutionMethod::GEMM),
- };
-
- // Predicate: true when a table entry matches the input size, kernel size, IFM/OFM, padding, stride and data layout
- const auto find_config = [&](ConfigurationMethod c)
- {
- const ConvolutionConfiguration config = c.first;
- const PadStrideInfo info = std::get<3>(config);
- const DataLayout data_layout = std::get<4>(config);
-
- return std::get<0>(config) == Size2D(src->dimension(idx_w), src->dimension(idx_h)) && std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h))
- && std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && info.pad_top() == legacy_pad_stride.pad_top() && info.pad_right() == legacy_pad_stride.pad_right()
- && info.pad_bottom() == legacy_pad_stride.pad_bottom() && info.pad_left() == legacy_pad_stride.pad_left() && info.stride() == legacy_pad_stride.stride() && (data_layout == src->data_layout());
- };
-
- // A known configuration takes precedence over every heuristic below
- std::vector<ConfigurationMethod>::const_iterator found;
- if((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end())
- {
- return (*found).second;
- }
-
- // Any dilation other than 1x1 is routed to GEMM
- if(dilation != Size2D(1U, 1U))
- {
- return ConvolutionMethod::GEMM;
- }
- else
- {
- if(src->data_layout() == DataLayout::NCHW)
- {
- ARM_COMPUTE_ERROR("NCHW not supported");
- }
- else
- {
- // Whether the direct convolution kernel accepts this configuration; validated without a bias tensor
- const bool is_direct_valid = bool(ClDirectConv2dKernel::validate(src, weights, nullptr, dst, ClDirectConv2dKernelDescriptor{ conv2d_desc }));
- const size_t kernel_sz_direct_conv_thr = get_direct_conv_kernel_threshold_nhwc(gpu_target);
-
- // SRGAN case
- if((src->dimension(idx_h) > 720U) && (dst->dimension(idx_h) > 720U) && (weights->dimension(idx_h) == 9) && (conv2d_desc.pad.top < 3)
- && is_direct_valid)
- {
- return ConvolutionMethod::DIRECT;
- }
-
- // Floating-point case: GeMM/Direct
- if(is_data_type_float(src->data_type()))
- {
- // Get dst shape
- TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, legacy_pad_stride);
- const bool is_large_kernel_sz = (weights->dimension(idx_w) >= kernel_sz_direct_conv_thr) && (weights->dimension(idx_h) >= kernel_sz_direct_conv_thr);
- const bool is_ifm_ge_16 = src->dimension(idx_c) >= 16;
- const bool is_ofm_lte_8 = weights->dimension(3U) <= 8;
- const bool workload_gte_8192 = (output_shape[0] * output_shape[1] * output_shape[2]) / 16 >= 8192;
- const bool is_ifm_gt_ofm = src->dimension(idx_c) > weights->dimension(3U);
-
- // Direct convolution case
- if(is_direct_valid)
- {
- // G71, G72 and MIDGARD use a stricter rule than the remaining targets
- if((gpu_target == arm_compute::GPUTarget::G71 || gpu_target == arm_compute::GPUTarget::G72 || gpu_target == arm_compute::GPUTarget::MIDGARD))
- {
- if(is_large_kernel_sz && is_ifm_ge_16 && is_ifm_gt_ofm)
- {
- return ConvolutionMethod::DIRECT;
- }
- }
- else
- {
- if((is_large_kernel_sz && workload_gte_8192 && is_ifm_ge_16) || (is_ofm_lte_8 && is_ifm_ge_16))
- {
- return ConvolutionMethod::DIRECT;
- }
- }
- }
-
- // Default case
- return ConvolutionMethod::GEMM;
- }
-
- // Generic case for quantized. Only GeMM
- return ConvolutionMethod::GEMM;
- }
- }
- // NOTE(review): only reachable after the NCHW error branch above, i.e. if ARM_COMPUTE_ERROR returns - confirm
- return ConvolutionMethod::DIRECT;
-}
-
-/** Translate this Conv2d operator into kernel(s) of the kernel graph
- *
- * The forced method is used when one has been set; otherwise the method is chosen by select_conv_method()
- * for the current CLScheduler target. Only the direct convolution method is implemented.
- *
- * @param[in,out] kernel_graph Kernel graph to populate
- *
- * @return Status of the direct conv2d translation, or a "Not implemented" error for any other method
- */
-Status Conv2dContent::translate(ClKernelGraph &kernel_graph) const
-{
-    const auto input  = _tensors.get_const_tensor(TensorType::ACL_SRC_0);
-    const auto weight = _tensors.get_const_tensor(TensorType::ACL_SRC_1);
-    const auto dst    = _tensors.get_const_tensor(TensorType::ACL_DST_0);
-
-    ConvolutionMethod method = forced_method;
-    if(!forced_method_enabled)
-    {
-        // The heuristic dereferences all three tensors, so check them first (same check as translate_direct_conv2d)
-        ARM_COMPUTE_ERROR_ON_NULLPTR(input, weight, dst);
-        method = Conv2dContent::select_conv_method(input->desc, weight->desc, dst->desc, desc, CLScheduler::get().target());
-    }
-
-    switch(method)
-    {
-        case ConvolutionMethod::DIRECT:
-        {
-            return translate_direct_conv2d(kernel_graph);
-        }
-        default:
-        {
-            ARM_COMPUTE_RETURN_ERROR_MSG("Not implemented");
-        }
-    }
-    return Status{};
-}
-/** Translate this Conv2d operator into a single ClDirectConv2dKernel
- *
- * Adds src, weight, optional bias and dst to the kernel graph (in that order), validates the kernel,
- * then adds the kernel itself. Returns at the first failing step.
- *
- * @param[in,out] kernel_graph Kernel graph to populate
- *
- * @return Status of the first failing step, or an empty Status on success
- */
-Status Conv2dContent::translate_direct_conv2d(ClKernelGraph &kernel_graph) const
-{
-    const auto src_content    = _tensors.get_const_tensor(TensorType::ACL_SRC_0);
-    const auto weight_content = _tensors.get_const_tensor(TensorType::ACL_SRC_1);
-    const auto bias_content   = _tensors.get_const_tensor(TensorType::ACL_SRC_2);
-    const auto dst_content    = _tensors.get_const_tensor(TensorType::ACL_DST_0);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(src_content, weight_content, dst_content);
-
-    // Bias is the only optional tensor
-    const bool has_bias = (bias_content != nullptr);
-
-    ITensorDescPack<ClKernelTensor> kernel_tensors;
-
-    DependencyGraph::Id src_id;
-    auto                status = add_kernel_tensor(kernel_graph, *_graph, *src_content, src_id);
-    ARM_COMPUTE_RETURN_ON_ERROR(status);
-    kernel_tensors.add_const_tensor(ACL_SRC_0, kernel_graph.get_tensor(src_id));
-
-    DependencyGraph::Id weight_id;
-    status = add_kernel_tensor(kernel_graph, *_graph, *weight_content, weight_id);
-    ARM_COMPUTE_RETURN_ON_ERROR(status);
-    kernel_tensors.add_const_tensor(ACL_SRC_1, kernel_graph.get_tensor(weight_id));
-
-    if(has_bias)
-    {
-        DependencyGraph::Id bias_id;
-        status = add_kernel_tensor(kernel_graph, *_graph, *bias_content, bias_id);
-        ARM_COMPUTE_RETURN_ON_ERROR(status);
-        kernel_tensors.add_const_tensor(ACL_SRC_2, kernel_graph.get_tensor(bias_id));
-    }
-
-    DependencyGraph::Id dst_id;
-    status = add_kernel_tensor(kernel_graph, *_graph, *dst_content, dst_id);
-    ARM_COMPUTE_RETURN_ON_ERROR(status);
-    kernel_tensors.add_const_tensor(ACL_DST_0, kernel_graph.get_tensor(dst_id));
-
-    DependencyGraph::Id kernel_id;
-    const auto          kernel_desc = ClDirectConv2dKernelDescriptor{ desc };
-
-    // Validate before the kernel is added to the graph
-    status = ClDirectConv2dKernel::validate(src_content->desc, weight_content->desc, has_bias ? bias_content->desc : nullptr, dst_content->desc, kernel_desc);
-    ARM_COMPUTE_RETURN_ON_ERROR(status);
-
-    ClKernelConfig kernel_config{ UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }, TileDescriptor{}, StoreType::TStoreIndirectWidthSelect };
-    status = kernel_graph.add_kernel<ClDirectConv2dKernel>(kernel_config, kernel_desc, kernel_tensors, kernel_id);
-    ARM_COMPUTE_RETURN_ON_ERROR(status);
-    ARM_COMPUTE_UNUSED(kernel_id);
-
-    return Status{};
-}
-
-/** Translate this elementwise operator into a single ClElementwiseKernel
- *
- * Adds lhs, rhs and dst to the kernel graph (in that order), validates the kernel, then adds the kernel itself.
- * Returns at the first failing step.
- *
- * @param[in,out] kernel_graph Kernel graph to populate
- *
- * @return Status of the first failing step, or an empty Status on success
- */
-Status ElementwiseContent::translate(ClKernelGraph &kernel_graph) const
-{
-    const auto lhs_content = _tensors.get_const_tensor(TensorType::ACL_SRC_0);
-    const auto rhs_content = _tensors.get_const_tensor(TensorType::ACL_SRC_1);
-    const auto dst_content = _tensors.get_const_tensor(TensorType::ACL_DST_0);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(lhs_content, rhs_content, dst_content);
-
-    ITensorDescPack<ClKernelTensor> kernel_tensors;
-
-    DependencyGraph::Id lhs_id;
-    auto                status = add_kernel_tensor(kernel_graph, *_graph, *lhs_content, lhs_id);
-    ARM_COMPUTE_RETURN_ON_ERROR(status);
-    kernel_tensors.add_const_tensor(ACL_SRC_0, kernel_graph.get_tensor(lhs_id));
-
-    DependencyGraph::Id rhs_id;
-    status = add_kernel_tensor(kernel_graph, *_graph, *rhs_content, rhs_id);
-    ARM_COMPUTE_RETURN_ON_ERROR(status);
-    kernel_tensors.add_const_tensor(ACL_SRC_1, kernel_graph.get_tensor(rhs_id));
-
-    DependencyGraph::Id dst_id;
-    status = add_kernel_tensor(kernel_graph, *_graph, *dst_content, dst_id);
-    ARM_COMPUTE_RETURN_ON_ERROR(status);
-    kernel_tensors.add_const_tensor(ACL_DST_0, kernel_graph.get_tensor(dst_id));
-
-    DependencyGraph::Id kernel_id;
-    ClKernelConfig      kernel_config{ UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }, TileDescriptor{}, StoreType::TStoreIndirectWidthSelect };
-
-    // Validate before the kernel is added to the graph
-    status = ClElementwiseKernel::validate(lhs_content->desc, rhs_content->desc, dst_content->desc);
-    ARM_COMPUTE_RETURN_ON_ERROR(status);
-
-    status = kernel_graph.add_kernel<ClElementwiseKernel>(kernel_config, ClElementwiseKernelDescriptor{ desc }, kernel_tensors, kernel_id);
-    ARM_COMPUTE_RETURN_ON_ERROR(status);
-    ARM_COMPUTE_UNUSED(kernel_id);
-
-    return Status{};
-}
-
-/** Translate this floor operator into a single ClFloorKernel
- *
- * Adds src and dst to the kernel graph (in that order), validates the kernel, then adds the kernel itself.
- * Returns at the first failing step.
- *
- * @param[in,out] kernel_graph Kernel graph to populate
- *
- * @return Status of the first failing step, or an empty Status on success
- */
-Status FloorContent::translate(ClKernelGraph &kernel_graph) const
-{
-    const auto src_content = _tensors.get_const_tensor(TensorType::ACL_SRC_0);
-    const auto dst_content = _tensors.get_const_tensor(TensorType::ACL_DST_0);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(src_content, dst_content);
-
-    ITensorDescPack<ClKernelTensor> kernel_tensors;
-
-    DependencyGraph::Id src_id;
-    auto                status = add_kernel_tensor(kernel_graph, *_graph, *src_content, src_id);
-    ARM_COMPUTE_RETURN_ON_ERROR(status);
-    kernel_tensors.add_const_tensor(ACL_SRC_0, kernel_graph.get_tensor(src_id));
-
-    DependencyGraph::Id dst_id;
-    status = add_kernel_tensor(kernel_graph, *_graph, *dst_content, dst_id);
-    ARM_COMPUTE_RETURN_ON_ERROR(status);
-    kernel_tensors.add_const_tensor(ACL_DST_0, kernel_graph.get_tensor(dst_id));
-
-    DependencyGraph::Id floor_id;
-    ClKernelConfig      kernel_config{ UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }, TileDescriptor{}, StoreType::TStoreIndirectWidthSelect };
-
-    // Validate before the kernel is added to the graph
-    status = ClFloorKernel::validate(src_content->desc, dst_content->desc);
-    ARM_COMPUTE_RETURN_ON_ERROR(status);
-
-    status = kernel_graph.add_kernel<ClFloorKernel>(kernel_config, ClFloorKernelDescriptor{ desc }, kernel_tensors, floor_id);
-    ARM_COMPUTE_RETURN_ON_ERROR(status);
-
-    return Status{};
-}
-
-/** List the operator contents of a graph in the order produced by its topological sort (const view)
- *
- * @note Only the second element of the topological_sort() result is read here; the first element is not inspected
- */
-std::vector<const OperatorContent *> traverse(const OperatorGraph::Implementation &graph)
-{
-    const auto sort_result = graph.graph.topological_sort();
-    const auto &ordered    = sort_result.second;
-
-    std::vector<const OperatorContent *> contents;
-    for(const auto &entry : ordered)
-    {
-        const OperatorContent *content = graph.operators.at(entry.op).get();
-        contents.push_back(content);
-    }
-    return contents;
-}
-
-/** List the operator contents of a graph in the order produced by its topological sort (mutable view)
- *
- * @note Only the second element of the topological_sort() result is read here; the first element is not inspected
- */
-std::vector<OperatorContent *> traverse(OperatorGraph::Implementation &graph)
-{
-    const auto sort_result = graph.graph.topological_sort();
-    const auto &ordered    = sort_result.second;
-
-    std::vector<OperatorContent *> contents;
-    for(const auto &entry : ordered)
-    {
-        OperatorContent *content = graph.operators.at(entry.op).get();
-        contents.push_back(content);
-    }
-    return contents;
-}
-
-/** Translate every operator of an operator graph into the kernel graph, in traverse() order
- *
- * @param[in,out] kernel_graph Kernel graph to populate
- * @param[in]     op_graph     Operator graph to translate
- *
- * @return Status of the first operator whose translation fails, or an empty Status when all succeed
- */
-Status translate(ClKernelGraph &kernel_graph, const OperatorGraph::Implementation &op_graph)
-{
-    const auto ordered_ops = traverse(op_graph);
-    for(const auto &content : ordered_ops)
-    {
-        const auto status = content->translate(kernel_graph);
-        ARM_COMPUTE_RETURN_ON_ERROR(status);
-    }
-    return Status{};
-}
-
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
diff --git a/src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.h b/src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.h
deleted file mode 100644
index b303cdb9fc..0000000000
--- a/src/core/experimental/dynamic_fusion/WorkloadImpl/OperatorGraphImpl.h
+++ /dev/null
@@ -1,252 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
-#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_OPERATORGRAPHIMPL
-#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_OPERATORGRAPHIMPL
-
-#include "arm_compute/core/experimental/ClWorkload.h"
-#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ITensorDescPack.h"
-
-#include "support/Cast.h"
-#include "support/DeepCopy.h"
-
-#include <map>
-#include <tuple>
-#include <type_traits>
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-/** Complexity class reported by each operator content through complexity() */
-enum class OperatorComplexity
-{
-    Complex = 0, /**< Reported by Conv2dContent */
-    Simple  = 1  /**< Reported by ElementwiseContent and FloorContent */
-};
-
-struct ClKernelGraph;
-/** Content attached to a tensor of the operator graph: its graph id plus a raw tensor info pointer
- *
- * The pointer is stored and handed back as-is; this struct never frees it.
- */
-struct OpTensorContent
-{
-    using Id = DependencyGraph::Id;
-
-    OpTensorContent() = default;
-    /** Construct from an id only; the tensor info pointer is left null */
-    OpTensorContent(Id id)
-        : id{ id }, desc{ nullptr }
-    {
-    }
-    /** Construct from an id and a tensor info pointer */
-    OpTensorContent(Id id, ITensorInfo *desc)
-        : id{ id }, desc{ desc }
-    {
-    }
-    OpTensorContent(const OpTensorContent &) = default;
-    OpTensorContent(OpTensorContent &&)      = default;
-    OpTensorContent &operator=(const OpTensorContent &) = default;
-    OpTensorContent &operator=(OpTensorContent &&) = default;
-    ~OpTensorContent()                             = default;
-
-    /** Equality compares the tensor info pointers only; ids do not take part */
-    bool operator==(const OpTensorContent &other) const
-    {
-        return other.desc == desc;
-    }
-
-    /** Read-only access to the tensor info pointer */
-    const ITensorInfo *get_tensor_info() const
-    {
-        return desc;
-    }
-    /** Mutable access to the tensor info pointer */
-    ITensorInfo *get_tensor_info()
-    {
-        return desc;
-    }
-
-    Id           id{};   /**< Id of the tensor in the operator graph */
-    ITensorInfo *desc{}; /**< Tensor info; may be null */
-};
-
-/** Abstract base for the content attached to an operator of the operator graph
- *
- * Holds a pointer to the graph implementation it was created for, its own id and the pack of tensors it operates on.
- */
-struct OperatorContent
-{
-    using Id = DependencyGraph::Id;
-
-    OperatorContent() = default;
-    OperatorContent(const OperatorGraph::Implementation *graph, Id id, const ITensorDescPack<OpTensorContent> &tensors)
-        : _graph{ graph }, _id{ id }, _tensors{ tensors }
-    {
-    }
-    OperatorContent(const OperatorContent &op) = default;
-    OperatorContent(OperatorContent &&op)      = default;
-    OperatorContent &operator=(const OperatorContent &op) = default;
-    OperatorContent &operator=(OperatorContent &&op) = default;
-    virtual ~OperatorContent()                       = default;
-
-    /** Complexity class of the operator */
-    virtual OperatorComplexity complexity() const = 0;
-    /** Compare with another content; the overrides downcast @p other to their own type */
-    virtual bool operator==(const OperatorContent &other) const = 0;
-    /** Translate this operator into kernel(s) added to @p kernel_graph */
-    virtual Status translate(ClKernelGraph &kernel_graph) const = 0;
-
-protected:
-    const OperatorGraph::Implementation *_graph{};   /**< Graph implementation this content was created for */
-    Id                                   _id{};      /**< Id of the operator */
-    ITensorDescPack<OpTensorContent>     _tensors{}; /**< Tensors the operator reads and writes */
-};
-
-/** Operator content of a Conv2d operator, with an optional forced convolution method */
-struct Conv2dContent : public OperatorContent
-{
-    Conv2dContent() = default;
-    /** Construct without forcing a convolution method */
-    Conv2dContent(const OperatorGraph::Implementation *graph, Id id, const Conv2dDescriptor &desc, const ITensorDescPack<OpTensorContent> &tensors)
-        : OperatorContent(graph, id, tensors), desc(desc), forced_method(), forced_method_enabled(false)
-    {
-    }
-    // Temporary. Do not need to pass ConvolutionMethod
-    /** Construct and force the convolution method to @p method */
-    Conv2dContent(const OperatorGraph::Implementation *graph, Id id, const Conv2dDescriptor &desc, const ITensorDescPack<OpTensorContent> &tensors, ConvolutionMethod method)
-        : OperatorContent(graph, id, tensors), desc(desc), forced_method(method), forced_method_enabled(true)
-    {
-    }
-    Conv2dContent(const Conv2dContent &) = default;
-    Conv2dContent(Conv2dContent &&)      = default;
-    Conv2dContent &operator=(const Conv2dContent &) = default;
-    Conv2dContent &operator=(Conv2dContent &&) = default;
-    ~Conv2dContent()                           = default;
-
-    bool operator==(const OperatorContent &other) const override;
-    OperatorComplexity complexity() const override
-    {
-        return OperatorComplexity::Complex;
-    }
-    /** Force the convolution method used by translate() */
-    void set_method(ConvolutionMethod method)
-    {
-        forced_method         = method;
-        forced_method_enabled = true;
-    }
-
-    Status translate(ClKernelGraph &kernel_graph) const override;
-    /** Replicate heuristics of @ref ClConv2d::get_convolution_method(), except that non-supported data types and data layouts are removed from the heuristics
-     *
-     * @param src         Source tensor info
-     * @param weights     Weights tensor info
-     * @param dst         Destination tensor info
-     * @param conv2d_desc Conv2d descriptor (padding, stride, dilation)
-     * @param gpu_target  GPU target the method is selected for
-     * @return ConvolutionMethod
-     */
-    static ConvolutionMethod select_conv_method(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const Conv2dDescriptor &conv2d_desc, const GPUTarget gpu_target);
-
-    Conv2dDescriptor  desc{};                                        /**< Conv2d descriptor */
-    ConvolutionMethod forced_method{ ConvolutionMethod::GEMM_CONV2D }; /**< Method used when forced_method_enabled is true */
-    bool              forced_method_enabled{ false };                /**< Whether forced_method overrides the heuristic */
-
-private:
-    Status translate_direct_conv2d(ClKernelGraph &kernel_graph) const;
-};
-
-/** Operator content of an elementwise operator */
-class ElementwiseContent : public OperatorContent
-{
-public:
-    ElementwiseContent() = default;
-    ElementwiseContent(const OperatorGraph::Implementation *graph, Id id, const ElementwiseDescriptor &desc, const ITensorDescPack<OpTensorContent> &tensors)
-        : OperatorContent(graph, id, tensors), desc(desc)
-    {
-    }
-    ElementwiseContent(const ElementwiseContent &) = default;
-    ElementwiseContent(ElementwiseContent &&)      = default;
-    ElementwiseContent &operator=(const ElementwiseContent &) = default;
-    ElementwiseContent &operator=(ElementwiseContent &&) = default;
-    ~ElementwiseContent()                                = default;
-
-    bool operator==(const OperatorContent &other) const override;
-    OperatorComplexity complexity() const override
-    {
-        return OperatorComplexity::Simple;
-    }
-    Status translate(ClKernelGraph &kernel_graph) const override;
-
-private:
-    ElementwiseDescriptor desc{}; /**< Elementwise descriptor */
-};
-
-/** Operator content of a floor operator */
-class FloorContent : public OperatorContent
-{
-public:
-    FloorContent() = default;
-    FloorContent(const OperatorGraph::Implementation *graph, Id id, const FloorDescriptor &desc, const ITensorDescPack<OpTensorContent> &tensors)
-        : OperatorContent(graph, id, tensors), desc(desc)
-    {
-    }
-    FloorContent(const FloorContent &) = default;
-    FloorContent(FloorContent &&)      = default;
-    FloorContent &operator=(const FloorContent &) = default;
-    FloorContent &operator=(FloorContent &&) = default;
-    ~FloorContent()                          = default;
-
-    bool operator==(const OperatorContent &other) const override;
-    OperatorComplexity complexity() const override
-    {
-        return OperatorComplexity::Simple;
-    }
-    Status translate(ClKernelGraph &kernel_graph) const override;
-
-private:
-    FloorDescriptor desc{}; /**< Floor descriptor */
-};
-
-/** Implementation of OperatorGraph: the dependency graph plus the contents attached to its operators and tensors */
-struct OperatorGraph::Implementation
-{
-    using Dependency  = DependencyGraph;
-    using OperatorMap = std::map<Operator::Id, utils::memory::deep_unique_ptr<OperatorContent>>;
-    using OpTensorMap = std::map<OpTensor::Id, utils::memory::deep_unique_ptr<OpTensorContent>>;
-
-    Implementation()  = default;
-    ~Implementation() = default;
-
-    /** Create a content of type ContentT for operator @p id and store it, replacing any content already stored under that id
-     *
-     * The content is constructed with this implementation's address, @p id and the forwarded @p args.
-     */
-    template <typename ContentT, typename... Args>
-    void add_node(Operator::Id id, Args &&... args)
-    {
-        operators[id] = utils::memory::make_deep_unique<OperatorContent, ContentT>(this, id, std::forward<Args>(args)...);
-    }
-
-    /** Create the content for tensor @p id from @p id and the forwarded @p args, replacing any content already stored under that id */
-    template <typename... Args>
-    void add_tensor(OpTensor::Id id, Args &&... args)
-    {
-        tensors[id] = utils::memory::make_deep_unique<OpTensorContent, OpTensorContent>(id, std::forward<Args>(args)...);
-    }
-
-    /** Implementations are equal when their graph, operators and tensors all compare equal; status does not take part */
-    friend bool operator==(const OperatorGraph::Implementation &graph0, const OperatorGraph::Implementation &graph1)
-    {
-        return graph0.graph == graph1.graph && graph0.operators == graph1.operators && graph0.tensors == graph1.tensors;
-    }
-
-    Dependency  graph{};     /**< Dependency graph between operators and tensors */
-    OperatorMap operators{}; /**< Operator id -> operator content */
-    OpTensorMap tensors{};   /**< Tensor id -> tensor content */
-    Status      status{};    /**< Status stored with the graph */
-};
-
-/** List the operator contents of @p graph in the order given by its dependency graph's topological sort (const view)
- *
- * @param[in] graph Operator graph implementation to traverse
- *
- * @return Pointers to the operator contents; they point into @p graph
- */
-std::vector<const OperatorContent *> traverse(const OperatorGraph::Implementation &graph);
-
-/** List the operator contents of @p graph in the order given by its dependency graph's topological sort (mutable view)
- *
- * @param[in] graph Operator graph implementation to traverse
- *
- * @return Pointers to the operator contents; they point into @p graph
- */
-std::vector<OperatorContent *> traverse(OperatorGraph::Implementation &graph);
-
-/** Translate every operator of @p op_graph into @p kernel_graph, in traverse() order, stopping at the first error
- *
- * @param[in,out] kernel_graph Kernel graph to populate
- * @param[in]     op_graph     Operator graph to translate
- *
- * @return Status of the first failing operator translation, or an empty Status when all succeed
- */
-Status translate(ClKernelGraph &kernel_graph, const OperatorGraph::Implementation &op_graph);
-
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-
-#endif //ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_OPERATORGRAPHIMPL
-#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file
diff --git a/src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.cpp b/src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.cpp
deleted file mode 100644
index 30e19d5907..0000000000
--- a/src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.cpp
+++ /dev/null
@@ -1,200 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
-
-#include "src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "src/core/CL/CLUtils.h"
-#include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h"
-#include "src/gpu/cl/ClKernelLibrary.h"
-
-#include "support/Cast.h"
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-using namespace arm_compute::opencl;
-
-/** Build the OpenCL kernel from generated kernel code and store the execution window, tuning id and argument list
- *
- * @param[in] compile_ctx Compile context used to create the kernel
- * @param[in] cl_code     Generated kernel code: name, source, build options, window, config id and arguments
- */
-void ClCompositeKernel::configure(const ClCompileContext &compile_ctx, const ClKernelCode &cl_code)
-{
-    opencl::ClKernelLibrary &kernel_library = opencl::ClKernelLibrary::get();
-
-    // Create kernel from kernel source string
-    _kernel = static_cast<cl::Kernel>(compile_ctx.create_kernel(
-                                          cl_code.name,
-                                          "" /* Program name: Used as part of a unique string for the built kernel cache. Not needed */,
-                                          cl_code.code,
-                                          kernel_library.kernel_path() /* Kernel path: Used in cases of embedded kernels */,
-                                          cl_code.build_options.options(),
-                                          false /* Is source binary */));
-
-    // Execution window comes straight from the generated code
-    IClKernel::configure_internal(cl_code.window);
-
-    // Config id is used for lws tuning
-    _config_id = cl_code.config_id;
-
-    // Argument descriptors are consumed later by run_composite_op()
-    _arguments = cl_code.arguments;
-}
-
-// Append the kernel argument(s) describing one tensor, in the form selected by arg.tensor_arg_type.
-//   idx       - next kernel argument index; incremented for every argument set directly in this function
-//               (presumably also advanced by the add_*_tensor_argument helpers - confirm)
-//   arg       - argument descriptor; only tensor_arg_type is read here
-//   tensor    - tensor whose buffer, shape and strides are passed to the kernel
-//   arg_slice - window slice forwarded to the add_*D_tensor_argument helpers
-//   cl_images - receives every cl::Image2D created here; the caller keeps them until the kernel is enqueued
-inline void ClCompositeKernel::add_tensor_argument(unsigned int &idx, const ClKernelArgDescriptor &arg, const ICLTensor *tensor, const Window &arg_slice, std::vector<cl::Image2D> &cl_images)
-{
- switch(arg.tensor_arg_type)
- {
- case ClKernelTensorArgType::Scalar:
- {
- ARM_COMPUTE_ERROR("Unsupported yet");
- break;
- }
-
- case ClKernelTensorArgType::Vector:
- {
- add_1D_tensor_argument(idx, tensor, arg_slice);
- break;
- }
-
- case ClKernelTensorArgType::Image:
- {
- add_2D_tensor_argument(idx, tensor, arg_slice);
- break;
- }
- case ClKernelTensorArgType::Image_Reinterpret_As_3D:
- {
- // 2D arguments followed by one extra argument: top padding + bottom padding
- add_2D_tensor_argument(idx, tensor, arg_slice);
- const unsigned int total_cross_plane_pad = tensor->info()->padding().top + tensor->info()->padding().bottom;
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(total_cross_plane_pad));
- break;
- }
- case ClKernelTensorArgType::Image_Export_To_ClImage2D:
- {
- // Image shape: width = dim0 / 4, height = dim1 * dim2 * dim3; row pitch = byte stride of dimension 1
- // NOTE(review): dim0 / 4 is an integer division - presumably dim0 is expected to be a multiple of 4; confirm
- const TensorShape shape2d(tensor->info()->dimension(0) / 4, tensor->info()->dimension(1) * tensor->info()->dimension(2) * tensor->info()->dimension(3));
- const size_t image_row_pitch = tensor->info()->strides_in_bytes()[1];
- cl::Image2D tensor_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), shape2d, tensor->info()->data_type(), image_row_pitch);
- cl_images.push_back(tensor_image2d);
- _kernel.setArg(idx++, tensor_image2d);
- break;
- }
-
- case ClKernelTensorArgType::Image_3D:
- {
- // 2D arguments followed by the byte stride of dimension 2
- add_2D_tensor_argument(idx, tensor, arg_slice);
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(tensor->info()->strides_in_bytes()[2]));
- break;
- }
- case ClKernelTensorArgType::Image_3D_Export_To_ClImage2D:
- {
- // Same image as Image_Export_To_ClImage2D, followed by the byte stride of dimension 2
- const TensorShape shape2d(tensor->info()->dimension(0) / 4, tensor->info()->dimension(1) * tensor->info()->dimension(2) * tensor->info()->dimension(3));
- const size_t image_row_pitch = tensor->info()->strides_in_bytes()[1];
- cl::Image2D tensor_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), shape2d, tensor->info()->data_type(), image_row_pitch);
- cl_images.push_back(tensor_image2d);
- _kernel.setArg(idx++, tensor_image2d);
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(tensor->info()->strides_in_bytes()[2]));
- break;
- }
-
- case ClKernelTensorArgType::Tensor_3D:
- {
- add_3D_tensor_argument(idx, tensor, arg_slice);
- break;
- }
-
- case ClKernelTensorArgType::Tensor_4D:
- {
- add_4D_tensor_argument(idx, tensor, arg_slice);
- break;
- }
- case ClKernelTensorArgType::Tensor_4D_t_Buffer:
- {
- add_4d_tensor_nhwc_argument(idx, tensor);
- break;
- }
- case ClKernelTensorArgType::Tensor_4D_t_Image:
- {
- // The image argument is set first, then the 4D NHWC arguments of the same tensor
- const size_t image_w = tensor->info()->dimension(0) / 4;
- const size_t image_h = tensor->info()->tensor_shape().total_size_upper(1);
- const size_t image_stride_y = tensor->info()->strides_in_bytes()[1];
-
- cl::Image2D tensor_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(),
- TensorShape(image_w, image_h), tensor->info()->data_type(), image_stride_y);
- cl_images.push_back(tensor_image2d);
-
- _kernel.setArg(idx++, tensor_image2d);
- add_4d_tensor_nhwc_argument(idx, tensor);
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Unsupported");
- }
- }
-}
-
-/** Set the kernel arguments and enqueue the composite kernel for every 3D slice of the execution window
- *
- * @param[in] tensors   Tensor pack; looked up with each argument's arg_id
- * @param[in] window    Execution window; must be a valid sub-window of the configured window
- * @param[in] queue     Command queue the kernel is enqueued on
- * @param[in] exec_desc Execution descriptor; when skip_sliding_window is set only the first slice is run
- */
-void ClCompositeKernel::run_composite_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue, const ClExecutionDescriptor &exec_desc)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    Window slice = window.first_slice_window_3D();
-    // Don't slice matrix along the z dimension if matrix has just 2 dimensions and matrix A more than 2
-    // This scenario can happen when the matrix multiplication is used to perform a convolution operation
-    Window slice_fixed_z = slice;
-    slice_fixed_z.set(Window::DimX, Window::Dimension(0, 1, 1));
-    slice_fixed_z.set(Window::DimY, Window::Dimension(0, 1, 1));
-
-    do
-    {
-        // Argument indices restart at 0 for every slice; carrying the index over from the previous slice
-        // would set the arguments of later slices at the wrong positions
-        unsigned int idx = 0;
-        // CLImages created from tensor arguments. Need to be retained until enqueue
-        std::vector<cl::Image2D> cl_images;
-        for(const auto &id_arg : _arguments)
-        {
-            const auto &arg    = id_arg.second;
-            auto        tensor = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(arg.arg_id));
-            ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
-            ARM_COMPUTE_ERROR_ON_NULLPTR(tensor->info());
-            if(!arg.slide_along_dimz)
-            {
-                // The stride_z for matrix must be zero if we do not slice
-                ARM_COMPUTE_ERROR_ON(tensor->info()->strides_in_bytes()[3] != 0);
-            }
-            // Choose the slice per argument, so one non-sliding argument does not pin the fixed-z slice
-            // onto the arguments that follow it
-            const Window &arg_slice = arg.slide_along_dimz ? slice : slice_fixed_z;
-            add_tensor_argument(idx, arg, tensor, arg_slice, cl_images);
-        }
-
-        // Dispatch kernel
-        bool use_dummy_work_items = false;
-        enqueue(queue, *this, slice, lws_hint(), use_dummy_work_items);
-    }
-    while(!exec_desc.skip_sliding_window && window.slide_window_slice_3D(slice));
-}
-
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file
diff --git a/src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.h b/src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.h
deleted file mode 100644
index 52b92be568..0000000000
--- a/src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
-
-#ifndef ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLCOMPOSITEKERNEL_H
-#define ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLCOMPOSITEKERNEL_H
-
-#include "arm_compute/core/experimental/ClWorkload.h"
-#include "src/gpu/cl/ClCompileContext.h"
-#include "src/gpu/cl/IClKernel.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-struct ClExecutionDescriptor;
-struct ClKernelCode;
-
-class ClCompositeKernel final : public opencl::IClKernel
-{
-public:
- void configure(const opencl::ClCompileContext &, const ClKernelCode &);
-
- /** Run the composite kernel
- * @note The slots / keys in ITensorPack are the argument Ids of the tensors in blueprint
- *
- * @param tensors ITensorPack object containing run-time tensor memories
- * @param window Execution window
- * @param queue OpenCL Command queue
- * @param exec_desc Descriptor containing execution information
- */
- virtual void run_composite_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue, const ClExecutionDescriptor &exec_desc) override;
-
-private:
- /** Set a kernel tensor argument
- *
- * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
- * @param[in] arg Kernel argument descriptor accompanying @p tensor
- * @param[in] tensor Tensor to set as an argument of the object's kernel.
- * @param[in] arg_slice Window the kernel will be run on.
- * @param[out] cl_images Extra cl images created from the tensor (will need to be retained until the kernel is enqueued)
- */
- inline void add_tensor_argument(unsigned int &idx, const ClKernelArgDescriptor &arg, const ICLTensor *tensor, const Window &arg_slice, std::vector<cl::Image2D> &cl_images);
-
-private:
- ClKernelArgList _arguments{}; /** All kernel arguments required by runtime */
-};
-
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-#endif // ARM_COMPUTE_EXPERIMENTAL_DYNAMICFUSION_CLCOMPOSITEKERNEL_H
-#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file
diff --git a/src/gpu/cl/operators/experimental/dynamic_fusion/ClCompositeOperator.cpp b/src/gpu/cl/operators/experimental/dynamic_fusion/ClCompositeOperator.cpp
deleted file mode 100644
index a53a73e4ec..0000000000
--- a/src/gpu/cl/operators/experimental/dynamic_fusion/ClCompositeOperator.cpp
+++ /dev/null
@@ -1,241 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
-#include "arm_compute/runtime/experimental/ClCompositeOperator.h"
-
-#include "arm_compute/core/experimental/ClWorkload.h"
-#include "arm_compute/core/experimental/Types.h"
-#include "src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.h"
-#include "support/Cast.h"
-
-namespace arm_compute
-{
-namespace experimental
-{
-namespace dynamic_fusion
-{
-namespace
-{
-Status add_tensor_to_tensor_pack(int wk_tensor_id, ICLTensor *tensor, const ClWorkload &workload, TensorPackMap &prepare_pack_map, TensorPackMap &run_pack_map)
-{
- if(tensor == nullptr)
- {
- return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Trying to add a nullptr into the tensor packs");
- }
- const auto bp_tensor_id = workload.tensors.at(wk_tensor_id).kernel_arg.arg_id; // blueprint tensor id
- std::vector<ClWorkload::UnitWorkId> uwk_ids{};
- const auto src_uwk_ids = workload.graph.src_ops_from_tensor(wk_tensor_id);
- const auto dst_uwk_ids = workload.graph.dst_ops_from_tensor(wk_tensor_id);
- uwk_ids.insert(uwk_ids.end(), src_uwk_ids.begin(), src_uwk_ids.end());
- uwk_ids.insert(uwk_ids.end(), dst_uwk_ids.begin(), dst_uwk_ids.end());
-
- for(auto uwk_id : uwk_ids)
- {
- TensorPackMap *pack_map = nullptr;
- const auto uwk_stage = workload.unit_workloads.at(uwk_id).stage.stage;
- switch(uwk_stage)
- {
- case UnitWorkloadStage::Stage::Run:
- pack_map = &run_pack_map;
- break;
- case UnitWorkloadStage::Stage::Prepare:
- pack_map = &prepare_pack_map;
- break;
- default:
- return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported workload stage");
- }
-
- ITensorPack *tensor_pack = pack_map->find_tensor_pack(uwk_id);
- if(tensor_pack == nullptr)
- {
- pack_map->add_tensor_pack(uwk_id, ITensorPack{ { bp_tensor_id, tensor } });
- }
- else
- {
- tensor_pack->add_tensor(bp_tensor_id, tensor);
- }
- }
- return Status{};
-}
-
-} // namespace
-
-ITensorPack *TensorPackMap::find_tensor_pack(UnitWorkload::Id uwk_id)
-{
- auto tensor_pack = _tensor_packs.find(uwk_id);
- if(tensor_pack != _tensor_packs.end())
- {
- return &(tensor_pack->second);
- }
- return nullptr;
-}
-
-ITensorPack &TensorPackMap::get_tensor_pack(UnitWorkload::Id uwk_id)
-{
- return _tensor_packs.at(uwk_id);
-}
-
-void TensorPackMap::add_tensor_pack(UnitWorkload::Id uwk_id, const ITensorPack &tensor_pack)
-{
- _tensor_packs[uwk_id] = tensor_pack;
-}
-
-Status bind_tensors(ClAuxTensorData &aux_tensor_data, TensorPackMap &prepare_pack_map, TensorPackMap &run_pack_map, const ClWorkload &workload, const OpTensorBinding &op_tensors)
-{
- for(auto tensor : workload.tensors)
- {
- const auto wk_tensor_id = tensor.first; // workload tensor id
- ICLTensor *tensor_object = nullptr;
- if(tensor.second.memory_type == MemoryType::Core)
- {
- const auto op_tensor_id = workload.op_tensor_id_lut.at(wk_tensor_id);
- auto op_tensor_find = op_tensors.find(op_tensor_id);
- if(op_tensor_find == op_tensors.end())
- {
- return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Cannot find binding for some operator tensor");
- }
- tensor_object = utils::cast::polymorphic_downcast<ICLTensor *>(op_tensor_find->second);
- }
- else if(tensor.second.memory_type == MemoryType::Auxiliary)
- {
- // Create aux tensor CLTensor object
- const TensorInfo tensor_info = *tensor.second.info;
- const auto memory_info = tensor.second.memory_info;
- tensor_object = aux_tensor_data.add_aux_tensor(wk_tensor_id, tensor_info, memory_info);
- }
- else
- {
- return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported tensor memory type");
- }
-
- const auto st = add_tensor_to_tensor_pack(wk_tensor_id, tensor_object, workload, prepare_pack_map, run_pack_map);
- ARM_COMPUTE_RETURN_ON_ERROR(st);
- }
- return Status{};
-}
-
-CLTensor *ClAuxTensorData::add_aux_tensor(int tensor_id, const ITensorInfo &tensor_info, const AuxMemoryInfo &memory_info)
-{
- auto find_tensor_pair = _owned_tensors.find(tensor_id);
- if(find_tensor_pair == _owned_tensors.end())
- {
- return find_tensor_pair->second.get();
- }
- else
- {
- auto tensor = std::make_unique<CLTensor>();
- auto inserted_pair = _owned_tensors.emplace(tensor_id, std::move(tensor)).first;
- auto new_tensor = inserted_pair->second.get();
- _tensors.emplace_back(new_tensor, tensor_info, memory_info);
- return new_tensor;
- }
-}
-
-std::vector<ClAuxTensorData::DataView> &ClAuxTensorData::get_tensors()
-{
- return _tensors;
-}
-struct ClCompositeOperator::Implementation
-{
- std::map<UnitWorkload::Id, std::unique_ptr<ClCompositeKernel>> _kernels{};
- std::map<UnitWorkload::Id, std::unique_ptr<ClCompositeKernel>> _kernels_prep{};
- ClWorkload _workload{};
- bool _is_prepared{ false };
-};
-
-ClCompositeOperator::ClCompositeOperator()
- : _impl{ std::make_unique<Implementation>() }
-{
-}
-
-ClCompositeOperator::~ClCompositeOperator() = default;
-
-void ClCompositeOperator::configure(const CLCompileContext &ctx, const ClWorkload &workload)
-{
- ARM_COMPUTE_ERROR_THROW_ON(ClCompositeOperator::validate(workload));
- _impl->_workload = workload;
-
- // Traverse workloads in topological order
- const auto sorted = workload.graph.topological_sort().second;
- for(const auto &node : sorted)
- {
- auto work = workload.unit_workloads.at(node.op);
- auto stage = work.stage.stage;
- auto k = std::make_unique<ClCompositeKernel>();
- k->configure(ctx, work.code);
-
- switch(stage)
- {
- case UnitWorkloadStage::Stage::Run:
- _impl->_kernels.emplace(work.id, std::move(k));
- break;
- case UnitWorkloadStage::Stage::Prepare:
- _impl->_kernels_prep.emplace(work.id, std::move(k));
- break;
- default:
- ARM_COMPUTE_ERROR("Invalid stage");
- }
- break;
- }
-}
-
-Status ClCompositeOperator::validate(const ClWorkload &workload)
-{
- return workload.status;
-}
-
-void ClCompositeOperator::prepare(TensorPackMap &tensor_pack_map)
-{
- if(!_impl->_is_prepared)
- {
- for(auto &id_kernel_pair : _impl->_kernels_prep)
- {
- const bool flush_queue = false;
- const auto uwk_id = id_kernel_pair.first;
- auto kernel = id_kernel_pair.second.get();
- CLScheduler::get().enqueue_op(*kernel, tensor_pack_map.get_tensor_pack(uwk_id), ClExecutionDescriptor{}, flush_queue);
- }
-
- _impl->_is_prepared = true;
- }
-}
-
-void ClCompositeOperator::run(TensorPackMap &tensor_pack_map)
-{
- ARM_COMPUTE_ERROR_ON_MSG(!_impl->_is_prepared, "Operator is not prepared");
-
- for(auto &id_kernel_pair : _impl->_kernels)
- {
- // Flush the command queue on the last kernel
- const bool flush_queue = false;
- const auto uwk_id = id_kernel_pair.first;
- auto kernel = id_kernel_pair.second.get();
- CLScheduler::get().enqueue_op(*kernel, tensor_pack_map.get_tensor_pack(uwk_id), ClExecutionDescriptor{}, flush_queue);
- }
-}
-
-} // namespace dynamic_fusion
-} // namespace experimental
-} // namespace arm_compute
-#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file
diff --git a/src/runtime/CL/CLScheduler.cpp b/src/runtime/CL/CLScheduler.cpp
index 8d30c05361..49fb724cdb 100644
--- a/src/runtime/CL/CLScheduler.cpp
+++ b/src/runtime/CL/CLScheduler.cpp
@@ -27,10 +27,6 @@
#include "arm_compute/runtime/CL/CLTuner.h"
#include "src/core/CL/ICLKernel.h"
-#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
-#include "src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.h"
-#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
-
namespace arm_compute
{
cl::Context &CLScheduler::context()
@@ -190,34 +186,6 @@ void CLScheduler::enqueue_common(ICLKernel &kernel, ITensorPack &tensors, bool f
flush_queue(flush);
}
-#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
-
-void CLScheduler::enqueue_common(ICLKernel &kernel, ITensorPack &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc, bool flush)
-{
- ARM_COMPUTE_ERROR_ON_MSG(!_is_initialised,
- "The CLScheduler is not initialised yet! Please call the CLScheduler::get().default_init(), \
- or CLScheduler::get()::init() and CLKernelLibrary::get()::init() function before running functions!");
-
- // ClCompositeKernel is stateless thus alway requires memory injection
-
- // Tune the kernel if the CLTuner has been provided
- if(_cl_tuner != nullptr)
- {
- _cl_tuner->tune_kernel_dynamic(kernel, tensors, exec_desc);
- }
-
- // Run kernel
- kernel.run_composite_op(tensors, kernel.window(), _queue, exec_desc);
- if(_job_chaining_enabled)
- {
- ++_job_chaining_count;
- }
-
- flush_queue(flush);
-}
-
-#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
-
void CLScheduler::flush_queue(bool flush)
{
if(_job_chaining_enabled)
@@ -245,15 +213,6 @@ void CLScheduler::enqueue_op(ICLKernel &kernel, ITensorPack &tensors, bool flush
enqueue_common(kernel, tensors, flush);
}
-#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
-
-void CLScheduler::enqueue_op(ICLKernel &kernel, ITensorPack &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc, bool flush)
-{
- enqueue_common(kernel, tensors, exec_desc, flush);
-}
-
-#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
-
void CLScheduler::enable_job_chaining(int job_chaining_size)
{
_job_chaining_enabled = true;
diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp
index 8ce5177847..1cc20f0c1e 100644
--- a/src/runtime/CL/CLTuner.cpp
+++ b/src/runtime/CL/CLTuner.cpp
@@ -28,9 +28,6 @@
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "src/core/CL/ICLKernel.h"
#include "support/StringSupport.h"
-#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
-#include "src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.h"
-#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
#include <cerrno>
#include <fstream>
@@ -65,26 +62,6 @@ private:
ITensorPack &_tensors;
};
-#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
-struct CompositeKernelData : public CLTuner::IKernelData
-{
- CompositeKernelData(ITensorPack &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc)
- : _tensors{ tensors }, _exec_desc{ exec_desc }
- {
- }
- ~CompositeKernelData() override = default;
- void do_run(ICLKernel &kernel, cl::CommandQueue &queue) override
- {
- // ClCompositeKernel is purely stateless, and thus always requires memory injection
- kernel.run_composite_op(_tensors, kernel.window(), queue, _exec_desc);
- }
-
-private:
- ITensorPack &_tensors;
- const experimental::dynamic_fusion::ClExecutionDescriptor &_exec_desc;
-};
-#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
-
bool CLTuner::kernel_event_is_set() const
{
return _kernel_event() != nullptr;
@@ -165,15 +142,6 @@ void CLTuner::tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors)
do_tune_kernel_dynamic(kernel, &data);
}
-#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
-void CLTuner::tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors, const experimental::dynamic_fusion::ClExecutionDescriptor &exec_desc)
-{
- CompositeKernelData data{ tensors, exec_desc };
-
- do_tune_kernel_dynamic(kernel, &data);
-}
-#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
-
void CLTuner::add_tuning_params(const std::string &kernel_id, CLTuningParams optimal_tuning_params)
{
_tuning_params_table.emplace(kernel_id, optimal_tuning_params);
diff --git a/tests/SConscript b/tests/SConscript
index 87b654385a..8596cfa042 100644
--- a/tests/SConscript
+++ b/tests/SConscript
@@ -120,7 +120,6 @@ files_validation += Glob('validation/CPP/' + filter_pattern)
if env['opencl']:
if env['experimental_dynamic_fusion']:
- test_env.Append(CPPDEFINES = ['ENABLE_EXPERIMENTAL_DYNAMIC_FUSION'])
files_validation += Glob('validation/dynamic_fusion/gpu/' + filter_pattern)
files_validation += Glob('validation/dynamic_fusion/gpu/cl/' + filter_pattern)
diff --git a/tests/validation/CL/UNIT/dynamic_fusion/ArbitraryElementwiseFusion.cpp b/tests/validation/CL/UNIT/dynamic_fusion/ArbitraryElementwiseFusion.cpp
deleted file mode 100644
index 1b1e8aa761..0000000000
--- a/tests/validation/CL/UNIT/dynamic_fusion/ArbitraryElementwiseFusion.cpp
+++ /dev/null
@@ -1,394 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
-
-#include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h"
-#include "src/core/utils/helpers/float_ops.h"
-#include "tests/CL/CLAccessor.h"
-#include "tests/framework/Macros.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/reference/ConvolutionLayer.h"
-#include "tests/validation/reference/ElementwiseOperations.h"
-#include "tests/validation/reference/Permute.h"
-
-#include "arm_compute/runtime/experimental/ClCompositeOperator.h"
-#include "tests/validation/reference/Floor.h"
-
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "tests/validation/CL/UNIT/dynamic_fusion/Utils.h"
-
-using namespace arm_compute::experimental::dynamic_fusion;
-using namespace arm_compute::test::validation::utils;
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-TEST_SUITE(CL)
-TEST_SUITE(UNIT)
-TEST_SUITE(DYNAMIC_FUSION)
-TEST_SUITE(ArbitraryFusion)
-
-TEST_CASE(ElementwiseBroadcasting, framework::DatasetMode::ALL)
-{
- // Test elementwise broadcasting
- const auto data_type = DataType::F32;
- const auto data_layout = DataLayout::NHWC;
-
- const auto input_shape = TensorShape(7, 9, 5);
- const auto rhs_shape = TensorShape(7, 1, 1);
- const auto dst_shape = TensorShape(7, 9, 5);
-
- // Tensor Info
- auto input_info = TensorInfo(input_shape, 1, data_type, data_layout);
- auto addend_info = TensorInfo(rhs_shape, 1, data_type, data_layout);
- auto dst_info = TensorInfo();
-
- ElementwiseDescriptor add_desc{ ArithmeticOperation::ADD };
-
- CLScheduler::get().default_reinit();
- const auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
- OperatorGraph op_graph;
-
- const auto op_input = add_tensor(op_graph, input_info);
- const auto op_addend = add_tensor(op_graph, addend_info);
- const auto op_dst = add_tensor(op_graph, dst_info);
-
- add_op_elementwise_op(op_graph, add_desc, op_input, op_addend, op_dst);
-
- const ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } };
- ClWorkload workload;
- build(workload, op_graph, workload_ctx);
-
- ClCompositeOperator op;
- op.configure(cl_compile_ctx, workload);
-
- // Construct tensors
- CLTensor t_input{};
- CLTensor t_addend{};
- CLTensor t_dst{};
-
- // Init tensors
- t_input.allocator()->init(input_info);
- t_addend.allocator()->init(addend_info);
- t_dst.allocator()->init(dst_info);
-
- // Allocate and fill tensors
- t_input.allocator()->allocate();
- t_addend.allocator()->allocate();
- t_dst.allocator()->allocate();
-
- // Fill
- fill<float>(CLAccessor(t_input), 0, library.get());
- fill<float>(CLAccessor(t_addend), 1, library.get());
-
- // Pack tensors
- OpTensorBinding bp_tensors({ { op_input, &t_input },
- { op_addend, &t_addend },
- { op_dst, &t_dst }
- });
-
- // Populate prepare and run pack-maps (including allocating aux tensors)
- ClAuxTensorData aux_tensor_data{};
- TensorPackMap prepare_pack_map{};
- TensorPackMap run_pack_map{};
- bind_tensors(aux_tensor_data, prepare_pack_map, run_pack_map, workload, bp_tensors);
-
- op.prepare(prepare_pack_map);
- op.run(run_pack_map);
-
- // Create reference
- SimpleTensor<float> ref_input{ input_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC };
- SimpleTensor<float> ref_addend{ rhs_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC };
-
- // Fill reference
- fill<float>(ref_input, 0, library.get());
- fill<float>(ref_addend, 1, library.get());
-
- auto ref_input_nchw = reference::permute(ref_input, PermutationVector(1U, 2U, 0U));
- auto ref_addend_nchw = reference::permute(ref_addend, PermutationVector(1U, 2U, 0U));
-
- auto dst_shape_nchw = dst_shape;
- permute(dst_shape_nchw, PermutationVector(1U, 2U, 0U));
-
- auto ref_t_dst_nchw = reference::arithmetic_operation(
- ArithmeticOperation::ADD,
- ref_input_nchw,
- ref_addend_nchw,
- data_type,
- ConvertPolicy{});
-
- const auto ref_t_dst = reference::permute(ref_t_dst_nchw, PermutationVector(2U, 0U, 1U));
-
- RelativeTolerance<float> tolerance_f32(0.001f);
- validate(CLAccessor(t_dst), ref_t_dst_nchw, tolerance_f32);
-}
-TEST_CASE(DivFloor, framework::DatasetMode::ALL)
-{
- // x = floor(div(input, input2))
- const auto data_type = DataType::F32;
- const auto eltwise_info = ElementwiseDescriptor{ ArithmeticOperation::DIV };
-
- // Tensor Values
- const auto width = 7U;
- const auto height = 6U;
-
- // Shapes
- const auto input1_shape = TensorShape(width, height);
- const auto input2_shape = TensorShape(width, height);
- const auto dst_shape = TensorShape(width, height);
-
- // Create reference
- SimpleTensor<float> ref_src_nhwc{ input1_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC };
- SimpleTensor<float> ref_src2_nhwc{ input2_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC };
-
- // Fill reference
- fill<float>(ref_src_nhwc, 0, library.get());
- fill<float>(ref_src2_nhwc, 1, library.get());
-
- auto ref_src = reference::permute(ref_src_nhwc, PermutationVector(1U, 2U, 0U));
- auto ref_src2 = reference::permute(ref_src2_nhwc, PermutationVector(1U, 2U, 0U));
-
- TensorShape dst_shape_nchw{ dst_shape };
- permute(dst_shape_nchw, PermutationVector(1U, 2U, 0U));
-
- const auto ref_dst_nchw = reference::floor_layer(reference::arithmetic_operation(
- ArithmeticOperation::DIV,
- ref_src,
- ref_src2,
- data_type,
- ConvertPolicy::SATURATE));
-
- const auto ref_t_dst = reference::permute(ref_dst_nchw, PermutationVector(2U, 0U, 1U));
-
- // Tensor Info
- auto input1_info = TensorInfo(input1_shape, 1, data_type, DataLayout::NHWC);
- auto input2_info = TensorInfo(input2_shape, 1, data_type, DataLayout::NHWC);
- auto dst_info = TensorInfo();
- auto acc_info = TensorInfo(); // Intermediate tensor for division
-
- // Initialise Scheduler
- CLScheduler::get().default_reinit();
- const auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
- OperatorGraph op_graph;
-
- // add tensors
- auto op_input1 = add_tensor(op_graph, input1_info);
- auto op_input2 = add_tensor(op_graph, input2_info);
- auto op_acc = add_tensor(op_graph, acc_info);
- auto op_dst = add_tensor(op_graph, dst_info);
-
- add_op_elementwise_op(op_graph, eltwise_info, op_input1, op_input2, op_acc);
- add_op_floor(op_graph, FloorDescriptor(), op_acc, op_dst);
-
- const ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } };
- ClWorkload workload;
- build(workload, op_graph, workload_ctx);
-
- ClCompositeOperator op;
- op.configure(cl_compile_ctx, workload);
-
- // Configure and add tensors.
- CLTensor t_input1{};
- CLTensor t_input2{};
- CLTensor t_dst{};
-
- // Init Tensors
- t_input1.allocator()->init(input1_info);
- t_input2.allocator()->init(input2_info);
- t_dst.allocator()->init(dst_info);
-
- // Allocate and fill tensors
- t_input1.allocator()->allocate();
- t_input2.allocator()->allocate();
- t_dst.allocator()->allocate();
-
- fill<float>(CLAccessor(t_input1), 0, library.get());
- fill<float>(CLAccessor(t_input2), 1, library.get());
-
- // "Pack" tensors
- OpTensorBinding bp_tensors({ { op_input1, &t_input1 },
- { op_input2, &t_input2 },
- { op_dst, &t_dst }
- });
-
- // Populate prepare and run pack-maps (including allocating aux tensors)
- ClAuxTensorData aux_tensor_data{};
- TensorPackMap prepare_pack_map{};
- TensorPackMap run_pack_map{};
- bind_tensors(aux_tensor_data, prepare_pack_map, run_pack_map, workload, bp_tensors);
-
- op.prepare(prepare_pack_map);
- op.run(run_pack_map);
-
- RelativeTolerance<float> tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */
- validate(CLAccessor(t_dst), ref_dst_nchw, tolerance_f32);
-}
-TEST_CASE(Dconv2dAddDiv, framework::DatasetMode::ALL)
-{
- // output = div(divend, add(addend, conv2d1x1(direct_conv)(input, weights, bias)))
- const auto data_type = DataType::F32;
- const auto data_layout = DataLayout::NHWC;
-
- const auto input_shape = TensorShape(384, 12, 12);
- const auto weight_shape = TensorShape(384, 1, 1, 16);
- const auto dst_shape = TensorShape(16, 12, 12);
-
- // Tensor Info
- auto input_info = TensorInfo(input_shape, 1, data_type, data_layout);
- auto weight_info = TensorInfo(weight_shape, 1, data_type, data_layout);
- auto addend_info = TensorInfo(dst_shape, 1, data_type, data_layout);
- auto divend_info = TensorInfo(dst_shape, 1, data_type, data_layout);
- auto acc_info = TensorInfo(); // Intermediate tensor for conv
- auto acc_1_info = TensorInfo();
- auto dst_info = TensorInfo();
-
- Conv2dDescriptor conv2d_desc{};
- ElementwiseDescriptor add_desc{ ArithmeticOperation::ADD };
- ElementwiseDescriptor div_desc{ ArithmeticOperation::DIV };
-
- CLScheduler::get().default_reinit();
- const auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
- OperatorGraph op_graph;
-
- const auto op_input = add_tensor(op_graph, input_info);
- const auto op_weight = add_tensor(op_graph, weight_info);
- const auto op_addend = add_tensor(op_graph, addend_info);
- const auto op_divend = add_tensor(op_graph, divend_info);
- const auto op_acc = add_tensor(op_graph, acc_info); // temp accumulator; TensorInfo to be inferred
- const auto op_acc_1 = add_tensor(op_graph, acc_1_info); // temp accumulator; TensorInfo to be inferred
- const auto op_dst = add_tensor(op_graph, dst_info);
-
- auto conv2d = add_op_conv2d(op_graph, conv2d_desc, op_input, op_weight, op_acc);
- force_conv2d_method(op_graph, conv2d, ConvolutionMethod::DIRECT);
- add_op_elementwise_op(op_graph, add_desc, op_acc, op_addend, op_acc_1);
- add_op_elementwise_op(op_graph, div_desc, op_acc_1, op_divend, op_dst);
-
- const ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } };
- ClWorkload workload;
- build(workload, op_graph, workload_ctx);
-
- ClCompositeOperator op;
- op.configure(cl_compile_ctx, workload);
-
- // Construct tensors
- CLTensor t_input{};
- CLTensor t_weight{};
- CLTensor t_addend{};
- CLTensor t_divend{};
- CLTensor t_dst{};
-
- // Init tensors
- t_input.allocator()->init(input_info);
- t_weight.allocator()->init(weight_info);
- t_divend.allocator()->init(divend_info);
- t_addend.allocator()->init(addend_info);
- t_dst.allocator()->init(dst_info);
-
- // Allocate and fill tensors
- t_input.allocator()->allocate();
- t_weight.allocator()->allocate();
- t_divend.allocator()->allocate();
- t_addend.allocator()->allocate();
- t_dst.allocator()->allocate();
-
- // Fill
- fill<float>(CLAccessor(t_input), 0, library.get());
- fill<float>(CLAccessor(t_weight), 1, library.get());
- fill<float>(CLAccessor(t_addend), 2, library.get());
- fill<float>(CLAccessor(t_divend), 3, library.get());
-
- // Pack tensors
- OpTensorBinding bp_tensors({ { op_input, &t_input },
- { op_weight, &t_weight },
- { op_addend, &t_addend },
- { op_divend, &t_divend },
- { op_dst, &t_dst }
- });
-
- // Populate prepare and run pack-maps (including allocating aux tensors)
- ClAuxTensorData aux_tensor_data{};
- TensorPackMap prepare_pack_map{};
- TensorPackMap run_pack_map{};
- bind_tensors(aux_tensor_data, prepare_pack_map, run_pack_map, workload, bp_tensors);
-
- op.prepare(prepare_pack_map);
- op.run(run_pack_map);
-
- // Create reference
- SimpleTensor<float> ref_input{ input_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC };
- SimpleTensor<float> ref_weight{ weight_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC };
- SimpleTensor<float> ref_bias_placeholder{ dst_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC };
- SimpleTensor<float> ref_addend{ dst_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC };
- SimpleTensor<float> ref_divend{ dst_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC };
-
- // Fill reference
- fill<float>(ref_input, 0, library.get());
- fill<float>(ref_weight, 1, library.get());
- fill<float>(ref_addend, 2, library.get());
- fill<float>(ref_divend, 3, library.get());
-
- auto ref_input_nchw = reference::permute(ref_input, PermutationVector(1U, 2U, 0U));
- auto ref_weight_nchw = reference::permute(ref_weight, PermutationVector(1U, 2U, 0U));
- auto ref_bias_placeholder_nchw = reference::permute(ref_bias_placeholder, PermutationVector(1U, 2U, 0U));
- auto ref_addend_nchw = reference::permute(ref_addend, PermutationVector(1U, 2U, 0U));
- auto ref_divend_nchw = reference::permute(ref_divend, PermutationVector(1U, 2U, 0U));
-
- auto dst_shape_nchw = dst_shape;
- permute(dst_shape_nchw, PermutationVector(1U, 2U, 0U));
-
- PadStrideInfo legacy_pad_stride(conv2d_desc.stride.x(), conv2d_desc.stride.y(), conv2d_desc.pad.left, conv2d_desc.pad.right, conv2d_desc.pad.top, conv2d_desc.pad.bottom, DimensionRoundingType{});
- auto ref_acc_nchw = reference::arithmetic_operation(
- ArithmeticOperation::ADD,
- ref_addend_nchw,
- reference::convolution_layer(ref_input_nchw, ref_weight_nchw, ref_bias_placeholder_nchw, dst_shape_nchw, legacy_pad_stride, conv2d_desc.dilation),
- data_type,
- ConvertPolicy{});
-
- auto ref_t_dst_nchw = reference::arithmetic_operation(
- ArithmeticOperation::DIV,
- ref_acc_nchw,
- ref_divend_nchw,
- data_type,
- ConvertPolicy{});
-
- const auto ref_t_dst = reference::permute(ref_t_dst_nchw, PermutationVector(2U, 0U, 1U));
-
- RelativeTolerance<float> tolerance_f32(0.001f);
- validate(CLAccessor(t_dst), ref_t_dst_nchw, tolerance_f32);
-}
-
-TEST_SUITE_END() // ArbitraryFusion
-TEST_SUITE_END() // DYNAMIC_FUSION
-TEST_SUITE_END() // UNIT
-TEST_SUITE_END() // CL
-
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-
-#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
diff --git a/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp b/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp
deleted file mode 100644
index dc98d72f4b..0000000000
--- a/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp
+++ /dev/null
@@ -1,215 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
-
-#include "src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.h"
-#include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h"
-
-#include "src/core/utils/helpers/float_ops.h"
-#include "src/gpu/cl/kernels/ClElementwiseKernel.h"
-#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h"
-#include "tests/CL/CLAccessor.h"
-#include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
-#include "tests/validation/Validation.h"
-#include "tests/validation/reference/ConvolutionLayer.h"
-#include "tests/validation/reference/ElementwiseOperations.h"
-#include "tests/validation/reference/GEMM.h"
-#include "tests/validation/reference/Permute.h"
-
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include "tests/validation/CL/UNIT/dynamic_fusion/Utils.h"
-
-#include <chrono>
-
-using namespace arm_compute::experimental::dynamic_fusion;
-using namespace arm_compute::test::validation::utils;
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-TEST_SUITE(CL)
-TEST_SUITE(UNIT)
-TEST_SUITE(DYNAMIC_FUSION)
-TEST_SUITE(ClCompositeKernel)
-TEST_SUITE(Validate)
-
-TEST_CASE(MoveNet_SubGraph_1_DirectConv2d, framework::DatasetMode::ALL)
-{
- /* Computation:
- * out = add(addend, direct_conv2d(lhs, rhs, bias)) (non-broadcast)
- */
-
- ClCompositeKernel kernel{};
- ClKernelBlueprint bp{};
- ClKernelCode cl_code{};
- ClExecutionDescriptor exec_desc{};
- Status st{};
-
- const auto data_type = DataType::F32;
- const auto conv_info = Conv2dDescriptor{ Padding2D{ 1U, 1U, 1U, 1U }, { 1U, 1U } /* stride */ };
- const auto eltwise_info = ElementwiseDescriptor{ ArithmeticOperation::ADD };
-
- const auto width = 7U;
- const auto height = 6U;
- const auto IFM = 5U;
- const auto OFM = 4U;
- const auto kernel_sz = 3U;
-
- const auto src_shape = TensorShape(IFM, width, height);
- const auto wei_shape = TensorShape(IFM, kernel_sz, kernel_sz, OFM);
- const auto bia_shape = TensorShape(OFM);
- const auto addend_shape = TensorShape(1, 1);
- const auto dst_shape = TensorShape(OFM, width, height);
-
- auto src_info = TensorInfo(src_shape, 1, data_type, DataLayout::NHWC);
- auto wei_info = TensorInfo(wei_shape, 1, data_type, DataLayout::NHWC);
- auto bia_info = TensorInfo(bia_shape, 1, data_type, DataLayout::NHWC);
- auto addend_info = TensorInfo(addend_shape, 1, data_type, DataLayout::NHWC);
- auto dst_info = TensorInfo(dst_shape, 1, data_type, DataLayout::NHWC);
-
- const auto n0 = std::min(OFM, 4u);
- const auto m0 = (OFM > 16) ? ((data_type == DataType::F32) ? 2U : 4U) : 1U;
-
- const ClDirectConv2dKernelDescriptor direct_conv2d_desc{ conv_info };
- const ClElementwiseKernelDescriptor eltwise_add_desc{ eltwise_info };
- const TileDescriptor store_tile_info{ Size2D(n0, m0), Size2D(width, height), ClippingStrategy::TOP_LEFT };
-
- ArgumentID src_id{ g_arg_placeholder };
- ArgumentID wei_id{ g_arg_placeholder };
- ArgumentID bia_id{ g_arg_placeholder };
- ArgumentID acc_id{ g_arg_placeholder };
- ArgumentID acc_1_id{ g_arg_placeholder };
- ArgumentID addend_id{ g_arg_placeholder };
- ArgumentID dst_id{ g_arg_placeholder };
-
- st = add_tensor(bp, &src_info, src_id);
- st = add_tensor(bp, &wei_info, wei_id);
- st = add_tensor(bp, &bia_info, bia_id);
- st = add_tensor(bp, &dst_info, acc_id);
- st = add_tensor(bp, &dst_info, acc_1_id);
- st = add_tensor(bp, &addend_info, addend_id);
- st = add_tensor(bp, &dst_info, dst_id);
-
- st = add_kcomp_direct_conv2d(bp, direct_conv2d_desc, src_id, wei_id, bia_id, acc_id);
- st = add_kcomp_eltwise_op(bp, eltwise_add_desc, addend_id, acc_id, acc_1_id);
- st = add_kcomp_store(bp, StoreType::TStoreIndirectWidthSelect, acc_1_id, dst_id);
-
- exec_desc.skip_sliding_window = true;
-
- st = set_tile_info(bp, store_tile_info);
- st = build(cl_code, ClCodeBuilderContext{ GpuInfo{ GPUTarget::G71 } }, bp);
- st = tune_static(exec_desc, cl_code);
-
- CLScheduler::get().default_reinit();
- kernel.configure(CLKernelLibrary::get().get_compile_context(), cl_code);
-
- // Construct tensors
- CLTensor src{};
- CLTensor wei{};
- CLTensor bia{};
- CLTensor addend{};
- CLTensor dst{};
-
- // Init tensors
- src.allocator()->init(src_info);
- wei.allocator()->init(wei_info);
- bia.allocator()->init(bia_info);
- addend.allocator()->init(dst_info);
- dst.allocator()->init(dst_info);
-
- // "Pack" tensors
- ITensorPack tensors{ { src_id, &src },
- { wei_id, &wei },
- { bia_id, &bia },
- { addend_id, &addend },
- { dst_id, &dst } };
-
- // Allocate and fill tensors
- src.allocator()->allocate();
- wei.allocator()->allocate();
- bia.allocator()->allocate();
- addend.allocator()->allocate();
- dst.allocator()->allocate();
-
- fill<float>(CLAccessor(src), 0, library.get());
- fill<float>(CLAccessor(wei), 1, library.get());
- fill<float>(CLAccessor(bia), 2, library.get());
- fill<float>(CLAccessor(addend), 3, library.get());
-
- CLScheduler::get().enqueue_op(kernel, tensors, exec_desc, true);
-
- // Create reference
- SimpleTensor<float> ref_src_nhwc{ src_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC };
- SimpleTensor<float> ref_wei_nhwc{ wei_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC };
- SimpleTensor<float> ref_bia_nhwc{ bia_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC };
- SimpleTensor<float> ref_addend_nhwc{ addend_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC };
-
- // Fill reference
- fill<float>(ref_src_nhwc, 0, library.get());
- fill<float>(ref_wei_nhwc, 1, library.get());
- fill<float>(ref_bia_nhwc, 2, library.get());
- fill<float>(ref_addend_nhwc, 3, library.get());
-
- auto ref_src = reference::permute(ref_src_nhwc, PermutationVector(1U, 2U, 0U));
- auto ref_wei = reference::permute(ref_wei_nhwc, PermutationVector(1U, 2U, 0U));
- auto ref_bia = reference::permute(ref_bia_nhwc, PermutationVector(1U, 2U, 0U));
- auto ref_addend = reference::permute(ref_addend_nhwc, PermutationVector(1U, 2U, 0U));
-
- TensorShape dst_shape_nchw{ dst_shape };
- permute(dst_shape_nchw, PermutationVector(1U, 2U, 0U));
-
- const auto ref_dst = reference::arithmetic_operation(
- ArithmeticOperation::ADD,
- ref_addend,
- reference::convolution_layer<float>(ref_src, ref_wei, ref_bia, dst_shape_nchw,
- PadStrideInfo
- {
- static_cast<unsigned int>(conv_info.stride.x()),
- static_cast<unsigned int>(conv_info.stride.y()),
- static_cast<unsigned int>(conv_info.pad.left),
- static_cast<unsigned int>(conv_info.pad.top) }),
- data_type,
- ConvertPolicy::SATURATE);
-
- RelativeTolerance<float> tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */
- validate(CLAccessor(dst), ref_dst, tolerance_f32);
-}
-
-TEST_SUITE_END() // Validate
-TEST_SUITE_END() // ClCompositeKernel
-TEST_SUITE_END() // DYNAMIC_FUSION
-TEST_SUITE_END() // UNIT
-TEST_SUITE_END() // CL
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file
diff --git a/tests/validation/CL/UNIT/dynamic_fusion/DependencyGraph.cpp b/tests/validation/CL/UNIT/dynamic_fusion/DependencyGraph.cpp
deleted file mode 100644
index 1824efff99..0000000000
--- a/tests/validation/CL/UNIT/dynamic_fusion/DependencyGraph.cpp
+++ /dev/null
@@ -1,266 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
-#include "arm_compute/core/experimental/DependencyGraph.h"
-
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-
-using namespace arm_compute::experimental::dynamic_fusion;
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-TEST_SUITE(CL)
-
-TEST_SUITE(UNIT)
-TEST_SUITE(DYNAMIC_FUSION)
-TEST_SUITE(DependencyGraph)
-
-TEST_CASE(Correct_Graph_Creation_Should_Pass, framework::DatasetMode::ALL)
-{
- DependencyGraph graph{};
- const auto t0 = graph.add_tensor();
- const auto t1 = graph.add_tensor();
- const auto t2 = graph.add_tensor();
- const auto t3 = graph.add_tensor();
- const auto t4 = graph.add_tensor();
-
- const auto o0 = graph.add_operator({ t0, t1 }, { t2 }).second;
- const auto o1 = graph.add_operator({ t3, t2 }, { t4 }).second;
-
- ARM_COMPUTE_EXPECT_EQUAL(graph.number_of_ops(), 2U, framework::LogLevel::ERRORS);
- ARM_COMPUTE_EXPECT_EQUAL(graph.number_of_tensors(), 5U, framework::LogLevel::ERRORS);
-
- const DependencyGraph ref_graph
- {
- {
- // src_tensors
- { o0, { t0, t1 } },
- { o1, { t3, t2 } },
- },
- {
- // dst_tensors
- { o0, { t2 } },
- { o1, { t4 } },
- },
- {
- // src_ops
- { t0, {} },
- { t1, {} },
- { t2, { o0 } },
- { t3, {} },
- { t4, { o1 } },
- },
- {
- // dst_ops
- { t0, { o0 } },
- { t1, { o0 } },
- { t2, { o1 } },
- { t3, { o1 } },
- { t4, {} },
- }
-
- };
- ARM_COMPUTE_EXPECT(graph == ref_graph, framework::LogLevel::ERRORS);
-}
-
-TEST_CASE(Correct_Merge_Points_Should_Enable_Graph_Expansion, framework::DatasetMode::ALL)
-{
- // Merge points are a simple way to collapse "graph of graphs" into a single graph
- // Suppose we have a top-level graph g0
- DependencyGraph g0{};
- const auto g0_t0 = g0.add_tensor();
- const auto g0_t1 = g0.add_tensor();
- const auto g0_t2 = g0.add_tensor();
- const auto g0_t3 = g0.add_tensor();
- const auto g0_t4 = g0.add_tensor();
- g0.add_operator({ g0_t0, g0_t1 }, { g0_t2 }); // g0_o0
- g0.add_operator({ g0_t3, g0_t2 }, { g0_t4 }); // g0_o1
-
- // Then g0 expands into g1, with additional nodes added in-between "merge point tensors"
- // Note that the expansion logic may be local to each operator node
- DependencyGraph g1{};
- // g0_o0 expands into g1_o0, g1_o1, g1_o2
- const auto g1_t0 = g1.add_tensor(g0_t0);
- const auto g1_t1 = g1.add_tensor(g0_t1);
- const auto g1_t2 = g1.add_tensor();
- const auto g1_t3 = g1.add_tensor();
- const auto g1_t4 = g1.add_tensor(g0_t2);
- const auto g1_o0 = g1.add_operator({ g1_t0 }, { g1_t2 }).second;
- const auto g1_o1 = g1.add_operator({ g1_t1 }, { g1_t3 }).second;
- const auto g1_o2 = g1.add_operator({ g1_t2, g1_t3 }, { g1_t4 }).second;
-
- // g0_o1 expands into g1_o3
- const auto g1_t5 = g1.add_tensor(g0_t3);
- const auto g1_t6 = g1.add_tensor(g0_t2);
- const auto g1_t7 = g1.add_tensor(g0_t4);
- ARM_COMPUTE_EXPECT_EQUAL(g1_t4, g1_t6, framework::LogLevel::ERRORS); // both associate with the same merge point g0_t2, thus they should point to the same tensor in g1
- const auto g1_o3 = g1.add_operator({ g1_t5, g1_t6 }, { g1_t7 }).second;
-
- const DependencyGraph ref_graph
- {
- {
- // src_tensors
- { g1_o0, { g1_t0 } },
- { g1_o1, { g1_t1 } },
- { g1_o2, { g1_t2, g1_t3 } },
- { g1_o3, { g1_t5, g1_t4 } },
- },
- {
- // dst_tensors
- { g1_o0, { g1_t2 } },
- { g1_o1, { g1_t3 } },
- { g1_o2, { g1_t4 } },
- { g1_o3, { g1_t7 } },
- },
- {
- // src_ops
- { g1_t0, {} },
- { g1_t1, {} },
- { g1_t2, { g1_o0 } },
- { g1_t3, { g1_o1 } },
- { g1_t4, { g1_o2 } },
- { g1_t5, {} },
- { g1_t7, { g1_o3 } },
- },
- {
- // dst_ops
- { g1_t0, { g1_o0 } },
- { g1_t1, { g1_o1 } },
- { g1_t2, { g1_o2 } },
- { g1_t3, { g1_o2 } },
- { g1_t4, { g1_o3 } },
- { g1_t5, { g1_o3 } },
- { g1_t7, {} },
- },
- {
- // merge points
- { g0_t0, g1_t0 },
- { g0_t1, g1_t1 },
- { g0_t2, g1_t4 },
- { g0_t3, g1_t5 },
- { g0_t4, g1_t7 },
- }
- };
- ARM_COMPUTE_EXPECT(g1 == ref_graph, framework::LogLevel::ERRORS);
-}
-
-TEST_CASE(Path_Existence_Check_0, framework::DatasetMode::ALL)
-{
- DependencyGraph graph{};
- const auto t0 = graph.add_tensor();
- const auto t1 = graph.add_tensor();
- const auto t2 = graph.add_tensor();
- const auto t3 = graph.add_tensor();
- const auto t4 = graph.add_tensor();
- const auto t5 = graph.add_tensor();
- const auto t6 = graph.add_tensor();
- const auto t7 = graph.add_tensor();
- const auto o0 = graph.add_operator({ t1 }, { t3, t4 }).second;
- const auto o1 = graph.add_operator({ t3 }, { t5 }).second;
- const auto o2 = graph.add_operator({ t5, t6 }, { t7 }).second;
- const auto o3 = graph.add_operator({ t4 }, { t6 }).second;
- const auto o4 = graph.add_operator({ t0, t5 }, { t2 }).second;
-
- ARM_COMPUTE_UNUSED(o1, o3);
-
- ARM_COMPUTE_EXPECT((graph.path_exists_from_tensor_to_op(t3, o2)), framework::LogLevel::ERRORS);
- ARM_COMPUTE_EXPECT((graph.path_exists_from_tensor_to_op(t1, o4)), framework::LogLevel::ERRORS);
- ARM_COMPUTE_EXPECT(!(graph.path_exists_from_tensor_to_op(t2, o4)), framework::LogLevel::ERRORS);
- ARM_COMPUTE_EXPECT(!(graph.path_exists_from_tensor_to_op(t0, o2)), framework::LogLevel::ERRORS);
-
- ARM_COMPUTE_EXPECT((graph.path_exists_from_op_to_op(o0, o2)), framework::LogLevel::ERRORS);
- ARM_COMPUTE_EXPECT(!(graph.path_exists_from_op_to_op(o2, o0)), framework::LogLevel::ERRORS);
-
- ARM_COMPUTE_EXPECT(!(graph.path_exists_from_op_to_op(o2, o4)), framework::LogLevel::ERRORS);
-}
-
-TEST_CASE(Correct_Topological_Sort_Should_Pass, framework::DatasetMode::ALL)
-{
- DependencyGraph graph{};
- const auto t0 = graph.add_tensor();
- const auto t1 = graph.add_tensor();
- const auto t2 = graph.add_tensor();
- const auto t3 = graph.add_tensor();
- const auto t4 = graph.add_tensor();
- const auto t5 = graph.add_tensor();
- const auto t6 = graph.add_tensor();
- const auto t7 = graph.add_tensor();
- const auto o0 = graph.add_operator({ t1 }, { t3, t4 }).second;
- const auto o1 = graph.add_operator({ t3 }, { t5 }).second;
- const auto o2 = graph.add_operator({ t5, t6 }, { t7 }).second;
- const auto o3 = graph.add_operator({ t4 }, { t6 }).second;
- const auto o4 = graph.add_operator({ t0, t5 }, { t2 }).second;
-
- const auto res = graph.topological_sort();
- ARM_COMPUTE_EXPECT(bool(res.first), framework::LogLevel::ERRORS);
- std::vector<DependencyGraph::OpPack> ref_sorted_op_packs
- {
- { o0, { t1 }, { t3, t4 } },
- { o1, { t3 }, { t5 } },
- { o3, { t4 }, { t6 } },
- { o4, { t0, t5 }, { t2 } },
- { o2, { t5, t6 }, { t7 } },
-
- };
- ARM_COMPUTE_EXPECT((res.second == ref_sorted_op_packs), framework::LogLevel::ERRORS);
-}
-
-TEST_CASE(Cycles_Should_Fail, framework::DatasetMode::ALL)
-{
- DependencyGraph graph{};
- const auto t0 = graph.add_tensor();
- const auto t1 = graph.add_tensor();
- const auto t2 = graph.add_tensor();
- const auto t3 = graph.add_tensor();
-
- graph.add_operator({ t0, t1 }, { t2 });
- graph.add_operator({ t2 }, { t1, t3 }); // Ideally error should occur here
-
- const auto res = graph.topological_sort();
- ARM_COMPUTE_EXPECT(!bool(res.first), framework::LogLevel::ERRORS);
-}
-TEST_CASE(Loops_Should_Fail, framework::DatasetMode::ALL)
-{
- DependencyGraph graph{};
- const auto t0 = graph.add_tensor();
- const auto t1 = graph.add_tensor();
- const auto t2 = graph.add_tensor();
-
- ARM_COMPUTE_EXPECT_THROW(graph.add_operator({ t0, t2 }, { t1, t2 }).first, framework::LogLevel::ERRORS);
- ARM_COMPUTE_UNUSED(t0, t1, t2);
-}
-TEST_SUITE_END() // DependencyGraph
-TEST_SUITE_END() // DYNAMIC_FUSION
-TEST_SUITE_END() // UNIT
-
-TEST_SUITE_END() // CL
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file
diff --git a/tests/validation/CL/UNIT/dynamic_fusion/Floor.cpp b/tests/validation/CL/UNIT/dynamic_fusion/Floor.cpp
deleted file mode 100644
index 2b8f69e5e7..0000000000
--- a/tests/validation/CL/UNIT/dynamic_fusion/Floor.cpp
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
-#include "arm_compute/core/TensorInfo.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/experimental/ClWorkload.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/experimental/ClCompositeOperator.h"
-#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelDescriptors.h"
-#include "tests/CL/CLAccessor.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/validation/CL/UNIT/dynamic_fusion/Utils.h"
-#include "tests/validation/Validation.h"
-
-#include "tests/validation/reference/Floor.h"
-#include "tests/validation/reference/Permute.h"
-
-#ifdef ARM_COMPUTE_ASSERTS_ENABLED
-#include "tests/SimpleTensorPrinter.h"
-#endif /* ARM_COMPUTE_ASSERTS_ENABLED */
-
-using namespace arm_compute::experimental::dynamic_fusion;
-using namespace arm_compute::test::validation::utils;
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-TEST_SUITE(CL)
-TEST_SUITE(UNIT)
-TEST_SUITE(DYNAMIC_FUSION)
-TEST_CASE(Operator_Floor_1_F32, framework::DatasetMode::ALL)
-{
- /* Computation:
- * out = floor(input)
- */
- const auto data_type = DataType::F32;
- const auto data_layout = DataLayout::NHWC;
- const auto t_shape = TensorShape(32, 16);
- auto t_input_info = TensorInfo(t_shape, 1, data_type, data_layout);
- auto t_dst_info = TensorInfo();
-
- FloorDescriptor floor_desc{};
-
- // Create reference
- SimpleTensor<float> ref_t_input{ t_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC };
-
- // Fill reference
- fill<float>(ref_t_input, 0, library.get());
-
- auto ref_t_input_nchw = reference::permute(ref_t_input, PermutationVector(1U, 2U, 0U));
- auto t_dst_shape_nchw = t_shape;
- permute(t_dst_shape_nchw, PermutationVector(1U, 2U, 0U));
-
- auto ref_t_dst_nchw = reference::floor_layer(ref_t_input_nchw);
- const auto ref_t_dst = reference::permute(ref_t_dst_nchw, PermutationVector(2U, 0U, 1U));
-
- CLScheduler::get().default_reinit();
- const auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
- OperatorGraph op_graph;
-
- const auto op_t_input = add_tensor(op_graph, t_input_info);
- const auto op_t_dst = add_tensor(op_graph, t_dst_info);
-
- add_op_floor(op_graph, floor_desc, op_t_input, op_t_dst);
-
- const ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } };
- ClWorkload workload;
- build(workload, op_graph, workload_ctx);
-
- ClCompositeOperator op;
- op.configure(cl_compile_ctx, workload);
-
- // Construct tensors
- CLTensor t_input{};
- CLTensor t_dst{};
-
- // Init tensors
- t_input.allocator()->init(t_input_info);
- t_dst.allocator()->init(t_dst_info);
-
- // Allocate and fill tensors
- t_input.allocator()->allocate();
- t_dst.allocator()->allocate();
- fill<float>(CLAccessor(t_input), 0, library.get());
- // "Pack" tensors
- OpTensorBinding bp_tensors({ { op_t_input, &t_input },
- { op_t_dst, &t_dst }
- });
-
- // Populate prepare and run pack-maps (including allocating aux tensors)
- ClAuxTensorData aux_tensor_data{};
- TensorPackMap prepare_pack_map{};
- TensorPackMap run_pack_map{};
- bind_tensors(aux_tensor_data, prepare_pack_map, run_pack_map, workload, bp_tensors);
-
- op.prepare(prepare_pack_map);
- op.run(run_pack_map);
- RelativeTolerance<float> tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */
- validate(CLAccessor(t_dst), ref_t_dst_nchw, tolerance_f32);
-}
-
-TEST_SUITE_END() // DYNAMIC_FUSION
-TEST_SUITE_END() // UNIT
-TEST_SUITE_END() // CL
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file
diff --git a/tests/validation/CL/UNIT/dynamic_fusion/Integration_OperatorFuseMovenetSubGraph1.cpp b/tests/validation/CL/UNIT/dynamic_fusion/Integration_OperatorFuseMovenetSubGraph1.cpp
deleted file mode 100644
index 3a8b7c8ce8..0000000000
--- a/tests/validation/CL/UNIT/dynamic_fusion/Integration_OperatorFuseMovenetSubGraph1.cpp
+++ /dev/null
@@ -1,402 +0,0 @@
-/*
- * Copyright (c) 2022 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifdef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
-#include "arm_compute/core/TensorInfo.h"
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/experimental/ClWorkload.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/experimental/ClCompositeOperator.h"
-#include "src/core/experimental/dynamic_fusion/WorkloadImpl/ClKernelDescriptors.h"
-#include "src/gpu/cl/operators/ClAdd.h"
-#include "src/gpu/cl/operators/ClConv2d.h"
-#include "tests/CL/CLAccessor.h"
-#include "tests/framework/Asserts.h"
-#include "tests/framework/Macros.h"
-#include "tests/validation/CL/UNIT/dynamic_fusion/Utils.h"
-#include "tests/validation/Validation.h"
-
-#include "tests/validation/reference/ConvolutionLayer.h"
-#include "tests/validation/reference/ElementwiseOperations.h"
-#include "tests/validation/reference/Permute.h"
-
-#ifdef ARM_COMPUTE_ASSERTS_ENABLED
-#include "tests/SimpleTensorPrinter.h"
-#endif /* ARM_COMPUTE_ASSERTS_ENABLED */
-
-using namespace arm_compute::experimental::dynamic_fusion;
-using namespace arm_compute::test::validation::utils;
-
-namespace arm_compute
-{
-namespace test
-{
-namespace validation
-{
-TEST_SUITE(CL)
-TEST_SUITE(INTEGRATION)
-TEST_SUITE(DYNAMIC_FUSION)
-TEST_CASE(Operator_Fuse_Movenet_SubGraph_1_F32, framework::DatasetMode::ALL)
-{
- // Please refer to: https://confluence.arm.com/pages/viewpage.action?pageId=886243697
- /* Computation:
- * out = add_desc(addend, conv2d1x1(direct_conv)(input, weights, bias))
- */
- const auto data_type = DataType::F32;
- const auto data_layout = DataLayout::NHWC;
- const auto t_input_shape = TensorShape(384, 12, 12);
- // const auto t_weight_shape = TensorShape(384, 1, 1, 64);
- // const auto t_dst_shape = TensorShape(64, 12, 12);
- const auto t_weight_shape = TensorShape(384, 1, 1, 16);
- const auto t_dst_shape = TensorShape(16, 12, 12);
- auto t_input_info = TensorInfo(t_input_shape, 1, data_type, data_layout);
- auto t_weight_info = TensorInfo(t_weight_shape, 1, data_type, data_layout);
- auto t_l1_addend_info = TensorInfo(t_dst_shape, 1, data_type, data_layout);
- auto t_acc_info = TensorInfo(); // Intermediate tensor for cond3
- auto t_dst_info = TensorInfo();
-
- Conv2dDescriptor conv2d_desc{};
- ElementwiseDescriptor add_desc{ ArithmeticOperation::ADD };
-
- // Create reference
- SimpleTensor<float> ref_t_input{ t_input_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC };
- SimpleTensor<float> ref_t_weight{ t_weight_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC };
- SimpleTensor<float> ref_t_bias_placeholder{ t_dst_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC };
- SimpleTensor<float> ref_t_l1_addend{ t_dst_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC };
-
- // Fill reference
- fill<float>(ref_t_input, 0, library.get());
- fill<float>(ref_t_weight, 1, library.get());
- fill<float>(ref_t_l1_addend, 2, library.get());
-
- auto ref_t_input_nchw = reference::permute(ref_t_input, PermutationVector(1U, 2U, 0U));
- auto ref_t_weight_nchw = reference::permute(ref_t_weight, PermutationVector(1U, 2U, 0U));
- auto ref_t_bias_placeholder_nchw = reference::permute(ref_t_bias_placeholder, PermutationVector(1U, 2U, 0U));
- auto ref_t_l1_addend_nchw = reference::permute(ref_t_l1_addend, PermutationVector(1U, 2U, 0U));
- auto t_dst_shape_nchw = t_dst_shape;
- permute(t_dst_shape_nchw, PermutationVector(1U, 2U, 0U));
-
- PadStrideInfo legacy_pad_stride(conv2d_desc.stride.x(), conv2d_desc.stride.y(), conv2d_desc.pad.left, conv2d_desc.pad.right, conv2d_desc.pad.top, conv2d_desc.pad.bottom, DimensionRoundingType{});
- auto ref_t_dst_nchw = reference::arithmetic_operation(
- ArithmeticOperation::ADD,
- ref_t_l1_addend_nchw,
- reference::convolution_layer(ref_t_input_nchw, ref_t_weight_nchw, ref_t_bias_placeholder_nchw, t_dst_shape_nchw, legacy_pad_stride, conv2d_desc.dilation),
- data_type,
- ConvertPolicy{});
- const auto ref_t_dst = reference::permute(ref_t_dst_nchw, PermutationVector(2U, 0U, 1U));
-
- CLScheduler::get().default_reinit();
- const auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
- OperatorGraph op_graph;
-
- const auto op_t_input = add_tensor(op_graph, t_input_info);
- const auto op_t_weight = add_tensor(op_graph, t_weight_info);
- const auto op_t_l1_addend = add_tensor(op_graph, t_l1_addend_info);
- const auto op_t_acc = add_tensor(op_graph, t_acc_info); // temp accumulator; TensorInfo to be inferred
- const auto op_t_dst = add_tensor(op_graph, t_dst_info);
-
- auto conv2d = add_op_conv2d(op_graph, conv2d_desc, op_t_input, op_t_weight, op_t_acc);
- force_conv2d_method(op_graph, conv2d, ConvolutionMethod::DIRECT);
- add_op_elementwise_op(op_graph, add_desc, op_t_acc, op_t_l1_addend, op_t_dst);
-
- const ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } };
- ClWorkload workload;
- build(workload, op_graph, workload_ctx);
-
- ClCompositeOperator op;
- op.configure(cl_compile_ctx, workload);
-
- // Construct tensors
- CLTensor t_input{};
- CLTensor t_weight{};
- CLTensor t_l1_addend{};
- CLTensor t_dst{};
-
- // Init tensors
- t_input.allocator()->init(t_input_info);
- t_weight.allocator()->init(t_weight_info);
- t_l1_addend.allocator()->init(t_dst_info);
- t_dst.allocator()->init(t_dst_info);
-
- // Allocate and fill tensors
- t_input.allocator()->allocate();
- t_weight.allocator()->allocate();
- t_l1_addend.allocator()->allocate();
- t_dst.allocator()->allocate();
- fill<float>(CLAccessor(t_input), 0, library.get());
- fill<float>(CLAccessor(t_weight), 1, library.get());
- fill<float>(CLAccessor(t_l1_addend), 2, library.get());
- // "Pack" tensors
- OpTensorBinding bp_tensors({ { op_t_input, &t_input },
- { op_t_weight, &t_weight },
- { op_t_l1_addend, &t_l1_addend },
- { op_t_dst, &t_dst }
- });
-
- // Populate prepare and run pack-maps (including allocating aux tensors)
- ClAuxTensorData aux_tensor_data{};
- TensorPackMap prepare_pack_map{};
- TensorPackMap run_pack_map{};
- bind_tensors(aux_tensor_data, prepare_pack_map, run_pack_map, workload, bp_tensors);
-
- op.prepare(prepare_pack_map);
- op.run(run_pack_map);
- RelativeTolerance<float> tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */
- validate(CLAccessor(t_dst), ref_t_dst_nchw, tolerance_f32);
-}
-TEST_SUITE(Unsupported)
-TEST_CASE(DataType_QASYMM8, framework::DatasetMode::ALL)
-{
- const auto data_type = DataType::QASYMM8;
- const auto data_layout = DataLayout::NHWC;
- const auto t_input_shape = TensorShape(384, 12, 12);
- const auto t_weight_shape = TensorShape(384, 1, 1, 64);
- const auto t_dst_shape = TensorShape(64, 12, 12);
- auto t_input_info = TensorInfo(t_input_shape, 1, data_type, data_layout);
- auto t_weight_info = TensorInfo(t_weight_shape, 1, data_type, data_layout);
- auto t_l1_addend_info = TensorInfo(t_dst_shape, 1, data_type, data_layout);
- auto t_acc_info = TensorInfo(t_dst_shape, 1, data_type, data_layout);
- auto t_dst_info = TensorInfo(t_dst_shape, 1, data_type, data_layout);
-
- Conv2dDescriptor conv2d_desc{};
- ElementwiseDescriptor add_desc{};
-
- OperatorGraph op_graph;
-
- const auto op_t_input = add_tensor(op_graph, t_input_info);
- const auto op_t_weight = add_tensor(op_graph, t_weight_info);
- const auto op_t_l1_addend = add_tensor(op_graph, t_l1_addend_info);
- const auto op_t_acc = add_tensor(op_graph, t_acc_info); // temp accumulator; TensorInfo to be inferred
- const auto op_t_dst = add_tensor(op_graph, t_dst_info);
-
- auto conv2d = add_op_conv2d(op_graph, conv2d_desc, op_t_input, op_t_weight, op_t_acc);
- add_op_elementwise_op(op_graph, add_desc, op_t_acc, op_t_l1_addend, op_t_dst);
- force_conv2d_method(op_graph, conv2d, ConvolutionMethod::DIRECT);
-
- const ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } };
- ClWorkload workload;
- const auto success = build(workload, op_graph, workload_ctx);
-
- ARM_COMPUTE_EXPECT(!bool(success), framework::LogLevel::ERRORS);
- ARM_COMPUTE_EXPECT(!bool(ClCompositeOperator::validate(workload)), framework::LogLevel::ERRORS);
-}
-TEST_CASE(DataLayout_NCHW, framework::DatasetMode::ALL)
-{
- const auto data_type = DataType::F32;
- const auto data_layout = DataLayout::NCHW;
- const auto t_input_shape = TensorShape(384, 12, 12);
- const auto t_weight_shape = TensorShape(384, 1, 1, 64);
- const auto t_dst_shape = TensorShape(64, 12, 12);
- auto t_input_info = TensorInfo(t_input_shape, 1, data_type, data_layout);
- auto t_weight_info = TensorInfo(t_weight_shape, 1, data_type, data_layout);
- auto t_dst_info = TensorInfo(t_dst_shape, 1, data_type, data_layout);
-
- Conv2dDescriptor conv2d_desc{};
-
- OperatorGraph op_graph;
-
- const auto op_t_input = add_tensor(op_graph, t_input_info);
- const auto op_t_weight = add_tensor(op_graph, t_weight_info);
- const auto op_t_dst = add_tensor(op_graph, t_dst_info);
-
- auto conv2d = add_op_conv2d(op_graph, conv2d_desc, op_t_input, op_t_weight, op_t_dst);
- force_conv2d_method(op_graph, conv2d, ConvolutionMethod::DIRECT);
- const ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } };
- ClWorkload workload;
- const auto success = build(workload, op_graph, workload_ctx);
-
- ARM_COMPUTE_EXPECT(!bool(success), framework::LogLevel::ERRORS);
- ARM_COMPUTE_EXPECT(!bool(ClCompositeOperator::validate(workload)), framework::LogLevel::ERRORS);
-}
-TEST_SUITE_END() // Unsupported
-
-TEST_SUITE(Invalid)
-TEST_CASE(Multiple_Complex_Ops_0, framework::DatasetMode::ALL)
-{
- /* Computation:
- * out = conv2d(conv2d(l0_input, l0_weight), l1_weight)
- */
- const auto data_type = DataType::F32;
- const auto data_layout = DataLayout::NHWC;
- const auto t_l0_input_shape = TensorShape(1024, 56, 56);
- const auto t_l0_weight_shape = TensorShape(512, 1024, 1, 1);
- const auto t_l1_weight_shape = TensorShape(512, 256, 1, 1);
-
- auto t_l0_input_info = TensorInfo(t_l0_input_shape, 1, data_type, data_layout);
- auto t_l0_weight_info = TensorInfo(t_l0_weight_shape, 1, data_type, data_layout);
- auto t_l1_weight_info = TensorInfo(t_l1_weight_shape, 1, data_type, data_layout);
- auto t_l0_dst_info = TensorInfo();
- auto t_dst_info = TensorInfo();
-
- OperatorGraph op_graph;
- const auto conv2d_desc = Conv2dDescriptor{};
-
- const auto op_t_l0_input = add_tensor(op_graph, t_l0_input_info);
- const auto op_t_l0_weight = add_tensor(op_graph, t_l0_weight_info);
- const auto op_t_l1_weight = add_tensor(op_graph, t_l1_weight_info);
- const auto op_t_l0_dst = add_tensor(op_graph, t_l0_dst_info); // temp accumulator; TensorInfo to be inferred
- const auto op_t_dst = add_tensor(op_graph, t_dst_info);
-
- add_op_conv2d(op_graph, conv2d_desc, op_t_l0_input, op_t_l0_weight, op_t_l0_dst);
- add_op_conv2d(op_graph, conv2d_desc, op_t_l0_dst, op_t_l1_weight, op_t_dst);
-
- const ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } };
- ClWorkload workload;
- const auto success = build(workload, op_graph, workload_ctx);
-
- ARM_COMPUTE_EXPECT(!bool(success), framework::LogLevel::ERRORS);
- ARM_COMPUTE_EXPECT(!bool(ClCompositeOperator::validate(workload)), framework::LogLevel::ERRORS);
-}
-TEST_CASE(Enlarging_Execution_Space, framework::DatasetMode::ALL)
-{
- /* Computation:
- * out = add(l2_lhs, add(add(l0_lhs, l0_rhs), l1_rhs))
- */
- const auto data_type = DataType::F32;
- const auto data_layout = DataLayout::NHWC;
- const auto t_l0_lhs_shape = TensorShape(1, 256, 3);
- const auto t_l0_rhs_shape = TensorShape(1, 256, 3);
- const auto t_l1_rhs_shape = TensorShape(1, 1, 3);
- const auto t_l2_lhs_shape = TensorShape(1024, 1, 3);
-
- auto t_l0_lhs_info = TensorInfo(t_l0_lhs_shape, 1, data_type, data_layout);
- auto t_l0_rhs_info = TensorInfo(t_l0_rhs_shape, 1, data_type, data_layout);
- auto t_l1_rhs_info = TensorInfo(t_l1_rhs_shape, 1, data_type, data_layout);
- auto t_l2_lhs_info = TensorInfo(t_l2_lhs_shape, 1, data_type, data_layout);
- auto t_l0_dst_info = TensorInfo();
- auto t_l1_dst_info = TensorInfo();
- auto t_dst_info = TensorInfo();
-
- OperatorGraph op_graph;
- const auto add_desc = ElementwiseDescriptor{};
-
- const auto op_t_l0_lhs = add_tensor(op_graph, t_l0_lhs_info);
- const auto op_t_l0_rhs = add_tensor(op_graph, t_l0_rhs_info);
- const auto op_t_l1_rhs = add_tensor(op_graph, t_l1_rhs_info);
- const auto op_t_l2_lhs = add_tensor(op_graph, t_l2_lhs_info);
- const auto op_t_l0_dst = add_tensor(op_graph, t_l0_dst_info); // temp accumulator; TensorInfo to be inferred
- const auto op_t_l1_dst = add_tensor(op_graph, t_l1_dst_info); // temp accumulator; TensorInfo to be inferred
- const auto op_t_dst = add_tensor(op_graph, t_dst_info);
-
- add_op_elementwise_op(op_graph, add_desc, op_t_l0_lhs, op_t_l0_rhs, op_t_l0_dst);
- add_op_elementwise_op(op_graph, add_desc, op_t_l0_dst, op_t_l1_rhs, op_t_l1_dst);
- add_op_elementwise_op(op_graph, add_desc, op_t_l1_dst, op_t_l2_lhs, op_t_dst);
-
- const ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } };
- ClWorkload workload;
- const auto success = build(workload, op_graph, workload_ctx);
-
- ARM_COMPUTE_EXPECT(!bool(success), framework::LogLevel::ERRORS);
- ARM_COMPUTE_EXPECT(!bool(ClCompositeOperator::validate(workload)), framework::LogLevel::ERRORS);
-}
-TEST_CASE(Root_Simple_And_Complex, framework::DatasetMode::ALL)
-{
- /* Computation:
- * out = add(conv(l0_0_input, l0_0_weight), add(l0_1_lhs, l0_1_rhs))
- */
- const auto data_type = DataType::F32;
- const auto data_layout = DataLayout::NHWC;
-
- const auto t_l0_0_input_shape = TensorShape(128, 21, 21);
- const auto t_l0_0_weight_shape = TensorShape(144, 128, 1, 1);
- const auto t_l0_1_lhs_shape = TensorShape(144, 21, 21);
- const auto t_l0_1_rhs_shape = TensorShape(1, 1, 21);
-
- auto t_l0_0_input_info = TensorInfo(t_l0_0_input_shape, 1, data_type, data_layout);
- auto t_l0_0_weight_info = TensorInfo(t_l0_0_weight_shape, 1, data_type, data_layout);
- auto t_l0_1_lhs_info = TensorInfo(t_l0_1_lhs_shape, 1, data_type, data_layout);
- auto t_l0_1_rhs_info = TensorInfo(t_l0_1_rhs_shape, 1, data_type, data_layout);
- auto t_l0_0_dst_info = TensorInfo();
- auto t_l0_1_dst_info = TensorInfo();
- auto t_dst_info = TensorInfo();
-
- OperatorGraph op_graph;
- const auto conv2d_desc = Conv2dDescriptor{};
- const auto add_desc = ElementwiseDescriptor{};
-
- const auto op_t_l0_0_input = add_tensor(op_graph, t_l0_0_input_info);
- const auto op_t_l0_0_weight = add_tensor(op_graph, t_l0_0_weight_info);
- const auto op_t_l0_1_lhs = add_tensor(op_graph, t_l0_1_lhs_info);
- const auto op_t_l0_1_rhs = add_tensor(op_graph, t_l0_1_rhs_info);
- const auto op_t_l0_0_dst = add_tensor(op_graph, t_l0_0_dst_info); // temp accumulator; TensorInfo to be inferred
- const auto op_t_l0_1_dst = add_tensor(op_graph, t_l0_1_dst_info); // temp accumulator; TensorInfo to be inferred
- const auto op_t_dst = add_tensor(op_graph, t_dst_info);
-
- add_op_conv2d(op_graph, conv2d_desc, op_t_l0_0_input, op_t_l0_0_weight, op_t_l0_0_dst);
- add_op_elementwise_op(op_graph, add_desc, op_t_l0_1_lhs, op_t_l0_1_rhs, op_t_l0_1_dst);
- add_op_elementwise_op(op_graph, add_desc, op_t_l0_0_dst, op_t_l0_1_dst, op_t_dst);
-
- const ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } };
- ClWorkload workload;
- const auto success = build(workload, op_graph, workload_ctx);
-
- ARM_COMPUTE_EXPECT(!bool(success), framework::LogLevel::ERRORS);
- ARM_COMPUTE_EXPECT(!bool(ClCompositeOperator::validate(workload)), framework::LogLevel::ERRORS);
-}
-TEST_CASE(Loop, framework::DatasetMode::ALL)
-{
- /* Computation:
- * tensor state0;
- * state1 = add(l0_lhs, state0)
- * state0 = add(l1_lhs, state1)
- */
- const auto data_type = DataType::F32;
- const auto data_layout = DataLayout::NHWC;
-
- const auto t_shape = TensorShape(13, 21);
-
- auto t_l0_lhs_info = TensorInfo(t_shape, 1, data_type, data_layout);
- auto t_l1_lhs_info = TensorInfo(t_shape, 1, data_type, data_layout);
- auto state0_info = TensorInfo(t_shape, 1, data_type, data_layout);
- auto state1_info = TensorInfo();
-
- OperatorGraph op_graph;
- const auto conv2d_desc = Conv2dDescriptor{};
- const auto add_desc = ElementwiseDescriptor{};
-
- const auto op_t_l0_lhs = add_tensor(op_graph, t_l0_lhs_info);
- const auto op_t_l1_lhs = add_tensor(op_graph, t_l1_lhs_info);
- const auto op_t_state0 = add_tensor(op_graph, state0_info);
- const auto op_t_state1 = add_tensor(op_graph, state1_info);
-
- add_op_conv2d(op_graph, conv2d_desc, op_t_l0_lhs, op_t_state0, op_t_state1);
- add_op_elementwise_op(op_graph, add_desc, op_t_l1_lhs, op_t_state1, op_t_state0);
-
- const ClWorkloadContext workload_ctx{ GpuInfo{ CLScheduler::get().target() } };
- ClWorkload workload;
- const auto success = build(workload, op_graph, workload_ctx);
-
- ARM_COMPUTE_EXPECT(!bool(success), framework::LogLevel::ERRORS);
- ARM_COMPUTE_EXPECT(!bool(ClCompositeOperator::validate(workload)), framework::LogLevel::ERRORS);
-}
-TEST_SUITE_END() // Invalid
-
-TEST_SUITE_END() // DYNAMIC_FUSION
-TEST_SUITE_END() // INTEGRATION
-TEST_SUITE_END() // CL
-} // namespace validation
-} // namespace test
-} // namespace arm_compute
-#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ \ No newline at end of file
diff --git a/tests/validation/CL/UNIT/dynamic_fusion/Utils.h b/tests/validation/dynamic_fusion/Utils.h
index 4512305c1e..72e9ec5955 100644
--- a/tests/validation/CL/UNIT/dynamic_fusion/Utils.h
+++ b/tests/validation/dynamic_fusion/Utils.h
@@ -21,8 +21,9 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef TESTS_VALIDATION_CL_DYNAMICFUSION_UTILS
-#define TESTS_VALIDATION_CL_DYNAMICFUSION_UTILS
+
+#ifndef TESTS_VALIDATION_DYNAMIC_FUSION_UTILS
+#define TESTS_VALIDATION_DYNAMIC_FUSION_UTILS
#include "tests/AssetsLibrary.h"
#include "utils/Utils.h"
@@ -68,4 +69,5 @@ void fill(U &&tensor, int seed, AssetsLibrary *library)
} // namespace validation
} // namespace test
} // namespace arm_compute
-#endif //TESTS_VALIDATION_CL_DYNAMICFUSION_UTILS \ No newline at end of file
+
+#endif /* TESTS_VALIDATION_DYNAMIC_FUSION_UTILS */
diff --git a/tests/validation/dynamic_fusion/gpu/Integration.cpp b/tests/validation/dynamic_fusion/gpu/Integration.cpp
index 6ee2e20d35..036f28b29f 100644
--- a/tests/validation/dynamic_fusion/gpu/Integration.cpp
+++ b/tests/validation/dynamic_fusion/gpu/Integration.cpp
@@ -36,8 +36,8 @@
#include "tests/CL/CLAccessor.h"
#include "tests/framework/Asserts.h"
#include "tests/framework/Macros.h"
-#include "tests/validation/CL/UNIT/dynamic_fusion/Utils.h"
#include "tests/validation/Validation.h"
+#include "tests/validation/dynamic_fusion/Utils.h"
#include "tests/validation/reference/ConvolutionLayer.h"
#include "tests/validation/reference/ElementwiseOperations.h"
#include "tests/validation/reference/Permute.h"