From afd38f0c617d6f89b2b4532c6c44f116617e2b6f Mon Sep 17 00:00:00 2001 From: Felix Thomasmathibalan Date: Wed, 27 Sep 2023 17:46:17 +0100 Subject: Apply clang-format on repository Code is formatted as per a revised clang format configuration file(not part of this delivery). Version 14.0.6 is used. Exclusion List: - files with .cl extension - files that are not strictly C/C++ (e.g. Android.bp, Sconscript ...) And the following directories - compute_kernel_writer/validation/ - tests/ - include/ - src/core/NEON/kernels/convolution/ - src/core/NEON/kernels/arm_gemm/ - src/core/NEON/kernels/arm_conv/ - data/ There will be a follow up for formatting of .cl files and the files under tests/ and compute_kernel_writer/validation/. Signed-off-by: Felix Thomasmathibalan Change-Id: Ib7eb1fcf4e7537b9feaefcfc15098a804a3fde0a Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10391 Benchmark: Arm Jenkins Tested-by: Arm Jenkins Reviewed-by: Gunes Bayir --- .../runtime/gpu/cl/ClKernelRuntime.cpp | 61 ++++--- .../runtime/gpu/cl/ClKernelRuntime.h | 11 +- .../runtime/gpu/cl/ClWorkloadRuntime.cpp | 81 ++++----- .../cl/ckw_driver/GpuCkwKernelArgumentsHelpers.cpp | 7 +- .../cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h | 5 +- src/dynamic_fusion/sketch/ArgumentPack.h | 45 +++-- .../attributes/DepthwiseConv2dAttributes.cpp | 3 +- .../sketch/attributes/Pool2dAttributes.cpp | 1 + src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h | 15 +- .../sketch/gpu/GpuKernelComponentGraph.cpp | 20 +-- .../sketch/gpu/GpuKernelComponentGraph.h | 18 +- .../sketch/gpu/GpuKernelComponentGroup.cpp | 105 ++++++------ .../sketch/gpu/GpuKernelComponentGroup.h | 23 ++- .../sketch/gpu/GpuKernelComponentStream.cpp | 10 +- .../sketch/gpu/GpuKernelComponentStream.h | 5 +- .../sketch/gpu/GpuKernelSourceCode.h | 1 + src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp | 10 +- src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp | 48 +++--- src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h | 4 +- .../sketch/gpu/GpuWorkloadContext.cpp | 16 +- .../sketch/gpu/GpuWorkloadContextImpl.h | 6 +- .../sketch/gpu/GpuWorkloadSketch.cpp | 4 +- .../sketch/gpu/GpuWorkloadSketchImpl.h | 11 +- .../sketch/gpu/GpuWorkloadSourceCode.h | 56 ++++--- src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h | 1 + .../gpu/ckw_driver/GpuCkwComponentArgument.cpp | 7 +- .../gpu/ckw_driver/GpuCkwComponentArgument.h | 6 +- .../sketch/gpu/ckw_driver/GpuCkwDriver.cpp | 21 ++- .../sketch/gpu/ckw_driver/GpuCkwDriver.h | 4 +- .../sketch/gpu/ckw_driver/GpuCkwKernelWriter.cpp | 12 +- .../gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp | 1 + .../gpu/ckw_driver/GpuCkwScopedKernelWriter.h | 2 +- .../sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp | 20 ++- .../sketch/gpu/ckw_driver/GpuCkwVariableTable.h | 8 +- .../sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h | 10 +- .../gpu/ckw_driver/components/GpuCkwActivation.cpp | 34 ++-- .../gpu/ckw_driver/components/GpuCkwActivation.h | 10 +- .../gpu/ckw_driver/components/GpuCkwCast.cpp | 44 ++--- .../sketch/gpu/ckw_driver/components/GpuCkwCast.h | 10 +- .../ckw_driver/components/GpuCkwDirectConv2d.cpp | 49 +++--- .../components/GpuCkwElementwiseBinary.cpp | 84 ++++++---- .../components/GpuCkwElementwiseBinary.h | 14 +- .../gpu/ckw_driver/components/GpuCkwPool2d.cpp | 171 ++++++++++--------- .../gpu/ckw_driver/components/GpuCkwPool2d.h | 8 +- .../gpu/ckw_driver/components/GpuCkwResize.cpp | 76 +++++---- .../gpu/ckw_driver/components/GpuCkwStore.cpp | 10 +- .../sketch/gpu/ckw_driver/components/GpuCkwStore.h | 6 +- .../gpu/ckw_driver/components/utils/WriterHelper.h | 31 +++- .../components/utils/type_converter/Common.h | 35 ++-- .../utils/type_converter/ElementwiseBinary.h | 3 +- .../gpu/components/GpuKernelComponentFactory.h | 7 +- .../sketch/gpu/components/IGpuKernelComponent.h | 15 +- .../gpu/components/cl/ClComponentActivation.cpp | 12 +- .../gpu/components/cl/ClComponentActivation.h | 18 +- .../sketch/gpu/components/cl/ClComponentCast.cpp | 30 ++-- .../sketch/gpu/components/cl/ClComponentCast.h | 10 +- .../components/cl/ClComponentDepthwiseConv2d.cpp | 57 ++++--- .../gpu/components/cl/ClComponentDepthwiseConv2d.h | 34 ++-- .../gpu/components/cl/ClComponentDirectConv2d.cpp | 64 ++++---- .../gpu/components/cl/ClComponentDirectConv2d.h | 26 +-- .../components/cl/ClComponentElementwiseBinary.cpp | 65 ++++---- .../components/cl/ClComponentElementwiseBinary.h | 12 +- .../cl/ClComponentLogits1DMaxShiftExpSum.cpp | 14 +- .../cl/ClComponentLogits1DMaxShiftExpSum.h | 7 +- .../gpu/components/cl/ClComponentLogits1DNorm.cpp | 14 +- .../gpu/components/cl/ClComponentLogits1DNorm.h | 7 +- .../sketch/gpu/components/cl/ClComponentPool2d.cpp | 49 +++--- .../sketch/gpu/components/cl/ClComponentPool2d.h | 21 ++- .../gpu/components/cl/ClComponentReshape.cpp | 12 +- .../sketch/gpu/components/cl/ClComponentReshape.h | 5 +- .../sketch/gpu/components/cl/ClComponentResize.cpp | 14 +- .../sketch/gpu/components/cl/ClComponentResize.h | 13 +- .../sketch/gpu/components/cl/ClComponentStore.cpp | 20 +-- .../sketch/gpu/components/cl/ClComponentStore.h | 5 +- .../utils/type_printer/ElementwiseBinary.h | 22 ++- src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp | 19 +-- .../sketch/gpu/operators/GpuCast.cpp | 52 +++--- .../sketch/gpu/operators/GpuClamp.cpp | 44 ++--- .../sketch/gpu/operators/GpuConv2d.cpp | 65 ++++---- .../sketch/gpu/operators/GpuDepthwiseConv2d.cpp | 90 +++++----- src/dynamic_fusion/sketch/gpu/operators/GpuMul.cpp | 13 +- .../sketch/gpu/operators/GpuOutput.cpp | 19 +-- .../sketch/gpu/operators/GpuPool2d.cpp | 43 +++-- .../sketch/gpu/operators/GpuReshape.cpp | 27 ++- .../sketch/gpu/operators/GpuResize.cpp | 40 ++--- .../sketch/gpu/operators/GpuSigmoid.cpp | 31 ++-- .../sketch/gpu/operators/GpuSoftmax.cpp | 38 +++-- src/dynamic_fusion/sketch/gpu/operators/GpuSub.cpp | 19 +-- .../sketch/gpu/operators/GpuTanh.cpp | 35 ++-- .../internal/GpuElementwiseBinaryCommon.cpp | 19 ++- .../gpu/template_writer/GpuKernelVariableTable.cpp | 36 ++-- .../gpu/template_writer/GpuKernelVariableTable.h | 17 +- .../template_writer/IGpuTemplateComponentWriter.h | 8 +- .../template_writer/cl/ClTemplateActivation.cpp | 26 +-- .../gpu/template_writer/cl/ClTemplateActivation.h | 1 + .../gpu/template_writer/cl/ClTemplateCast.cpp | 30 ++-- .../cl/ClTemplateDepthwiseConv2d.cpp | 81 ++++----- .../template_writer/cl/ClTemplateDepthwiseConv2d.h | 1 + .../template_writer/cl/ClTemplateDirectConv2d.cpp | 112 ++++++------- .../template_writer/cl/ClTemplateDirectConv2d.h | 1 + .../cl/ClTemplateElementwiseBinary.cpp | 94 +++++------ .../cl/ClTemplateElementwiseBinary.h | 5 +- .../cl/ClTemplateLogits1DMaxShiftExpSum.cpp | 57 +++---- .../cl/ClTemplateLogits1DMaxShiftExpSum.h | 4 +- .../template_writer/cl/ClTemplateLogits1DNorm.cpp | 35 ++-- .../gpu/template_writer/cl/ClTemplatePool2d.cpp | 92 +++++------ .../gpu/template_writer/cl/ClTemplatePool2d.h | 1 + .../gpu/template_writer/cl/ClTemplateReshape.cpp | 28 ++-- .../gpu/template_writer/cl/ClTemplateReshape.h | 4 +- .../gpu/template_writer/cl/ClTemplateResize.cpp | 56 ++++--- .../gpu/template_writer/cl/ClTemplateStore.cpp | 16 +- .../gpu/template_writer/cl/ClTemplateStore.h | 1 + .../gpu/template_writer/cl/ClTemplateWriter.cpp | 59 ++++--- src/dynamic_fusion/sketch/utils/DependencyGraph.h | 182 ++++++++++----------- src/dynamic_fusion/utils/Utils.h | 16 +- 115 files changed, 1637 insertions(+), 1676 deletions(-) (limited to 'src/dynamic_fusion') diff --git a/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp index 15a5632d0b..9ca20fa152 100644 --- a/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp +++ b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp @@ -22,14 +22,15 @@ * SOFTWARE. */ #include "ClKernelRuntime.h" + #include "arm_compute/core/CL/ICLTensor.h" + #include "src/core/CL/CLUtils.h" #ifdef ACL_INTERNAL_TEST_CKW_IN_DF #include "src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h" #endif // ACL_INTERNAL_TEST_CKW_IN_DF #include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h" #include "src/gpu/cl/ClKernelLibrary.h" - #include "support/Cast.h" namespace arm_compute { @@ -43,13 +44,12 @@ void ClKernelRuntime::configure(const ClCompileContext &compile_ctx, const GpuKe { // Create kernel from kernel source string opencl::ClKernelLibrary &klib = opencl::ClKernelLibrary::get(); - _kernel = static_cast(compile_ctx.create_kernel(code.name(), - code.name(), // program name has to be provided to differentiate between different unfusable components' kernels. - // Each program contains exactly one kernel - code.code(), - klib.kernel_path() /* Kernel path: Used in cases of embedded kernels */, - code.build_options().options(), - false /* Is source binary */)); + _kernel = static_cast(compile_ctx.create_kernel( + code.name(), + code.name(), // program name has to be provided to differentiate between different unfusable components' kernels. + // Each program contains exactly one kernel + code.code(), klib.kernel_path() /* Kernel path: Used in cases of embedded kernels */, + code.build_options().options(), false /* Is source binary */)); // Configure execution window IClKernel::configure_internal(code.window()); @@ -63,11 +63,15 @@ void ClKernelRuntime::configure(const ClCompileContext &compile_ctx, const GpuKe #ifndef ACL_INTERNAL_TEST_CKW_IN_DF -inline void ClKernelRuntime::add_tensor_argument(unsigned int &idx, const GpuKernelArgumentInfo &arg, const ICLTensor *tensor, const Window &arg_slice, std::vector &cl_images) +inline void ClKernelRuntime::add_tensor_argument(unsigned int &idx, + const GpuKernelArgumentInfo &arg, + const ICLTensor *tensor, + const Window &arg_slice, + std::vector &cl_images) { ARM_COMPUTE_ERROR_ON_NULLPTR(tensor); - switch(arg.type) + switch (arg.type) { case GpuKernelArgumentInfo::Type::Scalar: { @@ -95,9 +99,13 @@ inline void ClKernelRuntime::add_tensor_argument(unsigned int &idx, const GpuKer } case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D: { - const TensorShape shape2d(tensor->info()->dimension(0) / 4, tensor->info()->dimension(1) * tensor->info()->dimension(2) * tensor->info()->dimension(3)); + const TensorShape shape2d(tensor->info()->dimension(0) / 4, tensor->info()->dimension(1) * + tensor->info()->dimension(2) * + tensor->info()->dimension(3)); const size_t image_row_pitch = tensor->info()->strides_in_bytes()[1]; - cl::Image2D tensor_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), shape2d, tensor->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); + cl::Image2D tensor_image2d = + create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), shape2d, + tensor->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); cl_images.push_back(tensor_image2d); _kernel.setArg(idx++, tensor_image2d); break; @@ -111,9 +119,13 @@ inline void ClKernelRuntime::add_tensor_argument(unsigned int &idx, const GpuKer } case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D: { - const TensorShape shape2d(tensor->info()->dimension(0) / 4, tensor->info()->dimension(1) * tensor->info()->dimension(2) * tensor->info()->dimension(3)); + const TensorShape shape2d(tensor->info()->dimension(0) / 4, tensor->info()->dimension(1) * + tensor->info()->dimension(2) * + tensor->info()->dimension(3)); const size_t image_row_pitch = tensor->info()->strides_in_bytes()[1]; - cl::Image2D tensor_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), shape2d, tensor->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); + cl::Image2D tensor_image2d = + create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), shape2d, + tensor->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); cl_images.push_back(tensor_image2d); _kernel.setArg(idx++, tensor_image2d); _kernel.setArg(idx++, static_cast(tensor->info()->strides_in_bytes()[2])); @@ -142,8 +154,9 @@ inline void ClKernelRuntime::add_tensor_argument(unsigned int &idx, const GpuKer const size_t image_h = tensor->info()->tensor_shape().total_size_upper(1); const size_t image_stride_y = tensor->info()->strides_in_bytes()[1]; - cl::Image2D tensor_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), - TensorShape(image_w, image_h), tensor->info()->data_type(), image_stride_y, CLImage2DType::ReadOnly); + cl::Image2D tensor_image2d = create_image2d_from_buffer( + CLKernelLibrary::get().context(), tensor->cl_buffer(), TensorShape(image_w, image_h), + tensor->info()->data_type(), image_stride_y, CLImage2DType::ReadOnly); cl_images.push_back(tensor_image2d); _kernel.setArg(idx++, tensor_image2d); @@ -170,13 +183,16 @@ inline void ClKernelRuntime::add_tensor_argument(unsigned int &idx, const GpuKer } #else // ACL_INTERNAL_TEST_CKW_IN_DF -inline void ClKernelRuntime::add_kernel_argument(unsigned int &idx, const GpuKernelArgumentBinding &arg, const ICLTensor *tensor, std::vector &cl_images) +inline void ClKernelRuntime::add_kernel_argument(unsigned int &idx, + const GpuKernelArgumentBinding &arg, + const ICLTensor *tensor, + std::vector &cl_images) { - switch(arg.type()) + switch (arg.type()) { case GpuKernelArgumentBinding::Type::TensorStorage: { - switch(arg.tensor_storage_type()) + switch (arg.tensor_storage_type()) { case TensorStorageType::ClBufferUint8Ptr: { @@ -238,7 +254,7 @@ void ClKernelRuntime::run_op(ITensorPack &tensors, const Window &window, cl::Com // CLImages created from tensor arguments. Need to be retained until enqueue std::vector cl_images; #ifndef ACL_INTERNAL_TEST_CKW_IN_DF - for(auto id_arg : _arguments) + for (auto id_arg : _arguments) { const auto arg = id_arg.second; auto tensor = utils::cast::polymorphic_downcast(tensors.get_tensor(id_arg.first)); @@ -248,7 +264,7 @@ void ClKernelRuntime::run_op(ITensorPack &tensors, const Window &window, cl::Com } #else // ACL_INTERNAL_TEST_CKW_IN_DF - for(const auto &arg : _arguments) + for (const auto &arg : _arguments) { auto tensor = utils::cast::polymorphic_downcast(tensors.get_tensor(arg.id())); ARM_COMPUTE_ERROR_ON_NULLPTR(tensor); @@ -259,8 +275,7 @@ void ClKernelRuntime::run_op(ITensorPack &tensors, const Window &window, cl::Com // Dispatch kernel enqueue(queue, *this, slice, lws_hint(), use_dummy_work_items); - } - while(skip_sliding_window && window.slide_window_slice_3D(slice)); + } while (skip_sliding_window && window.slide_window_slice_3D(slice)); } } // namespace dynamic_fusion diff --git a/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h index 92e73503ce..e78567eb9d 100644 --- a/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h +++ b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h @@ -68,7 +68,11 @@ private: * @param[in] arg_slice Window the kernel will be run on * @param[out] cl_images Extra cl images created from the tensor (will need to be retained until the kernel is enqueued) */ - inline void add_tensor_argument(unsigned int &idx, const GpuKernelArgumentInfo &arg, const ICLTensor *tensor, const Window &arg_slice, std::vector &cl_images); + inline void add_tensor_argument(unsigned int &idx, + const GpuKernelArgumentInfo &arg, + const ICLTensor *tensor, + const Window &arg_slice, + std::vector &cl_images); #else // ACL_INTERNAL_TEST_CKW_IN_DF /** Set a kernel argument as part of a tensor * @@ -77,7 +81,10 @@ private: * @param[in] tensor Tensor of which the kernel argument @p arg is a part of * @param[out] cl_images Extra cl images created from the tensor (will need to be retained until the kernel is enqueued) */ - inline void add_kernel_argument(unsigned int &idx, const GpuKernelArgumentBinding &arg, const ICLTensor *tensor, std::vector &cl_images); + inline void add_kernel_argument(unsigned int &idx, + const GpuKernelArgumentBinding &arg, + const ICLTensor *tensor, + std::vector &cl_images); #endif // ACL_INTERNAL_TEST_CKW_IN_DF private: diff --git a/src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp b/src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp index cd21b10180..ba39ff4c9d 100644 --- a/src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp +++ b/src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/experimental/Types.h" #include "arm_compute/runtime/CL/CLTensor.h" + #include "src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h" #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h" @@ -55,14 +56,14 @@ public: { DataView() = default; DataView(CLTensor *tensor, const TensorInfo &tensor_info, const AuxMemoryInfo &memory_info) - : tensor{ tensor }, tensor_info{ tensor_info }, memory_info{ memory_info } + : tensor{tensor}, tensor_info{tensor_info}, memory_info{memory_info} { } - ~DataView() = default; - DataView(const DataView &other) = default; + ~DataView() = default; + DataView(const DataView &other) = default; DataView &operator=(const DataView &other) = default; DataView(DataView &&other) = default; - DataView &operator=(DataView &&other) = default; + DataView &operator=(DataView &&other) = default; CLTensor *tensor{}; /**< Pointer to the auxiliary tensor */ TensorInfo tensor_info{}; /**< Associated tensor info */ AuxMemoryInfo memory_info{}; /**< Memory requirement */ @@ -92,7 +93,7 @@ private: { const auto t_id = tensor_info.id(); auto find_tensor_pair = _owned_tensors.find(t_id); - if(find_tensor_pair != _owned_tensors.end()) + if (find_tensor_pair != _owned_tensors.end()) { return find_tensor_pair->second.get(); } @@ -107,7 +108,7 @@ private: } std::map> _owned_tensors{}; - std::vector _tensors{}; + std::vector _tensors{}; }; /** Construct auxiliary tensors required by @ref GpuWorkloadSourceCode * @@ -120,12 +121,12 @@ private: */ Status create_aux_tensors(ClAuxTensors *aux_tensors, const GpuWorkloadSourceCode &code) { - for(auto t_id : code.tensors()) + for (auto t_id : code.tensors()) { // Get tensor object const auto workload_arg = code.query_tensor(t_id); ICLTensor *tensor_object = nullptr; - if(workload_arg->memory_descriptor()->memory_type == MemoryType::Auxiliary) + if (workload_arg->memory_descriptor()->memory_type == MemoryType::Auxiliary) { // Create aux tensor CLTensor object const TensorInfo tensor_info = *workload_arg->tensor_info(); @@ -133,7 +134,7 @@ Status create_aux_tensors(ClAuxTensors *aux_tensors, const GpuWorkloadSourceCode const auto aux_memory_info = workload_arg->memory_descriptor()->aux_memory_info; tensor_object = aux_tensors->add_aux_tensor(tensor_info, aux_memory_info); - if(tensor_object == nullptr) + if (tensor_object == nullptr) { return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Failed to construct an auxiliary tensor"); } @@ -156,7 +157,7 @@ public: ITensorPack *find_tensor_pack(UnitWorkloadId uwk_id) { auto tensor_pack = _tensor_packs.find(uwk_id); - if(tensor_pack != _tensor_packs.end()) + if (tensor_pack != _tensor_packs.end()) { return &(tensor_pack->second); } @@ -173,7 +174,10 @@ public: return _tensor_packs.at(uwk_id); } - friend Status create_tensor_lut(ClTensorLUT *tensor_lut, const GpuWorkloadSourceCode &code, const std::vector &user_tensors, const ClAuxTensors &aux_tensors); + friend Status create_tensor_lut(ClTensorLUT *tensor_lut, + const GpuWorkloadSourceCode &code, + const std::vector &user_tensors, + const ClAuxTensors &aux_tensors); private: /** Add a tensor pack and associate it with @ref UnitWorkloadId @p uwk_id @@ -197,19 +201,22 @@ private: * * @return Status */ -Status create_tensor_lut(ClTensorLUT *tensor_lut, const GpuWorkloadSourceCode &code, const std::vector &user_tensors, const ClAuxTensors &aux_tensors) +Status create_tensor_lut(ClTensorLUT *tensor_lut, + const GpuWorkloadSourceCode &code, + const std::vector &user_tensors, + const ClAuxTensors &aux_tensors) { // Combine user tensors and aux tensors std::map tensor_map; - for(auto tensor : user_tensors) + for (auto tensor : user_tensors) { const auto t_id = tensor->info()->id(); - if(tensor_map.find(t_id) != tensor_map.end()) + if (tensor_map.find(t_id) != tensor_map.end()) { // In case of elementwise in-place: give another Id to the In/Out tensor when passed again std::vector ids; - for(auto &t : tensor_map) + for (auto &t : tensor_map) { ids.push_back(t.first); } @@ -221,11 +228,11 @@ Status create_tensor_lut(ClTensorLUT *tensor_lut, const GpuWorkloadSourceCode &c tensor_map[t_id] = tensor; } } - for(const auto &data : aux_tensors.get_tensors()) + for (const auto &data : aux_tensors.get_tensors()) { const auto t_id = data.tensor_info.id(); const auto tensor = data.tensor; - if(tensor_map.find(t_id) != tensor_map.end()) + if (tensor_map.find(t_id) != tensor_map.end()) { return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Clashing tensor ids"); } @@ -233,25 +240,25 @@ Status create_tensor_lut(ClTensorLUT *tensor_lut, const GpuWorkloadSourceCode &c } // Add tensor objects into corresponding tensor packs - for(auto id_tensor : tensor_map) + for (auto id_tensor : tensor_map) { const auto t_id = id_tensor.first; const auto tensor_object = id_tensor.second; - if(tensor_object == nullptr) + if (tensor_object == nullptr) { return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Trying to add a nullptr into the tensor packs"); } - if(tensor_object->allocator()->info().total_size() == 0U) + if (tensor_object->allocator()->info().total_size() == 0U) { return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "No allocated memory found in tensor"); } - for(auto uwk_id : code.get_unit_workloads_from_tensor(t_id)) + for (auto uwk_id : code.get_unit_workloads_from_tensor(t_id)) { ITensorPack *tensor_pack = tensor_lut->find_tensor_pack(uwk_id); - if(tensor_pack == nullptr) + if (tensor_pack == nullptr) { - tensor_lut->add_tensor_pack(uwk_id, ITensorPack{ { t_id, tensor_object } }); + tensor_lut->add_tensor_pack(uwk_id, ITensorPack{{t_id, tensor_object}}); } else { @@ -269,15 +276,14 @@ struct ClWorkloadRuntime::Implementation { std::map> _kernels{}; std::map> _kernels_prep{}; - bool _is_configured{ false }; - bool _is_prepared{ false }; - ClTensorLUT _tensor_lut{}; - ClAuxTensors _aux_tensors{}; - GpuWorkloadSourceCode _source_code{}; + bool _is_configured{false}; + bool _is_prepared{false}; + ClTensorLUT _tensor_lut{}; + ClAuxTensors _aux_tensors{}; + GpuWorkloadSourceCode _source_code{}; }; -ClWorkloadRuntime::ClWorkloadRuntime() - : _impl{ std::make_unique() } +ClWorkloadRuntime::ClWorkloadRuntime() : _impl{std::make_unique()} { } @@ -286,18 +292,19 @@ ClWorkloadRuntime::~ClWorkloadRuntime() = default; Status ClWorkloadRuntime::configure(const GpuWorkloadSketch &sketch) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(_impl->_is_configured, "ClWorkloadRuntime cannot be re-configured"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(sketch.gpu_context()->gpu_language() != GpuLanguage::OpenCL, "ClWorkloadRuntime cannot be configured with non-OpenCL workload sketch"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(sketch.gpu_context()->gpu_language() != GpuLanguage::OpenCL, + "ClWorkloadRuntime cannot be configured with non-OpenCL workload sketch"); // Generate source code _impl->_source_code = sketch.implementation().generate_source_code(); // Configure unit workload from source code - for(auto uwk_id : _impl->_source_code.unit_workloads()) + for (auto uwk_id : _impl->_source_code.unit_workloads()) { const auto work = _impl->_source_code.query_unit_workload(uwk_id); const auto stage = work.stage().stage; auto k = std::make_unique(); k->configure(*sketch.gpu_context()->cl_compile_context(), work.code()); - switch(stage) + switch (stage) { case UnitWorkloadStage::Stage::Run: { @@ -323,9 +330,9 @@ Status ClWorkloadRuntime::configure(const GpuWorkloadSketch &sketch) void ClWorkloadRuntime::prepare() { - if(!_impl->_is_prepared) + if (!_impl->_is_prepared) { - for(auto &id_kernel_pair : _impl->_kernels_prep) + for (auto &id_kernel_pair : _impl->_kernels_prep) { const bool flush_queue = false; const auto uwk_id = id_kernel_pair.first; @@ -344,7 +351,7 @@ Status ClWorkloadRuntime::run(const std::vector &tensors) const auto st = create_tensor_lut(&_impl->_tensor_lut, _impl->_source_code, tensors, _impl->_aux_tensors); ARM_COMPUTE_RETURN_ON_ERROR(st); prepare(); - for(auto &id_kernel_pair : _impl->_kernels) + for (auto &id_kernel_pair : _impl->_kernels) { // Flush the command queue on the last kernel const bool flush_queue = false; @@ -358,7 +365,7 @@ Status ClWorkloadRuntime::run(const std::vector &tensors) std::vector> ClWorkloadRuntime::get_auxiliary_tensors() { std::vector> aux_tensors; - for(const auto &data : _impl->_aux_tensors.get_tensors()) + for (const auto &data : _impl->_aux_tensors.get_tensors()) { aux_tensors.emplace_back(data.tensor, data.tensor_info, data.memory_info); } diff --git a/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.cpp b/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.cpp index 84fb279237..7044b0ea66 100644 --- a/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.cpp +++ b/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.cpp @@ -30,14 +30,17 @@ namespace experimental { namespace dynamic_fusion { -void cl_add_tensor_component_argument(cl::Kernel &kernel, unsigned int &idx, const ICLTensor *tensor, TensorComponentType component) +void cl_add_tensor_component_argument(cl::Kernel &kernel, + unsigned int &idx, + const ICLTensor *tensor, + TensorComponentType component) { ARM_COMPUTE_ERROR_ON(tensor == nullptr); const auto *info = tensor->info(); const auto &strides = info->strides_in_bytes(); - switch(component) + switch (component) { case TensorComponentType::OffsetFirstElement: kernel.setArg(idx++, info->offset_first_element_in_bytes()); diff --git a/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h b/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h index 4cbb157a48..306d547acb 100644 --- a/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h +++ b/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h @@ -42,7 +42,10 @@ namespace dynamic_fusion * @param[in] tensor Tensor from which to access the tensor component. * @param[in] component Tensor component to select such as tensor dimensions, strides, etc. */ -void cl_add_tensor_component_argument(cl::Kernel &kernel, unsigned int &idx, const ICLTensor *tensor, TensorComponentType component); +void cl_add_tensor_component_argument(cl::Kernel &kernel, + unsigned int &idx, + const ICLTensor *tensor, + TensorComponentType component); /** Add an OpenCL buffer object to the kernel's arguments at the specified index @p idx. * diff --git a/src/dynamic_fusion/sketch/ArgumentPack.h b/src/dynamic_fusion/sketch/ArgumentPack.h index f118d7d851..3bf380b1ec 100644 --- a/src/dynamic_fusion/sketch/ArgumentPack.h +++ b/src/dynamic_fusion/sketch/ArgumentPack.h @@ -25,6 +25,7 @@ #define SRC_DYNAMIC_FUSION_SKETCH_ARGUMENTPACK #include "arm_compute/core/experimental/Types.h" + #include #include @@ -52,26 +53,21 @@ public: */ struct PackElement { - PackElement() = default; - PackElement(const PackElement &elem) = default; + PackElement() = default; + PackElement(const PackElement &elem) = default; PackElement &operator=(const PackElement &elem) = default; PackElement(PackElement &&elem) = default; - PackElement &operator=(PackElement &&elem) = default; - PackElement(Id id, T *tensor) - : id(id), tensor(tensor), ctensor(nullptr) + PackElement &operator=(PackElement &&elem) = default; + PackElement(Id id, T *tensor) : id(id), tensor(tensor), ctensor(nullptr) { } - PackElement(Id id, const T *ctensor) - : id(id), tensor(nullptr), ctensor(ctensor) + PackElement(Id id, const T *ctensor) : id(id), tensor(nullptr), ctensor(ctensor) { } - Id id{ ACL_UNKNOWN }; /**< Argument id within the pack */ - T *tensor{ nullptr }; /**< Non-const pointer to tensor-related object */ - const T *ctensor - { - nullptr - }; /**< Const pointer to tensor-related object */ + Id id{ACL_UNKNOWN}; /**< Argument id within the pack */ + T *tensor{nullptr}; /**< Non-const pointer to tensor-related object */ + const T *ctensor{nullptr}; /**< Const pointer to tensor-related object */ }; public: @@ -88,10 +84,9 @@ public: /** Allow instances of this class to be moved */ ArgumentPack &operator=(ArgumentPack &&other) = default; /** Initializer list Constructor */ - ArgumentPack(const std::initializer_list &l) - : _pack{} + ArgumentPack(const std::initializer_list &l) : _pack{} { - for(const auto &e : l) + for (const auto &e : l) { _pack[e.id] = e; } @@ -134,7 +129,7 @@ public: const T *get_const_tensor(Id id) const { auto it = _pack.find(id); - if(it != _pack.end()) + if (it != _pack.end()) { return it->second.ctensor != nullptr ? it->second.ctensor : it->second.tensor; } @@ -171,10 +166,10 @@ public: std::vector get_src_tensors() { std::vector src_tensors{}; - for(int id = static_cast(TensorType::ACL_SRC); id <= static_cast(TensorType::ACL_SRC_END); ++id) + for (int id = static_cast(TensorType::ACL_SRC); id <= static_cast(TensorType::ACL_SRC_END); ++id) { auto tensor = get_tensor(static_cast(id)); - if(tensor != nullptr) + if (tensor != nullptr) { src_tensors.push_back(tensor); } @@ -188,10 +183,10 @@ public: std::vector get_const_src_tensors() const { std::vector src_tensors{}; - for(int id = static_cast(TensorType::ACL_SRC); id <= static_cast(TensorType::ACL_SRC_END); ++id) + for (int id = static_cast(TensorType::ACL_SRC); id <= static_cast(TensorType::ACL_SRC_END); ++id) { auto tensor = get_const_tensor(static_cast(id)); - if(tensor != nullptr) + if (tensor != nullptr) { src_tensors.push_back(tensor); } @@ -205,10 +200,10 @@ public: std::vector get_dst_tensors() { std::vector dst_tensors{}; - for(int id = static_cast(TensorType::ACL_DST); id <= static_cast(TensorType::ACL_DST_END); ++id) + for (int id = static_cast(TensorType::ACL_DST); id <= static_cast(TensorType::ACL_DST_END); ++id) { auto tensor = get_tensor(static_cast(id)); - if(tensor != nullptr) + if (tensor != nullptr) { dst_tensors.push_back(tensor); } @@ -222,10 +217,10 @@ public: std::vector get_const_dst_tensors() const { std::vector dst_tensors{}; - for(int id = static_cast(TensorType::ACL_DST); id <= static_cast(TensorType::ACL_DST_END); ++id) + for (int id = static_cast(TensorType::ACL_DST); id <= static_cast(TensorType::ACL_DST_END); ++id) { auto tensor = get_const_tensor(static_cast(id)); - if(tensor != nullptr) + if (tensor != nullptr) { dst_tensors.push_back(tensor); } diff --git a/src/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.cpp b/src/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.cpp index 3a5657e07b..6f3816568c 100644 --- a/src/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.cpp +++ b/src/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.cpp @@ -69,7 +69,8 @@ uint32_t DepthwiseConv2dAttributes::depth_multiplier() const return _depth_multiplier; } -DepthwiseConv2dAttributes &DepthwiseConv2dAttributes::dimension_rounding_type(const DimensionRoundingType &dimension_rounding_type) +DepthwiseConv2dAttributes & +DepthwiseConv2dAttributes::dimension_rounding_type(const DimensionRoundingType &dimension_rounding_type) { _dimension_rounding_type = dimension_rounding_type; return *this; diff --git a/src/dynamic_fusion/sketch/attributes/Pool2dAttributes.cpp b/src/dynamic_fusion/sketch/attributes/Pool2dAttributes.cpp index c28791f5fe..80f65f926a 100644 --- a/src/dynamic_fusion/sketch/attributes/Pool2dAttributes.cpp +++ b/src/dynamic_fusion/sketch/attributes/Pool2dAttributes.cpp @@ -23,6 +23,7 @@ */ #include "arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h" + #include "arm_compute/core/Size2D.h" namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h b/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h index 226e1a2df3..03817173f4 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h +++ b/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h @@ -61,11 +61,10 @@ struct GpuKernelArgumentInfo /** Default constructor */ GpuKernelArgumentInfo() = default; /** Constructor */ - GpuKernelArgumentInfo(Type type) - : type{ type } + GpuKernelArgumentInfo(Type type) : type{type} { } - Type type{ Type::Tensor_4D_t_Buffer }; + Type type{Type::Tensor_4D_t_Buffer}; }; bool operator==(const GpuKernelArgumentInfo &info0, const GpuKernelArgumentInfo &info1); /** Kernel argument information linked with its corresponding @ref ITensorInfo @@ -79,10 +78,8 @@ public: * @param[in] tensor_info Associated @ref ITensorInfo * @param[in] kernel_arg_info Associated @ref GpuKernelArgumentInfo */ - GpuKernelArgument(const ITensorInfo &tensor_info, - const GpuKernelArgumentInfo &kernel_arg_info) - : _tensor_info{ tensor_info }, - _kernel_arg_info{ kernel_arg_info } + GpuKernelArgument(const ITensorInfo &tensor_info, const GpuKernelArgumentInfo &kernel_arg_info) + : _tensor_info{tensor_info}, _kernel_arg_info{kernel_arg_info} { } /** Get workload tensor id */ @@ -200,12 +197,12 @@ public: TensorComponent /** @ref TensorComponentType */ }; GpuKernelArgumentBinding(ITensorInfo::Id id, TensorStorageType storage) - : _type{ Type::TensorStorage }, _id{ id }, _value{} + : _type{Type::TensorStorage}, _id{id}, _value{} { _value.tensor_storage_type = storage; } GpuKernelArgumentBinding(ITensorInfo::Id id, TensorComponentType component) - : _type{ Type::TensorComponent }, _id{ id }, _value{} + : _type{Type::TensorComponent}, _id{id}, _value{} { _value.tensor_component_type = component; } diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp index 5a65ede38b..1a458c9862 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp +++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp @@ -31,35 +31,31 @@ namespace experimental { namespace dynamic_fusion { -std::vector GpuKernelComponentGraph::get_tensor_ids(const std::vector tensors) +std::vector +GpuKernelComponentGraph::get_tensor_ids(const std::vector tensors) { std::vector tensor_ids{}; - std::transform( - std::begin(tensors), std::end(tensors), - std::back_inserter(tensor_ids), - [](const auto & t) - { - return t->id(); - }); + std::transform(std::begin(tensors), std::end(tensors), std::back_inserter(tensor_ids), + [](const auto &t) { return t->id(); }); return tensor_ids; } GpuKernelComponentGraph::GpuKernelComponentGraph(GpuWorkloadContext *context, GpuComponentServices *services) - : _context{ context }, _services{ services }, _components{}, _tensors{}, _dependency_graph{} + : _context{context}, _services{services}, _components{}, _tensors{}, _dependency_graph{} { } GpuKernelComponentStream GpuKernelComponentGraph::fuse(const MemoryDescriptorMap &mem_map) const { - GpuKernelComponentStream stream{ _context, _services, mem_map }; + GpuKernelComponentStream stream{_context, _services, mem_map}; const auto op_seq = _dependency_graph.build_operators_sequence(); stream.new_component_group(); - for(auto op : op_seq) + for (auto op : op_seq) { const auto component = _components.at(op.op).get(); const auto success = stream.add_component(component); - if(!success) // Assume first failure was because the root component is unfusable + if (!success) // Assume first failure was because the root component is unfusable { stream.new_component_group(); const auto success = stream.add_component(component); diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h index 85c9b45840..6f871a3c90 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h +++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h @@ -70,21 +70,21 @@ public: * @param[in] args Component arguments except for component id, which is auto-allocated */ template - void add_new_component(Args &&... args) + void add_new_component(Args &&...args) { - auto comp = _services->component_factory().create(std::forward(args)...); - ArgumentPack tensors = comp->tensors(); + auto comp = _services->component_factory().create(std::forward(args)...); + ArgumentPack tensors = comp->tensors(); const auto src_tensor_ids = get_tensor_ids(tensors.get_const_src_tensors()); const auto dst_tensor_ids = get_tensor_ids(tensors.get_const_dst_tensors()); - bool success = _dependency_graph.add_operator(comp->id(), src_tensor_ids, dst_tensor_ids); + bool success = _dependency_graph.add_operator(comp->id(), src_tensor_ids, dst_tensor_ids); ARM_COMPUTE_UNUSED(success); ARM_COMPUTE_ERROR_ON(!success); _components[comp->id()] = std::move(comp); - for(auto t : tensors.get_const_src_tensors()) + for (auto t : tensors.get_const_src_tensors()) { _tensors[t->id()] = t; } - for(auto t : tensors.get_const_dst_tensors()) + for (auto t : tensors.get_const_dst_tensors()) { _tensors[t->id()] = t; } @@ -99,11 +99,11 @@ public: private: static std::vector get_tensor_ids(const std::vector tensors); - GpuWorkloadContext *_context; - GpuComponentServices *_services; + GpuWorkloadContext *_context; + GpuComponentServices *_services; std::map> _components; std::map _tensors; - DependencyGraph _dependency_graph{}; + DependencyGraph _dependency_graph{}; }; } // namespace dynamic_fusion } // namespace experimental diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp index 81c3f0c800..5a6d125d96 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp +++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/ITensorInfo.h" #include "arm_compute/core/Validate.h" + #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" #include @@ -37,86 +38,87 @@ namespace dynamic_fusion { bool GpuKernelComponentGroup::add_component(ComponentPtr component) { - ARM_COMPUTE_ERROR_ON_MSG( - _finalized, "The component group has been finalized and cannot be altered."); + ARM_COMPUTE_ERROR_ON_MSG(_finalized, "The component group has been finalized and cannot be altered."); // note: Constraint 1 is guaranteed as a precondition // Constraint 2 - if(component->type() != GpuComponentType::Output && _components.size() >= max_fused_components) + if (component->type() != GpuComponentType::Output && _components.size() >= max_fused_components) { return false; } // Constraint 3.1: Pattern: (Unfusable + Output) - if(!_components.empty() && get_root_component()->type() == GpuComponentType::Unfusable && component->type() != GpuComponentType::Output) + if (!_components.empty() && get_root_component()->type() == GpuComponentType::Unfusable && + component->type() != GpuComponentType::Output) { return false; } // Constraint 3.2 - if(!_components.empty() && (component->type() != GpuComponentType::Simple && component->type() != GpuComponentType::Output)) + if (!_components.empty() && + (component->type() != GpuComponentType::Simple && component->type() != GpuComponentType::Output)) { return false; } // Constraint 4 - if(component->type() != GpuComponentType::Unfusable && component->tensors().get_const_dst_tensors().size() != 1U) + if (component->type() != GpuComponentType::Unfusable && component->tensors().get_const_dst_tensors().size() != 1U) { return false; } // Constraint 5 - if(!_components.empty() && !(get_root_component()->properties() == component->properties())) + if (!_components.empty() && !(get_root_component()->properties() == component->properties())) { return false; } // Constraint 7 - if(!_components.empty()) + if (!_components.empty()) { const auto root_dst_tensors = get_root_component()->tensors().get_const_dst_tensors(); ARM_COMPUTE_ERROR_ON(root_dst_tensors.empty()); const auto first_dst_tensor = root_dst_tensors[0]; const auto dst_tensors = component->tensors().get_const_dst_tensors(); - for(const auto &t : root_dst_tensors) + for (const auto &t : root_dst_tensors) { - if(detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0)) + if (detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0)) { return false; } } - for(const auto &t : dst_tensors) + for (const auto &t : dst_tensors) { - if(detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0)) + if (detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0)) { return false; } } } // Constraint 8 - if(!_components.empty()) + if (!_components.empty()) { const auto root_dst_tensors = get_root_component()->tensors().get_const_dst_tensors(); ARM_COMPUTE_ERROR_ON(root_dst_tensors.empty()); const auto first_dst_tensor_layout = root_dst_tensors[0]->data_layout(); const auto dst_tensors = component->tensors().get_const_dst_tensors(); - for(const auto &t : root_dst_tensors) + for (const auto &t : root_dst_tensors) { - if(t->data_layout() != first_dst_tensor_layout) + if (t->data_layout() != first_dst_tensor_layout) { return false; } } - for(const auto &t : dst_tensors) + for (const auto &t : dst_tensors) { - if(t->data_layout() != first_dst_tensor_layout) + if (t->data_layout() != first_dst_tensor_layout) { return false; } } } // Constraint 9 - if(component->tensors().get_const_dst_tensors().size() >= max_dst_tensors) + if (component->tensors().get_const_dst_tensors().size() >= max_dst_tensors) { return false; } // Constraint 9 corollary - if(component->type() == GpuComponentType::Output && _components.size() >= max_fused_components + max_dst_tensors) + if (component->type() == GpuComponentType::Output && _components.size() >= max_fused_components + max_dst_tensors) { return false; } @@ -126,36 +128,36 @@ bool GpuKernelComponentGroup::add_component(ComponentPtr component) void GpuKernelComponentGroup::finalize() { - if(_finalized) + if (_finalized) { return; } _finalized = true; - std::set output_tensors; + std::set output_tensors; std::map> possible_tile_map; - std::map tile_usages; + std::map tile_usages; - for(auto component : _components) + for (auto component : _components) { - const auto tensors = component->tensors(); + const auto tensors = component->tensors(); const auto src_tensors = tensors.get_const_src_tensors(); const auto dst_tensors = tensors.get_const_dst_tensors(); // Detect input, output and intermediate tensors. - for(auto tensor : src_tensors) + for (auto tensor : src_tensors) { const auto output_tensors_it = output_tensors.find(tensor); - if(output_tensors_it != output_tensors.end()) + if (output_tensors_it != output_tensors.end()) { // This tensor is the output of another operator. // It must be marked as intermediate tensor. output_tensors.erase(output_tensors_it); _interm_tensors.insert(tensor); } - else if(_interm_tensors.find(tensor) == _interm_tensors.end()) + else if (_interm_tensors.find(tensor) == _interm_tensors.end()) { _input_tensors.insert(tensor); @@ -164,7 +166,7 @@ void GpuKernelComponentGroup::finalize() } } - for(auto tensor : dst_tensors) + for (auto tensor : dst_tensors) { ARM_COMPUTE_ERROR_ON(_input_tensors.find(tensor) != _input_tensors.end()); ARM_COMPUTE_ERROR_ON(output_tensors.find(tensor) != output_tensors.end()); @@ -177,27 +179,27 @@ void GpuKernelComponentGroup::finalize() // Check if the output can overwrite the input tile. const auto component_type = component->type(); - if(component_type == GpuComponentType::Simple || component_type == GpuComponentType::Output) + if (component_type == GpuComponentType::Simple || component_type == GpuComponentType::Output) { ARM_COMPUTE_ERROR_ON(dst_tensors.size() != 1); - const auto dst_tensor = dst_tensors[0]; - const auto &dst_shape = dst_tensor->tensor_shape(); - const auto &dst_type = dst_tensor->data_type(); + const auto dst_tensor = dst_tensors[0]; + const auto &dst_shape = dst_tensor->tensor_shape(); + const auto &dst_type = dst_tensor->data_type(); tile_usages[dst_tensor] = 0; - for(auto src_tensor : src_tensors) + for (auto src_tensor : src_tensors) { const auto &src_shape = src_tensor->tensor_shape(); - const auto &src_type = src_tensor->data_type(); + const auto &src_type = src_tensor->data_type(); - if(src_shape == dst_shape && src_type == dst_type) + if (src_shape == dst_shape && src_type == dst_type) { const auto tile_usages_it = tile_usages.find(src_tensor); ARM_COMPUTE_ERROR_ON(tile_usages_it == tile_usages.end()); - if(component_type == GpuComponentType::Simple || tile_usages_it->second > 0) + if (component_type == GpuComponentType::Simple || tile_usages_it->second > 0) { // Increase the number of tile usages unless this component is an output // and the tile has not been shared with any component. @@ -212,7 +214,7 @@ void GpuKernelComponentGroup::finalize() else { // Outputs of complex and unfusable components need dedicated tile. - for(auto tensor : dst_tensors) + for (auto tensor : dst_tensors) { tile_usages[tensor] = 0; } @@ -220,25 +222,25 @@ void GpuKernelComponentGroup::finalize() } // Find the smallest list of tiles that the intermediate tensors need to write to. - for(auto tensor : _input_tensors) + for (auto tensor : _input_tensors) { _tile_map[tensor] = tensor; } - for(auto component : _components) + for (auto component : _components) { const auto dst_tensors = component->tensors().get_const_dst_tensors(); - for(auto tensor : dst_tensors) + for (auto tensor : dst_tensors) { const auto target_tiles = possible_tile_map.at(tensor); - _tile_map[tensor] = tensor; + _tile_map[tensor] = tensor; - for(auto target : target_tiles) + for (auto target : target_tiles) { const auto num_usage = tile_usages[target]; - if(num_usage <= 1) + if (num_usage <= 1) { // The target tile is consumed by only this operator, so we can reuse it // for the destination tensor data. @@ -249,26 +251,23 @@ void GpuKernelComponentGroup::finalize() } } - for(auto tensor : output_tensors) + for (auto tensor : output_tensors) { _tile_map[tensor] = tensor; } // All intermediate tensors that cannot be shared with any previous tensor // will need to be declared as tile variable. - for(auto tensor_tile : _tile_map) + for (auto tensor_tile : _tile_map) { - if(tensor_tile.first == tensor_tile.second && - _interm_tensors.find(tensor_tile.first) != _interm_tensors.end()) + if (tensor_tile.first == tensor_tile.second && _interm_tensors.find(tensor_tile.first) != _interm_tensors.end()) { _tiles.push_back(tensor_tile.first); } } - std::set_union( - _input_tensors.begin(), _input_tensors.end(), - output_tensors.begin(), output_tensors.end(), - std::back_inserter(_argument_tensors)); + std::set_union(_input_tensors.begin(), _input_tensors.end(), output_tensors.begin(), output_tensors.end(), + std::back_inserter(_argument_tensors)); _any_output_tensor = *output_tensors.begin(); } @@ -282,7 +281,7 @@ const ITensorInfo *GpuKernelComponentGroup::get_tile_for_tensor(const ITensorInf { ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized."); - if(_tile_map.find(tensor) != _tile_map.end()) + if (_tile_map.find(tensor) != _tile_map.end()) { return _tile_map.at(tensor); } @@ -304,7 +303,7 @@ std::vector GpuKernelComponentGroup::get_argument_tensors() GpuKernelComponentGroup::ComponentPtr GpuKernelComponentGroup::get_root_component() const { - if(empty()) + if (empty()) { return nullptr; } diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h index c939aec369..6ad71abb39 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h +++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h @@ -25,12 +25,11 @@ #define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGROUP #include "components/Types.h" - #include #include -#include -#include #include +#include +#include namespace arm_compute { @@ -129,9 +128,9 @@ public: /** Get the number of components within the group */ size_t size() const; /** Check if the component group is empty */ - bool empty() const; - ComponentPtr &operator[](size_t index); - const ComponentPtr &operator[](size_t index) const; + bool empty() const; + ComponentPtr &operator[](size_t index); + const ComponentPtr &operator[](size_t index) const; typename std::vector::iterator begin(); typename std::vector::iterator end(); typename std::vector::const_iterator begin() const; @@ -142,13 +141,13 @@ public: private: std::vector _components{}; - bool _finalized{ false }; + bool _finalized{false}; - std::vector _argument_tensors{}; - std::set _input_tensors{}; - std::set _interm_tensors{}; - const ITensorInfo *_any_output_tensor{ nullptr }; - std::vector _tiles{}; + std::vector _argument_tensors{}; + std::set _input_tensors{}; + std::set _interm_tensors{}; + const ITensorInfo *_any_output_tensor{nullptr}; + std::vector _tiles{}; std::map _tile_map{}; }; } // namespace dynamic_fusion diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp index a2b6623370..8042e3dd08 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp +++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp @@ -23,9 +23,9 @@ */ #include "GpuKernelComponentStream.h" +#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" #include "src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.h" #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h" -#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" namespace arm_compute { @@ -33,8 +33,10 @@ namespace experimental { namespace dynamic_fusion { -GpuKernelComponentStream::GpuKernelComponentStream(GpuWorkloadContext *context, GpuComponentServices *services, const MemoryDescriptorMap &mem_map) - : _context{ context }, _services{ services }, _component_groups{}, _mem_map{ mem_map } +GpuKernelComponentStream::GpuKernelComponentStream(GpuWorkloadContext *context, + GpuComponentServices *services, + const MemoryDescriptorMap &mem_map) + : _context{context}, _services{services}, _component_groups{}, _mem_map{mem_map} { } @@ -42,7 +44,7 @@ GpuWorkloadSourceCode GpuKernelComponentStream::write_workload_code() { GpuWorkloadSourceCode source_code; // Traverse through component groups and assemble workload together - for(auto && group : _component_groups) + for (auto &&group : _component_groups) { group.finalize(); diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h index ba2503a938..ef8a8a15b0 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h +++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h @@ -25,6 +25,7 @@ #define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTSTREAM #include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h" + #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h" @@ -53,7 +54,9 @@ public: * @param[in] services @ref GpuComponentServices to be used throughout the stream * @param[in] mem_map @ref MemoryDescriptor map used to assemble the @ref GpuWorkloadSourceCode */ - GpuKernelComponentStream(GpuWorkloadContext *context, GpuComponentServices *services, const MemoryDescriptorMap &mem_map); + GpuKernelComponentStream(GpuWorkloadContext *context, + GpuComponentServices *services, + const MemoryDescriptorMap &mem_map); /** Allow instances of this class to be copy constructed */ GpuKernelComponentStream(const GpuKernelComponentStream &stream) = default; /** Allow instances of this class to be copied */ diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h b/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h index 64e1cdc3bc..24812cd8a7 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h +++ b/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h @@ -26,6 +26,7 @@ #include "arm_compute/core/CL/CLCompileContext.h" #include "arm_compute/core/Window.h" + #include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" #ifndef ACL_INTERNAL_TEST_CKW_IN_DF diff --git a/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp b/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp index c99984fc0e..502ceab807 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp +++ b/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp @@ -26,9 +26,9 @@ #include "arm_compute/core/experimental/Types.h" #include "src/dynamic_fusion/sketch/ArgumentPack.h" -#include "src/dynamic_fusion/sketch/gpu/GpuComponentServices.h" -#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h" +#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" +#include "src/dynamic_fusion/sketch/gpu/GpuComponentServices.h" #ifndef ACL_INTERNAL_TEST_CKW_IN_DF #include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.h" #else // ACL_INTERNAL_TEST_CKW_IN_DF @@ -42,7 +42,7 @@ namespace experimental namespace dynamic_fusion { GpuLogicalKernel::GpuLogicalKernel(GpuComponentServices *services, const GpuKernelComponentGroup &components) - : _comp_group{ components }, _store_components{} + : _comp_group{components}, _store_components{} { ARM_COMPUTE_UNUSED(services); } @@ -51,9 +51,9 @@ GpuKernelSourceCode GpuLogicalKernel::write_kernel_code() { GpuKernelSourceCode code; #ifndef ACL_INTERNAL_TEST_CKW_IN_DF - ClTemplateWriter writer { _comp_group }; + ClTemplateWriter writer{_comp_group}; #else // ACL_INTERNAL_TEST_CKW_IN_DF - GpuCkwDriver writer { _comp_group }; + GpuCkwDriver writer{_comp_group}; #endif // ACL_INTERNAL_TEST_CKW_IN_DF code.name(writer.get_name()); diff --git a/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp index 7bb14c8698..aec8b9db4f 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp +++ b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp @@ -36,20 +36,15 @@ namespace std::vector get_tensor_ids(const std::vector tensors) { std::vector tensor_ids{}; - std::transform( - std::begin(tensors), std::end(tensors), - std::back_inserter(tensor_ids), - [](const auto & t) - { - return t->id(); - }); + std::transform(std::begin(tensors), std::end(tensors), std::back_inserter(tensor_ids), + [](const auto &t) { return t->id(); }); return tensor_ids; } } // namespace Operator::Operator(OperatorId id, GpuOperatorType operator_type, const ArgumentPack &tensors) - : _id{ id }, _operator_type{ operator_type }, _tensors{ tensors } + : _id{id}, _operator_type{operator_type}, _tensors{tensors} { } @@ -73,69 +68,69 @@ bool GpuOperatorGroup::try_add_operator(const Operator &op, bool is_output) cons const auto src_tensor_ids = get_tensor_ids(op.tensors().get_const_src_tensors()); const auto dst_tensor_ids = get_tensor_ids(op.tensors().get_const_dst_tensors()); // Constraint 1 - if(!_graph.try_add_operator_as_linear(op.id(), src_tensor_ids, dst_tensor_ids, is_output)) + if (!_graph.try_add_operator_as_linear(op.id(), src_tensor_ids, dst_tensor_ids, is_output)) { return false; } // Constraint 2 - if(_operators.size() >= max_fused_operators) + if (_operators.size() >= max_fused_operators) { return false; } // Constraint 3.1: Pattern: (Unfusable) - if(_operators.size() > 0 && get_root_operator()->operator_type() == GpuOperatorType::Unfusable) + if (_operators.size() > 0 && get_root_operator()->operator_type() == GpuOperatorType::Unfusable) { return false; } // Constraint 3.2 - if(_operators.size() > 0 && (op.operator_type() != GpuOperatorType::Simple)) + if (_operators.size() > 0 && (op.operator_type() != GpuOperatorType::Simple)) { return false; } // Constraint 4 - if(op.operator_type() != GpuOperatorType::Unfusable && op.tensors().get_const_dst_tensors().size() != 1U) + if (op.operator_type() != GpuOperatorType::Unfusable && op.tensors().get_const_dst_tensors().size() != 1U) { return false; } // Constraint 5 - if(_operators.size() > 0) + if (_operators.size() > 0) { const auto root_dst_tensors = get_root_operator()->tensors().get_const_dst_tensors(); ARM_COMPUTE_ERROR_ON(root_dst_tensors.empty()); const auto first_dst_tensor = root_dst_tensors[0]; const auto dst_tensors = op.tensors().get_const_dst_tensors(); - for(const auto &t : root_dst_tensors) + for (const auto &t : root_dst_tensors) { - if(detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0)) + if (detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0)) { return false; } } - for(const auto &t : dst_tensors) + for (const auto &t : dst_tensors) { - if(detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0)) + if (detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0)) { return false; } } } // Constraint 6 - if(_operators.size() > 0) + if (_operators.size() > 0) { const auto root_dst_tensors = get_root_operator()->tensors().get_const_dst_tensors(); ARM_COMPUTE_ERROR_ON(root_dst_tensors.empty()); const auto first_dst_tensor_layout = root_dst_tensors[0]->data_layout(); const auto dst_tensors = op.tensors().get_const_dst_tensors(); - for(const auto &t : root_dst_tensors) + for (const auto &t : root_dst_tensors) { - if(t->data_layout() != first_dst_tensor_layout) + if (t->data_layout() != first_dst_tensor_layout) { return false; } } - for(const auto &t : dst_tensors) + for (const auto &t : dst_tensors) { - if(t->data_layout() != first_dst_tensor_layout) + if (t->data_layout() != first_dst_tensor_layout) { return false; } @@ -151,16 +146,17 @@ void GpuOperatorGroup::add_operator(const Operator &op, bool is_output) _graph.add_operator_as_linear(op.id(), src_tensor_ids, dst_tensor_ids, is_output); _operators[op.id()] = op; } -Operator GpuOperatorGroup::new_operator(const GpuOperatorType &operator_type, const ArgumentPack &tensors) const +Operator GpuOperatorGroup::new_operator(const GpuOperatorType &operator_type, + const ArgumentPack &tensors) const { auto new_id = static_cast(_operators.size()); - return Operator{ new_id, operator_type, tensors }; + return Operator{new_id, operator_type, tensors}; } const Operator *GpuOperatorGroup::get_root_operator() const { const auto roots = _graph.get_root_ops(); ARM_COMPUTE_ERROR_ON(roots.size() > 1); - if(roots.empty()) + if (roots.empty()) { return nullptr; } diff --git a/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h index 308a9d796a..0a2369d357 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h +++ b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h @@ -25,9 +25,11 @@ #define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUOPERATORGROUP #include "arm_compute/core/ITensorInfo.h" + #include "src/dynamic_fusion/sketch/ArgumentPack.h" #include "src/dynamic_fusion/sketch/gpu/GpuOperatorProperties.h" #include "src/dynamic_fusion/sketch/utils/DependencyGraph.h" + #include namespace arm_compute @@ -104,7 +106,7 @@ public: const Operator *get_root_operator() const; private: - DependencyGraph _graph{}; + DependencyGraph _graph{}; std::map _operators{}; }; } // namespace dynamic_fusion diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp b/src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp index c2bd012703..36cad790c7 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp +++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp @@ -23,7 +23,9 @@ */ #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h" + #include "arm_compute/core/CL/CLCompileContext.h" + #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h" namespace arm_compute @@ -33,7 +35,7 @@ namespace experimental namespace dynamic_fusion { GpuWorkloadContext::GpuWorkloadContext(CLCompileContext *cl_compile_ctx) - : _impl{ std::make_unique(GpuLanguage::OpenCL, cl_compile_ctx) } + : _impl{std::make_unique(GpuLanguage::OpenCL, cl_compile_ctx)} { } @@ -74,7 +76,11 @@ const GpuWorkloadContext::Impl &GpuWorkloadContext::implementation() const } GpuWorkloadContext::Impl::Impl(GpuLanguage gpu_language, CLCompileContext *cl_compile_ctx) - : _gpu_language(gpu_language), _cl_compile_ctx(cl_compile_ctx), _next_tensor_id(1), _mem_map(), _managed_tensor_info() + : _gpu_language(gpu_language), + _cl_compile_ctx(cl_compile_ctx), + _next_tensor_id(1), + _mem_map(), + _managed_tensor_info() { } @@ -100,7 +106,7 @@ void GpuWorkloadContext::Impl::register_user_tensor(ITensorInfo &tensor_info) const auto tensor_id = next_tensor_id(); tensor_info.set_id(tensor_id); - _mem_map[tensor_id] = MemoryDescriptor{ MemoryType::User }; + _mem_map[tensor_id] = MemoryDescriptor{MemoryType::User}; // Save a *copy* of the user tensor info in workload context for future reference // Note that this means if the user modifies the @p tensor_info, the change will not be reflected in the context _managed_tensor_info.emplace(tensor_info.id(), std::make_unique(tensor_info)); @@ -111,7 +117,7 @@ ITensorInfo *GpuWorkloadContext::Impl::create_virtual_tensor() auto tensor_info = std::make_unique(); const auto tensor_id = -next_tensor_id(); tensor_info->set_id(tensor_id); - _mem_map[tensor_id] = MemoryDescriptor{ MemoryType::Virtual }; + _mem_map[tensor_id] = MemoryDescriptor{MemoryType::Virtual}; auto inserted = _managed_tensor_info.emplace(tensor_info->id(), std::move(tensor_info)); return inserted.first->second.get(); } @@ -121,7 +127,7 @@ ITensorInfo *GpuWorkloadContext::Impl::create_auxiliary_tensor(const ITensorInfo auto tensor_info = std::make_unique(itensor_info); const auto tensor_id = next_tensor_id(); tensor_info->set_id(tensor_id); - _mem_map[tensor_id] = MemoryDescriptor{ MemoryType::Auxiliary, AuxMemoryInfo{ tensor_info->total_size() } }; + _mem_map[tensor_id] = MemoryDescriptor{MemoryType::Auxiliary, AuxMemoryInfo{tensor_info->total_size()}}; auto inserted = _managed_tensor_info.emplace(tensor_info->id(), std::move(tensor_info)); return inserted.first->second.get(); } diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h b/src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h index c169476a70..7d9699031f 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h +++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h @@ -27,8 +27,8 @@ #include "arm_compute/core/CL/CLCompileContext.h" #include "arm_compute/core/ITensorInfo.h" -#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h" #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h" +#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h" namespace arm_compute { @@ -93,8 +93,8 @@ private: GpuLanguage _gpu_language; CLCompileContext *_cl_compile_ctx; - ITensorInfo::Id _next_tensor_id; - MemoryDescriptorMap _mem_map; + ITensorInfo::Id _next_tensor_id; + MemoryDescriptorMap _mem_map; std::map> _managed_tensor_info; }; diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp index d3a20c0dfe..973f7c747f 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp +++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h" + #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" namespace arm_compute @@ -30,8 +31,7 @@ namespace experimental { namespace dynamic_fusion { -GpuWorkloadSketch::GpuWorkloadSketch(Context *context) - : _impl{ std::make_unique(context) } +GpuWorkloadSketch::GpuWorkloadSketch(Context *context) : _impl{std::make_unique(context)} { } GpuWorkloadSketch::~GpuWorkloadSketch() diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h index d3033898e9..fea4fe9577 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h +++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h @@ -24,8 +24,9 @@ #ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSKETCHIMPL #define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSKETCHIMPL -#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h" #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h" +#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h" + #include "src/dynamic_fusion/sketch/gpu/GpuComponentServices.h" #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h" #include "src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h" @@ -45,12 +46,8 @@ public: * * @param[in] context global workload creation context */ - explicit Implementation( - Context *context) - : _context{ context }, - _comp_services{}, - _component_graph{ _context, &_comp_services }, - _operator_group{} + explicit Implementation(Context *context) + : _context{context}, _comp_services{}, _component_graph{_context, &_comp_services}, _operator_group{} { } /** Prevent instances of this class from being copy constructed */ diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h index 578366daaf..43bcc47fa0 100644 --- a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h +++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h @@ -26,6 +26,7 @@ #include "arm_compute/core/experimental/Types.h" #include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h" + #include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h" #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h" @@ -45,7 +46,7 @@ namespace */ GpuKernelArgumentList extract_kernel_args_for_one_tensor(GpuKernelArgumentList &flat_kernel_args) { - if(flat_kernel_args.empty()) + if (flat_kernel_args.empty()) { return {}; } @@ -56,10 +57,10 @@ GpuKernelArgumentList extract_kernel_args_for_one_tensor(GpuKernelArgumentList & flat_kernel_args.pop_front(); const auto tensor_id = karg_head.id(); - while(!flat_kernel_args.empty()) + while (!flat_kernel_args.empty()) { const GpuKernelArgumentBinding &karg = flat_kernel_args.front(); - if(karg.id() != tensor_id) // Encounter the next tensor, return the current tensor's kernel arguments + if (karg.id() != tensor_id) // Encounter the next tensor, return the current tensor's kernel arguments { return tensor_kargs; } @@ -68,7 +69,7 @@ GpuKernelArgumentList extract_kernel_args_for_one_tensor(GpuKernelArgumentList & } return tensor_kargs; } -} +} // namespace #endif // ACL_INTERNAL_TEST_CKW_IN_DF /** Uniquely identifies a @ref GpuUnitWorkload within a @ref GpuWorkloadSourceCode */ using UnitWorkloadId = int32_t; @@ -92,9 +93,7 @@ public: GpuWorkloadArgument(const ITensorInfo &tensor_info, const MemoryDescriptor &mem_desc, const GpuKernelArgumentInfo &kernel_arg_info) - : _tensor_info{ tensor_info }, - _mem_desc{ mem_desc }, - _kernel_arg_info{ kernel_arg_info } + : _tensor_info{tensor_info}, _mem_desc{mem_desc}, _kernel_arg_info{kernel_arg_info} { } #else // ACL_INTERNAL_TEST_CKW_IN_DF @@ -107,9 +106,7 @@ public: GpuWorkloadArgument(const ITensorInfo &tensor_info, const MemoryDescriptor &mem_desc, const GpuKernelArgumentList &kernel_args) - : _tensor_info{ tensor_info }, - _mem_desc{ mem_desc }, - _kernel_args{ kernel_args } + : _tensor_info{tensor_info}, _mem_desc{mem_desc}, _kernel_args{kernel_args} { } #endif // ACL_INTERNAL_TEST_CKW_IN_DF @@ -175,9 +172,9 @@ private: TensorInfo _tensor_info{}; MemoryDescriptor _mem_desc{}; #ifndef ACL_INTERNAL_TEST_CKW_IN_DF - GpuKernelArgumentInfo _kernel_arg_info {}; + GpuKernelArgumentInfo _kernel_arg_info{}; #else // ACL_INTERNAL_TEST_CKW_IN_DF - GpuKernelArgumentList _kernel_args {}; + GpuKernelArgumentList _kernel_args{}; #endif // ACL_INTERNAL_TEST_CKW_IN_DF }; @@ -190,7 +187,7 @@ struct UnitWorkloadStage Prepare, /**< Only run once at the beginning. */ Run, /**< Run every time after the first time. */ }; - Stage stage{ Stage::Run }; + Stage stage{Stage::Run}; }; inline bool operator==(const UnitWorkloadStage &stage0, const UnitWorkloadStage &stage1) @@ -212,7 +209,7 @@ public: * @param[in] stage Stage of the unit workload */ GpuUnitWorkload(UnitWorkloadId id, const GpuKernelSourceCode &kernel_code, const UnitWorkloadStage &stage) - : _id{ id }, _kernel_code{ kernel_code }, _stage{ stage } + : _id{id}, _kernel_code{kernel_code}, _stage{stage} { } /** Get the id of the unit workload */ @@ -253,7 +250,10 @@ public: * * @return UnitWorkloadId Allocated unit workload id */ - UnitWorkloadId add_unit_workload(const GpuKernelSourceCode &kernel_code, const UnitWorkloadStage &stage, const MemoryDescriptorMap &mem_map, const GpuWorkloadContext *context) + UnitWorkloadId add_unit_workload(const GpuKernelSourceCode &kernel_code, + const UnitWorkloadStage &stage, + const MemoryDescriptorMap &mem_map, + const GpuWorkloadContext *context) { // Use the size of the kernel codes as Id const auto uwk_id = static_cast(_unit_workloads.size()); @@ -262,12 +262,13 @@ public: #ifndef ACL_INTERNAL_TEST_CKW_IN_DF ARM_COMPUTE_UNUSED(context); // Assemble kernel argument with memory descriptor to form workload argument - for(const auto &id_arg : kernel_code.arguments()) + for (const auto &id_arg : kernel_code.arguments()) { - const auto arg_id = id_arg.first; - const auto arg = id_arg.second; - _workload_arguments[arg_id] = GpuWorkloadArgument{ *arg.tensor_info(), mem_map.at(arg_id), *arg.kernel_argument_info() }; - if(_tensor_uwork_map.find(arg_id) == _tensor_uwork_map.end()) + const auto arg_id = id_arg.first; + const auto arg = id_arg.second; + _workload_arguments[arg_id] = + GpuWorkloadArgument{*arg.tensor_info(), mem_map.at(arg_id), *arg.kernel_argument_info()}; + if (_tensor_uwork_map.find(arg_id) == _tensor_uwork_map.end()) { _tensor_uwork_map[arg_id] = std::set(); } @@ -276,18 +277,19 @@ public: #else // ACL_INTERNAL_TEST_CKW_IN_DF GpuKernelArgumentList flat_kernel_args = kernel_code.arguments(); GpuKernelArgumentList tensor_kargs{}; - while(true) + while (true) { tensor_kargs = extract_kernel_args_for_one_tensor(flat_kernel_args); - if(tensor_kargs.empty()) + if (tensor_kargs.empty()) { break; } else { const auto tensor_id = tensor_kargs.at(0).id(); - _workload_arguments[tensor_id] = GpuWorkloadArgument{ *context->implementation().get_tensor_info(tensor_id), mem_map.at(tensor_id), tensor_kargs }; - if(_tensor_uwork_map.find(tensor_id) == _tensor_uwork_map.end()) + _workload_arguments[tensor_id] = GpuWorkloadArgument{ + *context->implementation().get_tensor_info(tensor_id), mem_map.at(tensor_id), tensor_kargs}; + if (_tensor_uwork_map.find(tensor_id) == _tensor_uwork_map.end()) { _tensor_uwork_map[tensor_id] = std::set(); } @@ -308,7 +310,7 @@ public: { std::vector ids{}; - for(const auto &uwk : _unit_workloads) + for (const auto &uwk : _unit_workloads) { ids.push_back(uwk.id()); } @@ -323,7 +325,7 @@ public: std::vector tensors() const { std::vector ids{}; - for(const auto &id_tensor : _workload_arguments) + for (const auto &id_tensor : _workload_arguments) { ids.push_back(id_tensor.first); } @@ -337,7 +339,7 @@ public: } private: - std::vector _unit_workloads{}; + std::vector _unit_workloads{}; std::map _workload_arguments{}; std::map> _tensor_uwork_map{}; }; diff --git a/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h b/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h index 1d8b231efd..ad474674f9 100644 --- a/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h +++ b/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h @@ -26,6 +26,7 @@ #include "arm_compute/core/CL/CLCompileContext.h" #include "arm_compute/core/Window.h" + #include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" #include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h" diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp index 4b4c22fa1d..c4ab110c92 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp @@ -23,6 +23,7 @@ */ #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h" + #include "ckw/Error.h" namespace arm_compute @@ -36,12 +37,12 @@ GpuCkwComponentArgument::GpuCkwComponentArgument() { } -GpuCkwComponentArgument::GpuCkwComponentArgument(ckw::TensorOperand &tensor) - : _tensor(&tensor) +GpuCkwComponentArgument::GpuCkwComponentArgument(ckw::TensorOperand &tensor) : _tensor(&tensor) { } -GpuCkwComponentArgument &GpuCkwComponentArgument::init_virtual_tensor(ckw::TileOperand &tile, const ckw::TensorTileSampler &tile_sampler) +GpuCkwComponentArgument &GpuCkwComponentArgument::init_virtual_tensor(ckw::TileOperand &tile, + const ckw::TensorTileSampler &tile_sampler) { CKW_ASSERT(_tile == nullptr); diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h index 80f91389a0..863989a7bd 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h @@ -110,9 +110,9 @@ public: const ckw::TensorTileSampler &tile_sampler() const; private: - ckw::TensorOperand *_tensor{ nullptr }; - ckw::TileOperand *_tile{ nullptr }; - ckw::TensorTileSampler _tile_sampler{}; + ckw::TensorOperand *_tensor{nullptr}; + ckw::TileOperand *_tile{nullptr}; + ckw::TensorTileSampler _tile_sampler{}; }; } // namespace dynamic_fusion diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp index a24a172d77..c927f32bde 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp @@ -23,17 +23,16 @@ */ #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h" -#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" - #include "arm_compute/core/Error.h" #include "arm_compute/core/Window.h" + #include "src/common/utils/Log.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" - #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h" +#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" using namespace ckw; namespace arm_compute @@ -43,11 +42,11 @@ namespace experimental namespace dynamic_fusion { GpuCkwDriver::GpuCkwDriver(const GpuKernelComponentGroup &components) - : _components{ components }, _kernel{ GpuTargetLanguage::OpenCL }, _code{} + : _components{components}, _kernel{GpuTargetLanguage::OpenCL}, _code{} { // Generate kernel name std::string name = ""; - for(auto &comp : _components) + for (auto &comp : _components) { auto ckw_driver = comp->ckw_component_driver(); ARM_COMPUTE_ERROR_ON(ckw_driver == nullptr); @@ -60,7 +59,7 @@ GpuCkwDriver::GpuCkwDriver(const GpuKernelComponentGroup &components) GpuCkwScopedKernelWriter writer(&root_writer); GpuCkwVariableTable vtable{}; - for(auto &comp : _components) + for (auto &comp : _components) { auto ckw_driver = comp->ckw_component_driver(); ARM_COMPUTE_ERROR_ON(ckw_driver == nullptr); @@ -82,7 +81,7 @@ std::string GpuCkwDriver::get_code() std::string GpuCkwDriver::get_config_id() { std::string id = ""; - for(auto &comp : _components) + for (auto &comp : _components) { auto ckw_driver = comp->ckw_component_driver(); ARM_COMPUTE_ERROR_ON(ckw_driver == nullptr); @@ -101,9 +100,9 @@ Window GpuCkwDriver::get_window() const GpuKernelArgumentList GpuCkwDriver::get_kernel_arguments() { GpuKernelArgumentList args{}; - for(const auto &arg : _kernel.arguments()) + for (const auto &arg : _kernel.arguments()) { - switch(arg.type()) + switch (arg.type()) { case KernelArgument::Type::TensorStorage: { diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h index 19db575fea..2ca5fb435c 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h @@ -24,12 +24,12 @@ #ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWDRIVER #define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWDRIVER +#include "ckw/Kernel.h" + #include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" #include "src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h" -#include "ckw/Kernel.h" - #include #include diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.cpp index ca4f121566..5f8ce919e3 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.cpp +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.cpp @@ -23,10 +23,12 @@ */ #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h" + #include "ckw/Error.h" #include "ckw/TileInfo.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h" + namespace arm_compute { namespace experimental @@ -34,21 +36,21 @@ namespace experimental namespace dynamic_fusion { -GpuCkwKernelWriter::GpuCkwKernelWriter(ckw::Kernel &kernel) - : KernelWriter(kernel) +GpuCkwKernelWriter::GpuCkwKernelWriter(ckw::Kernel &kernel) : KernelWriter(kernel) { } void GpuCkwKernelWriter::op_load_once(GpuCkwComponentArgument *tensor_or_tile, const ckw::TensorTileSampler &sampler) { - if(!tensor_or_tile->has_tile()) + if (!tensor_or_tile->has_tile()) { CKW_ASSERT(tensor_or_tile->has_tensor()); auto &tensor = tensor_or_tile->tensor(); const auto tile_name = tensor.name() + "_tile"; - auto &tile = declare_tile(tile_name.c_str(), ckw::TileInfo(tensor.data_type(), sampler.height(), sampler.width())); + auto &tile = + declare_tile(tile_name.c_str(), ckw::TileInfo(tensor.data_type(), sampler.height(), sampler.width())); op_load(tile, tensor, sampler); diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp index 043fda9e6f..cbadbd9639 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp @@ -23,6 +23,7 @@ */ #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" + #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h index 4d11b5e3e4..81049bfe37 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h @@ -63,7 +63,7 @@ public: private: GpuCkwKernelWriter *_writer; - int32_t _parent_id_space; + int32_t _parent_id_space; }; } // namespace dynamic_fusion diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp index 37c27cd116..88a0cf7f43 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp @@ -23,11 +23,12 @@ */ #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" + #include namespace arm_compute @@ -36,19 +37,22 @@ namespace experimental { namespace dynamic_fusion { -GpuCkwComponentArgument *GpuCkwVariableTable::declare_variable(const GpuKernelComponentGroup &comp_group, GpuCkwScopedKernelWriter &writer, const ITensorInfo *tensor, TensorStorageType storage, - const std::string &alias) +GpuCkwComponentArgument *GpuCkwVariableTable::declare_variable(const GpuKernelComponentGroup &comp_group, + GpuCkwScopedKernelWriter &writer, + const ITensorInfo *tensor, + TensorStorageType storage, + const std::string &alias) { ARM_COMPUTE_ERROR_ON_MSG(!tensor->has_valid_id(), "Tensor info with valid id expected"); // Do not re-declare if the variable associated with the tensor has already been declared auto it = _vars.find(tensor->id()); - if(it != _vars.end()) + if (it != _vars.end()) { return &it->second; } - if(comp_group.is_intermediate_tensor(tensor)) + if (comp_group.is_intermediate_tensor(tensor)) { // Create a virtual tensor variable GpuCkwComponentArgument var; @@ -61,7 +65,7 @@ GpuCkwComponentArgument *GpuCkwVariableTable::declare_variable(const GpuKernelCo std::stringstream ss; ss << alias << "_t" << abs(tensor->id()); const auto uniq_name = ss.str(); - GpuCkwComponentArgument var{ writer->declare_tensor_argument(uniq_name, to_ckw(*tensor), to_ckw(storage)) }; + GpuCkwComponentArgument var{writer->declare_tensor_argument(uniq_name, to_ckw(*tensor), to_ckw(storage))}; auto &&inserted = _vars.emplace(tensor->id(), var); return &(inserted.first->second); } diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h index 0649dcba9d..2b118911b8 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h @@ -25,6 +25,7 @@ #define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWVARIABLETABLE #include "arm_compute/core/ITensorInfo.h" + #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h" #include @@ -58,8 +59,11 @@ public: * * @return GpuCkwComponentArgument* */ - GpuCkwComponentArgument *declare_variable(const GpuKernelComponentGroup &comp_group, GpuCkwScopedKernelWriter &writer, const ITensorInfo *tensor, TensorStorageType storage, - const std::string &alias = "unnamed"); + GpuCkwComponentArgument *declare_variable(const GpuKernelComponentGroup &comp_group, + GpuCkwScopedKernelWriter &writer, + const ITensorInfo *tensor, + TensorStorageType storage, + const std::string &alias = "unnamed"); private: std::map _vars{}; diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h index 14086f785e..52e56e2e35 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h @@ -25,6 +25,7 @@ #define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_IGPUCKWCOMPONENTDRIVER #include "arm_compute/core/Window.h" + #include "src/dynamic_fusion/sketch/ArgumentPack.h" #include "src/dynamic_fusion/sketch/gpu/components/Types.h" @@ -73,8 +74,7 @@ public: * @param[in] id Component id * @param[in] tensors Tensor arguments to the components */ - IGpuCkwComponentDriver(ComponentId id, const ArgumentPack &tensors) - : _id{ id }, _tensors{ tensors } + IGpuCkwComponentDriver(ComponentId id, const ArgumentPack &tensors) : _id{id}, _tensors{tensors} { } /** Destructor */ @@ -89,7 +89,9 @@ public: * * @note @p writer can only be passed via value since the new scope is created in the copy constructor */ - virtual void write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const = 0; + virtual void write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const = 0; /** Get tensor arguments */ ArgumentPack tensors() const { @@ -128,7 +130,7 @@ public: } private: - ComponentId _id{ -1 }; + ComponentId _id{-1}; ArgumentPack _tensors{}; }; } // namespace dynamic_fusion diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp index c07fac0e0d..c3b1b3c8bc 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp @@ -24,16 +24,18 @@ #include "GpuCkwActivation.h" #include "arm_compute/core/Error.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" +#include "arm_compute/core/Validate.h" #include "ckw/TensorTileSampler.h" + #include "src/core/helpers/WindowHelpers.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" + #include using namespace ckw; @@ -87,24 +89,25 @@ inline TensorTileSampler create_sampler(GpuCkwScopedKernelWriter &writer, int32_ GpuCkwActivation::GpuCkwActivation(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes) - : IGpuCkwComponentDriver{ id, tensors }, - _src{}, - _dst{}, - _attributes{ attributes } + : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{}, _attributes{attributes} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); } -void GpuCkwActivation::write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const +void GpuCkwActivation::write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const { const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window(); const unsigned int n0 = root_window.x().step(); const unsigned int m0 = root_window.y().step(); - GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src"); - GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); + GpuCkwComponentArgument *src = + vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src"); + GpuCkwComponentArgument *dst = + vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); load_src_dst_tiles_and_prepare_sampler(writer, src, dst, m0, n0, create_sampler); @@ -119,7 +122,7 @@ void GpuCkwActivation::write_component_code(const ComponentGroup &comp_group, Gp const auto &constant_B = writer->declare_tile("B_VAL", _attributes.b()); // Perform the operation. - switch(_attributes.activation()) + switch (_attributes.activation()) { case ActivationLayerInfo::ActivationFunction::LOGISTIC: { @@ -179,9 +182,10 @@ Window GpuCkwActivation::get_window() const // Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) unchanged // This is in line with the collapsing convention used by operators like Conv2d output_shape.collapse(2U, 1U); - constexpr unsigned int vector_size_byte_opencl = 16; - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0)); - Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration)); + constexpr unsigned int vector_size_byte_opencl = 16; + const unsigned int num_elems_processed_per_iteration = + adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0)); + Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration)); return win; } diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.h index e157e36cbf..386e933a72 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.h +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.h @@ -46,15 +46,15 @@ public: * @param[in] tensors Tensor arguments to the component * @param[in] attributes Component attributes */ - GpuCkwActivation(ComponentId id, - const ArgumentPack &tensors, - const Attributes &attributes); + GpuCkwActivation(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes); ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwActivation); /** Destructor */ ~GpuCkwActivation() override = default; // Inherited methods overriden: - virtual void write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const override; - Window get_window() const override; + virtual void write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const override; + Window get_window() const override; private: const ITensorInfo *_src; diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp index 6ecf2bac44..e8e5087633 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp @@ -24,16 +24,18 @@ #include "GpuCkwCast.h" #include "arm_compute/core/Error.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" +#include "arm_compute/core/Validate.h" #include "ckw/TensorTileSampler.h" + #include "src/core/helpers/WindowHelpers.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" + #include using namespace ckw; @@ -84,30 +86,29 @@ inline TensorTileSampler create_sampler(GpuCkwScopedKernelWriter &writer, int32_ } } // namespace -GpuCkwCast::GpuCkwCast(ComponentId id, - const ArgumentPack &tensors, - const Attributes &attributes) - : IGpuCkwComponentDriver{ id, tensors }, - _src{}, - _dst{}, - _attributes{ attributes } +GpuCkwCast::GpuCkwCast(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes) + : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{}, _attributes{attributes} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); } -void GpuCkwCast::write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const +void GpuCkwCast::write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const { const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window(); const unsigned int n0 = root_window.x().step(); const unsigned int m0 = root_window.y().step(); - GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src"); - GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); + GpuCkwComponentArgument *src = + vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src"); + GpuCkwComponentArgument *dst = + vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); // Load the source tile and prepare the sampler. - if(!src->has_tile()) + if (!src->has_tile()) { const auto sampler = create_sampler(writer, m0, n0); writer->op_load_once(src, sampler); @@ -122,7 +123,7 @@ void GpuCkwCast::write_component_code(const ComponentGroup &comp_group, GpuCkwVa const auto &sampler = src->tile_sampler(); // Prepare the output tile. - if(!dst->has_tile()) + if (!dst->has_tile()) { // Get Target datatype and convert it to ckw::DataType. ckw::DataType target_dt = dynamic_fusion::to_ckw(_attributes.data_type()); @@ -143,7 +144,7 @@ void GpuCkwCast::write_component_code(const ComponentGroup &comp_group, GpuCkwVa const size_t dst_size = data_size_from_type(_dst->data_type()); const bool cast_down = (src_size >= dst_size); - if(cast_down && is_data_type_quantized(_src->data_type())) + if (cast_down && is_data_type_quantized(_src->data_type())) { const auto &constant_x80 = writer->declare_tile("0x80", 0x80); writer->op_binary_expression(src_tile, src_tile, BinaryOp::BitwiseXOR, constant_x80); @@ -151,7 +152,7 @@ void GpuCkwCast::write_component_code(const ComponentGroup &comp_group, GpuCkwVa ckw::ConvertPolicy convert_policy = ckw::ConvertPolicy::None; - if(cast_down && (is_data_type_float(_src->data_type()) || _attributes.convert_policy() == ConvertPolicy::SATURATE)) + if (cast_down && (is_data_type_float(_src->data_type()) || _attributes.convert_policy() == ConvertPolicy::SATURATE)) { convert_policy = ckw::ConvertPolicy::Saturate; } @@ -167,9 +168,10 @@ Window GpuCkwCast::get_window() const // Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) unchanged // This is in line with the collapsing convention used by operators like Conv2d output_shape.collapse(2U, 1U); - constexpr unsigned int vector_size_byte_opencl = 16; - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0)); - Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration)); + constexpr unsigned int vector_size_byte_opencl = 16; + const unsigned int num_elems_processed_per_iteration = + adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0)); + Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration)); return win; } diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.h index 821cec1e19..2389301196 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.h +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.h @@ -46,15 +46,15 @@ public: * @param[in] tensors Tensor arguments to the component * @param[in] attributes Component attributes */ - GpuCkwCast(ComponentId id, - const ArgumentPack &tensors, - const Attributes &attributes); + GpuCkwCast(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes); ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwCast); /** Destructor */ ~GpuCkwCast() override = default; // Inherited methods overriden: - virtual void write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const override; - Window get_window() const override; + virtual void write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const override; + Window get_window() const override; private: const ITensorInfo *_src; diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp index 3c906646a6..7833da2334 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp @@ -25,21 +25,20 @@ #include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h" #include "arm_compute/core/Error.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" - +#include "arm_compute/core/Validate.h" #include "ckw/TensorTileSampler.h" #include "ckw/TileInfo.h" #include "src/core/helpers/WindowHelpers.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" namespace arm_compute { @@ -54,13 +53,7 @@ GpuCkwDirectConv2d::GpuCkwDirectConv2d(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes, const Settings &settings) - : IGpuCkwComponentDriver{ id, tensors }, - _src{}, - _wei{}, - _bia{}, - _dst{}, - _attributes{ attributes }, - _settings{ settings } + : IGpuCkwComponentDriver{id, tensors}, _src{}, _wei{}, _bia{}, _dst{}, _attributes{attributes}, _settings{settings} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _wei = this->tensors().get_const_tensor(TensorType::ACL_SRC_1); @@ -69,7 +62,9 @@ GpuCkwDirectConv2d::GpuCkwDirectConv2d(ComponentId id, ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _wei, _dst); // Bias can be null } -void GpuCkwDirectConv2d::write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const +void GpuCkwDirectConv2d::write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const { const auto desc = _settings.direct_conv_descriptor(); ARM_COMPUTE_ERROR_ON_MSG(desc.export_input_to_cl_image || desc.export_output_to_cl_image, @@ -99,15 +94,18 @@ void GpuCkwDirectConv2d::write_component_code(const ComponentGroup &comp_group, // extra loop to compute the left-over elements. const bool use_cl_image_for_weights = desc.export_weights_to_cl_image && (k0 == 4) && (K % 4 == 0); - GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src"); + GpuCkwComponentArgument *src = + vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src"); GpuCkwComponentArgument *wei = vtable.declare_variable( - comp_group, writer, _wei, use_cl_image_for_weights ? TensorStorageType::ClImage2dReadOnly : TensorStorageType::ClBufferUint8Ptr, "wei"); - GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); + comp_group, writer, _wei, + use_cl_image_for_weights ? TensorStorageType::ClImage2dReadOnly : TensorStorageType::ClBufferUint8Ptr, "wei"); + GpuCkwComponentArgument *dst = + vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); GpuCkwComponentArgument *bia = nullptr; const bool using_bias = _bia != nullptr; - if(using_bias) + if (using_bias) { bia = vtable.declare_variable(comp_group, writer, _bia, TensorStorageType::ClBufferUint8Ptr, "bia"); } @@ -154,7 +152,8 @@ void GpuCkwDirectConv2d::write_component_code(const ComponentGroup &comp_group, src_sampler.address_mode_x(TensorSamplerAddressModeX::None); // We cannot have out-of-bounds reads when the kernel height is equal to 1. Otherwise, we need to ensure the // indirection buffer mi does not contain negative values representing out-of-bounds reads. - src_sampler.address_mode_y(kernel_height == 1 ? TensorSamplerAddressModeY::None : TensorSamplerAddressModeY::SkipMinEdgeOnly); + src_sampler.address_mode_y(kernel_height == 1 ? TensorSamplerAddressModeY::None + : TensorSamplerAddressModeY::SkipMinEdgeOnly); src_sampler.address_mode_z(TensorSamplerAddressModeZ::None); TensorTileSampler wei_sampler; @@ -178,7 +177,7 @@ void GpuCkwDirectConv2d::write_component_code(const ComponentGroup &comp_group, dst_sampler.z(tile_0); dst_sampler.b(tile_bout); - if(!dst->has_tile()) + if (!dst->has_tile()) { auto &tile = writer->declare_tile("dst", TileInfo(to_ckw(_dst->data_type()), m0, n0)); dst->init_virtual_tensor(tile, dst_sampler); @@ -189,10 +188,10 @@ void GpuCkwDirectConv2d::write_component_code(const ComponentGroup &comp_group, // We create a 2d container of size (M0, 1) to store the indices for iteration TileContainer it; - for(int m = 0; m < m0; ++m) + for (int m = 0; m < m0; ++m) { - std::vector idx { std::to_string(m) }; - it.push_back({ idx }); + std::vector idx{std::to_string(m)}; + it.push_back({idx}); } const auto &tile_it = writer->declare_tile("it", it, ckw::DataType::Int32); @@ -289,9 +288,9 @@ void GpuCkwDirectConv2d::write_component_code(const ComponentGroup &comp_group, // Bias addition // NOTE: This operation will be removed from this kernel as the interface is standardized. The intended way of // performing bias addition is to fuse this convolution kernel with a following elementwise addition kernel. - if(using_bias) + if (using_bias) { - if(!bia->has_tile()) + if (!bia->has_tile()) { // Reuse the destination sampler for the bias writer->op_load_once(bia, dst_sampler); diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp index c8bf999261..2935ba45ea 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp @@ -24,22 +24,24 @@ #include "GpuCkwElementwiseBinary.h" #include "arm_compute/core/Error.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" +#include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" #include "ckw/TensorTileSampler.h" #include "ckw/types/TensorSamplerTypes.h" + #include "src/core/helpers/WindowHelpers.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h" #include "src/dynamic_fusion/sketch/gpu/components/utils/type_printer/ElementwiseBinary.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" #include "support/StringSupport.h" + #include #include @@ -53,11 +55,7 @@ namespace dynamic_fusion GpuCkwElementwiseBinary::GpuCkwElementwiseBinary(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes) - : IGpuCkwComponentDriver{ id, tensors }, - _lhs{}, - _rhs{}, - _dst{}, - _attributes{ attributes } + : IGpuCkwComponentDriver{id, tensors}, _lhs{}, _rhs{}, _dst{}, _attributes{attributes} { _lhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _rhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_1); @@ -65,15 +63,20 @@ GpuCkwElementwiseBinary::GpuCkwElementwiseBinary(ComponentId ARM_COMPUTE_ERROR_ON_NULLPTR(_lhs, _rhs, _dst); } -void GpuCkwElementwiseBinary::write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const +void GpuCkwElementwiseBinary::write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const { const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window(); const auto n0 = static_cast(root_window.x().step()); const auto m0 = static_cast(root_window.y().step()); - GpuCkwComponentArgument *lhs = vtable.declare_variable(comp_group, writer, _lhs, TensorStorageType::ClBufferUint8Ptr, "lhs"); - GpuCkwComponentArgument *rhs = vtable.declare_variable(comp_group, writer, _rhs, TensorStorageType::ClBufferUint8Ptr, "rhs"); - GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); + GpuCkwComponentArgument *lhs = + vtable.declare_variable(comp_group, writer, _lhs, TensorStorageType::ClBufferUint8Ptr, "lhs"); + GpuCkwComponentArgument *rhs = + vtable.declare_variable(comp_group, writer, _rhs, TensorStorageType::ClBufferUint8Ptr, "rhs"); + GpuCkwComponentArgument *dst = + vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); auto &gid_0 = writer->declare_tile("gid_0", ckw::DataType::Int32); auto &gid_1 = writer->declare_tile("gid_1", ckw::DataType::Int32); @@ -86,32 +89,36 @@ void GpuCkwElementwiseBinary::write_component_code(const ComponentGroup &comp_gr auto &const_0 = writer->declare_tile("0", 0); // Load the LHS and RHS tiles - if(!lhs->has_tile()) + if (!lhs->has_tile()) { - auto sampler = create_boundary_aware_2d_sampler(writer, gid_0, gid_1, _lhs->dimension(0), _lhs->dimension(1), n0, m0, "lhs_", const_0); + auto sampler = create_boundary_aware_2d_sampler(writer, gid_0, gid_1, _lhs->dimension(0), _lhs->dimension(1), + n0, m0, "lhs_", const_0); sampler.format(TensorSamplerFormat::C_WH_1); // 3rd dimension collapsed with 2nd dimension sampler.z(const_0); sampler.b(gid_2); writer->op_load_once(lhs, sampler); } - if(!rhs->has_tile()) + if (!rhs->has_tile()) { - auto sampler = create_boundary_aware_2d_sampler(writer, gid_0, gid_1, _rhs->dimension(0), _rhs->dimension(1), n0, m0, "rhs_", const_0); + auto sampler = create_boundary_aware_2d_sampler(writer, gid_0, gid_1, _rhs->dimension(0), _rhs->dimension(1), + n0, m0, "rhs_", const_0); sampler.format(TensorSamplerFormat::C_WH_1); // 3rd dimension collapsed with 2nd dimension sampler.z(const_0); sampler.b(gid_2); writer->op_load_once(rhs, sampler); } - auto dst_sampler = create_boundary_aware_2d_sampler(writer, gid_0, gid_1, _dst->dimension(0), _dst->dimension(1), n0, m0, "dst_", const_0); + auto dst_sampler = create_boundary_aware_2d_sampler(writer, gid_0, gid_1, _dst->dimension(0), _dst->dimension(1), + n0, m0, "dst_", const_0); dst_sampler.format(TensorSamplerFormat::C_WH_1); // 3rd dimension collapsed with 2nd dimension dst_sampler.z(const_0); dst_sampler.b(gid_2); // Prepare the output tile. - if(!dst->has_tile()) + if (!dst->has_tile()) { - auto &tile = writer->declare_tile("dst_tile", ckw::TileInfo(to_ckw(_dst->data_type()), dst_sampler.height(), dst_sampler.width())); + auto &tile = writer->declare_tile( + "dst_tile", ckw::TileInfo(to_ckw(_dst->data_type()), dst_sampler.height(), dst_sampler.width())); dst->init_virtual_tensor(tile, dst_sampler); } @@ -131,9 +138,10 @@ Window GpuCkwElementwiseBinary::get_window() const // Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) unchanged // This is in line with the collapsing convention used by operators like Conv2d output_shape.collapse(2U, 1U); - constexpr unsigned int vector_size_byte_opencl = 16; - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0)); - Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration)); + constexpr unsigned int vector_size_byte_opencl = 16; + const unsigned int num_elems_processed_per_iteration = + adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0)); + Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration)); return win; } @@ -141,11 +149,12 @@ Window GpuCkwElementwiseBinary::get_window() const std::string GpuCkwElementwiseBinary::get_name(const ComponentGroup &comp_group) const { ARM_COMPUTE_UNUSED(comp_group); - const std::vector build_params = - { + const std::vector build_params = { "elementwise_binary", - "op", to_string(_attributes.operation()), - "dt", lower_string(string_from_data_type(_dst->data_type())), + "op", + to_string(_attributes.operation()), + "dt", + lower_string(string_from_data_type(_dst->data_type())), }; return join(build_params, "_"); } @@ -154,13 +163,16 @@ std::string GpuCkwElementwiseBinary::get_tuner_id(const ComponentGroup &comp_gro { ARM_COMPUTE_UNUSED(comp_group); /// NOTE: Hardcoded for now, the parameters should ideally be exported by ckw (a selection of constant tiles) - std::vector build_params = - { + std::vector build_params = { "elementwise_binary", - "op", to_string(_attributes.operation()), - "dt", lower_string(string_from_data_type(_dst->data_type())), - "dst_dim0", support::cpp11::to_string(_dst->dimension(0)), - "dst_dim1", support::cpp11::to_string(_dst->dimension(1)), + "op", + to_string(_attributes.operation()), + "dt", + lower_string(string_from_data_type(_dst->data_type())), + "dst_dim0", + support::cpp11::to_string(_dst->dimension(0)), + "dst_dim1", + support::cpp11::to_string(_dst->dimension(1)), }; return join(build_params, "_"); } diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h index e9c41530f8..1a20d4c533 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h @@ -46,17 +46,17 @@ public: * @param[in] tensors Tensor arguments to the component * @param[in] attributes Component attributes */ - GpuCkwElementwiseBinary(ComponentId id, - const ArgumentPack &tensors, - const Attributes &attributes); + GpuCkwElementwiseBinary(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes); ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwElementwiseBinary); /** Destructor */ ~GpuCkwElementwiseBinary() override = default; // Inherited methods overriden: - virtual void write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const override; - Window get_window() const override; - std::string get_name(const ComponentGroup &comp_group) const override; - std::string get_tuner_id(const ComponentGroup &comp_group) const override; + virtual void write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const override; + Window get_window() const override; + std::string get_name(const ComponentGroup &comp_group) const override; + std::string get_tuner_id(const ComponentGroup &comp_group) const override; private: const ITensorInfo *_lhs; diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp index 9c9a298132..8ab3ec3a55 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp @@ -24,17 +24,18 @@ #include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h" #include "arm_compute/core/Error.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" +#include "arm_compute/core/Validate.h" #include "ckw/TensorTileSampler.h" + #include "src/core/helpers/WindowHelpers.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" using namespace ckw; @@ -48,11 +49,7 @@ GpuCkwPool2d::GpuCkwPool2d(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes, const Settings &settings) - : IGpuCkwComponentDriver{ id, tensors }, - _src{}, - _dst{}, - _attributes{ attributes }, - _settings{ settings } + : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{}, _attributes{attributes}, _settings{settings} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); @@ -60,14 +57,18 @@ GpuCkwPool2d::GpuCkwPool2d(ComponentId id, ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); } -void GpuCkwPool2d::write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const +void GpuCkwPool2d::write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const { const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window(); const unsigned int n0 = root_window.x().step(); const unsigned int m0 = root_window.y().step(); - GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src"); - GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); + GpuCkwComponentArgument *src = + vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src"); + GpuCkwComponentArgument *dst = + vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); TileOperand &gid_0 = writer->declare_tile("gid_0", ckw::DataType::Int32); TileOperand &gid_1 = writer->declare_tile("gid_1", ckw::DataType::Int32); @@ -90,23 +91,26 @@ void GpuCkwPool2d::write_component_code(const ComponentGroup &comp_group, GpuCkw const auto src_data_type = _src->data_type(); // Check if this is global pooling path - const bool is_global_pooling = (pool_size_x == src_width) && (pool_size_y == src_height) && (pad_x == 0) && (pad_y == 0); + const bool is_global_pooling = + (pool_size_x == src_width) && (pool_size_y == src_height) && (pad_x == 0) && (pad_y == 0); // Check if this a case of FP_MIXED_PRECISION - const bool use_fp_mixed_precision = (src_data_type == DataType::F16) && _settings.mixed_precision() && _attributes.pool_type() != PoolingType::MAX; - const auto acc_data_type = (use_fp_mixed_precision) ? (DataType::F32) : (src_data_type); + const bool use_fp_mixed_precision = + (src_data_type == DataType::F16) && _settings.mixed_precision() && _attributes.pool_type() != PoolingType::MAX; + const auto acc_data_type = (use_fp_mixed_precision) ? (DataType::F32) : (src_data_type); TileOperand &const_0 = writer->declare_tile("0", 0); const TileOperand &const_1 = writer->declare_tile("1", 1); const TileOperand &const_lowest_value = writer->declare_tile("LOWEST_VALUE", std::numeric_limits::lowest()); const TileOperand &pool_size_x_tile = writer->declare_tile("POOL_SIZE_X", pool_size_x); const TileOperand &pool_size_y_tile = writer->declare_tile("POOL_SIZE_Y", pool_size_y); - const TileOperand &stride_x_tile = writer->declare_tile("STRIDE_X", static_cast(_attributes.stride().x())); - const TileOperand &stride_y_tile = writer->declare_tile("STRIDE_Y", static_cast(_attributes.stride().y())); - const TileOperand &pad_x_tile = writer->declare_tile("PAD_X", pad_x); - const TileOperand &pad_y_tile = writer->declare_tile("PAD_Y", pad_y); - const TileOperand &dst_height_tile = writer->declare_tile("DST_HEIGHT", static_cast(_dst->dimension(height_idx))); - const TileOperand &src_height_tile = writer->declare_tile("SRC_HEIGHT", src_height); - const TileOperand &src_width_tile = writer->declare_tile("SRC_WIDTH", src_width); + const TileOperand &stride_x_tile = writer->declare_tile("STRIDE_X", static_cast(_attributes.stride().x())); + const TileOperand &stride_y_tile = writer->declare_tile("STRIDE_Y", static_cast(_attributes.stride().y())); + const TileOperand &pad_x_tile = writer->declare_tile("PAD_X", pad_x); + const TileOperand &pad_y_tile = writer->declare_tile("PAD_Y", pad_y); + const TileOperand &dst_height_tile = + writer->declare_tile("DST_HEIGHT", static_cast(_dst->dimension(height_idx))); + const TileOperand &src_height_tile = writer->declare_tile("SRC_HEIGHT", src_height); + const TileOperand &src_width_tile = writer->declare_tile("SRC_WIDTH", src_width); TileOperand &idx_out_n = writer->declare_tile("idx_out_n", ckw::DataType::Int32); TileOperand &idx_out_h = writer->declare_tile("idx_out_h", ckw::DataType::Int32); @@ -145,7 +149,7 @@ void GpuCkwPool2d::write_component_code(const ComponentGroup &comp_group, GpuCkw // Prepare dst tensor and tile TileInfo dst_tile_info = TileInfo(to_ckw(src_data_type), m0, n0); - if(!dst->has_tile()) + if (!dst->has_tile()) { TileOperand &dst_tile = writer->declare_tile("dst_tile", dst_tile_info); dst->init_virtual_tensor(dst_tile, dst_sampler); @@ -156,14 +160,15 @@ void GpuCkwPool2d::write_component_code(const ComponentGroup &comp_group, GpuCkw const TileOperand &res_tile = writer->declare_tile("res_tile", TileInfo(to_ckw(acc_data_type), m0, n0)); // Initialise result tile with appropriate value - if(_attributes.pool_type() == PoolingType::MAX) + if (_attributes.pool_type() == PoolingType::MAX) { - if(_settings.use_inf_as_limit()) + if (_settings.use_inf_as_limit()) { TileContainer minus_inf_tile_container; std::vector value = std::vector(n0, "(-INFINITY)"); - minus_inf_tile_container.push_back({ value }); - const TileOperand &minus_inf = writer->declare_tile("minus_inf_const", minus_inf_tile_container, to_ckw(acc_data_type)); + minus_inf_tile_container.push_back({value}); + const TileOperand &minus_inf = + writer->declare_tile("minus_inf_const", minus_inf_tile_container, to_ckw(acc_data_type)); writer->op_assign(res_tile, minus_inf); } else @@ -209,7 +214,7 @@ void GpuCkwPool2d::write_component_code(const ComponentGroup &comp_group, GpuCkw writer->op_binary_elementwise_function(pool_y_e, BinaryFunction::Min, pool_size_y_tile, pool_y_e); const TileOperand &filter_size = writer->declare_tile("filter_size", ckw::DataType::Int32); - if(_attributes.exclude_padding()) + if (_attributes.exclude_padding()) { const TileOperand &y_diff = writer->declare_tile("y_diff", ckw::DataType::Int32); const TileOperand &x_diff = writer->declare_tile("x_diff", ckw::DataType::Int32); @@ -227,7 +232,7 @@ void GpuCkwPool2d::write_component_code(const ComponentGroup &comp_group, GpuCkw const TileOperand &x = writer->declare_tile("x", ckw::DataType::Int32); const TileOperand &y = writer->declare_tile("y", ckw::DataType::Int32); - if(is_global_pooling) + if (is_global_pooling) { writer->op_assign(x, const_0); writer->op_assign(y, const_0); @@ -242,76 +247,80 @@ void GpuCkwPool2d::write_component_code(const ComponentGroup &comp_group, GpuCkw } // Y dim for-loop - writer->op_for_loop(y, BinaryOp::Less, pool_y_e, y, AssignmentOp::Increment, const_1, [&]() - { - // Reset the iterator for the inner loop - if(is_global_pooling) - { - writer->op_assign(x, const_0); - } - else + writer->op_for_loop( + y, BinaryOp::Less, pool_y_e, y, AssignmentOp::Increment, const_1, + [&]() { - writer->op_assign(x, pool_x_s); - } - - TileOperand &a_y = writer->declare_tile("a_y", ckw::DataType::Int32); - writer->op_binary_expression(a_y, idx_in_h, BinaryOp::Add, y); - - // X dim for-loop - writer->op_for_loop(x, BinaryOp::Less, pool_x_e, x, AssignmentOp::Increment, const_1, [&]() - { - TileOperand &a_x = writer->declare_tile("a_x", ckw::DataType::Int32); - writer->op_binary_expression(a_x, idx_in_w, BinaryOp::Add, x); - - TileOperand &src_tile = writer->declare_tile("src_tile", TileInfo(to_ckw(acc_data_type), m0, n0)); - - src_sampler.y(a_x); - src_sampler.z(a_y); - - // Load src tile - if(use_fp_mixed_precision) + // Reset the iterator for the inner loop + if (is_global_pooling) { - TileOperand &src_uncasted_tile = writer->declare_tile("uncasted_src_tile", dst_tile_info); - writer->op_load(src_uncasted_tile, src->tensor(), src_sampler); - writer->op_cast_expression(src_tile, src_uncasted_tile, ckw::ConvertPolicy::None); + writer->op_assign(x, const_0); } else { - writer->op_load(src_tile, src->tensor(), src_sampler); + writer->op_assign(x, pool_x_s); } - // Take the square of the input, for L2 Pooling - if(_attributes.pool_type() == PoolingType::L2) - { - writer->op_binary_expression(src_tile, src_tile, BinaryOp::Mul, src_tile); - } - - // Perfom Pooling op - if(_attributes.pool_type() == PoolingType::MAX) - { - writer->op_binary_elementwise_function(res_tile, BinaryFunction::Max, res_tile, src_tile); - } - else - { - writer->op_binary_expression(res_tile, res_tile, BinaryOp::Add, src_tile); - } + TileOperand &a_y = writer->declare_tile("a_y", ckw::DataType::Int32); + writer->op_binary_expression(a_y, idx_in_h, BinaryOp::Add, y); + + // X dim for-loop + writer->op_for_loop( + x, BinaryOp::Less, pool_x_e, x, AssignmentOp::Increment, const_1, + [&]() + { + TileOperand &a_x = writer->declare_tile("a_x", ckw::DataType::Int32); + writer->op_binary_expression(a_x, idx_in_w, BinaryOp::Add, x); + + TileOperand &src_tile = writer->declare_tile("src_tile", TileInfo(to_ckw(acc_data_type), m0, n0)); + + src_sampler.y(a_x); + src_sampler.z(a_y); + + // Load src tile + if (use_fp_mixed_precision) + { + TileOperand &src_uncasted_tile = writer->declare_tile("uncasted_src_tile", dst_tile_info); + writer->op_load(src_uncasted_tile, src->tensor(), src_sampler); + writer->op_cast_expression(src_tile, src_uncasted_tile, ckw::ConvertPolicy::None); + } + else + { + writer->op_load(src_tile, src->tensor(), src_sampler); + } + + // Take the square of the input, for L2 Pooling + if (_attributes.pool_type() == PoolingType::L2) + { + writer->op_binary_expression(src_tile, src_tile, BinaryOp::Mul, src_tile); + } + + // Perfom Pooling op + if (_attributes.pool_type() == PoolingType::MAX) + { + writer->op_binary_elementwise_function(res_tile, BinaryFunction::Max, res_tile, src_tile); + } + else + { + writer->op_binary_expression(res_tile, res_tile, BinaryOp::Add, src_tile); + } + }); }); - }); - if((_attributes.pool_type() == PoolingType::AVG) || (_attributes.pool_type() == PoolingType::L2)) + if ((_attributes.pool_type() == PoolingType::AVG) || (_attributes.pool_type() == PoolingType::L2)) { // filter_size is automatically broadcasted in the operation writer->op_binary_expression(res_tile, res_tile, BinaryOp::Div, filter_size); } // Take square root of the result in L2 pooling - if(_attributes.pool_type() == PoolingType::L2) + if (_attributes.pool_type() == PoolingType::L2) { writer->op_unary_elementwise_function(res_tile, UnaryFunction::Sqrt, res_tile); } // Store the results and do casting if FP_MIXED_PRECISION - if(use_fp_mixed_precision) + if (use_fp_mixed_precision) { writer->op_cast_expression(dst_tile, res_tile, ckw::ConvertPolicy::None); } @@ -326,7 +335,7 @@ Window GpuCkwPool2d::get_window() const ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized"); TensorShape output_shape = _dst->tensor_shape(); - const unsigned int vec_size = adjust_vec_size(((_dst->data_type() == DataType::F32) ? 2 : 4), _dst->dimension(0)); + const unsigned int vec_size = adjust_vec_size(((_dst->data_type() == DataType::F32) ? 2 : 4), _dst->dimension(0)); // Create and configure kernel window auto win = calculate_max_window(output_shape, Steps(vec_size)); win = win.collapse_if_possible(win, Window::DimZ); // collapse window on batch size. diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h index 2ccf255236..822282a108 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h @@ -59,9 +59,11 @@ public: /** Destructor */ ~GpuCkwPool2d() override = default; // Inherited methods overriden: - virtual void write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const override; - Window get_window() const override; - std::string get_name(const ComponentGroup &comp_group) const override; + virtual void write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const override; + Window get_window() const override; + std::string get_name(const ComponentGroup &comp_group) const override; private: const ITensorInfo *_src; diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.cpp index d997c82dae..f2a7d41afd 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.cpp +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.cpp @@ -28,14 +28,13 @@ #include "src/core/helpers/WindowHelpers.h" #include "src/core/utils/ScaleUtils.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" - +#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" #include "support/StringSupport.h" namespace arm_compute @@ -49,20 +48,17 @@ namespace constexpr unsigned int opencl_vector_size_in_bytes = 16; } // namespace -GpuCkwResize::GpuCkwResize(ComponentId id, - const ArgumentPack &tensors, - const Attributes &attributes) - : IGpuCkwComponentDriver{ id, tensors }, - _src{}, - _dst{}, - _attributes{ attributes } +GpuCkwResize::GpuCkwResize(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes) + : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{}, _attributes{attributes} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC); _dst = this->tensors().get_const_tensor(TensorType::ACL_DST); ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); } -void GpuCkwResize::do_nearest_neighbor_resize(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const +void GpuCkwResize::do_nearest_neighbor_resize(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const { const size_t width_idx = get_data_layout_dimension_index(_dst->data_layout(), DataLayoutDimension::WIDTH); const size_t height_idx = get_data_layout_dimension_index(_dst->data_layout(), DataLayoutDimension::HEIGHT); @@ -72,12 +68,16 @@ void GpuCkwResize::do_nearest_neighbor_resize(const ComponentGroup &comp_group, const int32_t m0 = root_window.y().step(); const int32_t partial_n0 = _dst->dimension(0) % n0; - GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src"); - GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); + GpuCkwComponentArgument *src = + vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src"); + GpuCkwComponentArgument *dst = + vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); // Constants - const float scale_x = scale_utils::calculate_resize_ratio(_src->dimension(width_idx), _dst->dimension(width_idx), _attributes.align_corners()); - const float scale_y = scale_utils::calculate_resize_ratio(_src->dimension(height_idx), _dst->dimension(height_idx), _attributes.align_corners()); + const float scale_x = scale_utils::calculate_resize_ratio(_src->dimension(width_idx), _dst->dimension(width_idx), + _attributes.align_corners()); + const float scale_y = scale_utils::calculate_resize_ratio(_src->dimension(height_idx), _dst->dimension(height_idx), + _attributes.align_corners()); const auto &tile_scale_x = writer->declare_tile("scale_x", scale_x); const auto &tile_scale_y = writer->declare_tile("scale_y", scale_y); const auto &tile_0 = writer->declare_tile("0", 0); @@ -112,7 +112,7 @@ void GpuCkwResize::do_nearest_neighbor_resize(const ComponentGroup &comp_group, const auto &tile_xi_f = writer->declare_tile("xi_f", ckw::DataType::Fp32); const auto &tile_yi_f = writer->declare_tile("yi_f", ckw::DataType::Fp32); - switch(_attributes.sampling_policy()) + switch (_attributes.sampling_policy()) { case SamplingPolicy::TOP_LEFT: // xi_f = (xo * scale_x) @@ -138,7 +138,7 @@ void GpuCkwResize::do_nearest_neighbor_resize(const ComponentGroup &comp_group, ARM_COMPUTE_ERROR("Unsupported sampling policy"); } - if(_attributes.align_corners()) + if (_attributes.align_corners()) { writer->op_unary_elementwise_function(tile_xi_f, UnaryFunction::Round, tile_xi_f); writer->op_unary_elementwise_function(tile_yi_f, UnaryFunction::Round, tile_yi_f); @@ -161,8 +161,10 @@ void GpuCkwResize::do_nearest_neighbor_resize(const ComponentGroup &comp_group, auto &tile_xi0 = writer->declare_tile("xi0", ckw::DataType::Int32); auto &tile_yi0 = writer->declare_tile("yi0", ckw::DataType::Int32); - writer->op_ternary_elementwise_function(tile_xi0, TernaryFunction::Clamp, tile_xi_f_int, tile_0, tile_src_w_minus_1); - writer->op_ternary_elementwise_function(tile_yi0, TernaryFunction::Clamp, tile_yi_f_int, tile_0, tile_src_h_minus_1); + writer->op_ternary_elementwise_function(tile_xi0, TernaryFunction::Clamp, tile_xi_f_int, tile_0, + tile_src_w_minus_1); + writer->op_ternary_elementwise_function(tile_yi0, TernaryFunction::Clamp, tile_yi_f_int, tile_0, + tile_src_h_minus_1); TensorTileSampler src_sampler; src_sampler.x(tile_co); @@ -199,7 +201,9 @@ void GpuCkwResize::do_nearest_neighbor_resize(const ComponentGroup &comp_group, writer->op_assign(tile_dst, tile_src); } -void GpuCkwResize::do_bilinear_resize(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const +void GpuCkwResize::do_bilinear_resize(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const { const size_t width_idx = get_data_layout_dimension_index(_dst->data_layout(), DataLayoutDimension::WIDTH); const size_t height_idx = get_data_layout_dimension_index(_dst->data_layout(), DataLayoutDimension::HEIGHT); @@ -209,12 +213,16 @@ void GpuCkwResize::do_bilinear_resize(const ComponentGroup &comp_group, GpuCkwVa const int32_t m0 = root_window.y().step(); const int32_t partial_n0 = _dst->dimension(0) % n0; - GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src"); - GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); + GpuCkwComponentArgument *src = + vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src"); + GpuCkwComponentArgument *dst = + vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); // Constants - const float scale_x = scale_utils::calculate_resize_ratio(_src->dimension(width_idx), _dst->dimension(width_idx), _attributes.align_corners()); - const float scale_y = scale_utils::calculate_resize_ratio(_src->dimension(height_idx), _dst->dimension(height_idx), _attributes.align_corners()); + const float scale_x = scale_utils::calculate_resize_ratio(_src->dimension(width_idx), _dst->dimension(width_idx), + _attributes.align_corners()); + const float scale_y = scale_utils::calculate_resize_ratio(_src->dimension(height_idx), _dst->dimension(height_idx), + _attributes.align_corners()); const auto &tile_scale_x = writer->declare_tile("scale_x", scale_x); const auto &tile_scale_y = writer->declare_tile("scale_y", scale_y); const auto &tile_0 = writer->declare_tile("0", 0); @@ -251,7 +259,7 @@ void GpuCkwResize::do_bilinear_resize(const ComponentGroup &comp_group, GpuCkwVa const auto &tile_xi_f = writer->declare_tile("xi_f", ckw::DataType::Fp32); const auto &tile_yi_f = writer->declare_tile("yi_f", ckw::DataType::Fp32); - switch(_attributes.sampling_policy()) + switch (_attributes.sampling_policy()) { case SamplingPolicy::TOP_LEFT: // xi_f = (xo * scale_x) @@ -312,8 +320,10 @@ void GpuCkwResize::do_bilinear_resize(const ComponentGroup &comp_group, GpuCkwVa writer->op_ternary_elementwise_function(tile_xi0, TernaryFunction::Clamp, tile_xi, tile_0, tile_src_w_minus_1); writer->op_ternary_elementwise_function(tile_yi0, TernaryFunction::Clamp, tile_yi, tile_0, tile_src_h_minus_1); - writer->op_ternary_elementwise_function(tile_xi1, TernaryFunction::Clamp, tile_xi_plus_1, tile_0, tile_src_w_minus_1); - writer->op_ternary_elementwise_function(tile_yi1, TernaryFunction::Clamp, tile_yi_plus_1, tile_0, tile_src_h_minus_1); + writer->op_ternary_elementwise_function(tile_xi1, TernaryFunction::Clamp, tile_xi_plus_1, tile_0, + tile_src_w_minus_1); + writer->op_ternary_elementwise_function(tile_yi1, TernaryFunction::Clamp, tile_yi_plus_1, tile_0, + tile_src_h_minus_1); TensorTileSampler in_sampler; in_sampler.x(tile_co); @@ -388,7 +398,7 @@ void GpuCkwResize::do_bilinear_resize(const ComponentGroup &comp_group, GpuCkwVa writer->op_binary_expression(tile_a1, tile_yi_f, BinaryOp::Sub, tile_yi_float); writer->op_binary_expression(tile_b1, tile_1, BinaryOp::Sub, tile_a1); - if(is_data_type_float(_src->data_type())) + if (is_data_type_float(_src->data_type())) { // Cast weights to source type const auto &tile_a_src_type = writer->declare_tile("a_src_t", to_ckw(_src->data_type())); @@ -461,9 +471,11 @@ void GpuCkwResize::do_bilinear_resize(const ComponentGroup &comp_group, GpuCkwVa } } -void GpuCkwResize::write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const +void GpuCkwResize::write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const { - switch(_attributes.interpolation_policy()) + switch (_attributes.interpolation_policy()) { case InterpolationPolicy::NEAREST_NEIGHBOR: do_nearest_neighbor_resize(comp_group, vtable, writer); diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp index 8917391537..889706b0c0 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp @@ -24,10 +24,12 @@ #include "GpuCkwStore.h" #include "arm_compute/core/Error.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" + #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" + #include namespace arm_compute @@ -37,12 +39,14 @@ namespace experimental namespace dynamic_fusion { GpuCkwStore::GpuCkwStore(ComponentId id, const ArgumentPack &tensors) - : IGpuCkwComponentDriver{ id, tensors }, _src{}, _dst{} + : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); } -void GpuCkwStore::write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const +void GpuCkwStore::write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const { auto src = vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src"); auto dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h index 8e35651caf..f1f0e6747b 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h @@ -48,8 +48,10 @@ public: /** Destructor */ ~GpuCkwStore() override = default; // Inherited methods overriden: - virtual void write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const override; - std::string get_name(const ComponentGroup &comp_group) const override; + virtual void write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const override; + std::string get_name(const ComponentGroup &comp_group) const override; private: const ITensorInfo *_src; diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h index e2b8584b99..6ba2b2f651 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h @@ -26,6 +26,7 @@ #include "arm_compute/core/utils/misc/Utility.h" #include "ckw/TensorTileSampler.h" + #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" @@ -44,9 +45,14 @@ using SamplerCreator = std::functionhas_tile()) + if (!src->has_tile()) { const auto sampler = create_sampler(writer, m0, n0); writer->op_load_once(src, sampler); @@ -61,7 +67,7 @@ inline void load_src_dst_tiles_and_prepare_sampler(GpuCkwScopedKernelWriter &wri const auto &sampler = src->tile_sampler(); // Prepare the output tile. - if(!dst->has_tile()) + if (!dst->has_tile()) { auto &tile = writer->declare_tile("dst_tile", src_tile.tile_info()); dst->init_virtual_tensor(tile, sampler); @@ -78,7 +84,13 @@ inline void load_src_dst_tiles_and_prepare_sampler(GpuCkwScopedKernelWriter &wri * @param[in] prefix Prefix to all the tiles declared within this function * @param[in] const_0 Constant tile of value 0 */ -inline void get_coord(GpuCkwScopedKernelWriter writer, TileOperand &coord, const TileOperand &gid, int32_t step_v, int32_t leftover_step_v, const std::string &prefix, const TileOperand &const_0) +inline void get_coord(GpuCkwScopedKernelWriter writer, + TileOperand &coord, + const TileOperand &gid, + int32_t step_v, + int32_t leftover_step_v, + const std::string &prefix, + const TileOperand &const_0) { auto &step = writer->declare_tile(prefix + "step", step_v); auto &leftover_step = writer->declare_tile(prefix + "leftover_step", leftover_step_v); @@ -122,8 +134,15 @@ inline void get_coord(GpuCkwScopedKernelWriter writer, TileOperand &coord, const * * @return TensorTileSampler */ -inline TensorTileSampler create_boundary_aware_2d_sampler(GpuCkwScopedKernelWriter writer, TileOperand &gid_0, TileOperand &gid_1, int32_t dim0_v, int32_t dim1_v, int32_t n0_v, int32_t m0_v, - const std::string prefix, TileOperand &const_0) +inline TensorTileSampler create_boundary_aware_2d_sampler(GpuCkwScopedKernelWriter writer, + TileOperand &gid_0, + TileOperand &gid_1, + int32_t dim0_v, + int32_t dim1_v, + int32_t n0_v, + int32_t m0_v, + const std::string prefix, + TileOperand &const_0) { // Clamp tile size [n0, m0] against dimension [dim0, dim1] // This is needed to: diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h index 34b1283add..5da317bf38 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h @@ -28,6 +28,7 @@ #include "arm_compute/core/TensorShape.h" #include "arm_compute/core/Types.h" #include "ckw/TensorInfo.h" + #include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" namespace arm_compute @@ -38,7 +39,7 @@ namespace dynamic_fusion { inline ckw::DataType to_ckw(DataType dt) { - switch(dt) + switch (dt) { case DataType::F32: return ckw::DataType::Fp32; @@ -65,21 +66,16 @@ inline ckw::DataType to_ckw(DataType dt) inline ckw::TensorShape to_ckw(const TensorShape &shape) { - ARM_COMPUTE_ERROR_ON(shape.num_max_dimensions < std::tuple_size {}); - ARM_COMPUTE_ERROR_ON(std::tuple_size {} != 5); + ARM_COMPUTE_ERROR_ON(shape.num_max_dimensions < std::tuple_size{}); + ARM_COMPUTE_ERROR_ON(std::tuple_size{} != 5); /// NOTE: Overflow danger. Use size_t? - return ckw::TensorShape - { - static_cast(shape[0]), - static_cast(shape[1]), - static_cast(shape[2]), - static_cast(shape[3]), - static_cast(shape[4]) - }; + return ckw::TensorShape{static_cast(shape[0]), static_cast(shape[1]), + static_cast(shape[2]), static_cast(shape[3]), + static_cast(shape[4])}; } inline ckw::TensorDataLayout to_ckw(DataLayout dl) { - switch(dl) + switch (dl) { case DataLayout::NHWC: return ckw::TensorDataLayout::Nhwc; @@ -91,18 +87,13 @@ inline ckw::TensorDataLayout to_ckw(DataLayout dl) } inline ckw::TensorInfo to_ckw(const ITensorInfo &tensor_info) { - return ckw::TensorInfo - { - to_ckw(tensor_info.data_type()), - to_ckw(tensor_info.tensor_shape()), - to_ckw(tensor_info.data_layout()), - tensor_info.id() - }; + return ckw::TensorInfo{to_ckw(tensor_info.data_type()), to_ckw(tensor_info.tensor_shape()), + to_ckw(tensor_info.data_layout()), tensor_info.id()}; } inline TensorComponentType from_ckw(const ckw::TensorComponentType &component) { - switch(component) + switch (component) { case ckw::TensorComponentType::OffsetFirstElement: return TensorComponentType::OffsetFirstElement; @@ -142,7 +133,7 @@ inline TensorComponentType from_ckw(const ckw::TensorComponentType &component) inline ckw::TensorStorageType to_ckw(const TensorStorageType &storage) { - switch(storage) + switch (storage) { case TensorStorageType::ClBufferUint8Ptr: return ckw::TensorStorageType::BufferUint8Ptr; @@ -159,7 +150,7 @@ inline ckw::TensorStorageType to_ckw(const TensorStorageType &storage) } inline TensorStorageType from_ckw(const ckw::TensorStorageType &storage) { - switch(storage) + switch (storage) { case ckw::TensorStorageType::BufferUint8Ptr: return TensorStorageType::ClBufferUint8Ptr; diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h index 9cb022fc10..0cba258940 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h @@ -25,6 +25,7 @@ #define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_ELEMENTWISEBINARY #include "ckw/types/Operators.h" + #include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h" namespace arm_compute @@ -35,7 +36,7 @@ namespace dynamic_fusion { inline ckw::BinaryOp to_ckw(const ElementwiseBinaryCommonAttributes &attributes) { - switch(attributes.operation()) + switch (attributes.operation()) { case ElementwiseBinaryCommonAttributes::ElementwiseOp::Add: return ckw::BinaryOp::Add; diff --git a/src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h b/src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h index f7f0029618..ee109a7e2b 100644 --- a/src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h +++ b/src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h @@ -24,8 +24,9 @@ #ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_GPUKERNELCOMPONENTFACTORY #define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_GPUKERNELCOMPONENTFACTORY -#include "Types.h" #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" + +#include "Types.h" #include namespace arm_compute @@ -49,13 +50,13 @@ public: * @return std::unique_ptr */ template - std::unique_ptr create(Args &&... args) + std::unique_ptr create(Args &&...args) { return std::make_unique(_count++, std::forward(args)...); } private: - ComponentId _count{ 0 }; + ComponentId _count{0}; }; } // namespace dynamic_fusion diff --git a/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h b/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h index af766a7ece..4b8eea2f57 100644 --- a/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h +++ b/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h @@ -24,11 +24,11 @@ #ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_IGPUKERNELCOMPONENT #define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_IGPUKERNELCOMPONENT -#include "Types.h" - #include "src/dynamic_fusion/sketch/ArgumentPack.h" #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h" +#include "Types.h" + namespace arm_compute { namespace experimental @@ -76,13 +76,8 @@ public: * @param[in] properties Kernel component properties * @param[in] tensors Tensor arguments to the components */ - IGpuKernelComponent( - ComponentId id, - const Properties &properties, - const ArgumentPack &tensors) - : _id{ id }, - _properties{ properties }, - _tensors{ tensors } + IGpuKernelComponent(ComponentId id, const Properties &properties, const ArgumentPack &tensors) + : _id{id}, _properties{properties}, _tensors{tensors} { } /** Destructor */ @@ -117,7 +112,7 @@ public: virtual GpuComponentType type() const = 0; private: - ComponentId _id{ -1 }; + ComponentId _id{-1}; Properties _properties{}; ArgumentPack _tensors{}; }; diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.cpp index c41257d18c..fdf528a65d 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.cpp +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.cpp @@ -68,17 +68,11 @@ ClComponentActivation::ClComponentActivation(ComponentId const IGpuKernelComponent::Properties &properties, const ArgumentPack &tensors, const Attributes &attributes) - : IGpuKernelComponent{ id, properties, tensors }, + : IGpuKernelComponent{id, properties, tensors}, #ifndef ACL_INTERNAL_TEST_CKW_IN_DF - _component_writer -{ - std::make_unique(id, tensors, attributes) -} + _component_writer{std::make_unique(id, tensors, attributes)} #else //ACL_INTERNAL_TEST_CKW_IN_DF - _component_writer -{ - std::make_unique(id, tensors, attributes) -} + _component_writer{std::make_unique(id, tensors, attributes)} #endif //ACL_INTERNAL_TEST_CKW_IN_DF { } diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h index 9b090af988..02c854356a 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h @@ -25,9 +25,8 @@ #define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTACTIVATION #include "arm_compute/function_info/ActivationLayerInfo.h" -#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" -#include "arm_compute/function_info/ActivationLayerInfo.h" +#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" namespace arm_compute { @@ -79,20 +78,17 @@ public: * |F16 |F16 | * |F32 |F32 | */ - static Status validate( - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes); + static Status + validate(const Properties &properties, const ArgumentPack &tensors, const Attributes &attributes); /** Constructor * * Similar to @ref ClComponentActivation::validate() */ - ClComponentActivation( - ComponentId id, - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes); + ClComponentActivation(ComponentId id, + const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes); /** Destructor */ ~ClComponentActivation() override; diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.cpp index 635869f817..b1636795a3 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.cpp +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.cpp @@ -24,6 +24,7 @@ #include "ClComponentCast.h" #include "arm_compute/core/Error.h" + #include "src/core/CL/CLValidate.h" #include "src/dynamic_fusion/sketch/ArgumentPack.h" #ifndef ACL_INTERNAL_TEST_CKW_IN_DF @@ -38,11 +39,10 @@ namespace experimental { namespace dynamic_fusion { -Status ClComponentCast::validate( - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes, - const Settings &settings) +Status ClComponentCast::validate(const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings) { ARM_COMPUTE_UNUSED(properties, attributes, settings); @@ -53,13 +53,15 @@ Status ClComponentCast::validate( ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(dst); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON(src == dst); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == attributes.data_type(), "input and target data types should be different"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == attributes.data_type(), + "input and target data types should be different"); // Validate in case of configured dst - if(dst->total_size() > 0) + if (dst->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != attributes.data_type(), "dst and target data types should be same"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != attributes.data_type(), + "dst and target data types should be same"); } return Status{}; @@ -69,17 +71,11 @@ ClComponentCast::ClComponentCast(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes, const Settings &settings) - : IGpuKernelComponent{ id, properties, tensors }, + : IGpuKernelComponent{id, properties, tensors}, #ifndef ACL_INTERNAL_TEST_CKW_IN_DF - _component_writer -{ - std::make_unique(id, tensors, attributes) -} + _component_writer{std::make_unique(id, tensors, attributes)} #else //ACL_INTERNAL_TEST_CKW_IN_DF - _component_writer -{ - std::make_unique(id, tensors, attributes) -} + _component_writer{std::make_unique(id, tensors, attributes)} #endif //ACL_INTERNAL_TEST_CKW_IN_DF { ARM_COMPUTE_UNUSED(attributes, settings); diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h index 37b8cbb6c9..ed77b1203b 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h @@ -25,6 +25,7 @@ #define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTCAST #include "arm_compute/dynamic_fusion/sketch/attributes/CastAttributes.h" + #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" namespace arm_compute @@ -93,11 +94,10 @@ public: * |F16 | U8, S8, U16, S16, U32, S32, F32 | * |F32 | U8, S8, U16, S16, U32, S32, F16 | */ - static Status validate( - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes, - const Settings &settings); + static Status validate(const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings); /** Constructor * diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp index 5626093079..d95e0be1f2 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h" + #include "src/core/CL/CLValidate.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.h" @@ -103,11 +104,10 @@ unsigned int Settings::m0() const return _m0; } -Status ClComponentDepthwiseConv2d::validate( - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes, - const Settings &settings) +Status ClComponentDepthwiseConv2d::validate(const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings) { ARM_COMPUTE_UNUSED(properties, settings); const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); @@ -121,7 +121,7 @@ Status ClComponentDepthwiseConv2d::validate( // Matching data type ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, wei); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - if(bia != nullptr) + if (bia != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bia); } @@ -129,7 +129,7 @@ Status ClComponentDepthwiseConv2d::validate( // Matching data layout ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, wei); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst); - if(bia != nullptr) + if (bia != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, bia); } @@ -138,7 +138,7 @@ Status ClComponentDepthwiseConv2d::validate( ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() == 0); ARM_COMPUTE_RETURN_ERROR_ON(wei->tensor_shape().total_size() == 0); ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0); - if(bia != nullptr) + if (bia != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON(bia->tensor_shape().total_size() == 0); } @@ -148,16 +148,17 @@ Status ClComponentDepthwiseConv2d::validate( const DataLayout data_layout = src->data_layout(); const size_t channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - ARM_COMPUTE_RETURN_ERROR_ON(wei->dimension(channel_idx) != (src->dimension(channel_idx) * attributes.depth_multiplier())); + ARM_COMPUTE_RETURN_ERROR_ON(wei->dimension(channel_idx) != + (src->dimension(channel_idx) * attributes.depth_multiplier())); ARM_COMPUTE_RETURN_ERROR_ON_MSG(wei->num_dimensions() > 3, "Weights can be at most 3 dimensional"); // dst shape is correct - const PadStrideInfo pad_stride_info = PadStrideInfo(attributes.stride().x(), attributes.stride().y(), - attributes.pad().left, attributes.pad().right, - attributes.pad().top, attributes.pad().bottom, - attributes.dimension_rounding_type()); - const ConvolutionInfo conv_info{ pad_stride_info, attributes.depth_multiplier(), ActivationLayerInfo(), attributes.dilation() }; - const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *wei, conv_info); + const PadStrideInfo pad_stride_info = + PadStrideInfo(attributes.stride().x(), attributes.stride().y(), attributes.pad().left, attributes.pad().right, + attributes.pad().top, attributes.pad().bottom, attributes.dimension_rounding_type()); + const ConvolutionInfo conv_info{pad_stride_info, attributes.depth_multiplier(), ActivationLayerInfo(), + attributes.dilation()}; + const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *wei, conv_info); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), output_shape); @@ -168,19 +169,22 @@ Status ClComponentDepthwiseConv2d::validate( ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().first > 1 && settings.m0() != 1); ARM_COMPUTE_RETURN_ERROR_ON(conv_info.dilation.x() > 1 && settings.m0() != 1); - if(conv_info.depth_multiplier > 1 && settings.n0() > 1) + if (conv_info.depth_multiplier > 1 && settings.n0() > 1) { ARM_COMPUTE_RETURN_ERROR_ON((conv_info.depth_multiplier % settings.n0()) != 0); } // Check export weights to cl image - ARM_COMPUTE_RETURN_ERROR_ON_MSG((settings.export_weights_to_cl_image() == true) && (export_to_cl_image(wei) == false), "Weights cannot be exported to cl_image!"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((settings.export_weights_to_cl_image() == true) && + (export_to_cl_image(wei) == false), + "Weights cannot be exported to cl_image!"); ARM_COMPUTE_RETURN_ERROR_ON((settings.export_weights_to_cl_image() == true) && ((settings.n0() % 4) != 0)); - ARM_COMPUTE_RETURN_ERROR_ON(wei->dimension(channel_idx) != (src->dimension(channel_idx) * conv_info.depth_multiplier)); + ARM_COMPUTE_RETURN_ERROR_ON(wei->dimension(channel_idx) != + (src->dimension(channel_idx) * conv_info.depth_multiplier)); // bia shape is correct - if(bia != nullptr) + if (bia != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(bia->dimension(0) != output_shape[channel_idx], "Biases size and number of dst feature maps should match"); @@ -198,14 +202,13 @@ Status ClComponentDepthwiseConv2d::validate( return Status{}; } -ClComponentDepthwiseConv2d::ClComponentDepthwiseConv2d( - ComponentId id, - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes, - const Settings &settings) - : IGpuKernelComponent{ id, properties, tensors }, - _component_writer{ std::make_unique(id, tensors, attributes, settings) } +ClComponentDepthwiseConv2d::ClComponentDepthwiseConv2d(ComponentId id, + const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings) + : IGpuKernelComponent{id, properties, tensors}, + _component_writer{std::make_unique(id, tensors, attributes, settings)} { } ClComponentDepthwiseConv2d::~ClComponentDepthwiseConv2d() diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h index 0e2b5f14cb..b3e1bd222d 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h @@ -25,7 +25,9 @@ #define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDEPTHWISECONV2D #include "arm_compute/core/Error.h" + #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" + #include namespace arm_compute @@ -77,12 +79,12 @@ public: unsigned int m0() const; private: - bool _export_input_to_cl_image{ false }; /**< Export input to cl_image */ - bool _export_weights_to_cl_image{ false }; /**< Export the weights to cl_image */ - bool _fast_relaxed_math{ true }; /**< Enable/disable -cl-fast-relaxed-math flag */ - bool _is_fma_available{ false }; /**< Is fma instruction available */ - unsigned int _n0{ 0 }; /**< Number of columns processed by each thread */ - unsigned int _m0{ 0 }; /**< Number of rows processed by each thread */ + bool _export_input_to_cl_image{false}; /**< Export input to cl_image */ + bool _export_weights_to_cl_image{false}; /**< Export the weights to cl_image */ + bool _fast_relaxed_math{true}; /**< Enable/disable -cl-fast-relaxed-math flag */ + bool _is_fma_available{false}; /**< Is fma instruction available */ + unsigned int _n0{0}; /**< Number of columns processed by each thread */ + unsigned int _m0{0}; /**< Number of rows processed by each thread */ }; /** Forward declaration */ @@ -127,22 +129,20 @@ public: * |F16 |F16 |F16 |F16 | * |F32 |F32 |F32 |F32 | */ - static Status validate( - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes, - const Settings &settings); + static Status validate(const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings); /** Constructor * * Similar to @ref ClComponentDepthwiseConv2d::validate() */ - ClComponentDepthwiseConv2d( - ComponentId id, - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes, - const Settings &settings); + ClComponentDepthwiseConv2d(ComponentId id, + const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings); /** Destructor */ ~ClComponentDepthwiseConv2d() override; diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp index a713c82003..98f3d6a882 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp @@ -23,8 +23,8 @@ */ #include "ClComponentDirectConv2d.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h" #include "src/core/CL/CLValidate.h" @@ -57,7 +57,8 @@ bool ClComponentDirectConv2dSettings::fast_relaxed_math() const return _fast_relaxed_math; } -ClComponentDirectConv2dSettings &ClComponentDirectConv2dSettings::direct_conv_descriptor(const DirectConvComputeKernelInfo &desc) +ClComponentDirectConv2dSettings & +ClComponentDirectConv2dSettings::direct_conv_descriptor(const DirectConvComputeKernelInfo &desc) { _desc = desc; return *this; @@ -68,11 +69,10 @@ DirectConvComputeKernelInfo ClComponentDirectConv2dSettings::direct_conv_descrip return _desc; } -Status ClComponentDirectConv2d::validate( - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes, - const Settings &settings) +Status ClComponentDirectConv2d::validate(const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings) { ARM_COMPUTE_UNUSED(properties); const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); @@ -86,7 +86,7 @@ Status ClComponentDirectConv2d::validate( // Matching data type ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, wei); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - if(bia != nullptr) + if (bia != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bia); } @@ -94,7 +94,7 @@ Status ClComponentDirectConv2d::validate( // Matching data layout ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, wei); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst); - if(bia != nullptr) + if (bia != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, bia); } @@ -103,7 +103,7 @@ Status ClComponentDirectConv2d::validate( ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() == 0); ARM_COMPUTE_RETURN_ERROR_ON(wei->tensor_shape().total_size() == 0); ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0); - if(bia != nullptr) + if (bia != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON(bia->tensor_shape().total_size() == 0); } @@ -112,22 +112,23 @@ Status ClComponentDirectConv2d::validate( // wei shape is correct const DataLayout data_layout = src->data_layout(); const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(wei->dimension(channel_idx) != src->dimension(channel_idx), "Weights feature map dimension should match the respective src's one"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(wei->dimension(channel_idx) != src->dimension(channel_idx), + "Weights feature map dimension should match the respective src's one"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(wei->num_dimensions() > 4, "Weights can be at most 4 dimensional"); // dst shape is correct - PadStrideInfo legacy_pad_stride(attributes.stride().x(), attributes.stride().y(), attributes.pad().left, attributes.pad().right, attributes.pad().top, - attributes.pad().bottom, DimensionRoundingType{}); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), - misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, legacy_pad_stride)); + PadStrideInfo legacy_pad_stride(attributes.stride().x(), attributes.stride().y(), attributes.pad().left, + attributes.pad().right, attributes.pad().top, attributes.pad().bottom, + DimensionRoundingType{}); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS( + dst->tensor_shape(), misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, legacy_pad_stride)); // bia shape is correct - if(bia != nullptr) + if (bia != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(bia->dimension(0) != wei->dimension(3), "Biases size and number of dst feature maps should match"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(bia->num_dimensions() > 1, - "Biases should be one dimensional"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(bia->num_dimensions() > 1, "Biases should be one dimensional"); } // 2. Check support level @@ -137,24 +138,25 @@ Status ClComponentDirectConv2d::validate( ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC); const auto desc = settings.direct_conv_descriptor(); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.n0 != 1 && desc.n0 != 2 && desc.n0 != 3 && desc.n0 != 4 && desc.n0 != 8 && desc.n0 != 16, + ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.n0 != 1 && desc.n0 != 2 && desc.n0 != 3 && desc.n0 != 4 && desc.n0 != 8 && + desc.n0 != 16, "N0 can only be: 1, 2, 3, 4, 8, and 16"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 1 && desc.k0 != 2 && desc.k0 != 3 && desc.k0 != 4 && desc.k0 != 8 && desc.k0 != 16, + ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 1 && desc.k0 != 2 && desc.k0 != 3 && desc.k0 != 4 && desc.k0 != 8 && + desc.k0 != 16, "K0 can only be: 1, 2, 3, 4, 8, and 16"); return Status{}; } -ClComponentDirectConv2d::ClComponentDirectConv2d( - ComponentId id, - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes, - const Settings &settings) - : IGpuKernelComponent{ id, properties, tensors }, +ClComponentDirectConv2d::ClComponentDirectConv2d(ComponentId id, + const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings) + : IGpuKernelComponent{id, properties, tensors}, #ifndef ACL_INTERNAL_TEST_CKW_IN_DF - _component_writer{ std::make_unique(id, tensors, attributes, settings) } -#else // ACL_INTERNAL_TEST_CKW_IN_DF - _component_writer{ std::make_unique(id, tensors, attributes, settings) } + _component_writer{std::make_unique(id, tensors, attributes, settings)} +#else // ACL_INTERNAL_TEST_CKW_IN_DF + _component_writer{std::make_unique(id, tensors, attributes, settings)} #endif // ACL_INTERNAL_TEST_CKW_IN_DF { } @@ -165,7 +167,7 @@ ClComponentDirectConv2d::~ClComponentDirectConv2d() #ifndef ACL_INTERNAL_TEST_CKW_IN_DF const IGpuTemplateComponentWriter *ClComponentDirectConv2d::template_writer() const -#else // ACL_INTERNAL_TEST_CKW_IN_DF +#else // ACL_INTERNAL_TEST_CKW_IN_DF const IGpuCkwComponentDriver *ClComponentDirectConv2d::ckw_component_driver() const #endif // ACL_INTERNAL_TEST_CKW_IN_DF { diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h index 24acb1b2c1..d6d9705d3c 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h @@ -26,7 +26,9 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/KernelDescriptors.h" + #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" + #include namespace arm_compute @@ -61,7 +63,7 @@ public: DirectConvComputeKernelInfo direct_conv_descriptor() const; private: - bool _fast_relaxed_math{ true }; + bool _fast_relaxed_math{true}; DirectConvComputeKernelInfo _desc{}; // Direct convolution descriptor }; @@ -111,22 +113,20 @@ public: * |F16 |F16 |F16 |F16 | * |F32 |F32 |F32 |F32 | */ - static Status validate( - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes, - const Settings &settings); + static Status validate(const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings); /** Constructor * * Similar to @ref ClComponentDirectConv2d::validate() */ - ClComponentDirectConv2d( - ComponentId id, - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes, - const Settings &settings); + ClComponentDirectConv2d(ComponentId id, + const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings); /** Destructor */ ~ClComponentDirectConv2d() override; @@ -142,7 +142,7 @@ public: #ifndef ACL_INTERNAL_TEST_CKW_IN_DF const IGpuTemplateComponentWriter *template_writer() const override; #else // ACL_INTERNAL_TEST_CKW_IN_DF - const IGpuCkwComponentDriver *ckw_component_driver() const override; + const IGpuCkwComponentDriver *ckw_component_driver() const override; #endif // ACL_INTERNAL_TEST_CKW_IN_DF /** Get component type */ GpuComponentType type() const override diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp index 88d729170c..5b136427e4 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp @@ -24,6 +24,7 @@ #include "ClComponentElementwiseBinary.h" #include "arm_compute/core/Validate.h" + #include "src/core/CL/CLValidate.h" #ifndef ACL_INTERNAL_TEST_CKW_IN_DF #include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.h" @@ -39,56 +40,55 @@ namespace dynamic_fusion { namespace { -std::set supported_ops -{ - ElementwiseBinaryCommonAttributes::ElementwiseOp::Add, - ElementwiseBinaryCommonAttributes::ElementwiseOp::Sub, - ElementwiseBinaryCommonAttributes::ElementwiseOp::Mul -}; +std::set supported_ops{ + ElementwiseBinaryCommonAttributes::ElementwiseOp::Add, ElementwiseBinaryCommonAttributes::ElementwiseOp::Sub, + ElementwiseBinaryCommonAttributes::ElementwiseOp::Mul}; } -Status ClComponentElementwiseBinary::validate(const ArgumentPack &tensors, const ElementwiseBinaryCommonAttributes &attributes) +Status ClComponentElementwiseBinary::validate(const ArgumentPack &tensors, + const ElementwiseBinaryCommonAttributes &attributes) { const auto lhs = tensors.get_const_tensor(TensorType::ACL_SRC_0); const auto rhs = tensors.get_const_tensor(TensorType::ACL_SRC_1); const auto dst = tensors.get_const_tensor(TensorType::ACL_DST_0); // Check operator type - ARM_COMPUTE_RETURN_ERROR_ON_MSG(supported_ops.find(attributes.operation()) == supported_ops.end(), "Provided Elementwise operation not supported."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(supported_ops.find(attributes.operation()) == supported_ops.end(), + "Provided Elementwise operation not supported."); // Check validity ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst); //Check data type for different elementwise operators - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16, DataType::S32, DataType::S16, DataType::U8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16, DataType::S32, + DataType::S16, DataType::U8); // dst shape is correct const TensorShape out_shape = TensorShape::broadcast_shape(lhs->tensor_shape(), rhs->tensor_shape()); ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), + "Wrong shape for dst."); const auto &lhs_shape = lhs->tensor_shape(); const auto &rhs_shape = rhs->tensor_shape(); const auto &dst_shape = dst->tensor_shape(); - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - detail::have_different_dimensions(lhs_shape, dst_shape, 0) && detail::have_different_dimensions(rhs_shape, dst_shape, 0), - "Only LHS or RHS can be broadcasting, not both."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(lhs_shape, dst_shape, 0) && + detail::have_different_dimensions(rhs_shape, dst_shape, 0), + "Only LHS or RHS can be broadcasting, not both."); // Dimension Y and Z are collapsed together in the current kernel implementation, // hence they cannot be independently broadcast or non-broadcast. // See: ClTemplateElementwiseBinary::get_window - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - (lhs_shape[1] != dst_shape[1] || rhs_shape[1] != dst_shape[1]) != (lhs_shape[2] != dst_shape[2] || rhs_shape[2] != dst_shape[2]), - "Dimension Y and Z must both be either broadcast or non-broadcast."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((lhs_shape[1] != dst_shape[1] || rhs_shape[1] != dst_shape[1]) != + (lhs_shape[2] != dst_shape[2] || rhs_shape[2] != dst_shape[2]), + "Dimension Y and Z must both be either broadcast or non-broadcast."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - detail::have_different_dimensions(lhs_shape, dst_shape, 3), - "LHS broadcast in dimension 3 or higher is not supported."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(lhs_shape, dst_shape, 3), + "LHS broadcast in dimension 3 or higher is not supported."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - detail::have_different_dimensions(rhs_shape, dst_shape, 3), - "RHS broadcast in dimension 3 or higher is not supported."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(rhs_shape, dst_shape, 3), + "RHS broadcast in dimension 3 or higher is not supported."); // Matching data type ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs); @@ -112,22 +112,15 @@ Status ClComponentElementwiseBinary::validate(const ArgumentPack &t ClComponentElementwiseBinary::~ClComponentElementwiseBinary() { } -ClComponentElementwiseBinary::ClComponentElementwiseBinary( - ComponentId id, - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes) - : IGpuKernelComponent{ id, properties, tensors }, +ClComponentElementwiseBinary::ClComponentElementwiseBinary(ComponentId id, + const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes) + : IGpuKernelComponent{id, properties, tensors}, #ifndef ACL_INTERNAL_TEST_CKW_IN_DF - _component_writer -{ - std::make_unique(id, tensors, attributes) -} + _component_writer{std::make_unique(id, tensors, attributes)} #else //ACL_INTERNAL_TEST_CKW_IN_DF - _component_writer -{ - std::make_unique(id, tensors, attributes) -} + _component_writer{std::make_unique(id, tensors, attributes)} #endif //ACL_INTERNAL_TEST_CKW_IN_DF { } diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h index f7175903d0..7589b9732c 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h @@ -82,17 +82,17 @@ public: * |S16 |S16 |S16 | * |U8 |U8 |U8 | */ - static Status validate(const ArgumentPack &tensors, const ElementwiseBinaryCommonAttributes &attributes); + static Status validate(const ArgumentPack &tensors, + const ElementwiseBinaryCommonAttributes &attributes); /** Constructor * * Similar to @ref ClComponentElementwiseBinary::validate() */ - ClComponentElementwiseBinary( - ComponentId id, - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes); + ClComponentElementwiseBinary(ComponentId id, + const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes); /** Destructor */ ~ClComponentElementwiseBinary() override; diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.cpp index 279c77e227..27c13bd654 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.cpp +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.cpp @@ -25,9 +25,10 @@ #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.h" #include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/dynamic_fusion/sketch/attributes/SoftmaxAttributes.h" + #include "src/core/CL/CLValidate.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.h" @@ -37,10 +38,9 @@ namespace experimental { namespace dynamic_fusion { -Status ClComponentLogits1DMaxShiftExpSum::validate( - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes) +Status ClComponentLogits1DMaxShiftExpSum::validate(const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes) { ARM_COMPUTE_UNUSED(properties, attributes); @@ -75,8 +75,8 @@ ClComponentLogits1DMaxShiftExpSum::ClComponentLogits1DMaxShiftExpSum(ComponentId const Properties &properties, const ArgumentPack &tensors, const Attributes &attributes) - : IGpuKernelComponent{ id, properties, tensors }, - _component_writer{ std::make_unique(id, tensors, attributes) } + : IGpuKernelComponent{id, properties, tensors}, + _component_writer{std::make_unique(id, tensors, attributes)} { } diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.h index b5db458248..91ab5de3b5 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.h +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.h @@ -25,6 +25,7 @@ #define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTLOGITS1DMAXSHIFTEXPSUM #include "arm_compute/dynamic_fusion/sketch/attributes/SoftmaxAttributes.h" + #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" namespace arm_compute @@ -89,10 +90,8 @@ public: * |F16 | F16 | F16 | * |F32 | F32 | F32 | */ - static Status validate( - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes); + static Status + validate(const Properties &properties, const ArgumentPack &tensors, const Attributes &attributes); /** Constructor * diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.cpp index 7864d56d29..fb2544385c 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.cpp +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.cpp @@ -25,9 +25,10 @@ #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.h" #include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/dynamic_fusion/sketch/attributes/SoftmaxAttributes.h" + #include "src/core/CL/CLValidate.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.h" @@ -37,10 +38,9 @@ namespace experimental { namespace dynamic_fusion { -Status ClComponentLogits1DNorm::validate( - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes) +Status ClComponentLogits1DNorm::validate(const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes) { ARM_COMPUTE_UNUSED(properties, attributes); @@ -77,8 +77,8 @@ ClComponentLogits1DNorm::ClComponentLogits1DNorm(ComponentId const Properties &properties, const ArgumentPack &tensors, const Attributes &attributes) - : IGpuKernelComponent{ id, properties, tensors }, - _component_writer{ std::make_unique(id, tensors, attributes) } + : IGpuKernelComponent{id, properties, tensors}, + _component_writer{std::make_unique(id, tensors, attributes)} { } diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.h index 5bd350b9bd..74c0273604 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.h +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.h @@ -25,6 +25,7 @@ #define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTLOGITS1DNORM #include "arm_compute/dynamic_fusion/sketch/attributes/SoftmaxAttributes.h" + #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" namespace arm_compute @@ -86,10 +87,8 @@ public: * |F16 | F16 | F16 | * |F32 | F32 | F32 | */ - static Status validate( - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes); + static Status + validate(const Properties &properties, const ArgumentPack &tensors, const Attributes &attributes); /** Constructor * diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp index d415769094..409b191df5 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp @@ -24,13 +24,15 @@ #include "ClComponentPool2d.h" #include "arm_compute/core/Error.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h" + #include "src/core/CL/CLValidate.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.h" #include "src/dynamic_fusion/utils/Utils.h" + #include namespace arm_compute @@ -39,23 +41,24 @@ namespace experimental { namespace dynamic_fusion { -Status ClComponentPool2d::validate( - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes, - const Settings &settings) +Status ClComponentPool2d::validate(const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings) { ARM_COMPUTE_UNUSED(properties); const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); const auto dst = tensors.get_const_tensor(TensorType::ACL_DST_0); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_ON_MSG((attributes.pool_type() != PoolingType::AVG && attributes.pool_type() != PoolingType::MAX), "Unsupported Pooling type"); + ARM_COMPUTE_ERROR_ON_MSG((attributes.pool_type() != PoolingType::AVG && attributes.pool_type() != PoolingType::MAX), + "Unsupported Pooling type"); // 1. Check validity // Check if pooling is valid - ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_region_entirely_outside_input(convert_pool_attr_to_pool_info(attributes, settings.mixed_precision())), - "Pooling region that is entirely outside input tensor is unsupported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + is_pool_region_entirely_outside_input(convert_pool_attr_to_pool_info(attributes, settings.mixed_precision())), + "Pooling region that is entirely outside input tensor is unsupported"); // Matching data type ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); @@ -70,8 +73,9 @@ Status ClComponentPool2d::validate( // Device requirements are met ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), - misc::shape_calculator::compute_pool_shape(*src, convert_pool_attr_to_pool_info(attributes, settings.mixed_precision()))); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS( + dst->tensor_shape(), misc::shape_calculator::compute_pool_shape( + *src, convert_pool_attr_to_pool_info(attributes, settings.mixed_precision()))); // 2. Check support level // Data type @@ -83,23 +87,16 @@ Status ClComponentPool2d::validate( return Status{}; } -ClComponentPool2d::ClComponentPool2d( - ComponentId id, - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes, - const Settings &settings) - : IGpuKernelComponent{ id, properties, tensors }, +ClComponentPool2d::ClComponentPool2d(ComponentId id, + const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings) + : IGpuKernelComponent{id, properties, tensors}, #ifndef ACL_INTERNAL_TEST_CKW_IN_DF - _component_writer -{ - std::make_unique(id, tensors, attributes, settings) -} + _component_writer{std::make_unique(id, tensors, attributes, settings)} #else //ACL_INTERNAL_TEST_CKW_IN_DF - _component_writer -{ - std::make_unique(id, tensors, attributes, settings) -} + _component_writer{std::make_unique(id, tensors, attributes, settings)} #endif //ACL_INTERNAL_TEST_CKW_IN_DF { } diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h index 6814bf9243..98fed65004 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h @@ -25,6 +25,7 @@ #define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTPOOL2D_H #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h" + #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" namespace arm_compute @@ -82,11 +83,10 @@ public: * |F16 |F16 | * |F32 |F32 | */ - static Status validate( - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes, - const Settings &settings); + static Status validate(const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings); /** Constructor * @@ -96,12 +96,11 @@ public: * @param[in] attributes Component attributes * @param[in] settings Component settings */ - ClComponentPool2d( - ComponentId id, - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes, - const Settings &settings); + ClComponentPool2d(ComponentId id, + const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings); /** Destructor */ ~ClComponentPool2d() override; diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.cpp index 66e2ee6956..0ece9de970 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.cpp +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.cpp @@ -22,8 +22,10 @@ * SOFTWARE. */ #include "ClComponentReshape.h" + #include "arm_compute/core/Error.h" #include "arm_compute/core/Validate.h" + #include "src/core/CL/CLValidate.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.h" @@ -49,12 +51,10 @@ Status ClComponentReshape::validate(const ArgumentPack &tensors) return Status{}; } -ClComponentReshape::ClComponentReshape( - ComponentId id, - const Properties &properties, - const ArgumentPack &tensors) - : IGpuKernelComponent{ id, properties, tensors }, - _component_writer{ std::make_unique(id, tensors) } +ClComponentReshape::ClComponentReshape(ComponentId id, + const Properties &properties, + const ArgumentPack &tensors) + : IGpuKernelComponent{id, properties, tensors}, _component_writer{std::make_unique(id, tensors)} { } ClComponentReshape::~ClComponentReshape() diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h index f8d165b4c8..78163d6603 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h @@ -73,10 +73,7 @@ public: * @param[in] properties Component properties @ref Properties * @param[in] tensors Tensor arguments to the component */ - ClComponentReshape( - ComponentId id, - const Properties &properties, - const ArgumentPack &tensors); + ClComponentReshape(ComponentId id, const Properties &properties, const ArgumentPack &tensors); /** Destructor */ ~ClComponentReshape() override; diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.cpp index 6df1d9b3db..b05eb04698 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.cpp +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.cpp @@ -66,7 +66,9 @@ Status ClComponentResize::validate(const IGpuKernelComponent::Properties &proper ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); // Align corners and sampling policy conformance - ARM_COMPUTE_RETURN_ERROR_ON(attributes.align_corners() && !arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(attributes.sampling_policy())); + ARM_COMPUTE_RETURN_ERROR_ON( + attributes.align_corners() && + !arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(attributes.sampling_policy())); // All tensor infos are initialized ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() == 0); @@ -79,11 +81,11 @@ ClComponentResize::ClComponentResize(ComponentId id, const IGpuKernelComponent::Properties &properties, const ArgumentPack &tensors, const ClComponentResize::Attributes &attributes) - : IGpuKernelComponent{ id, properties, tensors }, + : IGpuKernelComponent{id, properties, tensors}, #ifndef ACL_INTERNAL_TEST_CKW_IN_DF - _component_writer{ std::make_unique(id, tensors, attributes) } -#else // ACL_INTERNAL_TEST_CKW_IN_DF - _component_writer{ std::make_unique(id, tensors, attributes) } + _component_writer{std::make_unique(id, tensors, attributes)} +#else // ACL_INTERNAL_TEST_CKW_IN_DF + _component_writer{std::make_unique(id, tensors, attributes)} #endif // ACL_INTERNAL_TEST_CKW_IN_DF { } @@ -94,7 +96,7 @@ ClComponentResize::~ClComponentResize() #ifndef ACL_INTERNAL_TEST_CKW_IN_DF const IGpuTemplateComponentWriter *ClComponentResize::template_writer() const -#else // ACL_INTERNAL_TEST_CKW_IN_DF +#else // ACL_INTERNAL_TEST_CKW_IN_DF const IGpuCkwComponentDriver *ClComponentResize::ckw_component_driver() const #endif // ACL_INTERNAL_TEST_CKW_IN_DF { diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h index 474524f8fc..29276c3257 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h @@ -26,6 +26,7 @@ #define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTRESIZE #include "arm_compute/dynamic_fusion/sketch/attributes/ResizeAttributes.h" + #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" namespace arm_compute @@ -43,7 +44,7 @@ class ArgumentPack; /** Forward declaration */ #ifndef ACL_INTERNAL_TEST_CKW_IN_DF class ClTemplateResize; -#else // ACL_INTERNAL_TEST_CKW_IN_DF +#else // ACL_INTERNAL_TEST_CKW_IN_DF class GpuCkwResize; #endif // ACL_INTERNAL_TEST_CKW_IN_DF @@ -82,10 +83,8 @@ public: * |U8 |U8 | * |S16 |S16 | */ - static Status validate( - const Properties &properties, - const ArgumentPack &tensors, - const Attributes &attributes); + static Status + validate(const Properties &properties, const ArgumentPack &tensors, const Attributes &attributes); /** Constructor * @@ -114,7 +113,7 @@ public: /** Get writer for the component */ #ifndef ACL_INTERNAL_TEST_CKW_IN_DF const IGpuTemplateComponentWriter *template_writer() const override; -#else // ACL_INTERNAL_TEST_CKW_IN_DF +#else // ACL_INTERNAL_TEST_CKW_IN_DF const IGpuCkwComponentDriver *ckw_component_driver() const override; #endif // ACL_INTERNAL_TEST_CKW_IN_DF @@ -127,7 +126,7 @@ public: private: #ifndef ACL_INTERNAL_TEST_CKW_IN_DF std::unique_ptr _component_writer; -#else // ACL_INTERNAL_TEST_CKW_IN_DF +#else // ACL_INTERNAL_TEST_CKW_IN_DF std::unique_ptr _component_writer; #endif // ACL_INTERNAL_TEST_CKW_IN_DF }; diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp index 12b81c3d56..dcbecaff35 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp @@ -38,25 +38,19 @@ namespace experimental { namespace dynamic_fusion { -Status ClComponentStore::validate( - const Properties &properties, - const ArgumentPack &tensors) +Status ClComponentStore::validate(const Properties &properties, const ArgumentPack &tensors) { ARM_COMPUTE_UNUSED(properties, tensors); return Status{}; } -ClComponentStore::ClComponentStore(ComponentId id, const Properties &properties, const ArgumentPack &tensors) - : IGpuKernelComponent{ id, properties, tensors }, +ClComponentStore::ClComponentStore(ComponentId id, + const Properties &properties, + const ArgumentPack &tensors) + : IGpuKernelComponent{id, properties, tensors}, #ifndef ACL_INTERNAL_TEST_CKW_IN_DF - _component_writer -{ - std::make_unique(id, tensors) -} + _component_writer{std::make_unique(id, tensors)} #else //ACL_INTERNAL_TEST_CKW_IN_DF - _component_writer -{ - std::make_unique(id, tensors) -} + _component_writer{std::make_unique(id, tensors)} #endif //ACL_INTERNAL_TEST_CKW_IN_DF { } diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h index 853ee39012..948785c480 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h @@ -25,6 +25,7 @@ #define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTSTORE #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" + #include namespace arm_compute @@ -70,9 +71,7 @@ public: * |:--------------|:--------------| * |All |All | */ - static Status validate( - const Properties &properties, - const ArgumentPack &tensors); + static Status validate(const Properties &properties, const ArgumentPack &tensors); /** Constructor * * Similar to @ref ClComponentStore::validate() diff --git a/src/dynamic_fusion/sketch/gpu/components/utils/type_printer/ElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/components/utils/type_printer/ElementwiseBinary.h index bc7133f4df..4c3e84e59d 100644 --- a/src/dynamic_fusion/sketch/gpu/components/utils/type_printer/ElementwiseBinary.h +++ b/src/dynamic_fusion/sketch/gpu/components/utils/type_printer/ElementwiseBinary.h @@ -46,18 +46,16 @@ using namespace experimental::dynamic_fusion; */ inline ::std::ostream &operator<<(::std::ostream &os, const ClComponentElementwiseBinary::Attributes::ElementwiseOp &op) { - const std::map op_name = - { - { ClComponentElementwiseBinary::Attributes::ElementwiseOp::Add, "add" }, - { ClComponentElementwiseBinary::Attributes::ElementwiseOp::Div, "div" }, - { ClComponentElementwiseBinary::Attributes::ElementwiseOp::Max, "max" }, - { ClComponentElementwiseBinary::Attributes::ElementwiseOp::Min, "min" }, - { ClComponentElementwiseBinary::Attributes::ElementwiseOp::Mul, "mul" }, - { ClComponentElementwiseBinary::Attributes::ElementwiseOp::Power, "power" }, - { ClComponentElementwiseBinary::Attributes::ElementwiseOp::Prelu, "prelu" }, - { ClComponentElementwiseBinary::Attributes::ElementwiseOp::SquaredDiff, "squareddiff" }, - { ClComponentElementwiseBinary::Attributes::ElementwiseOp::Sub, "sub" } - }; + const std::map op_name = { + {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Add, "add"}, + {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Div, "div"}, + {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Max, "max"}, + {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Min, "min"}, + {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Mul, "mul"}, + {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Power, "power"}, + {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Prelu, "prelu"}, + {ClComponentElementwiseBinary::Attributes::ElementwiseOp::SquaredDiff, "squareddiff"}, + {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Sub, "sub"}}; os << op_name.at(op); return os; } diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp index e7ee1c10df..2cec67dc65 100644 --- a/src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h" + #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h" #include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h" @@ -32,12 +33,11 @@ namespace experimental { namespace dynamic_fusion { -Status GpuAdd::validate_op(const GpuWorkloadSketch &sketch, - const ITensorInfo *lhs, - const ITensorInfo *rhs) +Status GpuAdd::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *lhs, const ITensorInfo *rhs) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32, DataType::U8, DataType::S16, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32, DataType::U8, + DataType::S16, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->data_type() != rhs->data_type(), "Input tensors must be the same data type"); // Set the elementwise operation to Add then call the elementwise common validate_op @@ -46,12 +46,11 @@ Status GpuAdd::validate_op(const GpuWorkloadSketch &sketch, return GpuElementwiseBinaryCommon::validate_op(sketch, lhs, rhs, common_attributes); } -Status GpuAdd::is_supported_op(const GpuWorkloadContext &context, - const ITensorInfo *lhs, - const ITensorInfo *rhs) +Status GpuAdd::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *lhs, const ITensorInfo *rhs) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32, DataType::U8, DataType::S16, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32, DataType::U8, + DataType::S16, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->data_type() != rhs->data_type(), "Input tensors must be the same data type"); // Set the elementwise operation to Add then call the elementwise common is_supported_op @@ -60,9 +59,7 @@ Status GpuAdd::is_supported_op(const GpuWorkloadContext &context, return GpuElementwiseBinaryCommon::is_supported_op(context, lhs, rhs, common_attributes); } -ITensorInfo *GpuAdd::create_op(GpuWorkloadSketch &sketch, - ITensorInfo *lhs, - ITensorInfo *rhs) +ITensorInfo *GpuAdd::create_op(GpuWorkloadSketch &sketch, ITensorInfo *lhs, ITensorInfo *rhs) { // No need to log or validate as they'll be handled inside GpuElementwiseBinaryCommon::create_op() // Set the elementwise operation to Add then call the elementwise common create_op diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuCast.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuCast.cpp index 33c2d43e07..6f35e66ea8 100644 --- a/src/dynamic_fusion/sketch/gpu/operators/GpuCast.cpp +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuCast.cpp @@ -23,12 +23,11 @@ */ #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuCast.h" +#include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/dynamic_fusion/sketch/ArgumentPack.h" -#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h" - -#include "src/common/utils/Log.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" namespace arm_compute { @@ -49,7 +48,7 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, TensorInfo dst_info_to_validate; const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate; - if(dst != nullptr) + if (dst != nullptr) { dst_info_to_validate_ptr = dst; } @@ -58,25 +57,22 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, // Check support level // Data Type - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, - 1, - DataType::U8, DataType::S8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::S16, - DataType::U16, DataType::U32, DataType::S32, DataType::F16, - DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst_info_to_validate_ptr, - 1, - DataType::U8, DataType::S8, DataType::QASYMM8, DataType::S16, - DataType::U16, DataType::U32, DataType::S32, DataType::F16, - DataType::F32); - - if(context.gpu_language() == GpuLanguage::OpenCL) + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + src, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, + DataType::S16, DataType::U16, DataType::U32, DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst_info_to_validate_ptr, 1, DataType::U8, DataType::S8, + DataType::QASYMM8, DataType::S16, DataType::U16, DataType::U32, + DataType::S32, DataType::F16, DataType::F32); + + if (context.gpu_language() == GpuLanguage::OpenCL) { const auto cl_compile_ctx = context.cl_compile_context(); ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr); // Validate Cast Component { - const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); - auto settings = ClComponentCast::Settings(); + const auto properties = + IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); + auto settings = ClComponentCast::Settings(); ArgumentPack arguments; arguments.add_const_tensor(ACL_SRC_0, src); @@ -94,16 +90,13 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, constexpr GpuOperatorType operator_type = GpuOperatorType::Simple; } // namespace -Status GpuCast::is_supported_op(const GpuWorkloadContext &context, - const ITensorInfo *src, - const CastAttributes &attributes) +Status +GpuCast::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const CastAttributes &attributes) { return is_supported_op_helper(context, src, nullptr, attributes); } -Status GpuCast::validate_op(const GpuWorkloadSketch &sketch, - const ITensorInfo *src, - const CastAttributes &attributes) +Status GpuCast::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const CastAttributes &attributes) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id()); @@ -127,9 +120,7 @@ Status GpuCast::validate_op(const GpuWorkloadSketch &sketch, return is_supported_op_helper(*sketch.gpu_context(), src, &dst_info_to_validate, attributes); } -ITensorInfo *GpuCast::create_op(GpuWorkloadSketch &sketch, - ITensorInfo *src, - const CastAttributes &attributes) +ITensorInfo *GpuCast::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, const CastAttributes &attributes) { ARM_COMPUTE_ERROR_ON_NULLPTR(src); ARM_COMPUTE_LOG_PARAMS(src, attributes); @@ -145,14 +136,15 @@ ITensorInfo *GpuCast::create_op(GpuWorkloadSketch &sketch, GpuKernelComponentGraph &comp_graph = sketch.implementation().component_graph(); const auto *sketch_ctx = sketch.implementation().context(); - if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL) + if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL) { ARM_COMPUTE_ERROR_ON(sketch_ctx->cl_compile_context() == nullptr); // Add Depthwise Conv2d Component { - const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); - auto settings = ClComponentCast::Settings(); + const auto properties = + IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); + auto settings = ClComponentCast::Settings(); ArgumentPack arguments; arguments.add_const_tensor(ACL_SRC_0, src); diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuClamp.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuClamp.cpp index 89b533c9b8..697b7d4e1f 100644 --- a/src/dynamic_fusion/sketch/gpu/operators/GpuClamp.cpp +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuClamp.cpp @@ -25,14 +25,13 @@ #include "arm_compute/core/experimental/Types.h" +#include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/dynamic_fusion/sketch/ArgumentPack.h" -#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace experimental @@ -48,12 +47,13 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(attributes.max_val() < attributes.min_val(), "Maximum clamp value cannot be lower than minimum value"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(attributes.max_val() < attributes.min_val(), + "Maximum clamp value cannot be lower than minimum value"); TensorInfo dst_info_to_validate; const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate; - if(dst != nullptr) + if (dst != nullptr) { dst_info_to_validate_ptr = dst; } @@ -61,16 +61,15 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, auto_init_if_empty(dst_info_to_validate, *src->clone()); // CLAMP operator is implemented as LU_BOUNDED_RELU with the alpha and beta variables swapped - const ClComponentActivation::Attributes act_info - { - ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, attributes.max_val(), attributes.min_val() - }; + const ClComponentActivation::Attributes act_info{ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + attributes.max_val(), attributes.min_val()}; // Check components - if(context.gpu_language() == GpuLanguage::OpenCL) + if (context.gpu_language() == GpuLanguage::OpenCL) { // Validate Activation Component - const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); + const auto properties = + IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); ArgumentPack arguments; arguments.add_const_tensor(ACL_SRC, src); @@ -87,16 +86,13 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, constexpr GpuOperatorType operator_type = GpuOperatorType::Simple; } // namespace -Status GpuClamp::is_supported_op(const GpuWorkloadContext &context, - const ITensorInfo *src, - const ClampAttributes &attributes) +Status +GpuClamp::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const ClampAttributes &attributes) { return is_supported_op_helper(context, src, nullptr, attributes); } -Status GpuClamp::validate_op(const GpuWorkloadSketch &sketch, - const ITensorInfo *src, - const ClampAttributes &attributes) +Status GpuClamp::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const ClampAttributes &attributes) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); @@ -121,9 +117,7 @@ Status GpuClamp::validate_op(const GpuWorkloadSketch &sketch, return is_supported_op_helper(*sketch.gpu_context(), src, &dst_info_to_validate, attributes); } -ITensorInfo *GpuClamp::create_op(GpuWorkloadSketch &sketch, - ITensorInfo *src, - const ClampAttributes &attributes) +ITensorInfo *GpuClamp::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, const ClampAttributes &attributes) { ARM_COMPUTE_ERROR_ON_NULLPTR(src); ARM_COMPUTE_LOG_PARAMS(src, attributes); @@ -139,18 +133,16 @@ ITensorInfo *GpuClamp::create_op(GpuWorkloadSketch &sketch, GpuKernelComponentGraph &comp_graph = sketch.implementation().component_graph(); // CLAMP operator is implemented as LU_BOUNDED_RELU with the alpha and beta variables swapped - const ClComponentActivation::Attributes act_info - { - ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, attributes.max_val(), attributes.min_val() - }; + const ClComponentActivation::Attributes act_info{ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + attributes.max_val(), attributes.min_val()}; const auto *const sketch_ctx = sketch.implementation().context(); - if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL) + if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL) { // Add Activation Component auto properties = IGpuKernelComponent::Properties(); - properties.stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); + properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); ArgumentPack arguments; arguments.add_const_tensor(ACL_SRC, src); diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp index cb270ed4b0..aaeec543f8 100644 --- a/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp @@ -24,15 +24,15 @@ #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h" #include "arm_compute/core/KernelDescriptors.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/dynamic_fusion/sketch/ArgumentPack.h" -#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include "src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h" #include "src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h" @@ -45,24 +45,30 @@ namespace dynamic_fusion { namespace { -DirectConvComputeKernelInfo config_direct_convolution_nhwc(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo +config_direct_convolution_nhwc(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info) { // Get GPU target GPUTarget gpu_target = CLScheduler::get().target(); - std::unique_ptr t = arm_compute::cl_direct_conv::ClDirectConvKernelConfigurationFactory::create(gpu_target); + std::unique_ptr t = + arm_compute::cl_direct_conv::ClDirectConvKernelConfigurationFactory::create(gpu_target); return t->configure(src, weights, conv_info); } -void calculate_and_init_dst_if_empty(ITensorInfo *dst, const ITensorInfo *src, const ITensorInfo *wei, const Conv2dAttributes &attributes) +void calculate_and_init_dst_if_empty(ITensorInfo *dst, + const ITensorInfo *src, + const ITensorInfo *wei, + const Conv2dAttributes &attributes) { - if(dst->total_size() == 0U) + if (dst->total_size() == 0U) { - const auto shape = misc::shape_calculator::compute_deep_convolution_shape(src->tensor_shape(), src->data_layout(), wei->tensor_shape(), - PadStrideInfo(attributes.stride().x(), attributes.stride().y(), attributes.pad().left, - attributes.pad().right, - attributes.pad().top, attributes.pad().bottom, DimensionRoundingType::FLOOR)); // use the default DimensionRoundingType + const auto shape = misc::shape_calculator::compute_deep_convolution_shape( + src->tensor_shape(), src->data_layout(), wei->tensor_shape(), + PadStrideInfo(attributes.stride().x(), attributes.stride().y(), attributes.pad().left, + attributes.pad().right, attributes.pad().top, attributes.pad().bottom, + DimensionRoundingType::FLOOR)); // use the default DimensionRoundingType auto_init_if_empty(*dst, src->clone()->set_tensor_shape(shape)); } @@ -83,7 +89,7 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, TensorInfo dst_info_to_validate; const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate; - if(dst != nullptr) + if (dst != nullptr) { dst_info_to_validate_ptr = dst; } @@ -98,18 +104,20 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, // Check components const auto gpu_target = context.gpu_target(); - if(context.gpu_language() == GpuLanguage::OpenCL) + if (context.gpu_language() == GpuLanguage::OpenCL) { const auto cl_compile_ctx = context.cl_compile_context(); ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr); // Validate Direct Conv2d Component { - const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); - auto settings = ClComponentDirectConv2d::Settings(); + const auto properties = + IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); + auto settings = ClComponentDirectConv2d::Settings(); settings.fast_relaxed_math( - (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) - && (dst_info_to_validate_ptr->data_type() == DataType::F32 || dst_info_to_validate_ptr->data_type() == DataType::F16)); + (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) && + (dst_info_to_validate_ptr->data_type() == DataType::F32 || + dst_info_to_validate_ptr->data_type() == DataType::F16)); ArgumentPack arguments; arguments.add_const_tensor(ACL_SRC_0, src); @@ -142,14 +150,14 @@ Status GpuConv2d::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const ITensorInfo *wei, const ITensorInfo *bia, - const Conv2dAttributes &attributes) + const Conv2dAttributes &attributes) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, wei); ARM_COMPUTE_RETURN_ERROR_ON_MSG(!wei->are_values_constant(), "Dynamic weights are not supported"); // Check if tensors have valid id. I.e. they are created from a sketch ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id() || !wei->has_valid_id()); - if(bia != nullptr) + if (bia != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON(!bia->has_valid_id()); } @@ -178,16 +186,13 @@ Status GpuConv2d::validate_op(const GpuWorkloadSketch &sketch, return is_supported_op_helper(*sketch.gpu_context(), src, wei, bia, &dst_info_to_validate, attributes); } -ITensorInfo *GpuConv2d::create_op(GpuWorkloadSketch &sketch, - ITensorInfo *src, - ITensorInfo *wei, - ITensorInfo *bia, - const Conv2dAttributes &attributes) +ITensorInfo *GpuConv2d::create_op( + GpuWorkloadSketch &sketch, ITensorInfo *src, ITensorInfo *wei, ITensorInfo *bia, const Conv2dAttributes &attributes) { ARM_COMPUTE_LOG_PARAMS(src, wei, bia, attributes); PadStrideInfo conv_info(attributes.stride().x(), attributes.stride().y(), attributes.pad().left, - attributes.pad().right, - attributes.pad().top, attributes.pad().bottom, DimensionRoundingType::FLOOR); + attributes.pad().right, attributes.pad().top, attributes.pad().bottom, + DimensionRoundingType::FLOOR); // Initialize the direct convolution descriptor const DirectConvComputeKernelInfo desc = config_direct_convolution_nhwc(src, wei, conv_info); @@ -207,7 +212,7 @@ ITensorInfo *GpuConv2d::create_op(GpuWorkloadSketch &sketch, const auto gpu_target = sketch_ctx->gpu_target(); - if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL) + if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL) { const auto cl_compile_ctx = sketch_ctx->cl_compile_context(); ARM_COMPUTE_ERROR_ON(cl_compile_ctx == nullptr); @@ -216,17 +221,17 @@ ITensorInfo *GpuConv2d::create_op(GpuWorkloadSketch &sketch, // Add Direct Conv2d Component { auto properties = IGpuKernelComponent::Properties(); - properties.stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); + properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); auto settings = ClComponentDirectConv2d::Settings(); settings.fast_relaxed_math( - (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) - && (dst->data_type() == DataType::F32 || dst->data_type() == DataType::F16)); + (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) && + (dst->data_type() == DataType::F32 || dst->data_type() == DataType::F16)); settings.direct_conv_descriptor(desc); - if(settings.export_to_cl_image()) + if (settings.export_to_cl_image()) { arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(wei); } diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp index c72098e943..e2b673bd43 100644 --- a/src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp @@ -28,8 +28,8 @@ #include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/dynamic_fusion/sketch/ArgumentPack.h" -#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include "src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h" #include "src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h" @@ -42,20 +42,20 @@ namespace dynamic_fusion { namespace { -void calculate_and_init_dst_if_empty(ITensorInfo *dst, const ITensorInfo *src, const ITensorInfo *wei, const DepthwiseConv2dAttributes &attributes) +void calculate_and_init_dst_if_empty(ITensorInfo *dst, + const ITensorInfo *src, + const ITensorInfo *wei, + const DepthwiseConv2dAttributes &attributes) { - if(dst->total_size() == 0U) + if (dst->total_size() == 0U) { - const PadStrideInfo pad_stride_info(attributes.stride().x(), - attributes.stride().y(), - attributes.pad().left, - attributes.pad().right, - attributes.pad().top, - attributes.pad().bottom, + const PadStrideInfo pad_stride_info(attributes.stride().x(), attributes.stride().y(), attributes.pad().left, + attributes.pad().right, attributes.pad().top, attributes.pad().bottom, attributes.dimension_rounding_type()); - const ConvolutionInfo conv_info{ pad_stride_info, attributes.depth_multiplier(), ActivationLayerInfo(), attributes.dilation() }; - const TensorShape shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *wei, conv_info); + const ConvolutionInfo conv_info{pad_stride_info, attributes.depth_multiplier(), ActivationLayerInfo(), + attributes.dilation()}; + const TensorShape shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *wei, conv_info); auto_init_if_empty(*dst, src->clone()->set_tensor_shape(shape)); } @@ -76,7 +76,7 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, TensorInfo dst_info_to_validate; const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate; - if(dst != nullptr) + if (dst != nullptr) { dst_info_to_validate_ptr = dst; } @@ -91,40 +91,44 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, const GpuTarget gpu_target = context.gpu_target(); - if(context.gpu_language() == GpuLanguage::OpenCL) + if (context.gpu_language() == GpuLanguage::OpenCL) { const CLCompileContext *cl_compile_ctx = context.cl_compile_context(); ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr); // Validate Depthwise Conv2d Component { - const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); - auto settings = ClComponentDepthwiseConv2d::Settings(); + const auto properties = + IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); + auto settings = ClComponentDepthwiseConv2d::Settings(); - const PadStrideInfo legacy_conv_info(attributes.stride().x(), attributes.stride().y(), attributes.pad().left, - attributes.pad().right, - attributes.pad().top, attributes.pad().bottom, DimensionRoundingType::FLOOR); + const PadStrideInfo legacy_conv_info(attributes.stride().x(), attributes.stride().y(), + attributes.pad().left, attributes.pad().right, attributes.pad().top, + attributes.pad().bottom, DimensionRoundingType::FLOOR); // Get the depthwise convolution compute parameters - auto t = arm_compute::cl_dwc::ClDWCNativeKernelConfigurationFactory::create(gpu_target); - const DWCComputeKernelInfo dwc_info = t->configure(src, wei, legacy_conv_info, attributes.dilation(), attributes.depth_multiplier()); + auto t = arm_compute::cl_dwc::ClDWCNativeKernelConfigurationFactory::create(gpu_target); + const DWCComputeKernelInfo dwc_info = + t->configure(src, wei, legacy_conv_info, attributes.dilation(), attributes.depth_multiplier()); settings.fast_relaxed_math( - (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) - && (dst_info_to_validate_ptr->data_type() == DataType::F32 || dst_info_to_validate_ptr->data_type() == DataType::F16)); + (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) && + (dst_info_to_validate_ptr->data_type() == DataType::F32 || + dst_info_to_validate_ptr->data_type() == DataType::F16)); settings.is_fma_available(get_arch_from_target(gpu_target) == GPUTarget::MIDGARD) - .m0(dwc_info.m0) - .n0(dwc_info.n0) - .export_input_to_cl_image(dwc_info.export_input_to_cl_image) - .export_weights_to_cl_image(dwc_info.export_weights_to_cl_image); + .m0(dwc_info.m0) + .n0(dwc_info.n0) + .export_input_to_cl_image(dwc_info.export_input_to_cl_image) + .export_weights_to_cl_image(dwc_info.export_weights_to_cl_image); ArgumentPack arguments; arguments.add_const_tensor(ACL_SRC_0, src); arguments.add_const_tensor(ACL_SRC_1, wei); arguments.add_const_tensor(ACL_SRC_2, bia); arguments.add_const_tensor(ACL_DST_0, dst_info_to_validate_ptr); - ARM_COMPUTE_RETURN_ON_ERROR(ClComponentDepthwiseConv2d::validate(properties, arguments, attributes, settings)); + ARM_COMPUTE_RETURN_ON_ERROR( + ClComponentDepthwiseConv2d::validate(properties, arguments, attributes, settings)); } } else @@ -158,7 +162,7 @@ Status GpuDepthwiseConv2d::validate_op(const GpuWorkloadSketch &sketch, ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id() || !wei->has_valid_id()); - if(bia != nullptr) + if (bia != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON(!bia->has_valid_id()); } @@ -205,35 +209,37 @@ ITensorInfo *GpuDepthwiseConv2d::create_op(GpuWorkloadSketch &sket const auto *sketch_ctx = sketch.implementation().context(); const GpuTarget gpu_target = sketch_ctx->gpu_target(); - if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL) + if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL) { ARM_COMPUTE_ERROR_ON_NULLPTR(sketch_ctx->cl_compile_context()); // Add Depthwise Conv2d Component { - const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); - auto settings = ClComponentDepthwiseConv2d::Settings(); + const auto properties = + IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); + auto settings = ClComponentDepthwiseConv2d::Settings(); - const PadStrideInfo legacy_conv_info(attributes.stride().x(), attributes.stride().y(), attributes.pad().left, - attributes.pad().right, - attributes.pad().top, attributes.pad().bottom, DimensionRoundingType::FLOOR); + const PadStrideInfo legacy_conv_info(attributes.stride().x(), attributes.stride().y(), + attributes.pad().left, attributes.pad().right, attributes.pad().top, + attributes.pad().bottom, DimensionRoundingType::FLOOR); // Get the depthwise convolution compute parameters - auto t = arm_compute::cl_dwc::ClDWCNativeKernelConfigurationFactory::create(gpu_target); - const DWCComputeKernelInfo dwc_info = t->configure(src, wei, legacy_conv_info, attributes.dilation(), attributes.depth_multiplier()); + auto t = arm_compute::cl_dwc::ClDWCNativeKernelConfigurationFactory::create(gpu_target); + const DWCComputeKernelInfo dwc_info = + t->configure(src, wei, legacy_conv_info, attributes.dilation(), attributes.depth_multiplier()); settings.is_fma_available(get_arch_from_target(gpu_target) != GPUTarget::MIDGARD) - .m0(dwc_info.m0) - .n0(dwc_info.n0) - .export_input_to_cl_image(dwc_info.export_input_to_cl_image) - .export_weights_to_cl_image(dwc_info.export_weights_to_cl_image); + .m0(dwc_info.m0) + .n0(dwc_info.n0) + .export_input_to_cl_image(dwc_info.export_input_to_cl_image) + .export_weights_to_cl_image(dwc_info.export_weights_to_cl_image); - if(settings.export_input_to_cl_image()) + if (settings.export_input_to_cl_image()) { arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(src); } - if(settings.export_weights_to_cl_image()) + if (settings.export_weights_to_cl_image()) { arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(wei); } diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuMul.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuMul.cpp index 464a32cbad..b871171e8d 100644 --- a/src/dynamic_fusion/sketch/gpu/operators/GpuMul.cpp +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuMul.cpp @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMul.h" + #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h" #include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h" @@ -32,9 +33,7 @@ namespace experimental { namespace dynamic_fusion { -Status GpuMul::validate_op(const GpuWorkloadSketch &sketch, - const ITensorInfo *lhs, - const ITensorInfo *rhs) +Status GpuMul::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *lhs, const ITensorInfo *rhs) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32); @@ -46,9 +45,7 @@ Status GpuMul::validate_op(const GpuWorkloadSketch &sketch, return GpuElementwiseBinaryCommon::validate_op(sketch, lhs, rhs, common_attributes); } -Status GpuMul::is_supported_op(const GpuWorkloadContext &context, - const ITensorInfo *lhs, - const ITensorInfo *rhs) +Status GpuMul::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *lhs, const ITensorInfo *rhs) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32); @@ -60,9 +57,7 @@ Status GpuMul::is_supported_op(const GpuWorkloadContext &context, return GpuElementwiseBinaryCommon::is_supported_op(context, lhs, rhs, common_attributes); } -ITensorInfo *GpuMul::create_op(GpuWorkloadSketch &sketch, - ITensorInfo *lhs, - ITensorInfo *rhs) +ITensorInfo *GpuMul::create_op(GpuWorkloadSketch &sketch, ITensorInfo *lhs, ITensorInfo *rhs) { // Set the elementwise operation to Mul then call the elementwise common create_op ElementwiseBinaryCommonAttributes common_attributes{}; diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp index 107a5e5fa7..f0d368d757 100644 --- a/src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp @@ -26,10 +26,9 @@ #include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" - #include "src/dynamic_fusion/sketch/ArgumentPack.h" -#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" #include "src/dynamic_fusion/utils/Utils.h" namespace arm_compute @@ -43,9 +42,7 @@ namespace constexpr GpuOperatorType operator_type = GpuOperatorType::Simple; } // namespace -Status GpuOutput::is_supported_op(const GpuWorkloadContext &context, - const ITensorInfo *src, - const ITensorInfo *dst) +Status GpuOutput::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const ITensorInfo *dst) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); @@ -60,9 +57,7 @@ Status GpuOutput::is_supported_op(const GpuWorkloadContext &context, return Status{}; } -Status GpuOutput::validate_op(const GpuWorkloadSketch &sketch, - const ITensorInfo *src, - const ITensorInfo *dst) +Status GpuOutput::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const ITensorInfo *dst) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id()); @@ -90,9 +85,7 @@ Status GpuOutput::validate_op(const GpuWorkloadSketch &sketch, return status; } -void GpuOutput::create_op(GpuWorkloadSketch &sketch, - ITensorInfo *src, - ITensorInfo *dst) +void GpuOutput::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, ITensorInfo *dst) { ARM_COMPUTE_LOG_PARAMS(src, dst); ARM_COMPUTE_ERROR_THROW_ON(GpuOutput::validate_op(sketch, src, dst)); @@ -104,14 +97,14 @@ void GpuOutput::create_op(GpuWorkloadSketch &sketch, auto &comp_graph = sketch.implementation().component_graph(); const auto sketch_ctx = sketch.implementation().context(); - if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL) + if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL) { ARM_COMPUTE_ERROR_ON(sketch_ctx->cl_compile_context() == nullptr); // Add store component { IGpuKernelComponent::Properties properties; - properties.stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); + properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); ArgumentPack arguments; arguments.add_const_tensor(ACL_SRC_0, src); diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuPool2d.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuPool2d.cpp index 7ecfa0158b..55c604aacc 100644 --- a/src/dynamic_fusion/sketch/gpu/operators/GpuPool2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuPool2d.cpp @@ -22,20 +22,21 @@ * SOFTWARE. */ +#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h" + #include "arm_compute/core/CL/CLCompileContext.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/experimental/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h" - #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h" -#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h" + #include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/dynamic_fusion/sketch/ArgumentPack.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h" #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h" -#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h" #include "src/dynamic_fusion/utils/Utils.h" namespace arm_compute @@ -46,11 +47,15 @@ namespace dynamic_fusion { namespace { -void calculate_and_init_dst_if_empty(ITensorInfo *dst, const ITensorInfo *src, const Pool2dAttributes &attributes, const GpuPool2dSettings &settings) +void calculate_and_init_dst_if_empty(ITensorInfo *dst, + const ITensorInfo *src, + const Pool2dAttributes &attributes, + const GpuPool2dSettings &settings) { - if(dst->total_size() == 0U) + if (dst->total_size() == 0U) { - auto shape = misc::shape_calculator::compute_pool_shape(*src, convert_pool_attr_to_pool_info(attributes, settings.mixed_precision())); + auto shape = misc::shape_calculator::compute_pool_shape( + *src, convert_pool_attr_to_pool_info(attributes, settings.mixed_precision())); auto_init_if_empty(*dst, src->clone()->set_tensor_shape(shape)); } } @@ -82,7 +87,7 @@ bool GpuPool2dSettings::use_inf_as_limit() const Status GpuPool2d::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, - const Pool2dAttributes &attributes, + const Pool2dAttributes &attributes, const GpuPool2dSettings &settings) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); @@ -110,7 +115,7 @@ Status GpuPool2d::validate_op(const GpuWorkloadSketch &sketch, Status GpuPool2d::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const Pool2dAttributes &attributes, - const GpuPool2dSettings &settings) + const GpuPool2dSettings &settings) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); // Data type @@ -118,7 +123,8 @@ Status GpuPool2d::is_supported_op(const GpuWorkloadContext &context, // Data layout ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC); // Check exclude padding is not false - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!attributes.exclude_padding(), "Exclude padding must be set to true in Attributes!"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!attributes.exclude_padding(), + "Exclude padding must be set to true in Attributes!"); // Auto initialize dst tensor info TensorInfo dst_info_to_validate; @@ -126,14 +132,15 @@ Status GpuPool2d::is_supported_op(const GpuWorkloadContext &context, calculate_and_init_dst_if_empty(&dst_info_to_validate, src, attributes, settings); // Check components - if(context.gpu_language() == GpuLanguage::OpenCL) + if (context.gpu_language() == GpuLanguage::OpenCL) { const auto cl_compile_ctx = context.cl_compile_context(); ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr); // Validate Component { - const KernelProperties properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); + const KernelProperties properties = + IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); ArgumentPack arguments; arguments.add_const_tensor(ACL_SRC_0, src); @@ -148,10 +155,10 @@ Status GpuPool2d::is_supported_op(const GpuWorkloadContext &context, return Status{}; } -ITensorInfo *GpuPool2d::create_op(GpuWorkloadSketch &sketch, - ITensorInfo *src, - const Pool2dAttributes &attributes, - const GpuPool2dSettings &settings) +ITensorInfo *GpuPool2d::create_op(GpuWorkloadSketch &sketch, + ITensorInfo *src, + const Pool2dAttributes &attributes, + const GpuPool2dSettings &settings) { // Assert validation ARM_COMPUTE_ERROR_THROW_ON(GpuPool2d::validate_op(sketch, src, attributes, settings)); @@ -168,7 +175,7 @@ ITensorInfo *GpuPool2d::create_op(GpuWorkloadSketch &sketch, const auto sketch_ctx = sketch.implementation().context(); - if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL) + if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL) { const auto cl_compile_ctx = sketch_ctx->cl_compile_context(); ARM_COMPUTE_UNUSED(cl_compile_ctx); @@ -177,7 +184,7 @@ ITensorInfo *GpuPool2d::create_op(GpuWorkloadSketch &sketch, // Add Component { auto properties = IGpuKernelComponent::Properties(); - properties.stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); + properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); ArgumentPack arguments; arguments.add_const_tensor(ACL_SRC_0, src); diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuReshape.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuReshape.cpp index 0f43a578df..3def7a1a81 100644 --- a/src/dynamic_fusion/sketch/gpu/operators/GpuReshape.cpp +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuReshape.cpp @@ -22,12 +22,14 @@ * SOFTWARE. */ #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuReshape.h" + #include "arm_compute/core/Error.h" + #include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/dynamic_fusion/sketch/ArgumentPack.h" -#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" namespace arm_compute { @@ -40,14 +42,14 @@ namespace Status is_supported_op_helper(const GpuWorkloadContext &context, const ITensorInfo *src, const ITensorInfo *dst, - const ReshapeAttributes &attributes) + const ReshapeAttributes &attributes) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); TensorInfo dst_info_to_validate; const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate; - if(dst != nullptr) + if (dst != nullptr) { dst_info_to_validate_ptr = dst; } @@ -55,7 +57,7 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, auto_init_if_empty(dst_info_to_validate, src->clone()->set_tensor_shape(attributes.shape())); // Check components - if(context.gpu_language() == GpuLanguage::OpenCL) + if (context.gpu_language() == GpuLanguage::OpenCL) { const auto cl_compile_ctx = context.cl_compile_context(); ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr); @@ -78,16 +80,13 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, GpuOperatorType operator_type = GpuOperatorType::Complex; } // namespace -Status GpuReshape::is_supported_op(const GpuWorkloadContext &context, - const ITensorInfo *src, - const Attributes &attributes) +Status +GpuReshape::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const Attributes &attributes) { return is_supported_op_helper(context, src, nullptr, attributes); } -Status GpuReshape::validate_op(const GpuWorkloadSketch &sketch, - const ITensorInfo *src, - const Attributes &attributes) +Status GpuReshape::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const Attributes &attributes) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id()); @@ -111,9 +110,7 @@ Status GpuReshape::validate_op(const GpuWorkloadSketch &sketch, return is_supported_op_helper(*sketch.gpu_context(), src, &dst_info_to_validate, attributes); } -ITensorInfo *GpuReshape::create_op(GpuWorkloadSketch &sketch, - ITensorInfo *src, - const Attributes &attributes) +ITensorInfo *GpuReshape::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, const Attributes &attributes) { ARM_COMPUTE_ERROR_ON_NULLPTR(src); ARM_COMPUTE_LOG_PARAMS(src, attributes.shape()); @@ -127,7 +124,7 @@ ITensorInfo *GpuReshape::create_op(GpuWorkloadSketch &sketch, // Translate into components and add to component graph auto &comp_graph = sketch.implementation().component_graph(); const auto sketch_ctx = sketch.implementation().context(); - if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL) + if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL) { const auto cl_compile_ctx = sketch_ctx->cl_compile_context(); ARM_COMPUTE_UNUSED(cl_compile_ctx); @@ -136,7 +133,7 @@ ITensorInfo *GpuReshape::create_op(GpuWorkloadSketch &sketch, // Add ElementwiseBinary Component { auto properties = IGpuKernelComponent::Properties(); - properties.stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); + properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); ArgumentPack arguments; arguments.add_const_tensor(ACL_SRC_0, src); diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuResize.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuResize.cpp index 5f52eea7d0..fb09875b33 100644 --- a/src/dynamic_fusion/sketch/gpu/operators/GpuResize.cpp +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuResize.cpp @@ -26,12 +26,12 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + +#include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/dynamic_fusion/sketch/ArgumentPack.h" -#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h" - -#include "src/common/utils/Log.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" namespace arm_compute { @@ -43,7 +43,7 @@ namespace { void calculate_and_init_dst_if_empty(ITensorInfo *dst, const ITensorInfo *src, const ResizeAttributes &attributes) { - if(dst->total_size() == 0U) + if (dst->total_size() == 0U) { TensorShape out_shape = src->tensor_shape(); @@ -64,7 +64,7 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, TensorInfo dst_info_to_validate; const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate; - if(dst != nullptr) + if (dst != nullptr) { dst_info_to_validate_ptr = dst; } @@ -73,22 +73,25 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, // Check support level // Data type - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::U8, DataType::S16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::U8, DataType::S16, DataType::F16, DataType::F32); // Data layout ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC); // Interpolation policy - ARM_COMPUTE_RETURN_ERROR_ON_MSG(attributes.interpolation_policy() != InterpolationPolicy::NEAREST_NEIGHBOR && attributes.interpolation_policy() != InterpolationPolicy::BILINEAR, + ARM_COMPUTE_RETURN_ERROR_ON_MSG(attributes.interpolation_policy() != InterpolationPolicy::NEAREST_NEIGHBOR && + attributes.interpolation_policy() != InterpolationPolicy::BILINEAR, "Interpolation policy must be NEAREST_NEIGHBOR or BILINEAR"); // Check components - if(context.gpu_language() == GpuLanguage::OpenCL) + if (context.gpu_language() == GpuLanguage::OpenCL) { const auto cl_compile_ctx = context.cl_compile_context(); ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr); // Validate Activation Component { - const KernelProperties properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); + const KernelProperties properties = + IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); ArgumentPack arguments; arguments.add_const_tensor(ACL_SRC_0, src); @@ -107,16 +110,14 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, constexpr GpuOperatorType operator_type = GpuOperatorType::Complex; } // namespace -Status GpuResize::is_supported_op(const GpuWorkloadContext &context, - const ITensorInfo *src, - const Attributes &attributes) +Status +GpuResize::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const Attributes &attributes) { return is_supported_op_helper(context, src, nullptr, attributes); } -Status GpuResize::validate_op(const GpuWorkloadSketch &sketch, - const ITensorInfo *src, - const GpuResize::Attributes &attributes) +Status +GpuResize::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const GpuResize::Attributes &attributes) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id()); @@ -141,9 +142,7 @@ Status GpuResize::validate_op(const GpuWorkloadSketch &sketch, return is_supported_op_helper(*sketch.gpu_context(), src, &dst_info_to_validate, attributes); } -ITensorInfo *GpuResize::create_op(GpuWorkloadSketch &sketch, - ITensorInfo *src, - const GpuResize::Attributes &attributes) +ITensorInfo *GpuResize::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, const GpuResize::Attributes &attributes) { ARM_COMPUTE_ERROR_ON_NULLPTR(src); ARM_COMPUTE_LOG_PARAMS(src, attributes); @@ -159,13 +158,14 @@ ITensorInfo *GpuResize::create_op(GpuWorkloadSketch &sketch, GpuKernelComponentGraph &comp_graph = sketch.implementation().component_graph(); const auto *sketch_ctx = sketch.implementation().context(); - if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL) + if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL) { ARM_COMPUTE_ERROR_ON_NULLPTR(sketch_ctx->cl_compile_context()); // Add Resize Component { - const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); + const auto properties = + IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); ArgumentPack arguments; arguments.add_const_tensor(ACL_SRC_0, src); diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.cpp index 09debad969..a2260c8c36 100644 --- a/src/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.cpp +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.cpp @@ -23,14 +23,15 @@ */ #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.h" + #include "arm_compute/core/experimental/Types.h" #include "arm_compute/function_info/ActivationLayerInfo.h" #include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/dynamic_fusion/sketch/ArgumentPack.h" -#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" namespace arm_compute { @@ -40,9 +41,7 @@ namespace dynamic_fusion { namespace { -Status is_supported_op_helper(const GpuWorkloadContext &context, - const ITensorInfo *src, - const ITensorInfo *dst) +Status is_supported_op_helper(const GpuWorkloadContext &context, const ITensorInfo *src, const ITensorInfo *dst) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); @@ -50,20 +49,21 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, TensorInfo dst_info_to_validate; const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate; - if(dst != nullptr) + if (dst != nullptr) { dst_info_to_validate_ptr = dst; } auto_init_if_empty(dst_info_to_validate, *src->clone()); - const ClComponentActivation::Attributes act_info{ ActivationLayerInfo::ActivationFunction::LOGISTIC }; + const ClComponentActivation::Attributes act_info{ActivationLayerInfo::ActivationFunction::LOGISTIC}; // Check components - if(context.gpu_language() == GpuLanguage::OpenCL) + if (context.gpu_language() == GpuLanguage::OpenCL) { // Validate Activation Component - const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); + const auto properties = + IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); ArgumentPack arguments; arguments.add_const_tensor(ACL_SRC, src); @@ -80,14 +80,12 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, constexpr GpuOperatorType operator_type = GpuOperatorType::Simple; } // namespace -Status GpuSigmoid::is_supported_op(const GpuWorkloadContext &context, - const ITensorInfo *src) +Status GpuSigmoid::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src) { return is_supported_op_helper(context, src, nullptr); } -Status GpuSigmoid::validate_op(const GpuWorkloadSketch &sketch, - const ITensorInfo *src) +Status GpuSigmoid::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); @@ -112,8 +110,7 @@ Status GpuSigmoid::validate_op(const GpuWorkloadSketch &sketch, return is_supported_op_helper(*sketch.gpu_context(), src, &dst_info_to_validate); } -ITensorInfo *GpuSigmoid::create_op(GpuWorkloadSketch &sketch, - ITensorInfo *src) +ITensorInfo *GpuSigmoid::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src) { ARM_COMPUTE_ERROR_ON_NULLPTR(src); ARM_COMPUTE_LOG_PARAMS(src); @@ -128,15 +125,15 @@ ITensorInfo *GpuSigmoid::create_op(GpuWorkloadSketch &sketch, // Translate into components and add to component graph GpuKernelComponentGraph &comp_graph = sketch.implementation().component_graph(); - const ClComponentActivation::Attributes act_info{ ActivationLayerInfo::ActivationFunction::LOGISTIC }; + const ClComponentActivation::Attributes act_info{ActivationLayerInfo::ActivationFunction::LOGISTIC}; const auto *const sketch_ctx = sketch.implementation().context(); - if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL) + if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL) { // Add Activation Component auto properties = IGpuKernelComponent::Properties(); - properties.stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); + properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); ArgumentPack arguments; arguments.add_const_tensor(ACL_SRC, src); diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.cpp index ffc4553a7d..c87b282aec 100644 --- a/src/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.cpp +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.cpp @@ -22,13 +22,14 @@ * SOFTWARE. */ #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.h" + #include "arm_compute/core/Error.h" -#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.h" -#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.h" #include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/dynamic_fusion/sketch/ArgumentPack.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.h" #include "src/dynamic_fusion/sketch/gpu/GpuOperatorProperties.h" #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" @@ -52,7 +53,7 @@ Status GpuSoftmax::is_supported_op(const GpuWorkloadContext &context, TensorInfo dst_info_to_validate; // Auto initialize dst tensor info - if(dst != nullptr) + if (dst != nullptr) { dst_info_to_validate = *dst; } @@ -61,11 +62,12 @@ Status GpuSoftmax::is_supported_op(const GpuWorkloadContext &context, auto_init_if_empty(dst_info_to_validate, *src->clone()); } // Check components - if(context.gpu_language() == GpuLanguage::OpenCL) + if (context.gpu_language() == GpuLanguage::OpenCL) { const auto cl_compile_ctx = context.cl_compile_context(); ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr); - const KernelProperties properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); + const KernelProperties properties = + IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); TensorShape logits_sum_shape = src->tensor_shape(); TensorInfo logits(src->clone()->set_tensor_shape(logits_sum_shape)); @@ -86,7 +88,8 @@ Status GpuSoftmax::is_supported_op(const GpuWorkloadContext &context, arguments_norm.add_const_tensor(ACL_SRC_1, &sum); arguments_norm.add_const_tensor(ACL_DST_0, &dst_info_to_validate); - ARM_COMPUTE_RETURN_ON_ERROR(ClComponentLogits1DMaxShiftExpSum::validate(properties, arguments_exp_sum, attributes)); + ARM_COMPUTE_RETURN_ON_ERROR( + ClComponentLogits1DMaxShiftExpSum::validate(properties, arguments_exp_sum, attributes)); ARM_COMPUTE_RETURN_ON_ERROR(ClComponentLogits1DNorm::validate(properties, arguments_norm, attributes)); } else @@ -105,14 +108,16 @@ Status GpuSoftmax::validate_op(const GpuWorkloadSketch &sketch, ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id() || !dst->has_valid_id()); ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->num_dimensions() > 4, "Only up to 4 dimensions are supported"); - ARM_COMPUTE_RETURN_ERROR_ON(attributes.axis() < static_cast(-src->num_dimensions()) || static_cast(src->num_dimensions()) <= attributes.axis()); + ARM_COMPUTE_RETURN_ERROR_ON(attributes.axis() < static_cast(-src->num_dimensions()) || + static_cast(src->num_dimensions()) <= attributes.axis()); // Auto initialize dst tensor info TensorInfo dst_info_to_validate = *dst; auto_init_if_empty(dst_info_to_validate, *src->clone()); - const size_t actual_axis = static_cast(wrap_around(attributes.axis(), static_cast(src->num_dimensions()))); - const bool needs_permute = actual_axis != 0; + const size_t actual_axis = + static_cast(wrap_around(attributes.axis(), static_cast(src->num_dimensions()))); + const bool needs_permute = actual_axis != 0; ARM_COMPUTE_RETURN_ERROR_ON_MSG(needs_permute, "Dynamic fusion softmax on axis!=0 not supported yet."); // Perform fusion test and check if the operator meets the fusion constraints @@ -128,17 +133,16 @@ Status GpuSoftmax::validate_op(const GpuWorkloadSketch &sketch, return is_supported_op(*sketch.gpu_context(), src, &dst_info_to_validate, attributes); } -void GpuSoftmax::create_op(GpuWorkloadSketch &sketch, - ITensorInfo *src, - ITensorInfo *dst, - const Attributes &attributes) +void GpuSoftmax::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, ITensorInfo *dst, const Attributes &attributes) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_LOG_PARAMS(src, dst, attributes); TensorShape logits_sum_shape = src->tensor_shape(); - ITensorInfo *logits = sketch.implementation().create_auxiliary_tensor(src->clone()->set_id(ITensorInfo::invalid_tensor_id).set_tensor_shape(logits_sum_shape)); + ITensorInfo *logits = sketch.implementation().create_auxiliary_tensor( + src->clone()->set_id(ITensorInfo::invalid_tensor_id).set_tensor_shape(logits_sum_shape)); logits_sum_shape.set(0, 1); - ITensorInfo *sum = sketch.implementation().create_auxiliary_tensor(src->clone()->set_id(ITensorInfo::invalid_tensor_id).set_tensor_shape(logits_sum_shape)); + ITensorInfo *sum = sketch.implementation().create_auxiliary_tensor( + src->clone()->set_id(ITensorInfo::invalid_tensor_id).set_tensor_shape(logits_sum_shape)); // Auto initialize dst tensor info and the auxiliary tensor infos as well auto_init_if_empty(*dst, *src->clone()); @@ -151,7 +155,7 @@ void GpuSoftmax::create_op(GpuWorkloadSketch &sketch, auto &comp_graph = sketch.implementation().component_graph(); const auto sketch_ctx = sketch.implementation().context(); - if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL) + if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL) { const auto cl_compile_ctx = sketch_ctx->cl_compile_context(); ARM_COMPUTE_UNUSED(cl_compile_ctx); @@ -160,7 +164,7 @@ void GpuSoftmax::create_op(GpuWorkloadSketch &sketch, // Add Direct Conv2d Component { auto properties = IGpuKernelComponent::Properties(); - properties.stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); + properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); ArgumentPack arguments_exp_sum; ArgumentPack arguments_norm; diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuSub.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuSub.cpp index 8240008f2a..e5d62c9930 100644 --- a/src/dynamic_fusion/sketch/gpu/operators/GpuSub.cpp +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuSub.cpp @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSub.h" + #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h" #include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h" @@ -32,12 +33,11 @@ namespace experimental { namespace dynamic_fusion { -Status GpuSub::validate_op(const GpuWorkloadSketch &sketch, - const ITensorInfo *lhs, - const ITensorInfo *rhs) +Status GpuSub::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *lhs, const ITensorInfo *rhs) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32, DataType::U8, DataType::S16, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32, DataType::U8, + DataType::S16, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->data_type() != rhs->data_type(), "Input tensors must be the same data type"); // Set the elementwise operation to Sub then call the elementwise common validate_op @@ -46,12 +46,11 @@ Status GpuSub::validate_op(const GpuWorkloadSketch &sketch, return GpuElementwiseBinaryCommon::validate_op(sketch, lhs, rhs, common_attributes); } -Status GpuSub::is_supported_op(const GpuWorkloadContext &context, - const ITensorInfo *lhs, - const ITensorInfo *rhs) +Status GpuSub::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *lhs, const ITensorInfo *rhs) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32, DataType::U8, DataType::S16, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32, DataType::U8, + DataType::S16, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->data_type() != rhs->data_type(), "Input tensors must be the same data type"); // Set the elementwise operation to Sub then call the elementwise common is_supported_op @@ -60,9 +59,7 @@ Status GpuSub::is_supported_op(const GpuWorkloadContext &context, return GpuElementwiseBinaryCommon::is_supported_op(context, lhs, rhs, common_attributes); } -ITensorInfo *GpuSub::create_op(GpuWorkloadSketch &sketch, - ITensorInfo *lhs, - ITensorInfo *rhs) +ITensorInfo *GpuSub::create_op(GpuWorkloadSketch &sketch, ITensorInfo *lhs, ITensorInfo *rhs) { // No need to log or validate as they'll be handled inside GpuElementwiseBinaryCommon::create_op() // Set the elementwise operation to Sub then call the elementwise common create_op diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuTanh.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuTanh.cpp index c00716c76e..bf0f274c5c 100644 --- a/src/dynamic_fusion/sketch/gpu/operators/GpuTanh.cpp +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuTanh.cpp @@ -23,14 +23,15 @@ */ #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuTanh.h" + #include "arm_compute/core/experimental/Types.h" +#include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" #include "src/dynamic_fusion/sketch/ArgumentPack.h" -#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/common/utils/Log.h" namespace arm_compute { @@ -40,9 +41,7 @@ namespace dynamic_fusion { namespace { -Status is_supported_op_helper(const GpuWorkloadContext &context, - const ITensorInfo *src, - const ITensorInfo *dst) +Status is_supported_op_helper(const GpuWorkloadContext &context, const ITensorInfo *src, const ITensorInfo *dst) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); @@ -50,20 +49,21 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, TensorInfo dst_info_to_validate; const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate; - if(dst != nullptr) + if (dst != nullptr) { dst_info_to_validate_ptr = dst; } auto_init_if_empty(dst_info_to_validate, *src->clone()); - const ClComponentActivation::Attributes act_info{ ActivationLayerInfo::ActivationFunction::TANH }; + const ClComponentActivation::Attributes act_info{ActivationLayerInfo::ActivationFunction::TANH}; // Check components - if(context.gpu_language() == GpuLanguage::OpenCL) + if (context.gpu_language() == GpuLanguage::OpenCL) { // Validate Activation Component - const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); + const auto properties = + IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); ArgumentPack arguments; arguments.add_const_tensor(ACL_SRC, src); @@ -80,14 +80,12 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, constexpr GpuOperatorType operator_type = GpuOperatorType::Simple; } // namespace -Status GpuTanh::is_supported_op(const GpuWorkloadContext &context, - const ITensorInfo *src) +Status GpuTanh::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src) { return is_supported_op_helper(context, src, nullptr); } -Status GpuTanh::validate_op(const GpuWorkloadSketch &sketch, - const ITensorInfo *src) +Status GpuTanh::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); @@ -112,8 +110,7 @@ Status GpuTanh::validate_op(const GpuWorkloadSketch &sketch, return is_supported_op_helper(*sketch.gpu_context(), src, &dst_info_to_validate); } -ITensorInfo *GpuTanh::create_op(GpuWorkloadSketch &sketch, - ITensorInfo *src) +ITensorInfo *GpuTanh::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src) { ARM_COMPUTE_ERROR_ON_NULLPTR(src); ARM_COMPUTE_LOG_PARAMS(src); @@ -128,15 +125,15 @@ ITensorInfo *GpuTanh::create_op(GpuWorkloadSketch &sketch, // Translate into components and add to component graph GpuKernelComponentGraph &comp_graph = sketch.implementation().component_graph(); - const ClComponentActivation::Attributes act_info{ ActivationLayerInfo::ActivationFunction::TANH }; + const ClComponentActivation::Attributes act_info{ActivationLayerInfo::ActivationFunction::TANH}; const auto *const sketch_ctx = sketch.implementation().context(); - if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL) + if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL) { // Add Activation Component auto properties = IGpuKernelComponent::Properties(); - properties.stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); + properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); ArgumentPack arguments; arguments.add_const_tensor(ACL_SRC, src); diff --git a/src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.cpp b/src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.cpp index 7c087c9a7b..d79a4c42c9 100644 --- a/src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.cpp +++ b/src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.cpp @@ -22,11 +22,12 @@ * SOFTWARE. */ #include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h" + #include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/dynamic_fusion/sketch/ArgumentPack.h" -#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" namespace arm_compute { @@ -38,9 +39,10 @@ namespace { void calculate_and_init_dst_if_empty(ITensorInfo *dst, const ITensorInfo *lhs, const ITensorInfo *rhs) { - if(dst->total_size() == 0U) + if (dst->total_size() == 0U) { - const std::pair broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*lhs, *rhs); + const std::pair broadcast_pair = + ITensorInfo::broadcast_shape_and_valid_region(*lhs, *rhs); auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(broadcast_pair.first)); } } @@ -56,7 +58,7 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, TensorInfo dst_info_to_validate; const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate; - if(dst != nullptr) + if (dst != nullptr) { dst_info_to_validate_ptr = dst; } @@ -64,7 +66,7 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, calculate_and_init_dst_if_empty(&dst_info_to_validate, lhs, rhs); // Check components - if(context.gpu_language() == GpuLanguage::OpenCL) + if (context.gpu_language() == GpuLanguage::OpenCL) { const auto cl_compile_ctx = context.cl_compile_context(); ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr); @@ -90,7 +92,8 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, GpuOperatorType operator_type = GpuOperatorType::Simple; } // namespace -ElementwiseBinaryCommonAttributes &ElementwiseBinaryCommonAttributes::operation(const ElementwiseBinaryCommonAttributes::ElementwiseOp &operation) +ElementwiseBinaryCommonAttributes & +ElementwiseBinaryCommonAttributes::operation(const ElementwiseBinaryCommonAttributes::ElementwiseOp &operation) { _operation = operation; return *this; @@ -157,14 +160,14 @@ ITensorInfo *GpuElementwiseBinaryCommon::create_op(GpuWorkloadSketch const auto sketch_ctx = sketch.implementation().context(); - if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL) + if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL) { ARM_COMPUTE_ERROR_ON_NULLPTR(sketch_ctx->cl_compile_context()); // Add ElementwiseBinary Component { auto properties = IGpuKernelComponent::Properties(); - properties.stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); + properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run}); ArgumentPack arguments; arguments.add_const_tensor(ACL_SRC_0, lhs); diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp index 0972b4e8e2..775b0a0c8c 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp @@ -22,8 +22,10 @@ * SOFTWARE. */ #include "GpuKernelVariableTable.h" + #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/ITensorInfo.h" + #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" namespace arm_compute @@ -32,14 +34,17 @@ namespace experimental { namespace dynamic_fusion { -void GpuKernelVariableTable::declare_variable(const GpuKernelComponentGroup &comp_group, const ITensorInfo *tensor, GpuKernelArgumentInfo argument_info, const std::string &alias) +void GpuKernelVariableTable::declare_variable(const GpuKernelComponentGroup &comp_group, + const ITensorInfo *tensor, + GpuKernelArgumentInfo argument_info, + const std::string &alias) { ARM_COMPUTE_ERROR_ON_MSG(!tensor->has_valid_id(), "Tensor info with valid id expected"); // Do not re-declare if the variable associated with the tensor has already been declared auto it = _vars.find(tensor->id()); - if(it != _vars.end()) + if (it != _vars.end()) { ARM_COMPUTE_ERROR_ON(!(it->second.kernel_argument_info == argument_info)); return; @@ -47,14 +52,12 @@ void GpuKernelVariableTable::declare_variable(const GpuKernelComponentGroup &com const auto target = comp_group.get_tile_for_tensor(tensor); - if(target != tensor) + if (target != tensor) { // If the tensor uses a shared tile, don't declare another variable. it = _vars.find(target->id()); - ARM_COMPUTE_ERROR_ON_MSG( - it == _vars.end(), - "The variable used for this tensor must have been declared."); + ARM_COMPUTE_ERROR_ON_MSG(it == _vars.end(), "The variable used for this tensor must have been declared."); _vars[tensor->id()] = it->second; } @@ -64,7 +67,7 @@ void GpuKernelVariableTable::declare_variable(const GpuKernelComponentGroup &com std::stringstream ss; ss << alias << "_t" << abs(tensor->id()); const auto uniq_name = ss.str(); - TensorVariable var{ tensor->id(), uniq_name, argument_info }; + TensorVariable var{tensor->id(), uniq_name, argument_info}; _vars.emplace(tensor->id(), var); } @@ -76,12 +79,13 @@ GpuKernelVariableTable::TensorVariable GpuKernelVariableTable::get_variable(cons return var; } -GpuKernelVariableTable::VariableList GpuKernelVariableTable::get_variable_list(const std::vector &tensors) const +GpuKernelVariableTable::VariableList +GpuKernelVariableTable::get_variable_list(const std::vector &tensors) const { VariableList vars{}; - for(const auto &tensor : tensors) + for (const auto &tensor : tensors) { - if(!tensor->has_valid_id()) + if (!tensor->has_valid_id()) { continue; } @@ -90,23 +94,19 @@ GpuKernelVariableTable::VariableList GpuKernelVariableTable::get_variable_list(c return vars; } -TagVal::TagVal(const GpuKernelVariableTable::TensorVariable &var) - : value{ var.uniq_name } +TagVal::TagVal(const GpuKernelVariableTable::TensorVariable &var) : value{var.uniq_name} { } -TagVal::TagVal(const std::string &val) - : value{ val } +TagVal::TagVal(const std::string &val) : value{val} { } -TagVal::TagVal(const char *val) - : value{ std::string(val) } +TagVal::TagVal(const char *val) : value{std::string(val)} { } -TagVal::TagVal(const DataType &data_type) - : value{ get_cl_type_from_data_type(data_type) } +TagVal::TagVal(const DataType &data_type) : value{get_cl_type_from_data_type(data_type)} { } } // namespace dynamic_fusion diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h b/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h index a49d38e10c..c17f131ada 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h +++ b/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h @@ -25,6 +25,7 @@ #define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_GPUKERNELVARIABLETABLE #include "arm_compute/core/ITensorInfo.h" + #include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" #include "support/AclRequires.h" #include "support/StringSupport.h" @@ -55,11 +56,11 @@ public: struct TensorVariable { public: - TensorVariable() = default; - TensorVariable(const TensorVariable &) = default; + TensorVariable() = default; + TensorVariable(const TensorVariable &) = default; TensorVariable &operator=(const TensorVariable &) = default; - ITensorInfo::Id id{ ITensorInfo::invalid_tensor_id }; - std::string uniq_name{ "empty" }; // Unique name, also the final variable name used in the built code + ITensorInfo::Id id{ITensorInfo::invalid_tensor_id}; + std::string uniq_name{"empty"}; // Unique name, also the final variable name used in the built code GpuKernelArgumentInfo kernel_argument_info{}; bool has_valid_id() const { @@ -76,7 +77,10 @@ public: * @param[in] argument_info Kernel argument information * @param[in] alias Alias for the variable. Will be used as part of the variable name */ - void declare_variable(const GpuKernelComponentGroup &comp_group, const ITensorInfo *tensor, GpuKernelArgumentInfo argument_info, const std::string &alias = "unnamed"); + void declare_variable(const GpuKernelComponentGroup &comp_group, + const ITensorInfo *tensor, + GpuKernelArgumentInfo argument_info, + const std::string &alias = "unnamed"); /** Get the @ref TensorVariable associated with @p tensor * * @param[in] tensor Tensor info to be queried @@ -106,8 +110,7 @@ struct TagVal TagVal(const GpuKernelVariableTable::TensorVariable &var); /** Construct a @ref TagVal from an integral type */ template ::value)> - TagVal(T val) - : value{ support::cpp11::to_string(val) } + TagVal(T val) : value{support::cpp11::to_string(val)} { } /** Construct a @ref TagVal from a string */ diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h b/src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h index 4a1fb142d6..9d0b4f592a 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h +++ b/src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h @@ -27,6 +27,7 @@ #include "arm_compute/core/CL/CLCompileContext.h" #include "arm_compute/core/ITensorInfo.h" #include "arm_compute/core/Window.h" + #include "src/dynamic_fusion/sketch/ArgumentPack.h" #include "src/dynamic_fusion/sketch/gpu/components/Types.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h" @@ -57,8 +58,7 @@ public: * @param[in] id Component id * @param[in] tensors Tensor arguments to the components */ - IGpuTemplateComponentWriter(ComponentId id, const ArgumentPack &tensors) - : _id{ id }, _tensors{ tensors } + IGpuTemplateComponentWriter(ComponentId id, const ArgumentPack &tensors) : _id{id}, _tensors{tensors} { } /** Destructor */ @@ -112,7 +112,7 @@ public: /** Generate the header list used in the component */ virtual std::set get_headers_list() const { - return std::set {}; + return std::set{}; } /** Generate the execution window for the component */ virtual Window get_window() const @@ -131,7 +131,7 @@ public: } private: - ComponentId _id{ -1 }; + ComponentId _id{-1}; ArgumentPack _tensors{}; }; } // namespace dynamic_fusion diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp index 3c7c843dd8..c165fb5f33 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/helpers/WindowHelpers.h" #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" #include "support/StringSupport.h" @@ -39,10 +40,7 @@ namespace dynamic_fusion ClTemplateActivation::ClTemplateActivation(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes) - : IGpuTemplateComponentWriter{ id, tensors }, - _src{}, - _dst{}, - _attributes{ attributes } + : IGpuTemplateComponentWriter{id, tensors}, _src{}, _dst{}, _attributes{attributes} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC); _dst = this->tensors().get_const_tensor(TensorType::ACL_DST); @@ -62,7 +60,7 @@ std::string ClTemplateActivation::get_component_code(const ComponentGroup &comp_ code = R"_( //------------------ START KERNEL {{meta_kernel_id}} --------------------- )_"; - if(is_root) + if (is_root) { code += R"_( // IN(src) {{src}} @@ -104,17 +102,11 @@ LOOP_UNROLLING(int, i, 0, 1, M0, void ClTemplateActivation::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const { - vtable.declare_variable( - comp_group, - _src, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), - "src"); - - vtable.declare_variable( - comp_group, - _dst, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), - "dst"); + vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), + "src"); + + vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), + "dst"); } TagLUT ClTemplateActivation::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const @@ -173,7 +165,7 @@ std::string ClTemplateActivation::get_config_id() const std::set ClTemplateActivation::get_headers_list() const { - return std::set{ "helpers.h", "tile_helpers.h", "activation_float_helpers.h" }; + return std::set{"helpers.h", "tile_helpers.h", "activation_float_helpers.h"}; } Window ClTemplateActivation::get_window() const diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.h index ec78cf6ce5..88ee370342 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.h +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.h @@ -26,6 +26,7 @@ #include "arm_compute/core/experimental/Types.h" #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h" diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp index 4956879ad3..0da3a73801 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/helpers/WindowHelpers.h" #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" @@ -35,7 +36,7 @@ namespace experimental namespace dynamic_fusion { ClTemplateCast::ClTemplateCast(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes) - : IGpuTemplateComponentWriter{ id, tensors }, _src{}, _dst{}, _attributes{ attributes } + : IGpuTemplateComponentWriter{id, tensors}, _src{}, _dst{}, _attributes{attributes} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); @@ -62,7 +63,7 @@ std::string ClTemplateCast::get_component_code(const ComponentGroup &comp_group) //------------------ START KERNEL {{meta_kernel_id}} CAST --------------------- )_"; - if(is_root) + if (is_root) { code += R"_( // IN_0(src) {{src}} @@ -82,14 +83,15 @@ TILE(uint, M0, 1, g_dst_indirect_y); { )_"; - if(kernel_name == "cast_down" && is_data_type_quantized(_src->data_type())) + if (kernel_name == "cast_down" && is_data_type_quantized(_src->data_type())) { code += R"_( {{tmp}}[m0].v ^= (VEC_DATA_TYPE({{DATA_TYPE_IN}}, N0))0x80; )_"; } - if(kernel_name == "cast_down" && (is_data_type_float(_src->data_type()) || _attributes.convert_policy() == ConvertPolicy::SATURATE)) + if (kernel_name == "cast_down" && + (is_data_type_float(_src->data_type()) || _attributes.convert_policy() == ConvertPolicy::SATURATE)) { code += R"_( {{dst}}[m0].v = CONVERT_SAT({{tmp}}[m0].v, VEC_DATA_TYPE({{DATA_TYPE_OUT}}, N0)); @@ -106,7 +108,7 @@ TILE(uint, M0, 1, g_dst_indirect_y); }) )_"; - if(is_root) + if (is_root) { code += R"_( LOOP_UNROLLING(int, i, 0, 1, M0, @@ -128,17 +130,11 @@ TILE(uint, M0, 1, g_dst_indirect_y); void ClTemplateCast::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const { - vtable.declare_variable( - comp_group, - _src, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), - "src"); - - vtable.declare_variable( - comp_group, - _dst, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), - "dst"); + vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), + "src"); + + vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), + "dst"); } TagLUT ClTemplateCast::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const @@ -199,7 +195,7 @@ std::string ClTemplateCast::get_config_id() const std::set ClTemplateCast::get_headers_list() const { - return std::set{ "helpers.h", "tile_helpers.h" }; + return std::set{"helpers.h", "tile_helpers.h"}; } Window ClTemplateCast::get_window() const diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp index ab7cc9f05a..8380620ab2 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp @@ -36,17 +36,17 @@ ClTemplateDepthwiseConv2d::ClTemplateDepthwiseConv2d(ComponentId const ArgumentPack &tensors, const Attributes &attributes, const Settings &settings) - : IGpuTemplateComponentWriter{ id, tensors }, + : IGpuTemplateComponentWriter{id, tensors}, _src{}, _weight{}, _bias{}, _dst{}, - _attributes{ attributes }, - _settings{ settings } + _attributes{attributes}, + _settings{settings} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _weight = this->tensors().get_const_tensor(TensorType::ACL_SRC_1); - if(this->tensors().get_const_tensor(TensorType::ACL_SRC_2)) + if (this->tensors().get_const_tensor(TensorType::ACL_SRC_2)) { _bias = this->tensors().get_const_tensor(TensorType::ACL_SRC_2); } @@ -71,7 +71,7 @@ std::string ClTemplateDepthwiseConv2d::get_component_code(const ComponentGroup & // IN_1(wei) {{weight}} )_"; - if(_bias != nullptr && _bias->has_valid_id()) + if (_bias != nullptr && _bias->has_valid_id()) { code += R"_( // IN_1(bia) {{bias}} @@ -113,7 +113,7 @@ TILE(uint, M0, 1, g_dst_indirect_y); }) )_"; - if(_weight->dimension(height_idx) < 5) + if (_weight->dimension(height_idx) < 5) { code += R"_( LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT, @@ -147,7 +147,7 @@ TILE(uint, M0, 1, g_dst_indirect_y); { )_"; - if(!_settings.is_fma_available()) + if (!_settings.is_fma_available()) { code += R"_( {{dst}}[m0].v += a[xk + m0].v * b[xk].v; @@ -166,14 +166,14 @@ TILE(uint, M0, 1, g_dst_indirect_y); } )_"; - if(_weight->dimension(height_idx) < 5) + if (_weight->dimension(height_idx) < 5) { code += R"_( ) )_"; } - if(_bias && _bias->has_valid_id()) + if (_bias && _bias->has_valid_id()) { code += R"_( TILE({{BIA_DATA_TYPE}}, 1, N0, {{bias}}); @@ -198,44 +198,31 @@ TILE(uint, M0, 1, g_dst_indirect_y); return code; } -void ClTemplateDepthwiseConv2d::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const +void ClTemplateDepthwiseConv2d::declare_variables(GpuKernelVariableTable &vtable, + const ComponentGroup &comp_group) const { - const GpuKernelArgumentInfo::Type input_type = _settings.export_input_to_cl_image() ? - GpuKernelArgumentInfo::Type::Tensor_4D_t_Image : - GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer; - - vtable.declare_variable( - comp_group, - _src, - GpuKernelArgumentInfo(input_type), - "src"); - - const GpuKernelArgumentInfo::Type weight_type = _settings.export_weights_to_cl_image() ? - GpuKernelArgumentInfo::Type::Tensor_4D_t_Image : - GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer; - - vtable.declare_variable( - comp_group, - _weight, - GpuKernelArgumentInfo(weight_type), - "weight"); - - if(_bias != nullptr && _bias->has_valid_id()) // optional bias + const GpuKernelArgumentInfo::Type input_type = _settings.export_input_to_cl_image() + ? GpuKernelArgumentInfo::Type::Tensor_4D_t_Image + : GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer; + + vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(input_type), "src"); + + const GpuKernelArgumentInfo::Type weight_type = _settings.export_weights_to_cl_image() + ? GpuKernelArgumentInfo::Type::Tensor_4D_t_Image + : GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer; + + vtable.declare_variable(comp_group, _weight, GpuKernelArgumentInfo(weight_type), "weight"); + + if (_bias != nullptr && _bias->has_valid_id()) // optional bias { - vtable.declare_variable( - comp_group, - _bias, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Vector), - "bias"); + vtable.declare_variable(comp_group, _bias, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Vector), "bias"); } - vtable.declare_variable( - comp_group, - _dst, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), - "dst"); + vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), + "dst"); } -TagLUT ClTemplateDepthwiseConv2d::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const +TagLUT ClTemplateDepthwiseConv2d::get_tag_lut(const GpuKernelVariableTable &vtable, + const ComponentGroup &comp_group) const { TagLUT lut{}; @@ -243,7 +230,7 @@ TagLUT ClTemplateDepthwiseConv2d::get_tag_lut(const GpuKernelVariableTable &vtab lut["src"] = vtable.get_variable(_src); lut["weight"] = vtable.get_variable(_weight); - if(_bias != nullptr && _bias->has_valid_id()) // optional bias + if (_bias != nullptr && _bias->has_valid_id()) // optional bias { lut["bias"] = vtable.get_variable(_bias); lut["BIA_DATA_TYPE"] = get_cl_type_from_data_type(_bias->data_type()); @@ -259,7 +246,7 @@ TagLUT ClTemplateDepthwiseConv2d::get_tag_lut(const GpuKernelVariableTable &vtab lut["SRC_DATA_TYPE"] = _src->data_type(); lut["WEI_DATA_TYPE"] = _weight->data_type(); - switch(vtable.get_variable(_src).kernel_argument_info.type) + switch (vtable.get_variable(_src).kernel_argument_info.type) { case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D: case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D: @@ -271,7 +258,7 @@ TagLUT ClTemplateDepthwiseConv2d::get_tag_lut(const GpuKernelVariableTable &vtab break; } - switch(vtable.get_variable(_weight).kernel_argument_info.type) + switch (vtable.get_variable(_weight).kernel_argument_info.type) { case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D: case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D: @@ -318,7 +305,7 @@ CLBuildOptions ClTemplateDepthwiseConv2d::get_build_options(const ComponentGroup CLBuildOptions build_opts{}; - if(_settings.fast_relaxed_math()) + if (_settings.fast_relaxed_math()) { build_opts.add_option("-cl-fast-relaxed-math"); } @@ -361,7 +348,7 @@ std::string ClTemplateDepthwiseConv2d::get_config_id() const std::set ClTemplateDepthwiseConv2d::get_headers_list() const { - return std::set{ "helpers.h", "tile_helpers.h" }; + return std::set{"helpers.h", "tile_helpers.h"}; } Window ClTemplateDepthwiseConv2d::get_window() const diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.h index 84b689ef64..5d04c687c3 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.h +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.h @@ -25,6 +25,7 @@ #define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEDEPTHWISECONV2D #include "arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h" + #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h" diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp index 3322487910..f6a7a58d1d 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp @@ -23,14 +23,13 @@ */ #include "ClTemplateDirectConv2d.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" -#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h" - #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" -#include "src/core/helpers/WindowHelpers.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" #include "support/StringSupport.h" namespace arm_compute @@ -43,17 +42,17 @@ ClTemplateDirectConv2d::ClTemplateDirectConv2d(ComponentId const ArgumentPack &tensors, const Attributes &attributes, const Settings &settings) - : IGpuTemplateComponentWriter{ id, tensors }, + : IGpuTemplateComponentWriter{id, tensors}, _src{}, _weight{}, _bias{}, _dst{}, - _attributes{ attributes }, - _settings{ settings } + _attributes{attributes}, + _settings{settings} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _weight = this->tensors().get_const_tensor(TensorType::ACL_SRC_1); - if(this->tensors().get_const_tensor(TensorType::ACL_SRC_2)) + if (this->tensors().get_const_tensor(TensorType::ACL_SRC_2)) { _bias = this->tensors().get_const_tensor(TensorType::ACL_SRC_2); } @@ -79,7 +78,7 @@ std::string ClTemplateDirectConv2d::get_component_code(const ComponentGroup &com // IN_0(src) {{src}} // IN_1(wei) {{weight}} )_"; - if(_bias && _bias->has_valid_id()) + if (_bias && _bias->has_valid_id()) { code += R"_( // IN_1(bia) {{bias}} @@ -161,7 +160,7 @@ TILE(uint, M0, 1, g_dst_indirect_y); } )_"; - if(leftover_loop) + if (leftover_loop) { code += R"_( for(; ck < _ISRC_CHANNELS; ++ck) @@ -186,9 +185,9 @@ TILE(uint, M0, 1, g_dst_indirect_y); T_MMUL({{SRC_DATA_TYPE}}, {{WEI_DATA_TYPE}}, {{ACC_DATA_TYPE}}, M0, N0, 1, NT, T, a, b, {{dst}}); } )_"; -} + } -code += R"_( + code += R"_( #undef _I_WEI_WIDTH #undef _I_WEI_HEIGHT #undef _ISRC_WIDTH @@ -202,7 +201,7 @@ code += R"_( } )_"; - if(_bias && _bias->has_valid_id()) + if (_bias && _bias->has_valid_id()) { code += R"_( TILE({{BIA_DATA_TYPE}}, 1, N0, bias0); @@ -211,9 +210,9 @@ code += R"_( T_ELTWISE_BROADCAST_ADD_X({{ACC_DATA_TYPE}}, M0, N0, {{dst}}, bias0, {{dst}}); )_"; -} + } -code += R"_( + code += R"_( LOOP_UNROLLING(int, i, 0, 1, M0, { g_dst_indirect_y[i].v = (uint)min(g_ind_1 + i, (int)({{DST_WIDTH}} * {{DST_HEIGHT}}) - 1); @@ -227,32 +226,19 @@ code += R"_( void ClTemplateDirectConv2d::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const { - vtable.declare_variable( - comp_group, - _src, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), - "src"); - - const GpuKernelArgumentInfo::Type weight_type = _settings.export_to_cl_image() ? GpuKernelArgumentInfo::Type::Tensor_4D_t_Image : GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer; - vtable.declare_variable( - comp_group, - _weight, - GpuKernelArgumentInfo(weight_type), - "weight"); - - if(_bias && _bias->has_valid_id()) // optional bias + vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), + "src"); + + const GpuKernelArgumentInfo::Type weight_type = _settings.export_to_cl_image() + ? GpuKernelArgumentInfo::Type::Tensor_4D_t_Image + : GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer; + vtable.declare_variable(comp_group, _weight, GpuKernelArgumentInfo(weight_type), "weight"); + + if (_bias && _bias->has_valid_id()) // optional bias { - vtable.declare_variable( - comp_group, - _bias, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Vector), - "bias"); + vtable.declare_variable(comp_group, _bias, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Vector), "bias"); } - vtable.declare_variable( - comp_group, - _dst, - GpuKernelArgumentInfo(common_tensor_type), - "dst"); + vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(common_tensor_type), "dst"); } TagLUT ClTemplateDirectConv2d::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const @@ -262,7 +248,7 @@ TagLUT ClTemplateDirectConv2d::get_tag_lut(const GpuKernelVariableTable &vtable, lut["src"] = vtable.get_variable(_src); lut["weight"] = vtable.get_variable(_weight); - if(_bias && _bias->has_valid_id()) // optional bias + if (_bias && _bias->has_valid_id()) // optional bias { lut["bias"] = vtable.get_variable(_bias); lut["BIA_DATA_TYPE"] = get_cl_type_from_data_type(_bias->data_type()); @@ -279,34 +265,34 @@ TagLUT ClTemplateDirectConv2d::get_tag_lut(const GpuKernelVariableTable &vtable, lut["WEI_DATA_TYPE"] = _weight->data_type(); lut["SRC_TENSOR_TYPE"] = "BUFFER"; - switch(vtable.get_variable(_weight).kernel_argument_info.type) + switch (vtable.get_variable(_weight).kernel_argument_info.type) { case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D: case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D: case GpuKernelArgumentInfo::Type::Tensor_4D_t_Image: - { - lut["WEI_TENSOR_TYPE"] = "IMAGE"; - break; - } + { + lut["WEI_TENSOR_TYPE"] = "IMAGE"; + break; + } default: - { - lut["WEI_TENSOR_TYPE"] = "BUFFER"; - break; - } + { + lut["WEI_TENSOR_TYPE"] = "BUFFER"; + break; + } } - const auto width_idx = 1; - const auto height_idx = 2; + const auto width_idx = 1; + const auto height_idx = 2; const auto channel_idx = 0; - lut["SRC_WIDTH"] = _src->dimension(width_idx); - lut["SRC_HEIGHT"] = _src->dimension(height_idx); + lut["SRC_WIDTH"] = _src->dimension(width_idx); + lut["SRC_HEIGHT"] = _src->dimension(height_idx); lut["SRC_CHANNELS"] = _src->dimension(channel_idx); - lut["WEI_WIDTH"] = _weight->dimension(width_idx); - lut["WEI_HEIGHT"] = _weight->dimension(height_idx); + lut["WEI_WIDTH"] = _weight->dimension(width_idx); + lut["WEI_HEIGHT"] = _weight->dimension(height_idx); - lut["DST_WIDTH"] = _dst->dimension(width_idx); - lut["DST_HEIGHT"] = _dst->dimension(height_idx); + lut["DST_WIDTH"] = _dst->dimension(width_idx); + lut["DST_HEIGHT"] = _dst->dimension(height_idx); lut["DST_CHANNELS"] = _dst->dimension(channel_idx); lut["STRIDE_X"] = _attributes.stride().x(); @@ -324,14 +310,14 @@ CLBuildOptions ClTemplateDirectConv2d::get_build_options(const ComponentGroup &c { const unsigned int channel_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::CHANNEL); - const auto root_window = comp_group.get_root_component()->template_writer()->get_window(); - const unsigned int n0 = root_window.x().step(); - const unsigned int m0 = root_window.y().step(); - const unsigned int k0 = adjust_vec_size(_settings.direct_conv_descriptor().k0, _src->dimension(channel_idx)); + const auto root_window = comp_group.get_root_component()->template_writer()->get_window(); + const unsigned int n0 = root_window.x().step(); + const unsigned int m0 = root_window.y().step(); + const unsigned int k0 = adjust_vec_size(_settings.direct_conv_descriptor().k0, _src->dimension(channel_idx)); const unsigned int partial_store_n0 = _dst->dimension(0) % n0; CLBuildOptions build_opts{}; - if(_settings.fast_relaxed_math()) + if (_settings.fast_relaxed_math()) { build_opts.add_option("-cl-fast-relaxed-math"); } @@ -379,7 +365,7 @@ std::string ClTemplateDirectConv2d::get_config_id() const std::set ClTemplateDirectConv2d::get_headers_list() const { - return std::set{ "helpers.h", "tile_helpers.h" }; + return std::set{"helpers.h", "tile_helpers.h"}; } Window ClTemplateDirectConv2d::get_window() const diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h index 8988d3ca1c..03c8cd2f15 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h @@ -26,6 +26,7 @@ #include "arm_compute/core/experimental/Types.h" #include "arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h" + #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h" diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp index c0481ae190..78bff3c3f3 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp @@ -23,14 +23,13 @@ */ #include "ClTemplateElementwiseBinary.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" -#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h" - #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" -#include "src/core/helpers/WindowHelpers.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" #include "support/StringSupport.h" namespace arm_compute @@ -44,11 +43,7 @@ constexpr unsigned int vector_size_byte_opencl = 16; ClTemplateElementwiseBinary::ClTemplateElementwiseBinary(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes) - : IGpuTemplateComponentWriter{ id, tensors }, - _lhs{}, - _rhs{}, - _dst{}, - _attributes{ attributes } + : IGpuTemplateComponentWriter{id, tensors}, _lhs{}, _rhs{}, _dst{}, _attributes{attributes} { _lhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _rhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_1); @@ -69,67 +64,67 @@ std::string ClTemplateElementwiseBinary::get_component_code(const ComponentGroup const bool is_rhs_input = comp_group.is_input_tensor(_rhs); code = -R"_( + R"_( //------------------ START KERNEL {{meta_kernel_id}} {{ELTWISE_OP}} --------------------- )_"; - if(is_root) + if (is_root) { code += -R"_( + R"_( TILE(uint, M0, 1, g_dst_indirect_y); )_"; } - if(is_lhs_input) + if (is_lhs_input) { code += -R"_( + R"_( TILE({{DATA_TYPE}}, {{lhs_m0}}, N0, {{lhs}}); )_"; } - if(is_rhs_input) + if (is_rhs_input) { code += -R"_( + R"_( TILE({{DATA_TYPE}}, {{rhs_m0}}, N0, {{rhs}}); )_"; } code += -R"_( + R"_( { )_"; - if(is_lhs_input) + if (is_lhs_input) { code += -R"_( + R"_( {{lhs}}_offset_first_element_in_bytes += g_ind_2 * {{lhs}}_stride_w; T_LOAD({{DATA_TYPE}}, {{lhs_m0}}, {{lhs_n0}}, BUFFER, {{lhs}}, {{lhs_start_ind_0}}, {{lhs_start_ind_1}}, 1, {{lhs}}_stride_y, {{lhs}}); )_"; } - if(is_rhs_input) + if (is_rhs_input) { code += -R"_( + R"_( {{rhs}}_offset_first_element_in_bytes += g_ind_2 * {{rhs}}_stride_w; T_LOAD({{DATA_TYPE}}, {{rhs_m0}}, {{rhs_n0}}, BUFFER, {{rhs}}, {{rhs_start_ind_0}}, {{rhs_start_ind_1}}, 1, {{rhs}}_stride_y, {{rhs}}); )_"; } code += -R"_( + R"_( T_ELTWISE_{{BROADCAST_OP}}{{ELTWISE_OP}}({{DATA_TYPE}}, M0, N0, {{lhs}}, {{rhs}}, {{dst}}); )_"; - if(is_root) + if (is_root) { // Calculate the destination indirect Y code += -R"_( + R"_( LOOP_UNROLLING(int, i, 0, 1, M0, { g_dst_indirect_y[i].v = (uint)min(g_ind_1 + i, (int)({{arg_dst}}_w * {{arg_dst}}_h) - 1); @@ -139,7 +134,7 @@ R"_( } code += -R"_( + R"_( } //------------------ END KERNEL {{meta_kernel_id}} {{ELTWISE_OP}} --------------------- )_"; @@ -147,28 +142,18 @@ R"_( return code; } -void ClTemplateElementwiseBinary::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const +void ClTemplateElementwiseBinary::declare_variables(GpuKernelVariableTable &vtable, + const ComponentGroup &comp_group) const { - vtable.declare_variable( - comp_group, - _lhs, - GpuKernelArgumentInfo(common_tensor_type), - "lhs"); - - vtable.declare_variable( - comp_group, - _rhs, - GpuKernelArgumentInfo(common_tensor_type), - "rhs"); - - vtable.declare_variable( - comp_group, - _dst, - GpuKernelArgumentInfo(common_tensor_type), - "dst"); + vtable.declare_variable(comp_group, _lhs, GpuKernelArgumentInfo(common_tensor_type), "lhs"); + + vtable.declare_variable(comp_group, _rhs, GpuKernelArgumentInfo(common_tensor_type), "rhs"); + + vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(common_tensor_type), "dst"); } -TagLUT ClTemplateElementwiseBinary::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const +TagLUT ClTemplateElementwiseBinary::get_tag_lut(const GpuKernelVariableTable &vtable, + const ComponentGroup &comp_group) const { TagLUT lut{}; @@ -182,7 +167,7 @@ TagLUT ClTemplateElementwiseBinary::get_tag_lut(const GpuKernelVariableTable &vt lut["dst"] = vtable.get_variable(_dst); lut["arg_dst"] = vtable.get_variable(comp_group.get_any_dst_tensor()); - switch(_attributes.operation()) + switch (_attributes.operation()) { case Attributes::ElementwiseOp::Add: lut["ELTWISE_OP"] = "ADD"; @@ -197,10 +182,10 @@ TagLUT ClTemplateElementwiseBinary::get_tag_lut(const GpuKernelVariableTable &vt ARM_COMPUTE_ERROR("Arithmetic Operation not supported"); } - ARM_COMPUTE_ERROR_ON( - comp_group.is_intermediate_tensor(_lhs) && detail::have_different_dimensions(_lhs->tensor_shape(), _dst->tensor_shape(), 0)); - ARM_COMPUTE_ERROR_ON( - comp_group.is_intermediate_tensor(_rhs) && detail::have_different_dimensions(_rhs->tensor_shape(), _dst->tensor_shape(), 0)); + ARM_COMPUTE_ERROR_ON(comp_group.is_intermediate_tensor(_lhs) && + detail::have_different_dimensions(_lhs->tensor_shape(), _dst->tensor_shape(), 0)); + ARM_COMPUTE_ERROR_ON(comp_group.is_intermediate_tensor(_rhs) && + detail::have_different_dimensions(_rhs->tensor_shape(), _dst->tensor_shape(), 0)); // Set broadcast parameters // PRE: All tensors are broadcast-compatible @@ -228,9 +213,7 @@ TagLUT ClTemplateElementwiseBinary::get_tag_lut(const GpuKernelVariableTable &vt lut["rhs_m0"] = (rhs_broadcast_yz) ? "1" : "M0"; lut["rhs_start_ind_1"] = (rhs_broadcast_yz) ? "0" : "g_ind_1"; - lut["BROADCAST_OP"] = (lhs_broadcast_yz) ? "BROADCAST_LHS_X_" : - (rhs_broadcast_yz) ? "BROADCAST_RHS_X_" : - ""; + lut["BROADCAST_OP"] = (lhs_broadcast_yz) ? "BROADCAST_LHS_X_" : (rhs_broadcast_yz) ? "BROADCAST_RHS_X_" : ""; return lut; } @@ -268,7 +251,7 @@ std::string ClTemplateElementwiseBinary::get_config_id() const std::set ClTemplateElementwiseBinary::get_headers_list() const { - return std::set{ "helpers.h", "tile_helpers.h" }; + return std::set{"helpers.h", "tile_helpers.h"}; } Window ClTemplateElementwiseBinary::get_window() const @@ -279,8 +262,9 @@ Window ClTemplateElementwiseBinary::get_window() const // Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) and upper dimensions unchanged // This is in line with the collapsing convention used by operators like Conv2d output_shape.collapse(2U, 1U); - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0)); - Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration)); + const unsigned int num_elems_processed_per_iteration = + adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0)); + Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration)); return win; } diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.h index 8cca954efe..991c0eca44 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.h +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.h @@ -25,6 +25,7 @@ #define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEELEMENTWISEBINARY #include "arm_compute/core/experimental/Types.h" + #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h" @@ -48,9 +49,7 @@ public: * @param[in] tensors Tensor arguments to the components * @param[in] attributes Component attributes */ - ClTemplateElementwiseBinary(ComponentId id, - const ArgumentPack &tensors, - const Attributes &attributes); + ClTemplateElementwiseBinary(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes); /** Prevent instances of this class from being copy constructed */ ClTemplateElementwiseBinary(const ClTemplateElementwiseBinary &elementwise) = delete; /** Prevent instances of this class from being copied */ diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.cpp index a8d8d32b12..522c33a022 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/helpers/WindowHelpers.h" #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" #include "support/StringSupport.h" @@ -38,16 +39,12 @@ namespace dynamic_fusion { namespace { - constexpr unsigned int serial_vector_size = 8; +constexpr unsigned int serial_vector_size = 8; } // namespace ClTemplateLogits1DMaxShiftExpSum::ClTemplateLogits1DMaxShiftExpSum(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes) - : IGpuTemplateComponentWriter{ id, tensors }, - _src{}, - _sum{}, - _dst{}, - _attributes{ attributes } + : IGpuTemplateComponentWriter{id, tensors}, _src{}, _sum{}, _dst{}, _attributes{attributes} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _sum = this->tensors().get_const_tensor(TensorType::ACL_DST_0); @@ -79,7 +76,7 @@ std::string ClTemplateLogits1DMaxShiftExpSum::get_component_code(const Component const bool beta_defined = (_attributes.beta() != 1.f); - if(beta_defined) + if (beta_defined) { code += R"_( VEC_TYPE beta = (VEC_TYPE){{BETA}}; @@ -91,7 +88,7 @@ std::string ClTemplateLogits1DMaxShiftExpSum::get_component_code(const Component const unsigned int vector_size = adjust_vec_size(_serial_vector_size, reduction_dim_size); const bool non_multiple_of_n0 = ((reduction_dim_size % vector_size) != 0); - if(non_multiple_of_n0) + if (non_multiple_of_n0) { code += R"_( VEC_TYPE data = VLOAD(N0)(0, (__global {{DATA_TYPE}} *)src_addr); @@ -111,19 +108,19 @@ std::string ClTemplateLogits1DMaxShiftExpSum::get_component_code(const Component VEC_TYPE sum1D = 0; )_"; - if(non_multiple_of_n0) + if (non_multiple_of_n0) { code += R"_( data -= max_val; )_"; - if(beta_defined) + if (beta_defined) { code += R"_( data *= beta; )_"; } - if(_attributes.is_log_softmax()) + if (_attributes.is_log_softmax()) { code += R"_( VSTORE_PARTIAL(N0, PARTIAL_N0) @@ -153,14 +150,14 @@ std::string ClTemplateLogits1DMaxShiftExpSum::get_component_code(const Component data -= max_val; )_"; - if(beta_defined) + if (beta_defined) { code += R"_( data *= beta; )_"; } - if(_attributes.is_log_softmax()) + if (_attributes.is_log_softmax()) { code += R"_( VSTORE(N0) @@ -191,28 +188,18 @@ std::string ClTemplateLogits1DMaxShiftExpSum::get_component_code(const Component return code; } -void ClTemplateLogits1DMaxShiftExpSum::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const +void ClTemplateLogits1DMaxShiftExpSum::declare_variables(GpuKernelVariableTable &vtable, + const ComponentGroup &comp_group) const { - vtable.declare_variable( - comp_group, - _src, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), - "src"); - - vtable.declare_variable( - comp_group, - _sum, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), - "sum"); - - vtable.declare_variable( - comp_group, - _dst, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), - "dst"); + vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), "src"); + + vtable.declare_variable(comp_group, _sum, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), "sum"); + + vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), "dst"); } -TagLUT ClTemplateLogits1DMaxShiftExpSum::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const +TagLUT ClTemplateLogits1DMaxShiftExpSum::get_tag_lut(const GpuKernelVariableTable &vtable, + const ComponentGroup &comp_group) const { ARM_COMPUTE_UNUSED(comp_group); @@ -241,8 +228,8 @@ CLBuildOptions ClTemplateLogits1DMaxShiftExpSum::get_build_options(const Compone ARM_COMPUTE_UNUSED(comp_group); CLBuildOptions build_opts{}; - const unsigned int reduction_dim_size = _src->dimension(0); - const unsigned int vector_size = adjust_vec_size(serial_vector_size, reduction_dim_size); + const unsigned int reduction_dim_size = _src->dimension(0); + const unsigned int vector_size = adjust_vec_size(serial_vector_size, reduction_dim_size); build_opts.add_option("-DN0=" + support::cpp11::to_string(vector_size)); build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string((reduction_dim_size % vector_size))); @@ -264,7 +251,7 @@ std::string ClTemplateLogits1DMaxShiftExpSum::get_config_id() const std::set ClTemplateLogits1DMaxShiftExpSum::get_headers_list() const { - return std::set{ "helpers.h", "tile_helpers.h" }; + return std::set{"helpers.h", "tile_helpers.h"}; } Window ClTemplateLogits1DMaxShiftExpSum::get_window() const diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.h index 5d232c0cf2..ac9ddaa9d4 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.h +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.h @@ -46,7 +46,9 @@ public: * @param[in] tensors Tensor arguments to the components * @param[in] attributes Component attributes */ - ClTemplateLogits1DMaxShiftExpSum(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes); + ClTemplateLogits1DMaxShiftExpSum(ComponentId id, + const ArgumentPack &tensors, + const Attributes &attributes); /** Prevent instances of this class from being copy constructed */ ClTemplateLogits1DMaxShiftExpSum(const ClTemplateLogits1DMaxShiftExpSum &) = delete; /** Prevent instances of this class from being copied */ diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.cpp index 056e570a25..7d7c3e6673 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.cpp @@ -25,6 +25,7 @@ #include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" + #include "src/core/helpers/WindowHelpers.h" #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" #include "support/StringSupport.h" @@ -38,11 +39,7 @@ namespace dynamic_fusion ClTemplateLogits1DNorm::ClTemplateLogits1DNorm(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes) - : IGpuTemplateComponentWriter{ id, tensors }, - _src{}, - _sum{}, - _dst{}, - _attributes{ attributes } + : IGpuTemplateComponentWriter{id, tensors}, _src{}, _sum{}, _dst{}, _attributes{attributes} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _sum = this->tensors().get_const_tensor(TensorType::ACL_SRC_1); @@ -76,7 +73,7 @@ std::string ClTemplateLogits1DNorm::get_component_code(const ComponentGroup &com data0 = VLOAD(N0)(0, (__global {{DATA_TYPE}} *)src_addr); )_"; - if(_attributes.is_log_softmax()) + if (_attributes.is_log_softmax()) { code += R"_( sum_val = log(sum_val); @@ -101,23 +98,11 @@ std::string ClTemplateLogits1DNorm::get_component_code(const ComponentGroup &com void ClTemplateLogits1DNorm::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const { - vtable.declare_variable( - comp_group, - _src, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), - "src"); - - vtable.declare_variable( - comp_group, - _sum, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), - "sum"); - - vtable.declare_variable( - comp_group, - _dst, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), - "dst"); + vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), "src"); + + vtable.declare_variable(comp_group, _sum, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), "sum"); + + vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), "dst"); } TagLUT ClTemplateLogits1DNorm::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const @@ -168,14 +153,14 @@ std::string ClTemplateLogits1DNorm::get_config_id() const std::set ClTemplateLogits1DNorm::get_headers_list() const { - return std::set{ "helpers.h", "tile_helpers.h" }; + return std::set{"helpers.h", "tile_helpers.h"}; } Window ClTemplateLogits1DNorm::get_window() const { ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized"); constexpr unsigned int serial_vector_size = 16; - const unsigned int vector_size = adjust_vec_size(serial_vector_size, _src->dimension(0)); + const unsigned int vector_size = adjust_vec_size(serial_vector_size, _src->dimension(0)); Window win = calculate_max_window(*_src, Steps(vector_size)); return win.collapse(win, Window::DimZ); diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp index 34840c2100..ebb0374501 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp @@ -23,14 +23,13 @@ */ #include "ClTemplatePool2d.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" -#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h" - -#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" -#include "src/core/helpers/WindowHelpers.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" #include "support/StringSupport.h" namespace arm_compute @@ -50,11 +49,7 @@ ClTemplatePool2d::ClTemplatePool2d(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes, const Settings &settings) - : IGpuTemplateComponentWriter{ id, tensors }, - _src{}, - _dst{}, - _attributes{ attributes }, - _settings{ settings } + : IGpuTemplateComponentWriter{id, tensors}, _src{}, _dst{}, _attributes{attributes}, _settings{settings} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); @@ -71,7 +66,7 @@ std::string ClTemplatePool2d::get_component_code(const ComponentGroup &comp_grou ARM_COMPUTE_UNUSED(comp_group); // Condition to use 2x2 optimized kernel - if(_attributes.pool_size() == Size2D(2, 2)) + if (_attributes.pool_size() == Size2D(2, 2)) { return get_2x2_kernel_code(); } @@ -83,11 +78,13 @@ std::string ClTemplatePool2d::get_component_code(const ComponentGroup &comp_grou std::string ClTemplatePool2d::get_MxN_kernel_code() const { - const auto pool_type = _attributes.pool_type(); - const bool fp_mixed_precision = (_src->data_type() == DataType::F16) && _settings.mixed_precision() && pool_type != PoolingType::MAX; + const auto pool_type = _attributes.pool_type(); + const bool fp_mixed_precision = + (_src->data_type() == DataType::F16) && _settings.mixed_precision() && pool_type != PoolingType::MAX; // Define pool op macro. - std::string pool_op = (pool_type == PoolingType::AVG) ? R"_(#define POOL_OP(x,y) ((x) + (y)))_" : R"_(#define POOL_OP(x,y) (fmax((x), (y))) )_"; + std::string pool_op = (pool_type == PoolingType::AVG) ? R"_(#define POOL_OP(x,y) ((x) + (y)))_" + : R"_(#define POOL_OP(x,y) (fmax((x), (y))) )_"; // Kernel start // Note: If C is not multiple of N0, we shift back of PARTIAL_N0 elements to compute the leftover elements for get_global_id(0) == 0 @@ -129,7 +126,7 @@ std::string ClTemplatePool2d::get_MxN_kernel_code() const )_"; // Determine filter size depending on if padding is excluded or not - if(_attributes.exclude_padding()) + if (_attributes.exclude_padding()) { code += R"_( const int filter_size = (pool_y_e - pool_y_s) * (pool_x_e - pool_x_s); @@ -144,7 +141,8 @@ std::string ClTemplatePool2d::get_MxN_kernel_code() const // Loop through pool size // if global pooling - if(_attributes.pool_size().x() == _src->dimension(width_idx) && _attributes.pool_size().y() == _src->dimension(height_idx)) + if (_attributes.pool_size().x() == _src->dimension(width_idx) && + _attributes.pool_size().y() == _src->dimension(height_idx)) { // Begin loop code += R"_( @@ -173,7 +171,7 @@ std::string ClTemplatePool2d::get_MxN_kernel_code() const // if condition inside loop - use 32bit acc if mixed_precision. // End loop through pooling section. - if(fp_mixed_precision) + if (fp_mixed_precision) { // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE code += R"_( @@ -194,7 +192,7 @@ std::string ClTemplatePool2d::get_MxN_kernel_code() const } // For Pool AVG ONLY, divide pool output by filter size - if(pool_type == PoolingType::AVG) + if (pool_type == PoolingType::AVG) { code += R"_( res0 /= (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0))filter_size; @@ -202,7 +200,7 @@ std::string ClTemplatePool2d::get_MxN_kernel_code() const } // If mixed precision convert datatype before storing. Then end kernel. - if(fp_mixed_precision) + if (fp_mixed_precision) { code += R"_( VEC_DATA_TYPE({{DATA_TYPE}}, N0) @@ -228,9 +226,11 @@ std::string ClTemplatePool2d::get_MxN_kernel_code() const std::string ClTemplatePool2d::get_2x2_kernel_code() const { - const auto pool_type = _attributes.pool_type(); - const bool fp_mixed_precision = (_src->data_type() == DataType::F16) && _settings.mixed_precision() && pool_type != PoolingType::MAX; - std::string pool_op = (pool_type == PoolingType::AVG) ? R"_(#define POOL_OP(x,y) ((x) + (y)))_" : R"_(#define POOL_OP(x,y) (fmax((x), (y))) )_"; + const auto pool_type = _attributes.pool_type(); + const bool fp_mixed_precision = + (_src->data_type() == DataType::F16) && _settings.mixed_precision() && pool_type != PoolingType::MAX; + std::string pool_op = (pool_type == PoolingType::AVG) ? R"_(#define POOL_OP(x,y) ((x) + (y)))_" + : R"_(#define POOL_OP(x,y) (fmax((x), (y))) )_"; std::string code = R"_( //------------------ START KERNEL {{meta_kernel_id}} --------------------- @@ -274,7 +274,7 @@ std::string ClTemplatePool2d::get_2x2_kernel_code() const REPEAT_VAR_INIT_TO_CONST(4, VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0), data, 0); )_"; - if(fp_mixed_precision) + if (fp_mixed_precision) { // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE code += R"_( @@ -294,7 +294,7 @@ std::string ClTemplatePool2d::get_2x2_kernel_code() const )_"; } - if(pool_type != PoolingType::MAX) + if (pool_type != PoolingType::MAX) { // Make invalid the values loaded if the x or y coordinate was clamped (out-of-bound) code += R"_( @@ -321,10 +321,10 @@ std::string ClTemplatePool2d::get_2x2_kernel_code() const res0 = POOL_OP(res0, data3); )_"; - if(pool_type == PoolingType::AVG) + if (pool_type == PoolingType::AVG) { // If avg pooling divide result accordingly. - if(_attributes.exclude_padding()) + if (_attributes.exclude_padding()) { code += R"_( res0 /= (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0))filter_size; @@ -339,7 +339,7 @@ std::string ClTemplatePool2d::get_2x2_kernel_code() const } // Store result - if(fp_mixed_precision) + if (fp_mixed_precision) { code += R"_( VEC_DATA_TYPE({{DATA_TYPE}}, N0) @@ -365,17 +365,11 @@ std::string ClTemplatePool2d::get_2x2_kernel_code() const void ClTemplatePool2d::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const { - vtable.declare_variable( - comp_group, - _src, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), - "src"); - - vtable.declare_variable( - comp_group, - _dst, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), - "dst"); + vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), + "src"); + + vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), + "dst"); } TagLUT ClTemplatePool2d::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const @@ -391,12 +385,15 @@ TagLUT ClTemplatePool2d::get_tag_lut(const GpuKernelVariableTable &vtable, const lut["meta_kernel_id"] = id(); // Retrieve relevant data - const auto padding = _attributes.pad(); - const auto stride = _attributes.stride(); - const auto pool_size = _attributes.pool_size(); - const auto data_type = _src->data_type(); - const auto use_fp_mixed_precision = (_src->data_type() == DataType::F16) && _settings.mixed_precision() && _attributes.pool_type() != PoolingType::MAX; - const std::string max_initial_value = _settings.use_inf_as_limit() ? "(-INFINITY)" : float_to_string_with_full_precision(std::numeric_limits::lowest()); + const auto padding = _attributes.pad(); + const auto stride = _attributes.stride(); + const auto pool_size = _attributes.pool_size(); + const auto data_type = _src->data_type(); + const auto use_fp_mixed_precision = (_src->data_type() == DataType::F16) && _settings.mixed_precision() && + _attributes.pool_type() != PoolingType::MAX; + const std::string max_initial_value = + _settings.use_inf_as_limit() ? "(-INFINITY)" + : float_to_string_with_full_precision(std::numeric_limits::lowest()); // pool specific lut["STRIDE_X"] = stride.x(); @@ -407,7 +404,8 @@ TagLUT ClTemplatePool2d::get_tag_lut(const GpuKernelVariableTable &vtable, const lut["POOL_SIZE_Y"] = pool_size.height; // Datatypes and variables - lut["ACC_DATA_TYPE"] = get_cl_type_from_data_type((use_fp_mixed_precision) ? (DataType::F32) : (data_type)); // Type of accumulators to use. + lut["ACC_DATA_TYPE"] = get_cl_type_from_data_type( + (use_fp_mixed_precision) ? (DataType::F32) : (data_type)); // Type of accumulators to use. lut["DATA_TYPE"] = get_cl_type_from_data_type(data_type); lut["SRC_WIDTH"] = _src->dimension(width_idx); lut["SRC_HEIGHT"] = _src->dimension(height_idx); @@ -454,14 +452,14 @@ std::string ClTemplatePool2d::get_config_id() const std::set ClTemplatePool2d::get_headers_list() const { - return std::set{ "helpers.h", "tile_helpers.h", "repeat.h" }; + return std::set{"helpers.h", "tile_helpers.h", "repeat.h"}; } Window ClTemplatePool2d::get_window() const { ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized"); const auto output_shape = _dst->tensor_shape(); - const unsigned int vec_size = adjust_vec_size(((_dst->data_type() == DataType::F32) ? 2 : 4), _dst->dimension(0)); + const unsigned int vec_size = adjust_vec_size(((_dst->data_type() == DataType::F32) ? 2 : 4), _dst->dimension(0)); // Create and configure kernel window auto win = calculate_max_window(output_shape, Steps(vec_size)); diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.h index ef1c100f44..d1d3c01669 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.h +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.h @@ -27,6 +27,7 @@ #include "arm_compute/core/experimental/Types.h" #include "arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h" #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h" + #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h" diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.cpp index 8b50f1e209..c882353fcb 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/helpers/WindowHelpers.h" #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" @@ -36,11 +37,8 @@ namespace dynamic_fusion { constexpr unsigned int vector_size_byte_opencl = 16; -ClTemplateReshape::ClTemplateReshape(ComponentId id, - const ArgumentPack &tensors) - : IGpuTemplateComponentWriter{ id, tensors }, - _src{}, - _dst{} +ClTemplateReshape::ClTemplateReshape(ComponentId id, const ArgumentPack &tensors) + : IGpuTemplateComponentWriter{id, tensors}, _src{}, _dst{} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); @@ -97,23 +95,17 @@ TILE(uint, M0, 1, g_dst_indirect_y); void ClTemplateReshape::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const { - vtable.declare_variable( - comp_group, - _src, - GpuKernelArgumentInfo(common_tensor_type), // GpuKernelArgumentInfo::Type::Image_3D - "src"); - - vtable.declare_variable( - comp_group, - _dst, - GpuKernelArgumentInfo(common_tensor_type), - "dst"); + vtable.declare_variable(comp_group, _src, + GpuKernelArgumentInfo(common_tensor_type), // GpuKernelArgumentInfo::Type::Image_3D + "src"); + + vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(common_tensor_type), "dst"); } TagLUT ClTemplateReshape::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const { ARM_COMPUTE_UNUSED(comp_group); - TagLUT lut{}; + TagLUT lut{}; // Arguments and global shared variables lut["src"] = vtable.get_variable(_src); @@ -153,7 +145,7 @@ std::string ClTemplateReshape::get_config_id() const std::set ClTemplateReshape::get_headers_list() const { - return std::set{ "helpers.h", "tile_helpers.h" }; + return std::set{"helpers.h", "tile_helpers.h"}; } Window ClTemplateReshape::get_window() const diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.h index 56b6585b61..838a21db6d 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.h +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.h @@ -25,6 +25,7 @@ #define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATERESHAPE #include "arm_compute/core/experimental/Types.h" + #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h" @@ -42,8 +43,7 @@ public: * @param[in] id Component id * @param[in] tensors Tensor arguments to the components */ - ClTemplateReshape(ComponentId id, - const ArgumentPack &tensors); + ClTemplateReshape(ComponentId id, const ArgumentPack &tensors); /** Prevent instances of this class from being copy constructed */ ClTemplateReshape(const ClTemplateReshape &reshape) = delete; /** Prevent instances of this class from being copied */ diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp index aaed1d990d..846c712ceb 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/helpers/WindowHelpers.h" #include "src/core/utils/ScaleUtils.h" #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" @@ -37,8 +38,10 @@ namespace experimental { namespace dynamic_fusion { -ClTemplateResize::ClTemplateResize(ComponentId id, const ArgumentPack &tensors, const ClTemplateResize::Attributes &attributes) - : IGpuTemplateComponentWriter{ id, tensors }, _src{}, _dst{}, _attributes{ attributes } +ClTemplateResize::ClTemplateResize(ComponentId id, + const ArgumentPack &tensors, + const ClTemplateResize::Attributes &attributes) + : IGpuTemplateComponentWriter{id, tensors}, _src{}, _dst{}, _attributes{attributes} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); @@ -63,9 +66,9 @@ TILE(uint, 1, 1, g_dst_indirect_y); const int bout = g_ind_2 / {{arg_dst}}_h; )_"; - if(_attributes.interpolation_policy() == InterpolationPolicy::NEAREST_NEIGHBOR) + if (_attributes.interpolation_policy() == InterpolationPolicy::NEAREST_NEIGHBOR) { - if(_attributes.sampling_policy() == SamplingPolicy::TOP_LEFT) + if (_attributes.sampling_policy() == SamplingPolicy::TOP_LEFT) { code += R"_( float xi_f = (g_ind_1 * {{SCALE_X}}); @@ -80,7 +83,7 @@ TILE(uint, 1, 1, g_dst_indirect_y); )_"; } - if(_attributes.align_corners()) + if (_attributes.align_corners()) { code += R"_( xi_f = round(xi_f); @@ -95,9 +98,9 @@ TILE(uint, 1, 1, g_dst_indirect_y); T_LOAD_NHWC_WITH_DILATION({{SRC_DATA_TYPE}}, 1, 1, N0, {{SRC_TENSOR_TYPE}}, {{src}}, bout, yi0, xi0, g_ind_0, {{src}}_w, {{src}}_h, 1, 1, false, {{dst}}); )_"; } - else if(_attributes.interpolation_policy() == InterpolationPolicy::BILINEAR) + else if (_attributes.interpolation_policy() == InterpolationPolicy::BILINEAR) { - if(_attributes.sampling_policy() == SamplingPolicy::TOP_LEFT) + if (_attributes.sampling_policy() == SamplingPolicy::TOP_LEFT) { code += R"_( float xi_f = (g_ind_1 * {{SCALE_X}}); @@ -137,7 +140,7 @@ TILE(uint, 1, 1, g_dst_indirect_y); T_LOAD_NHWC_WITH_DILATION({{SRC_DATA_TYPE}}, 1, 1, N0, {{SRC_TENSOR_TYPE}}, {{src}}, bout, yi1, xi1, g_ind_0, {{src}}_w, {{src}}_h, 1, 1, false, in11); )_"; - if(is_data_type_float(_src->data_type())) + if (is_data_type_float(_src->data_type())) { code += R"_( const {{SRC_DATA_TYPE}} a = ({{SRC_DATA_TYPE}})(xi_f - (float)xi); @@ -158,9 +161,9 @@ TILE(uint, 1, 1, g_dst_indirect_y); const float b1 = (1.f - a1); {{dst}}[0].v = CONVERT_SAT( - (CONVERT(in00[0].v, VEC_DATA_TYPE(float, N0)) * b * b1) + + (CONVERT(in00[0].v, VEC_DATA_TYPE(float, N0)) * b * b1) + (CONVERT(in01[0].v, VEC_DATA_TYPE(float, N0)) * a * b1) + - (CONVERT(in10[0].v, VEC_DATA_TYPE(float, N0)) * b * a1) + + (CONVERT(in10[0].v, VEC_DATA_TYPE(float, N0)) * b * a1) + (CONVERT(in11[0].v, VEC_DATA_TYPE(float, N0)) * a * a1), VEC_DATA_TYPE({{DST_DATA_TYPE}}, N0)); )_"; } @@ -179,22 +182,18 @@ TILE(uint, 1, 1, g_dst_indirect_y); return code; } -void ClTemplateResize::declare_variables(GpuKernelVariableTable &vtable, const IGpuTemplateComponentWriter::ComponentGroup &comp_group) const +void ClTemplateResize::declare_variables(GpuKernelVariableTable &vtable, + const IGpuTemplateComponentWriter::ComponentGroup &comp_group) const { - vtable.declare_variable( - comp_group, - _src, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), - "src"); - - vtable.declare_variable( - comp_group, - _dst, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), - "dst"); + vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), + "src"); + + vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), + "dst"); } -TagLUT ClTemplateResize::get_tag_lut(const GpuKernelVariableTable &vtable, const IGpuTemplateComponentWriter::ComponentGroup &comp_group) const +TagLUT ClTemplateResize::get_tag_lut(const GpuKernelVariableTable &vtable, + const IGpuTemplateComponentWriter::ComponentGroup &comp_group) const { TagLUT lut{}; @@ -212,8 +211,10 @@ TagLUT ClTemplateResize::get_tag_lut(const GpuKernelVariableTable &vtable, const lut["DST_DATA_TYPE"] = get_cl_type_from_data_type(_dst->data_type()); lut["CONSTANT_VALUE"] = string_from_pixel_value(0, _src->data_type()); - const float scale_x = scale_utils::calculate_resize_ratio(_src->dimension(1), _dst->dimension(1), _attributes.align_corners()); - const float scale_y = scale_utils::calculate_resize_ratio(_src->dimension(2), _dst->dimension(2), _attributes.align_corners()); + const float scale_x = + scale_utils::calculate_resize_ratio(_src->dimension(1), _dst->dimension(1), _attributes.align_corners()); + const float scale_y = + scale_utils::calculate_resize_ratio(_src->dimension(2), _dst->dimension(2), _attributes.align_corners()); lut["SCALE_X"] = float_to_string_with_full_precision(scale_x); lut["SCALE_Y"] = float_to_string_with_full_precision(scale_y); @@ -242,7 +243,8 @@ std::string ClTemplateResize::get_config_id() const std::string config_id{}; config_id += "resize_"; - config_id += (_attributes.interpolation_policy() == InterpolationPolicy::NEAREST_NEIGHBOR ? "NEAREST_NEIGHBOR" : ""); + config_id += + (_attributes.interpolation_policy() == InterpolationPolicy::NEAREST_NEIGHBOR ? "NEAREST_NEIGHBOR" : ""); config_id += (_attributes.interpolation_policy() == InterpolationPolicy::BILINEAR ? "BILINEAR" : ""); config_id += "_"; config_id += (_attributes.sampling_policy() == SamplingPolicy::CENTER ? "center" : "topleft"); @@ -260,7 +262,7 @@ std::string ClTemplateResize::get_config_id() const std::set ClTemplateResize::get_headers_list() const { - return std::set{ "helpers.h", "tile_helpers.h" }; + return std::set{"helpers.h", "tile_helpers.h"}; } Window ClTemplateResize::get_window() const diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp index 217214ced3..d0ec91e0a9 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp @@ -32,7 +32,7 @@ namespace experimental namespace dynamic_fusion { ClTemplateStore::ClTemplateStore(ComponentId id, const ArgumentPack &tensors) - : IGpuTemplateComponentWriter{ id, tensors }, _src{}, _dst{} + : IGpuTemplateComponentWriter{id, tensors}, _src{}, _dst{} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); @@ -61,16 +61,10 @@ std::string ClTemplateStore::get_component_code(const ComponentGroup &comp_group void ClTemplateStore::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const { - vtable.declare_variable( - comp_group, - _src, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), - "src"); - vtable.declare_variable( - comp_group, - _dst, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), - "dst"); + vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), + "src"); + vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), + "dst"); } TagLUT ClTemplateStore::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h index 3f97a82204..b8c82ceadd 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h @@ -25,6 +25,7 @@ #define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATESTORE #include "arm_compute/core/experimental/Types.h" + #include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h" diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp index eda15f1d95..d3d7c8db83 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp @@ -24,6 +24,7 @@ #include "ClTemplateWriter.h" #include "arm_compute/core/CL/CLKernelLibrary.h" + #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h" @@ -39,11 +40,11 @@ std::string ClTemplateWriter::replace_tags(const std::string &code_template, con std::string replaced_code = ""; bool scanning_pattern = false; std::string pattern_found = ""; - for(size_t i = 0; i < code_template.size() - 1; ++i) + for (size_t i = 0; i < code_template.size() - 1; ++i) { - if(!scanning_pattern) + if (!scanning_pattern) { - if(code_template[i] == '{' && code_template[i + 1] == '{') + if (code_template[i] == '{' && code_template[i + 1] == '{') { i += 1; scanning_pattern = true; @@ -56,7 +57,7 @@ std::string ClTemplateWriter::replace_tags(const std::string &code_template, con } else { - if(code_template[i] == '}' && code_template[i + 1] == '}') + if (code_template[i] == '}' && code_template[i + 1] == '}') { i += 1; scanning_pattern = false; @@ -76,8 +77,7 @@ std::string ClTemplateWriter::replace_tags(const std::string &code_template, con ClTemplateWriter::~ClTemplateWriter() { } -ClTemplateWriter::ClTemplateWriter(const GpuKernelComponentGroup &components) - : _components{ components } +ClTemplateWriter::ClTemplateWriter(const GpuKernelComponentGroup &components) : _components{components} { } std::string ClTemplateWriter::get_name() @@ -91,7 +91,7 @@ std::string ClTemplateWriter::get_code() std::string ClTemplateWriter::get_config_id() { std::string config_id = get_name(); - for(const auto &comp : _components) + for (const auto &comp : _components) { config_id += "--" + comp->template_writer()->get_config_id() + "--"; } @@ -103,7 +103,7 @@ CLBuildOptions ClTemplateWriter::get_build_options() { CLBuildOptions build_opts{}; - for(const auto &comp : _components) + for (const auto &comp : _components) { build_opts.add_options(comp->template_writer()->get_build_options(_components).options()); } @@ -122,11 +122,9 @@ std::map ClTemplateWriter::get_tensors() { // Assemble GpuKernelArguments std::map tensors; - for(const auto t : _components.get_argument_tensors()) + for (const auto t : _components.get_argument_tensors()) { - tensors.emplace( - t->id(), - GpuKernelArgument{ *t, _vtable.get_variable(t).kernel_argument_info }); + tensors.emplace(t->id(), GpuKernelArgument{*t, _vtable.get_variable(t).kernel_argument_info}); } return tensors; } @@ -141,22 +139,24 @@ std::string ClTemplateWriter::write_code() std::vector component_codes{}; // vector because order matters // Pass 1: Declare all kernel variables - for(auto &component : _components) + for (auto &component : _components) { component->template_writer()->declare_variables(_vtable, _components); } // Pass 2: Generate component codes - for(auto &component : _components) + for (auto &component : _components) { const auto component_writer = component->template_writer(); auto curr_headers_list = component_writer->get_headers_list(); auto curr_additional_macros = component_writer->get_additional_macros(); auto curr_component_code = component_writer->get_component_code(_components); - const auto var_lut = component_writer->get_tag_lut(_vtable, _components); // Ideally can be merged with get_component_code once we have finer-grained code generation technique + const auto var_lut = component_writer->get_tag_lut( + _vtable, + _components); // Ideally can be merged with get_component_code once we have finer-grained code generation technique component_codes.push_back(replace_tags(curr_component_code, var_lut)); headers_list.insert(curr_headers_list.begin(), curr_headers_list.end()); - if(!additional_macros.empty()) // Some components might not have any + if (!additional_macros.empty()) // Some components might not have any { additional_macros.insert(replace_tags(curr_additional_macros, var_lut)); } @@ -165,7 +165,7 @@ std::string ClTemplateWriter::write_code() // Step 3: Assemble the data gathered by traversing the graph into the string "code" std::string code = ""; - for(auto &header : headers_list) + for (auto &header : headers_list) { #if defined(EMBEDDED_KERNELS) code += CLKernelLibrary::get().get_program(header).first; @@ -174,16 +174,14 @@ std::string ClTemplateWriter::write_code() #endif // defined(EMBEDDED_KERNELS) } - for(auto ¯os : additional_macros) + for (auto ¯os : additional_macros) { code += macros; } auto arguments = _components.get_argument_tensors(); - std::sort(arguments.begin(), arguments.end(), [](const ITensorInfo * l, const ITensorInfo * r) - { - return l->id() < r->id(); - }); + std::sort(arguments.begin(), arguments.end(), + [](const ITensorInfo *l, const ITensorInfo *r) { return l->id() < r->id(); }); code += write_kernel_signature(_vtable.get_variable_list(arguments)); code += "\n{\n\n"; @@ -198,7 +196,7 @@ std::string ClTemplateWriter::write_code() tiles_ss << " //------------------ START TILE DECLARATION ---------------------\n"; - for(auto tile : tiles) + for (auto tile : tiles) { const auto var = _vtable.get_variable(tile); const auto data_type = get_cl_type_from_data_type(tile->data_type()); @@ -212,7 +210,7 @@ std::string ClTemplateWriter::write_code() code += tiles_ss.str(); } - for(const auto &component_code : component_codes) + for (const auto &component_code : component_codes) { code += component_code; code += "\n"; @@ -231,7 +229,8 @@ std::string ClTemplateWriter::write_global_section() const auto leftover_w = dst_w % tile_w; std::string code = ""; - code += std::string(" int g_ind_0 = GET_SPATIAL_IDX(0, ") + std::to_string(tile_w) + ", " + std::to_string(leftover_w) + ");\n"; + code += std::string(" int g_ind_0 = GET_SPATIAL_IDX(0, ") + std::to_string(tile_w) + ", " + + std::to_string(leftover_w) + ");\n"; code += std::string(" int g_ind_1 = GET_SPATIAL_IDX(1, ") + std::to_string(tile_h) + ", " + "0);\n"; code += std::string(" int g_ind_2 = GET_SPATIAL_IDX(2, 1, 0);\n\n"); @@ -243,7 +242,7 @@ std::string ClTemplateWriter::write_global_section() const std::string ClTemplateWriter::write_argument_declaration(const GpuKernelVariableTable::TensorVariable &var) const { std::string code; - switch(var.kernel_argument_info.type) + switch (var.kernel_argument_info.type) { case GpuKernelArgumentInfo::Type::Vector: { @@ -293,11 +292,11 @@ std::string ClTemplateWriter::write_kernel_signature(const GpuKernelVariableTabl { std::string code = "\n__kernel void " + write_kernel_name() + "("; - for(int i = 0; i < static_cast(argument_list.size()) - 1; ++i) + for (int i = 0; i < static_cast(argument_list.size()) - 1; ++i) { code += write_argument_declaration(argument_list[i]) + ","; } - if(static_cast(argument_list.size()) - 1 >= 0) + if (static_cast(argument_list.size()) - 1 >= 0) { code += write_argument_declaration(argument_list[argument_list.size() - 1]); } @@ -308,12 +307,12 @@ std::string ClTemplateWriter::write_kernel_signature(const GpuKernelVariableTabl } std::string ClTemplateWriter::write_kernel_name() const { - if(_components.empty()) + if (_components.empty()) { return "empty_kernel"; } std::string name = _components.empty() ? "" : _components[0]->template_writer()->get_name(); - for(size_t i = 1; i < _components.size(); ++i) + for (size_t i = 1; i < _components.size(); ++i) { name += "___"; name += _components[i]->template_writer()->get_name(); diff --git a/src/dynamic_fusion/sketch/utils/DependencyGraph.h b/src/dynamic_fusion/sketch/utils/DependencyGraph.h index c891e76d8b..c157c2b21c 100644 --- a/src/dynamic_fusion/sketch/utils/DependencyGraph.h +++ b/src/dynamic_fusion/sketch/utils/DependencyGraph.h @@ -25,6 +25,7 @@ #define SRC_DYNAMIC_FUSION_SKETCH_UTILS_DEPENDENCYGRAPH #include "arm_compute/core/Error.h" + #include #include #include @@ -68,12 +69,10 @@ public: OperatorId op{}; std::vector inputs{}; std::vector outputs{}; - friend bool operator==(const OpPack &opp0, const OpPack &opp1) + friend bool operator==(const OpPack &opp0, const OpPack &opp1) { - return std::make_tuple( - opp0.op, opp0.inputs, opp0.outputs) - == std::make_tuple( - opp1.op, opp1.inputs, opp1.outputs); + return std::make_tuple(opp0.op, opp0.inputs, opp0.outputs) == + std::make_tuple(opp1.op, opp1.inputs, opp1.outputs); } }; @@ -95,10 +94,13 @@ public: * @return true If the operator can be added while keeping the graph as a linear sequence * @return false Otherwise */ - bool try_add_operator_as_linear(OperatorId op, const std::vector &inputs, const std::vector &outputs, bool is_output = false) const + bool try_add_operator_as_linear(OperatorId op, + const std::vector &inputs, + const std::vector &outputs, + bool is_output = false) const { ARM_COMPUTE_UNUSED(op, is_output); - if(all_ops().empty()) + if (all_ops().empty()) { return true; } @@ -106,25 +108,25 @@ public: // If the new operator is not the first operator, at least one input tensor must be // the output tensor of the last non-output operator. All other input tensors must be // the global input of the graph (i.e. not the output of any operator). - if(_last_op_available) + if (_last_op_available) { auto use_input_from_last_op = false; - for(auto src_tensor : inputs) + for (auto src_tensor : inputs) { const auto src_ops = _adj_src_ops.find(src_tensor); - if(src_ops != _adj_src_ops.end()) + if (src_ops != _adj_src_ops.end()) { ARM_COMPUTE_ERROR_ON(src_ops->second.size() > 1); - if(!src_ops->second.empty()) + if (!src_ops->second.empty()) { const auto src_op = src_ops->second[0]; - if(src_op == _last_op) + if (src_op == _last_op) { - if(use_input_from_last_op) + if (use_input_from_last_op) { // To be safe, we also forbid using the output tensor // of the last operator twice. @@ -143,7 +145,7 @@ public: } } - if(!use_input_from_last_op) + if (!use_input_from_last_op) { // At least one input tensor must be the output tensor of the last non-output operator. return false; @@ -152,9 +154,9 @@ public: // The output tensor of the new operator must not be the input tensor of any previously // added operator. - for(auto dst_tensor : outputs) + for (auto dst_tensor : outputs) { - if(_adj_dst_ops.find(dst_tensor) != _adj_dst_ops.end()) + if (_adj_dst_ops.find(dst_tensor) != _adj_dst_ops.end()) { return false; } @@ -168,7 +170,10 @@ public: * INVARIANT: The list can only grow from head to tail * INVARIANT: POSTCONDITION: The graph is linear */ - void add_operator_as_linear(OperatorId op, const std::vector &inputs, const std::vector &outputs, bool is_output = false) + void add_operator_as_linear(OperatorId op, + const std::vector &inputs, + const std::vector &outputs, + bool is_output = false) { const auto success = add_operator(op, inputs, outputs, is_output); ARM_COMPUTE_UNUSED(success); @@ -183,24 +188,27 @@ public: * @param[in] outputs Output tensors to the operator * @param[in] is_output Whether this is an output operator */ - bool add_operator(OperatorId op, const std::vector &inputs, const std::vector &outputs, bool is_output = false) + bool add_operator(OperatorId op, + const std::vector &inputs, + const std::vector &outputs, + bool is_output = false) { - if(operator_exists(op)) + if (operator_exists(op)) { return false; } _adj_src_tensors[op] = {}; _adj_dst_tensors[op] = {}; - for(auto in_tensor : inputs) + for (auto in_tensor : inputs) { // Linking input tensor to operator node will never create a cycle / loop because we guarantee // each op is newly created, so every pair / edge is new link_input(op, in_tensor); } - for(auto out_tensor : outputs) + for (auto out_tensor : outputs) { // If there exists a back path from op's output tensor to op already, then linking the two will create a loop / cycle - if(path_exists_from_tensor_to_op(out_tensor, op)) + if (path_exists_from_tensor_to_op(out_tensor, op)) { remove_operator(op); return false; @@ -211,10 +219,10 @@ public: } } - if(!is_output) + if (!is_output) { _last_op_available = true; - _last_op = op; + _last_op = op; } return true; @@ -230,16 +238,16 @@ public: std::vector build_operators_sequence() const { std::vector ops_seq; - std::set done_ops; - std::set done_tensors; + std::set done_ops; + std::set done_tensors; const auto input_tensors = global_src_tensors(); - for(auto tensor : input_tensors) + for (auto tensor : input_tensors) { done_tensors.insert(tensor); - for(auto op : _adj_dst_ops.at(tensor)) + for (auto op : _adj_dst_ops.at(tensor)) { build_operators_sequence_from_op(op, ops_seq, done_ops, done_tensors); } @@ -260,10 +268,8 @@ public: friend bool operator==(const DependencyGraph &g0, const DependencyGraph &g1) { // Do not compare id allocators - return std::make_tuple( - g0._adj_src_tensors, g0._adj_dst_tensors, g0._adj_src_ops, g0._adj_dst_ops) - == std::make_tuple( - g1._adj_src_tensors, g1._adj_dst_tensors, g1._adj_src_ops, g1._adj_dst_ops); + return std::make_tuple(g0._adj_src_tensors, g0._adj_dst_tensors, g0._adj_src_ops, g0._adj_dst_ops) == + std::make_tuple(g1._adj_src_tensors, g1._adj_dst_tensors, g1._adj_src_ops, g1._adj_dst_ops); } std::vector src_ops_from_tensor(TensorId tensor) const { @@ -280,10 +286,8 @@ public: std::vector all_tensors() const { std::vector tensors{}; - std::transform(std::begin(_adj_src_ops), std::end(_adj_src_ops), std::back_inserter(tensors), [](const auto & it) - { - return it.first; - }); + std::transform(std::begin(_adj_src_ops), std::end(_adj_src_ops), std::back_inserter(tensors), + [](const auto &it) { return it.first; }); return tensors; } /** Get source tensors of the whole graph @@ -293,9 +297,9 @@ public: std::vector global_src_tensors() const { std::vector tensors; - for(auto tensor_src_ops : _adj_src_ops) + for (auto tensor_src_ops : _adj_src_ops) { - if(tensor_src_ops.second.empty()) + if (tensor_src_ops.second.empty()) { tensors.push_back(tensor_src_ops.first); } @@ -309,9 +313,9 @@ public: std::vector global_dst_tensors() const { std::vector tensors; - for(auto tensor_dst_ops : _adj_dst_ops) + for (auto tensor_dst_ops : _adj_dst_ops) { - if(tensor_dst_ops.second.empty()) + if (tensor_dst_ops.second.empty()) { tensors.push_back(tensor_dst_ops.first); } @@ -328,14 +332,14 @@ public: // If a tensor is used to connect the input of an operator and the output of another operator, // it is not allocated in the memory. The tensor exists as a temporary variable only. - for(auto src_tensor : _adj_src_ops) + for (auto src_tensor : _adj_src_ops) { - if(!src_tensor.second.empty()) + if (!src_tensor.second.empty()) { const auto dst_tensor = _adj_dst_ops.find(src_tensor.first); - if(dst_tensor != _adj_dst_ops.end()) + if (dst_tensor != _adj_dst_ops.end()) { - if(!dst_tensor->second.empty()) + if (!dst_tensor->second.empty()) { tensors.push_back(src_tensor.first); } @@ -354,9 +358,9 @@ public: std::vector ops{}; const auto op_list = all_ops(); - for(auto op : op_list) + for (auto op : op_list) { - if(src_ops(op).empty()) + if (src_ops(op).empty()) { ops.emplace_back(op); } @@ -368,7 +372,7 @@ private: void link_input(OperatorId op, TensorId in_tensor) { ARM_COMPUTE_ERROR_ON(!operator_exists(op)); - if(!tensor_exists(in_tensor)) + if (!tensor_exists(in_tensor)) { insert_new_tensor(in_tensor); } @@ -379,7 +383,7 @@ private: void link_output(OperatorId op, TensorId out_tensor) { ARM_COMPUTE_ERROR_ON(!operator_exists(op)); - if(!tensor_exists(out_tensor)) + if (!tensor_exists(out_tensor)) { insert_new_tensor(out_tensor); } @@ -392,7 +396,7 @@ private: { ARM_COMPUTE_ERROR_ON(!operator_exists(op)); std::vector ops{}; - for(TensorId src_tensor : src_tensors(op)) + for (TensorId src_tensor : src_tensors(op)) { ops.insert(ops.end(), std::begin(_adj_src_ops.at(src_tensor)), std::end(_adj_src_ops.at(src_tensor))); } @@ -402,7 +406,7 @@ private: { ARM_COMPUTE_ERROR_ON(!operator_exists(op)); std::vector ops{}; - for(TensorId dst_tensor : _adj_dst_tensors.at(op)) + for (TensorId dst_tensor : _adj_dst_tensors.at(op)) { ops.insert(ops.end(), std::begin(_adj_dst_ops.at(dst_tensor)), std::end(_adj_dst_ops.at(dst_tensor))); } @@ -436,10 +440,8 @@ private: std::vector all_ops() const { std::vector ops{}; - std::transform(std::begin(_adj_src_tensors), std::end(_adj_src_tensors), std::back_inserter(ops), [](const auto & it) - { - return it.first; - }); + std::transform(std::begin(_adj_src_tensors), std::end(_adj_src_tensors), std::back_inserter(ops), + [](const auto &it) { return it.first; }); return ops; } /** Remove an operator from graph. @@ -448,25 +450,21 @@ private: */ void remove_operator(OperatorId op) { - for(auto src_tensor : _adj_src_tensors.at(op)) + for (auto src_tensor : _adj_src_tensors.at(op)) { auto &dst_ops = _adj_dst_ops.at(src_tensor); - dst_ops.erase( - std::remove(std::begin(dst_ops), std::end(dst_ops), op), - std::end(dst_ops)); + dst_ops.erase(std::remove(std::begin(dst_ops), std::end(dst_ops), op), std::end(dst_ops)); } - for(auto dst_tensor : _adj_dst_tensors.at(op)) + for (auto dst_tensor : _adj_dst_tensors.at(op)) { auto &src_ops = _adj_src_ops.at(dst_tensor); - src_ops.erase( - std::remove(std::begin(src_ops), std::end(src_ops), op), - std::end(src_ops)); + src_ops.erase(std::remove(std::begin(src_ops), std::end(src_ops), op), std::end(src_ops)); } // Remove any isolated tensors // An isolated tensor is one where both its _adj_src_ops and _adj_dst_ops are empty - for(auto t : all_tensors()) + for (auto t : all_tensors()) { - if(_adj_src_ops.at(t).empty() && _adj_dst_ops.at(t).empty()) + if (_adj_src_ops.at(t).empty() && _adj_dst_ops.at(t).empty()) { _adj_src_ops.erase(t); _adj_dst_ops.erase(t); @@ -486,11 +484,12 @@ private: } bool operator_exists(OperatorId op) const { - return _adj_src_tensors.find(op) != _adj_src_tensors.end() && _adj_dst_tensors.find(op) != _adj_dst_tensors.end(); + return _adj_src_tensors.find(op) != _adj_src_tensors.end() && + _adj_dst_tensors.find(op) != _adj_dst_tensors.end(); } bool is_src_tensor_of(OperatorId op, TensorId tensor) const { - if(!operator_exists(op) || !tensor_exists(tensor)) + if (!operator_exists(op) || !tensor_exists(tensor)) { return false; } @@ -499,7 +498,7 @@ private: } bool is_dst_tensor_of(OperatorId op, TensorId tensor) const { - if(!operator_exists(op) || !tensor_exists(tensor)) + if (!operator_exists(op) || !tensor_exists(tensor)) { return false; } @@ -525,9 +524,9 @@ private: std::vector ops{}; const auto op_list = all_ops(); - for(auto op : op_list) + for (auto op : op_list) { - if(is_dst_op(op)) + if (is_dst_op(op)) { ops.emplace_back(op); } @@ -536,13 +535,13 @@ private: } bool path_exists_from_tensor_to_op(TensorId src_tensor, OperatorId dst_op) const { - if(!tensor_exists(src_tensor) || !operator_exists(dst_op)) + if (!tensor_exists(src_tensor) || !operator_exists(dst_op)) { return false; } - for(auto child_op : dst_ops_from_tensor(src_tensor)) + for (auto child_op : dst_ops_from_tensor(src_tensor)) { - if(path_exists_from_op_to_op(child_op, dst_op)) + if (path_exists_from_op_to_op(child_op, dst_op)) { return true; } @@ -552,21 +551,21 @@ private: bool path_exists_from_op_to_op(OperatorId src_op, OperatorId dst_op) const { - if(!operator_exists(src_op) || !operator_exists(dst_op)) + if (!operator_exists(src_op) || !operator_exists(dst_op)) { return false; } - if(src_op == dst_op) + if (src_op == dst_op) { return true; } - if(is_in(src_op, get_dst_ops())) + if (is_in(src_op, get_dst_ops())) { return false; } - for(auto child_tensor : dst_tensors(src_op)) + for (auto child_tensor : dst_tensors(src_op)) { - if(path_exists_from_tensor_to_op(child_tensor, dst_op)) + if (path_exists_from_tensor_to_op(child_tensor, dst_op)) { return true; } @@ -574,16 +573,15 @@ private: return false; } - void build_operators_sequence_from_op( - Id op, - std::vector &ops_seq, - std::set &done_ops, - std::set &done_tensors) const + void build_operators_sequence_from_op(Id op, + std::vector &ops_seq, + std::set &done_ops, + std::set &done_tensors) const { - while(true) + while (true) { // If the operator has been added to the sequence, ignore it. - if(done_ops.find(op) != done_ops.end()) + if (done_ops.find(op) != done_ops.end()) { return; } @@ -593,9 +591,9 @@ private: // is added to the sequence. const auto src_tensors = _adj_src_tensors.at(op); - for(auto src : src_tensors) + for (auto src : src_tensors) { - if(done_tensors.find(src) == done_tensors.end()) + if (done_tensors.find(src) == done_tensors.end()) { return; } @@ -606,24 +604,24 @@ private: done_ops.insert(op); - OpPack pack{ op, src_tensors, dst_tensors }; + OpPack pack{op, src_tensors, dst_tensors}; ops_seq.push_back(pack); done_tensors.insert(dst_tensors.begin(), dst_tensors.end()); // Visit all the sink operators. // Call this function recursively unless there is only one sink. - if(dst_tensors.size() == 1 && _adj_dst_ops.at(dst_tensors[0]).size() == 1) + if (dst_tensors.size() == 1 && _adj_dst_ops.at(dst_tensors[0]).size() == 1) { op = _adj_dst_ops.at(dst_tensors[0])[0]; } else { - for(auto dst_tensor : dst_tensors) + for (auto dst_tensor : dst_tensors) { const auto dst_ops = _adj_dst_ops.at(dst_tensor); - for(auto dst_op : dst_ops) + for (auto dst_op : dst_ops) { build_operators_sequence_from_op(dst_op, ops_seq, done_ops, done_tensors); } @@ -640,8 +638,8 @@ private: AdjList _adj_src_ops{}; AdjList _adj_dst_ops{}; - bool _last_op_available{ false }; - OperatorId _last_op{ 0 }; + bool _last_op_available{false}; + OperatorId _last_op{0}; }; } // namespace dynamic_fusion diff --git a/src/dynamic_fusion/utils/Utils.h b/src/dynamic_fusion/utils/Utils.h index c9fc2c610f..3f4a2edd03 100644 --- a/src/dynamic_fusion/utils/Utils.h +++ b/src/dynamic_fusion/utils/Utils.h @@ -63,17 +63,21 @@ inline bool is_invalid_tensor(const ITensorInfo *tensor_info) /** Inline function to convert @ref Pool2dAttributes to PoolingLayerInfo */ -inline PoolingLayerInfo convert_pool_attr_to_pool_info(const Pool2dAttributes &pool_attr, bool mixed_precision = false, DataLayout data_layout = DataLayout::NHWC) +inline PoolingLayerInfo convert_pool_attr_to_pool_info(const Pool2dAttributes &pool_attr, + bool mixed_precision = false, + DataLayout data_layout = DataLayout::NHWC) { // Create PadStrideInfo const Size2D stride = pool_attr.stride(); const Padding2D padding = pool_attr.pad(); - const PadStrideInfo pad_stride(stride.x(), stride.y(), padding.left, padding.top, arm_compute::DimensionRoundingType::FLOOR); + const PadStrideInfo pad_stride(stride.x(), stride.y(), padding.left, padding.top, + arm_compute::DimensionRoundingType::FLOOR); - return PoolingLayerInfo(pool_attr.pool_type(), pool_attr.pool_size(), data_layout, pad_stride, pool_attr.exclude_padding(), mixed_precision); -} -} -} + return PoolingLayerInfo(pool_attr.pool_type(), pool_attr.pool_size(), data_layout, pad_stride, + pool_attr.exclude_padding(), mixed_precision); } +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute #endif /* SRC_DYNAMIC_FUSION_UTILS_UTILS */ -- cgit v1.2.1