From afd38f0c617d6f89b2b4532c6c44f116617e2b6f Mon Sep 17 00:00:00 2001 From: Felix Thomasmathibalan Date: Wed, 27 Sep 2023 17:46:17 +0100 Subject: Apply clang-format on repository Code is formatted as per a revised clang format configuration file(not part of this delivery). Version 14.0.6 is used. Exclusion List: - files with .cl extension - files that are not strictly C/C++ (e.g. Android.bp, Sconscript ...) And the following directories - compute_kernel_writer/validation/ - tests/ - include/ - src/core/NEON/kernels/convolution/ - src/core/NEON/kernels/arm_gemm/ - src/core/NEON/kernels/arm_conv/ - data/ There will be a follow up for formatting of .cl files and the files under tests/ and compute_kernel_writer/validation/. Signed-off-by: Felix Thomasmathibalan Change-Id: Ib7eb1fcf4e7537b9feaefcfc15098a804a3fde0a Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10391 Benchmark: Arm Jenkins Tested-by: Arm Jenkins Reviewed-by: Gunes Bayir --- .../gpu/ckw_driver/GpuCkwComponentArgument.cpp | 7 +- .../gpu/ckw_driver/GpuCkwComponentArgument.h | 6 +- .../sketch/gpu/ckw_driver/GpuCkwDriver.cpp | 21 ++- .../sketch/gpu/ckw_driver/GpuCkwDriver.h | 4 +- .../sketch/gpu/ckw_driver/GpuCkwKernelWriter.cpp | 12 +- .../gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp | 1 + .../gpu/ckw_driver/GpuCkwScopedKernelWriter.h | 2 +- .../sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp | 20 ++- .../sketch/gpu/ckw_driver/GpuCkwVariableTable.h | 8 +- .../sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h | 10 +- .../gpu/ckw_driver/components/GpuCkwActivation.cpp | 34 ++-- .../gpu/ckw_driver/components/GpuCkwActivation.h | 10 +- .../gpu/ckw_driver/components/GpuCkwCast.cpp | 44 +++--- .../sketch/gpu/ckw_driver/components/GpuCkwCast.h | 10 +- .../ckw_driver/components/GpuCkwDirectConv2d.cpp | 49 +++--- .../components/GpuCkwElementwiseBinary.cpp | 84 +++++----- .../components/GpuCkwElementwiseBinary.h | 14 +- .../gpu/ckw_driver/components/GpuCkwPool2d.cpp | 171 +++++++++++---------- .../gpu/ckw_driver/components/GpuCkwPool2d.h | 8 +- .../gpu/ckw_driver/components/GpuCkwResize.cpp | 76 +++++---- .../gpu/ckw_driver/components/GpuCkwStore.cpp | 10 +- .../sketch/gpu/ckw_driver/components/GpuCkwStore.h | 6 +- .../gpu/ckw_driver/components/utils/WriterHelper.h | 31 +++- .../components/utils/type_converter/Common.h | 35 ++--- .../utils/type_converter/ElementwiseBinary.h | 3 +- 25 files changed, 373 insertions(+), 303 deletions(-) (limited to 'src/dynamic_fusion/sketch/gpu/ckw_driver') diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp index 4b4c22fa1d..c4ab110c92 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp @@ -23,6 +23,7 @@ */ #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h" + #include "ckw/Error.h" namespace arm_compute @@ -36,12 +37,12 @@ GpuCkwComponentArgument::GpuCkwComponentArgument() { } -GpuCkwComponentArgument::GpuCkwComponentArgument(ckw::TensorOperand &tensor) - : _tensor(&tensor) +GpuCkwComponentArgument::GpuCkwComponentArgument(ckw::TensorOperand &tensor) : _tensor(&tensor) { } -GpuCkwComponentArgument &GpuCkwComponentArgument::init_virtual_tensor(ckw::TileOperand &tile, const ckw::TensorTileSampler &tile_sampler) +GpuCkwComponentArgument &GpuCkwComponentArgument::init_virtual_tensor(ckw::TileOperand &tile, + const ckw::TensorTileSampler &tile_sampler) { CKW_ASSERT(_tile == nullptr); diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h index 80f91389a0..863989a7bd 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h @@ -110,9 +110,9 @@ public: const ckw::TensorTileSampler &tile_sampler() const; private: - ckw::TensorOperand *_tensor{ nullptr }; - ckw::TileOperand *_tile{ nullptr }; - ckw::TensorTileSampler _tile_sampler{}; + ckw::TensorOperand *_tensor{nullptr}; + ckw::TileOperand *_tile{nullptr}; + ckw::TensorTileSampler _tile_sampler{}; }; } // namespace dynamic_fusion diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp index a24a172d77..c927f32bde 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp @@ -23,17 +23,16 @@ */ #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h" -#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" - #include "arm_compute/core/Error.h" #include "arm_compute/core/Window.h" + #include "src/common/utils/Log.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" - #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h" +#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" using namespace ckw; namespace arm_compute @@ -43,11 +42,11 @@ namespace experimental namespace dynamic_fusion { GpuCkwDriver::GpuCkwDriver(const GpuKernelComponentGroup &components) - : _components{ components }, _kernel{ GpuTargetLanguage::OpenCL }, _code{} + : _components{components}, _kernel{GpuTargetLanguage::OpenCL}, _code{} { // Generate kernel name std::string name = ""; - for(auto &comp : _components) + for (auto &comp : _components) { auto ckw_driver = comp->ckw_component_driver(); ARM_COMPUTE_ERROR_ON(ckw_driver == nullptr); @@ -60,7 +59,7 @@ GpuCkwDriver::GpuCkwDriver(const GpuKernelComponentGroup &components) GpuCkwScopedKernelWriter writer(&root_writer); GpuCkwVariableTable vtable{}; - for(auto &comp : _components) + for (auto &comp : _components) { auto ckw_driver = comp->ckw_component_driver(); ARM_COMPUTE_ERROR_ON(ckw_driver == nullptr); @@ -82,7 +81,7 @@ std::string GpuCkwDriver::get_code() std::string GpuCkwDriver::get_config_id() { std::string id = ""; - for(auto &comp : _components) + for (auto &comp : _components) { auto ckw_driver = comp->ckw_component_driver(); ARM_COMPUTE_ERROR_ON(ckw_driver == nullptr); @@ -101,9 +100,9 @@ Window GpuCkwDriver::get_window() const GpuKernelArgumentList GpuCkwDriver::get_kernel_arguments() { GpuKernelArgumentList args{}; - for(const auto &arg : _kernel.arguments()) + for (const auto &arg : _kernel.arguments()) { - switch(arg.type()) + switch (arg.type()) { case KernelArgument::Type::TensorStorage: { diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h index 19db575fea..2ca5fb435c 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h @@ -24,12 +24,12 @@ #ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWDRIVER #define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWDRIVER +#include "ckw/Kernel.h" + #include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" #include "src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h" -#include "ckw/Kernel.h" - #include #include diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.cpp index ca4f121566..5f8ce919e3 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.cpp +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.cpp @@ -23,10 +23,12 @@ */ #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h" + #include "ckw/Error.h" #include "ckw/TileInfo.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h" + namespace arm_compute { namespace experimental @@ -34,21 +36,21 @@ namespace experimental namespace dynamic_fusion { -GpuCkwKernelWriter::GpuCkwKernelWriter(ckw::Kernel &kernel) - : KernelWriter(kernel) +GpuCkwKernelWriter::GpuCkwKernelWriter(ckw::Kernel &kernel) : KernelWriter(kernel) { } void GpuCkwKernelWriter::op_load_once(GpuCkwComponentArgument *tensor_or_tile, const ckw::TensorTileSampler &sampler) { - if(!tensor_or_tile->has_tile()) + if (!tensor_or_tile->has_tile()) { CKW_ASSERT(tensor_or_tile->has_tensor()); auto &tensor = tensor_or_tile->tensor(); const auto tile_name = tensor.name() + "_tile"; - auto &tile = declare_tile(tile_name.c_str(), ckw::TileInfo(tensor.data_type(), sampler.height(), sampler.width())); + auto &tile = + declare_tile(tile_name.c_str(), ckw::TileInfo(tensor.data_type(), sampler.height(), sampler.width())); op_load(tile, tensor, sampler); diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp index 043fda9e6f..cbadbd9639 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp @@ -23,6 +23,7 @@ */ #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" + #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h index 4d11b5e3e4..81049bfe37 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h @@ -63,7 +63,7 @@ public: private: GpuCkwKernelWriter *_writer; - int32_t _parent_id_space; + int32_t _parent_id_space; }; } // namespace dynamic_fusion diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp index 37c27cd116..88a0cf7f43 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp @@ -23,11 +23,12 @@ */ #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" + #include namespace arm_compute @@ -36,19 +37,22 @@ namespace experimental { namespace dynamic_fusion { -GpuCkwComponentArgument *GpuCkwVariableTable::declare_variable(const GpuKernelComponentGroup &comp_group, GpuCkwScopedKernelWriter &writer, const ITensorInfo *tensor, TensorStorageType storage, - const std::string &alias) +GpuCkwComponentArgument *GpuCkwVariableTable::declare_variable(const GpuKernelComponentGroup &comp_group, + GpuCkwScopedKernelWriter &writer, + const ITensorInfo *tensor, + TensorStorageType storage, + const std::string &alias) { ARM_COMPUTE_ERROR_ON_MSG(!tensor->has_valid_id(), "Tensor info with valid id expected"); // Do not re-declare if the variable associated with the tensor has already been declared auto it = _vars.find(tensor->id()); - if(it != _vars.end()) + if (it != _vars.end()) { return &it->second; } - if(comp_group.is_intermediate_tensor(tensor)) + if (comp_group.is_intermediate_tensor(tensor)) { // Create a virtual tensor variable GpuCkwComponentArgument var; @@ -61,7 +65,7 @@ GpuCkwComponentArgument *GpuCkwVariableTable::declare_variable(const GpuKernelCo std::stringstream ss; ss << alias << "_t" << abs(tensor->id()); const auto uniq_name = ss.str(); - GpuCkwComponentArgument var{ writer->declare_tensor_argument(uniq_name, to_ckw(*tensor), to_ckw(storage)) }; + GpuCkwComponentArgument var{writer->declare_tensor_argument(uniq_name, to_ckw(*tensor), to_ckw(storage))}; auto &&inserted = _vars.emplace(tensor->id(), var); return &(inserted.first->second); } diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h index 0649dcba9d..2b118911b8 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h @@ -25,6 +25,7 @@ #define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWVARIABLETABLE #include "arm_compute/core/ITensorInfo.h" + #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h" #include @@ -58,8 +59,11 @@ public: * * @return GpuCkwComponentArgument* */ - GpuCkwComponentArgument *declare_variable(const GpuKernelComponentGroup &comp_group, GpuCkwScopedKernelWriter &writer, const ITensorInfo *tensor, TensorStorageType storage, - const std::string &alias = "unnamed"); + GpuCkwComponentArgument *declare_variable(const GpuKernelComponentGroup &comp_group, + GpuCkwScopedKernelWriter &writer, + const ITensorInfo *tensor, + TensorStorageType storage, + const std::string &alias = "unnamed"); private: std::map _vars{}; diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h index 14086f785e..52e56e2e35 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h @@ -25,6 +25,7 @@ #define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_IGPUCKWCOMPONENTDRIVER #include "arm_compute/core/Window.h" + #include "src/dynamic_fusion/sketch/ArgumentPack.h" #include "src/dynamic_fusion/sketch/gpu/components/Types.h" @@ -73,8 +74,7 @@ public: * @param[in] id Component id * @param[in] tensors Tensor arguments to the components */ - IGpuCkwComponentDriver(ComponentId id, const ArgumentPack &tensors) - : _id{ id }, _tensors{ tensors } + IGpuCkwComponentDriver(ComponentId id, const ArgumentPack &tensors) : _id{id}, _tensors{tensors} { } /** Destructor */ @@ -89,7 +89,9 @@ public: * * @note @p writer can only be passed via value since the new scope is created in the copy constructor */ - virtual void write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const = 0; + virtual void write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const = 0; /** Get tensor arguments */ ArgumentPack tensors() const { @@ -128,7 +130,7 @@ public: } private: - ComponentId _id{ -1 }; + ComponentId _id{-1}; ArgumentPack _tensors{}; }; } // namespace dynamic_fusion diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp index c07fac0e0d..c3b1b3c8bc 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp @@ -24,16 +24,18 @@ #include "GpuCkwActivation.h" #include "arm_compute/core/Error.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" +#include "arm_compute/core/Validate.h" #include "ckw/TensorTileSampler.h" + #include "src/core/helpers/WindowHelpers.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" + #include using namespace ckw; @@ -87,24 +89,25 @@ inline TensorTileSampler create_sampler(GpuCkwScopedKernelWriter &writer, int32_ GpuCkwActivation::GpuCkwActivation(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes) - : IGpuCkwComponentDriver{ id, tensors }, - _src{}, - _dst{}, - _attributes{ attributes } + : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{}, _attributes{attributes} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); } -void GpuCkwActivation::write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const +void GpuCkwActivation::write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const { const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window(); const unsigned int n0 = root_window.x().step(); const unsigned int m0 = root_window.y().step(); - GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src"); - GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); + GpuCkwComponentArgument *src = + vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src"); + GpuCkwComponentArgument *dst = + vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); load_src_dst_tiles_and_prepare_sampler(writer, src, dst, m0, n0, create_sampler); @@ -119,7 +122,7 @@ void GpuCkwActivation::write_component_code(const ComponentGroup &comp_group, Gp const auto &constant_B = writer->declare_tile("B_VAL", _attributes.b()); // Perform the operation. - switch(_attributes.activation()) + switch (_attributes.activation()) { case ActivationLayerInfo::ActivationFunction::LOGISTIC: { @@ -179,9 +182,10 @@ Window GpuCkwActivation::get_window() const // Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) unchanged // This is in line with the collapsing convention used by operators like Conv2d output_shape.collapse(2U, 1U); - constexpr unsigned int vector_size_byte_opencl = 16; - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0)); - Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration)); + constexpr unsigned int vector_size_byte_opencl = 16; + const unsigned int num_elems_processed_per_iteration = + adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0)); + Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration)); return win; } diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.h index e157e36cbf..386e933a72 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.h +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.h @@ -46,15 +46,15 @@ public: * @param[in] tensors Tensor arguments to the component * @param[in] attributes Component attributes */ - GpuCkwActivation(ComponentId id, - const ArgumentPack &tensors, - const Attributes &attributes); + GpuCkwActivation(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes); ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwActivation); /** Destructor */ ~GpuCkwActivation() override = default; // Inherited methods overriden: - virtual void write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const override; - Window get_window() const override; + virtual void write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const override; + Window get_window() const override; private: const ITensorInfo *_src; diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp index 6ecf2bac44..e8e5087633 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp @@ -24,16 +24,18 @@ #include "GpuCkwCast.h" #include "arm_compute/core/Error.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" +#include "arm_compute/core/Validate.h" #include "ckw/TensorTileSampler.h" + #include "src/core/helpers/WindowHelpers.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" + #include using namespace ckw; @@ -84,30 +86,29 @@ inline TensorTileSampler create_sampler(GpuCkwScopedKernelWriter &writer, int32_ } } // namespace -GpuCkwCast::GpuCkwCast(ComponentId id, - const ArgumentPack &tensors, - const Attributes &attributes) - : IGpuCkwComponentDriver{ id, tensors }, - _src{}, - _dst{}, - _attributes{ attributes } +GpuCkwCast::GpuCkwCast(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes) + : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{}, _attributes{attributes} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); } -void GpuCkwCast::write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const +void GpuCkwCast::write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const { const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window(); const unsigned int n0 = root_window.x().step(); const unsigned int m0 = root_window.y().step(); - GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src"); - GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); + GpuCkwComponentArgument *src = + vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src"); + GpuCkwComponentArgument *dst = + vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); // Load the source tile and prepare the sampler. - if(!src->has_tile()) + if (!src->has_tile()) { const auto sampler = create_sampler(writer, m0, n0); writer->op_load_once(src, sampler); @@ -122,7 +123,7 @@ void GpuCkwCast::write_component_code(const ComponentGroup &comp_group, GpuCkwVa const auto &sampler = src->tile_sampler(); // Prepare the output tile. - if(!dst->has_tile()) + if (!dst->has_tile()) { // Get Target datatype and convert it to ckw::DataType. ckw::DataType target_dt = dynamic_fusion::to_ckw(_attributes.data_type()); @@ -143,7 +144,7 @@ void GpuCkwCast::write_component_code(const ComponentGroup &comp_group, GpuCkwVa const size_t dst_size = data_size_from_type(_dst->data_type()); const bool cast_down = (src_size >= dst_size); - if(cast_down && is_data_type_quantized(_src->data_type())) + if (cast_down && is_data_type_quantized(_src->data_type())) { const auto &constant_x80 = writer->declare_tile("0x80", 0x80); writer->op_binary_expression(src_tile, src_tile, BinaryOp::BitwiseXOR, constant_x80); @@ -151,7 +152,7 @@ void GpuCkwCast::write_component_code(const ComponentGroup &comp_group, GpuCkwVa ckw::ConvertPolicy convert_policy = ckw::ConvertPolicy::None; - if(cast_down && (is_data_type_float(_src->data_type()) || _attributes.convert_policy() == ConvertPolicy::SATURATE)) + if (cast_down && (is_data_type_float(_src->data_type()) || _attributes.convert_policy() == ConvertPolicy::SATURATE)) { convert_policy = ckw::ConvertPolicy::Saturate; } @@ -167,9 +168,10 @@ Window GpuCkwCast::get_window() const // Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) unchanged // This is in line with the collapsing convention used by operators like Conv2d output_shape.collapse(2U, 1U); - constexpr unsigned int vector_size_byte_opencl = 16; - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0)); - Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration)); + constexpr unsigned int vector_size_byte_opencl = 16; + const unsigned int num_elems_processed_per_iteration = + adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0)); + Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration)); return win; } diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.h index 821cec1e19..2389301196 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.h +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.h @@ -46,15 +46,15 @@ public: * @param[in] tensors Tensor arguments to the component * @param[in] attributes Component attributes */ - GpuCkwCast(ComponentId id, - const ArgumentPack &tensors, - const Attributes &attributes); + GpuCkwCast(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes); ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwCast); /** Destructor */ ~GpuCkwCast() override = default; // Inherited methods overriden: - virtual void write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const override; - Window get_window() const override; + virtual void write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const override; + Window get_window() const override; private: const ITensorInfo *_src; diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp index 3c906646a6..7833da2334 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp @@ -25,21 +25,20 @@ #include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h" #include "arm_compute/core/Error.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" - +#include "arm_compute/core/Validate.h" #include "ckw/TensorTileSampler.h" #include "ckw/TileInfo.h" #include "src/core/helpers/WindowHelpers.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" namespace arm_compute { @@ -54,13 +53,7 @@ GpuCkwDirectConv2d::GpuCkwDirectConv2d(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes, const Settings &settings) - : IGpuCkwComponentDriver{ id, tensors }, - _src{}, - _wei{}, - _bia{}, - _dst{}, - _attributes{ attributes }, - _settings{ settings } + : IGpuCkwComponentDriver{id, tensors}, _src{}, _wei{}, _bia{}, _dst{}, _attributes{attributes}, _settings{settings} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _wei = this->tensors().get_const_tensor(TensorType::ACL_SRC_1); @@ -69,7 +62,9 @@ GpuCkwDirectConv2d::GpuCkwDirectConv2d(ComponentId id, ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _wei, _dst); // Bias can be null } -void GpuCkwDirectConv2d::write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const +void GpuCkwDirectConv2d::write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const { const auto desc = _settings.direct_conv_descriptor(); ARM_COMPUTE_ERROR_ON_MSG(desc.export_input_to_cl_image || desc.export_output_to_cl_image, @@ -99,15 +94,18 @@ void GpuCkwDirectConv2d::write_component_code(const ComponentGroup &comp_group, // extra loop to compute the left-over elements. const bool use_cl_image_for_weights = desc.export_weights_to_cl_image && (k0 == 4) && (K % 4 == 0); - GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src"); + GpuCkwComponentArgument *src = + vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src"); GpuCkwComponentArgument *wei = vtable.declare_variable( - comp_group, writer, _wei, use_cl_image_for_weights ? TensorStorageType::ClImage2dReadOnly : TensorStorageType::ClBufferUint8Ptr, "wei"); - GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); + comp_group, writer, _wei, + use_cl_image_for_weights ? TensorStorageType::ClImage2dReadOnly : TensorStorageType::ClBufferUint8Ptr, "wei"); + GpuCkwComponentArgument *dst = + vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); GpuCkwComponentArgument *bia = nullptr; const bool using_bias = _bia != nullptr; - if(using_bias) + if (using_bias) { bia = vtable.declare_variable(comp_group, writer, _bia, TensorStorageType::ClBufferUint8Ptr, "bia"); } @@ -154,7 +152,8 @@ void GpuCkwDirectConv2d::write_component_code(const ComponentGroup &comp_group, src_sampler.address_mode_x(TensorSamplerAddressModeX::None); // We cannot have out-of-bounds reads when the kernel height is equal to 1. Otherwise, we need to ensure the // indirection buffer mi does not contain negative values representing out-of-bounds reads. - src_sampler.address_mode_y(kernel_height == 1 ? TensorSamplerAddressModeY::None : TensorSamplerAddressModeY::SkipMinEdgeOnly); + src_sampler.address_mode_y(kernel_height == 1 ? TensorSamplerAddressModeY::None + : TensorSamplerAddressModeY::SkipMinEdgeOnly); src_sampler.address_mode_z(TensorSamplerAddressModeZ::None); TensorTileSampler wei_sampler; @@ -178,7 +177,7 @@ void GpuCkwDirectConv2d::write_component_code(const ComponentGroup &comp_group, dst_sampler.z(tile_0); dst_sampler.b(tile_bout); - if(!dst->has_tile()) + if (!dst->has_tile()) { auto &tile = writer->declare_tile("dst", TileInfo(to_ckw(_dst->data_type()), m0, n0)); dst->init_virtual_tensor(tile, dst_sampler); @@ -189,10 +188,10 @@ void GpuCkwDirectConv2d::write_component_code(const ComponentGroup &comp_group, // We create a 2d container of size (M0, 1) to store the indices for iteration TileContainer it; - for(int m = 0; m < m0; ++m) + for (int m = 0; m < m0; ++m) { - std::vector idx { std::to_string(m) }; - it.push_back({ idx }); + std::vector idx{std::to_string(m)}; + it.push_back({idx}); } const auto &tile_it = writer->declare_tile("it", it, ckw::DataType::Int32); @@ -289,9 +288,9 @@ void GpuCkwDirectConv2d::write_component_code(const ComponentGroup &comp_group, // Bias addition // NOTE: This operation will be removed from this kernel as the interface is standardized. The intended way of // performing bias addition is to fuse this convolution kernel with a following elementwise addition kernel. - if(using_bias) + if (using_bias) { - if(!bia->has_tile()) + if (!bia->has_tile()) { // Reuse the destination sampler for the bias writer->op_load_once(bia, dst_sampler); diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp index c8bf999261..2935ba45ea 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp @@ -24,22 +24,24 @@ #include "GpuCkwElementwiseBinary.h" #include "arm_compute/core/Error.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" +#include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" #include "ckw/TensorTileSampler.h" #include "ckw/types/TensorSamplerTypes.h" + #include "src/core/helpers/WindowHelpers.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h" #include "src/dynamic_fusion/sketch/gpu/components/utils/type_printer/ElementwiseBinary.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" #include "support/StringSupport.h" + #include #include @@ -53,11 +55,7 @@ namespace dynamic_fusion GpuCkwElementwiseBinary::GpuCkwElementwiseBinary(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes) - : IGpuCkwComponentDriver{ id, tensors }, - _lhs{}, - _rhs{}, - _dst{}, - _attributes{ attributes } + : IGpuCkwComponentDriver{id, tensors}, _lhs{}, _rhs{}, _dst{}, _attributes{attributes} { _lhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _rhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_1); @@ -65,15 +63,20 @@ GpuCkwElementwiseBinary::GpuCkwElementwiseBinary(ComponentId ARM_COMPUTE_ERROR_ON_NULLPTR(_lhs, _rhs, _dst); } -void GpuCkwElementwiseBinary::write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const +void GpuCkwElementwiseBinary::write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const { const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window(); const auto n0 = static_cast(root_window.x().step()); const auto m0 = static_cast(root_window.y().step()); - GpuCkwComponentArgument *lhs = vtable.declare_variable(comp_group, writer, _lhs, TensorStorageType::ClBufferUint8Ptr, "lhs"); - GpuCkwComponentArgument *rhs = vtable.declare_variable(comp_group, writer, _rhs, TensorStorageType::ClBufferUint8Ptr, "rhs"); - GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); + GpuCkwComponentArgument *lhs = + vtable.declare_variable(comp_group, writer, _lhs, TensorStorageType::ClBufferUint8Ptr, "lhs"); + GpuCkwComponentArgument *rhs = + vtable.declare_variable(comp_group, writer, _rhs, TensorStorageType::ClBufferUint8Ptr, "rhs"); + GpuCkwComponentArgument *dst = + vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); auto &gid_0 = writer->declare_tile("gid_0", ckw::DataType::Int32); auto &gid_1 = writer->declare_tile("gid_1", ckw::DataType::Int32); @@ -86,32 +89,36 @@ void GpuCkwElementwiseBinary::write_component_code(const ComponentGroup &comp_gr auto &const_0 = writer->declare_tile("0", 0); // Load the LHS and RHS tiles - if(!lhs->has_tile()) + if (!lhs->has_tile()) { - auto sampler = create_boundary_aware_2d_sampler(writer, gid_0, gid_1, _lhs->dimension(0), _lhs->dimension(1), n0, m0, "lhs_", const_0); + auto sampler = create_boundary_aware_2d_sampler(writer, gid_0, gid_1, _lhs->dimension(0), _lhs->dimension(1), + n0, m0, "lhs_", const_0); sampler.format(TensorSamplerFormat::C_WH_1); // 3rd dimension collapsed with 2nd dimension sampler.z(const_0); sampler.b(gid_2); writer->op_load_once(lhs, sampler); } - if(!rhs->has_tile()) + if (!rhs->has_tile()) { - auto sampler = create_boundary_aware_2d_sampler(writer, gid_0, gid_1, _rhs->dimension(0), _rhs->dimension(1), n0, m0, "rhs_", const_0); + auto sampler = create_boundary_aware_2d_sampler(writer, gid_0, gid_1, _rhs->dimension(0), _rhs->dimension(1), + n0, m0, "rhs_", const_0); sampler.format(TensorSamplerFormat::C_WH_1); // 3rd dimension collapsed with 2nd dimension sampler.z(const_0); sampler.b(gid_2); writer->op_load_once(rhs, sampler); } - auto dst_sampler = create_boundary_aware_2d_sampler(writer, gid_0, gid_1, _dst->dimension(0), _dst->dimension(1), n0, m0, "dst_", const_0); + auto dst_sampler = create_boundary_aware_2d_sampler(writer, gid_0, gid_1, _dst->dimension(0), _dst->dimension(1), + n0, m0, "dst_", const_0); dst_sampler.format(TensorSamplerFormat::C_WH_1); // 3rd dimension collapsed with 2nd dimension dst_sampler.z(const_0); dst_sampler.b(gid_2); // Prepare the output tile. - if(!dst->has_tile()) + if (!dst->has_tile()) { - auto &tile = writer->declare_tile("dst_tile", ckw::TileInfo(to_ckw(_dst->data_type()), dst_sampler.height(), dst_sampler.width())); + auto &tile = writer->declare_tile( + "dst_tile", ckw::TileInfo(to_ckw(_dst->data_type()), dst_sampler.height(), dst_sampler.width())); dst->init_virtual_tensor(tile, dst_sampler); } @@ -131,9 +138,10 @@ Window GpuCkwElementwiseBinary::get_window() const // Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) unchanged // This is in line with the collapsing convention used by operators like Conv2d output_shape.collapse(2U, 1U); - constexpr unsigned int vector_size_byte_opencl = 16; - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0)); - Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration)); + constexpr unsigned int vector_size_byte_opencl = 16; + const unsigned int num_elems_processed_per_iteration = + adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0)); + Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration)); return win; } @@ -141,11 +149,12 @@ Window GpuCkwElementwiseBinary::get_window() const std::string GpuCkwElementwiseBinary::get_name(const ComponentGroup &comp_group) const { ARM_COMPUTE_UNUSED(comp_group); - const std::vector build_params = - { + const std::vector build_params = { "elementwise_binary", - "op", to_string(_attributes.operation()), - "dt", lower_string(string_from_data_type(_dst->data_type())), + "op", + to_string(_attributes.operation()), + "dt", + lower_string(string_from_data_type(_dst->data_type())), }; return join(build_params, "_"); } @@ -154,13 +163,16 @@ std::string GpuCkwElementwiseBinary::get_tuner_id(const ComponentGroup &comp_gro { ARM_COMPUTE_UNUSED(comp_group); /// NOTE: Hardcoded for now, the parameters should ideally be exported by ckw (a selection of constant tiles) - std::vector build_params = - { + std::vector build_params = { "elementwise_binary", - "op", to_string(_attributes.operation()), - "dt", lower_string(string_from_data_type(_dst->data_type())), - "dst_dim0", support::cpp11::to_string(_dst->dimension(0)), - "dst_dim1", support::cpp11::to_string(_dst->dimension(1)), + "op", + to_string(_attributes.operation()), + "dt", + lower_string(string_from_data_type(_dst->data_type())), + "dst_dim0", + support::cpp11::to_string(_dst->dimension(0)), + "dst_dim1", + support::cpp11::to_string(_dst->dimension(1)), }; return join(build_params, "_"); } diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h index e9c41530f8..1a20d4c533 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h @@ -46,17 +46,17 @@ public: * @param[in] tensors Tensor arguments to the component * @param[in] attributes Component attributes */ - GpuCkwElementwiseBinary(ComponentId id, - const ArgumentPack &tensors, - const Attributes &attributes); + GpuCkwElementwiseBinary(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes); ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwElementwiseBinary); /** Destructor */ ~GpuCkwElementwiseBinary() override = default; // Inherited methods overriden: - virtual void write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const override; - Window get_window() const override; - std::string get_name(const ComponentGroup &comp_group) const override; - std::string get_tuner_id(const ComponentGroup &comp_group) const override; + virtual void write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const override; + Window get_window() const override; + std::string get_name(const ComponentGroup &comp_group) const override; + std::string get_tuner_id(const ComponentGroup &comp_group) const override; private: const ITensorInfo *_lhs; diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp index 9c9a298132..8ab3ec3a55 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp @@ -24,17 +24,18 @@ #include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h" #include "arm_compute/core/Error.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" +#include "arm_compute/core/Validate.h" #include "ckw/TensorTileSampler.h" + #include "src/core/helpers/WindowHelpers.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" using namespace ckw; @@ -48,11 +49,7 @@ GpuCkwPool2d::GpuCkwPool2d(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes, const Settings &settings) - : IGpuCkwComponentDriver{ id, tensors }, - _src{}, - _dst{}, - _attributes{ attributes }, - _settings{ settings } + : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{}, _attributes{attributes}, _settings{settings} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); @@ -60,14 +57,18 @@ GpuCkwPool2d::GpuCkwPool2d(ComponentId id, ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); } -void GpuCkwPool2d::write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const +void GpuCkwPool2d::write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const { const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window(); const unsigned int n0 = root_window.x().step(); const unsigned int m0 = root_window.y().step(); - GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src"); - GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); + GpuCkwComponentArgument *src = + vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src"); + GpuCkwComponentArgument *dst = + vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); TileOperand &gid_0 = writer->declare_tile("gid_0", ckw::DataType::Int32); TileOperand &gid_1 = writer->declare_tile("gid_1", ckw::DataType::Int32); @@ -90,23 +91,26 @@ void GpuCkwPool2d::write_component_code(const ComponentGroup &comp_group, GpuCkw const auto src_data_type = _src->data_type(); // Check if this is global pooling path - const bool is_global_pooling = (pool_size_x == src_width) && (pool_size_y == src_height) && (pad_x == 0) && (pad_y == 0); + const bool is_global_pooling = + (pool_size_x == src_width) && (pool_size_y == src_height) && (pad_x == 0) && (pad_y == 0); // Check if this a case of FP_MIXED_PRECISION - const bool use_fp_mixed_precision = (src_data_type == DataType::F16) && _settings.mixed_precision() && _attributes.pool_type() != PoolingType::MAX; - const auto acc_data_type = (use_fp_mixed_precision) ? (DataType::F32) : (src_data_type); + const bool use_fp_mixed_precision = + (src_data_type == DataType::F16) && _settings.mixed_precision() && _attributes.pool_type() != PoolingType::MAX; + const auto acc_data_type = (use_fp_mixed_precision) ? (DataType::F32) : (src_data_type); TileOperand &const_0 = writer->declare_tile("0", 0); const TileOperand &const_1 = writer->declare_tile("1", 1); const TileOperand &const_lowest_value = writer->declare_tile("LOWEST_VALUE", std::numeric_limits::lowest()); const TileOperand &pool_size_x_tile = writer->declare_tile("POOL_SIZE_X", pool_size_x); const TileOperand &pool_size_y_tile = writer->declare_tile("POOL_SIZE_Y", pool_size_y); - const TileOperand &stride_x_tile = writer->declare_tile("STRIDE_X", static_cast(_attributes.stride().x())); - const TileOperand &stride_y_tile = writer->declare_tile("STRIDE_Y", static_cast(_attributes.stride().y())); - const TileOperand &pad_x_tile = writer->declare_tile("PAD_X", pad_x); - const TileOperand &pad_y_tile = writer->declare_tile("PAD_Y", pad_y); - const TileOperand &dst_height_tile = writer->declare_tile("DST_HEIGHT", static_cast(_dst->dimension(height_idx))); - const TileOperand &src_height_tile = writer->declare_tile("SRC_HEIGHT", src_height); - const TileOperand &src_width_tile = writer->declare_tile("SRC_WIDTH", src_width); + const TileOperand &stride_x_tile = writer->declare_tile("STRIDE_X", static_cast(_attributes.stride().x())); + const TileOperand &stride_y_tile = writer->declare_tile("STRIDE_Y", static_cast(_attributes.stride().y())); + const TileOperand &pad_x_tile = writer->declare_tile("PAD_X", pad_x); + const TileOperand &pad_y_tile = writer->declare_tile("PAD_Y", pad_y); + const TileOperand &dst_height_tile = + writer->declare_tile("DST_HEIGHT", static_cast(_dst->dimension(height_idx))); + const TileOperand &src_height_tile = writer->declare_tile("SRC_HEIGHT", src_height); + const TileOperand &src_width_tile = writer->declare_tile("SRC_WIDTH", src_width); TileOperand &idx_out_n = writer->declare_tile("idx_out_n", ckw::DataType::Int32); TileOperand &idx_out_h = writer->declare_tile("idx_out_h", ckw::DataType::Int32); @@ -145,7 +149,7 @@ void GpuCkwPool2d::write_component_code(const ComponentGroup &comp_group, GpuCkw // Prepare dst tensor and tile TileInfo dst_tile_info = TileInfo(to_ckw(src_data_type), m0, n0); - if(!dst->has_tile()) + if (!dst->has_tile()) { TileOperand &dst_tile = writer->declare_tile("dst_tile", dst_tile_info); dst->init_virtual_tensor(dst_tile, dst_sampler); @@ -156,14 +160,15 @@ void GpuCkwPool2d::write_component_code(const ComponentGroup &comp_group, GpuCkw const TileOperand &res_tile = writer->declare_tile("res_tile", TileInfo(to_ckw(acc_data_type), m0, n0)); // Initialise result tile with appropriate value - if(_attributes.pool_type() == PoolingType::MAX) + if (_attributes.pool_type() == PoolingType::MAX) { - if(_settings.use_inf_as_limit()) + if (_settings.use_inf_as_limit()) { TileContainer minus_inf_tile_container; std::vector value = std::vector(n0, "(-INFINITY)"); - minus_inf_tile_container.push_back({ value }); - const TileOperand &minus_inf = writer->declare_tile("minus_inf_const", minus_inf_tile_container, to_ckw(acc_data_type)); + minus_inf_tile_container.push_back({value}); + const TileOperand &minus_inf = + writer->declare_tile("minus_inf_const", minus_inf_tile_container, to_ckw(acc_data_type)); writer->op_assign(res_tile, minus_inf); } else @@ -209,7 +214,7 @@ void GpuCkwPool2d::write_component_code(const ComponentGroup &comp_group, GpuCkw writer->op_binary_elementwise_function(pool_y_e, BinaryFunction::Min, pool_size_y_tile, pool_y_e); const TileOperand &filter_size = writer->declare_tile("filter_size", ckw::DataType::Int32); - if(_attributes.exclude_padding()) + if (_attributes.exclude_padding()) { const TileOperand &y_diff = writer->declare_tile("y_diff", ckw::DataType::Int32); const TileOperand &x_diff = writer->declare_tile("x_diff", ckw::DataType::Int32); @@ -227,7 +232,7 @@ void GpuCkwPool2d::write_component_code(const ComponentGroup &comp_group, GpuCkw const TileOperand &x = writer->declare_tile("x", ckw::DataType::Int32); const TileOperand &y = writer->declare_tile("y", ckw::DataType::Int32); - if(is_global_pooling) + if (is_global_pooling) { writer->op_assign(x, const_0); writer->op_assign(y, const_0); @@ -242,76 +247,80 @@ void GpuCkwPool2d::write_component_code(const ComponentGroup &comp_group, GpuCkw } // Y dim for-loop - writer->op_for_loop(y, BinaryOp::Less, pool_y_e, y, AssignmentOp::Increment, const_1, [&]() - { - // Reset the iterator for the inner loop - if(is_global_pooling) - { - writer->op_assign(x, const_0); - } - else + writer->op_for_loop( + y, BinaryOp::Less, pool_y_e, y, AssignmentOp::Increment, const_1, + [&]() { - writer->op_assign(x, pool_x_s); - } - - TileOperand &a_y = writer->declare_tile("a_y", ckw::DataType::Int32); - writer->op_binary_expression(a_y, idx_in_h, BinaryOp::Add, y); - - // X dim for-loop - writer->op_for_loop(x, BinaryOp::Less, pool_x_e, x, AssignmentOp::Increment, const_1, [&]() - { - TileOperand &a_x = writer->declare_tile("a_x", ckw::DataType::Int32); - writer->op_binary_expression(a_x, idx_in_w, BinaryOp::Add, x); - - TileOperand &src_tile = writer->declare_tile("src_tile", TileInfo(to_ckw(acc_data_type), m0, n0)); - - src_sampler.y(a_x); - src_sampler.z(a_y); - - // Load src tile - if(use_fp_mixed_precision) + // Reset the iterator for the inner loop + if (is_global_pooling) { - TileOperand &src_uncasted_tile = writer->declare_tile("uncasted_src_tile", dst_tile_info); - writer->op_load(src_uncasted_tile, src->tensor(), src_sampler); - writer->op_cast_expression(src_tile, src_uncasted_tile, ckw::ConvertPolicy::None); + writer->op_assign(x, const_0); } else { - writer->op_load(src_tile, src->tensor(), src_sampler); + writer->op_assign(x, pool_x_s); } - // Take the square of the input, for L2 Pooling - if(_attributes.pool_type() == PoolingType::L2) - { - writer->op_binary_expression(src_tile, src_tile, BinaryOp::Mul, src_tile); - } - - // Perfom Pooling op - if(_attributes.pool_type() == PoolingType::MAX) - { - writer->op_binary_elementwise_function(res_tile, BinaryFunction::Max, res_tile, src_tile); - } - else - { - writer->op_binary_expression(res_tile, res_tile, BinaryOp::Add, src_tile); - } + TileOperand &a_y = writer->declare_tile("a_y", ckw::DataType::Int32); + writer->op_binary_expression(a_y, idx_in_h, BinaryOp::Add, y); + + // X dim for-loop + writer->op_for_loop( + x, BinaryOp::Less, pool_x_e, x, AssignmentOp::Increment, const_1, + [&]() + { + TileOperand &a_x = writer->declare_tile("a_x", ckw::DataType::Int32); + writer->op_binary_expression(a_x, idx_in_w, BinaryOp::Add, x); + + TileOperand &src_tile = writer->declare_tile("src_tile", TileInfo(to_ckw(acc_data_type), m0, n0)); + + src_sampler.y(a_x); + src_sampler.z(a_y); + + // Load src tile + if (use_fp_mixed_precision) + { + TileOperand &src_uncasted_tile = writer->declare_tile("uncasted_src_tile", dst_tile_info); + writer->op_load(src_uncasted_tile, src->tensor(), src_sampler); + writer->op_cast_expression(src_tile, src_uncasted_tile, ckw::ConvertPolicy::None); + } + else + { + writer->op_load(src_tile, src->tensor(), src_sampler); + } + + // Take the square of the input, for L2 Pooling + if (_attributes.pool_type() == PoolingType::L2) + { + writer->op_binary_expression(src_tile, src_tile, BinaryOp::Mul, src_tile); + } + + // Perfom Pooling op + if (_attributes.pool_type() == PoolingType::MAX) + { + writer->op_binary_elementwise_function(res_tile, BinaryFunction::Max, res_tile, src_tile); + } + else + { + writer->op_binary_expression(res_tile, res_tile, BinaryOp::Add, src_tile); + } + }); }); - }); - if((_attributes.pool_type() == PoolingType::AVG) || (_attributes.pool_type() == PoolingType::L2)) + if ((_attributes.pool_type() == PoolingType::AVG) || (_attributes.pool_type() == PoolingType::L2)) { // filter_size is automatically broadcasted in the operation writer->op_binary_expression(res_tile, res_tile, BinaryOp::Div, filter_size); } // Take square root of the result in L2 pooling - if(_attributes.pool_type() == PoolingType::L2) + if (_attributes.pool_type() == PoolingType::L2) { writer->op_unary_elementwise_function(res_tile, UnaryFunction::Sqrt, res_tile); } // Store the results and do casting if FP_MIXED_PRECISION - if(use_fp_mixed_precision) + if (use_fp_mixed_precision) { writer->op_cast_expression(dst_tile, res_tile, ckw::ConvertPolicy::None); } @@ -326,7 +335,7 @@ Window GpuCkwPool2d::get_window() const ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized"); TensorShape output_shape = _dst->tensor_shape(); - const unsigned int vec_size = adjust_vec_size(((_dst->data_type() == DataType::F32) ? 2 : 4), _dst->dimension(0)); + const unsigned int vec_size = adjust_vec_size(((_dst->data_type() == DataType::F32) ? 2 : 4), _dst->dimension(0)); // Create and configure kernel window auto win = calculate_max_window(output_shape, Steps(vec_size)); win = win.collapse_if_possible(win, Window::DimZ); // collapse window on batch size. diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h index 2ccf255236..822282a108 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h @@ -59,9 +59,11 @@ public: /** Destructor */ ~GpuCkwPool2d() override = default; // Inherited methods overriden: - virtual void write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const override; - Window get_window() const override; - std::string get_name(const ComponentGroup &comp_group) const override; + virtual void write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const override; + Window get_window() const override; + std::string get_name(const ComponentGroup &comp_group) const override; private: const ITensorInfo *_src; diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.cpp index d997c82dae..f2a7d41afd 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.cpp +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.cpp @@ -28,14 +28,13 @@ #include "src/core/helpers/WindowHelpers.h" #include "src/core/utils/ScaleUtils.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" - +#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" #include "support/StringSupport.h" namespace arm_compute @@ -49,20 +48,17 @@ namespace constexpr unsigned int opencl_vector_size_in_bytes = 16; } // namespace -GpuCkwResize::GpuCkwResize(ComponentId id, - const ArgumentPack &tensors, - const Attributes &attributes) - : IGpuCkwComponentDriver{ id, tensors }, - _src{}, - _dst{}, - _attributes{ attributes } +GpuCkwResize::GpuCkwResize(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes) + : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{}, _attributes{attributes} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC); _dst = this->tensors().get_const_tensor(TensorType::ACL_DST); ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); } -void GpuCkwResize::do_nearest_neighbor_resize(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const +void GpuCkwResize::do_nearest_neighbor_resize(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const { const size_t width_idx = get_data_layout_dimension_index(_dst->data_layout(), DataLayoutDimension::WIDTH); const size_t height_idx = get_data_layout_dimension_index(_dst->data_layout(), DataLayoutDimension::HEIGHT); @@ -72,12 +68,16 @@ void GpuCkwResize::do_nearest_neighbor_resize(const ComponentGroup &comp_group, const int32_t m0 = root_window.y().step(); const int32_t partial_n0 = _dst->dimension(0) % n0; - GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src"); - GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); + GpuCkwComponentArgument *src = + vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src"); + GpuCkwComponentArgument *dst = + vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); // Constants - const float scale_x = scale_utils::calculate_resize_ratio(_src->dimension(width_idx), _dst->dimension(width_idx), _attributes.align_corners()); - const float scale_y = scale_utils::calculate_resize_ratio(_src->dimension(height_idx), _dst->dimension(height_idx), _attributes.align_corners()); + const float scale_x = scale_utils::calculate_resize_ratio(_src->dimension(width_idx), _dst->dimension(width_idx), + _attributes.align_corners()); + const float scale_y = scale_utils::calculate_resize_ratio(_src->dimension(height_idx), _dst->dimension(height_idx), + _attributes.align_corners()); const auto &tile_scale_x = writer->declare_tile("scale_x", scale_x); const auto &tile_scale_y = writer->declare_tile("scale_y", scale_y); const auto &tile_0 = writer->declare_tile("0", 0); @@ -112,7 +112,7 @@ void GpuCkwResize::do_nearest_neighbor_resize(const ComponentGroup &comp_group, const auto &tile_xi_f = writer->declare_tile("xi_f", ckw::DataType::Fp32); const auto &tile_yi_f = writer->declare_tile("yi_f", ckw::DataType::Fp32); - switch(_attributes.sampling_policy()) + switch (_attributes.sampling_policy()) { case SamplingPolicy::TOP_LEFT: // xi_f = (xo * scale_x) @@ -138,7 +138,7 @@ void GpuCkwResize::do_nearest_neighbor_resize(const ComponentGroup &comp_group, ARM_COMPUTE_ERROR("Unsupported sampling policy"); } - if(_attributes.align_corners()) + if (_attributes.align_corners()) { writer->op_unary_elementwise_function(tile_xi_f, UnaryFunction::Round, tile_xi_f); writer->op_unary_elementwise_function(tile_yi_f, UnaryFunction::Round, tile_yi_f); @@ -161,8 +161,10 @@ void GpuCkwResize::do_nearest_neighbor_resize(const ComponentGroup &comp_group, auto &tile_xi0 = writer->declare_tile("xi0", ckw::DataType::Int32); auto &tile_yi0 = writer->declare_tile("yi0", ckw::DataType::Int32); - writer->op_ternary_elementwise_function(tile_xi0, TernaryFunction::Clamp, tile_xi_f_int, tile_0, tile_src_w_minus_1); - writer->op_ternary_elementwise_function(tile_yi0, TernaryFunction::Clamp, tile_yi_f_int, tile_0, tile_src_h_minus_1); + writer->op_ternary_elementwise_function(tile_xi0, TernaryFunction::Clamp, tile_xi_f_int, tile_0, + tile_src_w_minus_1); + writer->op_ternary_elementwise_function(tile_yi0, TernaryFunction::Clamp, tile_yi_f_int, tile_0, + tile_src_h_minus_1); TensorTileSampler src_sampler; src_sampler.x(tile_co); @@ -199,7 +201,9 @@ void GpuCkwResize::do_nearest_neighbor_resize(const ComponentGroup &comp_group, writer->op_assign(tile_dst, tile_src); } -void GpuCkwResize::do_bilinear_resize(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const +void GpuCkwResize::do_bilinear_resize(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const { const size_t width_idx = get_data_layout_dimension_index(_dst->data_layout(), DataLayoutDimension::WIDTH); const size_t height_idx = get_data_layout_dimension_index(_dst->data_layout(), DataLayoutDimension::HEIGHT); @@ -209,12 +213,16 @@ void GpuCkwResize::do_bilinear_resize(const ComponentGroup &comp_group, GpuCkwVa const int32_t m0 = root_window.y().step(); const int32_t partial_n0 = _dst->dimension(0) % n0; - GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src"); - GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); + GpuCkwComponentArgument *src = + vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src"); + GpuCkwComponentArgument *dst = + vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); // Constants - const float scale_x = scale_utils::calculate_resize_ratio(_src->dimension(width_idx), _dst->dimension(width_idx), _attributes.align_corners()); - const float scale_y = scale_utils::calculate_resize_ratio(_src->dimension(height_idx), _dst->dimension(height_idx), _attributes.align_corners()); + const float scale_x = scale_utils::calculate_resize_ratio(_src->dimension(width_idx), _dst->dimension(width_idx), + _attributes.align_corners()); + const float scale_y = scale_utils::calculate_resize_ratio(_src->dimension(height_idx), _dst->dimension(height_idx), + _attributes.align_corners()); const auto &tile_scale_x = writer->declare_tile("scale_x", scale_x); const auto &tile_scale_y = writer->declare_tile("scale_y", scale_y); const auto &tile_0 = writer->declare_tile("0", 0); @@ -251,7 +259,7 @@ void GpuCkwResize::do_bilinear_resize(const ComponentGroup &comp_group, GpuCkwVa const auto &tile_xi_f = writer->declare_tile("xi_f", ckw::DataType::Fp32); const auto &tile_yi_f = writer->declare_tile("yi_f", ckw::DataType::Fp32); - switch(_attributes.sampling_policy()) + switch (_attributes.sampling_policy()) { case SamplingPolicy::TOP_LEFT: // xi_f = (xo * scale_x) @@ -312,8 +320,10 @@ void GpuCkwResize::do_bilinear_resize(const ComponentGroup &comp_group, GpuCkwVa writer->op_ternary_elementwise_function(tile_xi0, TernaryFunction::Clamp, tile_xi, tile_0, tile_src_w_minus_1); writer->op_ternary_elementwise_function(tile_yi0, TernaryFunction::Clamp, tile_yi, tile_0, tile_src_h_minus_1); - writer->op_ternary_elementwise_function(tile_xi1, TernaryFunction::Clamp, tile_xi_plus_1, tile_0, tile_src_w_minus_1); - writer->op_ternary_elementwise_function(tile_yi1, TernaryFunction::Clamp, tile_yi_plus_1, tile_0, tile_src_h_minus_1); + writer->op_ternary_elementwise_function(tile_xi1, TernaryFunction::Clamp, tile_xi_plus_1, tile_0, + tile_src_w_minus_1); + writer->op_ternary_elementwise_function(tile_yi1, TernaryFunction::Clamp, tile_yi_plus_1, tile_0, + tile_src_h_minus_1); TensorTileSampler in_sampler; in_sampler.x(tile_co); @@ -388,7 +398,7 @@ void GpuCkwResize::do_bilinear_resize(const ComponentGroup &comp_group, GpuCkwVa writer->op_binary_expression(tile_a1, tile_yi_f, BinaryOp::Sub, tile_yi_float); writer->op_binary_expression(tile_b1, tile_1, BinaryOp::Sub, tile_a1); - if(is_data_type_float(_src->data_type())) + if (is_data_type_float(_src->data_type())) { // Cast weights to source type const auto &tile_a_src_type = writer->declare_tile("a_src_t", to_ckw(_src->data_type())); @@ -461,9 +471,11 @@ void GpuCkwResize::do_bilinear_resize(const ComponentGroup &comp_group, GpuCkwVa } } -void GpuCkwResize::write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const +void GpuCkwResize::write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const { - switch(_attributes.interpolation_policy()) + switch (_attributes.interpolation_policy()) { case InterpolationPolicy::NEAREST_NEIGHBOR: do_nearest_neighbor_resize(comp_group, vtable, writer); diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp index 8917391537..889706b0c0 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp @@ -24,10 +24,12 @@ #include "GpuCkwStore.h" #include "arm_compute/core/Error.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" + #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" + #include namespace arm_compute @@ -37,12 +39,14 @@ namespace experimental namespace dynamic_fusion { GpuCkwStore::GpuCkwStore(ComponentId id, const ArgumentPack &tensors) - : IGpuCkwComponentDriver{ id, tensors }, _src{}, _dst{} + : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); } -void GpuCkwStore::write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const +void GpuCkwStore::write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const { auto src = vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src"); auto dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h index 8e35651caf..f1f0e6747b 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h @@ -48,8 +48,10 @@ public: /** Destructor */ ~GpuCkwStore() override = default; // Inherited methods overriden: - virtual void write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const override; - std::string get_name(const ComponentGroup &comp_group) const override; + virtual void write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const override; + std::string get_name(const ComponentGroup &comp_group) const override; private: const ITensorInfo *_src; diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h index e2b8584b99..6ba2b2f651 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h @@ -26,6 +26,7 @@ #include "arm_compute/core/utils/misc/Utility.h" #include "ckw/TensorTileSampler.h" + #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" @@ -44,9 +45,14 @@ using SamplerCreator = std::functionhas_tile()) + if (!src->has_tile()) { const auto sampler = create_sampler(writer, m0, n0); writer->op_load_once(src, sampler); @@ -61,7 +67,7 @@ inline void load_src_dst_tiles_and_prepare_sampler(GpuCkwScopedKernelWriter &wri const auto &sampler = src->tile_sampler(); // Prepare the output tile. - if(!dst->has_tile()) + if (!dst->has_tile()) { auto &tile = writer->declare_tile("dst_tile", src_tile.tile_info()); dst->init_virtual_tensor(tile, sampler); @@ -78,7 +84,13 @@ inline void load_src_dst_tiles_and_prepare_sampler(GpuCkwScopedKernelWriter &wri * @param[in] prefix Prefix to all the tiles declared within this function * @param[in] const_0 Constant tile of value 0 */ -inline void get_coord(GpuCkwScopedKernelWriter writer, TileOperand &coord, const TileOperand &gid, int32_t step_v, int32_t leftover_step_v, const std::string &prefix, const TileOperand &const_0) +inline void get_coord(GpuCkwScopedKernelWriter writer, + TileOperand &coord, + const TileOperand &gid, + int32_t step_v, + int32_t leftover_step_v, + const std::string &prefix, + const TileOperand &const_0) { auto &step = writer->declare_tile(prefix + "step", step_v); auto &leftover_step = writer->declare_tile(prefix + "leftover_step", leftover_step_v); @@ -122,8 +134,15 @@ inline void get_coord(GpuCkwScopedKernelWriter writer, TileOperand &coord, const * * @return TensorTileSampler */ -inline TensorTileSampler create_boundary_aware_2d_sampler(GpuCkwScopedKernelWriter writer, TileOperand &gid_0, TileOperand &gid_1, int32_t dim0_v, int32_t dim1_v, int32_t n0_v, int32_t m0_v, - const std::string prefix, TileOperand &const_0) +inline TensorTileSampler create_boundary_aware_2d_sampler(GpuCkwScopedKernelWriter writer, + TileOperand &gid_0, + TileOperand &gid_1, + int32_t dim0_v, + int32_t dim1_v, + int32_t n0_v, + int32_t m0_v, + const std::string prefix, + TileOperand &const_0) { // Clamp tile size [n0, m0] against dimension [dim0, dim1] // This is needed to: diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h index 34b1283add..5da317bf38 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h @@ -28,6 +28,7 @@ #include "arm_compute/core/TensorShape.h" #include "arm_compute/core/Types.h" #include "ckw/TensorInfo.h" + #include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" namespace arm_compute @@ -38,7 +39,7 @@ namespace dynamic_fusion { inline ckw::DataType to_ckw(DataType dt) { - switch(dt) + switch (dt) { case DataType::F32: return ckw::DataType::Fp32; @@ -65,21 +66,16 @@ inline ckw::DataType to_ckw(DataType dt) inline ckw::TensorShape to_ckw(const TensorShape &shape) { - ARM_COMPUTE_ERROR_ON(shape.num_max_dimensions < std::tuple_size {}); - ARM_COMPUTE_ERROR_ON(std::tuple_size {} != 5); + ARM_COMPUTE_ERROR_ON(shape.num_max_dimensions < std::tuple_size{}); + ARM_COMPUTE_ERROR_ON(std::tuple_size{} != 5); /// NOTE: Overflow danger. Use size_t? - return ckw::TensorShape - { - static_cast(shape[0]), - static_cast(shape[1]), - static_cast(shape[2]), - static_cast(shape[3]), - static_cast(shape[4]) - }; + return ckw::TensorShape{static_cast(shape[0]), static_cast(shape[1]), + static_cast(shape[2]), static_cast(shape[3]), + static_cast(shape[4])}; } inline ckw::TensorDataLayout to_ckw(DataLayout dl) { - switch(dl) + switch (dl) { case DataLayout::NHWC: return ckw::TensorDataLayout::Nhwc; @@ -91,18 +87,13 @@ inline ckw::TensorDataLayout to_ckw(DataLayout dl) } inline ckw::TensorInfo to_ckw(const ITensorInfo &tensor_info) { - return ckw::TensorInfo - { - to_ckw(tensor_info.data_type()), - to_ckw(tensor_info.tensor_shape()), - to_ckw(tensor_info.data_layout()), - tensor_info.id() - }; + return ckw::TensorInfo{to_ckw(tensor_info.data_type()), to_ckw(tensor_info.tensor_shape()), + to_ckw(tensor_info.data_layout()), tensor_info.id()}; } inline TensorComponentType from_ckw(const ckw::TensorComponentType &component) { - switch(component) + switch (component) { case ckw::TensorComponentType::OffsetFirstElement: return TensorComponentType::OffsetFirstElement; @@ -142,7 +133,7 @@ inline TensorComponentType from_ckw(const ckw::TensorComponentType &component) inline ckw::TensorStorageType to_ckw(const TensorStorageType &storage) { - switch(storage) + switch (storage) { case TensorStorageType::ClBufferUint8Ptr: return ckw::TensorStorageType::BufferUint8Ptr; @@ -159,7 +150,7 @@ inline ckw::TensorStorageType to_ckw(const TensorStorageType &storage) } inline TensorStorageType from_ckw(const ckw::TensorStorageType &storage) { - switch(storage) + switch (storage) { case ckw::TensorStorageType::BufferUint8Ptr: return TensorStorageType::ClBufferUint8Ptr; diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h index 9cb022fc10..0cba258940 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h @@ -25,6 +25,7 @@ #define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_ELEMENTWISEBINARY #include "ckw/types/Operators.h" + #include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h" namespace arm_compute @@ -35,7 +36,7 @@ namespace dynamic_fusion { inline ckw::BinaryOp to_ckw(const ElementwiseBinaryCommonAttributes &attributes) { - switch(attributes.operation()) + switch (attributes.operation()) { case ElementwiseBinaryCommonAttributes::ElementwiseOp::Add: return ckw::BinaryOp::Add; -- cgit v1.2.1