From afd38f0c617d6f89b2b4532c6c44f116617e2b6f Mon Sep 17 00:00:00 2001 From: Felix Thomasmathibalan Date: Wed, 27 Sep 2023 17:46:17 +0100 Subject: Apply clang-format on repository Code is formatted as per a revised clang format configuration file(not part of this delivery). Version 14.0.6 is used. Exclusion List: - files with .cl extension - files that are not strictly C/C++ (e.g. Android.bp, Sconscript ...) And the following directories - compute_kernel_writer/validation/ - tests/ - include/ - src/core/NEON/kernels/convolution/ - src/core/NEON/kernels/arm_gemm/ - src/core/NEON/kernels/arm_conv/ - data/ There will be a follow up for formatting of .cl files and the files under tests/ and compute_kernel_writer/validation/. Signed-off-by: Felix Thomasmathibalan Change-Id: Ib7eb1fcf4e7537b9feaefcfc15098a804a3fde0a Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10391 Benchmark: Arm Jenkins Tested-by: Arm Jenkins Reviewed-by: Gunes Bayir --- .../template_writer/cl/ClTemplateActivation.cpp | 26 ++--- .../gpu/template_writer/cl/ClTemplateActivation.h | 1 + .../gpu/template_writer/cl/ClTemplateCast.cpp | 30 +++--- .../cl/ClTemplateDepthwiseConv2d.cpp | 81 +++++++-------- .../template_writer/cl/ClTemplateDepthwiseConv2d.h | 1 + .../template_writer/cl/ClTemplateDirectConv2d.cpp | 112 +++++++++------------ .../template_writer/cl/ClTemplateDirectConv2d.h | 1 + .../cl/ClTemplateElementwiseBinary.cpp | 94 +++++++---------- .../cl/ClTemplateElementwiseBinary.h | 5 +- .../cl/ClTemplateLogits1DMaxShiftExpSum.cpp | 57 ++++------- .../cl/ClTemplateLogits1DMaxShiftExpSum.h | 4 +- .../template_writer/cl/ClTemplateLogits1DNorm.cpp | 35 ++----- .../gpu/template_writer/cl/ClTemplatePool2d.cpp | 92 +++++++++-------- .../gpu/template_writer/cl/ClTemplatePool2d.h | 1 + .../gpu/template_writer/cl/ClTemplateReshape.cpp | 28 ++---- .../gpu/template_writer/cl/ClTemplateReshape.h | 4 +- .../gpu/template_writer/cl/ClTemplateResize.cpp | 56 ++++++----- .../gpu/template_writer/cl/ClTemplateStore.cpp | 16 +-- .../gpu/template_writer/cl/ClTemplateStore.h | 1 + .../gpu/template_writer/cl/ClTemplateWriter.cpp | 59 ++++++----- 20 files changed, 306 insertions(+), 398 deletions(-) (limited to 'src/dynamic_fusion/sketch/gpu/template_writer/cl') diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp index 3c7c843dd8..c165fb5f33 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/helpers/WindowHelpers.h" #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" #include "support/StringSupport.h" @@ -39,10 +40,7 @@ namespace dynamic_fusion ClTemplateActivation::ClTemplateActivation(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes) - : IGpuTemplateComponentWriter{ id, tensors }, - _src{}, - _dst{}, - _attributes{ attributes } + : IGpuTemplateComponentWriter{id, tensors}, _src{}, _dst{}, _attributes{attributes} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC); _dst = this->tensors().get_const_tensor(TensorType::ACL_DST); @@ -62,7 +60,7 @@ std::string ClTemplateActivation::get_component_code(const ComponentGroup &comp_ code = R"_( //------------------ 
START KERNEL {{meta_kernel_id}} --------------------- )_"; - if(is_root) + if (is_root) { code += R"_( // IN(src) {{src}} @@ -104,17 +102,11 @@ LOOP_UNROLLING(int, i, 0, 1, M0, void ClTemplateActivation::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const { - vtable.declare_variable( - comp_group, - _src, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), - "src"); - - vtable.declare_variable( - comp_group, - _dst, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), - "dst"); + vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), + "src"); + + vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), + "dst"); } TagLUT ClTemplateActivation::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const @@ -173,7 +165,7 @@ std::string ClTemplateActivation::get_config_id() const std::set ClTemplateActivation::get_headers_list() const { - return std::set{ "helpers.h", "tile_helpers.h", "activation_float_helpers.h" }; + return std::set{"helpers.h", "tile_helpers.h", "activation_float_helpers.h"}; } Window ClTemplateActivation::get_window() const diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.h index ec78cf6ce5..88ee370342 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.h +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.h @@ -26,6 +26,7 @@ #include "arm_compute/core/experimental/Types.h" #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h" diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp index 4956879ad3..0da3a73801 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/helpers/WindowHelpers.h" #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" @@ -35,7 +36,7 @@ namespace experimental namespace dynamic_fusion { ClTemplateCast::ClTemplateCast(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes) - : IGpuTemplateComponentWriter{ id, tensors }, _src{}, _dst{}, _attributes{ attributes } + : IGpuTemplateComponentWriter{id, tensors}, _src{}, _dst{}, _attributes{attributes} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); @@ -62,7 +63,7 @@ std::string ClTemplateCast::get_component_code(const ComponentGroup &comp_group) //------------------ START KERNEL {{meta_kernel_id}} CAST --------------------- )_"; - if(is_root) + if (is_root) { code += R"_( // IN_0(src) {{src}} @@ -82,14 +83,15 @@ TILE(uint, M0, 1, g_dst_indirect_y); { )_"; - if(kernel_name == "cast_down" && is_data_type_quantized(_src->data_type())) + if (kernel_name == "cast_down" && is_data_type_quantized(_src->data_type())) { code += R"_( 
{{tmp}}[m0].v ^= (VEC_DATA_TYPE({{DATA_TYPE_IN}}, N0))0x80; )_"; } - if(kernel_name == "cast_down" && (is_data_type_float(_src->data_type()) || _attributes.convert_policy() == ConvertPolicy::SATURATE)) + if (kernel_name == "cast_down" && + (is_data_type_float(_src->data_type()) || _attributes.convert_policy() == ConvertPolicy::SATURATE)) { code += R"_( {{dst}}[m0].v = CONVERT_SAT({{tmp}}[m0].v, VEC_DATA_TYPE({{DATA_TYPE_OUT}}, N0)); @@ -106,7 +108,7 @@ TILE(uint, M0, 1, g_dst_indirect_y); }) )_"; - if(is_root) + if (is_root) { code += R"_( LOOP_UNROLLING(int, i, 0, 1, M0, @@ -128,17 +130,11 @@ TILE(uint, M0, 1, g_dst_indirect_y); void ClTemplateCast::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const { - vtable.declare_variable( - comp_group, - _src, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), - "src"); - - vtable.declare_variable( - comp_group, - _dst, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), - "dst"); + vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), + "src"); + + vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), + "dst"); } TagLUT ClTemplateCast::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const @@ -199,7 +195,7 @@ std::string ClTemplateCast::get_config_id() const std::set ClTemplateCast::get_headers_list() const { - return std::set{ "helpers.h", "tile_helpers.h" }; + return std::set{"helpers.h", "tile_helpers.h"}; } Window ClTemplateCast::get_window() const diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp index ab7cc9f05a..8380620ab2 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp @@ -36,17 +36,17 @@ ClTemplateDepthwiseConv2d::ClTemplateDepthwiseConv2d(ComponentId const ArgumentPack &tensors, const Attributes &attributes, const Settings &settings) - : IGpuTemplateComponentWriter{ id, tensors }, + : IGpuTemplateComponentWriter{id, tensors}, _src{}, _weight{}, _bias{}, _dst{}, - _attributes{ attributes }, - _settings{ settings } + _attributes{attributes}, + _settings{settings} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _weight = this->tensors().get_const_tensor(TensorType::ACL_SRC_1); - if(this->tensors().get_const_tensor(TensorType::ACL_SRC_2)) + if (this->tensors().get_const_tensor(TensorType::ACL_SRC_2)) { _bias = this->tensors().get_const_tensor(TensorType::ACL_SRC_2); } @@ -71,7 +71,7 @@ std::string ClTemplateDepthwiseConv2d::get_component_code(const ComponentGroup & // IN_1(wei) {{weight}} )_"; - if(_bias != nullptr && _bias->has_valid_id()) + if (_bias != nullptr && _bias->has_valid_id()) { code += R"_( // IN_1(bia) {{bias}} @@ -113,7 +113,7 @@ TILE(uint, M0, 1, g_dst_indirect_y); }) )_"; - if(_weight->dimension(height_idx) < 5) + if (_weight->dimension(height_idx) < 5) { code += R"_( LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT, @@ -147,7 +147,7 @@ TILE(uint, M0, 1, g_dst_indirect_y); { )_"; - if(!_settings.is_fma_available()) + if (!_settings.is_fma_available()) { code += R"_( {{dst}}[m0].v += a[xk + m0].v * b[xk].v; @@ -166,14 +166,14 @@ TILE(uint, M0, 1, g_dst_indirect_y); } )_"; - if(_weight->dimension(height_idx) < 5) + if 
(_weight->dimension(height_idx) < 5) { code += R"_( ) )_"; } - if(_bias && _bias->has_valid_id()) + if (_bias && _bias->has_valid_id()) { code += R"_( TILE({{BIA_DATA_TYPE}}, 1, N0, {{bias}}); @@ -198,44 +198,31 @@ TILE(uint, M0, 1, g_dst_indirect_y); return code; } -void ClTemplateDepthwiseConv2d::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const +void ClTemplateDepthwiseConv2d::declare_variables(GpuKernelVariableTable &vtable, + const ComponentGroup &comp_group) const { - const GpuKernelArgumentInfo::Type input_type = _settings.export_input_to_cl_image() ? - GpuKernelArgumentInfo::Type::Tensor_4D_t_Image : - GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer; - - vtable.declare_variable( - comp_group, - _src, - GpuKernelArgumentInfo(input_type), - "src"); - - const GpuKernelArgumentInfo::Type weight_type = _settings.export_weights_to_cl_image() ? - GpuKernelArgumentInfo::Type::Tensor_4D_t_Image : - GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer; - - vtable.declare_variable( - comp_group, - _weight, - GpuKernelArgumentInfo(weight_type), - "weight"); - - if(_bias != nullptr && _bias->has_valid_id()) // optional bias + const GpuKernelArgumentInfo::Type input_type = _settings.export_input_to_cl_image() + ? GpuKernelArgumentInfo::Type::Tensor_4D_t_Image + : GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer; + + vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(input_type), "src"); + + const GpuKernelArgumentInfo::Type weight_type = _settings.export_weights_to_cl_image() + ? GpuKernelArgumentInfo::Type::Tensor_4D_t_Image + : GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer; + + vtable.declare_variable(comp_group, _weight, GpuKernelArgumentInfo(weight_type), "weight"); + + if (_bias != nullptr && _bias->has_valid_id()) // optional bias { - vtable.declare_variable( - comp_group, - _bias, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Vector), - "bias"); + vtable.declare_variable(comp_group, _bias, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Vector), "bias"); } - vtable.declare_variable( - comp_group, - _dst, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), - "dst"); + vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), + "dst"); } -TagLUT ClTemplateDepthwiseConv2d::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const +TagLUT ClTemplateDepthwiseConv2d::get_tag_lut(const GpuKernelVariableTable &vtable, + const ComponentGroup &comp_group) const { TagLUT lut{}; @@ -243,7 +230,7 @@ TagLUT ClTemplateDepthwiseConv2d::get_tag_lut(const GpuKernelVariableTable &vtab lut["src"] = vtable.get_variable(_src); lut["weight"] = vtable.get_variable(_weight); - if(_bias != nullptr && _bias->has_valid_id()) // optional bias + if (_bias != nullptr && _bias->has_valid_id()) // optional bias { lut["bias"] = vtable.get_variable(_bias); lut["BIA_DATA_TYPE"] = get_cl_type_from_data_type(_bias->data_type()); @@ -259,7 +246,7 @@ TagLUT ClTemplateDepthwiseConv2d::get_tag_lut(const GpuKernelVariableTable &vtab lut["SRC_DATA_TYPE"] = _src->data_type(); lut["WEI_DATA_TYPE"] = _weight->data_type(); - switch(vtable.get_variable(_src).kernel_argument_info.type) + switch (vtable.get_variable(_src).kernel_argument_info.type) { case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D: case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D: @@ -271,7 +258,7 @@ TagLUT ClTemplateDepthwiseConv2d::get_tag_lut(const GpuKernelVariableTable &vtab 
break; } - switch(vtable.get_variable(_weight).kernel_argument_info.type) + switch (vtable.get_variable(_weight).kernel_argument_info.type) { case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D: case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D: @@ -318,7 +305,7 @@ CLBuildOptions ClTemplateDepthwiseConv2d::get_build_options(const ComponentGroup CLBuildOptions build_opts{}; - if(_settings.fast_relaxed_math()) + if (_settings.fast_relaxed_math()) { build_opts.add_option("-cl-fast-relaxed-math"); } @@ -361,7 +348,7 @@ std::string ClTemplateDepthwiseConv2d::get_config_id() const std::set ClTemplateDepthwiseConv2d::get_headers_list() const { - return std::set{ "helpers.h", "tile_helpers.h" }; + return std::set{"helpers.h", "tile_helpers.h"}; } Window ClTemplateDepthwiseConv2d::get_window() const diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.h index 84b689ef64..5d04c687c3 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.h +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.h @@ -25,6 +25,7 @@ #define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEDEPTHWISECONV2D #include "arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h" + #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h" diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp index 3322487910..f6a7a58d1d 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp @@ -23,14 +23,13 @@ */ #include "ClTemplateDirectConv2d.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" -#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h" - #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" -#include "src/core/helpers/WindowHelpers.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" #include "support/StringSupport.h" namespace arm_compute @@ -43,17 +42,17 @@ ClTemplateDirectConv2d::ClTemplateDirectConv2d(ComponentId const ArgumentPack &tensors, const Attributes &attributes, const Settings &settings) - : IGpuTemplateComponentWriter{ id, tensors }, + : IGpuTemplateComponentWriter{id, tensors}, _src{}, _weight{}, _bias{}, _dst{}, - _attributes{ attributes }, - _settings{ settings } + _attributes{attributes}, + _settings{settings} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _weight = this->tensors().get_const_tensor(TensorType::ACL_SRC_1); - if(this->tensors().get_const_tensor(TensorType::ACL_SRC_2)) + if (this->tensors().get_const_tensor(TensorType::ACL_SRC_2)) { _bias = this->tensors().get_const_tensor(TensorType::ACL_SRC_2); } @@ -79,7 +78,7 @@ std::string ClTemplateDirectConv2d::get_component_code(const ComponentGroup &com // IN_0(src) {{src}} // IN_1(wei) {{weight}} )_"; - if(_bias && _bias->has_valid_id()) + if (_bias && _bias->has_valid_id()) { code += R"_( // 
IN_1(bia) {{bias}} @@ -161,7 +160,7 @@ TILE(uint, M0, 1, g_dst_indirect_y); } )_"; - if(leftover_loop) + if (leftover_loop) { code += R"_( for(; ck < _ISRC_CHANNELS; ++ck) @@ -186,9 +185,9 @@ TILE(uint, M0, 1, g_dst_indirect_y); T_MMUL({{SRC_DATA_TYPE}}, {{WEI_DATA_TYPE}}, {{ACC_DATA_TYPE}}, M0, N0, 1, NT, T, a, b, {{dst}}); } )_"; -} + } -code += R"_( + code += R"_( #undef _I_WEI_WIDTH #undef _I_WEI_HEIGHT #undef _ISRC_WIDTH @@ -202,7 +201,7 @@ code += R"_( } )_"; - if(_bias && _bias->has_valid_id()) + if (_bias && _bias->has_valid_id()) { code += R"_( TILE({{BIA_DATA_TYPE}}, 1, N0, bias0); @@ -211,9 +210,9 @@ code += R"_( T_ELTWISE_BROADCAST_ADD_X({{ACC_DATA_TYPE}}, M0, N0, {{dst}}, bias0, {{dst}}); )_"; -} + } -code += R"_( + code += R"_( LOOP_UNROLLING(int, i, 0, 1, M0, { g_dst_indirect_y[i].v = (uint)min(g_ind_1 + i, (int)({{DST_WIDTH}} * {{DST_HEIGHT}}) - 1); @@ -227,32 +226,19 @@ code += R"_( void ClTemplateDirectConv2d::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const { - vtable.declare_variable( - comp_group, - _src, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), - "src"); - - const GpuKernelArgumentInfo::Type weight_type = _settings.export_to_cl_image() ? GpuKernelArgumentInfo::Type::Tensor_4D_t_Image : GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer; - vtable.declare_variable( - comp_group, - _weight, - GpuKernelArgumentInfo(weight_type), - "weight"); - - if(_bias && _bias->has_valid_id()) // optional bias + vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), + "src"); + + const GpuKernelArgumentInfo::Type weight_type = _settings.export_to_cl_image() + ? GpuKernelArgumentInfo::Type::Tensor_4D_t_Image + : GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer; + vtable.declare_variable(comp_group, _weight, GpuKernelArgumentInfo(weight_type), "weight"); + + if (_bias && _bias->has_valid_id()) // optional bias { - vtable.declare_variable( - comp_group, - _bias, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Vector), - "bias"); + vtable.declare_variable(comp_group, _bias, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Vector), "bias"); } - vtable.declare_variable( - comp_group, - _dst, - GpuKernelArgumentInfo(common_tensor_type), - "dst"); + vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(common_tensor_type), "dst"); } TagLUT ClTemplateDirectConv2d::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const @@ -262,7 +248,7 @@ TagLUT ClTemplateDirectConv2d::get_tag_lut(const GpuKernelVariableTable &vtable, lut["src"] = vtable.get_variable(_src); lut["weight"] = vtable.get_variable(_weight); - if(_bias && _bias->has_valid_id()) // optional bias + if (_bias && _bias->has_valid_id()) // optional bias { lut["bias"] = vtable.get_variable(_bias); lut["BIA_DATA_TYPE"] = get_cl_type_from_data_type(_bias->data_type()); @@ -279,34 +265,34 @@ TagLUT ClTemplateDirectConv2d::get_tag_lut(const GpuKernelVariableTable &vtable, lut["WEI_DATA_TYPE"] = _weight->data_type(); lut["SRC_TENSOR_TYPE"] = "BUFFER"; - switch(vtable.get_variable(_weight).kernel_argument_info.type) + switch (vtable.get_variable(_weight).kernel_argument_info.type) { case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D: case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D: case GpuKernelArgumentInfo::Type::Tensor_4D_t_Image: - { - lut["WEI_TENSOR_TYPE"] = "IMAGE"; - break; - } + { + lut["WEI_TENSOR_TYPE"] = "IMAGE"; + break; + } 
default: - { - lut["WEI_TENSOR_TYPE"] = "BUFFER"; - break; - } + { + lut["WEI_TENSOR_TYPE"] = "BUFFER"; + break; + } } - const auto width_idx = 1; - const auto height_idx = 2; + const auto width_idx = 1; + const auto height_idx = 2; const auto channel_idx = 0; - lut["SRC_WIDTH"] = _src->dimension(width_idx); - lut["SRC_HEIGHT"] = _src->dimension(height_idx); + lut["SRC_WIDTH"] = _src->dimension(width_idx); + lut["SRC_HEIGHT"] = _src->dimension(height_idx); lut["SRC_CHANNELS"] = _src->dimension(channel_idx); - lut["WEI_WIDTH"] = _weight->dimension(width_idx); - lut["WEI_HEIGHT"] = _weight->dimension(height_idx); + lut["WEI_WIDTH"] = _weight->dimension(width_idx); + lut["WEI_HEIGHT"] = _weight->dimension(height_idx); - lut["DST_WIDTH"] = _dst->dimension(width_idx); - lut["DST_HEIGHT"] = _dst->dimension(height_idx); + lut["DST_WIDTH"] = _dst->dimension(width_idx); + lut["DST_HEIGHT"] = _dst->dimension(height_idx); lut["DST_CHANNELS"] = _dst->dimension(channel_idx); lut["STRIDE_X"] = _attributes.stride().x(); @@ -324,14 +310,14 @@ CLBuildOptions ClTemplateDirectConv2d::get_build_options(const ComponentGroup &c { const unsigned int channel_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::CHANNEL); - const auto root_window = comp_group.get_root_component()->template_writer()->get_window(); - const unsigned int n0 = root_window.x().step(); - const unsigned int m0 = root_window.y().step(); - const unsigned int k0 = adjust_vec_size(_settings.direct_conv_descriptor().k0, _src->dimension(channel_idx)); + const auto root_window = comp_group.get_root_component()->template_writer()->get_window(); + const unsigned int n0 = root_window.x().step(); + const unsigned int m0 = root_window.y().step(); + const unsigned int k0 = adjust_vec_size(_settings.direct_conv_descriptor().k0, _src->dimension(channel_idx)); const unsigned int partial_store_n0 = _dst->dimension(0) % n0; CLBuildOptions build_opts{}; - if(_settings.fast_relaxed_math()) + if (_settings.fast_relaxed_math()) { build_opts.add_option("-cl-fast-relaxed-math"); } @@ -379,7 +365,7 @@ std::string ClTemplateDirectConv2d::get_config_id() const std::set ClTemplateDirectConv2d::get_headers_list() const { - return std::set{ "helpers.h", "tile_helpers.h" }; + return std::set{"helpers.h", "tile_helpers.h"}; } Window ClTemplateDirectConv2d::get_window() const diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h index 8988d3ca1c..03c8cd2f15 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h @@ -26,6 +26,7 @@ #include "arm_compute/core/experimental/Types.h" #include "arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h" + #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h" diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp index c0481ae190..78bff3c3f3 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp @@ -23,14 +23,13 @@ */ #include 
"ClTemplateElementwiseBinary.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" -#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h" - #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" -#include "src/core/helpers/WindowHelpers.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" #include "support/StringSupport.h" namespace arm_compute @@ -44,11 +43,7 @@ constexpr unsigned int vector_size_byte_opencl = 16; ClTemplateElementwiseBinary::ClTemplateElementwiseBinary(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes) - : IGpuTemplateComponentWriter{ id, tensors }, - _lhs{}, - _rhs{}, - _dst{}, - _attributes{ attributes } + : IGpuTemplateComponentWriter{id, tensors}, _lhs{}, _rhs{}, _dst{}, _attributes{attributes} { _lhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _rhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_1); @@ -69,67 +64,67 @@ std::string ClTemplateElementwiseBinary::get_component_code(const ComponentGroup const bool is_rhs_input = comp_group.is_input_tensor(_rhs); code = -R"_( + R"_( //------------------ START KERNEL {{meta_kernel_id}} {{ELTWISE_OP}} --------------------- )_"; - if(is_root) + if (is_root) { code += -R"_( + R"_( TILE(uint, M0, 1, g_dst_indirect_y); )_"; } - if(is_lhs_input) + if (is_lhs_input) { code += -R"_( + R"_( TILE({{DATA_TYPE}}, {{lhs_m0}}, N0, {{lhs}}); )_"; } - if(is_rhs_input) + if (is_rhs_input) { code += -R"_( + R"_( TILE({{DATA_TYPE}}, {{rhs_m0}}, N0, {{rhs}}); )_"; } code += -R"_( + R"_( { )_"; - if(is_lhs_input) + if (is_lhs_input) { code += -R"_( + R"_( {{lhs}}_offset_first_element_in_bytes += g_ind_2 * {{lhs}}_stride_w; T_LOAD({{DATA_TYPE}}, {{lhs_m0}}, {{lhs_n0}}, BUFFER, {{lhs}}, {{lhs_start_ind_0}}, {{lhs_start_ind_1}}, 1, {{lhs}}_stride_y, {{lhs}}); )_"; } - if(is_rhs_input) + if (is_rhs_input) { code += -R"_( + R"_( {{rhs}}_offset_first_element_in_bytes += g_ind_2 * {{rhs}}_stride_w; T_LOAD({{DATA_TYPE}}, {{rhs_m0}}, {{rhs_n0}}, BUFFER, {{rhs}}, {{rhs_start_ind_0}}, {{rhs_start_ind_1}}, 1, {{rhs}}_stride_y, {{rhs}}); )_"; } code += -R"_( + R"_( T_ELTWISE_{{BROADCAST_OP}}{{ELTWISE_OP}}({{DATA_TYPE}}, M0, N0, {{lhs}}, {{rhs}}, {{dst}}); )_"; - if(is_root) + if (is_root) { // Calculate the destination indirect Y code += -R"_( + R"_( LOOP_UNROLLING(int, i, 0, 1, M0, { g_dst_indirect_y[i].v = (uint)min(g_ind_1 + i, (int)({{arg_dst}}_w * {{arg_dst}}_h) - 1); @@ -139,7 +134,7 @@ R"_( } code += -R"_( + R"_( } //------------------ END KERNEL {{meta_kernel_id}} {{ELTWISE_OP}} --------------------- )_"; @@ -147,28 +142,18 @@ R"_( return code; } -void ClTemplateElementwiseBinary::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const +void ClTemplateElementwiseBinary::declare_variables(GpuKernelVariableTable &vtable, + const ComponentGroup &comp_group) const { - vtable.declare_variable( - comp_group, - _lhs, - GpuKernelArgumentInfo(common_tensor_type), - "lhs"); - - vtable.declare_variable( - comp_group, - _rhs, - GpuKernelArgumentInfo(common_tensor_type), - "rhs"); - - vtable.declare_variable( - comp_group, - _dst, - GpuKernelArgumentInfo(common_tensor_type), - "dst"); + vtable.declare_variable(comp_group, _lhs, GpuKernelArgumentInfo(common_tensor_type), "lhs"); + + 
vtable.declare_variable(comp_group, _rhs, GpuKernelArgumentInfo(common_tensor_type), "rhs"); + + vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(common_tensor_type), "dst"); } -TagLUT ClTemplateElementwiseBinary::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const +TagLUT ClTemplateElementwiseBinary::get_tag_lut(const GpuKernelVariableTable &vtable, + const ComponentGroup &comp_group) const { TagLUT lut{}; @@ -182,7 +167,7 @@ TagLUT ClTemplateElementwiseBinary::get_tag_lut(const GpuKernelVariableTable &vt lut["dst"] = vtable.get_variable(_dst); lut["arg_dst"] = vtable.get_variable(comp_group.get_any_dst_tensor()); - switch(_attributes.operation()) + switch (_attributes.operation()) { case Attributes::ElementwiseOp::Add: lut["ELTWISE_OP"] = "ADD"; @@ -197,10 +182,10 @@ TagLUT ClTemplateElementwiseBinary::get_tag_lut(const GpuKernelVariableTable &vt ARM_COMPUTE_ERROR("Arithmetic Operation not supported"); } - ARM_COMPUTE_ERROR_ON( - comp_group.is_intermediate_tensor(_lhs) && detail::have_different_dimensions(_lhs->tensor_shape(), _dst->tensor_shape(), 0)); - ARM_COMPUTE_ERROR_ON( - comp_group.is_intermediate_tensor(_rhs) && detail::have_different_dimensions(_rhs->tensor_shape(), _dst->tensor_shape(), 0)); + ARM_COMPUTE_ERROR_ON(comp_group.is_intermediate_tensor(_lhs) && + detail::have_different_dimensions(_lhs->tensor_shape(), _dst->tensor_shape(), 0)); + ARM_COMPUTE_ERROR_ON(comp_group.is_intermediate_tensor(_rhs) && + detail::have_different_dimensions(_rhs->tensor_shape(), _dst->tensor_shape(), 0)); // Set broadcast parameters // PRE: All tensors are broadcast-compatible @@ -228,9 +213,7 @@ TagLUT ClTemplateElementwiseBinary::get_tag_lut(const GpuKernelVariableTable &vt lut["rhs_m0"] = (rhs_broadcast_yz) ? "1" : "M0"; lut["rhs_start_ind_1"] = (rhs_broadcast_yz) ? "0" : "g_ind_1"; - lut["BROADCAST_OP"] = (lhs_broadcast_yz) ? "BROADCAST_LHS_X_" : - (rhs_broadcast_yz) ? "BROADCAST_RHS_X_" : - ""; + lut["BROADCAST_OP"] = (lhs_broadcast_yz) ? "BROADCAST_LHS_X_" : (rhs_broadcast_yz) ? 
"BROADCAST_RHS_X_" : ""; return lut; } @@ -268,7 +251,7 @@ std::string ClTemplateElementwiseBinary::get_config_id() const std::set ClTemplateElementwiseBinary::get_headers_list() const { - return std::set{ "helpers.h", "tile_helpers.h" }; + return std::set{"helpers.h", "tile_helpers.h"}; } Window ClTemplateElementwiseBinary::get_window() const @@ -279,8 +262,9 @@ Window ClTemplateElementwiseBinary::get_window() const // Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) and upper dimensions unchanged // This is in line with the collapsing convention used by operators like Conv2d output_shape.collapse(2U, 1U); - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0)); - Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration)); + const unsigned int num_elems_processed_per_iteration = + adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0)); + Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration)); return win; } diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.h index 8cca954efe..991c0eca44 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.h +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.h @@ -25,6 +25,7 @@ #define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEELEMENTWISEBINARY #include "arm_compute/core/experimental/Types.h" + #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h" @@ -48,9 +49,7 @@ public: * @param[in] tensors Tensor arguments to the components * @param[in] attributes Component attributes */ - ClTemplateElementwiseBinary(ComponentId id, - const ArgumentPack &tensors, - const Attributes &attributes); + ClTemplateElementwiseBinary(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes); /** Prevent instances of this class from being copy constructed */ ClTemplateElementwiseBinary(const ClTemplateElementwiseBinary &elementwise) = delete; /** Prevent instances of this class from being copied */ diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.cpp index a8d8d32b12..522c33a022 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/helpers/WindowHelpers.h" #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" #include "support/StringSupport.h" @@ -38,16 +39,12 @@ namespace dynamic_fusion { namespace { - constexpr unsigned int serial_vector_size = 8; +constexpr unsigned int serial_vector_size = 8; } // namespace ClTemplateLogits1DMaxShiftExpSum::ClTemplateLogits1DMaxShiftExpSum(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes) - : IGpuTemplateComponentWriter{ id, tensors }, - _src{}, - _sum{}, - _dst{}, - _attributes{ attributes } + 
: IGpuTemplateComponentWriter{id, tensors}, _src{}, _sum{}, _dst{}, _attributes{attributes} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _sum = this->tensors().get_const_tensor(TensorType::ACL_DST_0); @@ -79,7 +76,7 @@ std::string ClTemplateLogits1DMaxShiftExpSum::get_component_code(const Component const bool beta_defined = (_attributes.beta() != 1.f); - if(beta_defined) + if (beta_defined) { code += R"_( VEC_TYPE beta = (VEC_TYPE){{BETA}}; @@ -91,7 +88,7 @@ std::string ClTemplateLogits1DMaxShiftExpSum::get_component_code(const Component const unsigned int vector_size = adjust_vec_size(_serial_vector_size, reduction_dim_size); const bool non_multiple_of_n0 = ((reduction_dim_size % vector_size) != 0); - if(non_multiple_of_n0) + if (non_multiple_of_n0) { code += R"_( VEC_TYPE data = VLOAD(N0)(0, (__global {{DATA_TYPE}} *)src_addr); @@ -111,19 +108,19 @@ std::string ClTemplateLogits1DMaxShiftExpSum::get_component_code(const Component VEC_TYPE sum1D = 0; )_"; - if(non_multiple_of_n0) + if (non_multiple_of_n0) { code += R"_( data -= max_val; )_"; - if(beta_defined) + if (beta_defined) { code += R"_( data *= beta; )_"; } - if(_attributes.is_log_softmax()) + if (_attributes.is_log_softmax()) { code += R"_( VSTORE_PARTIAL(N0, PARTIAL_N0) @@ -153,14 +150,14 @@ std::string ClTemplateLogits1DMaxShiftExpSum::get_component_code(const Component data -= max_val; )_"; - if(beta_defined) + if (beta_defined) { code += R"_( data *= beta; )_"; } - if(_attributes.is_log_softmax()) + if (_attributes.is_log_softmax()) { code += R"_( VSTORE(N0) @@ -191,28 +188,18 @@ std::string ClTemplateLogits1DMaxShiftExpSum::get_component_code(const Component return code; } -void ClTemplateLogits1DMaxShiftExpSum::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const +void ClTemplateLogits1DMaxShiftExpSum::declare_variables(GpuKernelVariableTable &vtable, + const ComponentGroup &comp_group) const { - vtable.declare_variable( - comp_group, - _src, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), - "src"); - - vtable.declare_variable( - comp_group, - _sum, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), - "sum"); - - vtable.declare_variable( - comp_group, - _dst, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), - "dst"); + vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), "src"); + + vtable.declare_variable(comp_group, _sum, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), "sum"); + + vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), "dst"); } -TagLUT ClTemplateLogits1DMaxShiftExpSum::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const +TagLUT ClTemplateLogits1DMaxShiftExpSum::get_tag_lut(const GpuKernelVariableTable &vtable, + const ComponentGroup &comp_group) const { ARM_COMPUTE_UNUSED(comp_group); @@ -241,8 +228,8 @@ CLBuildOptions ClTemplateLogits1DMaxShiftExpSum::get_build_options(const Compone ARM_COMPUTE_UNUSED(comp_group); CLBuildOptions build_opts{}; - const unsigned int reduction_dim_size = _src->dimension(0); - const unsigned int vector_size = adjust_vec_size(serial_vector_size, reduction_dim_size); + const unsigned int reduction_dim_size = _src->dimension(0); + const unsigned int vector_size = adjust_vec_size(serial_vector_size, reduction_dim_size); build_opts.add_option("-DN0=" + support::cpp11::to_string(vector_size)); 
build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string((reduction_dim_size % vector_size))); @@ -264,7 +251,7 @@ std::string ClTemplateLogits1DMaxShiftExpSum::get_config_id() const std::set ClTemplateLogits1DMaxShiftExpSum::get_headers_list() const { - return std::set{ "helpers.h", "tile_helpers.h" }; + return std::set{"helpers.h", "tile_helpers.h"}; } Window ClTemplateLogits1DMaxShiftExpSum::get_window() const diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.h index 5d232c0cf2..ac9ddaa9d4 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.h +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.h @@ -46,7 +46,9 @@ public: * @param[in] tensors Tensor arguments to the components * @param[in] attributes Component attributes */ - ClTemplateLogits1DMaxShiftExpSum(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes); + ClTemplateLogits1DMaxShiftExpSum(ComponentId id, + const ArgumentPack &tensors, + const Attributes &attributes); /** Prevent instances of this class from being copy constructed */ ClTemplateLogits1DMaxShiftExpSum(const ClTemplateLogits1DMaxShiftExpSum &) = delete; /** Prevent instances of this class from being copied */ diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.cpp index 056e570a25..7d7c3e6673 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.cpp @@ -25,6 +25,7 @@ #include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" + #include "src/core/helpers/WindowHelpers.h" #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" #include "support/StringSupport.h" @@ -38,11 +39,7 @@ namespace dynamic_fusion ClTemplateLogits1DNorm::ClTemplateLogits1DNorm(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes) - : IGpuTemplateComponentWriter{ id, tensors }, - _src{}, - _sum{}, - _dst{}, - _attributes{ attributes } + : IGpuTemplateComponentWriter{id, tensors}, _src{}, _sum{}, _dst{}, _attributes{attributes} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _sum = this->tensors().get_const_tensor(TensorType::ACL_SRC_1); @@ -76,7 +73,7 @@ std::string ClTemplateLogits1DNorm::get_component_code(const ComponentGroup &com data0 = VLOAD(N0)(0, (__global {{DATA_TYPE}} *)src_addr); )_"; - if(_attributes.is_log_softmax()) + if (_attributes.is_log_softmax()) { code += R"_( sum_val = log(sum_val); @@ -101,23 +98,11 @@ std::string ClTemplateLogits1DNorm::get_component_code(const ComponentGroup &com void ClTemplateLogits1DNorm::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const { - vtable.declare_variable( - comp_group, - _src, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), - "src"); - - vtable.declare_variable( - comp_group, - _sum, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), - "sum"); - - vtable.declare_variable( - comp_group, - _dst, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), - "dst"); + vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), "src"); + + 
vtable.declare_variable(comp_group, _sum, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), "sum"); + + vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), "dst"); } TagLUT ClTemplateLogits1DNorm::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const @@ -168,14 +153,14 @@ std::string ClTemplateLogits1DNorm::get_config_id() const std::set ClTemplateLogits1DNorm::get_headers_list() const { - return std::set{ "helpers.h", "tile_helpers.h" }; + return std::set{"helpers.h", "tile_helpers.h"}; } Window ClTemplateLogits1DNorm::get_window() const { ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized"); constexpr unsigned int serial_vector_size = 16; - const unsigned int vector_size = adjust_vec_size(serial_vector_size, _src->dimension(0)); + const unsigned int vector_size = adjust_vec_size(serial_vector_size, _src->dimension(0)); Window win = calculate_max_window(*_src, Steps(vector_size)); return win.collapse(win, Window::DimZ); diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp index 34840c2100..ebb0374501 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp @@ -23,14 +23,13 @@ */ #include "ClTemplatePool2d.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" -#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h" - -#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" -#include "src/core/helpers/WindowHelpers.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" #include "support/StringSupport.h" namespace arm_compute @@ -50,11 +49,7 @@ ClTemplatePool2d::ClTemplatePool2d(ComponentId id, const ArgumentPack &tensors, const Attributes &attributes, const Settings &settings) - : IGpuTemplateComponentWriter{ id, tensors }, - _src{}, - _dst{}, - _attributes{ attributes }, - _settings{ settings } + : IGpuTemplateComponentWriter{id, tensors}, _src{}, _dst{}, _attributes{attributes}, _settings{settings} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); @@ -71,7 +66,7 @@ std::string ClTemplatePool2d::get_component_code(const ComponentGroup &comp_grou ARM_COMPUTE_UNUSED(comp_group); // Condition to use 2x2 optimized kernel - if(_attributes.pool_size() == Size2D(2, 2)) + if (_attributes.pool_size() == Size2D(2, 2)) { return get_2x2_kernel_code(); } @@ -83,11 +78,13 @@ std::string ClTemplatePool2d::get_component_code(const ComponentGroup &comp_grou std::string ClTemplatePool2d::get_MxN_kernel_code() const { - const auto pool_type = _attributes.pool_type(); - const bool fp_mixed_precision = (_src->data_type() == DataType::F16) && _settings.mixed_precision() && pool_type != PoolingType::MAX; + const auto pool_type = _attributes.pool_type(); + const bool fp_mixed_precision = + (_src->data_type() == DataType::F16) && _settings.mixed_precision() && pool_type != PoolingType::MAX; // Define pool op macro. 
- std::string pool_op = (pool_type == PoolingType::AVG) ? R"_(#define POOL_OP(x,y) ((x) + (y)))_" : R"_(#define POOL_OP(x,y) (fmax((x), (y))) )_"; + std::string pool_op = (pool_type == PoolingType::AVG) ? R"_(#define POOL_OP(x,y) ((x) + (y)))_" + : R"_(#define POOL_OP(x,y) (fmax((x), (y))) )_"; // Kernel start // Note: If C is not multiple of N0, we shift back of PARTIAL_N0 elements to compute the leftover elements for get_global_id(0) == 0 @@ -129,7 +126,7 @@ std::string ClTemplatePool2d::get_MxN_kernel_code() const )_"; // Determine filter size depending on if padding is excluded or not - if(_attributes.exclude_padding()) + if (_attributes.exclude_padding()) { code += R"_( const int filter_size = (pool_y_e - pool_y_s) * (pool_x_e - pool_x_s); @@ -144,7 +141,8 @@ std::string ClTemplatePool2d::get_MxN_kernel_code() const // Loop through pool size // if global pooling - if(_attributes.pool_size().x() == _src->dimension(width_idx) && _attributes.pool_size().y() == _src->dimension(height_idx)) + if (_attributes.pool_size().x() == _src->dimension(width_idx) && + _attributes.pool_size().y() == _src->dimension(height_idx)) { // Begin loop code += R"_( @@ -173,7 +171,7 @@ std::string ClTemplatePool2d::get_MxN_kernel_code() const // if condition inside loop - use 32bit acc if mixed_precision. // End loop through pooling section. - if(fp_mixed_precision) + if (fp_mixed_precision) { // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE code += R"_( @@ -194,7 +192,7 @@ std::string ClTemplatePool2d::get_MxN_kernel_code() const } // For Pool AVG ONLY, divide pool output by filter size - if(pool_type == PoolingType::AVG) + if (pool_type == PoolingType::AVG) { code += R"_( res0 /= (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0))filter_size; @@ -202,7 +200,7 @@ std::string ClTemplatePool2d::get_MxN_kernel_code() const } // If mixed precision convert datatype before storing. Then end kernel. - if(fp_mixed_precision) + if (fp_mixed_precision) { code += R"_( VEC_DATA_TYPE({{DATA_TYPE}}, N0) @@ -228,9 +226,11 @@ std::string ClTemplatePool2d::get_MxN_kernel_code() const std::string ClTemplatePool2d::get_2x2_kernel_code() const { - const auto pool_type = _attributes.pool_type(); - const bool fp_mixed_precision = (_src->data_type() == DataType::F16) && _settings.mixed_precision() && pool_type != PoolingType::MAX; - std::string pool_op = (pool_type == PoolingType::AVG) ? R"_(#define POOL_OP(x,y) ((x) + (y)))_" : R"_(#define POOL_OP(x,y) (fmax((x), (y))) )_"; + const auto pool_type = _attributes.pool_type(); + const bool fp_mixed_precision = + (_src->data_type() == DataType::F16) && _settings.mixed_precision() && pool_type != PoolingType::MAX; + std::string pool_op = (pool_type == PoolingType::AVG) ? 
R"_(#define POOL_OP(x,y) ((x) + (y)))_" + : R"_(#define POOL_OP(x,y) (fmax((x), (y))) )_"; std::string code = R"_( //------------------ START KERNEL {{meta_kernel_id}} --------------------- @@ -274,7 +274,7 @@ std::string ClTemplatePool2d::get_2x2_kernel_code() const REPEAT_VAR_INIT_TO_CONST(4, VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0), data, 0); )_"; - if(fp_mixed_precision) + if (fp_mixed_precision) { // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE code += R"_( @@ -294,7 +294,7 @@ std::string ClTemplatePool2d::get_2x2_kernel_code() const )_"; } - if(pool_type != PoolingType::MAX) + if (pool_type != PoolingType::MAX) { // Make invalid the values loaded if the x or y coordinate was clamped (out-of-bound) code += R"_( @@ -321,10 +321,10 @@ std::string ClTemplatePool2d::get_2x2_kernel_code() const res0 = POOL_OP(res0, data3); )_"; - if(pool_type == PoolingType::AVG) + if (pool_type == PoolingType::AVG) { // If avg pooling divide result accordingly. - if(_attributes.exclude_padding()) + if (_attributes.exclude_padding()) { code += R"_( res0 /= (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0))filter_size; @@ -339,7 +339,7 @@ std::string ClTemplatePool2d::get_2x2_kernel_code() const } // Store result - if(fp_mixed_precision) + if (fp_mixed_precision) { code += R"_( VEC_DATA_TYPE({{DATA_TYPE}}, N0) @@ -365,17 +365,11 @@ std::string ClTemplatePool2d::get_2x2_kernel_code() const void ClTemplatePool2d::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const { - vtable.declare_variable( - comp_group, - _src, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), - "src"); - - vtable.declare_variable( - comp_group, - _dst, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), - "dst"); + vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), + "src"); + + vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), + "dst"); } TagLUT ClTemplatePool2d::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const @@ -391,12 +385,15 @@ TagLUT ClTemplatePool2d::get_tag_lut(const GpuKernelVariableTable &vtable, const lut["meta_kernel_id"] = id(); // Retrieve relevant data - const auto padding = _attributes.pad(); - const auto stride = _attributes.stride(); - const auto pool_size = _attributes.pool_size(); - const auto data_type = _src->data_type(); - const auto use_fp_mixed_precision = (_src->data_type() == DataType::F16) && _settings.mixed_precision() && _attributes.pool_type() != PoolingType::MAX; - const std::string max_initial_value = _settings.use_inf_as_limit() ? "(-INFINITY)" : float_to_string_with_full_precision(std::numeric_limits::lowest()); + const auto padding = _attributes.pad(); + const auto stride = _attributes.stride(); + const auto pool_size = _attributes.pool_size(); + const auto data_type = _src->data_type(); + const auto use_fp_mixed_precision = (_src->data_type() == DataType::F16) && _settings.mixed_precision() && + _attributes.pool_type() != PoolingType::MAX; + const std::string max_initial_value = + _settings.use_inf_as_limit() ? 
"(-INFINITY)" + : float_to_string_with_full_precision(std::numeric_limits::lowest()); // pool specific lut["STRIDE_X"] = stride.x(); @@ -407,7 +404,8 @@ TagLUT ClTemplatePool2d::get_tag_lut(const GpuKernelVariableTable &vtable, const lut["POOL_SIZE_Y"] = pool_size.height; // Datatypes and variables - lut["ACC_DATA_TYPE"] = get_cl_type_from_data_type((use_fp_mixed_precision) ? (DataType::F32) : (data_type)); // Type of accumulators to use. + lut["ACC_DATA_TYPE"] = get_cl_type_from_data_type( + (use_fp_mixed_precision) ? (DataType::F32) : (data_type)); // Type of accumulators to use. lut["DATA_TYPE"] = get_cl_type_from_data_type(data_type); lut["SRC_WIDTH"] = _src->dimension(width_idx); lut["SRC_HEIGHT"] = _src->dimension(height_idx); @@ -454,14 +452,14 @@ std::string ClTemplatePool2d::get_config_id() const std::set ClTemplatePool2d::get_headers_list() const { - return std::set{ "helpers.h", "tile_helpers.h", "repeat.h" }; + return std::set{"helpers.h", "tile_helpers.h", "repeat.h"}; } Window ClTemplatePool2d::get_window() const { ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized"); const auto output_shape = _dst->tensor_shape(); - const unsigned int vec_size = adjust_vec_size(((_dst->data_type() == DataType::F32) ? 2 : 4), _dst->dimension(0)); + const unsigned int vec_size = adjust_vec_size(((_dst->data_type() == DataType::F32) ? 2 : 4), _dst->dimension(0)); // Create and configure kernel window auto win = calculate_max_window(output_shape, Steps(vec_size)); diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.h index ef1c100f44..d1d3c01669 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.h +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.h @@ -27,6 +27,7 @@ #include "arm_compute/core/experimental/Types.h" #include "arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h" #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h" + #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h" diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.cpp index 8b50f1e209..c882353fcb 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/helpers/WindowHelpers.h" #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" @@ -36,11 +37,8 @@ namespace dynamic_fusion { constexpr unsigned int vector_size_byte_opencl = 16; -ClTemplateReshape::ClTemplateReshape(ComponentId id, - const ArgumentPack &tensors) - : IGpuTemplateComponentWriter{ id, tensors }, - _src{}, - _dst{} +ClTemplateReshape::ClTemplateReshape(ComponentId id, const ArgumentPack &tensors) + : IGpuTemplateComponentWriter{id, tensors}, _src{}, _dst{} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); @@ -97,23 +95,17 @@ TILE(uint, M0, 1, g_dst_indirect_y); void 
ClTemplateReshape::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const { - vtable.declare_variable( - comp_group, - _src, - GpuKernelArgumentInfo(common_tensor_type), // GpuKernelArgumentInfo::Type::Image_3D - "src"); - - vtable.declare_variable( - comp_group, - _dst, - GpuKernelArgumentInfo(common_tensor_type), - "dst"); + vtable.declare_variable(comp_group, _src, + GpuKernelArgumentInfo(common_tensor_type), // GpuKernelArgumentInfo::Type::Image_3D + "src"); + + vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(common_tensor_type), "dst"); } TagLUT ClTemplateReshape::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const { ARM_COMPUTE_UNUSED(comp_group); - TagLUT lut{}; + TagLUT lut{}; // Arguments and global shared variables lut["src"] = vtable.get_variable(_src); @@ -153,7 +145,7 @@ std::string ClTemplateReshape::get_config_id() const std::set ClTemplateReshape::get_headers_list() const { - return std::set{ "helpers.h", "tile_helpers.h" }; + return std::set{"helpers.h", "tile_helpers.h"}; } Window ClTemplateReshape::get_window() const diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.h index 56b6585b61..838a21db6d 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.h +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.h @@ -25,6 +25,7 @@ #define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATERESHAPE #include "arm_compute/core/experimental/Types.h" + #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h" #include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h" @@ -42,8 +43,7 @@ public: * @param[in] id Component id * @param[in] tensors Tensor arguments to the components */ - ClTemplateReshape(ComponentId id, - const ArgumentPack &tensors); + ClTemplateReshape(ComponentId id, const ArgumentPack &tensors); /** Prevent instances of this class from being copy constructed */ ClTemplateReshape(const ClTemplateReshape &reshape) = delete; /** Prevent instances of this class from being copied */ diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp index aaed1d990d..846c712ceb 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/helpers/WindowHelpers.h" #include "src/core/utils/ScaleUtils.h" #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" @@ -37,8 +38,10 @@ namespace experimental { namespace dynamic_fusion { -ClTemplateResize::ClTemplateResize(ComponentId id, const ArgumentPack &tensors, const ClTemplateResize::Attributes &attributes) - : IGpuTemplateComponentWriter{ id, tensors }, _src{}, _dst{}, _attributes{ attributes } +ClTemplateResize::ClTemplateResize(ComponentId id, + const ArgumentPack &tensors, + const ClTemplateResize::Attributes &attributes) + : IGpuTemplateComponentWriter{id, tensors}, _src{}, _dst{}, _attributes{attributes} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); @@ -63,9 +66,9 @@ TILE(uint, 1, 1, 
     const int bout = g_ind_2 / {{arg_dst}}_h;
 )_";
 
-    if(_attributes.interpolation_policy() == InterpolationPolicy::NEAREST_NEIGHBOR)
+    if (_attributes.interpolation_policy() == InterpolationPolicy::NEAREST_NEIGHBOR)
     {
-        if(_attributes.sampling_policy() == SamplingPolicy::TOP_LEFT)
+        if (_attributes.sampling_policy() == SamplingPolicy::TOP_LEFT)
         {
             code += R"_(
     float xi_f = (g_ind_1 * {{SCALE_X}});
@@ -80,7 +83,7 @@ TILE(uint, 1, 1, g_dst_indirect_y);
 )_";
         }
 
-        if(_attributes.align_corners())
+        if (_attributes.align_corners())
         {
             code += R"_(
     xi_f = round(xi_f);
@@ -95,9 +98,9 @@ TILE(uint, 1, 1, g_dst_indirect_y);
     T_LOAD_NHWC_WITH_DILATION({{SRC_DATA_TYPE}}, 1, 1, N0, {{SRC_TENSOR_TYPE}}, {{src}}, bout, yi0, xi0, g_ind_0, {{src}}_w, {{src}}_h, 1, 1, false, {{dst}});
 )_";
     }
-    else if(_attributes.interpolation_policy() == InterpolationPolicy::BILINEAR)
+    else if (_attributes.interpolation_policy() == InterpolationPolicy::BILINEAR)
     {
-        if(_attributes.sampling_policy() == SamplingPolicy::TOP_LEFT)
+        if (_attributes.sampling_policy() == SamplingPolicy::TOP_LEFT)
         {
             code += R"_(
     float xi_f = (g_ind_1 * {{SCALE_X}});
@@ -137,7 +140,7 @@ TILE(uint, 1, 1, g_dst_indirect_y);
     T_LOAD_NHWC_WITH_DILATION({{SRC_DATA_TYPE}}, 1, 1, N0, {{SRC_TENSOR_TYPE}}, {{src}}, bout, yi1, xi1, g_ind_0, {{src}}_w, {{src}}_h, 1, 1, false, in11);
 )_";
 
-        if(is_data_type_float(_src->data_type()))
+        if (is_data_type_float(_src->data_type()))
         {
             code += R"_(
     const {{SRC_DATA_TYPE}} a = ({{SRC_DATA_TYPE}})(xi_f - (float)xi);
@@ -158,9 +161,9 @@ TILE(uint, 1, 1, g_dst_indirect_y);
     const float b1 = (1.f - a1);
 
     {{dst}}[0].v = CONVERT_SAT(
-        (CONVERT(in00[0].v, VEC_DATA_TYPE(float, N0)) * b * b1) + 
+        (CONVERT(in00[0].v, VEC_DATA_TYPE(float, N0)) * b * b1) +
         (CONVERT(in01[0].v, VEC_DATA_TYPE(float, N0)) * a * b1) +
-        (CONVERT(in10[0].v, VEC_DATA_TYPE(float, N0)) * b * a1) + 
+        (CONVERT(in10[0].v, VEC_DATA_TYPE(float, N0)) * b * a1) +
         (CONVERT(in11[0].v, VEC_DATA_TYPE(float, N0)) * a * a1), VEC_DATA_TYPE({{DST_DATA_TYPE}}, N0));
 )_";
         }
@@ -179,22 +182,18 @@ TILE(uint, 1, 1, g_dst_indirect_y);
 
     return code;
 }
 
-void ClTemplateResize::declare_variables(GpuKernelVariableTable &vtable, const IGpuTemplateComponentWriter::ComponentGroup &comp_group) const
+void ClTemplateResize::declare_variables(GpuKernelVariableTable &vtable,
+                                         const IGpuTemplateComponentWriter::ComponentGroup &comp_group) const
 {
-    vtable.declare_variable(
-        comp_group,
-        _src,
-        GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
-        "src");
-
-    vtable.declare_variable(
-        comp_group,
-        _dst,
-        GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
-        "dst");
+    vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+                            "src");
+
+    vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+                            "dst");
 }
 
-TagLUT ClTemplateResize::get_tag_lut(const GpuKernelVariableTable &vtable, const IGpuTemplateComponentWriter::ComponentGroup &comp_group) const
+TagLUT ClTemplateResize::get_tag_lut(const GpuKernelVariableTable &vtable,
+                                     const IGpuTemplateComponentWriter::ComponentGroup &comp_group) const
 {
     TagLUT lut{};
 
@@ -212,8 +211,10 @@ TagLUT ClTemplateResize::get_tag_lut(const GpuKernelVariableTable &vtable, const
     lut["DST_DATA_TYPE"] = get_cl_type_from_data_type(_dst->data_type());
     lut["CONSTANT_VALUE"] = string_from_pixel_value(0, _src->data_type());
 
-    const float scale_x = scale_utils::calculate_resize_ratio(_src->dimension(1), _dst->dimension(1), _attributes.align_corners());
-    const float scale_y = scale_utils::calculate_resize_ratio(_src->dimension(2), _dst->dimension(2), _attributes.align_corners());
+    const float scale_x =
+        scale_utils::calculate_resize_ratio(_src->dimension(1), _dst->dimension(1), _attributes.align_corners());
+    const float scale_y =
+        scale_utils::calculate_resize_ratio(_src->dimension(2), _dst->dimension(2), _attributes.align_corners());
 
     lut["SCALE_X"] = float_to_string_with_full_precision(scale_x);
     lut["SCALE_Y"] = float_to_string_with_full_precision(scale_y);
@@ -242,7 +243,8 @@ std::string ClTemplateResize::get_config_id() const
     std::string config_id{};
 
     config_id += "resize_";
-    config_id += (_attributes.interpolation_policy() == InterpolationPolicy::NEAREST_NEIGHBOR ? "NEAREST_NEIGHBOR" : "");
+    config_id +=
+        (_attributes.interpolation_policy() == InterpolationPolicy::NEAREST_NEIGHBOR ? "NEAREST_NEIGHBOR" : "");
     config_id += (_attributes.interpolation_policy() == InterpolationPolicy::BILINEAR ? "BILINEAR" : "");
     config_id += "_";
     config_id += (_attributes.sampling_policy() == SamplingPolicy::CENTER ? "center" : "topleft");
@@ -260,7 +262,7 @@ std::string ClTemplateResize::get_config_id() const
 
 std::set<std::string> ClTemplateResize::get_headers_list() const
 {
-    return std::set<std::string>{ "helpers.h", "tile_helpers.h" };
+    return std::set<std::string>{"helpers.h", "tile_helpers.h"};
 }
 
 Window ClTemplateResize::get_window() const
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp
index 217214ced3..d0ec91e0a9 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp
@@ -32,7 +32,7 @@ namespace experimental
 namespace dynamic_fusion
 {
 ClTemplateStore::ClTemplateStore(ComponentId id, const ArgumentPack<ITensorInfo> &tensors)
-    : IGpuTemplateComponentWriter{ id, tensors }, _src{}, _dst{}
+    : IGpuTemplateComponentWriter{id, tensors}, _src{}, _dst{}
 {
     _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
     _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
@@ -61,16 +61,10 @@ std::string ClTemplateStore::get_component_code(const ComponentGroup &comp_group
 
 void ClTemplateStore::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
 {
-    vtable.declare_variable(
-        comp_group,
-        _src,
-        GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
-        "src");
-    vtable.declare_variable(
-        comp_group,
-        _dst,
-        GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
-        "dst");
+    vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+                            "src");
+    vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+                            "dst");
 }
 
 TagLUT ClTemplateStore::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h
index 3f97a82204..b8c82ceadd 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h
@@ -25,6 +25,7 @@
 #define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATESTORE
 
 #include "arm_compute/core/experimental/Types.h"
+
 #include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
 #include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
 
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp
index eda15f1d95..d3d7c8db83 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp
@@ -24,6 +24,7 @@
 #include "ClTemplateWriter.h"
 
 #include "arm_compute/core/CL/CLKernelLibrary.h"
+
 #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
 #include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
 
@@ -39,11 +40,11 @@ std::string ClTemplateWriter::replace_tags(const std::string &code_template, con
     std::string replaced_code = "";
     bool scanning_pattern = false;
     std::string pattern_found = "";
-    for(size_t i = 0; i < code_template.size() - 1; ++i)
+    for (size_t i = 0; i < code_template.size() - 1; ++i)
     {
-        if(!scanning_pattern)
+        if (!scanning_pattern)
         {
-            if(code_template[i] == '{' && code_template[i + 1] == '{')
+            if (code_template[i] == '{' && code_template[i + 1] == '{')
             {
                 i += 1;
                 scanning_pattern = true;
@@ -56,7 +57,7 @@ std::string ClTemplateWriter::replace_tags(const std::string &code_template, con
         }
         else
         {
-            if(code_template[i] == '}' && code_template[i + 1] == '}')
+            if (code_template[i] == '}' && code_template[i + 1] == '}')
             {
                 i += 1;
                 scanning_pattern = false;
@@ -76,8 +77,7 @@ std::string ClTemplateWriter::replace_tags(const std::string &code_template, con
 ClTemplateWriter::~ClTemplateWriter()
 {
 }
-ClTemplateWriter::ClTemplateWriter(const GpuKernelComponentGroup &components)
-    : _components{ components }
+ClTemplateWriter::ClTemplateWriter(const GpuKernelComponentGroup &components) : _components{components}
 {
 }
 std::string ClTemplateWriter::get_name()
@@ -91,7 +91,7 @@ std::string ClTemplateWriter::get_code()
 std::string ClTemplateWriter::get_config_id()
 {
     std::string config_id = get_name();
-    for(const auto &comp : _components)
+    for (const auto &comp : _components)
     {
         config_id += "--" + comp->template_writer()->get_config_id() + "--";
     }
@@ -103,7 +103,7 @@ CLBuildOptions ClTemplateWriter::get_build_options()
 {
     CLBuildOptions build_opts{};
 
-    for(const auto &comp : _components)
+    for (const auto &comp : _components)
     {
         build_opts.add_options(comp->template_writer()->get_build_options(_components).options());
     }
@@ -122,11 +122,9 @@ std::map<ITensorInfo::Id, GpuKernelArgument> ClTemplateWriter::get_tensors()
 {
     // Assemble GpuKernelArguments
     std::map<ITensorInfo::Id, GpuKernelArgument> tensors;
-    for(const auto t : _components.get_argument_tensors())
+    for (const auto t : _components.get_argument_tensors())
     {
-        tensors.emplace(
-            t->id(),
-            GpuKernelArgument{ *t, _vtable.get_variable(t).kernel_argument_info });
+        tensors.emplace(t->id(), GpuKernelArgument{*t, _vtable.get_variable(t).kernel_argument_info});
     }
     return tensors;
 }
@@ -141,22 +139,24 @@ std::string ClTemplateWriter::write_code()
     std::vector<std::string> component_codes{}; // vector because order matters
 
     // Pass 1: Declare all kernel variables
-    for(auto &component : _components)
+    for (auto &component : _components)
     {
         component->template_writer()->declare_variables(_vtable, _components);
     }
 
     // Pass 2: Generate component codes
-    for(auto &component : _components)
+    for (auto &component : _components)
     {
         const auto component_writer = component->template_writer();
         auto curr_headers_list = component_writer->get_headers_list();
         auto curr_additional_macros = component_writer->get_additional_macros();
         auto curr_component_code = component_writer->get_component_code(_components);
-        const auto var_lut = component_writer->get_tag_lut(_vtable, _components); // Ideally can be merged with get_component_code once we have finer-grained code generation technique
+        const auto var_lut = component_writer->get_tag_lut(
+            _vtable,
+            _components); // Ideally can be merged with get_component_code once we have finer-grained code generation technique
         component_codes.push_back(replace_tags(curr_component_code, var_lut));
 
         headers_list.insert(curr_headers_list.begin(), curr_headers_list.end());
-        if(!additional_macros.empty()) // Some components might not have any
+        if (!additional_macros.empty()) // Some components might not have any
         {
             additional_macros.insert(replace_tags(curr_additional_macros, var_lut));
         }
@@ -165,7 +165,7 @@ std::string ClTemplateWriter::write_code()
 
     // Step 3: Assemble the data gathered by traversing the graph into the string "code"
     std::string code = "";
-    for(auto &header : headers_list)
+    for (auto &header : headers_list)
     {
 #if defined(EMBEDDED_KERNELS)
         code += CLKernelLibrary::get().get_program(header).first;
@@ -174,16 +174,14 @@ std::string ClTemplateWriter::write_code()
 #endif // defined(EMBEDDED_KERNELS)
     }
 
-    for(auto &macros : additional_macros)
+    for (auto &macros : additional_macros)
     {
         code += macros;
     }
 
     auto arguments = _components.get_argument_tensors();
-    std::sort(arguments.begin(), arguments.end(), [](const ITensorInfo * l, const ITensorInfo * r)
-    {
-        return l->id() < r->id();
-    });
+    std::sort(arguments.begin(), arguments.end(),
+              [](const ITensorInfo *l, const ITensorInfo *r) { return l->id() < r->id(); });
 
     code += write_kernel_signature(_vtable.get_variable_list(arguments));
     code += "\n{\n\n";
@@ -198,7 +196,7 @@ std::string ClTemplateWriter::write_code()
 
         tiles_ss << "    //------------------ START TILE DECLARATION ---------------------\n";
 
-        for(auto tile : tiles)
+        for (auto tile : tiles)
         {
             const auto var = _vtable.get_variable(tile);
             const auto data_type = get_cl_type_from_data_type(tile->data_type());
@@ -212,7 +210,7 @@ std::string ClTemplateWriter::write_code()
         code += tiles_ss.str();
     }
 
-    for(const auto &component_code : component_codes)
+    for (const auto &component_code : component_codes)
     {
         code += component_code;
         code += "\n";
@@ -231,7 +229,8 @@ std::string ClTemplateWriter::write_global_section() const
     auto leftover_w = dst_w % tile_w;
 
     std::string code = "";
-    code += std::string("    int g_ind_0 = GET_SPATIAL_IDX(0, ") + std::to_string(tile_w) + ", " + std::to_string(leftover_w) + ");\n";
+    code += std::string("    int g_ind_0 = GET_SPATIAL_IDX(0, ") + std::to_string(tile_w) + ", " +
+            std::to_string(leftover_w) + ");\n";
     code += std::string("    int g_ind_1 = GET_SPATIAL_IDX(1, ") + std::to_string(tile_h) + ", " + "0);\n";
     code += std::string("    int g_ind_2 = GET_SPATIAL_IDX(2, 1, 0);\n\n");
 
@@ -243,7 +242,7 @@ std::string ClTemplateWriter::write_global_section() const
 std::string ClTemplateWriter::write_argument_declaration(const GpuKernelVariableTable::TensorVariable &var) const
 {
     std::string code;
-    switch(var.kernel_argument_info.type)
+    switch (var.kernel_argument_info.type)
     {
         case GpuKernelArgumentInfo::Type::Vector:
         {
@@ -293,11 +292,11 @@ std::string ClTemplateWriter::write_kernel_signature(const GpuKernelVariableTabl
 {
     std::string code = "\n__kernel void " + write_kernel_name() + "(";
 
-    for(int i = 0; i < static_cast<int>(argument_list.size()) - 1; ++i)
+    for (int i = 0; i < static_cast<int>(argument_list.size()) - 1; ++i)
     {
         code += write_argument_declaration(argument_list[i]) + ",";
     }
-    if(static_cast<int>(argument_list.size()) - 1 >= 0)
+    if (static_cast<int>(argument_list.size()) - 1 >= 0)
     {
         code += write_argument_declaration(argument_list[argument_list.size() - 1]);
     }
@@ -308,12 +307,12 @@ std::string ClTemplateWriter::write_kernel_signature(const GpuKernelVariableTabl
 }
 std::string ClTemplateWriter::write_kernel_name() const
 {
-    if(_components.empty())
+    if (_components.empty())
     {
         return "empty_kernel";
     }
     std::string name = _components.empty() ? "" : _components[0]->template_writer()->get_name();
-    for(size_t i = 1; i < _components.size(); ++i)
+    for (size_t i = 1; i < _components.size(); ++i)
     {
         name += "___";
         name += _components[i]->template_writer()->get_name();
--
cgit v1.2.1