aboutsummaryrefslogtreecommitdiff
path: root/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp')
-rw-r--r--src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp92
1 files changed, 45 insertions, 47 deletions
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp
index 34840c2100..ebb0374501 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp
@@ -23,14 +23,13 @@
*/
#include "ClTemplatePool2d.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
-#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"
-
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/StringUtils.h"
-#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
#include "support/StringSupport.h"
namespace arm_compute
@@ -50,11 +49,7 @@ ClTemplatePool2d::ClTemplatePool2d(ComponentId id,
const ArgumentPack<ITensorInfo> &tensors,
const Attributes &attributes,
const Settings &settings)
- : IGpuTemplateComponentWriter{ id, tensors },
- _src{},
- _dst{},
- _attributes{ attributes },
- _settings{ settings }
+ : IGpuTemplateComponentWriter{id, tensors}, _src{}, _dst{}, _attributes{attributes}, _settings{settings}
{
_src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
_dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
@@ -71,7 +66,7 @@ std::string ClTemplatePool2d::get_component_code(const ComponentGroup &comp_grou
ARM_COMPUTE_UNUSED(comp_group);
// Condition to use 2x2 optimized kernel
- if(_attributes.pool_size() == Size2D(2, 2))
+ if (_attributes.pool_size() == Size2D(2, 2))
{
return get_2x2_kernel_code();
}
@@ -83,11 +78,13 @@ std::string ClTemplatePool2d::get_component_code(const ComponentGroup &comp_grou
std::string ClTemplatePool2d::get_MxN_kernel_code() const
{
- const auto pool_type = _attributes.pool_type();
- const bool fp_mixed_precision = (_src->data_type() == DataType::F16) && _settings.mixed_precision() && pool_type != PoolingType::MAX;
+ const auto pool_type = _attributes.pool_type();
+ const bool fp_mixed_precision =
+ (_src->data_type() == DataType::F16) && _settings.mixed_precision() && pool_type != PoolingType::MAX;
// Define pool op macro.
- std::string pool_op = (pool_type == PoolingType::AVG) ? R"_(#define POOL_OP(x,y) ((x) + (y)))_" : R"_(#define POOL_OP(x,y) (fmax((x), (y))) )_";
+ std::string pool_op = (pool_type == PoolingType::AVG) ? R"_(#define POOL_OP(x,y) ((x) + (y)))_"
+ : R"_(#define POOL_OP(x,y) (fmax((x), (y))) )_";
// Kernel start
// Note: If C is not multiple of N0, we shift back of PARTIAL_N0 elements to compute the leftover elements for get_global_id(0) == 0
@@ -129,7 +126,7 @@ std::string ClTemplatePool2d::get_MxN_kernel_code() const
)_";
// Determine filter size depending on if padding is excluded or not
- if(_attributes.exclude_padding())
+ if (_attributes.exclude_padding())
{
code += R"_(
const int filter_size = (pool_y_e - pool_y_s) * (pool_x_e - pool_x_s);
@@ -144,7 +141,8 @@ std::string ClTemplatePool2d::get_MxN_kernel_code() const
// Loop through pool size
// if global pooling
- if(_attributes.pool_size().x() == _src->dimension(width_idx) && _attributes.pool_size().y() == _src->dimension(height_idx))
+ if (_attributes.pool_size().x() == _src->dimension(width_idx) &&
+ _attributes.pool_size().y() == _src->dimension(height_idx))
{
// Begin loop
code += R"_(
@@ -173,7 +171,7 @@ std::string ClTemplatePool2d::get_MxN_kernel_code() const
// if condition inside loop - use 32bit acc if mixed_precision.
// End loop through pooling section.
- if(fp_mixed_precision)
+ if (fp_mixed_precision)
{
// In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE
code += R"_(
@@ -194,7 +192,7 @@ std::string ClTemplatePool2d::get_MxN_kernel_code() const
}
// For Pool AVG ONLY, divide pool output by filter size
- if(pool_type == PoolingType::AVG)
+ if (pool_type == PoolingType::AVG)
{
code += R"_(
res0 /= (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0))filter_size;
@@ -202,7 +200,7 @@ std::string ClTemplatePool2d::get_MxN_kernel_code() const
}
// If mixed precision convert datatype before storing. Then end kernel.
- if(fp_mixed_precision)
+ if (fp_mixed_precision)
{
code += R"_(
VEC_DATA_TYPE({{DATA_TYPE}}, N0)
@@ -228,9 +226,11 @@ std::string ClTemplatePool2d::get_MxN_kernel_code() const
std::string ClTemplatePool2d::get_2x2_kernel_code() const
{
- const auto pool_type = _attributes.pool_type();
- const bool fp_mixed_precision = (_src->data_type() == DataType::F16) && _settings.mixed_precision() && pool_type != PoolingType::MAX;
- std::string pool_op = (pool_type == PoolingType::AVG) ? R"_(#define POOL_OP(x,y) ((x) + (y)))_" : R"_(#define POOL_OP(x,y) (fmax((x), (y))) )_";
+ const auto pool_type = _attributes.pool_type();
+ const bool fp_mixed_precision =
+ (_src->data_type() == DataType::F16) && _settings.mixed_precision() && pool_type != PoolingType::MAX;
+ std::string pool_op = (pool_type == PoolingType::AVG) ? R"_(#define POOL_OP(x,y) ((x) + (y)))_"
+ : R"_(#define POOL_OP(x,y) (fmax((x), (y))) )_";
std::string code = R"_(
//------------------ START KERNEL {{meta_kernel_id}} ---------------------
@@ -274,7 +274,7 @@ std::string ClTemplatePool2d::get_2x2_kernel_code() const
REPEAT_VAR_INIT_TO_CONST(4, VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0), data, 0);
)_";
- if(fp_mixed_precision)
+ if (fp_mixed_precision)
{
// In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE
code += R"_(
@@ -294,7 +294,7 @@ std::string ClTemplatePool2d::get_2x2_kernel_code() const
)_";
}
- if(pool_type != PoolingType::MAX)
+ if (pool_type != PoolingType::MAX)
{
// Make invalid the values loaded if the x or y coordinate was clamped (out-of-bound)
code += R"_(
@@ -321,10 +321,10 @@ std::string ClTemplatePool2d::get_2x2_kernel_code() const
res0 = POOL_OP(res0, data3);
)_";
- if(pool_type == PoolingType::AVG)
+ if (pool_type == PoolingType::AVG)
{
// If avg pooling divide result accordingly.
- if(_attributes.exclude_padding())
+ if (_attributes.exclude_padding())
{
code += R"_(
res0 /= (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0))filter_size;
@@ -339,7 +339,7 @@ std::string ClTemplatePool2d::get_2x2_kernel_code() const
}
// Store result
- if(fp_mixed_precision)
+ if (fp_mixed_precision)
{
code += R"_(
VEC_DATA_TYPE({{DATA_TYPE}}, N0)
@@ -365,17 +365,11 @@ std::string ClTemplatePool2d::get_2x2_kernel_code() const
void ClTemplatePool2d::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
{
- vtable.declare_variable(
- comp_group,
- _src,
- GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- "src");
-
- vtable.declare_variable(
- comp_group,
- _dst,
- GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
- "dst");
+ vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+ "src");
+
+ vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+ "dst");
}
TagLUT ClTemplatePool2d::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
@@ -391,12 +385,15 @@ TagLUT ClTemplatePool2d::get_tag_lut(const GpuKernelVariableTable &vtable, const
lut["meta_kernel_id"] = id();
// Retrieve relevant data
- const auto padding = _attributes.pad();
- const auto stride = _attributes.stride();
- const auto pool_size = _attributes.pool_size();
- const auto data_type = _src->data_type();
- const auto use_fp_mixed_precision = (_src->data_type() == DataType::F16) && _settings.mixed_precision() && _attributes.pool_type() != PoolingType::MAX;
- const std::string max_initial_value = _settings.use_inf_as_limit() ? "(-INFINITY)" : float_to_string_with_full_precision(std::numeric_limits<float>::lowest());
+ const auto padding = _attributes.pad();
+ const auto stride = _attributes.stride();
+ const auto pool_size = _attributes.pool_size();
+ const auto data_type = _src->data_type();
+ const auto use_fp_mixed_precision = (_src->data_type() == DataType::F16) && _settings.mixed_precision() &&
+ _attributes.pool_type() != PoolingType::MAX;
+ const std::string max_initial_value =
+ _settings.use_inf_as_limit() ? "(-INFINITY)"
+ : float_to_string_with_full_precision(std::numeric_limits<float>::lowest());
// pool specific
lut["STRIDE_X"] = stride.x();
@@ -407,7 +404,8 @@ TagLUT ClTemplatePool2d::get_tag_lut(const GpuKernelVariableTable &vtable, const
lut["POOL_SIZE_Y"] = pool_size.height;
// Datatypes and variables
- lut["ACC_DATA_TYPE"] = get_cl_type_from_data_type((use_fp_mixed_precision) ? (DataType::F32) : (data_type)); // Type of accumulators to use.
+ lut["ACC_DATA_TYPE"] = get_cl_type_from_data_type(
+ (use_fp_mixed_precision) ? (DataType::F32) : (data_type)); // Type of accumulators to use.
lut["DATA_TYPE"] = get_cl_type_from_data_type(data_type);
lut["SRC_WIDTH"] = _src->dimension(width_idx);
lut["SRC_HEIGHT"] = _src->dimension(height_idx);
@@ -454,14 +452,14 @@ std::string ClTemplatePool2d::get_config_id() const
std::set<std::string> ClTemplatePool2d::get_headers_list() const
{
- return std::set<std::string>{ "helpers.h", "tile_helpers.h", "repeat.h" };
+ return std::set<std::string>{"helpers.h", "tile_helpers.h", "repeat.h"};
}
Window ClTemplatePool2d::get_window() const
{
ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
const auto output_shape = _dst->tensor_shape();
- const unsigned int vec_size = adjust_vec_size(((_dst->data_type() == DataType::F32) ? 2 : 4), _dst->dimension(0));
+ const unsigned int vec_size = adjust_vec_size(((_dst->data_type() == DataType::F32) ? 2 : 4), _dst->dimension(0));
// Create and configure kernel window
auto win = calculate_max_window(output_shape, Steps(vec_size));