diff options
Diffstat (limited to 'src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp')
-rw-r--r-- | src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp | 171 |
1 files changed, 90 insertions, 81 deletions
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp index 9c9a298132..8ab3ec3a55 100644 --- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp @@ -24,17 +24,18 @@ #include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h" #include "arm_compute/core/Error.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" +#include "arm_compute/core/Validate.h" #include "ckw/TensorTileSampler.h" + #include "src/core/helpers/WindowHelpers.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" -#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" +#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h" #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h" -#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" using namespace ckw; @@ -48,11 +49,7 @@ GpuCkwPool2d::GpuCkwPool2d(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes, const Settings &settings) - : IGpuCkwComponentDriver{ id, tensors }, - _src{}, - _dst{}, - _attributes{ attributes }, - _settings{ settings } + : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{}, _attributes{attributes}, _settings{settings} { _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); @@ -60,14 +57,18 @@ GpuCkwPool2d::GpuCkwPool2d(ComponentId id, ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); } -void GpuCkwPool2d::write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const +void GpuCkwPool2d::write_component_code(const ComponentGroup &comp_group, + GpuCkwVariableTable &vtable, + GpuCkwScopedKernelWriter writer) const { const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window(); const unsigned int n0 = root_window.x().step(); const unsigned int m0 = root_window.y().step(); - GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src"); - GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); + GpuCkwComponentArgument *src = + vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src"); + GpuCkwComponentArgument *dst = + vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst"); TileOperand &gid_0 = writer->declare_tile("gid_0", ckw::DataType::Int32); TileOperand &gid_1 = writer->declare_tile("gid_1", ckw::DataType::Int32); @@ -90,23 +91,26 @@ void GpuCkwPool2d::write_component_code(const ComponentGroup &comp_group, GpuCkw const auto src_data_type = _src->data_type(); // Check if this is global pooling path - const bool is_global_pooling = (pool_size_x == src_width) && (pool_size_y == src_height) && (pad_x == 0) && (pad_y == 0); + const bool is_global_pooling = + (pool_size_x == src_width) && (pool_size_y == src_height) && (pad_x == 0) && (pad_y == 0); // Check if this a case of FP_MIXED_PRECISION - const bool use_fp_mixed_precision = (src_data_type == DataType::F16) && _settings.mixed_precision() && _attributes.pool_type() != PoolingType::MAX; - const auto acc_data_type = (use_fp_mixed_precision) ? (DataType::F32) : (src_data_type); + const bool use_fp_mixed_precision = + (src_data_type == DataType::F16) && _settings.mixed_precision() && _attributes.pool_type() != PoolingType::MAX; + const auto acc_data_type = (use_fp_mixed_precision) ? (DataType::F32) : (src_data_type); TileOperand &const_0 = writer->declare_tile("0", 0); const TileOperand &const_1 = writer->declare_tile("1", 1); const TileOperand &const_lowest_value = writer->declare_tile("LOWEST_VALUE", std::numeric_limits<float>::lowest()); const TileOperand &pool_size_x_tile = writer->declare_tile("POOL_SIZE_X", pool_size_x); const TileOperand &pool_size_y_tile = writer->declare_tile("POOL_SIZE_Y", pool_size_y); - const TileOperand &stride_x_tile = writer->declare_tile("STRIDE_X", static_cast<int32_t>(_attributes.stride().x())); - const TileOperand &stride_y_tile = writer->declare_tile("STRIDE_Y", static_cast<int32_t>(_attributes.stride().y())); - const TileOperand &pad_x_tile = writer->declare_tile("PAD_X", pad_x); - const TileOperand &pad_y_tile = writer->declare_tile("PAD_Y", pad_y); - const TileOperand &dst_height_tile = writer->declare_tile("DST_HEIGHT", static_cast<int32_t>(_dst->dimension(height_idx))); - const TileOperand &src_height_tile = writer->declare_tile("SRC_HEIGHT", src_height); - const TileOperand &src_width_tile = writer->declare_tile("SRC_WIDTH", src_width); + const TileOperand &stride_x_tile = writer->declare_tile("STRIDE_X", static_cast<int32_t>(_attributes.stride().x())); + const TileOperand &stride_y_tile = writer->declare_tile("STRIDE_Y", static_cast<int32_t>(_attributes.stride().y())); + const TileOperand &pad_x_tile = writer->declare_tile("PAD_X", pad_x); + const TileOperand &pad_y_tile = writer->declare_tile("PAD_Y", pad_y); + const TileOperand &dst_height_tile = + writer->declare_tile("DST_HEIGHT", static_cast<int32_t>(_dst->dimension(height_idx))); + const TileOperand &src_height_tile = writer->declare_tile("SRC_HEIGHT", src_height); + const TileOperand &src_width_tile = writer->declare_tile("SRC_WIDTH", src_width); TileOperand &idx_out_n = writer->declare_tile("idx_out_n", ckw::DataType::Int32); TileOperand &idx_out_h = writer->declare_tile("idx_out_h", ckw::DataType::Int32); @@ -145,7 +149,7 @@ void GpuCkwPool2d::write_component_code(const ComponentGroup &comp_group, GpuCkw // Prepare dst tensor and tile TileInfo dst_tile_info = TileInfo(to_ckw(src_data_type), m0, n0); - if(!dst->has_tile()) + if (!dst->has_tile()) { TileOperand &dst_tile = writer->declare_tile("dst_tile", dst_tile_info); dst->init_virtual_tensor(dst_tile, dst_sampler); @@ -156,14 +160,15 @@ void GpuCkwPool2d::write_component_code(const ComponentGroup &comp_group, GpuCkw const TileOperand &res_tile = writer->declare_tile("res_tile", TileInfo(to_ckw(acc_data_type), m0, n0)); // Initialise result tile with appropriate value - if(_attributes.pool_type() == PoolingType::MAX) + if (_attributes.pool_type() == PoolingType::MAX) { - if(_settings.use_inf_as_limit()) + if (_settings.use_inf_as_limit()) { TileContainer minus_inf_tile_container; std::vector<std::string> value = std::vector<std::string>(n0, "(-INFINITY)"); - minus_inf_tile_container.push_back({ value }); - const TileOperand &minus_inf = writer->declare_tile("minus_inf_const", minus_inf_tile_container, to_ckw(acc_data_type)); + minus_inf_tile_container.push_back({value}); + const TileOperand &minus_inf = + writer->declare_tile("minus_inf_const", minus_inf_tile_container, to_ckw(acc_data_type)); writer->op_assign(res_tile, minus_inf); } else @@ -209,7 +214,7 @@ void GpuCkwPool2d::write_component_code(const ComponentGroup &comp_group, GpuCkw writer->op_binary_elementwise_function(pool_y_e, BinaryFunction::Min, pool_size_y_tile, pool_y_e); const TileOperand &filter_size = writer->declare_tile("filter_size", ckw::DataType::Int32); - if(_attributes.exclude_padding()) + if (_attributes.exclude_padding()) { const TileOperand &y_diff = writer->declare_tile("y_diff", ckw::DataType::Int32); const TileOperand &x_diff = writer->declare_tile("x_diff", ckw::DataType::Int32); @@ -227,7 +232,7 @@ void GpuCkwPool2d::write_component_code(const ComponentGroup &comp_group, GpuCkw const TileOperand &x = writer->declare_tile("x", ckw::DataType::Int32); const TileOperand &y = writer->declare_tile("y", ckw::DataType::Int32); - if(is_global_pooling) + if (is_global_pooling) { writer->op_assign(x, const_0); writer->op_assign(y, const_0); @@ -242,76 +247,80 @@ void GpuCkwPool2d::write_component_code(const ComponentGroup &comp_group, GpuCkw } // Y dim for-loop - writer->op_for_loop(y, BinaryOp::Less, pool_y_e, y, AssignmentOp::Increment, const_1, [&]() - { - // Reset the iterator for the inner loop - if(is_global_pooling) - { - writer->op_assign(x, const_0); - } - else + writer->op_for_loop( + y, BinaryOp::Less, pool_y_e, y, AssignmentOp::Increment, const_1, + [&]() { - writer->op_assign(x, pool_x_s); - } - - TileOperand &a_y = writer->declare_tile("a_y", ckw::DataType::Int32); - writer->op_binary_expression(a_y, idx_in_h, BinaryOp::Add, y); - - // X dim for-loop - writer->op_for_loop(x, BinaryOp::Less, pool_x_e, x, AssignmentOp::Increment, const_1, [&]() - { - TileOperand &a_x = writer->declare_tile("a_x", ckw::DataType::Int32); - writer->op_binary_expression(a_x, idx_in_w, BinaryOp::Add, x); - - TileOperand &src_tile = writer->declare_tile("src_tile", TileInfo(to_ckw(acc_data_type), m0, n0)); - - src_sampler.y(a_x); - src_sampler.z(a_y); - - // Load src tile - if(use_fp_mixed_precision) + // Reset the iterator for the inner loop + if (is_global_pooling) { - TileOperand &src_uncasted_tile = writer->declare_tile("uncasted_src_tile", dst_tile_info); - writer->op_load(src_uncasted_tile, src->tensor(), src_sampler); - writer->op_cast_expression(src_tile, src_uncasted_tile, ckw::ConvertPolicy::None); + writer->op_assign(x, const_0); } else { - writer->op_load(src_tile, src->tensor(), src_sampler); + writer->op_assign(x, pool_x_s); } - // Take the square of the input, for L2 Pooling - if(_attributes.pool_type() == PoolingType::L2) - { - writer->op_binary_expression(src_tile, src_tile, BinaryOp::Mul, src_tile); - } - - // Perfom Pooling op - if(_attributes.pool_type() == PoolingType::MAX) - { - writer->op_binary_elementwise_function(res_tile, BinaryFunction::Max, res_tile, src_tile); - } - else - { - writer->op_binary_expression(res_tile, res_tile, BinaryOp::Add, src_tile); - } + TileOperand &a_y = writer->declare_tile("a_y", ckw::DataType::Int32); + writer->op_binary_expression(a_y, idx_in_h, BinaryOp::Add, y); + + // X dim for-loop + writer->op_for_loop( + x, BinaryOp::Less, pool_x_e, x, AssignmentOp::Increment, const_1, + [&]() + { + TileOperand &a_x = writer->declare_tile("a_x", ckw::DataType::Int32); + writer->op_binary_expression(a_x, idx_in_w, BinaryOp::Add, x); + + TileOperand &src_tile = writer->declare_tile("src_tile", TileInfo(to_ckw(acc_data_type), m0, n0)); + + src_sampler.y(a_x); + src_sampler.z(a_y); + + // Load src tile + if (use_fp_mixed_precision) + { + TileOperand &src_uncasted_tile = writer->declare_tile("uncasted_src_tile", dst_tile_info); + writer->op_load(src_uncasted_tile, src->tensor(), src_sampler); + writer->op_cast_expression(src_tile, src_uncasted_tile, ckw::ConvertPolicy::None); + } + else + { + writer->op_load(src_tile, src->tensor(), src_sampler); + } + + // Take the square of the input, for L2 Pooling + if (_attributes.pool_type() == PoolingType::L2) + { + writer->op_binary_expression(src_tile, src_tile, BinaryOp::Mul, src_tile); + } + + // Perfom Pooling op + if (_attributes.pool_type() == PoolingType::MAX) + { + writer->op_binary_elementwise_function(res_tile, BinaryFunction::Max, res_tile, src_tile); + } + else + { + writer->op_binary_expression(res_tile, res_tile, BinaryOp::Add, src_tile); + } + }); }); - }); - if((_attributes.pool_type() == PoolingType::AVG) || (_attributes.pool_type() == PoolingType::L2)) + if ((_attributes.pool_type() == PoolingType::AVG) || (_attributes.pool_type() == PoolingType::L2)) { // filter_size is automatically broadcasted in the operation writer->op_binary_expression(res_tile, res_tile, BinaryOp::Div, filter_size); } // Take square root of the result in L2 pooling - if(_attributes.pool_type() == PoolingType::L2) + if (_attributes.pool_type() == PoolingType::L2) { writer->op_unary_elementwise_function(res_tile, UnaryFunction::Sqrt, res_tile); } // Store the results and do casting if FP_MIXED_PRECISION - if(use_fp_mixed_precision) + if (use_fp_mixed_precision) { writer->op_cast_expression(dst_tile, res_tile, ckw::ConvertPolicy::None); } @@ -326,7 +335,7 @@ Window GpuCkwPool2d::get_window() const ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized"); TensorShape output_shape = _dst->tensor_shape(); - const unsigned int vec_size = adjust_vec_size(((_dst->data_type() == DataType::F32) ? 2 : 4), _dst->dimension(0)); + const unsigned int vec_size = adjust_vec_size(((_dst->data_type() == DataType::F32) ? 2 : 4), _dst->dimension(0)); // Create and configure kernel window auto win = calculate_max_window(output_shape, Steps(vec_size)); win = win.collapse_if_possible(win, Window::DimZ); // collapse window on batch size. |