diff options
Diffstat (limited to 'src/dynamic_fusion/sketch/gpu')
-rw-r--r-- | src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp | 129 |
1 files changed, 16 insertions, 113 deletions
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp index caccbb1830..b08af61d8f 100644 --- a/src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp @@ -31,6 +31,8 @@ #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h" #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" +#include "src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h" +#include "src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h" namespace arm_compute { @@ -40,115 +42,6 @@ namespace dynamic_fusion { namespace { -bool export_weights_to_cl_image_heuristic(const ITensorInfo *weights, unsigned int depth_multiplier, GPUTarget gpu_target) -{ - if(!export_to_cl_image(weights)) - { - return false; - } - - const size_t idx_w = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH); - const size_t idx_h = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT); - const size_t kernel_w = weights->tensor_shape()[idx_w]; - const size_t kernel_h = weights->tensor_shape()[idx_h]; - - if(gpu_target == GPUTarget::G71 || get_arch_from_target(gpu_target) == GPUTarget::MIDGARD) - { - return false; - } - - if((kernel_w == 1) && (kernel_h == 1)) - { - return false; - } - - if(depth_multiplier > 1) - { - if((depth_multiplier % 4) != 0) - { - return false; - } - } - - return true; -} - -void initialize_dwc_native_compute_info(DWCComputeKernelInfo &dwc_compute_info, const ITensorInfo *input, const ITensorInfo *weights, - const DepthwiseConv2dAttributes &attributes, const GPUTarget gpu_target) -{ - const unsigned int depth_multiplier = attributes.depth_multiplier(); - - // Floating point path - // First check if we can export to cl_image. - dwc_compute_info.export_input_to_cl_image = false; - dwc_compute_info.export_weights_to_cl_image = export_weights_to_cl_image_heuristic(weights, depth_multiplier, gpu_target); - - // Set n0 - if(depth_multiplier == 1) - { - if(dwc_compute_info.export_weights_to_cl_image == false && weights->data_type() == DataType::F16) - { - dwc_compute_info.n0 = 8; - } - else - { - dwc_compute_info.n0 = 4; - } - } - else - { - if((depth_multiplier % 4) == 0) - { - dwc_compute_info.n0 = 4; - } - else if((depth_multiplier % 2) == 0) - { - dwc_compute_info.n0 = 2; - } - else - { - dwc_compute_info.n0 = 1; - } - } - - dwc_compute_info.n0 = adjust_vec_size(dwc_compute_info.n0, weights->dimension(0)); - - // Set m0 only if stride_x == 1 and dilation_x == 1 - if(attributes.stride().x() == 1 && attributes.dilation().x() == 1) - { - const size_t idx_w = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH); - const size_t kernel_w = weights->tensor_shape()[idx_w]; - - if((kernel_w >= 9) || (kernel_w == 1)) - { - dwc_compute_info.m0 = 1; - } - else - { - if(weights->data_type() == DataType::F16) - { - if((input->dimension(1) % 5) == 0) - { - dwc_compute_info.m0 = 5; - } - else - { - dwc_compute_info.m0 = 4; - } - } - else - { - dwc_compute_info.m0 = 2; - } - } - } - else - { - dwc_compute_info.m0 = 1; - } - return; -} - void calculate_and_init_dst_if_empty(ITensorInfo *dst, const ITensorInfo *src, const ITensorInfo *wei, const DepthwiseConv2dAttributes &attributes) { if(dst->total_size() == 0U) @@ -202,8 +95,13 @@ Status GpuDepthwiseConv2d::is_supported_op(const GpuWorkloadContext &cont const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); auto settings = ClComponentDepthwiseConv2d::Settings(); - DWCComputeKernelInfo dwc_info; - initialize_dwc_native_compute_info(dwc_info, src, wei, attributes, gpu_target); + const PadStrideInfo legacy_conv_info(attributes.stride().x(), attributes.stride().y(), attributes.pad().left, + attributes.pad().right, + attributes.pad().top, attributes.pad().bottom, DimensionRoundingType::FLOOR); + + // Get the depthwise convolution compute parameters + auto t = arm_compute::cl_dwc::ClDWCNativeKernelConfigurationFactory::create(gpu_target); + const DWCComputeKernelInfo dwc_info = t->configure(src, wei, legacy_conv_info, attributes.dilation(), attributes.depth_multiplier()); settings.fast_relaxed_math( (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) @@ -294,8 +192,13 @@ void GpuDepthwiseConv2d::create_op(GpuWorkloadSketch &sketch, const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); auto settings = ClComponentDepthwiseConv2d::Settings(); - DWCComputeKernelInfo dwc_info; - initialize_dwc_native_compute_info(dwc_info, src, wei, attributes, gpu_target); + const PadStrideInfo legacy_conv_info(attributes.stride().x(), attributes.stride().y(), attributes.pad().left, + attributes.pad().right, + attributes.pad().top, attributes.pad().bottom, DimensionRoundingType::FLOOR); + + // Get the depthwise convolution compute parameters + auto t = arm_compute::cl_dwc::ClDWCNativeKernelConfigurationFactory::create(gpu_target); + const DWCComputeKernelInfo dwc_info = t->configure(src, wei, legacy_conv_info, attributes.dilation(), attributes.depth_multiplier()); settings.is_fma_available(get_arch_from_target(gpu_target) != GPUTarget::MIDGARD) .m0(dwc_info.m0) |