From 9d3bd41030366326e9c8afe5db3a5812a76b135b Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Fri, 30 Dec 2022 09:45:00 +0000 Subject: Move DWC native heuristic into the heuristic folder MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Move the DWC native heuristic from CLDepthwiseConvolutionLayer to heuristic/ - Update the heuristic for Arm® Mali™-G77. Use a smaller block size (4x2) for Fp16 - Call the new heuristic in GpuDepthwiseConv2d Resolves COMPMID-5798 Signed-off-by: Gian Marco Iodice Change-Id: I6bfd30cea76bea2e98202a7a5c1d51709f3382a4 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8889 Comments-Addressed: Arm Jenkins Reviewed-by: Gunes Bayir Tested-by: Arm Jenkins Benchmark: Arm Jenkins --- Android.bp | 3 + arm_compute/core/KernelDescriptors.h | 4 +- filelist.json | 3 + .../sketch/gpu/operators/GpuDepthwiseConv2d.cpp | 129 ++------- .../CL/functions/CLDepthwiseConvolutionLayer.cpp | 151 +--------- .../dwc_native/ClDWCNativeDefaultConfigBifrost.cpp | 279 +++++++++++++++++++ .../dwc_native/ClDWCNativeDefaultConfigBifrost.h | 61 ++++ .../dwc_native/ClDWCNativeDefaultConfigValhall.cpp | 306 +++++++++++++++++++++ .../dwc_native/ClDWCNativeDefaultConfigValhall.h | 59 ++++ .../dwc_native/ClDWCNativeHeuristicsHelpers.cpp | 61 ++++ .../dwc_native/ClDWCNativeHeuristicsHelpers.h | 45 +++ .../dwc_native/ClDWCNativeKernelConfig.h | 65 +++++ .../dwc_native/IClDWCNativeKernelConfig.h | 118 ++++++++ .../ClIndirectConvDefaultConfigValhall.cpp | 2 +- .../ClIndirectConvDefaultConfigValhall.h | 2 +- .../indirect_conv/ClIndirectConvKernelConfig.h | 2 +- .../indirect_conv/IClIndirectConvKernelConfig.h | 6 +- 17 files changed, 1036 insertions(+), 260 deletions(-) create mode 100644 src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp create mode 100644 src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h create mode 100644 src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp create mode 100644 src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h create mode 100644 src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp create mode 100644 src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h create mode 100644 src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h create mode 100644 src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h diff --git a/Android.bp b/Android.bp index bf6ee147f6..ec7bccd819 100644 --- a/Android.bp +++ b/Android.bp @@ -957,6 +957,9 @@ cc_library_static { "src/runtime/Utils.cpp", "src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.cpp", "src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.cpp", + "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp", + "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp", + "src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp", "src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp", "utils/CommonGraphOptions.cpp", "utils/GraphUtils.cpp", diff --git a/arm_compute/core/KernelDescriptors.h b/arm_compute/core/KernelDescriptors.h index cacbef25ea..4a64032b14 100644 --- a/arm_compute/core/KernelDescriptors.h +++ b/arm_compute/core/KernelDescriptors.h @@ -104,8 +104,8 @@ struct GEMMKernelInfo /** Compute descriptor used by the depthwise convolution native kernel */ struct DWCComputeKernelInfo { - unsigned int n0{ 0 }; /**< Number of columns processed by each thread */ - unsigned int 
m0{ 0 }; /**< Number of rows processed by each thread */ + unsigned int n0{ 1 }; /**< Number of columns processed by each thread */ + unsigned int m0{ 1 }; /**< Number of rows processed by each thread */ bool export_input_to_cl_image{ false }; /**< Export input to cl_image */ bool export_weights_to_cl_image{ false }; /**< Export the weights to cl_image */ }; diff --git a/filelist.json b/filelist.json index 5828b43ec1..7bc47f7a5c 100644 --- a/filelist.json +++ b/filelist.json @@ -502,6 +502,9 @@ "src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp", "src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.cpp", "src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.cpp", + "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp", + "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp", + "src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp", "src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp" ] } diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp index caccbb1830..b08af61d8f 100644 --- a/src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp @@ -31,6 +31,8 @@ #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h" #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" +#include "src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h" +#include "src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h" namespace arm_compute { @@ -40,115 +42,6 @@ namespace dynamic_fusion { namespace { -bool export_weights_to_cl_image_heuristic(const ITensorInfo *weights, unsigned int depth_multiplier, GPUTarget gpu_target) -{ - if(!export_to_cl_image(weights)) - { - return false; - } - - const size_t idx_w = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH); - const size_t idx_h = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT); - const size_t kernel_w = weights->tensor_shape()[idx_w]; - const size_t kernel_h = weights->tensor_shape()[idx_h]; - - if(gpu_target == GPUTarget::G71 || get_arch_from_target(gpu_target) == GPUTarget::MIDGARD) - { - return false; - } - - if((kernel_w == 1) && (kernel_h == 1)) - { - return false; - } - - if(depth_multiplier > 1) - { - if((depth_multiplier % 4) != 0) - { - return false; - } - } - - return true; -} - -void initialize_dwc_native_compute_info(DWCComputeKernelInfo &dwc_compute_info, const ITensorInfo *input, const ITensorInfo *weights, - const DepthwiseConv2dAttributes &attributes, const GPUTarget gpu_target) -{ - const unsigned int depth_multiplier = attributes.depth_multiplier(); - - // Floating point path - // First check if we can export to cl_image. 
- dwc_compute_info.export_input_to_cl_image = false; - dwc_compute_info.export_weights_to_cl_image = export_weights_to_cl_image_heuristic(weights, depth_multiplier, gpu_target); - - // Set n0 - if(depth_multiplier == 1) - { - if(dwc_compute_info.export_weights_to_cl_image == false && weights->data_type() == DataType::F16) - { - dwc_compute_info.n0 = 8; - } - else - { - dwc_compute_info.n0 = 4; - } - } - else - { - if((depth_multiplier % 4) == 0) - { - dwc_compute_info.n0 = 4; - } - else if((depth_multiplier % 2) == 0) - { - dwc_compute_info.n0 = 2; - } - else - { - dwc_compute_info.n0 = 1; - } - } - - dwc_compute_info.n0 = adjust_vec_size(dwc_compute_info.n0, weights->dimension(0)); - - // Set m0 only if stride_x == 1 and dilation_x == 1 - if(attributes.stride().x() == 1 && attributes.dilation().x() == 1) - { - const size_t idx_w = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH); - const size_t kernel_w = weights->tensor_shape()[idx_w]; - - if((kernel_w >= 9) || (kernel_w == 1)) - { - dwc_compute_info.m0 = 1; - } - else - { - if(weights->data_type() == DataType::F16) - { - if((input->dimension(1) % 5) == 0) - { - dwc_compute_info.m0 = 5; - } - else - { - dwc_compute_info.m0 = 4; - } - } - else - { - dwc_compute_info.m0 = 2; - } - } - } - else - { - dwc_compute_info.m0 = 1; - } - return; -} - void calculate_and_init_dst_if_empty(ITensorInfo *dst, const ITensorInfo *src, const ITensorInfo *wei, const DepthwiseConv2dAttributes &attributes) { if(dst->total_size() == 0U) @@ -202,8 +95,13 @@ Status GpuDepthwiseConv2d::is_supported_op(const GpuWorkloadContext &cont const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); auto settings = ClComponentDepthwiseConv2d::Settings(); - DWCComputeKernelInfo dwc_info; - initialize_dwc_native_compute_info(dwc_info, src, wei, attributes, gpu_target); + const PadStrideInfo legacy_conv_info(attributes.stride().x(), attributes.stride().y(), attributes.pad().left, + attributes.pad().right, + attributes.pad().top, attributes.pad().bottom, DimensionRoundingType::FLOOR); + + // Get the depthwise convolution compute parameters + auto t = arm_compute::cl_dwc::ClDWCNativeKernelConfigurationFactory::create(gpu_target); + const DWCComputeKernelInfo dwc_info = t->configure(src, wei, legacy_conv_info, attributes.dilation(), attributes.depth_multiplier()); settings.fast_relaxed_math( (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) @@ -294,8 +192,13 @@ void GpuDepthwiseConv2d::create_op(GpuWorkloadSketch &sketch, const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); auto settings = ClComponentDepthwiseConv2d::Settings(); - DWCComputeKernelInfo dwc_info; - initialize_dwc_native_compute_info(dwc_info, src, wei, attributes, gpu_target); + const PadStrideInfo legacy_conv_info(attributes.stride().x(), attributes.stride().y(), attributes.pad().left, + attributes.pad().right, + attributes.pad().top, attributes.pad().bottom, DimensionRoundingType::FLOOR); + + // Get the depthwise convolution compute parameters + auto t = arm_compute::cl_dwc::ClDWCNativeKernelConfigurationFactory::create(gpu_target); + const DWCComputeKernelInfo dwc_info = t->configure(src, wei, legacy_conv_info, attributes.dilation(), attributes.depth_multiplier()); settings.is_fma_available(get_arch_from_target(gpu_target) != GPUTarget::MIDGARD) .m0(dwc_info.m0) diff --git 
a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp index 3eadaee0de..3909c15352 100644 --- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp @@ -23,15 +23,15 @@ */ #include "arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h" -#include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h" +#include "src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h" +#include "src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h" #include "src/common/utils/Log.h" @@ -39,137 +39,7 @@ namespace arm_compute { using namespace arm_compute::misc; using namespace arm_compute::misc::shape_calculator; - -namespace -{ -bool export_weights_to_cl_image_heuristic(const ITensorInfo *weights, unsigned int depth_multiplier, GPUTarget gpu_target) -{ - if(!export_to_cl_image(weights)) - { - return false; - } - - const size_t idx_w = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH); - const size_t idx_h = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT); - const size_t kernel_w = weights->tensor_shape()[idx_w]; - const size_t kernel_h = weights->tensor_shape()[idx_h]; - - if(gpu_target == GPUTarget::G71 || get_arch_from_target(gpu_target) == GPUTarget::MIDGARD) - { - return false; - } - - if((kernel_w == 1) && (kernel_h == 1)) - { - return false; - } - - if(depth_multiplier > 1) - { - if((depth_multiplier % 4) != 0) - { - return false; - } - } - - return true; -} - -void initialize_dwc_native_compute_info(DWCComputeKernelInfo &dwc_compute_info, const ITensorInfo *input, const ITensorInfo *weights, const PadStrideInfo &conv_info, const Size2D &dilation, - unsigned int depth_multiplier, - GPUTarget gpu_target) -{ - ARM_COMPUTE_UNUSED(input); - - if(!is_data_type_float(weights->data_type())) - { - dwc_compute_info.export_weights_to_cl_image = false; - dwc_compute_info.n0 = (depth_multiplier == 1) ? 4 : 1; - if(conv_info.stride().first == 1 && dilation.x() == 1 && depth_multiplier == 1) - { - dwc_compute_info.m0 = 2; - } - else - { - dwc_compute_info.m0 = 1; - } - - return; - } - - // Floating point path - - // First check if we can export to cl_image. 
-    dwc_compute_info.export_input_to_cl_image   = false;
-    dwc_compute_info.export_weights_to_cl_image = export_weights_to_cl_image_heuristic(weights, depth_multiplier, gpu_target);
-
-    // Set n0
-    if(depth_multiplier == 1)
-    {
-        if(dwc_compute_info.export_weights_to_cl_image == false && weights->data_type() == DataType::F16)
-        {
-            dwc_compute_info.n0 = 8;
-        }
-        else
-        {
-            dwc_compute_info.n0 = 4;
-        }
-    }
-    else
-    {
-        if((depth_multiplier % 4) == 0)
-        {
-            dwc_compute_info.n0 = 4;
-        }
-        else if((depth_multiplier % 2) == 0)
-        {
-            dwc_compute_info.n0 = 2;
-        }
-        else
-        {
-            dwc_compute_info.n0 = 1;
-        }
-    }
-
-    dwc_compute_info.n0 = adjust_vec_size(dwc_compute_info.n0, weights->dimension(0));
-
-    // Set m0 only if stride_x == 1 and dilation_x == 1
-    if(conv_info.stride().first == 1 && dilation.x() == 1)
-    {
-        const size_t idx_w    = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH);
-        const size_t kernel_w = weights->tensor_shape()[idx_w];
-
-        if((kernel_w >= 9) || (kernel_w == 1))
-        {
-            dwc_compute_info.m0 = 1;
-        }
-        else
-        {
-            if(weights->data_type() == DataType::F16)
-            {
-                if((input->dimension(1) % 5) == 0)
-                {
-                    dwc_compute_info.m0 = 5;
-                }
-                else
-                {
-                    dwc_compute_info.m0 = 4;
-                }
-            }
-            else
-            {
-                dwc_compute_info.m0 = 2;
-            }
-        }
-    }
-    else
-    {
-        dwc_compute_info.m0 = 1;
-    }
-    return;
-}
-
-} // namespace
+using namespace arm_compute::cl_dwc;
 
 CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)),
@@ -261,8 +131,9 @@ void CLDepthwiseConvolutionLayer::configure(const CLCompileContext &compile_cont
         output_shifts_to_use = &_output_shifts;
     }
 
-    DWCComputeKernelInfo dwc_native_compute_info;
-    initialize_dwc_native_compute_info(dwc_native_compute_info, input->info(), weights_to_use->info(), conv_info, dilation, depth_multiplier, gpu_target);
+    // Get the depthwise convolution compute parameters
+    auto                       t                       = ClDWCNativeKernelConfigurationFactory::create(gpu_target);
+    const DWCComputeKernelInfo dwc_native_compute_info = t->configure(input_to_use->info(), weights_to_use->info(), conv_info, dilation, depth_multiplier);
 
     const ConvolutionInfo conv_kernel_info{ conv_info, depth_multiplier, act_info, dilation };
 
@@ -346,8 +217,9 @@ Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITe
         ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(input, &permuted_input, PermutationVector(2U, 0U, 1U)));
         ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(weights, &permuted_weights, PermutationVector(2U, 0U, 1U)));
 
-        DWCComputeKernelInfo dwc_native_compute_info;
-        initialize_dwc_native_compute_info(dwc_native_compute_info, input, &permuted_weights, conv_info, dilation, depth_multiplier, gpu_target);
+        // Get the depthwise convolution compute parameters
+        auto                       t                       = ClDWCNativeKernelConfigurationFactory::create(gpu_target);
+        const DWCComputeKernelInfo dwc_native_compute_info = t->configure(&permuted_input, &permuted_weights, conv_info, dilation, depth_multiplier);
 
         ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, dwc_native_compute_info, conv_kernel_info, &output_multipliers_shifts_info, &output_multipliers_shifts_info));
 
@@ -355,8 +227,9 @@ Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITe
     }
     else
     {
-        DWCComputeKernelInfo dwc_native_compute_info;
-        initialize_dwc_native_compute_info(dwc_native_compute_info, input, weights, conv_info, dilation, depth_multiplier, gpu_target);
+        // 
Get the depthwise convolution compute parameters + auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target); + const DWCComputeKernelInfo dwc_native_compute_info = t->configure(input, weights, conv_info, dilation, depth_multiplier); ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate(input, weights, biases, output, dwc_native_compute_info, conv_kernel_info, &output_multipliers_shifts_info, &output_multipliers_shifts_info)); } diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp new file mode 100644 index 0000000000..f55685ee49 --- /dev/null +++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp @@ -0,0 +1,279 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h" +#include "src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/TensorShape.h" + +namespace arm_compute +{ +namespace cl_dwc +{ +namespace +{ +DWCComputeKernelInfo configure_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, + unsigned int depth_multiplier, bool is_g71) +{ + DWCComputeKernelInfo desc; + + if(src->data_layout() == DataLayout::NHWC) + { + const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); + const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); + const TensorShape wei_shape = wei->tensor_shape(); + const size_t kernel_c = wei_shape[idx_c]; + const size_t kernel_w = wei_shape[idx_w]; + + desc.export_input_to_cl_image = false; + + if(is_g71) + { + desc.export_weights_to_cl_image = false; + } + else + { + desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier); + } + + if(depth_multiplier == 1) + { + desc.n0 = 4; + } + else + { + if((depth_multiplier % 4) == 0) + { + desc.n0 = 4; + } + else if((depth_multiplier % 2) == 0) + { + desc.n0 = 2; + } + else + { + desc.n0 = 1; + } + } + + // Note: If we reduce n0, export to cl_image must be false + ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && (desc.export_weights_to_cl_image == true)); + + desc.n0 = adjust_vec_size(desc.n0, kernel_c); + + // Set m0 only if stride_x == 1 and dilation_x == 1 + if(conv_info.stride().first == 1 && dilation.x() == 1) + { + if((kernel_w >= 9) || (kernel_w == 1)) + { + desc.m0 = 1; + } + else + { + desc.m0 = 2; + } + } + else + { + desc.m0 = 1; + } + } + + return desc; +} + +DWCComputeKernelInfo configure_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, + unsigned int depth_multiplier, bool is_g71) +{ + DWCComputeKernelInfo desc; + + if(src->data_layout() == DataLayout::NHWC) + { + // Src and weights have the same dimension indices + const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); + const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); + const TensorShape src_shape = src->tensor_shape(); + const TensorShape wei_shape = wei->tensor_shape(); + const size_t src_w = src_shape[idx_w]; + const size_t kernel_c = wei_shape[idx_c]; + const size_t kernel_w = wei_shape[idx_w]; + + desc.export_input_to_cl_image = false; + + if(is_g71) + { + desc.export_weights_to_cl_image = false; + } + else + { + desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier); + } + + if(depth_multiplier == 1) + { + if(desc.export_weights_to_cl_image == false) + { + desc.n0 = 8; + } + else + { + desc.n0 = 4; + } + } + else + { + if((depth_multiplier % 4) == 0) + { + desc.n0 = 4; + } + else if((depth_multiplier % 2) == 0) + { + desc.n0 = 2; + } + else + { + desc.n0 = 1; + } + } + + // Note: If we reduce n0, export to cl_image must be false + ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && (desc.export_weights_to_cl_image == true)); + + desc.n0 = adjust_vec_size(desc.n0, kernel_c); + + // Set m0 only if stride_x == 1 and dilation_x == 1 + if(conv_info.stride().first == 1 && 
dilation.x() == 1)
+        {
+            if((kernel_w >= 9) || (kernel_w == 1))
+            {
+                desc.m0 = 1;
+            }
+            else
+            {
+                if((src_w % 5) == 0)
+                {
+                    desc.m0 = 5;
+                }
+                else
+                {
+                    desc.m0 = 4;
+                }
+            }
+        }
+        else
+        {
+            desc.m0 = 1;
+        }
+    }
+
+    return desc;
+}
+} // namespace
+
+ClDWCNativeDefaultConfigBifrost::ClDWCNativeDefaultConfigBifrost(GPUTarget gpu)
+    : IClDWCNativeKernelConfig(gpu)
+{
+}
+
+DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+                                                                unsigned int depth_multiplier)
+{
+    using ConfigurationFunctionExecutorPtr = DWCComputeKernelInfo (ClDWCNativeDefaultConfigBifrost::*)(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+                                                                                                       unsigned int depth_multiplier);
+
+    ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G71(&ClDWCNativeDefaultConfigBifrost::configure_G71_f32,
+                                                                         &ClDWCNativeDefaultConfigBifrost::configure_G71_f16,
+                                                                         &ClDWCNativeDefaultConfigBifrost::configure_G7x_u8);
+
+    ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x(&ClDWCNativeDefaultConfigBifrost::configure_G7x_f32,
+                                                                         &ClDWCNativeDefaultConfigBifrost::configure_G7x_f16,
+                                                                         &ClDWCNativeDefaultConfigBifrost::configure_G7x_u8);
+
+    ConfigurationFunctionExecutorPtr func = nullptr;
+    switch(_target)
+    {
+        case GPUTarget::G71:
+            func = configs_G71.get_function(src->data_type());
+            break;
+        default:
+            func = configs_G7x.get_function(src->data_type());
+            break;
+    }
+
+    ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for depthwise convolution");
+    return (this->*func)(src, wei, conv_info, dilation, depth_multiplier);
+}
+
+DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G71_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+                                                                        unsigned int depth_multiplier)
+{
+    return configure_f32(src, wei, conv_info, dilation, depth_multiplier, true);
+}
+
+DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G71_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+                                                                        unsigned int depth_multiplier)
+{
+    return configure_f16(src, wei, conv_info, dilation, depth_multiplier, true);
+}
+
+DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+                                                                        unsigned int depth_multiplier)
+{
+    return configure_f32(src, wei, conv_info, dilation, depth_multiplier, false);
+}
+
+DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+                                                                        unsigned int depth_multiplier)
+{
+    return configure_f16(src, wei, conv_info, dilation, depth_multiplier, false);
+}
+
+DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+                                                                       unsigned int depth_multiplier)
+{
+    ARM_COMPUTE_UNUSED(wei);
+
+    DWCComputeKernelInfo desc;
+
+    if(src->data_layout() == DataLayout::NHWC)
+    {
+        desc.export_input_to_cl_image   = false;
+        desc.export_weights_to_cl_image = false;
+        desc.n0                         = (depth_multiplier == 1) ? 
4 : 1; + if(conv_info.stride().first == 1 && dilation.x() == 1 && depth_multiplier == 1) + { + desc.m0 = 2; + } + else + { + desc.m0 = 1; + } + } + + return desc; +} +} // namespace cl_dwc +} // namespace arm_compute diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h new file mode 100644 index 0000000000..cec2cae5dd --- /dev/null +++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEDEFAULTCONFIGBIFROST +#define SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEDEFAULTCONFIGBIFROST + +#include "src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h" + +namespace arm_compute +{ +namespace cl_dwc +{ +/** Bifrost based OpenCL depthwise convolution configuration */ +class ClDWCNativeDefaultConfigBifrost final : public IClDWCNativeKernelConfig +{ +public: + /** Constructor + * + * @param[in] gpu GPU target + */ + ClDWCNativeDefaultConfigBifrost(GPUTarget gpu); + + // Inherited overridden method + DWCComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, + unsigned int depth_multiplier) override; + +private: + DWCComputeKernelInfo configure_G71_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G71_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G7x_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G7x_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G7x_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, + unsigned int depth_multiplier); +}; +} // namespace cl_dwc +} // namespace arm_compute +#endif /* SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEDEFAULTCONFIGBIFROST */ diff --git 
a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp
new file mode 100644
index 0000000000..49485c83a9
--- /dev/null
+++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp
@@ -0,0 +1,306 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h"
+#include "src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/TensorShape.h"
+
+namespace arm_compute
+{
+namespace cl_dwc
+{
+ClDWCNativeDefaultConfigValhall::ClDWCNativeDefaultConfigValhall(GPUTarget gpu)
+    : IClDWCNativeKernelConfig(gpu)
+{
+}
+
+DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+                                                                unsigned int depth_multiplier)
+{
+    using ConfigurationFunctionExecutorPtr = DWCComputeKernelInfo (ClDWCNativeDefaultConfigValhall::*)(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+                                                                                                       unsigned int depth_multiplier);
+
+    ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G78(&ClDWCNativeDefaultConfigValhall::configure_G78_f32,
+                                                                         &ClDWCNativeDefaultConfigValhall::configure_G78_f16,
+                                                                         &ClDWCNativeDefaultConfigValhall::configure_G78_u8);
+
+    ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G77(&ClDWCNativeDefaultConfigValhall::configure_G78_f32,
+                                                                         &ClDWCNativeDefaultConfigValhall::configure_G77_f16,
+                                                                         &ClDWCNativeDefaultConfigValhall::configure_G78_u8);
+
+    ConfigurationFunctionExecutorPtr func = nullptr;
+    switch(_target)
+    {
+        case GPUTarget::G77:
+            func = configs_G77.get_function(src->data_type());
+            break;
+        case GPUTarget::G78:
+        default:
+            func = configs_G78.get_function(src->data_type());
+            break;
+    }
+
+    ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for depthwise convolution");
+    return (this->*func)(src, wei, conv_info, dilation, depth_multiplier);
+}
+
+DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+                                                                        unsigned int depth_multiplier)
+{
+    DWCComputeKernelInfo desc;
+
+    if(src->data_layout() == DataLayout::NHWC)
+    
{ + const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); + const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); + const TensorShape wei_shape = wei->tensor_shape(); + const size_t kernel_c = wei_shape[idx_c]; + const size_t kernel_w = wei_shape[idx_w]; + + desc.export_input_to_cl_image = false; + desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier); + + if(depth_multiplier == 1) + { + desc.n0 = 4; + } + else + { + if((depth_multiplier % 4) == 0) + { + desc.n0 = 4; + } + else if((depth_multiplier % 2) == 0) + { + desc.n0 = 2; + } + else + { + desc.n0 = 1; + } + } + + // Note: If we reduce n0, export to cl_image must be false + ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && (desc.export_weights_to_cl_image == true)); + + desc.n0 = adjust_vec_size(desc.n0, kernel_c); + + // Set m0 only if stride_x == 1 and dilation_x == 1 + if(conv_info.stride().first == 1 && dilation.x() == 1) + { + if((kernel_w >= 9) || (kernel_w == 1)) + { + desc.m0 = 1; + } + else + { + desc.m0 = 2; + } + } + else + { + desc.m0 = 1; + } + } + + return desc; +} + +DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, + unsigned int depth_multiplier) +{ + DWCComputeKernelInfo desc; + + if(src->data_layout() == DataLayout::NHWC) + { + // Src and weights have the same dimension indices + const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); + const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); + const TensorShape src_shape = src->tensor_shape(); + const TensorShape wei_shape = wei->tensor_shape(); + const size_t src_w = src_shape[idx_w]; + const size_t kernel_c = wei_shape[idx_c]; + const size_t kernel_w = wei_shape[idx_w]; + + desc.export_input_to_cl_image = false; + desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier); + + if(depth_multiplier == 1) + { + if(desc.export_weights_to_cl_image == false) + { + desc.n0 = 8; + } + else + { + desc.n0 = 4; + } + } + else + { + if((depth_multiplier % 4) == 0) + { + desc.n0 = 4; + } + else if((depth_multiplier % 2) == 0) + { + desc.n0 = 2; + } + else + { + desc.n0 = 1; + } + } + + // Note: If we reduce n0, export to cl_image must be false + ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && (desc.export_weights_to_cl_image == true)); + + desc.n0 = adjust_vec_size(desc.n0, kernel_c); + + // Set m0 only if stride_x == 1 and dilation_x == 1 + if(conv_info.stride().first == 1 && dilation.x() == 1) + { + if((kernel_w >= 9) || (kernel_w == 1)) + { + desc.m0 = 1; + } + else + { + if((src_w % 5) == 0) + { + desc.m0 = 5; + } + else + { + desc.m0 = 4; + } + } + } + else + { + desc.m0 = 1; + } + } + + return desc; +} + +DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, + unsigned int depth_multiplier) +{ + ARM_COMPUTE_UNUSED(wei); + + DWCComputeKernelInfo desc; + + if(src->data_layout() == DataLayout::NHWC) + { + desc.export_input_to_cl_image = false; + desc.export_weights_to_cl_image = false; + desc.n0 = (depth_multiplier == 1) ? 
4 : 1; + if(conv_info.stride().first == 1 && dilation.x() == 1 && depth_multiplier == 1) + { + desc.m0 = 2; + } + else + { + desc.m0 = 1; + } + } + + return desc; +} + +DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G77_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, + unsigned int depth_multiplier) +{ + DWCComputeKernelInfo desc; + + if(src->data_layout() == DataLayout::NHWC) + { + const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); + const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); + const TensorShape wei_shape = wei->tensor_shape(); + const size_t kernel_c = wei_shape[idx_c]; + const size_t kernel_w = wei_shape[idx_w]; + + desc.export_input_to_cl_image = false; + desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier); + + if(depth_multiplier == 1) + { + if(desc.export_weights_to_cl_image == false) + { + desc.n0 = 8; + } + else + { + desc.n0 = 4; + } + } + else + { + if((depth_multiplier % 4) == 0) + { + desc.n0 = 4; + } + else if((depth_multiplier % 2) == 0) + { + desc.n0 = 2; + } + else + { + desc.n0 = 1; + } + } + + // Note: If we reduce n0, export to cl_image must be false + ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && (desc.export_weights_to_cl_image == true)); + + desc.n0 = adjust_vec_size(desc.n0, kernel_c); + + // Set m0 only if stride_x == 1 and dilation_x == 1 + if(conv_info.stride().first == 1 && dilation.x() == 1) + { + if((kernel_w >= 9) || (kernel_w == 1)) + { + desc.m0 = 1; + } + else + { + desc.m0 = 2; + } + } + else + { + desc.m0 = 1; + } + } + + return desc; +} +} // namespace cl_dwc +} // namespace arm_compute diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h new file mode 100644 index 0000000000..4d51fa668c --- /dev/null +++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEDEFAULTCONFIGVALHALL +#define SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEDEFAULTCONFIGVALHALL + +#include "src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h" + +namespace arm_compute +{ +namespace cl_dwc +{ +/** Valhall based OpenCL depthwise convolution configuration */ +class ClDWCNativeDefaultConfigValhall final : public IClDWCNativeKernelConfig +{ +public: + /** Constructor + * + * @param[in] gpu GPU target + */ + ClDWCNativeDefaultConfigValhall(GPUTarget gpu); + + // Inherited overridden method + DWCComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, + unsigned int depth_multiplier) override; + +private: + DWCComputeKernelInfo configure_G78_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G78_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G78_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G77_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, + unsigned int depth_multiplier); +}; +} // namespace cl_dwc +} // namespace arm_compute +#endif /* SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEDEFAULTCONFIGVALHALL */ diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp b/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp new file mode 100644 index 0000000000..5593c6de61 --- /dev/null +++ b/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/TensorShape.h" + +namespace arm_compute +{ +namespace cl_dwc +{ +bool use_cl_image_for_weights(const ITensorInfo *weights, unsigned int depth_multiplier) +{ + // Check whether we can use the cl image with the weights. 
+    if(!export_to_cl_image(weights))
+    {
+        return false;
+    }
+
+    const size_t idx_w    = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_h    = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT);
+    const size_t kernel_w = weights->tensor_shape()[idx_w];
+    const size_t kernel_h = weights->tensor_shape()[idx_h];
+
+    // Even when the weights could be exported to cl_image, we prefer the cl buffer storage in the following cases for performance reasons:
+    // 1- When the kernel size is 1x1
+    // 2- When the depth multiplier is greater than 1 and not a multiple of 4.
+    if((kernel_w == 1) && (kernel_h == 1))
+    {
+        return false;
+    }
+
+    if((depth_multiplier > 1) && (depth_multiplier % 4) != 0)
+    {
+        return false;
+    }
+
+    return true;
+}
+} // namespace cl_dwc
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h b/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h
new file mode 100644
index 0000000000..e3484c04ff
--- /dev/null
+++ b/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEHEURISTICSHELPERS
+#define SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEHEURISTICSHELPERS
+
+namespace arm_compute
+{
+// Forward declaration
+class ITensorInfo;
+
+namespace cl_dwc
+{
+/** Utility function to determine whether the cl image storage can be used for the weights of depthwise convolution to get better performance
+ *
+ * @param[in] weights          Weights TensorInfo of the depthwise convolution
+ * @param[in] depth_multiplier Depth multiplier
+ *
+ * @return true if the weights of depthwise convolution can be kept in the cl image storage to improve the performance
+ */
+bool use_cl_image_for_weights(const ITensorInfo *weights, unsigned int depth_multiplier);
+
+} // namespace cl_dwc
+} // namespace arm_compute
+#endif /* SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEHEURISTICSHELPERS */
diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h b/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h
new file mode 100644
index 0000000000..c08053dcb3
--- /dev/null
+++ b/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEKERNELCONFIG
+#define SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEKERNELCONFIG
+
+#include "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h"
+#include "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h"
+#include "src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cl_dwc
+{
+/** ClDWCNativeKernelConfigurationFactory factory class */
+class ClDWCNativeKernelConfigurationFactory final
+{
+public:
+    /** Static method to construct the ClDWCNative kernel configuration class for the given GPU target
+     *
+     * @param[in] gpu GPU target
+     *
+     * @return an IClDWCNativeKernelConfig instance
+     */
+    static std::unique_ptr<IClDWCNativeKernelConfig> create(GPUTarget gpu)
+    {
+        switch(get_arch_from_target(gpu))
+        {
+            case GPUTarget::MIDGARD:
+                // The heuristic for Midgard is the same as the one used for Arm Mali-G71
+                return std::make_unique<ClDWCNativeDefaultConfigBifrost>(GPUTarget::G71);
+            case GPUTarget::BIFROST:
+                return std::make_unique<ClDWCNativeDefaultConfigBifrost>(gpu);
+            case GPUTarget::VALHALL:
+                return std::make_unique<ClDWCNativeDefaultConfigValhall>(gpu);
+            default:
+                ARM_COMPUTE_ERROR("Unsupported GPU target");
+        }
+    }
+};
+} // namespace cl_dwc
+} // namespace arm_compute
+#endif /* SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEKERNELCONFIG */
diff --git a/src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h b/src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h
new file mode 100644
index 0000000000..b5df132a12
--- /dev/null
+++ b/src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_RUNTIME_HEURISTICS_DWC_NATIVE_ICLDWCNATIVEKERNELCONFIG
+#define SRC_RUNTIME_HEURISTICS_DWC_NATIVE_ICLDWCNATIVEKERNELCONFIG
+
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/Types.h"
+#include "src/core/common/Macros.h"
+
+namespace arm_compute
+{
+namespace cl_dwc
+{
+/** Basic container for the OpenCL depthwise convolution configuration functions */
+template <typename T>
+class ClDWCNativeConfigArray
+{
+public:
+    /** Alias for F32 index */
+    static constexpr size_t DT_F32 = 0;
+    /** Alias for F16 index */
+    static constexpr size_t DT_F16 = 1;
+    /** Alias for Int8 index */
+    static constexpr size_t DT_INT8 = 2;
+
+    /** Constructor
+     *
+     * @param[in] func_f32  Function to call for depthwise convolution F32
+     * @param[in] func_f16  Function to call for depthwise convolution F16
+     * @param[in] func_int8 Function to call for depthwise convolution Int8 (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL)
+     *
+     */
+    ClDWCNativeConfigArray(T func_f32, T func_f16, T func_int8)
+        : _configs{ func_f32, func_f16, func_int8 }
+    {
+    }
+
+    /** Method to return the depthwise convolution configuration function based on data type
+     *
+     * @param[in] data_type Input data type
+     *
+     * @return the configuration function for the given data type, or nullptr if the data type is not supported
+     */
+    T get_function(DataType data_type)
+    {
+        switch(data_type)
+        {
+            case DataType::F32:
+                return _configs.at(DT_F32);
+            case DataType::F16:
+                return _configs.at(DT_F16);
+            case DataType::QASYMM8:
+            case DataType::QASYMM8_SIGNED:
+            case DataType::QSYMM8_PER_CHANNEL:
+                return _configs.at(DT_INT8);
+            default:
+                return nullptr;
+        }
+    }
+
+private:
+    std::array<T, 3> _configs;
+};
+
+/** Basic interface for the depthwise convolution kernel configuration */
+class IClDWCNativeKernelConfig
+{
+public:
+    /** Constructor
+     *
+     * @param[in] arch GPU target
+     */
+    IClDWCNativeKernelConfig(GPUTarget arch)
+        : _target(arch)
+    {
+    }
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClDWCNativeKernelConfig);
+    /** Virtual destructor */
+    virtual ~IClDWCNativeKernelConfig() = default;
+    /** This method returns the @ref DWCComputeKernelInfo for the given inputs
+     *
+     * @param[in] src              Source tensor (activation tensor)
+     * @param[in] wei              Weights tensor
+     * @param[in] conv_info        Convolution info
+     * @param[in] dilation         Kernel dilation
+     * @param[in] depth_multiplier Output feature maps multiplier
+     */
+    virtual DWCComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+                                           unsigned int depth_multiplier) = 0;
+
+protected:
+    GPUTarget _target;
+};
+} // namespace cl_dwc
+} // namespace arm_compute
+#endif /* SRC_RUNTIME_HEURISTICS_DWC_NATIVE_ICLDWCNATIVEKERNELCONFIG */
diff --git a/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp
index 5d3dbf3146..990f050112 100644
--- a/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp
+++ 
b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp @@ -157,5 +157,5 @@ DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure_G77_f1 return desc; } -} // namespace opencl +} // namespace cl_indirect_conv } // namespace arm_compute diff --git a/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h index cd9a6a5c37..68dca91885 100644 --- a/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h +++ b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h @@ -47,6 +47,6 @@ private: DirectConvComputeKernelInfo configure_G77_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); DirectConvComputeKernelInfo configure_G77_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); }; -} // namespace opencl +} // namespace cl_indirect_conv } // namespace arm_compute #endif /* SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_CLINDIRECTCONVDEFAULTCONFIGVALHALL */ diff --git a/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h b/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h index 13716efb5f..73fbb87560 100644 --- a/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h +++ b/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h @@ -56,6 +56,6 @@ public: } } }; -} // namespace opencl +} // namespace cl_indirect_conv } // namespace arm_compute #endif /* SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_CLINDIRECTCONVKERNELCONFIG */ diff --git a/src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h b/src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h index e722488c3b..d2f4cde662 100644 --- a/src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h +++ b/src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h @@ -45,8 +45,8 @@ public: /** Constructor * - * @param[in] func_f32 Function to call for direct convolution F32 - * @param[in] func_f16 Function to call for direct convolution F16 + * @param[in] func_f32 Function to call for indirect convolution F32 + * @param[in] func_f16 Function to call for indirect convolution F16 * */ ClIndirectConvConfigArray(T func_f32, T func_f16) @@ -103,6 +103,6 @@ public: protected: GPUTarget _target; }; -} // namespace opencl +} // namespace cl_indirect_conv } // namespace arm_compute #endif /* SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_ICLINDIRECTCONVKERNELCONFIG */ -- cgit v1.2.1
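
Usage note: the snippet below is a minimal sketch, not part of the patch, showing how the relocated heuristic is queried after this change. The factory and the configure() signature come from the headers added above; the tensor shapes, the G77 target, and the function name dwc_heuristic_example are illustrative assumptions.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h"

using namespace arm_compute;

void dwc_heuristic_example()
{
    // Illustrative NHWC FP16 tensors: 64-channel 56x56 input, 3x3 depthwise weights
    TensorInfo src(TensorShape(64U, 56U, 56U), 1, DataType::F16);
    TensorInfo wei(TensorShape(64U, 3U, 3U), 1, DataType::F16);
    src.set_data_layout(DataLayout::NHWC);
    wei.set_data_layout(DataLayout::NHWC);

    const PadStrideInfo conv_info(1, 1, 1, 1); // stride 1, pad 1
    const Size2D        dilation(1, 1);

    // Pick the per-architecture heuristic and query the kernel compute parameters
    auto                       heuristic = cl_dwc::ClDWCNativeKernelConfigurationFactory::create(GPUTarget::G77);
    const DWCComputeKernelInfo info      = heuristic->configure(&src, &wei, conv_info, dilation, /* depth_multiplier */ 1U);
    // info.n0 / info.m0 now hold the block sizes chosen for this GPU
    // (e.g. the 4x2 block this patch introduces for FP16 on G77)
}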