author    Gian Marco Iodice <gianmarco.iodice@arm.com>    2022-12-30 09:45:00 +0000
committer Gian Marco Iodice <gianmarco.iodice@arm.com>    2022-12-30 13:33:34 +0000
commit    9d3bd41030366326e9c8afe5db3a5812a76b135b (patch)
tree      e2ee02632bca1a8f671043b2b6ee1f35f8f9c9d5
parent    b7e8626717b2ef81b0d03284c8f6ffdbe9cd2245 (diff)
Move DWC native heuristic into the heuristic folder
- Move the DWC native heuristic from CLDepthwiseConvolutionLayer to heuristic/
- Update the heuristic for Arm® Mali™-G77. Use a smaller block size (4x2) for Fp16
- Call the new heuristic in GpuDepthwiseConv2d

Resolves COMPMID-5798

Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Change-Id: I6bfd30cea76bea2e98202a7a5c1d51709f3382a4
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8889
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
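The heuristic is now reached through a factory rather than through file-local helpers. A minimal sketch of the call pattern this patch introduces, assuming the headers below; the wrapper function itself is illustrative only and not part of the patch:

    #include "src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h"
    #include "src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h"

    // Illustrative wrapper: ask the per-GPU heuristic for the DWC native kernel configuration.
    arm_compute::DWCComputeKernelInfo
    get_dwc_config(const arm_compute::ITensorInfo *src, const arm_compute::ITensorInfo *wei,
                   const arm_compute::PadStrideInfo &conv_info, const arm_compute::Size2D &dilation,
                   unsigned int depth_multiplier, arm_compute::GPUTarget gpu_target)
    {
        // The factory selects the Bifrost or Valhall heuristic from the GPU architecture;
        // Midgard targets fall back to the G71 (Bifrost) heuristic.
        auto config = arm_compute::cl_dwc::ClDWCNativeKernelConfigurationFactory::create(gpu_target);
        return config->configure(src, wei, conv_info, dilation, depth_multiplier);
    }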
-rw-r--r--  Android.bp                                                                     3
-rw-r--r--  arm_compute/core/KernelDescriptors.h                                           4
-rw-r--r--  filelist.json                                                                  3
-rw-r--r--  src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp               129
-rw-r--r--  src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp                     151
-rw-r--r--  src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp        279
-rw-r--r--  src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h           61
-rw-r--r--  src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp        306
-rw-r--r--  src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h           59
-rw-r--r--  src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp            61
-rw-r--r--  src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h              45
-rw-r--r--  src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h                   65
-rw-r--r--  src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h                 118
-rw-r--r--  src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp    2
-rw-r--r--  src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h      2
-rw-r--r--  src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h              2
-rw-r--r--  src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h             6
17 files changed, 1036 insertions, 260 deletions
diff --git a/Android.bp b/Android.bp
index bf6ee147f6..ec7bccd819 100644
--- a/Android.bp
+++ b/Android.bp
@@ -957,6 +957,9 @@ cc_library_static {
"src/runtime/Utils.cpp",
"src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.cpp",
"src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.cpp",
+ "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp",
+ "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp",
+ "src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp",
"src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp",
"utils/CommonGraphOptions.cpp",
"utils/GraphUtils.cpp",
diff --git a/arm_compute/core/KernelDescriptors.h b/arm_compute/core/KernelDescriptors.h
index cacbef25ea..4a64032b14 100644
--- a/arm_compute/core/KernelDescriptors.h
+++ b/arm_compute/core/KernelDescriptors.h
@@ -104,8 +104,8 @@ struct GEMMKernelInfo
/** Compute descriptor used by the depthwise convolution native kernel */
struct DWCComputeKernelInfo
{
- unsigned int n0{ 0 }; /**< Number of columns processed by each thread */
- unsigned int m0{ 0 }; /**< Number of rows processed by each thread */
+ unsigned int n0{ 1 }; /**< Number of columns processed by each thread */
+ unsigned int m0{ 1 }; /**< Number of rows processed by each thread */
bool export_input_to_cl_image{ false }; /**< Export input to cl_image */
bool export_weights_to_cl_image{ false }; /**< Export the weights to cl_image */
};
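The hunk above changes the n0/m0 defaults from 0 to 1, so a default-constructed descriptor now describes a valid one-element-per-thread configuration rather than a degenerate zero block size. A minimal sketch of the effect (the variable name is illustrative):

    arm_compute::DWCComputeKernelInfo info{};
    // With this patch: info.n0 == 1 and info.m0 == 1 (one column/row per thread),
    // and both cl_image export flags remain false.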
diff --git a/filelist.json b/filelist.json
index 5828b43ec1..7bc47f7a5c 100644
--- a/filelist.json
+++ b/filelist.json
@@ -502,6 +502,9 @@
"src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp",
"src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.cpp",
"src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.cpp",
+ "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp",
+ "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp",
+ "src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp",
"src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp"
]
}
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp
index caccbb1830..b08af61d8f 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp
@@ -31,6 +31,8 @@
#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h"
#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+#include "src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h"
+#include "src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h"
namespace arm_compute
{
@@ -40,115 +42,6 @@ namespace dynamic_fusion
{
namespace
{
-bool export_weights_to_cl_image_heuristic(const ITensorInfo *weights, unsigned int depth_multiplier, GPUTarget gpu_target)
-{
- if(!export_to_cl_image(weights))
- {
- return false;
- }
-
- const size_t idx_w = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH);
- const size_t idx_h = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT);
- const size_t kernel_w = weights->tensor_shape()[idx_w];
- const size_t kernel_h = weights->tensor_shape()[idx_h];
-
- if(gpu_target == GPUTarget::G71 || get_arch_from_target(gpu_target) == GPUTarget::MIDGARD)
- {
- return false;
- }
-
- if((kernel_w == 1) && (kernel_h == 1))
- {
- return false;
- }
-
- if(depth_multiplier > 1)
- {
- if((depth_multiplier % 4) != 0)
- {
- return false;
- }
- }
-
- return true;
-}
-
-void initialize_dwc_native_compute_info(DWCComputeKernelInfo &dwc_compute_info, const ITensorInfo *input, const ITensorInfo *weights,
- const DepthwiseConv2dAttributes &attributes, const GPUTarget gpu_target)
-{
- const unsigned int depth_multiplier = attributes.depth_multiplier();
-
- // Floating point path
- // First check if we can export to cl_image.
- dwc_compute_info.export_input_to_cl_image = false;
- dwc_compute_info.export_weights_to_cl_image = export_weights_to_cl_image_heuristic(weights, depth_multiplier, gpu_target);
-
- // Set n0
- if(depth_multiplier == 1)
- {
- if(dwc_compute_info.export_weights_to_cl_image == false && weights->data_type() == DataType::F16)
- {
- dwc_compute_info.n0 = 8;
- }
- else
- {
- dwc_compute_info.n0 = 4;
- }
- }
- else
- {
- if((depth_multiplier % 4) == 0)
- {
- dwc_compute_info.n0 = 4;
- }
- else if((depth_multiplier % 2) == 0)
- {
- dwc_compute_info.n0 = 2;
- }
- else
- {
- dwc_compute_info.n0 = 1;
- }
- }
-
- dwc_compute_info.n0 = adjust_vec_size(dwc_compute_info.n0, weights->dimension(0));
-
- // Set m0 only if stride_x == 1 and dilation_x == 1
- if(attributes.stride().x() == 1 && attributes.dilation().x() == 1)
- {
- const size_t idx_w = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH);
- const size_t kernel_w = weights->tensor_shape()[idx_w];
-
- if((kernel_w >= 9) || (kernel_w == 1))
- {
- dwc_compute_info.m0 = 1;
- }
- else
- {
- if(weights->data_type() == DataType::F16)
- {
- if((input->dimension(1) % 5) == 0)
- {
- dwc_compute_info.m0 = 5;
- }
- else
- {
- dwc_compute_info.m0 = 4;
- }
- }
- else
- {
- dwc_compute_info.m0 = 2;
- }
- }
- }
- else
- {
- dwc_compute_info.m0 = 1;
- }
- return;
-}
-
void calculate_and_init_dst_if_empty(ITensorInfo *dst, const ITensorInfo *src, const ITensorInfo *wei, const DepthwiseConv2dAttributes &attributes)
{
if(dst->total_size() == 0U)
@@ -202,8 +95,13 @@ Status GpuDepthwiseConv2d::is_supported_op(const GpuWorkloadContext &cont
const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
auto settings = ClComponentDepthwiseConv2d::Settings();
- DWCComputeKernelInfo dwc_info;
- initialize_dwc_native_compute_info(dwc_info, src, wei, attributes, gpu_target);
+ const PadStrideInfo legacy_conv_info(attributes.stride().x(), attributes.stride().y(), attributes.pad().left,
+ attributes.pad().right,
+ attributes.pad().top, attributes.pad().bottom, DimensionRoundingType::FLOOR);
+
+ // Get the depthwise convolution compute parameters
+ auto t = arm_compute::cl_dwc::ClDWCNativeKernelConfigurationFactory::create(gpu_target);
+ const DWCComputeKernelInfo dwc_info = t->configure(src, wei, legacy_conv_info, attributes.dilation(), attributes.depth_multiplier());
settings.fast_relaxed_math(
(gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST)
@@ -294,8 +192,13 @@ void GpuDepthwiseConv2d::create_op(GpuWorkloadSketch &sketch,
const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
auto settings = ClComponentDepthwiseConv2d::Settings();
- DWCComputeKernelInfo dwc_info;
- initialize_dwc_native_compute_info(dwc_info, src, wei, attributes, gpu_target);
+ const PadStrideInfo legacy_conv_info(attributes.stride().x(), attributes.stride().y(), attributes.pad().left,
+ attributes.pad().right,
+ attributes.pad().top, attributes.pad().bottom, DimensionRoundingType::FLOOR);
+
+ // Get the depthwise convolution compute parameters
+ auto t = arm_compute::cl_dwc::ClDWCNativeKernelConfigurationFactory::create(gpu_target);
+ const DWCComputeKernelInfo dwc_info = t->configure(src, wei, legacy_conv_info, attributes.dilation(), attributes.depth_multiplier());
settings.is_fma_available(get_arch_from_target(gpu_target) != GPUTarget::MIDGARD)
.m0(dwc_info.m0)
diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
index 3eadaee0de..3909c15352 100644
--- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
@@ -23,15 +23,15 @@
*/
#include "arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h"
-#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h"
+#include "src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h"
+#include "src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h"
#include "src/common/utils/Log.h"
@@ -39,137 +39,7 @@ namespace arm_compute
{
using namespace arm_compute::misc;
using namespace arm_compute::misc::shape_calculator;
-
-namespace
-{
-bool export_weights_to_cl_image_heuristic(const ITensorInfo *weights, unsigned int depth_multiplier, GPUTarget gpu_target)
-{
- if(!export_to_cl_image(weights))
- {
- return false;
- }
-
- const size_t idx_w = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH);
- const size_t idx_h = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT);
- const size_t kernel_w = weights->tensor_shape()[idx_w];
- const size_t kernel_h = weights->tensor_shape()[idx_h];
-
- if(gpu_target == GPUTarget::G71 || get_arch_from_target(gpu_target) == GPUTarget::MIDGARD)
- {
- return false;
- }
-
- if((kernel_w == 1) && (kernel_h == 1))
- {
- return false;
- }
-
- if(depth_multiplier > 1)
- {
- if((depth_multiplier % 4) != 0)
- {
- return false;
- }
- }
-
- return true;
-}
-
-void initialize_dwc_native_compute_info(DWCComputeKernelInfo &dwc_compute_info, const ITensorInfo *input, const ITensorInfo *weights, const PadStrideInfo &conv_info, const Size2D &dilation,
- unsigned int depth_multiplier,
- GPUTarget gpu_target)
-{
- ARM_COMPUTE_UNUSED(input);
-
- if(!is_data_type_float(weights->data_type()))
- {
- dwc_compute_info.export_weights_to_cl_image = false;
- dwc_compute_info.n0 = (depth_multiplier == 1) ? 4 : 1;
- if(conv_info.stride().first == 1 && dilation.x() == 1 && depth_multiplier == 1)
- {
- dwc_compute_info.m0 = 2;
- }
- else
- {
- dwc_compute_info.m0 = 1;
- }
-
- return;
- }
-
- // Floating point path
-
- // First check if we can export to cl_image.
- dwc_compute_info.export_input_to_cl_image = false;
- dwc_compute_info.export_weights_to_cl_image = export_weights_to_cl_image_heuristic(weights, depth_multiplier, gpu_target);
-
- // Set n0
- if(depth_multiplier == 1)
- {
- if(dwc_compute_info.export_weights_to_cl_image == false && weights->data_type() == DataType::F16)
- {
- dwc_compute_info.n0 = 8;
- }
- else
- {
- dwc_compute_info.n0 = 4;
- }
- }
- else
- {
- if((depth_multiplier % 4) == 0)
- {
- dwc_compute_info.n0 = 4;
- }
- else if((depth_multiplier % 2) == 0)
- {
- dwc_compute_info.n0 = 2;
- }
- else
- {
- dwc_compute_info.n0 = 1;
- }
- }
-
- dwc_compute_info.n0 = adjust_vec_size(dwc_compute_info.n0, weights->dimension(0));
-
- // Set m0 only if stride_x == 1 and dilation_x == 1
- if(conv_info.stride().first == 1 && dilation.x() == 1)
- {
- const size_t idx_w = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH);
- const size_t kernel_w = weights->tensor_shape()[idx_w];
-
- if((kernel_w >= 9) || (kernel_w == 1))
- {
- dwc_compute_info.m0 = 1;
- }
- else
- {
- if(weights->data_type() == DataType::F16)
- {
- if((input->dimension(1) % 5) == 0)
- {
- dwc_compute_info.m0 = 5;
- }
- else
- {
- dwc_compute_info.m0 = 4;
- }
- }
- else
- {
- dwc_compute_info.m0 = 2;
- }
- }
- }
- else
- {
- dwc_compute_info.m0 = 1;
- }
- return;
-}
-
-} // namespace
+using namespace arm_compute::cl_dwc;
CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)),
@@ -261,8 +131,9 @@ void CLDepthwiseConvolutionLayer::configure(const CLCompileContext &compile_cont
output_shifts_to_use = &_output_shifts;
}
- DWCComputeKernelInfo dwc_native_compute_info;
- initialize_dwc_native_compute_info(dwc_native_compute_info, input->info(), weights_to_use->info(), conv_info, dilation, depth_multiplier, gpu_target);
+ // Get the depthwise convolution compute parameters
+ auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target);
+ const DWCComputeKernelInfo dwc_native_compute_info = t->configure(input_to_use->info(), weights_to_use->info(), conv_info, dilation, depth_multiplier);
const ConvolutionInfo conv_kernel_info{ conv_info, depth_multiplier, act_info, dilation };
@@ -346,8 +217,9 @@ Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITe
ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(input, &permuted_input, PermutationVector(2U, 0U, 1U)));
ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(weights, &permuted_weights, PermutationVector(2U, 0U, 1U)));
- DWCComputeKernelInfo dwc_native_compute_info;
- initialize_dwc_native_compute_info(dwc_native_compute_info, input, &permuted_weights, conv_info, dilation, depth_multiplier, gpu_target);
+ // Get the depthwise convolution compute parameters
+ auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target);
+ const DWCComputeKernelInfo dwc_native_compute_info = t->configure(&permuted_input, &permuted_weights, conv_info, dilation, depth_multiplier);
ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output,
dwc_native_compute_info, conv_kernel_info, &output_multipliers_shifts_info, &output_multipliers_shifts_info));
@@ -355,8 +227,9 @@ Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITe
}
else
{
- DWCComputeKernelInfo dwc_native_compute_info;
- initialize_dwc_native_compute_info(dwc_native_compute_info, input, weights, conv_info, dilation, depth_multiplier, gpu_target);
+ // Get the depthwise convolution compute parameters
+ auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target);
+ const DWCComputeKernelInfo dwc_native_compute_info = t->configure(input, weights, conv_info, dilation, depth_multiplier);
ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate(input, weights, biases, output, dwc_native_compute_info, conv_kernel_info, &output_multipliers_shifts_info,
&output_multipliers_shifts_info));
}
diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp
new file mode 100644
index 0000000000..f55685ee49
--- /dev/null
+++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp
@@ -0,0 +1,279 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h"
+#include "src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/TensorShape.h"
+
+namespace arm_compute
+{
+namespace cl_dwc
+{
+namespace
+{
+DWCComputeKernelInfo configure_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+ unsigned int depth_multiplier, bool is_g71)
+{
+ DWCComputeKernelInfo desc;
+
+ if(src->data_layout() == DataLayout::NHWC)
+ {
+ const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL);
+ const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH);
+ const TensorShape wei_shape = wei->tensor_shape();
+ const size_t kernel_c = wei_shape[idx_c];
+ const size_t kernel_w = wei_shape[idx_w];
+
+ desc.export_input_to_cl_image = false;
+
+ if(is_g71)
+ {
+ desc.export_weights_to_cl_image = false;
+ }
+ else
+ {
+ desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier);
+ }
+
+ if(depth_multiplier == 1)
+ {
+ desc.n0 = 4;
+ }
+ else
+ {
+ if((depth_multiplier % 4) == 0)
+ {
+ desc.n0 = 4;
+ }
+ else if((depth_multiplier % 2) == 0)
+ {
+ desc.n0 = 2;
+ }
+ else
+ {
+ desc.n0 = 1;
+ }
+ }
+
+ // Note: If we reduce n0, export to cl_image must be false
+ ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && (desc.export_weights_to_cl_image == true));
+
+ desc.n0 = adjust_vec_size(desc.n0, kernel_c);
+
+ // Set m0 only if stride_x == 1 and dilation_x == 1
+ if(conv_info.stride().first == 1 && dilation.x() == 1)
+ {
+ if((kernel_w >= 9) || (kernel_w == 1))
+ {
+ desc.m0 = 1;
+ }
+ else
+ {
+ desc.m0 = 2;
+ }
+ }
+ else
+ {
+ desc.m0 = 1;
+ }
+ }
+
+ return desc;
+}
+
+DWCComputeKernelInfo configure_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+ unsigned int depth_multiplier, bool is_g71)
+{
+ DWCComputeKernelInfo desc;
+
+ if(src->data_layout() == DataLayout::NHWC)
+ {
+ // Src and weights have the same dimension indices
+ const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL);
+ const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH);
+ const TensorShape src_shape = src->tensor_shape();
+ const TensorShape wei_shape = wei->tensor_shape();
+ const size_t src_w = src_shape[idx_w];
+ const size_t kernel_c = wei_shape[idx_c];
+ const size_t kernel_w = wei_shape[idx_w];
+
+ desc.export_input_to_cl_image = false;
+
+ if(is_g71)
+ {
+ desc.export_weights_to_cl_image = false;
+ }
+ else
+ {
+ desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier);
+ }
+
+ if(depth_multiplier == 1)
+ {
+ if(desc.export_weights_to_cl_image == false)
+ {
+ desc.n0 = 8;
+ }
+ else
+ {
+ desc.n0 = 4;
+ }
+ }
+ else
+ {
+ if((depth_multiplier % 4) == 0)
+ {
+ desc.n0 = 4;
+ }
+ else if((depth_multiplier % 2) == 0)
+ {
+ desc.n0 = 2;
+ }
+ else
+ {
+ desc.n0 = 1;
+ }
+ }
+
+ // Note: If we reduce n0, export to cl_image must be false
+ ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && (desc.export_weights_to_cl_image == true));
+
+ desc.n0 = adjust_vec_size(desc.n0, kernel_c);
+
+ // Set m0 only if stride_x == 1 and dilation_x == 1
+ if(conv_info.stride().first == 1 && dilation.x() == 1)
+ {
+ if((kernel_w >= 9) || (kernel_w == 1))
+ {
+ desc.m0 = 1;
+ }
+ else
+ {
+ if((src_w % 5) == 0)
+ {
+ desc.m0 = 5;
+ }
+ else
+ {
+ desc.m0 = 4;
+ }
+ }
+ }
+ else
+ {
+ desc.m0 = 1;
+ }
+ }
+
+ return desc;
+}
+} // namespace
+
+ClDWCNativeDefaultConfigBifrost::ClDWCNativeDefaultConfigBifrost(GPUTarget gpu)
+ : IClDWCNativeKernelConfig(gpu)
+{
+}
+
+DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+ unsigned int depth_multiplier)
+{
+ using ConfigurationFunctionExecutorPtr = DWCComputeKernelInfo (ClDWCNativeDefaultConfigBifrost::*)(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+ unsigned int depth_multiplier);
+
+ ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G71(&ClDWCNativeDefaultConfigBifrost::configure_G71_f32,
+ &ClDWCNativeDefaultConfigBifrost::configure_G71_f16,
+ &ClDWCNativeDefaultConfigBifrost::configure_G7x_u8);
+
+ ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x(&ClDWCNativeDefaultConfigBifrost::configure_G7x_f32,
+ &ClDWCNativeDefaultConfigBifrost::configure_G7x_f16,
+ &ClDWCNativeDefaultConfigBifrost::configure_G7x_u8);
+
+ ConfigurationFunctionExecutorPtr func = nullptr;
+ switch(_target)
+ {
+ case GPUTarget::G71:
+ func = configs_G71.get_function(src->data_type());
+ break;
+ default:
+ func = configs_G7x.get_function(src->data_type());
+ break;
+ }
+
+ ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for depthwise convolution");
+ return (this->*func)(src, wei, conv_info, dilation, depth_multiplier);
+}
+
+DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G71_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+ unsigned int depth_multiplier)
+{
+ return configure_f32(src, wei, conv_info, dilation, depth_multiplier, true);
+}
+
+DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G71_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+ unsigned int depth_multiplier)
+{
+ return configure_f16(src, wei, conv_info, dilation, depth_multiplier, true);
+}
+
+DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+ unsigned int depth_multiplier)
+{
+ return configure_f32(src, wei, conv_info, dilation, depth_multiplier, false);
+}
+
+DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+ unsigned int depth_multiplier)
+{
+ return configure_f16(src, wei, conv_info, dilation, depth_multiplier, false);
+}
+
+DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+ unsigned int depth_multiplier)
+{
+ ARM_COMPUTE_UNUSED(wei);
+
+ DWCComputeKernelInfo desc;
+
+ if(src->data_layout() == DataLayout::NHWC)
+ {
+ desc.export_input_to_cl_image = false;
+ desc.export_weights_to_cl_image = false;
+ desc.n0 = (depth_multiplier == 1) ? 4 : 1;
+ if(conv_info.stride().first == 1 && dilation.x() == 1 && depth_multiplier == 1)
+ {
+ desc.m0 = 2;
+ }
+ else
+ {
+ desc.m0 = 1;
+ }
+ }
+
+ return desc;
+}
+} // namespace cl_dwc
+} // namespace arm_compute
diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h
new file mode 100644
index 0000000000..cec2cae5dd
--- /dev/null
+++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEDEFAULTCONFIGBIFROST
+#define SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEDEFAULTCONFIGBIFROST
+
+#include "src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h"
+
+namespace arm_compute
+{
+namespace cl_dwc
+{
+/** Bifrost based OpenCL depthwise convolution configuration */
+class ClDWCNativeDefaultConfigBifrost final : public IClDWCNativeKernelConfig
+{
+public:
+ /** Constructor
+ *
+ * @param[in] gpu GPU target
+ */
+ ClDWCNativeDefaultConfigBifrost(GPUTarget gpu);
+
+ // Inherited overridden method
+ DWCComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+ unsigned int depth_multiplier) override;
+
+private:
+ DWCComputeKernelInfo configure_G71_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+ unsigned int depth_multiplier);
+ DWCComputeKernelInfo configure_G71_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+ unsigned int depth_multiplier);
+ DWCComputeKernelInfo configure_G7x_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+ unsigned int depth_multiplier);
+ DWCComputeKernelInfo configure_G7x_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+ unsigned int depth_multiplier);
+ DWCComputeKernelInfo configure_G7x_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+ unsigned int depth_multiplier);
+};
+} // namespace cl_dwc
+} // namespace arm_compute
+#endif /* SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEDEFAULTCONFIGBIFROST */
diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp
new file mode 100644
index 0000000000..49485c83a9
--- /dev/null
+++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp
@@ -0,0 +1,306 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h"
+#include "src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/TensorShape.h"
+
+namespace arm_compute
+{
+namespace cl_dwc
+{
+ClDWCNativeDefaultConfigValhall::ClDWCNativeDefaultConfigValhall(GPUTarget gpu)
+ : IClDWCNativeKernelConfig(gpu)
+{
+}
+
+DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+ unsigned int depth_multiplier)
+{
+ using ConfigurationFunctionExecutorPtr = DWCComputeKernelInfo (ClDWCNativeDefaultConfigValhall::*)(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+ unsigned int depth_multiplier);
+
+ ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G78(&ClDWCNativeDefaultConfigValhall::configure_G78_f32,
+ &ClDWCNativeDefaultConfigValhall::configure_G78_f16,
+ &ClDWCNativeDefaultConfigValhall::configure_G78_u8);
+
+ ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G77(&ClDWCNativeDefaultConfigValhall::configure_G78_f32,
+ &ClDWCNativeDefaultConfigValhall::configure_G77_f16,
+ &ClDWCNativeDefaultConfigValhall::configure_G78_u8);
+
+ ConfigurationFunctionExecutorPtr func = nullptr;
+ switch(_target)
+ {
+ case GPUTarget::G77:
+ func = configs_G77.get_function(src->data_type());
+ break;
+ case GPUTarget::G78:
+ default:
+ func = configs_G78.get_function(src->data_type());
+ break;
+ }
+
+ ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for depthwise convolution");
+ return (this->*func)(src, wei, conv_info, dilation, depth_multiplier);
+}
+
+DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+ unsigned int depth_multiplier)
+{
+ DWCComputeKernelInfo desc;
+
+ if(src->data_layout() == DataLayout::NHWC)
+ {
+ const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL);
+ const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH);
+ const TensorShape wei_shape = wei->tensor_shape();
+ const size_t kernel_c = wei_shape[idx_c];
+ const size_t kernel_w = wei_shape[idx_w];
+
+ desc.export_input_to_cl_image = false;
+ desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier);
+
+ if(depth_multiplier == 1)
+ {
+ desc.n0 = 4;
+ }
+ else
+ {
+ if((depth_multiplier % 4) == 0)
+ {
+ desc.n0 = 4;
+ }
+ else if((depth_multiplier % 2) == 0)
+ {
+ desc.n0 = 2;
+ }
+ else
+ {
+ desc.n0 = 1;
+ }
+ }
+
+ // Note: If we reduce n0, export to cl_image must be false
+ ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && (desc.export_weights_to_cl_image == true));
+
+ desc.n0 = adjust_vec_size(desc.n0, kernel_c);
+
+ // Set m0 only if stride_x == 1 and dilation_x == 1
+ if(conv_info.stride().first == 1 && dilation.x() == 1)
+ {
+ if((kernel_w >= 9) || (kernel_w == 1))
+ {
+ desc.m0 = 1;
+ }
+ else
+ {
+ desc.m0 = 2;
+ }
+ }
+ else
+ {
+ desc.m0 = 1;
+ }
+ }
+
+ return desc;
+}
+
+DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+ unsigned int depth_multiplier)
+{
+ DWCComputeKernelInfo desc;
+
+ if(src->data_layout() == DataLayout::NHWC)
+ {
+ // Src and weights have the same dimension indices
+ const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL);
+ const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH);
+ const TensorShape src_shape = src->tensor_shape();
+ const TensorShape wei_shape = wei->tensor_shape();
+ const size_t src_w = src_shape[idx_w];
+ const size_t kernel_c = wei_shape[idx_c];
+ const size_t kernel_w = wei_shape[idx_w];
+
+ desc.export_input_to_cl_image = false;
+ desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier);
+
+ if(depth_multiplier == 1)
+ {
+ if(desc.export_weights_to_cl_image == false)
+ {
+ desc.n0 = 8;
+ }
+ else
+ {
+ desc.n0 = 4;
+ }
+ }
+ else
+ {
+ if((depth_multiplier % 4) == 0)
+ {
+ desc.n0 = 4;
+ }
+ else if((depth_multiplier % 2) == 0)
+ {
+ desc.n0 = 2;
+ }
+ else
+ {
+ desc.n0 = 1;
+ }
+ }
+
+ // Note: If we reduce n0, export to cl_image must be false
+ ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && (desc.export_weights_to_cl_image == true));
+
+ desc.n0 = adjust_vec_size(desc.n0, kernel_c);
+
+ // Set m0 only if stride_x == 1 and dilation_x == 1
+ if(conv_info.stride().first == 1 && dilation.x() == 1)
+ {
+ if((kernel_w >= 9) || (kernel_w == 1))
+ {
+ desc.m0 = 1;
+ }
+ else
+ {
+ if((src_w % 5) == 0)
+ {
+ desc.m0 = 5;
+ }
+ else
+ {
+ desc.m0 = 4;
+ }
+ }
+ }
+ else
+ {
+ desc.m0 = 1;
+ }
+ }
+
+ return desc;
+}
+
+DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+ unsigned int depth_multiplier)
+{
+ ARM_COMPUTE_UNUSED(wei);
+
+ DWCComputeKernelInfo desc;
+
+ if(src->data_layout() == DataLayout::NHWC)
+ {
+ desc.export_input_to_cl_image = false;
+ desc.export_weights_to_cl_image = false;
+ desc.n0 = (depth_multiplier == 1) ? 4 : 1;
+ if(conv_info.stride().first == 1 && dilation.x() == 1 && depth_multiplier == 1)
+ {
+ desc.m0 = 2;
+ }
+ else
+ {
+ desc.m0 = 1;
+ }
+ }
+
+ return desc;
+}
+
+DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G77_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+ unsigned int depth_multiplier)
+{
+ DWCComputeKernelInfo desc;
+
+ if(src->data_layout() == DataLayout::NHWC)
+ {
+ const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL);
+ const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH);
+ const TensorShape wei_shape = wei->tensor_shape();
+ const size_t kernel_c = wei_shape[idx_c];
+ const size_t kernel_w = wei_shape[idx_w];
+
+ desc.export_input_to_cl_image = false;
+ desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier);
+
+ if(depth_multiplier == 1)
+ {
+ if(desc.export_weights_to_cl_image == false)
+ {
+ desc.n0 = 8;
+ }
+ else
+ {
+ desc.n0 = 4;
+ }
+ }
+ else
+ {
+ if((depth_multiplier % 4) == 0)
+ {
+ desc.n0 = 4;
+ }
+ else if((depth_multiplier % 2) == 0)
+ {
+ desc.n0 = 2;
+ }
+ else
+ {
+ desc.n0 = 1;
+ }
+ }
+
+ // Note: If we reduce n0, export to cl_image must be false
+ ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && (desc.export_weights_to_cl_image == true));
+
+ desc.n0 = adjust_vec_size(desc.n0, kernel_c);
+
+ // Set m0 only if stride_x == 1 and dilation_x == 1
+ if(conv_info.stride().first == 1 && dilation.x() == 1)
+ {
+ if((kernel_w >= 9) || (kernel_w == 1))
+ {
+ desc.m0 = 1;
+ }
+ else
+ {
+ desc.m0 = 2;
+ }
+ }
+ else
+ {
+ desc.m0 = 1;
+ }
+ }
+
+ return desc;
+}
+} // namespace cl_dwc
+} // namespace arm_compute
diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h
new file mode 100644
index 0000000000..4d51fa668c
--- /dev/null
+++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEDEFAULTCONFIGVALHALL
+#define SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEDEFAULTCONFIGVALHALL
+
+#include "src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h"
+
+namespace arm_compute
+{
+namespace cl_dwc
+{
+/** Valhall based OpenCL depthwise convolution configuration */
+class ClDWCNativeDefaultConfigValhall final : public IClDWCNativeKernelConfig
+{
+public:
+ /** Constructor
+ *
+ * @param[in] gpu GPU target
+ */
+ ClDWCNativeDefaultConfigValhall(GPUTarget gpu);
+
+ // Inherited overridden method
+ DWCComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+ unsigned int depth_multiplier) override;
+
+private:
+ DWCComputeKernelInfo configure_G78_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+ unsigned int depth_multiplier);
+ DWCComputeKernelInfo configure_G78_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+ unsigned int depth_multiplier);
+ DWCComputeKernelInfo configure_G78_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+ unsigned int depth_multiplier);
+ DWCComputeKernelInfo configure_G77_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+ unsigned int depth_multiplier);
+};
+} // namespace cl_dwc
+} // namespace arm_compute
+#endif /* SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEDEFAULTCONFIGVALHALL */
diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp b/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp
new file mode 100644
index 0000000000..5593c6de61
--- /dev/null
+++ b/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/TensorShape.h"
+
+namespace arm_compute
+{
+namespace cl_dwc
+{
+bool use_cl_image_for_weights(const ITensorInfo *weights, unsigned int depth_multiplier)
+{
+ // Check whether we can use the cl image with the weights.
+ if(!export_to_cl_image(weights))
+ {
+ return false;
+ }
+
+ const size_t idx_w = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT);
+ const size_t kernel_w = weights->tensor_shape()[idx_w];
+ const size_t kernel_h = weights->tensor_shape()[idx_h];
+
+ // If we can use the cl image storage with the weights, we prefer to use the cl buffer storage in the following cases for performance reasons:
+ // 1- When the kernel size is 1x1
+ // 2- When the depth multiplier is greater than 1 and not multiple of 4.
+ if((kernel_w == 1) && (kernel_h == 1))
+ {
+ return false;
+ }
+
+ if((depth_multiplier > 1) && (depth_multiplier % 4) != 0)
+ {
+ return false;
+ }
+
+ return true;
+}
+} // namespace cl_dwc
+} // namespace arm_compute
\ No newline at end of file
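To make the helper's preference rule concrete, the expected outcomes below follow directly from the code above; the tensor names are hypothetical and assume export_to_cl_image(weights) returns true for them:

    // use_cl_image_for_weights(wei_3x3, 1) -> true  (non-1x1 kernel, depth_multiplier == 1)
    // use_cl_image_for_weights(wei_1x1, 1) -> false (1x1 kernels prefer the cl buffer storage)
    // use_cl_image_for_weights(wei_3x3, 6) -> false (depth_multiplier > 1 and not a multiple of 4)
    // use_cl_image_for_weights(wei_3x3, 8) -> true  (depth_multiplier is a multiple of 4)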
diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h b/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h
new file mode 100644
index 0000000000..e3484c04ff
--- /dev/null
+++ b/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEHEURISTICSHELPERS
+#define SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEHEURISTICSHELPERS
+
+namespace arm_compute
+{
+// Forward declaration
+class ITensorInfo;
+
+namespace cl_dwc
+{
+/** Utility function to know whether we can use the cl image storage for the weights of depthwise convolution to get better performance
+ *
+ * @param[in] weights Weights TensorInfo of the depthwise convolution
+ * @param[in] depth_multiplier Depth multiplier
+ *
+ * @return true if the weights of depthwise convolution can be kept in the cl image storage to improve the performance
+ */
+bool use_cl_image_for_weights(const ITensorInfo *weights, unsigned int depth_multiplier);
+
+} // namespace cl_dwc
+} // namespace arm_compute
+#endif /* SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEHEURISTICSHELPERS */
diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h b/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h
new file mode 100644
index 0000000000..c08053dcb3
--- /dev/null
+++ b/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEKERNELCONFIG
+#define SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEKERNELCONFIG
+
+#include "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h"
+#include "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h"
+#include "src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cl_dwc
+{
+/** ClDWCNativeKernelConfigurationFactory factory class */
+class ClDWCNativeKernelConfigurationFactory final
+{
+public:
+ /** Static method to call the ClDWCNative kernel configuration class accordingly with the GPU target
+ *
+ * @param[in] gpu GPU target
+ *
+ * @return IClDWCNativeKernelConfig
+ */
+ static std::unique_ptr<IClDWCNativeKernelConfig> create(GPUTarget gpu)
+ {
+ switch(get_arch_from_target(gpu))
+ {
+ case GPUTarget::MIDGARD:
+ // The heuristic for Midgard is the same as the one used for Arm Mali-G71
+ return std::make_unique<ClDWCNativeDefaultConfigBifrost>(GPUTarget::G71);
+ case GPUTarget::BIFROST:
+ return std::make_unique<ClDWCNativeDefaultConfigBifrost>(gpu);
+ case GPUTarget::VALHALL:
+ return std::make_unique<ClDWCNativeDefaultConfigValhall>(gpu);
+ default:
+ ARM_COMPUTE_ERROR("Not supported GPU target");
+ }
+ }
+};
+} // namespace cl_dwc
+} // namespace arm_compute
+#endif /* SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEKERNELCONFIG */
diff --git a/src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h b/src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h
new file mode 100644
index 0000000000..b5df132a12
--- /dev/null
+++ b/src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_RUNTIME_HEURISTICS_DWC_NATIVE_ICLDWCNATIVEKERNELCONFIG
+#define SRC_RUNTIME_HEURISTICS_DWC_NATIVE_ICLDWCNATIVEKERNELCONFIG
+
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/Types.h"
+#include "src/core/common/Macros.h"
+
+namespace arm_compute
+{
+namespace cl_dwc
+{
+/** Basic container for the OpenCL depthwise convolution configuration functions */
+template <class T>
+class ClDWCNativeConfigArray
+{
+public:
+ /** Alias for F32 index */
+ static constexpr size_t DT_F32 = 0;
+ /** Alias for F16 index */
+ static constexpr size_t DT_F16 = 1;
+ /** Alias for Int8 index */
+ static constexpr size_t DT_INT8 = 2;
+
+ /** Constructor
+ *
+ * @param[in] func_f32 Function to call for depthwise convolution F32
+ * @param[in] func_f16 Function to call for depthwise convolution F16
+ * @param[in] func_int8 Function to call for depthwise convolution Int8 (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL)
+ *
+ */
+ ClDWCNativeConfigArray(T func_f32, T func_f16, T func_int8)
+ : _configs{ func_f32, func_f16, func_int8 }
+ {
+ }
+
+ /** Method to return the depthwise convolution configuration function based on data type
+ *
+ * @param[in] data_type Input data type
+ *
+ * @return the valid function otherwise it returns nullptr if the data type is not valid
+ */
+ T get_function(DataType data_type)
+ {
+ switch(data_type)
+ {
+ case DataType::F32:
+ return _configs.at(DT_F32);
+ case DataType::F16:
+ return _configs.at(DT_F16);
+ case DataType::QASYMM8:
+ case DataType::QASYMM8_SIGNED:
+ case DataType::QSYMM8_PER_CHANNEL:
+ return _configs.at(DT_INT8);
+ default:
+ return nullptr;
+ }
+ }
+
+private:
+ std::array<T, 3> _configs;
+};
+
+/** Basic interface for the depthwise convolution kernel configuration */
+class IClDWCNativeKernelConfig
+{
+public:
+ /** Constructor
+ *
+ * @param[in] arch GPU target
+ */
+ IClDWCNativeKernelConfig(GPUTarget arch)
+ : _target(arch)
+ {
+ }
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClDWCNativeKernelConfig);
+ /** Virtual destructor */
+ virtual ~IClDWCNativeKernelConfig() = default;
+ /** This method returns the @ref DWCComputeKernelInfo for the given inputs
+ *
+ * @param[in] src Source tensor (activation tensor)
+ * @param[in] wei Weights tensor
+ * @param[in] conv_info Convolution info
+ * @param[in] dilation Kernel dilation
+ * @param[in] depth_multiplier Output feature maps multiplier
+ */
+ virtual DWCComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+ unsigned int depth_multiplier) = 0;
+
+protected:
+ GPUTarget _target;
+};
+} // namespace cl_dwc
+} // namespace arm_compute
+#endif /* SRC_RUNTIME_HEURISTICS_DWC_NATIVE_ICLDWCNATIVEKERNELCONFIG */
diff --git a/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp
index 5d3dbf3146..990f050112 100644
--- a/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp
+++ b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp
@@ -157,5 +157,5 @@ DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure_G77_f1
return desc;
}
-} // namespace opencl
+} // namespace cl_indirect_conv
} // namespace arm_compute
diff --git a/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h
index cd9a6a5c37..68dca91885 100644
--- a/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h
+++ b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h
@@ -47,6 +47,6 @@ private:
DirectConvComputeKernelInfo configure_G77_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
DirectConvComputeKernelInfo configure_G77_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
};
-} // namespace opencl
+} // namespace cl_indirect_conv
} // namespace arm_compute
#endif /* SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_CLINDIRECTCONVDEFAULTCONFIGVALHALL */
diff --git a/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h b/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h
index 13716efb5f..73fbb87560 100644
--- a/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h
+++ b/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h
@@ -56,6 +56,6 @@ public:
}
}
};
-} // namespace opencl
+} // namespace cl_indirect_conv
} // namespace arm_compute
#endif /* SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_CLINDIRECTCONVKERNELCONFIG */
diff --git a/src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h b/src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h
index e722488c3b..d2f4cde662 100644
--- a/src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h
+++ b/src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h
@@ -45,8 +45,8 @@ public:
/** Constructor
*
- * @param[in] func_f32 Function to call for direct convolution F32
- * @param[in] func_f16 Function to call for direct convolution F16
+ * @param[in] func_f32 Function to call for indirect convolution F32
+ * @param[in] func_f16 Function to call for indirect convolution F16
*
*/
ClIndirectConvConfigArray(T func_f32, T func_f16)
@@ -103,6 +103,6 @@ public:
protected:
GPUTarget _target;
};
-} // namespace opencl
+} // namespace cl_indirect_conv
} // namespace arm_compute
#endif /* SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_ICLINDIRECTCONVKERNELCONFIG */