Enable premultiplication for depthwise convolution

with fp16 and quantized types

Resolves: COMPMID-6337
Change-Id: I81542e51c9c0329f202ac8452f173b138e51a0f6
Signed-off-by: Michael Tyler <michael.tyler@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9883
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
author: Michael Tyler <michael.tyler@arm.com> 2023-07-07 12:01:32 +0100
committer: michael.tyler <michael.tyler@arm.com> 2023-07-13 12:21:29 +0000
commit: 4c30de056afe8680b42723b26a2241811715b989 (patch)
tree: 4f522a816a5ea1b58b51226eb685c786096f30e3 /src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp
parent: c8e1617807ef1985a39d8f8f5f69c113b758494d (diff)
download: ComputeLibrary-4c30de056afe8680b42723b26a2241811715b989.tar.gz
1 files changed, 46 insertions, 8 deletions
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp
index 3b76e52206..8fef6f8ae0 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp
@@ -66,9 +66,47 @@ namespace depthwise {
 
 namespace
 {
+#if defined(__aarch64__)
+#if defined(ENABLE_FP16_KERNELS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+  bool prefer_premultiply(const DepthwiseArgs &args) {  // True when args match one of the (stride, kernel) shapes below and channel_multiplier is within that shape's threshold.
+    if ((args.stride_rows != args.stride_cols) || (args.kernel_rows != args.kernel_cols))  // Only square strides and square kernels are considered.
+    {
+      return false;
+    }
+
+    unsigned int threshold;  // Largest channel_multiplier for which this function still returns true; assigned per shape below.
+
+    if (args.stride_rows == 1 && args.kernel_rows == 3)
+    {
+      threshold = 30;  // NOTE(review): the threshold values look empirically tuned; their origin is not shown in this file - confirm.
+    }
+    else if (args.stride_rows == 1 && args.kernel_rows == 5)
+    {
+      threshold = 31;
+    }
+    else if (args.stride_rows == 2 && args.kernel_rows == 3)
+    {
+      threshold = 11;
+    }
+    else if (args.stride_rows == 2 && args.kernel_rows == 5)
+    {
+      threshold = 19;
+    } else
+    {
+      return false;  // No threshold is defined for any other stride/kernel combination.
+    }
+
+    return args.channel_multiplier <= threshold;  // Inclusive: a multiplier equal to the threshold still qualifies.
+  }
+
   template <class Strategy>
   unsigned int cycle_estimate(const DepthwiseArgs &args, const Nothing &)
   {
+    if (args.channel_multiplier > 1 && !prefer_premultiply(args))
+    {
+      return std::numeric_limits<unsigned int>::max();
+    }
+
     // First-pass: compute the number of output pixels which will be computed.
     return arm_gemm::roundup(args.output_rows, Strategy::output_rows) *
            arm_gemm::roundup(args.output_cols, Strategy::output_cols) *
@@ -90,13 +128,18 @@ namespace
           );
   }
 
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+  unsigned int multiplier_cycle_estimate(const DepthwiseArgs &args, const Nothing &)  // Cycle estimate for the channel-multiplier kernel; the second argument is unused.
+  {
+    return prefer_premultiply(args)? std::numeric_limits<unsigned int>::max() : 0;  // max() when prefer_premultiply(args) holds, 0 otherwise. NOTE(review): presumably the lowest estimate wins selection - confirm.
+  }
+
   unsigned int not_preferred(const DepthwiseArgs &, const Nothing &) __attribute__ ((unused));
   unsigned int not_preferred(const DepthwiseArgs &, const Nothing &)
   {
     return std::numeric_limits<unsigned int>::max();
   }
-#endif  // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#endif  // defined(ENABLE_FP16_KERNELS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#endif  // defined(__aarch64__)
 }
 
 static const DepthwiseImplementation<__fp16, __fp16> depthwise_fp16_methods[] = {
@@ -108,7 +151,6 @@ static const DepthwiseImplementation<__fp16, __fp16> depthwise_fp16_methods[] =
     DepthwiseMethod::DEPTHFIRST,
     "sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst",
     constraint(is_supported<sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst>,
-               has_no_channel_multiplier,
                cpu_has_sme2),
     cycle_estimate<sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst>,
     [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
@@ -120,7 +162,6 @@ static const DepthwiseImplementation<__fp16, __fp16> depthwise_fp16_methods[] =
     DepthwiseMethod::DEPTHFIRST,
     "sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst",
     constraint(is_supported<sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst>,
-               has_no_channel_multiplier,
                cpu_has_sme2),
     cycle_estimate<sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst>,
     [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
@@ -132,7 +173,6 @@ static const DepthwiseImplementation<__fp16, __fp16> depthwise_fp16_methods[] =
     DepthwiseMethod::DEPTHFIRST,
     "sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst",
     constraint(is_supported<sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst>,
-              has_no_channel_multiplier,
               cpu_has_sme2),
     cycle_estimate<sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst>,
     [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
@@ -144,7 +184,6 @@ static const DepthwiseImplementation<__fp16, __fp16> depthwise_fp16_methods[] =
     DepthwiseMethod::DEPTHFIRST,
     "sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst",
     constraint(is_supported<sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst>,
-               has_no_channel_multiplier,
                cpu_has_sme2),
     cycle_estimate<sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst>,
     [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
@@ -156,7 +195,6 @@ static const DepthwiseImplementation<__fp16, __fp16> depthwise_fp16_methods[] =
     DepthwiseMethod::DEPTHFIRST,
     "sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst",
     constraint(is_supported<sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst>,
-               has_no_channel_multiplier,
                cpu_has_sme2),
     cycle_estimate<sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst>,
     [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
@@ -291,7 +329,7 @@ static const DepthwiseImplementation<__fp16, __fp16> depthwise_fp16_methods[] =
     DepthwiseMethod::DEPTHFIRST,
     "a64_fp16_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
     constraint(cpu_has_fp16, has_channel_multiplier),
-    nullptr,
+    multiplier_cycle_estimate,
     [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
       auto kern = new a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(args.cpu_info);
       auto strat = new GenericDepthfirstMultiplierStrategy<__fp16>(kern, args);
author	Michael Tyler <michael.tyler@arm.com>	2023-07-07 12:01:32 +0100
committer	michael.tyler <michael.tyler@arm.com>	2023-07-13 12:21:29 +0000
commit	4c30de056afe8680b42723b26a2241811715b989 (patch)
tree	4f522a816a5ea1b58b51226eb685c786096f30e3 /src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp
parent	c8e1617807ef1985a39d8f8f5f69c113b758494d (diff)
download	ComputeLibrary-4c30de056afe8680b42723b26a2241811715b989.tar.gz