Integrate SME2 kernels

* Add SME/SME2 detection.
* Integrate SME2 implementation for:
  - Normal convolution
  - Winograd
  - Depthwise convolution
  - Pooling

Resolves: COMPMID-5700
Signed-off-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Change-Id: I2f1ca1d05f8cfeee9309ed1c0a36096a4a6aad5c
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8692
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
author: Viet-Hoa Do <viet-hoa.do@arm.com> 2022-06-01 11:47:14 +0100
committer: Viet-Hoa Do <viet-hoa.do@arm.com> 2022-11-28 16:57:42 +0000
commit: 03b2971ac69a86f10a1566938d1a25afee15746c (patch)
tree: aec7cfc047e1da278b4b71a706cda7b1b0faa158 /src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp
parent: 7dc0234331f2150a6b4ac5c2b49de419870f7cf5 (diff)
download: ComputeLibrary-03b2971ac69a86f10a1566938d1a25afee15746c.tar.gz
1 files changed, 190 insertions, 0 deletions
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp
index 643cf1d460..09ee983907 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp
@@ -32,8 +32,27 @@
 
 #include "depthwise_implementation_constraints.hpp"
 
+#include "interleaves/list.hpp"
+
 #if defined(__aarch64__)
 #if defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+#include "kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za.hpp"
+#include "kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za.hpp"
+#include "kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za.hpp"
+#include "kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za.hpp"
+
+#include "kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za.hpp"
+#include "kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za.hpp"
+#include "kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za.hpp"
+#include "kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za.hpp"
+
+#include "kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp"
+#include "kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp"
+#include "kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#endif  // defined(ARM_COMPUTE_ENABLE_SME2)
+
 #include "kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp"
 #include "kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp"
 #include "kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
@@ -72,6 +91,18 @@ namespace
           );
   }
 
+  template <class Strategy>
+  unsigned int planar_cycle_estimate(const DepthwiseArgs &args, const Nothing &)
+  {
+    // First-pass estimate: roundup(output_rows, Strategy::output_rows) * output_cols * iceildiv(input_channels * channel_multiplier, vector length of Strategy::return_type); the second (Nothing) argument is unused.
+    return arm_gemm::roundup(args.output_rows, Strategy::output_rows) *
+           args.output_cols *
+           arm_gemm::iceildiv(
+            (long unsigned) args.input_channels * args.channel_multiplier,
+            arm_gemm::utils::get_vector_length<typename Strategy::return_type>(Strategy::vl_type)
+          );
+  }
+
 #if defined(__aarch64__)
   unsigned int not_preferred(const DepthwiseArgs &, const Nothing &)
   {
@@ -89,6 +120,165 @@ namespace
 static const DepthwiseImplementation<float, float> depthwise_fp32_methods[] = {
 #if defined(__aarch64__)
 #if defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+  {
+    DepthwiseMethod::PLANAR,
+    "sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za",
+    constraint(fast_mode_enabled,
+               cpu_has_sme, cpu_has_sme2,
+               is_supported<sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za>,
+               has_no_channel_multiplier, no_prime_right_pad),
+    nullptr,
+    [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+      auto strat = new sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za(args.cpu_info);
+      return new DepthwisePlanar<float>(strat, args);
+    },
+  },
+  {
+    DepthwiseMethod::PLANAR,
+    "sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za",
+    constraint(fast_mode_enabled,
+               cpu_has_sme, cpu_has_sme2,
+               is_supported<sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za>,
+               has_no_channel_multiplier, no_prime_right_pad),
+    nullptr,
+    [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+      auto strat = new sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za(args.cpu_info);
+      return new DepthwisePlanar<float>(strat, args);
+    },
+  },
+  {
+    DepthwiseMethod::PLANAR,
+    "sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za",
+    constraint(fast_mode_enabled,
+               cpu_has_sme, cpu_has_sme2,
+               is_supported<sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za>,
+               has_no_channel_multiplier, no_prime_right_pad),
+    nullptr,
+    [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+      auto strat = new sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za(args.cpu_info);
+      return new DepthwisePlanar<float>(strat, args);
+    },
+  },
+  {
+    DepthwiseMethod::PLANAR,
+    "sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za",
+    constraint(fast_mode_enabled,
+               cpu_has_sme, cpu_has_sme2,
+               is_supported<sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za>,
+               has_no_channel_multiplier, no_prime_right_pad),
+    nullptr,
+    [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+      auto strat = new sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za(args.cpu_info);
+      return new DepthwisePlanar<float>(strat, args);
+    },
+  },
+
+  {
+    DepthwiseMethod::PLANAR,
+    "sme2_fp32_planar_3x3_s1_4rows_mla_za",
+    constraint(cpu_has_sme, cpu_has_sme2,
+               is_supported<sme2_fp32_planar_3x3_s1_4rows_mla_za>,
+               has_no_channel_multiplier, no_prime_right_pad),
+    [] (const DepthwiseArgs &args, const Nothing &os) -> unsigned int {
+      // Heuristic, don't prefer this kernel unless the input plane is greater
+      // than the number of channels.
+      if (args.input_rows * args.input_cols < args.input_channels)
+        return UINT32_MAX;
+
+      return planar_cycle_estimate<sme2_fp32_planar_3x3_s1_4rows_mla_za>(args, os);
+    },
+    [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+      auto strat = new sme2_fp32_planar_3x3_s1_4rows_mla_za(args.cpu_info);
+      return new DepthwisePlanar<float>(strat, args);
+    },
+  },
+  {
+    DepthwiseMethod::PLANAR,
+    "sme2_fp32_planar_3x3_s2_4rows_mla_za",
+    constraint(cpu_has_sme, cpu_has_sme2,
+               is_supported<sme2_fp32_planar_3x3_s2_4rows_mla_za>,
+               has_no_channel_multiplier, no_prime_right_pad),
+    planar_cycle_estimate<sme2_fp32_planar_3x3_s2_4rows_mla_za>,
+    [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+      auto strat = new sme2_fp32_planar_3x3_s2_4rows_mla_za(args.cpu_info);
+      return new DepthwisePlanar<float>(strat, args);
+    },
+  },
+  {
+    DepthwiseMethod::PLANAR,
+    "sme2_fp32_planar_5x5_s1_4rows_mla_za",
+    constraint(cpu_has_sme, cpu_has_sme2,
+               is_supported<sme2_fp32_planar_5x5_s1_4rows_mla_za>,
+               has_no_channel_multiplier, no_prime_right_pad),
+    nullptr,
+    [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+      auto strat = new sme2_fp32_planar_5x5_s1_4rows_mla_za(args.cpu_info);
+      return new DepthwisePlanar<float>(strat, args);
+    },
+  },
+  {
+    DepthwiseMethod::PLANAR,
+    "sme2_fp32_planar_5x5_s2_4rows_mla_za",
+    constraint(cpu_has_sme, cpu_has_sme2,
+               is_supported<sme2_fp32_planar_5x5_s2_4rows_mla_za>,
+               has_no_channel_multiplier, no_prime_right_pad),
+    nullptr,
+    [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+      auto strat = new sme2_fp32_planar_5x5_s2_4rows_mla_za(args.cpu_info);
+      return new DepthwisePlanar<float>(strat, args);
+    },
+  },
+
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst",
+    constraint(cpu_has_sme,  cpu_has_sme2,
+               is_supported<sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>,
+               has_no_channel_multiplier),
+    cycle_estimate<sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>,
+    [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+      auto strat = new sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(args.cpu_info);
+      return new DepthwiseDepthfirst<float, float, float, float>(strat, args);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst",
+    constraint(cpu_has_sme, cpu_has_sme2,
+               is_supported<sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>,
+               has_no_channel_multiplier),
+    cycle_estimate<sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>,
+    [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+      auto strat = new sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(args.cpu_info);
+      return new DepthwiseDepthfirst<float, float, float, float>(strat, args);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst",
+    constraint(cpu_has_sme, cpu_has_sme2,
+               is_supported<sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+               has_no_channel_multiplier),
+    cycle_estimate<sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+    [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+      auto strat = new sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
+      return new DepthwiseDepthfirst<float, float, float, float>(strat, args);
+    },
+  },
+  {
+    DepthwiseMethod::DEPTHFIRST,
+    "sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst",
+    constraint(cpu_has_sme, cpu_has_sme2,
+               is_supported<sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+               has_no_channel_multiplier),
+    cycle_estimate<sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+    [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+      auto strat = new sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
+      return new DepthwiseDepthfirst<float, float, float, float>(strat, args);
+    },
+  },
+#endif  // defined(ARM_COMPUTE_ENABLE_SME2)
   {
     DepthwiseMethod::DEPTHFIRST,
     "sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst",
author	Viet-Hoa Do <viet-hoa.do@arm.com>	2022-06-01 11:47:14 +0100
committer	Viet-Hoa Do <viet-hoa.do@arm.com>	2022-11-28 16:57:42 +0000
commit	03b2971ac69a86f10a1566938d1a25afee15746c (patch)
tree	aec7cfc047e1da278b4b71a706cda7b1b0faa158 /src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp
parent	7dc0234331f2150a6b4ac5c2b49de419870f7cf5 (diff)
download	ComputeLibrary-03b2971ac69a86f10a1566938d1a25afee15746c.tar.gz