Diffstat (limited to 'src/core/NEON/kernels/convolution/winograd/winograd_implementations.hpp')
 src/core/NEON/kernels/convolution/winograd/winograd_implementations.hpp | 342 +++
 1 file changed, 342 insertions(+), 0 deletions(-)
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_implementations.hpp b/src/core/NEON/kernels/convolution/winograd/winograd_implementations.hpp
new file mode 100644
index 0000000000..af0dd04298
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_implementations.hpp
@@ -0,0 +1,342 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "winograd.hpp"
+#include <memory>
+#include <string>
+
+namespace arm_conv {
+namespace winograd {
+
+enum class MethodConstraints
+{
+ None = 0x0,
+ RequiresSVE = 0x1,
+ RequiresSVE2 = 0x2,
+ RequiresSME = 0x4,
+ RequiresSME2 = 0x8,
+ LargerShape = 0x10, // Input tensor shape is strictly larger than the output transform tile shape in both dimensions.
+};
+
+constexpr inline bool operator!(const MethodConstraints &c)
+{
+ return c == MethodConstraints::None;
+}
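+
+// Note: !c is true only when no constraint bits are set at all; individual
+// bits are tested as !(c & Flag), as in constraints_met() below.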
+
+constexpr inline MethodConstraints operator|(const MethodConstraints &a, const MethodConstraints &b)
+{
+ return static_cast<MethodConstraints>(static_cast<unsigned int>(a) | static_cast<unsigned int>(b));
+}
+
+constexpr inline MethodConstraints operator&(const MethodConstraints &a, const MethodConstraints &b)
+{
+ return static_cast<MethodConstraints>(static_cast<unsigned int>(a) & static_cast<unsigned int>(b));
+}
+
+inline bool constraints_met(const MethodConstraints &c, const CPUInfo *ci, const ConvolutionArgs &, const WinogradConfig *)
+{
+ return (
+ (!(c & MethodConstraints::RequiresSVE) || (ci->has_sve())) &&
+ (!(c & MethodConstraints::RequiresSVE2) || (ci->has_sve2())) &&
+ (!(c & MethodConstraints::RequiresSME) || (ci->has_sme())) &&
+ (!(c & MethodConstraints::RequiresSME2) || (ci->has_sme2()))
+ // Add further constraints here
+ );
+}
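+
+// Illustrative example only: an implementation registered with
+//   MethodConstraints::RequiresSVE2 | MethodConstraints::LargerShape
+// is rejected by constraints_met() on cores where ci->has_sve2() is false;
+// the LargerShape bit is handled separately by
+// output_transform_constraints_met() below.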
+
+inline bool output_transform_constraints_met(const output_transform::ITransform *transform, const MethodConstraints &c, const CPUInfo *ci, const ConvolutionArgs &conv_args, const WinogradConfig *cfg)
+{
+ return (
+ constraints_met(c, ci, conv_args, cfg) &&
+ (!(c & MethodConstraints::LargerShape) || (conv_args.input_shape.rows > transform->get_output_rows() && conv_args.input_shape.cols > transform->get_output_cols()))
+ );
+}
+
+namespace weight_transform {
+
+template <typename TIn, typename TOut=TIn>
+struct TransformImplementation
+{
+ std::unique_ptr<const ITransform> transform;
+ MethodConstraints constraints;
+
+ TransformImplementation(const ITransform *transform, const MethodConstraints &constraints = MethodConstraints::None)
+ : transform(transform), constraints(constraints)
+ {
+ }
+};
+
+template <typename TIn, typename TOut=TIn>
+const TransformImplementation<TIn, TOut> *implementation_list(void);
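+
+// Each specialisation of implementation_list() (here and in the
+// input_transform and output_transform namespaces below) returns a
+// sentinel-terminated array: iteration stops at the first entry whose
+// transform is nullptr.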
+
+} // namespace weight_transform
+
+namespace input_transform
+{
+
+template <typename TIn, typename TOut=TIn>
+struct TransformImplementation
+{
+ std::unique_ptr<const ITransform> transform;
+ MethodConstraints constraints;
+
+ TransformImplementation(const ITransform *transform, const MethodConstraints &constraints = MethodConstraints::None)
+ : transform(transform), constraints(constraints)
+ {
+ }
+};
+
+template <typename TIn, typename TOut=TIn>
+const TransformImplementation<TIn, TOut> *implementation_list(void);
+
+} // namespace input_transform
+
+namespace output_transform
+{
+
+template <typename TIn, typename TOut=TIn>
+struct TransformImplementation
+{
+ std::unique_ptr<const ITransform> transform;
+ MethodConstraints constraints;
+
+ TransformImplementation(const ITransform *transform, const MethodConstraints &constraints = MethodConstraints::None)
+ : transform(transform), constraints(constraints)
+ {
+ }
+};
+
+template <typename TIn, typename TOut=TIn>
+const TransformImplementation<TIn, TOut> *implementation_list(void);
+
+} // namespace output_transform
+
+namespace {
+
+template <typename T>
+constexpr T iceildiv(T num, T den)
+{
+ return (num + den - 1) / den;
+}
+
+template <typename T>
+constexpr T iroundup(T num, T den)
+{
+ return den * iceildiv(num, den);
+}
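+
+// Worked examples: iceildiv(7u, 4u) == 2u and iroundup(7u, 4u) == 8u.
+// iceildiv() counts the tiles needed to cover a dimension; iroundup() pads
+// channel and patch counts to multiples of four in the layout hints below.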
+
+} // anonymous namespace
+
+template <typename TWeight, typename TWinogradIn>
+inline std::vector<const weight_transform::ITransform *> get_weight_transforms(
+ const CPUInfo *ci, const ConvolutionArgs &conv_args, const WinogradConfig *cfg
+)
+{
+ // Get target inner tile size
+ const auto target_inner_tile_rows = cfg->output_rows == 0 ? 0 : (conv_args.kernel_shape.rows + cfg->output_rows - 1);
+ const auto target_inner_tile_cols = cfg->output_cols == 0 ? 0 : (conv_args.kernel_shape.cols + cfg->output_cols - 1);
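+ // For example, a 3x3 kernel with a requested 2x2 output tile implies a 4x4
+ // inner tile, i.e. the F(2x2, 3x3) Winograd transform.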
+
+ std::vector<const weight_transform::ITransform *> weight_transforms;
+ for (auto impl = weight_transform::implementation_list<TWeight, TWinogradIn>();
+ impl->transform.get() != nullptr; impl++)
+ {
+ // If this transform supports the requested kernel size, then add it to the
+ // list of weight transforms.
+ if (
+ constraints_met(impl->constraints, ci, conv_args, cfg) &&
+ impl->transform->get_kernel_rows() == conv_args.kernel_shape.rows &&
+ impl->transform->get_kernel_cols() == conv_args.kernel_shape.cols &&
+ (target_inner_tile_rows == 0 || target_inner_tile_rows == impl->transform->get_transformed_tile_rows()) &&
+ (target_inner_tile_cols == 0 || target_inner_tile_cols == impl->transform->get_transformed_tile_cols()) &&
+ (cfg->weight_transform_filter == "" || std::strstr(impl->transform->get_name().c_str(), cfg->weight_transform_filter.c_str()))
+ )
+ {
+ weight_transforms.push_back(impl->transform.get());
+ }
+ }
+
+ return weight_transforms;
+}
+
+template <typename TIn, typename TWinogradIn>
+inline std::vector<const input_transform::ITransform *> get_input_transforms(
+ const CPUInfo *ci, const ConvolutionArgs &conv_args, const WinogradConfig *cfg
+)
+{
+ // Get target inner tile size
+ const auto target_inner_tile_rows = cfg->output_rows == 0 ? 0 : (conv_args.kernel_shape.rows + cfg->output_rows - 1);
+ const auto target_inner_tile_cols = cfg->output_cols == 0 ? 0 : (conv_args.kernel_shape.cols + cfg->output_cols - 1);
+
+ std::vector<const input_transform::ITransform *> input_transforms;
+ for (auto impl = input_transform::implementation_list<TIn, TWinogradIn>();
+ impl->transform.get() != nullptr; impl++)
+ {
+ if (
+ constraints_met(impl->constraints, ci, conv_args, cfg) &&
+ (target_inner_tile_rows == 0 || target_inner_tile_rows == impl->transform->get_input_rows()) &&
+ (target_inner_tile_cols == 0 || target_inner_tile_cols == impl->transform->get_input_cols()) &&
+ (cfg->input_transform_filter == "" || std::strstr(impl->transform->get_name().c_str(), cfg->input_transform_filter.c_str()))
+ )
+ {
+ input_transforms.push_back(impl->transform.get());
+ }
+ }
+
+ return input_transforms;
+}
+
+template <typename TWinogradOut, typename TOut>
+inline std::vector<const output_transform::ITransform *> get_output_transforms(
+ const CPUInfo *ci, const ConvolutionArgs &conv_args, const WinogradConfig *cfg
+)
+{
+ std::vector<const output_transform::ITransform *> output_transforms;
+ for (auto impl = output_transform::implementation_list<TWinogradOut, TOut>();
+ impl->transform.get() != nullptr; impl++)
+ {
+ if (
+ output_transform_constraints_met(impl->transform.get(), impl->constraints, ci, conv_args, cfg) &&
+ impl->transform->get_kernel_rows() == conv_args.kernel_shape.rows &&
+ impl->transform->get_kernel_cols() == conv_args.kernel_shape.cols &&
+ (cfg->output_rows == 0 || cfg->output_rows == impl->transform->get_output_rows()) &&
+ (cfg->output_cols == 0 || cfg->output_cols == impl->transform->get_output_cols()) &&
+ (cfg->output_transform_filter == "" || std::strstr(impl->transform->get_name().c_str(), cfg->output_transform_filter.c_str()))
+ )
+ {
+ output_transforms.push_back(impl->transform.get());
+ }
+ }
+
+ return output_transforms;
+}
+
+template <typename TIn, typename TWeight, typename TOut, typename TWinogradIn, typename TWinogradOut>
+bool get_implementation(
+ WinogradImpl &dest, // Destination for the selected implementation
+ const CPUInfo *ci,
+ const ConvolutionArgs &conv_args,
+ int max_threads,
+ bool fast_mode,
+ const WinogradConfig *cfg,
+ const arm_gemm::GemmConfig *gemm_cfg
+)
+{
+ // Get vectors of valid weight, input and output transforms, then select the
+ // first valid combination; the implementation lists are expected to be
+ // ordered so that larger output tiles are preferred.
+ const auto weight_transforms = get_weight_transforms<TWeight, TWinogradIn>(ci, conv_args, cfg);
+ const auto input_transforms = get_input_transforms<TIn, TWinogradIn>(ci, conv_args, cfg);
+ const auto output_transforms = get_output_transforms<TWinogradOut, TOut>(ci, conv_args, cfg);
+
+ // Now attempt to select a complete set of Winograd transformations which can
+ // solve the problem. Work backwards from the output transform to find
+ // matching input implementations.
+ bool success = false;
+ for (auto output_transform = output_transforms.cbegin();
+ !success && output_transform != output_transforms.cend();
+ output_transform++)
+ {
+ // Look for matching weight transforms, if we find one then we look for
+ // matching input transforms.
+ for (auto weight_transform = weight_transforms.cbegin();
+ !success && weight_transform != weight_transforms.cend();
+ weight_transform++)
+ {
+ // If this weight transform is compatible, then look for a matching input
+ // transform
+ if ((*output_transform)->get_input_rows() == (*weight_transform)->get_transformed_tile_rows() &&
+ (*output_transform)->get_input_cols() == (*weight_transform)->get_transformed_tile_cols())
+ {
+ for (auto input_transform = input_transforms.cbegin();
+ !success && input_transform != input_transforms.cend();
+ input_transform++)
+ {
+ // If the input transform is suitable, then set the configuration and
+ // indicate success.
+ if ((*input_transform)->get_input_rows() == (*output_transform)->get_input_rows() &&
+ (*input_transform)->get_input_cols() == (*output_transform)->get_input_cols())
+ {
+ dest.output_transform = *output_transform;
+ dest.input_transform = *input_transform;
+ dest.weight_transform = *weight_transform;
+ success = true;
+ }
+ }
+ }
+ }
+ }
+
+ if (!success)
+ {
+ return false;
+ }
+
+ // If we're able to construct the Winograd elements, then specify the GEMM
+ // arguments required to perform the multiply-accumulate step of the
+ // convolution.
+ const auto n_output_row_tiles = iceildiv(conv_args.output_shape.rows, dest.output_transform->get_output_rows());
+ const auto n_output_col_tiles = iceildiv(conv_args.output_shape.cols, dest.output_transform->get_output_cols());
+ const auto n_output_patches = n_output_row_tiles * n_output_col_tiles;
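+ // Hypothetical shapes: a 112x112 output covered by a 4x4 output transform
+ // gives 28x28 = 784 output patches, which becomes the GEMM M dimension.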
+
+ const int n_multis = dest.input_transform->get_input_rows() *
+ dest.input_transform->get_input_cols();
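+ // e.g. a 6x6 inner tile yields n_multis == 36: one GEMM per element of the
+ // transformed tile.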
+
+ dest.gemm_args.reset(new arm_gemm::GemmArgs(
+ ci,
+ n_output_patches, // M
+ conv_args.n_output_channels, // N
+ conv_args.n_input_channels, // K
+ 1, // K-sections
+ conv_args.n_batches, // # Batches
+ n_multis,
+ false, // Indirect input
+ {}, // No activation
+ max_threads,
+ false, // Not fixed format
+ fast_mode,
+ gemm_cfg
+ ));
+
+ // Also provide hints for the Winograd memory layout
+ auto &ws = dest.winograd_spec;
+ ws.weight_ld_row = iroundup(conv_args.n_output_channels, 4u);
+ ws.weight_ld_matrix = conv_args.n_input_channels * ws.weight_ld_row;
+ ws.weight_matrix_size_bytes = n_multis * ws.weight_ld_matrix * sizeof(TWinogradIn);
+
+ ws.input_ld_row = iroundup(conv_args.n_input_channels, 4u);
+ ws.input_ld_matrix = iroundup(n_output_patches, 4u) * ws.input_ld_row;
+ ws.input_ld_batch = n_multis * ws.input_ld_matrix;
+ ws.input_matrix_size_bytes = conv_args.n_batches * ws.input_ld_batch * sizeof(TWinogradIn);
+
+ ws.output_ld_row = ws.weight_ld_row;
+ ws.output_ld_matrix = n_output_patches * ws.output_ld_row;
+ ws.output_ld_batch = n_multis * ws.output_ld_matrix;
+ ws.output_matrix_size_bytes = conv_args.n_batches * ws.output_ld_batch * sizeof(TWinogradOut);
+
+ return true;
+}
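+
+// A minimal usage sketch (illustrative only; ci, conv_args and cfg are
+// assumed to be populated elsewhere, and fp32 is chosen for all five type
+// parameters):
+//
+//   WinogradImpl impl;
+//   const bool ok = get_implementation<float, float, float, float, float>(
+//       impl, ci, conv_args, /* max_threads= */ 1, /* fast_mode= */ false,
+//       &cfg, /* gemm_cfg= */ nullptr);
+//   if (ok)
+//   {
+//     // impl.input_transform, impl.weight_transform, impl.output_transform
+//     // and impl.gemm_args now describe a complete Winograd schedule.
+//   }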
+
+} // namespace winograd
+} // namespace arm_conv