about summary refs log tree commit diff
path: root/src/core/NEON/kernels/convolution/winograd/output_transform.hpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/core/NEON/kernels/convolution/winograd/output_transform.hpp')
-rw-r--r--src/core/NEON/kernels/convolution/winograd/output_transform.hpp302
1 files changed, 302 insertions, 0 deletions
diff --git a/src/core/NEON/kernels/convolution/winograd/output_transform.hpp b/src/core/NEON/kernels/convolution/winograd/output_transform.hpp
new file mode 100644
index 0000000000..971cc99cd2
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/output_transform.hpp
@@ -0,0 +1,302 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "winograd.hpp"
+
+#include "src/core/NEON/kernels/arm_conv/addressing.hpp"
+
+#include <algorithm>
+#include <cstring>
+#include <functional>
+#include <limits>
+
+namespace arm_conv {
+namespace winograd {
+namespace output_transform {
+
+/* Driver class for the Winograd output transforms.
+ *
+ * This provides a base implementation which handles iteration over the output
+ * tensor; subclasses are responsible for managing working space and executing
+ * the transform on individual tiles.
+ */
+// Base driver: owns the transform's name and tile geometry, and walks the
+// output tensor tile-by-tile; the per-tile transform itself is supplied by a
+// subclass through the pure-virtual execute_tile().
+template <typename TIn, typename TOut=TIn>
+class TransformBase : public ITransform
+{
+ const std::string m_name;
+ const unsigned int m_output_rows, m_output_cols;
+ const unsigned int m_kernel_rows, m_kernel_cols;
+
+ protected:
+ // Bytes of scratch space required per thread; the base class needs none,
+ // subclasses override when they buffer tiles (see get_working_space_size).
+ virtual size_t get_working_space_per_thread(const ConvolutionArgs &) const
+ {
+ return 0;
+ }
+
+ // Hook allowing a subclass to prepare its slice of the working space;
+ // called once per thread from execute_internal().
+ virtual void initialise_thread_working_space(const ConvolutionArgs &, void *) const
+ {
+ // Nothing to do
+ }
+
+ // Transform one tile of the output tensor.
+ // valid_rows / valid_cols are the rows/columns of the output tensor still
+ // remaining from this tile's origin: they may exceed the tile size in the
+ // interior and be smaller than it at the bottom/right edges, so
+ // implementations must clamp against get_output_rows()/get_output_cols().
+ virtual void execute_tile(
+ unsigned int n_channels,
+ const TIn *inptr, size_t ld_in_matrix,
+ const TIn *bias,
+ TOut *outptr, size_t ld_out_row, size_t ld_out_col,
+ TOut activation_min, TOut activation_max,
+ unsigned int valid_rows, unsigned int valid_cols,
+ void *working_space
+ ) const = 0;
+
+ // Iterate over the output tensor: batches in the outer loop, rows of tiles
+ // striped across threads, then columns of tiles within a row.
+ // NOTE(review): ld_in_row appears to be the stride between horizontally
+ // adjacent tiles in the transformed-input matrices (it is the per-column
+ // step below) — confirm against the input transform's layout.
+ void execute_internal(
+ const ConvolutionArgs &args,
+ const TIn *inptr, size_t ld_in_batch, size_t ld_in_matrix, size_t ld_in_row,
+ const TIn *bias,
+ TOut *outptr, size_t ld_out_batch, size_t ld_out_row, size_t ld_out_col,
+ void *working_space, unsigned int thread_id, unsigned int n_threads
+ ) const
+ {
+ // Get the working space for this thread, and initialise it.
+ working_space = reinterpret_cast<char *>(working_space) +
+ this->get_working_space_per_thread(args) * thread_id;
+ this->initialise_thread_working_space(args, working_space);
+
+ // Get the activation values; defaults of -/+infinity leave the output
+ // unclamped when no (or an unrecognised) activation is requested.
+ auto activation_min = static_cast<TOut>(-std::numeric_limits<float>::infinity());
+ auto activation_max = static_cast<TOut>(+std::numeric_limits<float>::infinity());
+ switch (args.activation.type)
+ {
+ case arm_gemm::Activation::Type::BoundedReLU:
+ activation_max = static_cast<TOut>(args.activation.param1);
+ // Fall through
+ case arm_gemm::Activation::Type::ReLU:
+ activation_min = static_cast<TOut>(0);
+ break;
+ default:
+ break;
+ }
+
+ // Determine the number of tiles in a row, we use this to get the right
+ // offset into the input data.
+ const auto n_tile_cols = (args.output_shape.cols + this->get_output_cols() - 1) / this->get_output_cols();
+
+ // Execute over all batches
+ for (unsigned int batch = 0; batch < args.n_batches; batch++)
+ {
+ // Skip ahead to this thread's first row of tiles within the batch;
+ // the batch pointers are then advanced for the next iteration.
+ auto inptr_row = inptr + thread_id*n_tile_cols*ld_in_row;
+ auto outptr_row = outptr + thread_id*ld_out_row*this->get_output_rows();
+ inptr += ld_in_batch;
+ outptr += ld_out_batch;
+
+ // Stripe rows of tiles over threads.
+ for (auto out_i = thread_id * this->get_output_rows();
+ out_i < args.output_shape.rows;
+ out_i += n_threads * this->get_output_rows())
+ {
+ auto inptr_tile = inptr_row;
+ auto outptr_tile = outptr_row;
+ // Advance the row pointers by a full stripe (n_threads rows of tiles).
+ inptr_row += n_threads * n_tile_cols * ld_in_row;
+ outptr_row += n_threads * this->get_output_rows() * ld_out_row;
+
+ // Iterate over all columns
+ for (auto out_j = 0u; out_j < args.output_shape.cols;
+ out_j += this->get_output_cols())
+ {
+ // Execute the tile
+ this->execute_tile(
+ args.n_output_channels,
+ inptr_tile, ld_in_matrix,
+ bias,
+ outptr_tile, ld_out_row, ld_out_col,
+ activation_min, activation_max,
+ args.output_shape.rows - out_i, // Number of valid rows remaining
+ args.output_shape.cols - out_j, // Number of valid columns remaining
+ working_space
+ );
+
+ // Progress the pointers
+ inptr_tile += ld_in_row;
+ outptr_tile += this->get_output_cols() * ld_out_col;
+ }
+ }
+ }
+ }
+
+ public:
+ // name: reported via get_name(); output/kernel dims fix the tile geometry.
+ TransformBase(const std::string &name,
+ unsigned int output_rows, unsigned int output_cols,
+ unsigned int kernel_rows, unsigned int kernel_cols)
+ : m_name(name),
+ m_output_rows(output_rows), m_output_cols(output_cols),
+ m_kernel_rows(kernel_rows), m_kernel_cols(kernel_cols)
+ {
+ }
+
+ const std::string &get_name(void) const override { return m_name; }
+
+ // Winograd input tile size: output tile size + kernel size - 1.
+ unsigned int get_input_rows(void) const override final { return m_kernel_rows + m_output_rows - 1; }
+ unsigned int get_input_cols(void) const override final { return m_kernel_cols + m_output_cols - 1; }
+
+ unsigned int get_output_rows(void) const override final { return m_output_rows; }
+ unsigned int get_output_cols(void) const override final { return m_output_cols; }
+
+ unsigned int get_kernel_rows(void) const override final { return m_kernel_rows; }
+ unsigned int get_kernel_cols(void) const override final { return m_kernel_cols; }
+
+ // Total scratch for a parallel execution: one disjoint slice per thread.
+ size_t get_working_space_size(const ConvolutionArgs &args, unsigned int n_threads) const override
+ {
+ return n_threads * this->get_working_space_per_thread(args);
+ }
+
+ // ITransform entry point: casts the untyped pointers to the element types
+ // and delegates to execute_internal().
+ void execute(
+ const ConvolutionArgs &args,
+ const void *inptr, size_t ld_in_batch, size_t ld_in_matrix, size_t ld_in_row,
+ const void *bias,
+ void *outptr, size_t ld_out_batch, size_t ld_out_row, size_t ld_out_col,
+ void *working_space, unsigned int thread_id, unsigned int n_threads
+ ) const override
+ {
+ execute_internal(
+ args,
+ reinterpret_cast<const TIn *>(inptr), ld_in_batch, ld_in_matrix, ld_in_row,
+ reinterpret_cast<const TIn *>(bias),
+ reinterpret_cast<TOut *>(outptr), ld_out_batch, ld_out_row, ld_out_col,
+ working_space, thread_id, n_threads
+ );
+ }
+};
+
+// Transform wrapping a kernel that always writes a complete output tile.
+// Edge tiles that extend past the output tensor are handled by running the
+// kernel into per-thread working space and then copying only the valid
+// rows/columns into the destination.
+template <typename TIn, typename TOut=TIn>
+class TransformUnpadded : public TransformBase<TIn, TOut>
+{
+ // Signature of the underlying tile kernel: it receives no validity bounds,
+ // hence "unpadded" — it must always be given room for a full tile.
+ using Kernel = std::function<void(
+ unsigned int n_channels,
+ const TIn *inptr, size_t ld_in_matrix,
+ const TIn *bias,
+ TOut *outptr, size_t ld_out_row, size_t ld_out_col,
+ TOut activation_min, TOut activation_max
+ )>;
+ const Kernel m_kernel;
+
+ protected:
+ // Scratch per thread: one full output tile
+ // (output_rows * output_cols points, n_output_channels values each).
+ size_t get_working_space_per_thread(const ConvolutionArgs &args) const override
+ {
+ // We create a buffer the size of the output tile
+ const auto n_output_points = this->get_output_rows() * this->get_output_cols();
+ return sizeof(TOut) * n_output_points * args.n_output_channels;
+ }
+
+ void execute_tile(
+ unsigned int n_channels,
+ const TIn *inptr, size_t ld_in_matrix,
+ const TIn *bias,
+ TOut *outptr, size_t ld_out_row, size_t ld_out_col,
+ TOut activation_min, TOut activation_max,
+ unsigned int valid_rows, unsigned int valid_cols,
+ void *working_space
+ ) const override final
+ {
+ // Get copies of the output tensor parameters
+ auto kernel_outptr = outptr;
+ auto kernel_ld_out_row = ld_out_row, kernel_ld_out_col = ld_out_col;
+
+ // If the tile extends past the bottom or right of the output tensor, then
+ // we execute the kernel into the working-space buffer and then copy the
+ // valid portion out afterwards.
+ if (valid_rows < this->get_output_rows() ||
+ valid_cols < this->get_output_cols())
+ {
+ // Override the kernel output parameters: the buffer is a dense
+ // channels-last tile (column stride = n_channels).
+ kernel_outptr = reinterpret_cast<TOut *>(working_space);
+ kernel_ld_out_col = n_channels;
+ kernel_ld_out_row = kernel_ld_out_col * this->get_output_cols();
+ }
+
+ // Execute the kernel
+ m_kernel(
+ n_channels,
+ inptr, ld_in_matrix,
+ bias,
+ kernel_outptr, kernel_ld_out_row, kernel_ld_out_col,
+ activation_min, activation_max
+ );
+
+ // If necessary, copy from the working space into the destination tensor.
+ if (valid_rows < this->get_output_rows() ||
+ valid_cols < this->get_output_cols())
+ {
+ const auto last_row = std::min(valid_rows, this->get_output_rows());
+ const auto last_col = std::min(valid_cols, this->get_output_cols());
+
+ for (auto i = 0u; i < last_row; i++)
+ {
+ auto patch_tile = kernel_outptr;
+ auto out_tile = outptr;
+ kernel_outptr += kernel_ld_out_row;
+ outptr += ld_out_row;
+
+ for (auto j = 0u; j < last_col; j++)
+ {
+ // Copy one output point's vector of channel values.
+ memcpy(out_tile, patch_tile, sizeof(TOut) * n_channels);
+ patch_tile += kernel_ld_out_col;
+ out_tile += ld_out_col;
+ }
+ }
+ }
+ }
+
+ public:
+ // Construct from a name, the tile geometry, and the full-tile kernel to run.
+ TransformUnpadded(const std::string &name,
+ unsigned int output_rows, unsigned int output_cols,
+ unsigned int kernel_rows, unsigned int kernel_cols,
+ const Kernel kernel)
+ : TransformBase<TIn, TOut>(name, output_rows, output_cols, kernel_rows, kernel_cols),
+ m_kernel(kernel)
+ {
+ }
+
+ /* Utility method to get a transposed variant of a kernel, this transposed
+ * version simply calls the original kernel with the output row and column
+ * strides swapped.
+ */
+ static constexpr Kernel get_transposed_kernel(const Kernel &kernel)
+ {
+ // The wrapping lambda captures the original kernel by copy so the
+ // returned Kernel owns it and can outlive the caller's reference.
+ return [kernel] (
+ const unsigned int n_channels,
+ const TIn *const inptr, const size_t ld_in_matrix,
+ const TIn *const bias,
+ TOut *const outptr, const size_t ld_out_row, const size_t ld_out_col,
+ const TOut activation_min, const TOut activation_max
+ ) {
+ kernel(n_channels, inptr, ld_in_matrix, bias,
+ outptr, ld_out_col, ld_out_row,
+ activation_min, activation_max);
+ };
+ }
+};
+
+} // namespace output_transform
+} // namespace winograd
+} // namespace arm_conv