about summary refs log tree commit diff
path: root/src/core/NEON/kernels/convolution/winograd/output_transform.hpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/core/NEON/kernels/convolution/winograd/output_transform.hpp')
-rw-r--r--src/core/NEON/kernels/convolution/winograd/output_transform.hpp302
1 files changed, 302 insertions, 0 deletions
diff --git a/src/core/NEON/kernels/convolution/winograd/output_transform.hpp b/src/core/NEON/kernels/convolution/winograd/output_transform.hpp
new file mode 100644
index 0000000000..971cc99cd2
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/winograd/output_transform.hpp
@@ -0,0 +1,302 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "winograd.hpp"
+
+#include "src/core/NEON/kernels/arm_conv/addressing.hpp"
+
+#include <algorithm>
+#include <cstring>
+#include <functional>
+#include <limits>
+
+namespace arm_conv {
+namespace winograd {
+namespace output_transform {
+
+/* Driver class for the Winograd output transforms.
+ *
+ * This provides a base implementation which handles iteration over the output
+ * tensor; subclasses are responsible for managing working space and executing
+ * the transform on individual tiles.
+ */
+// Base driver: owns the transform's name and tile geometry, and walks the
+// output tensor tile-by-tile; the per-tile transform itself is supplied by a
+// subclass through the pure-virtual execute_tile().
+template <typename TIn, typename TOut=TIn>
+class TransformBase : public ITransform
+{
+ const std::string m_name;
+ const unsigned int m_output_rows, m_output_cols;
+ const unsigned int m_kernel_rows, m_kernel_cols;
+
+ protected:
+ // Bytes of scratch space required per thread; the base class needs none,
+ // subclasses override when they buffer tiles (see get_working_space_size).
+ virtual size_t get_working_space_per_thread(const ConvolutionArgs &) const
+ {
+ return 0;
+ }
+
+ // Hook allowing a subclass to prepare its slice of the working space;
+ // called once per thread from execute_internal().
+ virtual void initialise_thread_working_space(const ConvolutionArgs &, void *) const
+ {
+ // Nothing to do
+ }
+
+ // Transform one tile of the output tensor.
+ // valid_rows / valid_cols are the rows/columns of the output tensor still
+ // remaining from this tile's origin: they may exceed the tile size in the
+ // interior and be smaller than it at the bottom/right edges, so
+ // implementations must clamp against get_output_rows()/get_output_cols().
+ virtual void execute_tile(
+ unsigned int n_channels,
+ const TIn *inptr, size_t ld_in_matrix,
+ const TIn *bias,
+ TOut *outptr, size_t ld_out_row, size_t ld_out_col,
+ TOut activation_min, TOut activation_max,
+ unsigned int valid_rows, unsigned int valid_cols,
+ void *working_space
+ ) const = 0;
+
+ // Iterate over the output tensor: batches in the outer loop, rows of tiles
+ // striped across threads, then columns of tiles within a row.
+ // NOTE(review): ld_in_row appears to be the stride between horizontally
+ // adjacent tiles in the transformed-input matrices (it is the per-column
+ // step below) — confirm against the input transform's layout.
+ void execute_internal(
+ const ConvolutionArgs &args,
+ const TIn *inptr, size_t ld_in_batch, size_t ld_in_matrix, size_t ld_in_row,
+ const TIn *bias,
+ TOut *outptr, size_t ld_out_batch, size_t ld_out_row, size_t ld_out_col,
+ void *working_space, unsigned int thread_id, unsigned int n_threads
+ ) const
+ {
+ // Get the working space for this thread, and initialise it.
+ working_space = reinterpret_cast<char *>(working_space) +
+ this->get_working_space_per_thread(args) * thread_id;
+ this->initialise_thread_working_space(args, working_space);
+
+ // Get the activation values; defaults of -/+infinity leave the output
+ // unclamped when no (or an unrecognised) activation is requested.
+ auto activation_min = static_cast<TOut>(-std::numeric_limits<float>::infinity());
+ auto activation_max = static_cast<TOut>(+std::numeric_limits<float>::infinity());
+ switch (args.activation.type)
+ {
+ case arm_gemm::Activation::Type::BoundedReLU:
+ activation_max = static_cast<TOut>(args.activation.param1);
+ // Fall through
+ case arm_gemm::Activation::Type::ReLU:
+ activation_min = static_cast<TOut>(0);
+ break;
+ default:
+ break;
+ }
+
+ // Determine the number of tiles in a row, we use this to get the right
+ // offset into the input data.
+ const auto n_tile_cols = (args.output_shape.cols + this->get_output_cols() - 1) / this->get_output_cols();
+
+ // Execute over all batches
+ for (unsigned int batch = 0; batch < args.n_batches; batch++)
+ {
+ // Skip ahead to this thread's first row of tiles within the batch;
+ // the batch pointers are then advanced for the next iteration.
+ auto inptr_row = inptr + thread_id*n_tile_cols*ld_in_row;
+ auto outptr_row = outptr + thread_id*ld_out_row*this->get_output_rows();
+ inptr += ld_in_batch;
+ outptr += ld_out_batch;
+
+ // Stripe rows of tiles over threads.
+ for (auto out_i = thread_id * this->get_output_rows();
+ out_i < args.output_shape.rows;
+ out_i += n_threads * this->get_output_rows())
+ {
+ auto inptr_tile = inptr_row;
+ auto outptr_tile = outptr_row;
+ // Advance the row pointers by a full stripe (n_threads rows of tiles).
+ inptr_row += n_threads * n_tile_cols * ld_in_row;
+ outptr_row += n_threads * this->get_output_rows() * ld_out_row;
+
+ // Iterate over all columns
+ for (auto out_j = 0u; out_j < args.output_shape.cols;
+ out_j += this->get_output_cols())
+ {
+ // Execute the tile
+ this->execute_tile(
+ args.n_output_channels,
+ inptr_tile, ld_in_matrix,
+ bias,
+ outptr_tile, ld_out_row, ld_out_col,
+ activation_min, activation_max,
+ args.output_shape.rows - out_i, // Number of valid rows remaining
+ args.output_shape.cols - out_j, // Number of valid columns remaining
+ working_space
+ );
+
+ // Progress the pointers
+ inptr_tile += ld_in_row;
+ outptr_tile += this->get_output_cols() * ld_out_col;
+ }
+ }
+ }
+ }
+
+ public:
+ // name: reported via get_name(); output/kernel dims fix the tile geometry.
+ TransformBase(const std::string &name,
+ unsigned int output_rows, unsigned int output_cols,
+ unsigned int kernel_rows, unsigned int kernel_cols)
+ : m_name(name),
+ m_output_rows(output_rows), m_output_cols(output_cols),
+ m_kernel_rows(kernel_rows), m_kernel_cols(kernel_cols)
+ {
+ }
+
+ const std::string &get_name(void) const override { return m_name; }
+
+ // Winograd input tile size: output tile size + kernel size - 1.
+ unsigned int get_input_rows(void) const override final { return m_kernel_rows + m_output_rows - 1; }
+ unsigned int get_input_cols(void) const override final { return m_kernel_cols + m_output_cols - 1; }
+
+ unsigned int get_output_rows(void) const override final { return m_output_rows; }
+ unsigned int get_output_cols(void) const override final { return m_output_cols; }
+
+ unsigned int get_kernel_rows(void) const override final { return m_kernel_rows; }
+ unsigned int get_kernel_cols(void) const override final { return m_kernel_cols; }
+
+ // Total scratch for a parallel execution: one disjoint slice per thread.
+ size_t get_working_space_size(const ConvolutionArgs &args, unsigned int n_threads) const override
+ {
+ return n_threads * this->get_working_space_per_thread(args);
+ }
+
+ // ITransform entry point: casts the untyped pointers to the element types
+ // and delegates to execute_internal().
+ void execute(
+ const ConvolutionArgs &args,
+ const void *inptr, size_t ld_in_batch, size_t ld_in_matrix, size_t ld_in_row,
+ const void *bias,
+ void *outptr, size_t ld_out_batch, size_t ld_out_row, size_t ld_out_col,
+ void *working_space, unsigned int thread_id, unsigned int n_threads
+ ) const override
+ {
+ execute_internal(
+ args,
+ reinterpret_cast<const TIn *>(inptr), ld_in_batch, ld_in_matrix, ld_in_row,
+ reinterpret_cast<const TIn *>(bias),
+ reinterpret_cast<TOut *>(outptr), ld_out_batch, ld_out_row, ld_out_col,
+ working_space, thread_id, n_threads
+ );
+ }
+};
+
+// Transform wrapping a kernel that always writes a complete output tile.
+// Edge tiles that extend past the output tensor are handled by running the
+// kernel into per-thread working space and then copying only the valid
+// rows/columns into the destination.
+template <typename TIn, typename TOut=TIn>
+class TransformUnpadded : public TransformBase<TIn, TOut>
+{
+ // Signature of the underlying tile kernel: it receives no validity bounds,
+ // hence "unpadded" — it must always be given room for a full tile.
+ using Kernel = std::function<void(
+ unsigned int n_channels,
+ const TIn *inptr, size_t ld_in_matrix,
+ const TIn *bias,
+ TOut *outptr, size_t ld_out_row, size_t ld_out_col,
+ TOut activation_min, TOut activation_max
+ )>;
+ const Kernel m_kernel;
+
+ protected:
+ // Scratch per thread: one full output tile
+ // (output_rows * output_cols points, n_output_channels values each).
+ size_t get_working_space_per_thread(const ConvolutionArgs &args) const override
+ {
+ // We create a buffer the size of the output tile
+ const auto n_output_points = this->get_output_rows() * this->get_output_cols();
+ return sizeof(TOut) * n_output_points * args.n_output_channels;
+ }
+
+ void execute_tile(
+ unsigned int n_channels,
+ const TIn *inptr, size_t ld_in_matrix,
+ const TIn *bias,
+ TOut *outptr, size_t ld_out_row, size_t ld_out_col,
+ TOut activation_min, TOut activation_max,
+ unsigned int valid_rows, unsigned int valid_cols,
+ void *working_space
+ ) const override final
+ {
+ // Get copies of the output tensor parameters
+ auto kernel_outptr = outptr;
+ auto kernel_ld_out_row = ld_out_row, kernel_ld_out_col = ld_out_col;
+
+ // If the tile extends past the bottom or right of the output tensor, then
+ // we execute the kernel into the working-space buffer and then copy the
+ // valid portion out afterwards.
+ if (valid_rows < this->get_output_rows() ||
+ valid_cols < this->get_output_cols())
+ {
+ // Override the kernel output parameters: the buffer is a dense
+ // channels-last tile (column stride = n_channels).
+ kernel_outptr = reinterpret_cast<TOut *>(working_space);
+ kernel_ld_out_col = n_channels;
+ kernel_ld_out_row = kernel_ld_out_col * this->get_output_cols();
+ }
+
+ // Execute the kernel
+ m_kernel(
+ n_channels,
+ inptr, ld_in_matrix,
+ bias,
+ kernel_outptr, kernel_ld_out_row, kernel_ld_out_col,
+ activation_min, activation_max
+ );
+
+ // If necessary, copy from the working space into the destination tensor.
+ if (valid_rows < this->get_output_rows() ||
+ valid_cols < this->get_output_cols())
+ {
+ const auto last_row = std::min(valid_rows, this->get_output_rows());
+ const auto last_col = std::min(valid_cols, this->get_output_cols());
+
+ for (auto i = 0u; i < last_row; i++)
+ {
+ auto patch_tile = kernel_outptr;
+ auto out_tile = outptr;
+ kernel_outptr += kernel_ld_out_row;
+ outptr += ld_out_row;
+
+ for (auto j = 0u; j < last_col; j++)
+ {
+ // Copy one output point's vector of channel values.
+ memcpy(out_tile, patch_tile, sizeof(TOut) * n_channels);
+ patch_tile += kernel_ld_out_col;
+ out_tile += ld_out_col;
+ }
+ }
+ }
+ }
+
+ public:
+ // Construct from a name, the tile geometry, and the full-tile kernel to run.
+ TransformUnpadded(const std::string &name,
+ unsigned int output_rows, unsigned int output_cols,
+ unsigned int kernel_rows, unsigned int kernel_cols,
+ const Kernel kernel)
+ : TransformBase<TIn, TOut>(name, output_rows, output_cols, kernel_rows, kernel_cols),
+ m_kernel(kernel)
+ {
+ }
+
+ /* Utility method to get a transposed variant of a kernel, this transposed
+ * version simply calls the original kernel with the output row and column
+ * strides swapped.
+ */
+ static constexpr Kernel get_transposed_kernel(const Kernel &kernel)
+ {
+ // The wrapping lambda captures the original kernel by copy so the
+ // returned Kernel owns it and can outlive the caller's reference.
+ return [kernel] (
+ const unsigned int n_channels,
+ const TIn *const inptr, const size_t ld_in_matrix,
+ const TIn *const bias,
+ TOut *const outptr, const size_t ld_out_row, const size_t ld_out_col,
+ const TOut activation_min, const TOut activation_max
+ ) {
+ kernel(n_channels, inptr, ld_in_matrix, bias,
+ outptr, ld_out_col, ld_out_row,
+ activation_min, activation_max);
+ };
+ }
+};
+
+} // namespace output_transform
+} // namespace winograd
+} // namespace arm_conv