diff options
Diffstat (limited to 'src/core/NEON/kernels/convolution/winograd/input_transform.hpp')
-rw-r--r-- | src/core/NEON/kernels/convolution/winograd/input_transform.hpp | 384 |
1 file changed, 384 insertions, 0 deletions
diff --git a/src/core/NEON/kernels/convolution/winograd/input_transform.hpp b/src/core/NEON/kernels/convolution/winograd/input_transform.hpp new file mode 100644 index 0000000000..113b7ea928 --- /dev/null +++ b/src/core/NEON/kernels/convolution/winograd/input_transform.hpp @@ -0,0 +1,384 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#pragma once + +#include "arm_compute/core/Error.h" + +#include "src/core/NEON/kernels/assembly/winograd.hpp" + +#include "src/core/NEON/kernels/arm_conv/addressing.hpp" +#include <algorithm> +#include <cstring> +#include <functional> + +namespace arm_conv { +namespace winograd { +namespace input_transform { + +namespace { + +template <typename T> +constexpr T iceildiv(const T a, const T b) +{ + return (a + b - 1) / b; +} + +} + +/* Driver class for the Winograd input transforms. 
+ * + * This provides a base implementation which handles iteration over the input + * tensor; subclasses are responsible for managing working space and executing + * the transform on individual tiles. + */ +template <typename TIn, typename TOut=TIn> +class TransformBase : public ITransform +{ + const std::string m_name; + const unsigned int m_input_rows, m_input_cols; + + protected: + virtual size_t get_working_space_per_thread(const ConvolutionArgs &) const + { + return 0; + } + + virtual void initialise_thread_working_space(const ConvolutionArgs &, void *) const + { + // Nothing to do + } + + virtual void execute_tile( + unsigned int n_channels, + const TIn *inptr, size_t ld_in_row, size_t ld_in_col, + TOut *outptr, size_t ld_out_matrix, + unsigned int pad_top, unsigned int valid_rows, + unsigned int pad_left, unsigned int valid_cols, + void *working_space + ) const = 0; + + void execute_internal( + const ConvolutionArgs &args, + const TIn *inptr, size_t ld_in_batch, size_t ld_in_row, size_t ld_in_col, + TOut *outptr, size_t ld_out_batch, size_t ld_out_matrix, size_t ld_out_row, + void *working_space, unsigned int thread_id, unsigned int n_threads + ) const + { + // Get the working space for this thread, and initialise it. 
+ working_space = reinterpret_cast<char *>(working_space) + + this->get_working_space_per_thread(args) * thread_id; + this->initialise_thread_working_space(args, working_space); + + // Get tile traversal parameters + const auto tile_stride_rows = std::max(1u, m_input_rows - args.kernel_shape.rows + 1); + const auto tile_stride_cols = std::max(1u, m_input_cols - args.kernel_shape.cols + 1); + const auto n_tile_rows = iceildiv( + args.output_shape.rows, m_input_rows - args.kernel_shape.rows + 1); + const auto n_tile_cols = iceildiv( + args.output_shape.cols, m_input_cols - args.kernel_shape.cols + 1); + + // Execute over all batches + for (unsigned int batch = 0; batch < args.n_batches; batch++) + { + auto outptr_tile = outptr + thread_id * n_tile_cols * ld_out_row; + + // For a single batch, stripe the rows over the threads. + for (auto tile_i = thread_id; tile_i < n_tile_rows; tile_i += n_threads) + { + // Compute pointers and padding for this row of tiles + const auto start_i = tile_i * tile_stride_rows; + const auto pad_top = start_i < args.pad_top ? args.pad_top - start_i : 0; + const auto inptr_row = inptr + (pad_top ? 0 : start_i - args.pad_top) * ld_in_row; + const auto valid_rows = args.input_shape.rows - (pad_top ? 0 : start_i - args.pad_top); + + // Iterate over columns + for (auto tile_j = 0u; tile_j < n_tile_cols; tile_j++) + { + // Compute pointers and padding for this tile, then delegate to + // execute the kernel. + const auto start_j = tile_j * tile_stride_cols; + const auto pad_left = start_j < args.pad_left ? args.pad_left - start_j : 0; + const auto inptr_tile = inptr_row + (pad_left ? 0 : start_j - args.pad_left) * ld_in_col; + const auto valid_cols = args.input_shape.cols - (pad_left ? 
0 : start_j - args.pad_left); + + this->execute_tile( + args.n_input_channels, + inptr_tile, ld_in_row, ld_in_col, + outptr_tile, ld_out_matrix, + pad_top, valid_rows, pad_left, valid_cols, + working_space + ); + outptr_tile += ld_out_row; + } + + outptr_tile += (n_threads - 1) * n_tile_cols * ld_out_row; + } + + inptr += ld_in_batch; + outptr += ld_out_batch; + } + } + + public: + TransformBase(const std::string &name, unsigned int input_rows, unsigned int input_cols) + : m_name(name), m_input_rows(input_rows), m_input_cols(input_cols) + { + } + + const std::string &get_name(void) const override { return m_name; } + + unsigned int get_input_rows(void) const override final { return m_input_rows; } + unsigned int get_input_cols(void) const override final { return m_input_cols; } + + size_t get_working_space_size(const ConvolutionArgs &args, unsigned int n_threads) const override + { + return n_threads * this->get_working_space_per_thread(args); + } + + void execute( + const ConvolutionArgs &args, + const void *inptr, size_t ld_in_batch, size_t ld_in_row, size_t ld_in_col, + void *outptr, size_t ld_out_batch, size_t ld_out_matrix, size_t ld_out_row, + void *working_space, unsigned int thread_id, unsigned int n_threads + ) const override + { + execute_internal( + args, + reinterpret_cast<const TIn *>(inptr), ld_in_batch, ld_in_row, ld_in_col, + reinterpret_cast<TOut *>(outptr), ld_out_batch, ld_out_matrix, ld_out_row, + working_space, thread_id, n_threads + ); + } +}; + +template <typename TIn, typename TOut=TIn> +class TransformDirect : public TransformBase<TIn, TOut> +{ + using Kernel = std::function<void( + unsigned int, // Number of channels + const TIn *, size_t, size_t, // Pointer to first valid input element, row and column stride + unsigned int, unsigned int, unsigned int, unsigned int, // Top, left, bottom and right padding + TOut *, size_t // Base output pointer, stride between matrices + )>; + const Kernel m_kernel; + + protected: + void execute_tile( + 
unsigned int n_channels, + const TIn *inptr, size_t ld_in_row, size_t ld_in_col, + TOut *outptr, size_t ld_out_matrix, + unsigned int pad_top, unsigned int valid_rows, + unsigned int pad_left, unsigned int valid_cols, + void *working_space + ) const override + { + ARM_COMPUTE_UNUSED(working_space); + const auto end_i = this->get_input_rows() - pad_top; + const auto pad_bottom = end_i < valid_rows ? 0 : end_i - valid_rows; + const auto end_j = this->get_input_cols() - pad_left; + const auto pad_right = end_j < valid_cols ? 0 : end_j - valid_cols; + + // Execute the kernel + m_kernel( + n_channels, inptr, ld_in_row, ld_in_col, + pad_top, pad_left, pad_bottom, pad_right, + outptr, ld_out_matrix + ); + } + + public: + TransformDirect(const std::string &name, unsigned int input_rows, unsigned int input_cols, Kernel kernel) + : TransformBase<TIn, TOut>(name, input_rows, input_cols), m_kernel(kernel) + { + } +}; + +template <typename TIn, typename TOut=TIn> +class TransformIndirect : public TransformBase<TIn, TOut> +{ + using Kernel = std::function<void( + unsigned int, // Number of channels + const TIn *const *, // Input pointers (one per point) + TOut *, size_t // Base output pointer, stride between matrices + )>; + const Kernel m_kernel; + + struct Workspace + { + const TIn **inptrs; + const TIn *input_buffer; + }; + + size_t sizeof_inptr_array(void) const + { + return sizeof(const TIn **) * this->get_input_rows() * this->get_input_cols(); + } + + protected: + size_t get_working_space_per_thread(const ConvolutionArgs &args) const override + { + return sizeof(Workspace) + sizeof_inptr_array() + sizeof(TIn) * args.n_input_channels; + } + + void initialise_thread_working_space(const ConvolutionArgs &args, void *buffer) const override + { + Workspace *ws = reinterpret_cast<Workspace *>(buffer); + buffer = ws + 1; + + ws->inptrs = reinterpret_cast<const TIn **>(buffer); + buffer = reinterpret_cast<char *>(buffer) + sizeof_inptr_array(); + + ws->input_buffer = 
reinterpret_cast<const TIn *>(buffer); + memset(buffer, 0, sizeof(TIn) * args.n_input_channels); + } + + void execute_tile( + unsigned int n_channels, + const TIn *inptr, size_t ld_in_row, size_t ld_in_col, + TOut *outptr, size_t ld_out_matrix, + unsigned int pad_top, unsigned int valid_rows, + unsigned int pad_left, unsigned int valid_cols, + void *working_space + ) const override + { + // Get the working space + auto ws = reinterpret_cast<Workspace *>(working_space); + + // Construct the input pointer array based on the given arguments + fill_pointer_array<const TIn>( + ws->inptrs, this->get_input_rows(), this->get_input_cols(), + inptr, ld_in_row, ld_in_col, + ws->input_buffer, + pad_top, valid_rows, + pad_left, valid_cols + ); + + // Execute the kernel + m_kernel(n_channels, ws->inptrs, outptr, ld_out_matrix); + } + + public: + TransformIndirect(const std::string &name, unsigned int input_rows, unsigned int input_cols, Kernel kernel) + : TransformBase<TIn, TOut>(name, input_rows, input_cols), m_kernel(kernel) + { + } +}; + +template <typename TIn, typename TOut=TIn> +class TransformUnpadded : public TransformBase<TIn, TOut> +{ + using Kernel = std::function<void( + unsigned int, // Number of channels + const TIn *, size_t, size_t, // Pointer to first input element, row and column stride + TOut *, size_t // Base output pointer, stride between matrices + )>; + const Kernel m_kernel; + + protected: + size_t get_working_space_per_thread(const ConvolutionArgs &args) const override + { + const auto input_points = this->get_input_rows() * this->get_input_cols(); + return sizeof(TIn) * input_points * args.n_input_channels; + } + + void execute_tile( + unsigned int n_channels, + const TIn *inptr, size_t ld_in_row, size_t ld_in_col, + TOut *const outptr, const size_t ld_out_matrix, + const unsigned int pad_top, const unsigned int valid_rows, + const unsigned int pad_left, const unsigned int valid_cols, + void *const working_space + ) const override + { + // If there's 
any padding, then copy the valid portion of the tensor into + // the working space and reset the pointer, row and column strides to point + // at this copy of the data. + if (pad_top || valid_rows < this->get_input_rows() || + pad_left || valid_cols < this->get_input_cols()) + { + const auto patch_ld_col = n_channels; + const auto patch_ld_row = patch_ld_col * this->get_input_cols(); + auto patch = reinterpret_cast<TIn *>(working_space) + + pad_top*patch_ld_row + pad_left*patch_ld_col; + + // Fill the input patch with padding + memset(working_space, 0, sizeof(TIn) * this->get_input_rows() * patch_ld_row); + + // Determine the bounds for which to copy + const auto last_i = std::min(valid_rows + pad_top, this->get_input_rows()); + const auto last_j = std::min(valid_cols + pad_left, this->get_input_cols()); + + // Copy across the valid portion of the patch + for (auto i = pad_top; i < last_i; i++) + { + auto inptr_col = inptr; + inptr += ld_in_row; + + auto patch_col = patch; + patch += patch_ld_row; + + for (auto j = pad_left; j < last_j; j++) + { + // Perform the copy and progress both input and patch pointers + memcpy(patch_col, inptr_col, n_channels * sizeof(TIn)); + inptr_col += ld_in_col; + patch_col += patch_ld_col; + } + } + + // Override the input pointer and strides + inptr = reinterpret_cast<const TIn *>(working_space); + ld_in_col = patch_ld_col; + ld_in_row = patch_ld_row; + } + + // Call the kernel + m_kernel(n_channels, inptr, ld_in_row, ld_in_col, outptr, ld_out_matrix); + } + + public: + TransformUnpadded(const std::string &name, unsigned int input_rows, unsigned int input_cols, Kernel kernel) + : TransformBase<TIn, TOut>(name, input_rows, input_cols), m_kernel(kernel) + { + } + + /* Utility method which can be used to get a transposed version of a kernel, + * this just calls the kernel with the input row and column strides reversed. 
+ */ + static constexpr Kernel get_transposed_kernel(const Kernel &kernel) + { + return [kernel] ( + const unsigned int n_channels, + const TIn *const inptr, const size_t ld_in_row, const size_t ld_in_col, + TOut *const outptr, const size_t ld_out_matrix + ) { + kernel(n_channels, inptr, ld_in_col, ld_in_row, outptr, ld_out_matrix); + }; + } +}; + +} // namespace input_transform +} // namespace winograd +} // namespace arm_conv |