/*
 * Copyright (c) 2022-2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include "winograd.hpp"
#include "src/core/NEON/kernels/arm_conv/addressing.hpp"

// Header names were lost in extraction; restored from usage in this file:
// std::min -> <algorithm>, memcpy -> <cstring>, std::function -> <functional>,
// std::numeric_limits -> <limits>.
#include <algorithm>
#include <cstring>
#include <functional>
#include <limits>

namespace arm_conv {
namespace winograd {
namespace output_transform {

/* Driver class for the Winograd output transforms.
 *
 * This provides a base implementation which handles iteration over the output
 * tensor; subclasses are responsible for managing working space and executing
 * the transform on individual tiles.
*/ template class TransformBase : public ITransform { const std::string m_name; const unsigned int m_output_rows, m_output_cols; const unsigned int m_kernel_rows, m_kernel_cols; protected: virtual size_t get_working_space_per_thread(const ConvolutionArgs &) const { return 0; } virtual void initialise_thread_working_space(const ConvolutionArgs &, void *) const { // Nothing to do } virtual void execute_tile( unsigned int n_channels, const TIn *inptr, size_t ld_in_matrix, const TIn *bias, TOut *outptr, size_t ld_out_row, size_t ld_out_col, TOut activation_min, TOut activation_max, unsigned int valid_rows, unsigned int valid_cols, void *working_space ) const = 0; void execute_internal( const ConvolutionArgs &args, const TIn *inptr, size_t ld_in_batch, size_t ld_in_matrix, size_t ld_in_row, const TIn *bias, TOut *outptr, size_t ld_out_batch, size_t ld_out_row, size_t ld_out_col, void *working_space, unsigned int thread_id, unsigned int n_threads ) const { // Get the working space for this thread, and initialise it. working_space = reinterpret_cast(working_space) + this->get_working_space_per_thread(args) * thread_id; this->initialise_thread_working_space(args, working_space); // Get the activation values auto activation_min = static_cast(-std::numeric_limits::infinity()); auto activation_max = static_cast(+std::numeric_limits::infinity()); switch (args.activation.type) { case arm_gemm::Activation::Type::BoundedReLU: activation_max = static_cast(args.activation.param1); // Fall through case arm_gemm::Activation::Type::ReLU: activation_min = static_cast(0); break; default: break; } // Determine the number of tiles in a row, we use this to get the right // offset into the input data. 
const auto n_tile_cols = (args.output_shape.cols + this->get_output_cols() - 1) / this->get_output_cols(); // Execute over all batches for (unsigned int batch = 0; batch < args.n_batches; batch++) { auto inptr_row = inptr + thread_id*n_tile_cols*ld_in_row; auto outptr_row = outptr + thread_id*ld_out_row*this->get_output_rows(); inptr += ld_in_batch; outptr += ld_out_batch; // Stripe rows of tiles over threads. for (auto out_i = thread_id * this->get_output_rows(); out_i < args.output_shape.rows; out_i += n_threads * this->get_output_rows()) { auto inptr_tile = inptr_row; auto outptr_tile = outptr_row; inptr_row += n_threads * n_tile_cols * ld_in_row; outptr_row += n_threads * this->get_output_rows() * ld_out_row; // Iterate over all columns for (auto out_j = 0u; out_j < args.output_shape.cols; out_j += this->get_output_cols()) { // Execute the tile this->execute_tile( args.n_output_channels, inptr_tile, ld_in_matrix, bias, outptr_tile, ld_out_row, ld_out_col, activation_min, activation_max, args.output_shape.rows - out_i, // Number of valid rows remaining args.output_shape.cols - out_j, // Number of valid columns remaining working_space ); // Progress the pointers inptr_tile += ld_in_row; outptr_tile += this->get_output_cols() * ld_out_col; } } } } public: TransformBase(const std::string &name, unsigned int output_rows, unsigned int output_cols, unsigned int kernel_rows, unsigned int kernel_cols) : m_name(name), m_output_rows(output_rows), m_output_cols(output_cols), m_kernel_rows(kernel_rows), m_kernel_cols(kernel_cols) { } const std::string &get_name(void) const override { return m_name; } unsigned int get_input_rows(void) const override final { return m_kernel_rows + m_output_rows - 1; } unsigned int get_input_cols(void) const override final { return m_kernel_cols + m_output_cols - 1; } unsigned int get_output_rows(void) const override final { return m_output_rows; } unsigned int get_output_cols(void) const override final { return m_output_cols; } unsigned int 
get_kernel_rows(void) const override final { return m_kernel_rows; } unsigned int get_kernel_cols(void) const override final { return m_kernel_cols; } size_t get_working_space_size(const ConvolutionArgs &args, unsigned int n_threads) const override { return n_threads * this->get_working_space_per_thread(args); } void execute( const ConvolutionArgs &args, const void *inptr, size_t ld_in_batch, size_t ld_in_matrix, size_t ld_in_row, const void *bias, void *outptr, size_t ld_out_batch, size_t ld_out_row, size_t ld_out_col, void *working_space, unsigned int thread_id, unsigned int n_threads ) const override { execute_internal( args, reinterpret_cast(inptr), ld_in_batch, ld_in_matrix, ld_in_row, reinterpret_cast(bias), reinterpret_cast(outptr), ld_out_batch, ld_out_row, ld_out_col, working_space, thread_id, n_threads ); } }; template class TransformUnpadded : public TransformBase { using Kernel = std::function; const Kernel m_kernel; protected: size_t get_working_space_per_thread(const ConvolutionArgs &args) const override { // We create a buffer the size of the output tile const auto n_output_points = this->get_output_rows() * this->get_output_cols(); return sizeof(TOut) * n_output_points * args.n_output_channels; } void execute_tile( unsigned int n_channels, const TIn *inptr, size_t ld_in_matrix, const TIn *bias, TOut *outptr, size_t ld_out_row, size_t ld_out_col, TOut activation_min, TOut activation_max, unsigned int valid_rows, unsigned int valid_cols, void *working_space ) const override final { // Get copies of the output tensor parameters auto kernel_outptr = outptr; auto kernel_ld_out_row = ld_out_row, kernel_ld_out_col = ld_out_col; // If there's padding on either the left or the right, then we execute the // kernel into the output buffer and then perform a copy. 
if (valid_rows < this->get_output_rows() || valid_cols < this->get_output_cols()) { // Override the kernel output parameters kernel_outptr = reinterpret_cast(working_space); kernel_ld_out_col = n_channels; kernel_ld_out_row = kernel_ld_out_col * this->get_output_cols(); } // Execute the kernel m_kernel( n_channels, inptr, ld_in_matrix, bias, kernel_outptr, kernel_ld_out_row, kernel_ld_out_col, activation_min, activation_max ); // If necessary, copy from the working space into the destination tensor. if (valid_rows < this->get_output_rows() || valid_cols < this->get_output_cols()) { const auto last_row = std::min(valid_rows, this->get_output_rows()); const auto last_col = std::min(valid_cols, this->get_output_cols()); for (auto i = 0u; i < last_row; i++) { auto patch_tile = kernel_outptr; auto out_tile = outptr; kernel_outptr += kernel_ld_out_row; outptr += ld_out_row; for (auto j = 0u; j < last_col; j++) { memcpy(out_tile, patch_tile, sizeof(TOut) * n_channels); patch_tile += kernel_ld_out_col; out_tile += ld_out_col; } } } } public: TransformUnpadded(const std::string &name, unsigned int output_rows, unsigned int output_cols, unsigned int kernel_rows, unsigned int kernel_cols, const Kernel kernel) : TransformBase(name, output_rows, output_cols, kernel_rows, kernel_cols), m_kernel(kernel) { } /* Utility method to get a transposed variant of a kernel, this transposed * version simply calls the original kernel with the output row and column * strides swapped. 
*/ static constexpr Kernel get_transposed_kernel(const Kernel &kernel) { return [kernel] ( const unsigned int n_channels, const TIn *const inptr, const size_t ld_in_matrix, const TIn *const bias, TOut *const outptr, const size_t ld_out_row, const size_t ld_out_col, const TOut activation_min, const TOut activation_max ) { kernel(n_channels, inptr, ld_in_matrix, bias, outptr, ld_out_col, ld_out_row, activation_min, activation_max); }; } }; } // namespace output_transform } // namespace winograd } // namespace arm_conv