/* * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #pragma once #include "winograd.hpp" #include #include namespace arm_conv { namespace winograd { namespace weight_transform { /* Driver class for the Winograd weight transforms. */ template class Transform : public ITransform { using Kernel = std::function; const std::string m_name; const unsigned int m_kernel_rows, m_kernel_cols; const unsigned int m_transformed_tile_rows, m_transformed_tile_cols; const Kernel m_kernel; void execute_internal( const ConvolutionArgs &args, const TIn *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_input_channel, TOut *outptr, size_t ld_out_matrix, size_t ld_out_row, unsigned int thread_id, unsigned int n_threads ) const { // Stripe groups of input channels over threads, this should reduce false // sharing of the output matrix. constexpr auto n_input_channels_per_thread = 16u; // Get the initial offset for the input and output pointers const auto offset = thread_id * n_input_channels_per_thread; inptr += offset * ld_input_channel; outptr += offset * ld_out_row; for (auto start_ic = thread_id * n_input_channels_per_thread; start_ic < args.n_input_channels; start_ic += n_threads * n_input_channels_per_thread) { // Now iterate over the input channels assigned to this thread. const auto end_ic = std::min(args.n_input_channels, start_ic + n_input_channels_per_thread); for (auto ic = start_ic; ic < end_ic; ic++) { m_kernel(args.n_output_channels, inptr, ld_in_row, ld_in_col, outptr, ld_out_matrix); inptr += ld_input_channel; outptr += ld_out_row; } // Progress the pointers to the account for the work not performed by // this thread. const auto skip = (n_threads - 1) * n_input_channels_per_thread; inptr += skip * ld_input_channel; outptr += skip * ld_out_row; } } public: Transform( const std::string &name, unsigned int kernel_rows, unsigned int kernel_cols, unsigned int transformed_tile_rows, unsigned int transformed_tile_cols, const Kernel kernel ) : m_name(name), m_kernel_rows(kernel_rows), m_kernel_cols(kernel_cols), m_transformed_tile_rows(transformed_tile_rows), m_transformed_tile_cols(transformed_tile_cols), m_kernel(kernel) { } const std::string &get_name(void) const override { return m_name; } unsigned int get_kernel_rows(void) const override { return m_kernel_rows; } unsigned int get_kernel_cols(void) const override { return m_kernel_cols; } unsigned int get_transformed_tile_rows(void) const override { return m_transformed_tile_rows; } unsigned int get_transformed_tile_cols(void) const override { return m_transformed_tile_cols; } void execute( const ConvolutionArgs &args, const void *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_input_channel, void *outptr, size_t ld_out_matrix, size_t ld_out_row, unsigned int thread_id, unsigned int n_threads ) const override { execute_internal( args, reinterpret_cast(inptr), ld_in_row, ld_in_col, ld_input_channel, reinterpret_cast(outptr), ld_out_matrix, ld_out_row, thread_id, n_threads ); } /* Utility method to get a transposed variant of a kernel, this transposed * version simply calls the original kernel with the input row and column * strides swapped. */ static constexpr Kernel get_transposed_kernel(const Kernel &kernel) { return [kernel] ( const unsigned int n_channels, const TIn *const inptr, const size_t ld_in_row, const size_t ld_in_col, TOut *const outptr, const size_t ld_out ) { kernel(n_channels, inptr, ld_in_col, ld_in_row, outptr, ld_out); }; } }; } // namespace weight_transform } // namespace winograd } // namespace arm_conv