diff options
Diffstat (limited to 'src/core/NEON/kernels/convolution/winograd/weight_transform.hpp')
-rw-r--r-- | src/core/NEON/kernels/convolution/winograd/weight_transform.hpp | 145 |
1 files changed, 145 insertions, 0 deletions
diff --git a/src/core/NEON/kernels/convolution/winograd/weight_transform.hpp b/src/core/NEON/kernels/convolution/winograd/weight_transform.hpp new file mode 100644 index 0000000000..db0f53df1b --- /dev/null +++ b/src/core/NEON/kernels/convolution/winograd/weight_transform.hpp @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#pragma once + +#include "src/core/NEON/kernels/assembly/winograd.hpp" +#include <algorithm> +#include <functional> + +namespace arm_conv { +namespace winograd { +namespace weight_transform { + +/* Driver class for the Winograd weight transforms. + */ +template <typename TIn, typename TOut=TIn> +class Transform : public ITransform +{ + using Kernel = std::function<void( + unsigned int n_channels, // Number of channels to transform + const TIn *inptr, size_t ld_in_row, size_t ld_in_col, + TOut *outptr, size_t ld_out_matrix + )>; + + const std::string m_name; + const unsigned int m_kernel_rows, m_kernel_cols; + const unsigned int m_transformed_tile_rows, m_transformed_tile_cols; + const Kernel m_kernel; + + void execute_internal( + const ConvolutionArgs &args, + const TIn *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_input_channel, + TOut *outptr, size_t ld_out_matrix, size_t ld_out_row, + unsigned int thread_id, unsigned int n_threads + ) const + { + // Stripe groups of input channels over threads, this should reduce false + // sharing of the output matrix. + constexpr auto n_input_channels_per_thread = 16u; + + // Get the initial offset for the input and output pointers + const auto offset = thread_id * n_input_channels_per_thread; + inptr += offset * ld_input_channel; + outptr += offset * ld_out_row; + + for (auto start_ic = thread_id * n_input_channels_per_thread; + start_ic < args.n_input_channels; + start_ic += n_threads * n_input_channels_per_thread) + { + // Now iterate over the input channels assigned to this thread. + const auto end_ic = std::min(args.n_input_channels, + start_ic + n_input_channels_per_thread); + for (auto ic = start_ic; ic < end_ic; ic++) + { + m_kernel(args.n_output_channels, inptr, ld_in_row, ld_in_col, + outptr, ld_out_matrix); + inptr += ld_input_channel; + outptr += ld_out_row; + } + + // Progress the pointers to the account for the work not performed by + // this thread. + const auto skip = (n_threads - 1) * n_input_channels_per_thread; + inptr += skip * ld_input_channel; + outptr += skip * ld_out_row; + } + } + + public: + Transform( + const std::string &name, + unsigned int kernel_rows, unsigned int kernel_cols, + unsigned int transformed_tile_rows, unsigned int transformed_tile_cols, + const Kernel kernel + ) + : m_name(name), + m_kernel_rows(kernel_rows), m_kernel_cols(kernel_cols), + m_transformed_tile_rows(transformed_tile_rows), m_transformed_tile_cols(transformed_tile_cols), + m_kernel(kernel) + { + } + + const std::string &get_name(void) const override { return m_name; } + + unsigned int get_kernel_rows(void) const override { return m_kernel_rows; } + unsigned int get_kernel_cols(void) const override { return m_kernel_cols; } + + unsigned int get_transformed_tile_rows(void) const override { return m_transformed_tile_rows; } + unsigned int get_transformed_tile_cols(void) const override { return m_transformed_tile_cols; } + + void execute( + const ConvolutionArgs &args, + const void *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_input_channel, + void *outptr, size_t ld_out_matrix, size_t ld_out_row, + unsigned int thread_id, unsigned int n_threads + ) const override + { + execute_internal( + args, + reinterpret_cast<const TIn *>(inptr), ld_in_row, ld_in_col, ld_input_channel, + reinterpret_cast<TOut *>(outptr), ld_out_matrix, ld_out_row, + thread_id, n_threads + ); + } + + /* Utility method to get a transposed variant of a kernel, this transposed + * version simply calls the original kernel with the input row and column + * strides swapped. + */ + static constexpr Kernel get_transposed_kernel(const Kernel &kernel) + { + return [kernel] ( + const unsigned int n_channels, + const TIn *const inptr, const size_t ld_in_row, const size_t ld_in_col, + TOut *const outptr, const size_t ld_out + ) { + kernel(n_channels, inptr, ld_in_col, ld_in_row, outptr, ld_out); + }; + } +}; + +} // namespace weight_transform +} // namespace winograd +} // namespace arm_conv |