From a1f7851e2f776610019db8725c2963c36b0c85eb Mon Sep 17 00:00:00 2001 From: ramelg01 Date: Wed, 29 Jun 2022 16:28:10 +0100 Subject: Integrate new winograd APIs from MLTech Resolves: COMPMID-5400 Signed-off-by: Ramy Elgammal Change-Id: Ib4428436dd7a6e40d8b2d8a2f8dac1b079154551 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7894 Reviewed-by: Pablo Marquez Tello Tested-by: Arm Jenkins Comments-Addressed: Arm Jenkins Benchmark: Arm Jenkins --- .../convolution/winograd/output_transform.hpp | 302 +++++++++++++++++++++ 1 file changed, 302 insertions(+) create mode 100644 src/core/NEON/kernels/convolution/winograd/output_transform.hpp (limited to 'src/core/NEON/kernels/convolution/winograd/output_transform.hpp') diff --git a/src/core/NEON/kernels/convolution/winograd/output_transform.hpp b/src/core/NEON/kernels/convolution/winograd/output_transform.hpp new file mode 100644 index 0000000000..5148495608 --- /dev/null +++ b/src/core/NEON/kernels/convolution/winograd/output_transform.hpp @@ -0,0 +1,302 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#pragma once + +#include "src/core/NEON/kernels/assembly/winograd.hpp" + +#include "src/core/NEON/kernels/arm_conv/addressing.hpp" + +#include +#include +#include +#include + +namespace arm_conv { +namespace winograd { +namespace output_transform { + +/* Driver class for the Winograd output transforms. + * + * This provides a base implementation which handles iteration over the output + * tensor; subclasses are responsible for managing working space and executing + * the transform on individual tiles. + */ +template +class TransformBase : public ITransform +{ + const std::string m_name; + const unsigned int m_output_rows, m_output_cols; + const unsigned int m_kernel_rows, m_kernel_cols; + + protected: + virtual size_t get_working_space_per_thread(const ConvolutionArgs &) const + { + return 0; + } + + virtual void initialise_thread_working_space(const ConvolutionArgs &, void *) const + { + // Nothing to do + } + + virtual void execute_tile( + unsigned int n_channels, + const TIn *inptr, size_t ld_in_matrix, + const TIn *bias, + TOut *outptr, size_t ld_out_row, size_t ld_out_col, + TOut activation_min, TOut activation_max, + unsigned int valid_rows, unsigned int valid_cols, + void *working_space + ) const = 0; + + void execute_internal( + const ConvolutionArgs &args, + const TIn *inptr, size_t ld_in_batch, size_t ld_in_matrix, size_t ld_in_row, + const TIn *bias, + TOut *outptr, size_t ld_out_batch, size_t ld_out_row, size_t ld_out_col, + void *working_space, unsigned int thread_id, unsigned int n_threads + ) const + { + // Get the working space for this thread, and initialise it. + working_space = reinterpret_cast(working_space) + + this->get_working_space_per_thread(args) * thread_id; + this->initialise_thread_working_space(args, working_space); + + // Get the activation values + auto activation_min = static_cast(-std::numeric_limits::infinity()); + auto activation_max = static_cast(+std::numeric_limits::infinity()); + switch (args.activation.type) + { + case arm_gemm::Activation::Type::BoundedReLU: + activation_max = static_cast(args.activation.param1); + // Fall through + case arm_gemm::Activation::Type::ReLU: + activation_min = static_cast(0); + break; + default: + break; + } + + // Determine the number of tiles in a row, we use this to get the right + // offset into the input data. + const auto n_tile_cols = (args.output_shape.cols + this->get_output_cols() - 1) / this->get_output_cols(); + + // Execute over all batches + for (unsigned int batch = 0; batch < args.n_batches; batch++) + { + auto inptr_row = inptr + thread_id*n_tile_cols*ld_in_row; + auto outptr_row = outptr + thread_id*ld_out_row*this->get_output_rows(); + inptr += ld_in_batch; + outptr += ld_out_batch; + + // Stripe rows of tiles over threads. + for (auto out_i = thread_id * this->get_output_rows(); + out_i < args.output_shape.rows; + out_i += n_threads * this->get_output_rows()) + { + auto inptr_tile = inptr_row; + auto outptr_tile = outptr_row; + inptr_row += n_threads * n_tile_cols * ld_in_row; + outptr_row += n_threads * this->get_output_rows() * ld_out_row; + + // Iterate over all columns + for (auto out_j = 0u; out_j < args.output_shape.cols; + out_j += this->get_output_cols()) + { + // Execute the tile + this->execute_tile( + args.n_output_channels, + inptr_tile, ld_in_matrix, + bias, + outptr_tile, ld_out_row, ld_out_col, + activation_min, activation_max, + args.output_shape.rows - out_i, // Number of valid rows remaining + args.output_shape.cols - out_j, // Number of valid columns remaining + working_space + ); + + // Progress the pointers + inptr_tile += ld_in_row; + outptr_tile += this->get_output_cols() * ld_out_col; + } + } + } + } + + public: + TransformBase(const std::string &name, + unsigned int output_rows, unsigned int output_cols, + unsigned int kernel_rows, unsigned int kernel_cols) + : m_name(name), + m_output_rows(output_rows), m_output_cols(output_cols), + m_kernel_rows(kernel_rows), m_kernel_cols(kernel_cols) + { + } + + const std::string &get_name(void) const override { return m_name; } + + unsigned int get_input_rows(void) const override final { return m_kernel_rows + m_output_rows - 1; } + unsigned int get_input_cols(void) const override final { return m_kernel_cols + m_output_cols - 1; } + + unsigned int get_output_rows(void) const override final { return m_output_rows; } + unsigned int get_output_cols(void) const override final { return m_output_cols; } + + unsigned int get_kernel_rows(void) const override final { return m_kernel_rows; } + unsigned int get_kernel_cols(void) const override final { return m_kernel_cols; } + + size_t get_working_space_size(const ConvolutionArgs &args, unsigned int n_threads) const override + { + return n_threads * this->get_working_space_per_thread(args); + } + + void execute( + const ConvolutionArgs &args, + const void *inptr, size_t ld_in_batch, size_t ld_in_matrix, size_t ld_in_row, + const void *bias, + void *outptr, size_t ld_out_batch, size_t ld_out_row, size_t ld_out_col, + void *working_space, unsigned int thread_id, unsigned int n_threads + ) const override + { + execute_internal( + args, + reinterpret_cast(inptr), ld_in_batch, ld_in_matrix, ld_in_row, + reinterpret_cast(bias), + reinterpret_cast(outptr), ld_out_batch, ld_out_row, ld_out_col, + working_space, thread_id, n_threads + ); + } +}; + +template +class TransformUnpadded : public TransformBase +{ + using Kernel = std::function; + const Kernel m_kernel; + + protected: + size_t get_working_space_per_thread(const ConvolutionArgs &args) const override + { + // We create a buffer the size of the output tile + const auto n_output_points = this->get_output_rows() * this->get_output_cols(); + return sizeof(TOut) * n_output_points * args.n_output_channels; + } + + void execute_tile( + unsigned int n_channels, + const TIn *inptr, size_t ld_in_matrix, + const TIn *bias, + TOut *outptr, size_t ld_out_row, size_t ld_out_col, + TOut activation_min, TOut activation_max, + unsigned int valid_rows, unsigned int valid_cols, + void *working_space + ) const override final + { + // Get copies of the output tensor parameters + auto kernel_outptr = outptr; + auto kernel_ld_out_row = ld_out_row, kernel_ld_out_col = ld_out_col; + + // If there's padding on either the left or the right, then we execute the + // kernel into the output buffer and then perform a copy. + if (valid_rows < this->get_output_rows() || + valid_cols < this->get_output_cols()) + { + // Override the kernel output parameters + kernel_outptr = reinterpret_cast(working_space); + kernel_ld_out_col = n_channels; + kernel_ld_out_row = kernel_ld_out_col * this->get_output_cols(); + } + + // Execute the kernel + m_kernel( + n_channels, + inptr, ld_in_matrix, + bias, + kernel_outptr, kernel_ld_out_row, kernel_ld_out_col, + activation_min, activation_max + ); + + // If necessary, copy from the working space into the destination tensor. + if (valid_rows < this->get_output_rows() || + valid_cols < this->get_output_cols()) + { + const auto last_row = std::min(valid_rows, this->get_output_rows()); + const auto last_col = std::min(valid_cols, this->get_output_cols()); + + for (auto i = 0u; i < last_row; i++) + { + auto patch_tile = kernel_outptr; + auto out_tile = outptr; + kernel_outptr += kernel_ld_out_row; + outptr += ld_out_row; + + for (auto j = 0u; j < last_col; j++) + { + memcpy(out_tile, patch_tile, sizeof(TOut) * n_channels); + patch_tile += kernel_ld_out_col; + out_tile += ld_out_col; + } + } + } + } + + public: + TransformUnpadded(const std::string &name, + unsigned int output_rows, unsigned int output_cols, + unsigned int kernel_rows, unsigned int kernel_cols, + const Kernel kernel) + : TransformBase(name, output_rows, output_cols, kernel_rows, kernel_cols), + m_kernel(kernel) + { + } + + /* Utility method to get a transposed variant of a kernel, this transposed + * version simply calls the original kernel with the output row and column + * strides swapped. + */ + static constexpr Kernel get_transposed_kernel(const Kernel &kernel) + { + return [kernel] ( + const unsigned int n_channels, + const TIn *const inptr, const size_t ld_in_matrix, + const TIn *const bias, + TOut *const outptr, const size_t ld_out_row, const size_t ld_out_col, + const TOut activation_min, const TOut activation_max + ) { + kernel(n_channels, inptr, ld_in_matrix, bias, + outptr, ld_out_col, ld_out_row, + activation_min, activation_max); + }; + } +}; + +} // namespace output_transform +} // namespace winograd +} // namespace arm_conv -- cgit v1.2.1