From a1f7851e2f776610019db8725c2963c36b0c85eb Mon Sep 17 00:00:00 2001 From: ramelg01 Date: Wed, 29 Jun 2022 16:28:10 +0100 Subject: Integrate new winograd APIs from MLTech Resolves: COMPMID-5400 Signed-off-by: Ramy Elgammal Change-Id: Ib4428436dd7a6e40d8b2d8a2f8dac1b079154551 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7894 Reviewed-by: Pablo Marquez Tello Tested-by: Arm Jenkins Comments-Addressed: Arm Jenkins Benchmark: Arm Jenkins --- .../NEON/kernels/convolution/winograd/winograd.hpp | 621 --------------------- 1 file changed, 621 deletions(-) delete mode 100644 src/core/NEON/kernels/convolution/winograd/winograd.hpp (limited to 'src/core/NEON/kernels/convolution/winograd/winograd.hpp') diff --git a/src/core/NEON/kernels/convolution/winograd/winograd.hpp b/src/core/NEON/kernels/convolution/winograd/winograd.hpp deleted file mode 100644 index ac82e7b7b9..0000000000 --- a/src/core/NEON/kernels/convolution/winograd/winograd.hpp +++ /dev/null @@ -1,621 +0,0 @@ -/* - * Copyright (c) 2017-2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#pragma once - -#include "arm_gemm.hpp" - -#include -#include - -namespace winograd -{ - -class ITransform -{ - public: - virtual ~ITransform() = default; - - /** - * Get the working space required to perform the transformation. - * - * Note, the working space is only required when performing the - * transformation - hence it can be reused whenever the transformation is - * not running. - * - * @param nthreads The greatest number of threads that will be used to execute the transform. - * @return Size of working space required in bytes. - */ - virtual size_t get_working_space_size(unsigned int nthreads=1) const = 0; - - /** - * Set the working space to be used by the transformation. - * - * Note, the working space is only required when performing the - * transformation - hence it can be reused whenever the transformation is - * not running. - * - * @param Pointer to the working space. - */ - virtual void set_working_space(void *buffer) = 0; - - /** - * Get the window of work a given operator can perform. - */ - virtual unsigned int get_window() const = 0; - - /** - * Perform work upon a window of the transform. - */ - virtual void run(unsigned int start, unsigned int stop, unsigned int threadid=0) = 0; -}; - -class IInputTransform : public ITransform -{ - public: - virtual ~IInputTransform() = default; - - /** - * Set the pointer to the (NHWC-ordered) tensor to be transformed. - */ - virtual void set_input_tensor(const void *input) = 0; - - /** - * Set the pointer to the (NHWC-ordered) tensor to be transformed. - * @param col_stride Stride between columns of the tensor, measured in elements (not bytes). - */ - virtual void set_input_tensor(const void *input, int col_stride) = 0; - - /** - * Set the pointer to the (NHWC-ordered) tensor to be transformed. - * @param row_stride Stride between rows of the tensor, measured in elements (not bytes). - * @param col_stride Stride between columns of the tensor, measured in elements (not bytes). - */ - virtual void set_input_tensor(const void *input, int row_stride, int col_stride) = 0; - - /** - * Set the pointer to the (NHWC-ordered) tensor to be transformed. - * @param batch_stride Stride between batches of the tensor, measured in elements (not bytes). - * @param row_stride Stride between rows of the tensor, measured in elements (not bytes). - * @param col_stride Stride between columns of the tensor, measured in elements (not bytes). - */ - virtual void set_input_tensor(const void *input, int batch_stride, int row_stride, int col_stride) = 0; - - /** - * Set pointers to the matrices written by the transform. - * @param matrices Pointer to the start of the first matrix representing the transformed input. - * @param inter_matrix_stride Stride (in elements) between matrices. - * @param matrix_row_stride Stride (in elements) between the rows within a single matrix. - */ - virtual void set_output_matrices(void *matrices, int inter_matrix_stride, int matrix_row_stride) = 0; -}; - -class IOutputTransform : public ITransform -{ - public: - virtual ~IOutputTransform() = default; - - /** - * Set pointers to the matrices written by the transform. - * @param matrices Pointer to the start of the first matrix representing the input to the transform. - * @param inter_matrix_stride Stride (in elements) between matrices. - * @param matrix_row_stride Stride (in elements) between the rows within a single matrix. - */ - virtual void set_input_matrices(const void *matrices, int inter_matrix_stride, int matrix_row_stride) = 0; - - /** - * Set pointer to the bias tensor (can be ignored or called with nullptr for no bias. - */ - virtual void set_bias(const void *bias=nullptr) = 0; - - /** - * Set pointer to the output tensor produced by the transform. - */ - virtual void set_output_tensor(void *output) = 0; - - /** - * Set pointer to the output tensor produced by the transform. - * @param col_stride Stride between columns of the tensor, measured in elements (not bytes). - */ - virtual void set_output_tensor(void *output, int col_stride) = 0; - - /** - * Set pointer to the output tensor produced by the transform. - * @param row_stride Stride between rows of the tensor, measured in elements (not bytes). - * @param col_stride Stride between columns of the tensor, measured in elements (not bytes). - */ - virtual void set_output_tensor(void *output, int row_stride, int col_stride) = 0; - - /** - * Set pointer to the output tensor produced by the transform. - * @param batch_stride Stride between batches of the tensor, measured in elements (not bytes). - * @param row_stride Stride between rows of the tensor, measured in elements (not bytes). - * @param col_stride Stride between columns of the tensor, measured in elements (not bytes). - */ - virtual void set_output_tensor(void *output, int batch_stride, int row_stride, int col_stride) = 0; -}; - -class IWeightTransform : public ITransform -{ - public: - virtual ~IWeightTransform() = default; - - /** Set pointer to the weight tensor read by the transform. */ - virtual void set_weight_tensor(const void *weights) = 0; - - /** - * Set pointers to the matrices written by the transform. - * @param matrices Pointer to the start of the first matrix representing the transformed input. - * @param inter_matrix_stride Stride (in elements) between matrices. - * @param matrix_row_stride Stride (in elements) between the rows within a single matrix. - */ - virtual void set_output_matrices(void *matrices, int inter_matrix_stride, int matrix_row_stride) = 0; -}; - -enum class WinogradRoots -{ - Integers, -}; - -template -class InputTransform : public IInputTransform -{ - public: - /** Create an InputTransform operator fixed on a given problem and set of - * pointers. - */ - InputTransform( - int kernel_rows, /**< Number of rows in the kernel */ - int kernel_cols, /**< Number of columns in the kernel */ - int n_batches, /**< Number of batches in input tensor. */ - int n_rows, /**< Number of rows in input tensor. */ - int n_cols, /**< Number of columns in input tensor. */ - int n_channels, /**< Number of channels in input tensor. */ - int padding_top, /**< Padding to apply to the top of the image. */ - int padding_left, /**< Padding to apply to the left of the image. */ - int padding_bottom, /**< Padding to apply to the bottom of the image. */ - int padding_right /**< Padding to apply to the right of the image. */ - ); - - InputTransform(InputTransform&) = delete; - InputTransform operator=(InputTransform&) = delete; - - /** Set pointers to the input tensor read by the transform. */ - void set_input_tensor(const void *input) override; - void set_input_tensor(const void *input, int col_stride) override; - void set_input_tensor(const void *input, int row_stride, int col_stride) override; - void set_input_tensor(const void *input, int batch_stride, int row_stride, int col_stride) override; - - /** Set pointers to the matrices written by the transform. */ - void set_output_matrices(void *matrices, int iter_matrix_stride, int matrix_row_stride) override; - - /** Get the working space required to perform the transformation. */ - size_t get_working_space_size(unsigned int nthreads=1) const override; - void set_working_space(void *buffer) override; - - /** Get the window of work a given operator can perform. */ - unsigned int get_window() const override; - static constexpr unsigned int WINDOW_BLOCK = 16; // Base size of window - - /** Perform work upon a window of the input. */ - void run(unsigned int start, unsigned int stop, unsigned int threadid=0) override; - - protected: - const int _n_batches, _n_rows, _n_cols, _n_channels; - - private: - void transform_unpadded_tile( - unsigned int threadid, - int n_channels, - TOut *outptr, - const TIn *inptr - ); - - void transform_padded_tile( - unsigned int threadid, - int n_channels, - TOut *outptr, - const TIn *inptr, - int padding_top, - int padding_left, - int padding_bottom, - int padding_right - ); - - /* Tile implementation */ - static void transform_tile( - int n_channels, /** @param[in] Number of channels in the tensor. */ - const TIn* inptr_base, /** @param[in] Pointer to the base of the input tile. */ - int input_row_stride, /** @param[in] Stride between rows of the input tensor. */ - int input_col_stride, /** @param[in] Stride between columns of the input tensor. */ - TOut* mptr_base, /** @param[out] Base pointer to transformed input matrices. */ - int matrix_stride /** @param[in] Stride between matrices in the input space. */ - ); - - /** Get the working space for a thread. */ - void * get_working_space(unsigned int threadid) const; - - const TIn* _inptr; - TOut* _outptr; - - const int _overlap_rows, _overlap_cols; - const int _padding_top, _padding_left, _padding_bottom, _padding_right; - const int _tiles_M, _tiles_N; - int _matrix_stride, _matrix_row_stride, _matrix_batch_stride; - int _in_col_stride, _in_row_stride, _in_batch_stride; - - const int _working_space_col_stride, _working_space_row_stride; - TIn *_working_space; -}; - -template -class InputTransform : - public InputTransform<1, InnerTileRows, TIn, TOut, Roots> -{ - using Base = InputTransform<1, InnerTileRows, TIn, TOut, Roots>; - - public: - InputTransform( - int kernel_rows, /**< Number of rows in the kernel. */ - int kernel_cols, /**< Number of columns in the kernel. */ - int n_batches, /**< Number of batches in input tensor. */ - int n_rows, /**< Number of rows in input tensor. */ - int n_cols, /**< Number of columns in input tensor. */ - int n_channels, /**< Number of channels in input tensor. */ - int padding_top, /**< Padding to apply to the top of the image. */ - int padding_left, /**< Padding to apply to the left of the image. */ - int padding_bottom, /**< Padding to apply to the bottom of the image. */ - int padding_right /**< Padding to apply to the right of the image. */ - ); - - /** Set pointers to the input tensor read by the transform. */ - void set_input_tensor(const void *input) override; - void set_input_tensor(const void *input, int col_stride) override; - void set_input_tensor(const void *input, int row_stride, int col_stride) override; - void set_input_tensor(const void *input, int batch_stride, int row_stride, int col_stride) override; -}; - -template < - int KernelRows, int KernelCols, - int InnerTileRows, int InnerTileCols, - typename TIn, typename TOut, - WinogradRoots Roots -> -class OutputTransform : public IOutputTransform -{ - public: - OutputTransform( - int n_batches, /**< Number of batches in output tensor. */ - int n_rows, /**< Number of rows in output tensor. */ - int n_cols, /**< Number of columns in output tensor. */ - int n_channels, /**< Number of channels in output tensor. */ - const arm_gemm::Activation &activation - ); - - OutputTransform(OutputTransform&) = delete; - OutputTransform operator=(OutputTransform&) = delete; - - /** Set pointers to the matrices read by the transform. */ - void set_input_matrices(const void *matrices, int iter_matrix_stride, int matrix_row_stride) override; - - /** Set pointer to the bias tensor (can be ignored or called with nullptr for no bias */ - void set_bias(const void *bias=nullptr) override; - - /** Set pointers to the output tensor written by the transform. */ - void set_output_tensor(void *output) override; - void set_output_tensor(void *output, int col_stride) override; - void set_output_tensor(void *output, int row_stride, int col_stride) override; - void set_output_tensor(void *output, int batch_stride, int row_stride, int col_stride) override; - - /** Get the working space required to perform the transformation. */ - size_t get_working_space_size(unsigned int nthreads=1) const override; - void set_working_space(void *buffer) override; - - /** Get the window of work a given operator can perform. */ - unsigned int get_window() const override; - static constexpr unsigned int WINDOW_BLOCK = 16; // Base size of window - - /** Perform work upon a window of the input. */ - void run(unsigned int start, unsigned int stop, unsigned int threadid=0) override; - - protected: - static constexpr int inner_tile_rows = InnerTileRows; - static constexpr int inner_tile_cols = InnerTileCols; - static constexpr int output_tile_rows = InnerTileRows - KernelRows + 1; - static constexpr int output_tile_cols = InnerTileCols - KernelCols + 1; - - const int _n_batches, _n_rows, _n_cols, _n_channels; - const TOut _output_min, _output_max; - - private: - void transform_uncropped_tile( - unsigned int threadid, - int n_channels, - TOut *outptr, - const TIn *inptr, - const TOut *biases - ); - - void transform_cropped_tile( - unsigned int threadid, - int n_channels, - TOut *outptr, - const TIn *inptr, - const TOut *biases, - int pad_bottom, - int pad_right - ); - - /** Implementation of the tile transformation method. */ - static void transform_tile( - int n_channels, - const TIn* matrix_base, - int matrix_stride, - const TOut* biases, - TOut* output, - int output_row_stride, - int output_col_stride, - TOut output_min, - TOut output_max - ); - - /** Get the working space for a thread. */ - void * get_working_space(unsigned int threadid) const; - - const TIn* _matrix_base; - const TOut* _biases; - int _matrix_stride, _matrix_row_stride, _matrix_batch_stride; - TOut* _outptr; - const int _tiles_M, _tiles_N; - int _out_col_stride, _out_row_stride, _out_batch_stride; - - const int _working_space_col_stride, _working_space_row_stride; - TOut *_working_space; -}; - -template < - int KernelRows, - int InnerTileRows, - typename TIn, typename TOut, - WinogradRoots Roots -> -class OutputTransform : - public OutputTransform<1, KernelRows, 1, InnerTileRows, TIn, TOut, Roots> -{ - using Base = OutputTransform<1, KernelRows, 1, InnerTileRows, TIn, TOut, Roots>; - - public: - OutputTransform( - int n_batches, /**< Number of batches in output tensor. */ - int n_rows, /**< Number of rows in output tensor. */ - int n_cols, /**< Number of columns in output tensor. */ - int n_channels, /**< Number of channels in output tensor. */ - const arm_gemm::Activation &activation - ); - - /** Set pointers to the output tensor written by the transform. */ - void set_output_tensor(void *output) override; - void set_output_tensor(void *output, int col_stride) override; - void set_output_tensor(void *output, int row_stride, int col_stride) override; - void set_output_tensor(void *output, int batch_stride, int row_stride, int col_stride) override; -}; - -template < - int KernelRows, int KernelCols, - int InnerTileRows, int InnerTileCols, - typename TIn, typename TOut, - WinogradRoots Roots -> -class WeightTransform : public IWeightTransform -{ - public: - WeightTransform( - int n_output_channels, /**< Number of output channels in the kernel. */ - int n_input_channels /**< Number of input channels in the kernel. */ - ); - - WeightTransform(WeightTransform&) = delete; - WeightTransform operator=(WeightTransform&) = delete; - - /** Set pointer to the weight tensor read by the transform. */ - void set_weight_tensor(const void *weights) override; - - /** Set pointer to the matrices written by the transform. */ - void set_output_matrices(void *matrices, int inter_matrix_stride, int matrix_row_stride) override; - - /** Get the working space required to perform the transformation. */ - size_t get_working_space_size(unsigned int nthreads=1) const override; - void set_working_space(void *buffer) override; - - /** Get the window of work a given operator can perform. */ - unsigned int get_window() const override; - static constexpr unsigned int WINDOW_BLOCK = 16; // Base size of window - - /** Perform work upon a window of the input. */ - void run(unsigned int start, unsigned int stop, unsigned int threadid=0) override; - - protected: - static const int kernel_rows = KernelRows; - static const int kernel_cols = KernelCols; - static const int inner_tile_rows = InnerTileRows; - static const int inner_tile_cols = InnerTileCols; - - private: - /** Apply the transform to a tensor. */ - static void execute( - int n_output_channels, - int n_input_channels, - const TIn* input, - TOut* output, - int matrix_stride, - int matrix_row_stride - ); - - const int _n_output_channels, _n_input_channels; - TOut *_matrices; - int _matrix_stride, _matrix_row_stride; - const TIn *_weights; -}; - -template -class WeightTransform : - public WeightTransform<1, KernelRows, 1, InnerTileRows, TIn, TOut, Roots> -{ - public: - using WeightTransform<1, KernelRows, 1, InnerTileRows, TIn, TOut, Roots>::WeightTransform; -}; - -template -class WinogradGEMM -{ - public: - // Information about the specific Winograd instance - static constexpr int output_tile_rows = OutputTileRows; - static constexpr int output_tile_cols = OutputTileCols; - static constexpr int kernel_rows = KernelRows; - static constexpr int kernel_cols = KernelCols; - static constexpr int inner_tile_rows = output_tile_rows + kernel_rows - 1; - static constexpr int inner_tile_cols = output_tile_cols + kernel_cols - 1; - static constexpr int N_GEMMS = inner_tile_rows * inner_tile_cols; - - /** Transform weights from the spatial to the Winograd domain. */ - template - using WeightsTransform = WeightTransform< - KernelRows, KernelCols, inner_tile_rows, inner_tile_cols, - TIn, TOut, Roots - >; - - /** Transform input feature maps from the spatial to the Winograd domain. - */ - template - using InputTransform = InputTransform< - inner_tile_rows, inner_tile_cols, TIn, TOut, Roots - >; - - /** Transform output feature maps from the Winograd to the spatial domain. - */ - template - using OutputTransform = OutputTransform< - KernelRows, KernelCols, inner_tile_rows, inner_tile_cols, - TIn, TOut, Roots - >; - - /** Perform a convolution. - */ - template - class Convolution - { - public: - // Information about the typed Winograd instance - typedef TOut OutputType; - typedef TOutGEMM GemmOutputType; - typedef TInGEMM GemmInputType; - typedef TIn InputType; - - /** Get the output shape of a convolution. */ - static std::pair get_output_shape( - const std::pair input_shape, - bool padding_same); - - /** Get the memory required to store the kernel transformed into the - * Winograd domain. - */ - static size_t get_kernel_storage_size(unsigned int n_input_channels, - unsigned int n_output_channels); - - /** Get the memory required to store the input tensor transformed into - * the Winograd domain. - */ - static size_t get_input_storage_size( - unsigned int n_batches, // Number of batches - unsigned int n_rows, // Number of input rows - unsigned int n_cols, // Number of input columns - unsigned int n_channels, // Number of input channels - bool padding_same); - - /** Get the memory required to store the output tensor in the Winograd - * domain. - */ - static size_t get_output_storage_size( - unsigned int n_batches, // Number of batches - unsigned int n_rows, // Number of output rows - unsigned int n_cols, // Number of output columns - unsigned int n_channels // Number of output channels - ); - - /** Get the memory required to apply a Winograd operator to some input. - */ - static size_t get_working_space_size( - unsigned int n_batches, - unsigned int n_rows, // Number of input rows - unsigned int n_cols, // Number of input columns - unsigned int n_input_channels, // Number of input channels - unsigned int n_output_channels, // Number of output channels - bool padding_same); - - /* Get the memory required by a single "input" matrix. - */ - static size_t get_input_matrix_size( - unsigned int n_batches, // Number of batches - unsigned int n_rows, // Number of input rows - unsigned int n_cols, // Number of input columns - unsigned int n_channels, // Number of input channels - bool padding_same); - - static int get_input_matrix_stride( - unsigned int n_batches, // Number of batches - unsigned int n_rows, // Number of input rows - unsigned int n_cols, // Number of input columns - unsigned int n_channels, // Number of input channels - bool padding_same); - - /* Get the memory required by a single "output" matrix. - */ - static size_t get_output_matrix_size( - unsigned int n_batches, // Number of batches - unsigned int n_rows, // Number of output rows - unsigned int n_cols, // Number of output columns - unsigned int n_channels // Number of output channels - ); - - static int get_output_matrix_stride( - unsigned int n_batches, // Number of batches - unsigned int n_rows, // Number of output rows - unsigned int n_cols, // Number of output columns - unsigned int n_channels // Number of output channels - ); - - /* Get the memory required by a single "kernel" matrix. - */ - static size_t get_kernel_matrix_size(unsigned int n_input_channels, - unsigned int n_output_channels); - static int get_kernel_matrix_stride(unsigned int n_input_channels, - unsigned int n_output_channels); - - static constexpr int M_BLOCK = 4; /** Size of block used by GEMM. */ - static constexpr int N_BLOCK = 16; /** Size of block used by GEMM. */ - }; -}; - -} // namespace winograd -- cgit v1.2.1