aboutsummaryrefslogtreecommitdiff
path: root/src/core/NEON/kernels/convolution/winograd/winograd.hpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/core/NEON/kernels/convolution/winograd/winograd.hpp')
-rw-r--r-- src/core/NEON/kernels/convolution/winograd/winograd.hpp 621
1 files changed, 0 insertions, 621 deletions
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd.hpp b/src/core/NEON/kernels/convolution/winograd/winograd.hpp
deleted file mode 100644
index ac82e7b7b9..0000000000
--- a/src/core/NEON/kernels/convolution/winograd/winograd.hpp
+++ /dev/null
@@ -1,621 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-
-#include "arm_gemm.hpp"
-
-#include <cstddef>
-#include <utility>
-
-namespace winograd
-{
-
-class ITransform
-{
- public:
- virtual ~ITransform() = default;
-
- /**
- * Get the working space required to perform the transformation.
- *
- * Note, the working space is only required when performing the
- * transformation - hence it can be reused whenever the transformation is
- * not running.
- *
- * @param nthreads The greatest number of threads that will be used to execute the transform.
- * @return Size of working space required in bytes.
- */
- virtual size_t get_working_space_size(unsigned int nthreads=1) const = 0;
-
- /**
- * Set the working space to be used by the transformation.
- *
- * Note, the working space is only required when performing the
- * transformation - hence it can be reused whenever the transformation is
- * not running.
- *
- * @param buffer Pointer to the working space.
- */
- virtual void set_working_space(void *buffer) = 0;
-
- /**
- * Get the window of work a given operator can perform. NOTE(review): presumably the upper bound for the `stop` argument of run() - confirm.
- */
- virtual unsigned int get_window() const = 0;
-
- /**
- * Perform work upon a window of the transform: the part selected by `start` and `stop`, executed as thread `threadid` (defaults to 0).
- */
- virtual void run(unsigned int start, unsigned int stop, unsigned int threadid=0) = 0;
-};
-
-class IInputTransform : public ITransform
-{
- public:
- virtual ~IInputTransform() = default;
-
- /**
- * Set the pointer to the (NHWC-ordered) tensor to be transformed; this overload takes no strides.
- */
- virtual void set_input_tensor(const void *input) = 0;
-
- /**
- * Set the pointer to the (NHWC-ordered) tensor to be transformed.
- * @param col_stride Stride between columns of the tensor, measured in elements (not bytes).
- */
- virtual void set_input_tensor(const void *input, int col_stride) = 0;
-
- /**
- * Set the pointer to the (NHWC-ordered) tensor to be transformed.
- * @param row_stride Stride between rows of the tensor, measured in elements (not bytes).
- * @param col_stride Stride between columns of the tensor, measured in elements (not bytes).
- */
- virtual void set_input_tensor(const void *input, int row_stride, int col_stride) = 0;
-
- /**
- * Set the pointer to the (NHWC-ordered) tensor to be transformed.
- * @param batch_stride Stride between batches of the tensor, measured in elements (not bytes).
- * @param row_stride Stride between rows of the tensor, measured in elements (not bytes).
- * @param col_stride Stride between columns of the tensor, measured in elements (not bytes).
- */
- virtual void set_input_tensor(const void *input, int batch_stride, int row_stride, int col_stride) = 0;
-
- /**
- * Set pointers to the matrices written by the transform.
- * @param matrices Pointer to the start of the first matrix representing the transformed input.
- * @param inter_matrix_stride Stride (in elements) between matrices.
- * @param matrix_row_stride Stride (in elements) between the rows within a single matrix.
- */
- virtual void set_output_matrices(void *matrices, int inter_matrix_stride, int matrix_row_stride) = 0;
-};
-
-class IOutputTransform : public ITransform
-{
- public:
- virtual ~IOutputTransform() = default;
-
- /**
- * Set pointers to the matrices read by the transform.
- * @param matrices Pointer to the start of the first matrix representing the input to the transform.
- * @param inter_matrix_stride Stride (in elements) between matrices.
- * @param matrix_row_stride Stride (in elements) between the rows within a single matrix.
- */
- virtual void set_input_matrices(const void *matrices, int inter_matrix_stride, int matrix_row_stride) = 0;
-
- /**
- * Set pointer to the bias tensor (can be ignored or called with nullptr for no bias).
- */
- virtual void set_bias(const void *bias=nullptr) = 0;
-
- /**
- * Set pointer to the output tensor produced by the transform; this overload takes no strides.
- */
- virtual void set_output_tensor(void *output) = 0;
-
- /**
- * Set pointer to the output tensor produced by the transform.
- * @param col_stride Stride between columns of the tensor, measured in elements (not bytes).
- */
- virtual void set_output_tensor(void *output, int col_stride) = 0;
-
- /**
- * Set pointer to the output tensor produced by the transform.
- * @param row_stride Stride between rows of the tensor, measured in elements (not bytes).
- * @param col_stride Stride between columns of the tensor, measured in elements (not bytes).
- */
- virtual void set_output_tensor(void *output, int row_stride, int col_stride) = 0;
-
- /**
- * Set pointer to the output tensor produced by the transform.
- * @param batch_stride Stride between batches of the tensor, measured in elements (not bytes).
- * @param row_stride Stride between rows of the tensor, measured in elements (not bytes).
- * @param col_stride Stride between columns of the tensor, measured in elements (not bytes).
- */
- virtual void set_output_tensor(void *output, int batch_stride, int row_stride, int col_stride) = 0;
-};
-
-class IWeightTransform : public ITransform
-{
- public:
- virtual ~IWeightTransform() = default;
-
- /** Set pointer to the weight tensor read by the transform. */
- virtual void set_weight_tensor(const void *weights) = 0;
-
- /**
- * Set pointers to the matrices written by the transform.
- * @param matrices Pointer to the start of the first matrix representing the transformed weights.
- * @param inter_matrix_stride Stride (in elements) between matrices.
- * @param matrix_row_stride Stride (in elements) between the rows within a single matrix.
- */
- virtual void set_output_matrices(void *matrices, int inter_matrix_stride, int matrix_row_stride) = 0;
-};
-
-enum class WinogradRoots // Tag passed as the `Roots` template parameter of the transform classes in this header.
-{
- Integers, // NOTE(review): presumably selects integer-valued roots for the Winograd transform matrices - confirm.
-};
-
-template <int InnerTileRows, int InnerTileCols, typename TIn, typename TOut, WinogradRoots Roots>
-class InputTransform : public IInputTransform // Input transform for InnerTileRows x InnerTileCols inner tiles; reads TIn, writes TOut.
-{
- public:
- /** Create an InputTransform operator fixed on a given problem and set of
- * pointers.
- */
- InputTransform(
- int kernel_rows, /**< Number of rows in the kernel. */
- int kernel_cols, /**< Number of columns in the kernel. */
- int n_batches, /**< Number of batches in input tensor. */
- int n_rows, /**< Number of rows in input tensor. */
- int n_cols, /**< Number of columns in input tensor. */
- int n_channels, /**< Number of channels in input tensor. */
- int padding_top, /**< Padding to apply to the top of the image. */
- int padding_left, /**< Padding to apply to the left of the image. */
- int padding_bottom, /**< Padding to apply to the bottom of the image. */
- int padding_right /**< Padding to apply to the right of the image. */
- );
-
- InputTransform(const InputTransform&) = delete; // Not copy-constructible (canonical signature).
- InputTransform& operator=(const InputTransform&) = delete; // Not copy-assignable (canonical signature).
-
- /** Set pointers to the input tensor read by the transform. */
- void set_input_tensor(const void *input) override;
- void set_input_tensor(const void *input, int col_stride) override;
- void set_input_tensor(const void *input, int row_stride, int col_stride) override;
- void set_input_tensor(const void *input, int batch_stride, int row_stride, int col_stride) override;
-
- /** Set pointers to the matrices written by the transform. */
- void set_output_matrices(void *matrices, int inter_matrix_stride, int matrix_row_stride) override;
-
- /** Get the working space required to perform the transformation. */
- size_t get_working_space_size(unsigned int nthreads=1) const override;
- void set_working_space(void *buffer) override;
-
- /** Get the window of work a given operator can perform. */
- unsigned int get_window() const override;
- static constexpr unsigned int WINDOW_BLOCK = 16; // Base size of window
-
- /** Perform work upon a window of the input. */
- void run(unsigned int start, unsigned int stop, unsigned int threadid=0) override;
-
- protected:
- const int _n_batches, _n_rows, _n_cols, _n_channels;
-
- private:
- void transform_unpadded_tile(
- unsigned int threadid,
- int n_channels,
- TOut *outptr,
- const TIn *inptr
- );
-
- void transform_padded_tile(
- unsigned int threadid,
- int n_channels,
- TOut *outptr,
- const TIn *inptr,
- int padding_top,
- int padding_left,
- int padding_bottom,
- int padding_right
- );
-
- /* Tile implementation */
- static void transform_tile(
- int n_channels, /**< [in] Number of channels in the tensor. */
- const TIn* inptr_base, /**< [in] Pointer to the base of the input tile. */
- int input_row_stride, /**< [in] Stride between rows of the input tensor. */
- int input_col_stride, /**< [in] Stride between columns of the input tensor. */
- TOut* mptr_base, /**< [out] Base pointer to transformed input matrices. */
- int matrix_stride /**< [in] Stride between matrices in the input space. */
- );
-
- /** Get the working space for a thread. */
- void * get_working_space(unsigned int threadid) const;
-
- const TIn* _inptr;
- TOut* _outptr;
-
- const int _overlap_rows, _overlap_cols;
- const int _padding_top, _padding_left, _padding_bottom, _padding_right;
- const int _tiles_M, _tiles_N;
- int _matrix_stride, _matrix_row_stride, _matrix_batch_stride;
- int _in_col_stride, _in_row_stride, _in_batch_stride;
-
- const int _working_space_col_stride, _working_space_row_stride;
- TIn *_working_space;
-};
-
-template <int InnerTileRows, typename TIn, typename TOut, WinogradRoots Roots>
-class InputTransform<InnerTileRows, 1, TIn, TOut, Roots> : // Partial specialisation for inner tiles that are a single column wide...
- public InputTransform<1, InnerTileRows, TIn, TOut, Roots> // ...derived from the 1 x InnerTileRows transform.
-{
- using Base = InputTransform<1, InnerTileRows, TIn, TOut, Roots>;
-
- public:
- InputTransform(
- int kernel_rows, /**< Number of rows in the kernel. */
- int kernel_cols, /**< Number of columns in the kernel. */
- int n_batches, /**< Number of batches in input tensor. */
- int n_rows, /**< Number of rows in input tensor. */
- int n_cols, /**< Number of columns in input tensor. */
- int n_channels, /**< Number of channels in input tensor. */
- int padding_top, /**< Padding to apply to the top of the image. */
- int padding_left, /**< Padding to apply to the left of the image. */
- int padding_bottom, /**< Padding to apply to the bottom of the image. */
- int padding_right /**< Padding to apply to the right of the image. */
- );
-
- /** Set pointers to the input tensor read by the transform. NOTE(review): presumably re-declared to remap row/column strides onto the 1 x N base - confirm against the definitions. */
- void set_input_tensor(const void *input) override;
- void set_input_tensor(const void *input, int col_stride) override;
- void set_input_tensor(const void *input, int row_stride, int col_stride) override;
- void set_input_tensor(const void *input, int batch_stride, int row_stride, int col_stride) override;
-};
-
-template <
- int KernelRows, int KernelCols,
- int InnerTileRows, int InnerTileCols,
- typename TIn, typename TOut,
- WinogradRoots Roots
->
-class OutputTransform : public IOutputTransform // Output transform for the given kernel and inner tile sizes; reads TIn matrices, writes TOut.
-{
- public:
- OutputTransform(
- int n_batches, /**< Number of batches in output tensor. */
- int n_rows, /**< Number of rows in output tensor. */
- int n_cols, /**< Number of columns in output tensor. */
- int n_channels, /**< Number of channels in output tensor. */
- const arm_gemm::Activation &activation /**< Activation descriptor; NOTE(review): presumably the source of _output_min/_output_max - confirm. */
- );
-
- OutputTransform(const OutputTransform&) = delete; // Not copy-constructible (canonical signature).
- OutputTransform& operator=(const OutputTransform&) = delete; // Not copy-assignable (canonical signature).
-
- /** Set pointers to the matrices read by the transform. */
- void set_input_matrices(const void *matrices, int inter_matrix_stride, int matrix_row_stride) override;
-
- /** Set pointer to the bias tensor (can be ignored or called with nullptr for no bias). */
- void set_bias(const void *bias=nullptr) override;
-
- /** Set pointers to the output tensor written by the transform. */
- void set_output_tensor(void *output) override;
- void set_output_tensor(void *output, int col_stride) override;
- void set_output_tensor(void *output, int row_stride, int col_stride) override;
- void set_output_tensor(void *output, int batch_stride, int row_stride, int col_stride) override;
-
- /** Get the working space required to perform the transformation. */
- size_t get_working_space_size(unsigned int nthreads=1) const override;
- void set_working_space(void *buffer) override;
-
- /** Get the window of work a given operator can perform. */
- unsigned int get_window() const override;
- static constexpr unsigned int WINDOW_BLOCK = 16; // Base size of window
-
- /** Perform work upon a window of the input. */
- void run(unsigned int start, unsigned int stop, unsigned int threadid=0) override;
-
- protected:
- static constexpr int inner_tile_rows = InnerTileRows;
- static constexpr int inner_tile_cols = InnerTileCols;
- static constexpr int output_tile_rows = InnerTileRows - KernelRows + 1; // Output tile size follows from inner tile and kernel size.
- static constexpr int output_tile_cols = InnerTileCols - KernelCols + 1;
-
- const int _n_batches, _n_rows, _n_cols, _n_channels;
- const TOut _output_min, _output_max;
-
- private:
- void transform_uncropped_tile(
- unsigned int threadid,
- int n_channels,
- TOut *outptr,
- const TIn *inptr,
- const TOut *biases
- );
-
- void transform_cropped_tile(
- unsigned int threadid,
- int n_channels,
- TOut *outptr,
- const TIn *inptr,
- const TOut *biases,
- int pad_bottom,
- int pad_right
- );
-
- /** Implementation of the tile transformation method. */
- static void transform_tile(
- int n_channels,
- const TIn* matrix_base,
- int matrix_stride,
- const TOut* biases,
- TOut* output,
- int output_row_stride,
- int output_col_stride,
- TOut output_min,
- TOut output_max
- );
-
- /** Get the working space for a thread. */
- void * get_working_space(unsigned int threadid) const;
-
- const TIn* _matrix_base;
- const TOut* _biases;
- int _matrix_stride, _matrix_row_stride, _matrix_batch_stride;
- TOut* _outptr;
- const int _tiles_M, _tiles_N;
- int _out_col_stride, _out_row_stride, _out_batch_stride;
-
- const int _working_space_col_stride, _working_space_row_stride;
- TOut *_working_space;
-};
-
-template <
- int KernelRows,
- int InnerTileRows,
- typename TIn, typename TOut,
- WinogradRoots Roots
->
-class OutputTransform<KernelRows, 1, InnerTileRows, 1, TIn, TOut, Roots> : // Partial specialisation for single-column kernels and inner tiles...
- public OutputTransform<1, KernelRows, 1, InnerTileRows, TIn, TOut, Roots> // ...derived from the transform with rows and columns exchanged.
-{
- using Base = OutputTransform<1, KernelRows, 1, InnerTileRows, TIn, TOut, Roots>;
-
- public:
- OutputTransform(
- int n_batches, /**< Number of batches in output tensor. */
- int n_rows, /**< Number of rows in output tensor. */
- int n_cols, /**< Number of columns in output tensor. */
- int n_channels, /**< Number of channels in output tensor. */
- const arm_gemm::Activation &activation
- );
-
- /** Set pointers to the output tensor written by the transform. NOTE(review): presumably re-declared to remap row/column strides onto the 1 x N base - confirm against the definitions. */
- void set_output_tensor(void *output) override;
- void set_output_tensor(void *output, int col_stride) override;
- void set_output_tensor(void *output, int row_stride, int col_stride) override;
- void set_output_tensor(void *output, int batch_stride, int row_stride, int col_stride) override;
-};
-
-template <
- int KernelRows, int KernelCols,
- int InnerTileRows, int InnerTileCols,
- typename TIn, typename TOut,
- WinogradRoots Roots
->
-class WeightTransform : public IWeightTransform // Weight transform for the given kernel and inner tile sizes; reads TIn weights, writes TOut matrices.
-{
- public:
- WeightTransform(
- int n_output_channels, /**< Number of output channels in the kernel. */
- int n_input_channels /**< Number of input channels in the kernel. */
- );
-
- WeightTransform(const WeightTransform&) = delete; // Not copy-constructible (canonical signature).
- WeightTransform& operator=(const WeightTransform&) = delete; // Not copy-assignable (canonical signature).
-
- /** Set pointer to the weight tensor read by the transform. */
- void set_weight_tensor(const void *weights) override;
-
- /** Set pointer to the matrices written by the transform. */
- void set_output_matrices(void *matrices, int inter_matrix_stride, int matrix_row_stride) override;
-
- /** Get the working space required to perform the transformation. */
- size_t get_working_space_size(unsigned int nthreads=1) const override;
- void set_working_space(void *buffer) override;
-
- /** Get the window of work a given operator can perform. */
- unsigned int get_window() const override;
- static constexpr unsigned int WINDOW_BLOCK = 16; // Base size of window
-
- /** Perform work upon a window of the input. */
- void run(unsigned int start, unsigned int stop, unsigned int threadid=0) override;
-
- protected:
- static const int kernel_rows = KernelRows;
- static const int kernel_cols = KernelCols;
- static const int inner_tile_rows = InnerTileRows;
- static const int inner_tile_cols = InnerTileCols;
-
- private:
- /** Apply the transform to a tensor. */
- static void execute(
- int n_output_channels,
- int n_input_channels,
- const TIn* input,
- TOut* output,
- int matrix_stride,
- int matrix_row_stride
- );
-
- const int _n_output_channels, _n_input_channels;
- TOut *_matrices;
- int _matrix_stride, _matrix_row_stride;
- const TIn *_weights;
-};
-
-template <int KernelRows, int InnerTileRows, typename TIn, typename TOut, WinogradRoots Roots>
-class WeightTransform<KernelRows, 1, InnerTileRows, 1, TIn, TOut, Roots> : // Partial specialisation for single-column kernels and inner tiles.
- public WeightTransform<1, KernelRows, 1, InnerTileRows, TIn, TOut, Roots>
-{
- public:
- using WeightTransform<1, KernelRows, 1, InnerTileRows, TIn, TOut, Roots>::WeightTransform; // Inherit the base class constructors.
-};
-
-template <int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols, WinogradRoots Roots>
-class WinogradGEMM // Groups the tile geometry, transform types and size queries for one output-tile/kernel size combination.
-{
- public:
- // Information about the specific Winograd instance
- static constexpr int output_tile_rows = OutputTileRows;
- static constexpr int output_tile_cols = OutputTileCols;
- static constexpr int kernel_rows = KernelRows;
- static constexpr int kernel_cols = KernelCols;
- static constexpr int inner_tile_rows = output_tile_rows + kernel_rows - 1;
- static constexpr int inner_tile_cols = output_tile_cols + kernel_cols - 1;
- static constexpr int N_GEMMS = inner_tile_rows * inner_tile_cols;
-
- /** Transform weights from the spatial to the Winograd domain. */
- template <typename TIn, typename TOut>
- using WeightsTransform = winograd::WeightTransform<
- KernelRows, KernelCols, inner_tile_rows, inner_tile_cols,
- TIn, TOut, Roots
- >;
-
- /** Transform input feature maps from the spatial to the Winograd domain.
- * The target is qualified with winograd:: so this member alias does not change the meaning of the name it uses. */
- template <typename TIn, typename TOut>
- using InputTransform = winograd::InputTransform<
- inner_tile_rows, inner_tile_cols, TIn, TOut, Roots
- >;
-
- /** Transform output feature maps from the Winograd to the spatial domain.
- * The target is qualified with winograd:: so this member alias does not change the meaning of the name it uses. */
- template <typename TIn, typename TOut>
- using OutputTransform = winograd::OutputTransform<
- KernelRows, KernelCols, inner_tile_rows, inner_tile_cols,
- TIn, TOut, Roots
- >;
-
- /** Perform a convolution.
- */
- template <typename TOut, typename TIn, typename TInGEMM=TIn, typename TOutGEMM=TOut>
- class Convolution
- {
- public:
- // Information about the typed Winograd instance
- using OutputType = TOut;
- using GemmOutputType = TOutGEMM;
- using GemmInputType = TInGEMM;
- using InputType = TIn;
-
- /** Get the output shape of a convolution. */
- static std::pair<unsigned int, unsigned int> get_output_shape(
- const std::pair<unsigned int, unsigned int> input_shape,
- bool padding_same);
-
- /** Get the memory required to store the kernel transformed into the
- * Winograd domain.
- */
- static size_t get_kernel_storage_size(unsigned int n_input_channels,
- unsigned int n_output_channels);
-
- /** Get the memory required to store the input tensor transformed into
- * the Winograd domain.
- */
- static size_t get_input_storage_size(
- unsigned int n_batches, // Number of batches
- unsigned int n_rows, // Number of input rows
- unsigned int n_cols, // Number of input columns
- unsigned int n_channels, // Number of input channels
- bool padding_same);
-
- /** Get the memory required to store the output tensor in the Winograd
- * domain.
- */
- static size_t get_output_storage_size(
- unsigned int n_batches, // Number of batches
- unsigned int n_rows, // Number of output rows
- unsigned int n_cols, // Number of output columns
- unsigned int n_channels // Number of output channels
- );
-
- /** Get the memory required to apply a Winograd operator to some input.
- */
- static size_t get_working_space_size(
- unsigned int n_batches, // Number of batches
- unsigned int n_rows, // Number of input rows
- unsigned int n_cols, // Number of input columns
- unsigned int n_input_channels, // Number of input channels
- unsigned int n_output_channels, // Number of output channels
- bool padding_same);
-
- /** Get the memory required by a single "input" matrix.
- */
- static size_t get_input_matrix_size(
- unsigned int n_batches, // Number of batches
- unsigned int n_rows, // Number of input rows
- unsigned int n_cols, // Number of input columns
- unsigned int n_channels, // Number of input channels
- bool padding_same);
-
- static int get_input_matrix_stride(
- unsigned int n_batches, // Number of batches
- unsigned int n_rows, // Number of input rows
- unsigned int n_cols, // Number of input columns
- unsigned int n_channels, // Number of input channels
- bool padding_same);
-
- /** Get the memory required by a single "output" matrix.
- */
- static size_t get_output_matrix_size(
- unsigned int n_batches, // Number of batches
- unsigned int n_rows, // Number of output rows
- unsigned int n_cols, // Number of output columns
- unsigned int n_channels // Number of output channels
- );
-
- static int get_output_matrix_stride(
- unsigned int n_batches, // Number of batches
- unsigned int n_rows, // Number of output rows
- unsigned int n_cols, // Number of output columns
- unsigned int n_channels // Number of output channels
- );
-
- /** Get the memory required by a single "kernel" matrix.
- */
- static size_t get_kernel_matrix_size(unsigned int n_input_channels,
- unsigned int n_output_channels);
- static int get_kernel_matrix_stride(unsigned int n_input_channels,
- unsigned int n_output_channels);
-
- static constexpr int M_BLOCK = 4; /**< Size of block used by GEMM. */
- static constexpr int N_BLOCK = 16; /**< Size of block used by GEMM. */
- };
-};
-
-} // namespace winograd