diff options
5 files changed, 419 insertions, 576 deletions
diff --git a/arm_compute/core/NEON/kernels/convolution/winograd/transforms/input.hpp b/arm_compute/core/NEON/kernels/convolution/winograd/transforms/input.hpp index 473a13c3b0..b813bbb25c 100644 --- a/arm_compute/core/NEON/kernels/convolution/winograd/transforms/input.hpp +++ b/arm_compute/core/NEON/kernels/convolution/winograd/transforms/input.hpp @@ -155,15 +155,15 @@ namespace winograd T* const outptr = matrix_base + tile_j*matrix_row_stride; // Apply the specific tile processing function - const int f_pad_top = iceildiv(pad_top, 2); - const int f_pad_left = iceildiv(t_pad_left, 2); - tile_fns[f_pad_top][f_pad_left][pad_bottom][t_pad_right]( + const typename Tiles::TileFn tilefn = Tiles::get_tile_specialization( + pad_top, t_pad_left, pad_bottom, t_pad_right + ); + + tilefn( n_channels, - input_base_col, - input_row_stride, - input_col_stride, - outptr, - matrix_stride + input_base_col, input_row_stride, input_col_stride, + outptr, matrix_stride, + pad_top, t_pad_left, pad_bottom, t_pad_right ); } } @@ -264,4 +264,86 @@ namespace winograd matrix_stride, matrix_batch_stride, matrix_row_stride ); } + + template <int KernelRows, int KernelCols, int InnerTileRows, int InnerTileCols, typename T> + typename InputTransformImplTiles<KernelRows, KernelCols, InnerTileRows, InnerTileCols, T>::TileFn + InputTransformImplTiles<KernelRows, KernelCols, InnerTileRows, InnerTileCols, T>:: + get_tile_specialization( + const int pad_top, + const int pad_left, + const int pad_bottom, + const int pad_right + ) + { + if (!(pad_top || pad_left || pad_bottom || pad_right)) + { + // No padding, return unpadded specialisation + return tilefn_unpadded; + } + else if (pad_top && !(pad_left || pad_bottom || pad_right)) + { + // Top padding only + const int index = (pad_top - min_pad_top) / (InnerTileRows - overlap_rows); + return tilefn_top_padded[index]; + } + else if (!(pad_top) && pad_left && !(pad_bottom || pad_right)) + { + // Left padding only + const int index = (pad_left - min_pad_left) / (InnerTileCols - overlap_cols); + return tilefn_left_padded[index]; + } + else if (!(pad_top || pad_left) && pad_bottom && !(pad_right)) + { + // Bottom padding only + return tilefn_bottom_padded[pad_bottom - 1]; + } + else if (!(pad_top || pad_left || pad_bottom) && pad_right) + { + // Right padding only + return tilefn_right_padded[pad_right - 1]; + } + else + { + // Combination of paddings, return an unspecialised method + return tilefn_generic; + } + } + + template <int KernelCols, int InnerTileCols, typename T> + typename InputTransformImplTiles<1, KernelCols, 1, InnerTileCols, T>::TileFn + InputTransformImplTiles<1, KernelCols, 1, InnerTileCols, T>:: + get_tile_specialization( + const int pad_top, + const int pad_left, + const int pad_bottom, + const int pad_right + ) + { + (void) pad_top; + (void) pad_bottom; + + if (!(pad_left || pad_right)) + { + // No padding, return unpadded specialisation + return tilefn_unpadded; + } + else if (pad_left && !pad_right) + { + // Left padding only + const int index = (pad_left - min_pad_left) / (InnerTileCols - overlap_cols); + return tilefn_left_padded[index]; + } + else if (!pad_left && pad_right) + { + // Right padding only + return tilefn_right_padded[pad_right - 1]; + } + else + { + // Combination of paddings, return an unspecialised method + return tilefn_generic; + } + } } + + diff --git a/arm_compute/core/NEON/kernels/convolution/winograd/winograd_input_transform.hpp b/arm_compute/core/NEON/kernels/convolution/winograd/winograd_input_transform.hpp index abcda53534..995554d7f2 100644 --- a/arm_compute/core/NEON/kernels/convolution/winograd/winograd_input_transform.hpp +++ b/arm_compute/core/NEON/kernels/convolution/winograd/winograd_input_transform.hpp @@ -31,6 +31,109 @@ namespace { template <int KernelRows, int KernelCols, int InnerTileRows, int InnerTileCols, typename T> +class InputTransformImplTiles +{ + public: + /** Method to transform a tile of the input tensor into the Winograd domain. */ + typedef void (*TileFn)( + const int n_channels, /** @param[in] Number of channels in the tensor. */ + const T* const inptr_base, /** @param[in] Pointer to the base of the input tile. */ + const int input_row_stride, /** @param[in] Stride between rows of the input tensor. */ + const int input_col_stride, /** @param[in] Stride between columns of the input tensor. */ + T* const mptr_base, /** @param[out] Base pointer to transformed input matrices. */ + const int matrix_stride, /** @param[in] Stride between matrices in the input space. */ + const int _pad_top, /** @param[in] Top padding for unspecialised tiles. */ + const int _pad_left, /** @param[in] Left padding for unspecialised tiles. */ + const int _pad_bottom, /** @param[in] Bottom padding for unspecialised tiles. */ + const int _pad_right /** @param[in] Right padding for unspecialised tiles. */ + ); + + static TileFn get_tile_specialization( + const int pad_top, + const int pad_left, + const int pad_bottom, + const int pad_right + ); + + // Tile overlaps + static constexpr int overlap_rows = KernelRows - 1; + static constexpr int overlap_cols = KernelCols - 1; + + private: + + // Maximum padding and number of distinct paddings + static constexpr int max_pad_top = KernelRows / 2; + static constexpr int min_pad_top = KernelRows % (InnerTileRows - overlap_rows); + static constexpr int n_pad_top = iceildiv(max_pad_top, InnerTileRows - overlap_rows); + + static constexpr int max_pad_left = KernelCols / 2; + static constexpr int min_pad_left = KernelCols % (InnerTileCols - overlap_cols); + static constexpr int n_pad_left = iceildiv(max_pad_left, InnerTileCols - overlap_cols); + + static constexpr int n_pad_bottom = InnerTileRows; + static constexpr int n_pad_right = InnerTileCols; + + // Pointers to methods implementing a generically padded tile and a totally unpadded tile. + static const TileFn tilefn_generic; /** Generic tile processing function. */ + static const TileFn tilefn_unpadded; /** Tile processor for unpadded tiles. */ + + // Arrays of methods covering tiles which are padded only on a single side. + static const TileFn tilefn_top_padded[n_pad_top]; + static const TileFn tilefn_left_padded[n_pad_left]; + static const TileFn tilefn_bottom_padded[n_pad_bottom]; + static const TileFn tilefn_right_padded[n_pad_right]; +}; + + +template < int KernelCols, int InnerTileCols, typename T> +class InputTransformImplTiles<1, KernelCols, 1, InnerTileCols, T> +{ + public: + /** Method to transform a tile of the input tensor into the Winograd domain. */ + typedef void (*TileFn)( + const int n_channels, /** @param[in] Number of channels in the tensor. */ + const T* const inptr_base, /** @param[in] Pointer to the base of the input tile. */ + const int input_row_stride, /** @param[in] Stride between rows of the input tensor. */ + const int input_col_stride, /** @param[in] Stride between columns of the input tensor. */ + T* const mptr_base, /** @param[out] Base pointer to transformed input matrices. */ + const int matrix_stride, /** @param[in] Stride between matrices in the input space. */ + const int _pad_top, /** @param[in] Top padding for unspecialised tiles. */ + const int _pad_left, /** @param[in] Left padding for unspecialised tiles. */ + const int _pad_bottom, /** @param[in] Bottom padding for unspecialised tiles. */ + const int _pad_right /** @param[in] Right padding for unspecialised tiles. */ + ); + + static TileFn get_tile_specialization( + const int pad_top, + const int pad_left, + const int pad_bottom, + const int pad_right + ); + + // Tile overlaps + static constexpr int overlap_rows = 0; + static constexpr int overlap_cols = KernelCols - 1; + + private: + // Maximum padding and number of distinct paddings + static constexpr int max_pad_left = KernelCols / 2; + static constexpr int min_pad_left = KernelCols % (InnerTileCols - overlap_cols); + static constexpr int n_pad_left = iceildiv(max_pad_left, InnerTileCols - overlap_cols); + + static constexpr int n_pad_right = InnerTileCols; + + // Pointers to methods implementing a generically padded tile and a totally unpadded tile. + static const TileFn tilefn_generic; /** Generic tile processing function. */ + static const TileFn tilefn_unpadded; /** Tile processor for unpadded tiles. */ + + // Arrays of methods covering tiles which are padded only on a single side. + static const TileFn tilefn_left_padded[n_pad_left]; + static const TileFn tilefn_right_padded[n_pad_right]; +}; + + + +template <int KernelRows, int KernelCols, int InnerTileRows, int InnerTileCols, typename T> class InputTransformImpl { public: @@ -69,29 +172,13 @@ class InputTransformImpl const int n_cols ); - // Tile overlaps - static constexpr int overlap_rows = KernelRows - 1; - static constexpr int overlap_cols = KernelCols - 1; - - // Maximum padding and number of distinct paddings - static constexpr int max_pad_top = KernelRows / 2; - static constexpr int n_pad_top = 1 + iceildiv(max_pad_top, InnerTileRows - overlap_rows); + using Tiles = InputTransformImplTiles<KernelRows, KernelCols, InnerTileRows, InnerTileCols, T>; - static constexpr int max_pad_left = KernelCols / 2; - static constexpr int n_pad_left = 1 + iceildiv(max_pad_left, InnerTileCols - overlap_cols); + static constexpr int overlap_rows = Tiles::overlap_rows; + static constexpr int overlap_cols = Tiles::overlap_cols; - static constexpr int n_pad_bottom = InnerTileRows; - static constexpr int n_pad_right = InnerTileCols; - - /** Process a single tile of the input tensor. */ - template <int pad_top, int pad_left, int pad_bottom, int pad_right> - static void process_tile(int, const T*, int, int, T*, int); - // Array of methods to transform tiles of the input tensor. - typedef void (*TileFn)(int, const T*, int, int, T*, int); - static const TileFn - tile_fns[n_pad_top][n_pad_left][n_pad_bottom][n_pad_right]; -}; + }; template <int KernelRows, int InnerTileRows, typename T> diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/input_1x8_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/input_1x8_fp32.cpp index 042d4debbc..e66300d39a 100644 --- a/src/core/NEON/kernels/convolution/winograd/transforms/input_1x8_fp32.cpp +++ b/src/core/NEON/kernels/convolution/winograd/transforms/input_1x8_fp32.cpp @@ -29,19 +29,30 @@ namespace { -template <int pad_top, int pad_left, int pad_bottom, int pad_right> +template <bool Specialized, int PadTop=0, int PadLeft=0, int PadBottom=0, int PadRight=0> void winograd_input_transform_1x8_fp32_process_tile( int n_channels, const float* const input_base, const int input_row_stride, const int input_col_stride, float* const matrix_base, - const int matrix_stride + const int matrix_stride, + const int _pad_top, + const int _pad_left, + const int _pad_bottom, + const int _pad_right ) { (void) input_row_stride; // No rows over which to stride + (void) _pad_top; // Never any top padding + (void) _pad_bottom; // Never any bottom padding + + // Extract padding arguments + const int pad_left = Specialized ? PadLeft : _pad_left; + const int pad_right = Specialized ? PadRight : _pad_right; + constexpr int inner_tile_cols = 8; - constexpr int cells_j = inner_tile_cols - pad_right; + const int cells_j = inner_tile_cols - pad_right; float *outptr = matrix_base; @@ -162,109 +173,85 @@ void winograd_input_transform_1x8_fp32_process_tile( namespace winograd { template <int x> -using Transform = InputTransformImpl<1, x, 1, 8, float>; +using Tiles = InputTransformImplTiles<1, x, 1, 8, float>; +/*****************************************************************************/ +// 1x3 specialisations template <> -const Transform<3>::TileFn - Transform<3>::tile_fns[n_pad_top][n_pad_left][n_pad_bottom][n_pad_right] = -{ - { - { - { - winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 0>, - winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 1>, - winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 2>, - winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 3>, - winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 4>, - winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 5>, - winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 6>, - } - }, - { - { - winograd_input_transform_1x8_fp32_process_tile<0, 1, 0, 0>, - winograd_input_transform_1x8_fp32_process_tile<0, 1, 0, 1>, - winograd_input_transform_1x8_fp32_process_tile<0, 1, 0, 2>, - winograd_input_transform_1x8_fp32_process_tile<0, 1, 0, 3>, - winograd_input_transform_1x8_fp32_process_tile<0, 1, 0, 4>, - winograd_input_transform_1x8_fp32_process_tile<0, 1, 0, 5>, - winograd_input_transform_1x8_fp32_process_tile<0, 1, 0, 6>, - } - } - } +const Tiles<3>::TileFn Tiles<3>::tilefn_generic = winograd_input_transform_1x8_fp32_process_tile<false>; + +template <> +const Tiles<3>::TileFn Tiles<3>::tilefn_unpadded = winograd_input_transform_1x8_fp32_process_tile<true>; + +template <> +const Tiles<3>::TileFn Tiles<3>::tilefn_left_padded[n_pad_left] = { + winograd_input_transform_1x8_fp32_process_tile<true, 0, 1, 0, 0>, }; template <> -const Transform<5>::TileFn - Transform<5>::tile_fns[n_pad_top][n_pad_left][n_pad_bottom][n_pad_right] = -{ - { - { - { - winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 0>, - winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 1>, - winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 2>, - winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 3>, - winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 4>, - winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 5>, - winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 6>, - } - }, - { - { - winograd_input_transform_1x8_fp32_process_tile<0, 2, 0, 0>, - winograd_input_transform_1x8_fp32_process_tile<0, 2, 0, 1>, - winograd_input_transform_1x8_fp32_process_tile<0, 2, 0, 2>, - winograd_input_transform_1x8_fp32_process_tile<0, 2, 0, 3>, - winograd_input_transform_1x8_fp32_process_tile<0, 2, 0, 4>, - winograd_input_transform_1x8_fp32_process_tile<0, 2, 0, 5>, - winograd_input_transform_1x8_fp32_process_tile<0, 2, 0, 6>, - } - } - } +const Tiles<3>::TileFn Tiles<3>::tilefn_right_padded[n_pad_right] = { + winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 1>, + winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 2>, + winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 3>, + winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 4>, + winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 5>, + winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 6>, + winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 7>, }; +/*****************************************************************************/ +/*****************************************************************************/ +// 1x5 specialisations template <> -const Transform<7>::TileFn - Transform<7>::tile_fns[n_pad_top][n_pad_left][n_pad_bottom][n_pad_right] = -{ - { - { - { - winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 0>, - winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 1>, - winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 2>, - winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 3>, - winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 4>, - winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 5>, - winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 6>, - } - }, - { - { - winograd_input_transform_1x8_fp32_process_tile<0, 1, 0, 0>, - winograd_input_transform_1x8_fp32_process_tile<0, 1, 0, 1>, - winograd_input_transform_1x8_fp32_process_tile<0, 1, 0, 2>, - winograd_input_transform_1x8_fp32_process_tile<0, 1, 0, 3>, - winograd_input_transform_1x8_fp32_process_tile<0, 1, 0, 4>, - winograd_input_transform_1x8_fp32_process_tile<0, 1, 0, 5>, - winograd_input_transform_1x8_fp32_process_tile<0, 1, 0, 6>, - } - }, - { - { - winograd_input_transform_1x8_fp32_process_tile<0, 3, 0, 0>, - winograd_input_transform_1x8_fp32_process_tile<0, 3, 0, 1>, - winograd_input_transform_1x8_fp32_process_tile<0, 3, 0, 2>, - winograd_input_transform_1x8_fp32_process_tile<0, 3, 0, 3>, - winograd_input_transform_1x8_fp32_process_tile<0, 3, 0, 4>, - winograd_input_transform_1x8_fp32_process_tile<0, 3, 0, 5>, - winograd_input_transform_1x8_fp32_process_tile<0, 3, 0, 6>, - } - } - } +const Tiles<5>::TileFn Tiles<5>::tilefn_generic = winograd_input_transform_1x8_fp32_process_tile<false>; + +template <> +const Tiles<5>::TileFn Tiles<5>::tilefn_unpadded = winograd_input_transform_1x8_fp32_process_tile<true>; + +template <> +const Tiles<5>::TileFn Tiles<5>::tilefn_left_padded[n_pad_left] = { + winograd_input_transform_1x8_fp32_process_tile<true, 0, 2, 0, 0>, }; +template <> +const Tiles<5>::TileFn Tiles<5>::tilefn_right_padded[n_pad_right] = { + winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 1>, + winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 2>, + winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 3>, + winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 4>, + winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 5>, + winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 6>, + winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 7>, +}; +/*****************************************************************************/ + +/*****************************************************************************/ +// 1x7 specialisations +template <> +const Tiles<7>::TileFn Tiles<7>::tilefn_generic = winograd_input_transform_1x8_fp32_process_tile<false>; + +template <> +const Tiles<7>::TileFn Tiles<7>::tilefn_unpadded = winograd_input_transform_1x8_fp32_process_tile<true>; + +template <> +const Tiles<7>::TileFn Tiles<7>::tilefn_left_padded[n_pad_left] = { + winograd_input_transform_1x8_fp32_process_tile<true, 0, 1, 0, 0>, + winograd_input_transform_1x8_fp32_process_tile<true, 0, 3, 0, 0>, +}; + +template <> +const Tiles<7>::TileFn Tiles<7>::tilefn_right_padded[n_pad_right] = { + winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 1>, + winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 2>, + winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 3>, + winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 4>, + winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 5>, + winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 6>, + winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 7>, +}; +/*****************************************************************************/ + + template class InputTransform<1, 3, 1, 8, float>; template class InputTransform<3, 1, 8, 1, float>; template class InputTransform<1, 5, 1, 8, float>; diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/input_2x2_3x3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/input_2x2_3x3_fp32.cpp index a9d5d52d15..4203945dd3 100644 --- a/src/core/NEON/kernels/convolution/winograd/transforms/input_2x2_3x3_fp32.cpp +++ b/src/core/NEON/kernels/convolution/winograd/transforms/input_2x2_3x3_fp32.cpp @@ -29,75 +29,36 @@ namespace winograd { -using Transform = InputTransformImpl<3, 3, 4, 4, float>; - -/***************************************************************************** -* F(2x2, 3x3) implies the use of a 4x4 input tile. Such tiles can require a -* variety of padding types. For example, tiles at the top and left of an image -* can require one row or column of padding on their top and left sides if the -* padding type is SAME (where X represents a padded value): -* -* _______ _______ -* |X X X X| |X X X X| -* |X | | | . . . -* |X | | | -* |X______| |_______| -* _______ -* |X | . -* |X | . . . . -* |X | . -* |X______| -* -* For tiles near the right or bottom of the image it is more complicated. Such -* tiles might require padding by 0 or 1 rows or columns if the padding type is -* VALID or 1 or 2 rows or columns if the padding type is SAME: -* -* _______ _______ _______ _______ -* |X X X X| |X X X X| |X X X X| |X X X X| -* |X | | | | X| | X X| -* |X | | | | X| | X X| -* |X______| |_______| |______X| |____X_X| -* _______ _______ _______ _______ -* |X | | | | X| | X X| -* |X | | | | X| | X X| -* |X | | | | X| | X X| -* |X______| |_______| |______X| |____X_X| -* _______ _______ _______ _______ -* |X | | | | X| | X X| -* |X | | | | X| | X X| -* |X | | | | X| | X X| -* |X_X_X_X| |X_X_X_X| |X_X_X_X| |X_X_X_X| -* _______ _______ _______ _______ -* |X | | | | X| | X X| -* |X | | | | X| | X X| -* |X X X X| |X X X X| |X X X X| |X X X X| -* |X_X_X_X| |X_X_X_X| |X_X_X_X| |X_X_X_X| -* -* Additional tiles are required for especially small input images. -* -* Build an array of the specialised methods that deal with each of the -* different padding combinations which may be required. These padding -* constraints are the space: -* -* Padding top in {0, 1} -* Padding left in {0, 1} -* Padding bottom in {0, 1, 2} -* Padding right in {0, 1, 2} -*/ -template <> -template <int pad_top, int pad_left, int pad_bottom, int pad_right> -void Transform::process_tile( +using Tiles = InputTransformImplTiles<3, 3, 4, 4, float>; + +namespace +{ + + +template <bool Specialized, int PadTop=0, int PadLeft=0, int PadBottom=0, int PadRight=0> +void winograd_input_transform_4x4_fp32_process_tile( int n_channels, const float* const input_base, const int input_row_stride, const int input_col_stride, float* const matrix_base, - const int matrix_stride -) + const int matrix_stride, + const int _pad_top, + const int _pad_left, + const int _pad_bottom, + const int _pad_right + ) { +const int pad_top = Specialized ? PadTop : _pad_top; + const int pad_left = Specialized ? PadLeft : _pad_left; + const int pad_bottom = Specialized ? PadBottom : _pad_bottom; + const int pad_right = Specialized ? PadRight : _pad_right; + constexpr int inner_tile_i = 4, inner_tile_j = 4; - constexpr int cells_i = inner_tile_i - pad_bottom; - constexpr int cells_j = inner_tile_i - pad_right; + const int cells_i = inner_tile_i - pad_bottom; + const int cells_j = inner_tile_i - pad_right; + + float *outptr = matrix_base; @@ -311,81 +272,39 @@ void Transform::process_tile( } } +} // namespace (anonymous) + template <> -const Transform::TileFn Transform::tile_fns[n_pad_top][n_pad_left][n_pad_bottom][n_pad_right] = -{ - { - { - { - Transform::template process_tile<0, 0, 0, 0>, // No padding - Transform::template process_tile<0, 0, 0, 1>, // Right - Transform::template process_tile<0, 0, 0, 2>, // Right - }, - { - Transform::template process_tile<0, 0, 1, 0>, // Bottom - Transform::template process_tile<0, 0, 1, 1>, // Bottom-right - Transform::template process_tile<0, 0, 1, 2>, // Bottom-right - }, - { - Transform::template process_tile<0, 0, 2, 0>, // Bottom - Transform::template process_tile<0, 0, 2, 1>, // Bottom-right - Transform::template process_tile<0, 0, 2, 2>, // Bottom-right - } - }, - { - { - Transform::template process_tile<0, 1, 0, 0>, // Left - Transform::template process_tile<0, 1, 0, 1>, // Left AND right - Transform::template process_tile<0, 1, 0, 2>, // Left AND right - }, - { - Transform::template process_tile<0, 1, 1, 0>, // Left-bottom - Transform::template process_tile<0, 1, 1, 1>, // Left, bottom AND right - Transform::template process_tile<0, 1, 1, 2>, // Left, bottom AND right - }, - { - Transform::template process_tile<0, 1, 2, 0>, // Left-bottom - Transform::template process_tile<0, 1, 2, 1>, // Left, bottom AND right - Transform::template process_tile<0, 1, 2, 2>, // Left, bottom AND right - } - }, - }, - { - { - { - Transform::template process_tile<1, 0, 0, 0>, // Top - Transform::template process_tile<1, 0, 0, 1>, // Top-right - Transform::template process_tile<1, 0, 0, 2>, // Top-right - }, - { - Transform::template process_tile<1, 0, 1, 0>, // Top AND bottom - Transform::template process_tile<1, 0, 1, 1>, // Top, bottom AND right - Transform::template process_tile<1, 0, 1, 2>, // Top, bottom AND right - }, - { - Transform::template process_tile<1, 0, 2, 0>, // Top AND bottom - Transform::template process_tile<1, 0, 2, 1>, // Top, bottom AND right - Transform::template process_tile<1, 0, 2, 2>, // Top, bottom AND right - } - }, - { - { - Transform::template process_tile<1, 1, 0, 0>, // Top-left - Transform::template process_tile<1, 1, 0, 1>, // Top, left AND right - Transform::template process_tile<1, 1, 0, 2>, // Top, left AND right - }, - { - Transform::template process_tile<1, 1, 1, 0>, // Top, left AND bottom - Transform::template process_tile<1, 1, 1, 1>, // All padded - Transform::template process_tile<1, 1, 1, 2>, // All padded - }, - { - Transform::template process_tile<1, 1, 2, 0>, // Top, left AND bottom - Transform::template process_tile<1, 1, 2, 1>, // All padded - Transform::template process_tile<1, 1, 2, 2>, // All padded - } - } - } +const Tiles::TileFn Tiles::tilefn_generic = winograd_input_transform_4x4_fp32_process_tile<false>; + +template <> +const Tiles::TileFn Tiles::tilefn_unpadded = winograd_input_transform_4x4_fp32_process_tile<true>; + + +template <> +const Tiles::TileFn Tiles::tilefn_top_padded[n_pad_top] = { + winograd_input_transform_4x4_fp32_process_tile<true, 1, 0, 0, 0>, +}; + +template <> +const Tiles::TileFn Tiles::tilefn_left_padded[n_pad_left] = { + winograd_input_transform_4x4_fp32_process_tile<true, 0, 1, 0, 0>, +}; + +template <> +const Tiles::TileFn Tiles::tilefn_bottom_padded[n_pad_bottom] = { + winograd_input_transform_4x4_fp32_process_tile<true, 0, 0, 1, 0>, + winograd_input_transform_4x4_fp32_process_tile<true, 0, 0, 2, 0>, + winograd_input_transform_4x4_fp32_process_tile<true, 0, 0, 3, 0>, + winograd_input_transform_4x4_fp32_process_tile<true, 0, 0, 4, 0>, +}; + +template <> +const Tiles::TileFn Tiles::tilefn_right_padded[n_pad_right] = { + winograd_input_transform_4x4_fp32_process_tile<true, 0, 0, 0, 1>, + winograd_input_transform_4x4_fp32_process_tile<true, 0, 0, 0, 2>, + winograd_input_transform_4x4_fp32_process_tile<true, 0, 0, 0, 3>, + winograd_input_transform_4x4_fp32_process_tile<true, 0, 0, 0, 4>, }; template class InputTransform<3, 3, 4, 4, float>; diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/input_6x6_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/input_6x6_fp32.cpp index 908613068a..893122cc45 100644 --- a/src/core/NEON/kernels/convolution/winograd/transforms/input_6x6_fp32.cpp +++ b/src/core/NEON/kernels/convolution/winograd/transforms/input_6x6_fp32.cpp @@ -29,20 +29,30 @@ namespace { -template <int pad_top, int pad_left, int pad_bottom, int pad_right> +template <bool Specialized, int PadTop=0, int PadLeft=0, int PadBottom=0, int PadRight=0> void winograd_input_transform_6x6_fp32_process_tile( int n_channels, const float* const input_base, const int input_row_stride, const int input_col_stride, float* const matrix_base, - const int matrix_stride +const int matrix_stride, + const int _pad_top, + const int _pad_left, + const int _pad_bottom, + const int _pad_right ) { - constexpr int inner_tile_rows = 6; + const int pad_top = Specialized ? PadTop : _pad_top; + const int pad_left = Specialized ? PadLeft : _pad_left; + const int pad_bottom = Specialized ? PadBottom : _pad_bottom; + const int pad_right = Specialized ? PadRight : _pad_right; + + constexpr int inner_tile_rows = 6; constexpr int inner_tile_cols = 6; - constexpr int cells_i = inner_tile_rows - pad_bottom; - constexpr int cells_j = inner_tile_cols - pad_right; + + const int cells_i = inner_tile_rows - pad_bottom; + const int cells_j = inner_tile_cols - pad_right; float *outptr = matrix_base; @@ -285,322 +295,80 @@ void winograd_input_transform_6x6_fp32_process_tile( namespace winograd { template <int k> -using Transform = InputTransformImpl<k, k, 6, 6, float>; +using Tiles = InputTransformImplTiles<k, k, 6, 6, float>; template <> -const Transform<3>::TileFn - Transform<3>::tile_fns[n_pad_top][n_pad_left][n_pad_bottom][n_pad_right] = -{ - { - { - { - winograd_input_transform_6x6_fp32_process_tile<0, 0, 0, 0>, // No padding - winograd_input_transform_6x6_fp32_process_tile<0, 0, 0, 1>, // Right - winograd_input_transform_6x6_fp32_process_tile<0, 0, 0, 2>, // " " - winograd_input_transform_6x6_fp32_process_tile<0, 0, 0, 3>, // " " - winograd_input_transform_6x6_fp32_process_tile<0, 0, 0, 4>, // " " - }, - { - winograd_input_transform_6x6_fp32_process_tile<0, 0, 1, 0>, // Bottom - winograd_input_transform_6x6_fp32_process_tile<0, 0, 1, 1>, // Bottom right - winograd_input_transform_6x6_fp32_process_tile<0, 0, 1, 2>, // " " - winograd_input_transform_6x6_fp32_process_tile<0, 0, 1, 3>, // " " - winograd_input_transform_6x6_fp32_process_tile<0, 0, 1, 4>, // " " - }, - { - winograd_input_transform_6x6_fp32_process_tile<0, 0, 2, 0>, // Bottom - winograd_input_transform_6x6_fp32_process_tile<0, 0, 2, 1>, // Bottom right - winograd_input_transform_6x6_fp32_process_tile<0, 0, 2, 2>, // " " - winograd_input_transform_6x6_fp32_process_tile<0, 0, 2, 3>, // " " - winograd_input_transform_6x6_fp32_process_tile<0, 0, 2, 4>, // " " - }, - { - winograd_input_transform_6x6_fp32_process_tile<0, 0, 3, 0>, // Bottom - winograd_input_transform_6x6_fp32_process_tile<0, 0, 3, 1>, // Bottom right - winograd_input_transform_6x6_fp32_process_tile<0, 0, 3, 2>, // " " - winograd_input_transform_6x6_fp32_process_tile<0, 0, 3, 3>, // " " - winograd_input_transform_6x6_fp32_process_tile<0, 0, 3, 4>, // " " - }, - { - winograd_input_transform_6x6_fp32_process_tile<0, 0, 4, 0>, // Bottom - winograd_input_transform_6x6_fp32_process_tile<0, 0, 4, 1>, // Bottom right - winograd_input_transform_6x6_fp32_process_tile<0, 0, 4, 2>, // " " - winograd_input_transform_6x6_fp32_process_tile<0, 0, 4, 3>, // " " - winograd_input_transform_6x6_fp32_process_tile<0, 0, 4, 4>, // " " - } - }, - { - { - winograd_input_transform_6x6_fp32_process_tile<0, 1, 0, 0>, // Left - winograd_input_transform_6x6_fp32_process_tile<0, 1, 0, 1>, - winograd_input_transform_6x6_fp32_process_tile<0, 1, 0, 2>, - winograd_input_transform_6x6_fp32_process_tile<0, 1, 0, 3>, - winograd_input_transform_6x6_fp32_process_tile<0, 1, 0, 4>, - }, - { - winograd_input_transform_6x6_fp32_process_tile<0, 1, 1, 0>, // Bottom left - winograd_input_transform_6x6_fp32_process_tile<0, 1, 1, 1>, - winograd_input_transform_6x6_fp32_process_tile<0, 1, 1, 2>, - winograd_input_transform_6x6_fp32_process_tile<0, 1, 1, 3>, - winograd_input_transform_6x6_fp32_process_tile<0, 1, 1, 4>, - }, - { - winograd_input_transform_6x6_fp32_process_tile<0, 1, 2, 0>, // " " - winograd_input_transform_6x6_fp32_process_tile<0, 1, 2, 1>, - winograd_input_transform_6x6_fp32_process_tile<0, 1, 2, 2>, - winograd_input_transform_6x6_fp32_process_tile<0, 1, 2, 3>, - winograd_input_transform_6x6_fp32_process_tile<0, 1, 2, 4>, - }, - { - winograd_input_transform_6x6_fp32_process_tile<0, 1, 3, 0>, // " " - winograd_input_transform_6x6_fp32_process_tile<0, 1, 3, 1>, - winograd_input_transform_6x6_fp32_process_tile<0, 1, 3, 2>, - winograd_input_transform_6x6_fp32_process_tile<0, 1, 3, 3>, - winograd_input_transform_6x6_fp32_process_tile<0, 1, 3, 4>, - }, - { - winograd_input_transform_6x6_fp32_process_tile<0, 1, 4, 0>, // " " - winograd_input_transform_6x6_fp32_process_tile<0, 1, 4, 1>, - winograd_input_transform_6x6_fp32_process_tile<0, 1, 4, 2>, - winograd_input_transform_6x6_fp32_process_tile<0, 1, 4, 3>, - winograd_input_transform_6x6_fp32_process_tile<0, 1, 4, 4>, - } - } - }, - { - { - { - winograd_input_transform_6x6_fp32_process_tile<1, 0, 0, 0>, // Top - winograd_input_transform_6x6_fp32_process_tile<1, 0, 0, 1>, // Top right - winograd_input_transform_6x6_fp32_process_tile<1, 0, 0, 2>, // " " - winograd_input_transform_6x6_fp32_process_tile<1, 0, 0, 3>, // " " - winograd_input_transform_6x6_fp32_process_tile<1, 0, 0, 4>, // " " - }, - { - winograd_input_transform_6x6_fp32_process_tile<1, 0, 1, 0>, - winograd_input_transform_6x6_fp32_process_tile<1, 0, 1, 1>, - winograd_input_transform_6x6_fp32_process_tile<1, 0, 1, 2>, - winograd_input_transform_6x6_fp32_process_tile<1, 0, 1, 3>, - winograd_input_transform_6x6_fp32_process_tile<1, 0, 1, 4>, - }, - { - winograd_input_transform_6x6_fp32_process_tile<1, 0, 2, 0>, - winograd_input_transform_6x6_fp32_process_tile<1, 0, 2, 1>, - winograd_input_transform_6x6_fp32_process_tile<1, 0, 2, 2>, - winograd_input_transform_6x6_fp32_process_tile<1, 0, 2, 3>, - winograd_input_transform_6x6_fp32_process_tile<1, 0, 2, 4>, - }, - { - winograd_input_transform_6x6_fp32_process_tile<1, 0, 3, 0>, - winograd_input_transform_6x6_fp32_process_tile<1, 0, 3, 1>, - winograd_input_transform_6x6_fp32_process_tile<1, 0, 3, 2>, - winograd_input_transform_6x6_fp32_process_tile<1, 0, 3, 3>, - winograd_input_transform_6x6_fp32_process_tile<1, 0, 3, 4>, - }, - { - winograd_input_transform_6x6_fp32_process_tile<1, 0, 4, 0>, - winograd_input_transform_6x6_fp32_process_tile<1, 0, 4, 1>, - winograd_input_transform_6x6_fp32_process_tile<1, 0, 4, 2>, - winograd_input_transform_6x6_fp32_process_tile<1, 0, 4, 3>, - winograd_input_transform_6x6_fp32_process_tile<1, 0, 4, 4>, - }, - }, - { - { - winograd_input_transform_6x6_fp32_process_tile<1, 1, 0, 0>, // Top left - winograd_input_transform_6x6_fp32_process_tile<1, 1, 0, 1>, - winograd_input_transform_6x6_fp32_process_tile<1, 1, 0, 2>, - winograd_input_transform_6x6_fp32_process_tile<1, 1, 0, 3>, - winograd_input_transform_6x6_fp32_process_tile<1, 1, 0, 4>, - }, - { - winograd_input_transform_6x6_fp32_process_tile<1, 1, 1, 0>, - winograd_input_transform_6x6_fp32_process_tile<1, 1, 1, 1>, - winograd_input_transform_6x6_fp32_process_tile<1, 1, 1, 2>, - winograd_input_transform_6x6_fp32_process_tile<1, 1, 1, 3>, - winograd_input_transform_6x6_fp32_process_tile<1, 1, 1, 4>, - }, - { - winograd_input_transform_6x6_fp32_process_tile<1, 1, 2, 0>, - winograd_input_transform_6x6_fp32_process_tile<1, 1, 2, 1>, - winograd_input_transform_6x6_fp32_process_tile<1, 1, 2, 2>, - winograd_input_transform_6x6_fp32_process_tile<1, 1, 2, 3>, - winograd_input_transform_6x6_fp32_process_tile<1, 1, 2, 4>, - }, - { - winograd_input_transform_6x6_fp32_process_tile<1, 1, 3, 0>, - winograd_input_transform_6x6_fp32_process_tile<1, 1, 3, 1>, - winograd_input_transform_6x6_fp32_process_tile<1, 1, 3, 2>, - winograd_input_transform_6x6_fp32_process_tile<1, 1, 3, 3>, - winograd_input_transform_6x6_fp32_process_tile<1, 1, 3, 4>, - }, - { - winograd_input_transform_6x6_fp32_process_tile<1, 1, 4, 0>, - winograd_input_transform_6x6_fp32_process_tile<1, 1, 4, 1>, - winograd_input_transform_6x6_fp32_process_tile<1, 1, 4, 2>, - winograd_input_transform_6x6_fp32_process_tile<1, 1, 4, 3>, - winograd_input_transform_6x6_fp32_process_tile<1, 1, 4, 4>, - } - } - } +const Tiles<3>::TileFn Tiles<3>::tilefn_generic = winograd_input_transform_6x6_fp32_process_tile<false>; + +template <> +const Tiles<3>::TileFn Tiles<3>::tilefn_unpadded = winograd_input_transform_6x6_fp32_process_tile<true>; + +template <> +const Tiles<3>::TileFn Tiles<3>::tilefn_top_padded[n_pad_top] = { + winograd_input_transform_6x6_fp32_process_tile<true, 1, 0, 0, 0>, }; template <> -const Transform<5>::TileFn - Transform<5>::tile_fns[n_pad_top][n_pad_left][n_pad_bottom][n_pad_right] = -{ - { - { - { - winograd_input_transform_6x6_fp32_process_tile<0, 0, 0, 0>, // No padding - winograd_input_transform_6x6_fp32_process_tile<0, 0, 0, 1>, // Right - winograd_input_transform_6x6_fp32_process_tile<0, 0, 0, 2>, // " " - winograd_input_transform_6x6_fp32_process_tile<0, 0, 0, 3>, // " " - winograd_input_transform_6x6_fp32_process_tile<0, 0, 0, 4>, // " " - }, - { - winograd_input_transform_6x6_fp32_process_tile<0, 0, 1, 0>, // Bottom - winograd_input_transform_6x6_fp32_process_tile<0, 0, 1, 1>, // Bottom right - winograd_input_transform_6x6_fp32_process_tile<0, 0, 1, 2>, // " " - winograd_input_transform_6x6_fp32_process_tile<0, 0, 1, 3>, // " " - winograd_input_transform_6x6_fp32_process_tile<0, 0, 1, 4>, // " " - }, - { - winograd_input_transform_6x6_fp32_process_tile<0, 0, 2, 0>, // Bottom - winograd_input_transform_6x6_fp32_process_tile<0, 0, 2, 1>, // Bottom right - winograd_input_transform_6x6_fp32_process_tile<0, 0, 2, 2>, // " " - winograd_input_transform_6x6_fp32_process_tile<0, 0, 2, 3>, // " " - winograd_input_transform_6x6_fp32_process_tile<0, 0, 2, 4>, // " " - }, - { - winograd_input_transform_6x6_fp32_process_tile<0, 0, 3, 0>, // Bottom - winograd_input_transform_6x6_fp32_process_tile<0, 0, 3, 1>, // Bottom right - winograd_input_transform_6x6_fp32_process_tile<0, 0, 3, 2>, // " " - winograd_input_transform_6x6_fp32_process_tile<0, 0, 3, 3>, // " " - winograd_input_transform_6x6_fp32_process_tile<0, 0, 3, 4>, // " " - }, - { - winograd_input_transform_6x6_fp32_process_tile<0, 0, 4, 0>, // Bottom - winograd_input_transform_6x6_fp32_process_tile<0, 0, 4, 1>, // Bottom right - winograd_input_transform_6x6_fp32_process_tile<0, 0, 4, 2>, // " " - winograd_input_transform_6x6_fp32_process_tile<0, 0, 4, 3>, // " " - winograd_input_transform_6x6_fp32_process_tile<0, 0, 4, 4>, // " " - } - }, - { - { - winograd_input_transform_6x6_fp32_process_tile<0, 2, 0, 0>, // Left - winograd_input_transform_6x6_fp32_process_tile<0, 2, 0, 1>, - winograd_input_transform_6x6_fp32_process_tile<0, 2, 0, 2>, - winograd_input_transform_6x6_fp32_process_tile<0, 2, 0, 3>, - winograd_input_transform_6x6_fp32_process_tile<0, 2, 0, 4>, - }, - { - winograd_input_transform_6x6_fp32_process_tile<0, 2, 1, 0>, // Bottom left - winograd_input_transform_6x6_fp32_process_tile<0, 2, 1, 1>, - winograd_input_transform_6x6_fp32_process_tile<0, 2, 1, 2>, - winograd_input_transform_6x6_fp32_process_tile<0, 2, 1, 3>, - winograd_input_transform_6x6_fp32_process_tile<0, 2, 1, 4>, - }, - { - winograd_input_transform_6x6_fp32_process_tile<0, 2, 2, 0>, // " " - winograd_input_transform_6x6_fp32_process_tile<0, 2, 2, 1>, - winograd_input_transform_6x6_fp32_process_tile<0, 2, 2, 2>, - winograd_input_transform_6x6_fp32_process_tile<0, 2, 2, 3>, - winograd_input_transform_6x6_fp32_process_tile<0, 2, 2, 4>, - }, - { - winograd_input_transform_6x6_fp32_process_tile<0, 2, 3, 0>, // " " - winograd_input_transform_6x6_fp32_process_tile<0, 2, 3, 1>, - winograd_input_transform_6x6_fp32_process_tile<0, 2, 3, 2>, - winograd_input_transform_6x6_fp32_process_tile<0, 2, 3, 3>, - winograd_input_transform_6x6_fp32_process_tile<0, 2, 3, 4>, - }, - { - winograd_input_transform_6x6_fp32_process_tile<0, 2, 4, 0>, // " " - winograd_input_transform_6x6_fp32_process_tile<0, 2, 4, 1>, - winograd_input_transform_6x6_fp32_process_tile<0, 2, 4, 2>, - winograd_input_transform_6x6_fp32_process_tile<0, 2, 4, 3>, - winograd_input_transform_6x6_fp32_process_tile<0, 2, 4, 4>, - } - } - }, - { - { - { - winograd_input_transform_6x6_fp32_process_tile<2, 0, 0, 0>, // Top - winograd_input_transform_6x6_fp32_process_tile<2, 0, 0, 1>, // Top right - winograd_input_transform_6x6_fp32_process_tile<2, 0, 0, 2>, // " " - winograd_input_transform_6x6_fp32_process_tile<2, 0, 0, 3>, // " " - winograd_input_transform_6x6_fp32_process_tile<2, 0, 0, 4>, // " " - }, - { - winograd_input_transform_6x6_fp32_process_tile<2, 0, 1, 0>, - winograd_input_transform_6x6_fp32_process_tile<2, 0, 1, 1>, - winograd_input_transform_6x6_fp32_process_tile<2, 0, 1, 2>, - winograd_input_transform_6x6_fp32_process_tile<2, 0, 1, 3>, - winograd_input_transform_6x6_fp32_process_tile<2, 0, 1, 4>, - }, - { - winograd_input_transform_6x6_fp32_process_tile<2, 0, 2, 0>, - winograd_input_transform_6x6_fp32_process_tile<2, 0, 2, 1>, - winograd_input_transform_6x6_fp32_process_tile<2, 0, 2, 2>, - winograd_input_transform_6x6_fp32_process_tile<2, 0, 2, 3>, - winograd_input_transform_6x6_fp32_process_tile<2, 0, 2, 4>, - }, - { - winograd_input_transform_6x6_fp32_process_tile<2, 0, 3, 0>, - winograd_input_transform_6x6_fp32_process_tile<2, 0, 3, 1>, - winograd_input_transform_6x6_fp32_process_tile<2, 0, 3, 2>, - winograd_input_transform_6x6_fp32_process_tile<2, 0, 3, 3>, - winograd_input_transform_6x6_fp32_process_tile<2, 0, 3, 4>, - }, - { - winograd_input_transform_6x6_fp32_process_tile<2, 0, 4, 0>, - winograd_input_transform_6x6_fp32_process_tile<2, 0, 4, 1>, - winograd_input_transform_6x6_fp32_process_tile<2, 0, 4, 2>, - winograd_input_transform_6x6_fp32_process_tile<2, 0, 4, 3>, - winograd_input_transform_6x6_fp32_process_tile<2, 0, 4, 4>, - }, - }, - { - { - winograd_input_transform_6x6_fp32_process_tile<2, 2, 0, 0>, // Top left - winograd_input_transform_6x6_fp32_process_tile<2, 2, 0, 1>, - winograd_input_transform_6x6_fp32_process_tile<2, 2, 0, 2>, - winograd_input_transform_6x6_fp32_process_tile<2, 2, 0, 3>, - winograd_input_transform_6x6_fp32_process_tile<2, 2, 0, 4>, - }, - { - winograd_input_transform_6x6_fp32_process_tile<2, 2, 1, 0>, - winograd_input_transform_6x6_fp32_process_tile<2, 2, 1, 1>, - winograd_input_transform_6x6_fp32_process_tile<2, 2, 1, 2>, - winograd_input_transform_6x6_fp32_process_tile<2, 2, 1, 3>, - winograd_input_transform_6x6_fp32_process_tile<2, 2, 1, 4>, - }, - { - winograd_input_transform_6x6_fp32_process_tile<2, 2, 2, 0>, - winograd_input_transform_6x6_fp32_process_tile<2, 2, 2, 1>, - winograd_input_transform_6x6_fp32_process_tile<2, 2, 2, 2>, - winograd_input_transform_6x6_fp32_process_tile<2, 2, 2, 3>, - winograd_input_transform_6x6_fp32_process_tile<2, 2, 2, 4>, - }, - { - winograd_input_transform_6x6_fp32_process_tile<2, 2, 3, 0>, - winograd_input_transform_6x6_fp32_process_tile<2, 2, 3, 1>, - winograd_input_transform_6x6_fp32_process_tile<2, 2, 3, 2>, - winograd_input_transform_6x6_fp32_process_tile<2, 2, 3, 3>, - winograd_input_transform_6x6_fp32_process_tile<2, 2, 3, 4>, - }, - { - winograd_input_transform_6x6_fp32_process_tile<2, 2, 4, 0>, - winograd_input_transform_6x6_fp32_process_tile<2, 2, 4, 1>, - winograd_input_transform_6x6_fp32_process_tile<2, 2, 4, 2>, - winograd_input_transform_6x6_fp32_process_tile<2, 2, 4, 3>, - winograd_input_transform_6x6_fp32_process_tile<2, 2, 4, 4>, - } - } - } +const Tiles<3>::TileFn Tiles<3>::tilefn_left_padded[n_pad_left] = { + winograd_input_transform_6x6_fp32_process_tile<true, 0, 1, 0, 0>, +}; + +template <> +const Tiles<3>::TileFn Tiles<3>::tilefn_bottom_padded[n_pad_bottom] = { + winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 1, 0>, + winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 2, 0>, + winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 3, 0>, + winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 4, 0>, + winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 5, 0>, + winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 6, 0>, +}; + +template <> +const Tiles<3>::TileFn Tiles<3>::tilefn_right_padded[n_pad_right] = { + winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 1>, + winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 2>, + winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 3>, + winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 4>, + winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 5>, + winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 6>, +}; + +template <> +const Tiles<5>::TileFn Tiles<5>::tilefn_generic = winograd_input_transform_6x6_fp32_process_tile<false>; + + +template <> +const Tiles<5>::TileFn Tiles<5>::tilefn_unpadded = winograd_input_transform_6x6_fp32_process_tile<true>; + + +template <> +const Tiles<5>::TileFn Tiles<5>::tilefn_top_padded[n_pad_top] = { + winograd_input_transform_6x6_fp32_process_tile<true, 2, 0, 0, 0>, +}; + +template <> +const Tiles<5>::TileFn Tiles<5>::tilefn_left_padded[n_pad_left] = { + winograd_input_transform_6x6_fp32_process_tile<true, 0, 2, 0, 0>, +}; + +template <> +const Tiles<5>::TileFn Tiles<5>::tilefn_bottom_padded[n_pad_bottom] = { + winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 1, 0>, + winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 2, 0>, + winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 3, 0>, + winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 4, 0>, + winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 5, 0>, + winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 6, 0>, +}; + +template <> +const Tiles<5>::TileFn Tiles<5>::tilefn_right_padded[n_pad_right] = { + winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 1>, + winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 2>, + winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 3>, + winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 4>, + winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 5>, + winograd_input_transform_6x6_fp32_process_tile<true, 0, 0, 0, 6>, }; template class InputTransform<3, 3, 6, 6, float>; |