diff options
Diffstat (limited to 'src/core/NEON/kernels/convolution/winograd/transforms/input_1x8_fp32.cpp')
-rw-r--r-- | src/core/NEON/kernels/convolution/winograd/transforms/input_1x8_fp32.cpp | 179 |
1 file changed, 83 insertions, 96 deletions
diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/input_1x8_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/input_1x8_fp32.cpp index 042d4debbc..e66300d39a 100644 --- a/src/core/NEON/kernels/convolution/winograd/transforms/input_1x8_fp32.cpp +++ b/src/core/NEON/kernels/convolution/winograd/transforms/input_1x8_fp32.cpp @@ -29,19 +29,30 @@ namespace { -template <int pad_top, int pad_left, int pad_bottom, int pad_right> +template <bool Specialized, int PadTop=0, int PadLeft=0, int PadBottom=0, int PadRight=0> void winograd_input_transform_1x8_fp32_process_tile( int n_channels, const float* const input_base, const int input_row_stride, const int input_col_stride, float* const matrix_base, - const int matrix_stride + const int matrix_stride, + const int _pad_top, + const int _pad_left, + const int _pad_bottom, + const int _pad_right ) { (void) input_row_stride; // No rows over which to stride + (void) _pad_top; // Never any top padding + (void) _pad_bottom; // Never any bottom padding + + // Extract padding arguments + const int pad_left = Specialized ? PadLeft : _pad_left; + const int pad_right = Specialized ? 
PadRight : _pad_right; + constexpr int inner_tile_cols = 8; - constexpr int cells_j = inner_tile_cols - pad_right; + const int cells_j = inner_tile_cols - pad_right; float *outptr = matrix_base; @@ -162,109 +173,85 @@ void winograd_input_transform_1x8_fp32_process_tile( namespace winograd { template <int x> -using Transform = InputTransformImpl<1, x, 1, 8, float>; +using Tiles = InputTransformImplTiles<1, x, 1, 8, float>; +/*****************************************************************************/ +// 1x3 specialisations template <> -const Transform<3>::TileFn - Transform<3>::tile_fns[n_pad_top][n_pad_left][n_pad_bottom][n_pad_right] = -{ - { - { - { - winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 0>, - winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 1>, - winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 2>, - winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 3>, - winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 4>, - winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 5>, - winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 6>, - } - }, - { - { - winograd_input_transform_1x8_fp32_process_tile<0, 1, 0, 0>, - winograd_input_transform_1x8_fp32_process_tile<0, 1, 0, 1>, - winograd_input_transform_1x8_fp32_process_tile<0, 1, 0, 2>, - winograd_input_transform_1x8_fp32_process_tile<0, 1, 0, 3>, - winograd_input_transform_1x8_fp32_process_tile<0, 1, 0, 4>, - winograd_input_transform_1x8_fp32_process_tile<0, 1, 0, 5>, - winograd_input_transform_1x8_fp32_process_tile<0, 1, 0, 6>, - } - } - } +const Tiles<3>::TileFn Tiles<3>::tilefn_generic = winograd_input_transform_1x8_fp32_process_tile<false>; + +template <> +const Tiles<3>::TileFn Tiles<3>::tilefn_unpadded = winograd_input_transform_1x8_fp32_process_tile<true>; + +template <> +const Tiles<3>::TileFn Tiles<3>::tilefn_left_padded[n_pad_left] = { + winograd_input_transform_1x8_fp32_process_tile<true, 0, 1, 0, 0>, }; template <> -const Transform<5>::TileFn - 
Transform<5>::tile_fns[n_pad_top][n_pad_left][n_pad_bottom][n_pad_right] = -{ - { - { - { - winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 0>, - winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 1>, - winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 2>, - winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 3>, - winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 4>, - winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 5>, - winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 6>, - } - }, - { - { - winograd_input_transform_1x8_fp32_process_tile<0, 2, 0, 0>, - winograd_input_transform_1x8_fp32_process_tile<0, 2, 0, 1>, - winograd_input_transform_1x8_fp32_process_tile<0, 2, 0, 2>, - winograd_input_transform_1x8_fp32_process_tile<0, 2, 0, 3>, - winograd_input_transform_1x8_fp32_process_tile<0, 2, 0, 4>, - winograd_input_transform_1x8_fp32_process_tile<0, 2, 0, 5>, - winograd_input_transform_1x8_fp32_process_tile<0, 2, 0, 6>, - } - } - } +const Tiles<3>::TileFn Tiles<3>::tilefn_right_padded[n_pad_right] = { + winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 1>, + winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 2>, + winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 3>, + winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 4>, + winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 5>, + winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 6>, + winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 7>, }; +/*****************************************************************************/ +/*****************************************************************************/ +// 1x5 specialisations template <> -const Transform<7>::TileFn - Transform<7>::tile_fns[n_pad_top][n_pad_left][n_pad_bottom][n_pad_right] = -{ - { - { - { - winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 0>, - winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 1>, - 
winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 2>, - winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 3>, - winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 4>, - winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 5>, - winograd_input_transform_1x8_fp32_process_tile<0, 0, 0, 6>, - } - }, - { - { - winograd_input_transform_1x8_fp32_process_tile<0, 1, 0, 0>, - winograd_input_transform_1x8_fp32_process_tile<0, 1, 0, 1>, - winograd_input_transform_1x8_fp32_process_tile<0, 1, 0, 2>, - winograd_input_transform_1x8_fp32_process_tile<0, 1, 0, 3>, - winograd_input_transform_1x8_fp32_process_tile<0, 1, 0, 4>, - winograd_input_transform_1x8_fp32_process_tile<0, 1, 0, 5>, - winograd_input_transform_1x8_fp32_process_tile<0, 1, 0, 6>, - } - }, - { - { - winograd_input_transform_1x8_fp32_process_tile<0, 3, 0, 0>, - winograd_input_transform_1x8_fp32_process_tile<0, 3, 0, 1>, - winograd_input_transform_1x8_fp32_process_tile<0, 3, 0, 2>, - winograd_input_transform_1x8_fp32_process_tile<0, 3, 0, 3>, - winograd_input_transform_1x8_fp32_process_tile<0, 3, 0, 4>, - winograd_input_transform_1x8_fp32_process_tile<0, 3, 0, 5>, - winograd_input_transform_1x8_fp32_process_tile<0, 3, 0, 6>, - } - } - } +const Tiles<5>::TileFn Tiles<5>::tilefn_generic = winograd_input_transform_1x8_fp32_process_tile<false>; + +template <> +const Tiles<5>::TileFn Tiles<5>::tilefn_unpadded = winograd_input_transform_1x8_fp32_process_tile<true>; + +template <> +const Tiles<5>::TileFn Tiles<5>::tilefn_left_padded[n_pad_left] = { + winograd_input_transform_1x8_fp32_process_tile<true, 0, 2, 0, 0>, }; +template <> +const Tiles<5>::TileFn Tiles<5>::tilefn_right_padded[n_pad_right] = { + winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 1>, + winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 2>, + winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 3>, + winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 4>, + 
winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 5>, + winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 6>, + winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 7>, +}; +/*****************************************************************************/ + +/*****************************************************************************/ +// 1x7 specialisations +template <> +const Tiles<7>::TileFn Tiles<7>::tilefn_generic = winograd_input_transform_1x8_fp32_process_tile<false>; + +template <> +const Tiles<7>::TileFn Tiles<7>::tilefn_unpadded = winograd_input_transform_1x8_fp32_process_tile<true>; + +template <> +const Tiles<7>::TileFn Tiles<7>::tilefn_left_padded[n_pad_left] = { + winograd_input_transform_1x8_fp32_process_tile<true, 0, 1, 0, 0>, + winograd_input_transform_1x8_fp32_process_tile<true, 0, 3, 0, 0>, +}; + +template <> +const Tiles<7>::TileFn Tiles<7>::tilefn_right_padded[n_pad_right] = { + winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 1>, + winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 2>, + winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 3>, + winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 4>, + winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 5>, + winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 6>, + winograd_input_transform_1x8_fp32_process_tile<true, 0, 0, 0, 7>, +}; +/*****************************************************************************/ + + template class InputTransform<1, 3, 1, 8, float>; template class InputTransform<3, 1, 8, 1, float>; template class InputTransform<1, 5, 1, 8, float>; |