From d3d97d27645efe90505a62cd48079ad06a7cf283 Mon Sep 17 00:00:00 2001 From: Pablo Tello Date: Fri, 5 Oct 2018 10:59:48 +0100 Subject: COMPMID-1623: NEWinograd reduce the number of output tiles. Change-Id: I4d9240924fe483d2dd127ad6a4ae6f8066f61bd1 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/151893 Tested-by: bsgcomp Reviewed-by: Andrew Mundy Reviewed-by: Georgios Pinitas --- .../winograd/transforms/output_2_7_fp32.cpp | 55 +++++----- .../winograd/transforms/output_2x2_3x3_fp32.cpp | 84 ++++++--------- .../winograd/transforms/output_2x2_5x5_fp32.cpp | 82 ++++++--------- .../winograd/transforms/output_4_5_fp32.cpp | 59 +++++------ .../winograd/transforms/output_4x4_3x3_fp32.cpp | 117 +++++++-------------- .../winograd/transforms/output_6_3_fp32.cpp | 63 +++++------ 6 files changed, 183 insertions(+), 277 deletions(-) (limited to 'src/core/NEON/kernels/convolution') diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_2_7_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_2_7_fp32.cpp index cfd2029f11..ea842a45ee 100644 --- a/src/core/NEON/kernels/convolution/winograd/transforms/output_2_7_fp32.cpp +++ b/src/core/NEON/kernels/convolution/winograd/transforms/output_2_7_fp32.cpp @@ -23,38 +23,33 @@ */ #include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp" -#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp" +#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_output_transform.hpp" #include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp" -namespace winograd -{ - -using Transform = WinogradGEMM<1, 2, 1, 7>::OutputTransform; -using TransformTransposed = WinogradGEMM<2, 1, 7, 1>::OutputTransform; - -template <> -template <> -int Transform::ops_performed(const Tensor4DShape &shape) +namespace { - (void) shape; - return 0; // TODO -} -template <> -template <> -template -void Transform::process_tile( +template +void winograd_output_transform_2_7_fp32_process_tile( const int n_channels, const float* const matrix_base, const int matrix_stride, const float* const biases, float* const output, const int output_row_stride, - const int output_col_stride + const int output_col_stride, + const int _pad_bottom, + const int _pad_right ) { (void) output_row_stride; - constexpr int cells_j = output_tile_cols - pad_right; + (void) _pad_bottom; + constexpr int output_tile_cols = 2; + constexpr int inner_tile_cols = 8; + + const int pad_right = Specialized ? PadRight : _pad_right; + const int cells_j = output_tile_cols - pad_right; + // Construct a map to the output cells float *outptrs[cells_j]; @@ -149,22 +144,20 @@ void Transform::process_tile( } } } +} // namespace (anonymous) -template <> -template <> -const Transform::TileFn Transform::tile_fns[max_pad_bottom][max_pad_right] = +namespace winograd { - { - Transform::template process_tile<0, 0>, - Transform::template process_tile<0, 1>, - }, -}; - +using Tiles = OutputTransformImplTiles<1, 7, 1, 8, float>; template <> +const Tiles::TileFn Tiles::tilefn_unpadded = winograd_output_transform_2_7_fp32_process_tile; + template <> -const TransformTransposed::TileFn TransformTransposed::tile_fns[max_pad_bottom][max_pad_right] = {}; +const Tiles::TileFn Tiles::tilefn_right_padded[n_pad_right] = { + winograd_output_transform_2_7_fp32_process_tile +}; -template struct WinogradGEMM<1, 2, 1, 7>::OutputTransform; -template struct WinogradGEMM<2, 1, 7, 1>::OutputTransform; +template class OutputTransform<1, 7, 1, 8, float>; +template class OutputTransform<7, 1, 8, 1, float>; } // namespace winograd diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp index 3b3cda0aa9..597b074026 100644 --- a/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp +++ b/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_3x3_fp32.cpp @@ -23,59 +23,34 @@ */ #include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp" -#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp" +#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_output_transform.hpp" #include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp" -namespace winograd -{ - -using Transform = WinogradGEMM<2, 2, 3, 3>::OutputTransform; - -template <> -template <> -int Transform::ops_performed(const Tensor4DShape &shape) +namespace { - // NOTE: Cost in FLOPs rather than instructions or uops. - const int tile_M = iceildiv(shape.n_rows, 2); - const int tile_N = iceildiv(shape.n_cols, 2); - return 24 * tile_M * tile_N * shape.n_channels; -} -/* F(2x2, 3x3) constructs 2x2 output tiles from a 3x3 convolution. Since we use - * enough tiles to cover the output space each output tile may contain 0 or 1 - * padded values to the right and bottom columns or rows of the tile, e.g.: - * - * ___ ___ - * | | | X| - * |___| |__X| - * - * ___ ___ - * | | | X| - * |X_X| |X_X| - * - * - * We provide a specialised output transform for each of these instances. - * Consequently we below construct an array of the various padding options, the - * array contains pointers to the specific implementations. - */ -template <> -template <> -template -void Transform::process_tile( +template +void winograd_output_transform_2x2_3x3_fp32_process_tile( const int n_channels, const float* const matrix_base, const int matrix_stride, const float* const biases, float* const output, const int output_row_stride, - const int output_col_stride + const int output_col_stride, + const int _pad_bottom, + const int _pad_right ) { - constexpr int cells_i = 2 - pad_bottom; - constexpr int cells_j = 2 - pad_right; + constexpr int OutputTileRows = 2, OutputTileCols = 2; + const int pad_bottom = Specialized ? PadBottom : _pad_bottom; + const int pad_right = Specialized ? PadRight : _pad_right; + + const int cells_i = OutputTileRows - pad_bottom; + const int cells_j = OutputTileCols - pad_right; // Construct a map to the output cells - float *outptrs[cells_i][cells_j]; + float *outptrs[OutputTileRows][OutputTileCols]; for (int i = 0; i < cells_i; i++) { for (int j = 0; j < cells_j; j++) @@ -373,19 +348,28 @@ void Transform::process_tile( } } +} // namespace (anonymous) + +namespace winograd +{ +using Tiles = OutputTransformImplTiles<3, 3, 4, 4, float>; + template <> +const Tiles::TileFn Tiles::tilefn_generic = winograd_output_transform_2x2_3x3_fp32_process_tile; + template <> -const Transform::TileFn Transform::tile_fns[max_pad_bottom][max_pad_right] = -{ - { - Transform::template process_tile<0, 0>, // No padding - Transform::template process_tile<0, 1>, // Right padding - }, - { - Transform::template process_tile<1, 0>, // Bottom padding - Transform::template process_tile<1, 1>, // Bottom and right padding - } +const Tiles::TileFn Tiles::tilefn_unpadded = winograd_output_transform_2x2_3x3_fp32_process_tile; + +template <> +const Tiles::TileFn Tiles::tilefn_bottom_padded[n_pad_bottom] = { + winograd_output_transform_2x2_3x3_fp32_process_tile }; -template struct WinogradGEMM<2, 2, 3, 3>::OutputTransform; +template <> +const Tiles::TileFn Tiles::tilefn_right_padded[n_pad_right] = { + winograd_output_transform_2x2_3x3_fp32_process_tile +}; + +template class OutputTransform<3, 3, 4, 4, float>; } // namespace winograd + diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp index cafce9549d..60d7181d97 100644 --- a/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp +++ b/src/core/NEON/kernels/convolution/winograd/transforms/output_2x2_5x5_fp32.cpp @@ -23,57 +23,34 @@ */ #include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp" -#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp" +#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_output_transform.hpp" #include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp" -namespace winograd -{ - -using Transform = WinogradGEMM<2, 2, 5, 5>::OutputTransform; - -template <> -template <> -int Transform::ops_performed(const Tensor4DShape &shape) +namespace { - (void) shape; - return 0; // TODO -} -/* F(2x2, 5x5) constructs 2x2 output tiles from a 5x5 convolution. Since we use - * enough tiles to cover the output space each output tile may contain 0 or 1 - * padded values to the right and bottom columns or rows of the tile, e.g.: - * - * ___ ___ - * | | | X| - * |___| |__X| - * - * ___ ___ - * | | | X| - * |X_X| |X_X| - * - * - * We provide a specialised output transform for each of these instances. - * Consequently we below construct an array of the various padding options, the - * array contains pointers to the specific implementations. - */ -template <> -template <> -template -void Transform::process_tile( +template +void winograd_output_transform_2x2_5x5_fp32_process_tile( const int n_channels, const float* const matrix_base, const int matrix_stride, const float* const biases, float* const output, const int output_row_stride, - const int output_col_stride + const int output_col_stride, + const int _pad_bottom, + const int _pad_right ) { - constexpr int cells_i = 2 - pad_bottom; - constexpr int cells_j = 2 - pad_right; + constexpr int OutputTileRows = 2, OutputTileCols = 2; + const int pad_bottom = Specialized ? PadBottom : _pad_bottom; + const int pad_right = Specialized ? PadRight : _pad_right; + + const int cells_i = 2 - pad_bottom; + const int cells_j = 2 - pad_right; // Construct a map to the output cells - float *outptrs[cells_i][cells_j]; + float *outptrs[OutputTileRows][OutputTileCols]; for (int i = 0; i < cells_i; i++) { for (int j = 0; j < cells_j; j++) @@ -365,19 +342,28 @@ void Transform::process_tile( } } +} // namespace (anonymous) + +namespace winograd +{ +using Tiles = OutputTransformImplTiles<5, 5, 6, 6, float>; + template <> +const Tiles::TileFn Tiles::tilefn_generic = winograd_output_transform_2x2_5x5_fp32_process_tile; + template <> -const Transform::TileFn Transform::tile_fns[max_pad_bottom][max_pad_right] = -{ - { - Transform::template process_tile<0, 0>, // No padding - Transform::template process_tile<0, 1>, // Right padding - }, - { - Transform::template process_tile<1, 0>, // Bottom padding - Transform::template process_tile<1, 1>, // Bottom and right padding - } +const Tiles::TileFn Tiles::tilefn_unpadded = winograd_output_transform_2x2_5x5_fp32_process_tile; + +template <> +const Tiles::TileFn Tiles::tilefn_bottom_padded[n_pad_bottom] = { + winograd_output_transform_2x2_5x5_fp32_process_tile }; -template struct WinogradGEMM<2, 2, 5, 5>::OutputTransform; +template <> +const Tiles::TileFn Tiles::tilefn_right_padded[n_pad_right] = { + winograd_output_transform_2x2_5x5_fp32_process_tile +}; + +template class OutputTransform<5, 5, 6, 6, float>; } // namespace winograd + diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_4_5_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_4_5_fp32.cpp index 2417f527bf..911759b128 100644 --- a/src/core/NEON/kernels/convolution/winograd/transforms/output_4_5_fp32.cpp +++ b/src/core/NEON/kernels/convolution/winograd/transforms/output_4_5_fp32.cpp @@ -23,38 +23,32 @@ */ #include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp" -#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp" +#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_output_transform.hpp" #include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp" -namespace winograd -{ - -using Transform = WinogradGEMM<1, 4, 1, 5>::OutputTransform; -using TransformTransposed = WinogradGEMM<4, 1, 5, 1>::OutputTransform; - -template <> -template <> -int Transform::ops_performed(const Tensor4DShape &shape) +namespace { - (void) shape; - return 0; // TODO -} -template <> -template <> -template -void Transform::process_tile( +template +void winograd_output_transform_4_5_fp32_process_tile( const int n_channels, const float* const matrix_base, const int matrix_stride, const float* const biases, float* const output, const int output_row_stride, - const int output_col_stride + const int output_col_stride, + const int _pad_bottom, + const int _pad_right ) { (void) output_row_stride; - constexpr int cells_j = output_tile_cols - pad_right; + (void) _pad_bottom; + constexpr int output_tile_cols = 4; + constexpr int inner_tile_cols = 8; + + const int pad_right = Specialized ? PadRight : _pad_right; + const int cells_j = output_tile_cols - pad_right; // Construct a map to the output cells float *outptrs[cells_j]; @@ -156,23 +150,22 @@ void Transform::process_tile( } } -template <> -template <> -const Transform::TileFn Transform::tile_fns[max_pad_bottom][max_pad_right] = +} // namespace (anonymous) + +namespace winograd { - { - Transform::template process_tile<0, 0>, - Transform::template process_tile<0, 1>, - Transform::template process_tile<0, 2>, - Transform::template process_tile<0, 3>, - }, -}; +using Tiles = OutputTransformImplTiles<1, 5, 1, 8, float>; template <> -template <> -const TransformTransposed::TileFn TransformTransposed::tile_fns[max_pad_bottom][max_pad_right] = {}; +const Tiles::TileFn Tiles::tilefn_unpadded = winograd_output_transform_4_5_fp32_process_tile; +template <> +const Tiles::TileFn Tiles::tilefn_right_padded[n_pad_right] = { + winograd_output_transform_4_5_fp32_process_tile, + winograd_output_transform_4_5_fp32_process_tile, + winograd_output_transform_4_5_fp32_process_tile +}; -template struct WinogradGEMM<1, 4, 1, 5>::OutputTransform; -template struct WinogradGEMM<4, 1, 5, 1>::OutputTransform; +template class OutputTransform<1, 5, 1, 8, float>; +template class OutputTransform<5, 1, 8, 1, float>; } // namespace winograd diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp index cd3bdef0d2..15cc04b352 100644 --- a/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp +++ b/src/core/NEON/kernels/convolution/winograd/transforms/output_4x4_3x3_fp32.cpp @@ -23,73 +23,34 @@ */ #include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp" -#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp" +#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_output_transform.hpp" #include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp" -namespace winograd +namespace { -using Transform = WinogradGEMM<4, 4, 3, 3>::OutputTransform; - -template <> -template <> -int Transform::ops_performed(const Tensor4DShape &shape) -{ - // NOTE: Cost in FLOPs rather than instructions or uops. - const int tile_M = iceildiv(shape.n_rows, 4); - const int tile_N = iceildiv(shape.n_cols, 4); - return 170 * tile_M * tile_N * shape.n_channels; -} - -/* F(4x4, 3x3) constructs 4x4 output tiles from a 3x3 convolution. Since we use - * enough tiles to cover the output space each output tile may contain up to 3 - * padded values to the right and bottom columns or rows of the tile, e.g.: -* -* ________ ________ ________ ________ -* | | | X| | X X| | X X X| -* | | | X| | X X| | X X X| -* | | | X| | X X| | X X X| -* |_______| |______X| |____X_X| |__X_X_X| -* -* ________ ________ ________ ________ -* | | | X| | X X| | X X X| -* | | | X| | X X| | X X X| -* | | | X| | X X| | X X X| -* |X_X_X_X| |X_X_X_X| |X_X_X_X| |X_X_X_X| -* -* ________ ________ ________ ________ -* | | | X| | X X| | X X X| -* | | | X| | X X| | X X X| -* |X X X X| |X X X X| |X X X X| |X X X X| -* |X_X_X_X| |X_X_X_X| |X_X_X_X| |X_X_X_X| -* -* ________ ________ ________ ________ -* | | | X| | X X| | X X X| -* |X X X X| |X X X X| |X X X X| |X X X X| -* |X X X X| |X X X X| |X X X X| |X X X X| -* |X_X_X_X| |X_X_X_X| |X_X_X_X| |X_X_X_X| -* -* -* We provide a specialised output transform for each of these instances. -*/ -template <> -template <> -template -void Transform::process_tile( +template +void winograd_output_transform_4x4_3x3_fp32_process_tile( const int n_channels, const float* const matrix_base, const int matrix_stride, const float* const biases, float* const output, const int output_row_stride, - const int output_col_stride + const int output_col_stride, + const int _pad_bottom, + const int _pad_right ) { - constexpr int cells_i = 4 - pad_bottom; - constexpr int cells_j = 4 - pad_right; + const int pad_bottom = Specialized ? PadBottom : _pad_bottom; + const int pad_right = Specialized ? PadRight : _pad_right; + constexpr int TileRows = 4, TileCols = 4; + + const int cells_i = TileRows - pad_bottom; + const int cells_j = TileCols - pad_right; // Construct a map to the output cells - float *outptrs[cells_i][cells_j]; + float *outptrs[TileRows][TileCols]; for (int i = 0; i < cells_i; i++) { for (int j = 0; j < cells_j; j++) @@ -437,35 +398,31 @@ void Transform::process_tile( } } +} // namespace (anonymous) + +namespace winograd +{ +using Tiles = OutputTransformImplTiles<3, 3, 6, 6, float>; + template <> +const Tiles::TileFn Tiles::tilefn_generic = winograd_output_transform_4x4_3x3_fp32_process_tile; + template <> -const Transform::TileFn Transform::tile_fns[max_pad_bottom][max_pad_right] = -{ - { - Transform::template process_tile<0, 0>, - Transform::template process_tile<0, 1>, - Transform::template process_tile<0, 2>, - Transform::template process_tile<0, 3>, - }, - { - Transform::template process_tile<1, 0>, - Transform::template process_tile<1, 1>, - Transform::template process_tile<1, 2>, - Transform::template process_tile<1, 3>, - }, - { - Transform::template process_tile<2, 0>, - Transform::template process_tile<2, 1>, - Transform::template process_tile<2, 2>, - Transform::template process_tile<2, 3>, - }, - { - Transform::template process_tile<3, 0>, - Transform::template process_tile<3, 1>, - Transform::template process_tile<3, 2>, - Transform::template process_tile<3, 3>, - } +const Tiles::TileFn Tiles::tilefn_unpadded = winograd_output_transform_4x4_3x3_fp32_process_tile; + +template <> +const Tiles::TileFn Tiles::tilefn_bottom_padded[n_pad_bottom] = { + winograd_output_transform_4x4_3x3_fp32_process_tile, + winograd_output_transform_4x4_3x3_fp32_process_tile, + winograd_output_transform_4x4_3x3_fp32_process_tile, +}; + +template <> +const Tiles::TileFn Tiles::tilefn_right_padded[n_pad_right] = { + winograd_output_transform_4x4_3x3_fp32_process_tile, + winograd_output_transform_4x4_3x3_fp32_process_tile, + winograd_output_transform_4x4_3x3_fp32_process_tile, }; -template struct WinogradGEMM<4, 4, 3, 3>::OutputTransform; +template class OutputTransform<3, 3, 6, 6, float>; } // namespace winograd diff --git a/src/core/NEON/kernels/convolution/winograd/transforms/output_6_3_fp32.cpp b/src/core/NEON/kernels/convolution/winograd/transforms/output_6_3_fp32.cpp index 16667ccdb6..58bed71a47 100644 --- a/src/core/NEON/kernels/convolution/winograd/transforms/output_6_3_fp32.cpp +++ b/src/core/NEON/kernels/convolution/winograd/transforms/output_6_3_fp32.cpp @@ -23,38 +23,32 @@ */ #include "arm_compute/core/NEON/kernels/convolution/winograd/transforms/output.hpp" -#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp" +#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_output_transform.hpp" #include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp" -namespace winograd -{ - -using Transform = WinogradGEMM<1, 6, 1, 3>::OutputTransform; -using TransformTransposed = WinogradGEMM<6, 1, 3, 1>::OutputTransform; - -template <> -template <> -int Transform::ops_performed(const Tensor4DShape &shape) +namespace { - (void) shape; - return 0; // TODO -} -template <> -template <> -template -void Transform::process_tile( +template +void winograd_output_transform_6_3_fp32_process_tile( const int n_channels, const float* const matrix_base, const int matrix_stride, const float* const biases, float* const output, const int output_row_stride, - const int output_col_stride + const int output_col_stride, + const int _pad_bottom, + const int _pad_right ) { (void) output_row_stride; - constexpr int cells_j = output_tile_cols - pad_right; + (void) _pad_bottom; + constexpr int output_tile_cols = 6; + constexpr int inner_tile_cols = 8; + + const int pad_right = Specialized ? PadRight : _pad_right; + const int cells_j = output_tile_cols - pad_right; // Construct a map to the output cells float *outptrs[cells_j]; @@ -162,25 +156,24 @@ void Transform::process_tile( } } -template <> -template <> -const Transform::TileFn Transform::tile_fns[max_pad_bottom][max_pad_right] = +} // namespace (anonymous) + +namespace winograd { - { - Transform::template process_tile<0, 0>, - Transform::template process_tile<0, 1>, - Transform::template process_tile<0, 2>, - Transform::template process_tile<0, 3>, - Transform::template process_tile<0, 4>, - Transform::template process_tile<0, 5>, - }, -}; +using Tiles = OutputTransformImplTiles<1, 3, 1, 8, float>; template <> -template <> -const TransformTransposed::TileFn TransformTransposed::tile_fns[max_pad_bottom][max_pad_right] = {}; +const Tiles::TileFn Tiles::tilefn_unpadded = winograd_output_transform_6_3_fp32_process_tile; +template <> +const Tiles::TileFn Tiles::tilefn_right_padded[n_pad_right] = { + winograd_output_transform_6_3_fp32_process_tile, + winograd_output_transform_6_3_fp32_process_tile, + winograd_output_transform_6_3_fp32_process_tile, + winograd_output_transform_6_3_fp32_process_tile, + winograd_output_transform_6_3_fp32_process_tile, +}; -template struct WinogradGEMM<1, 6, 1, 3>::OutputTransform; -template struct WinogradGEMM<6, 1, 3, 1>::OutputTransform; +template class OutputTransform<1, 3, 1, 8, float>; +template class OutputTransform<3, 1, 8, 1, float>; } // namespace winograd -- cgit v1.2.1